1 /*
2 * Copyright (c) 2004 The FreeBSD Foundation
3 * Copyright (c) 2004 Robert Watson
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 4. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
32 */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD: releng/5.3/sys/kern/uipc_socket.c 137175 2004-11-04 01:17:31Z rwatson $");
36
37 #include "opt_inet.h"
38 #include "opt_mac.h"
39 #include "opt_zero.h"
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/fcntl.h>
44 #include <sys/limits.h>
45 #include <sys/lock.h>
46 #include <sys/mac.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/mutex.h>
50 #include <sys/domain.h>
51 #include <sys/file.h> /* for struct knote */
52 #include <sys/kernel.h>
53 #include <sys/event.h>
54 #include <sys/poll.h>
55 #include <sys/proc.h>
56 #include <sys/protosw.h>
57 #include <sys/socket.h>
58 #include <sys/socketvar.h>
59 #include <sys/resourcevar.h>
60 #include <sys/signalvar.h>
61 #include <sys/sysctl.h>
62 #include <sys/uio.h>
63 #include <sys/jail.h>
64
65 #include <vm/uma.h>
66
67
/*
 * Non-inline out-of-band receive path, split out of soreceive().
 */
static int soreceive_rcvoob(struct socket *so, struct uio *uio,
	    int flags);

#ifdef INET
/* Accept-filter attach/detach helper; compiled only with INET. */
static int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

/* kqueue filter callbacks for socket read, write, and listen events. */
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_solisten(struct knote *kn, long hint);

/*
 * Filter ops tables wiring the callbacks above into kqueue.  The leading
 * 1 initializes the first filterops field (presumably f_isfd — confirm
 * against sys/event.h for this release).
 */
static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;		/* UMA zone all sockets are allocated from */
so_gen_t so_gencnt;		/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

/* kern.ipc.somaxconn: default cap applied to listen(2) backlogs. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
/* Open-socket count; updated under so_global_mtx in soalloc()/sodealloc(). */
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */
113
114 /*
115 * accept_mtx locks down per-socket fields relating to accept queues. See
116 * socketvar.h for an annotation of the protected fields of struct socket.
117 */
118 struct mtx accept_mtx;
119 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
120
121 /*
122 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
123 * so_gencnt field.
124 *
125 * XXXRW: These variables might be better manipulated using atomic operations
126 * for improved efficiency.
127 */
128 static struct mtx so_global_mtx;
129 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
130
131 /*
132 * Socket operation routines.
133 * These routines are called by the routines in
134 * sys_socket.c or from a system process, and
135 * implement the semantics of socket operations by
136 * switching out to the protocol specific routines.
137 */
138
139 /*
140 * Get a socket structure from our zone, and initialize it.
141 * Note that it would probably be better to allocate socket
142 * and PCB at the same time, but I'm not convinced that all
143 * the protocols can be easily modified to do this.
144 *
145 * soalloc() returns a socket with a ref count of 0.
146 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;
#ifdef MAC
	int error;
#endif

	/* M_ZERO ensures every field of the new socket starts cleared. */
	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		/* Attach a MAC label; on failure, undo the allocation. */
		error = mac_init_socket(so, mflags);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		/*
		 * Both the generation count and the global open-socket
		 * count are protected by so_global_mtx.
		 */
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return so;
}
176
177 /*
178 * socreate returns a socket with a ref count of 1. The socket should be
179 * closed with soclose().
180 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;		/* protocol family (PF_*) */
	struct socket **aso;	/* out: new socket, on success only */
	int type;		/* SOCK_STREAM, SOCK_DGRAM, ... */
	int proto;		/* specific protocol, or 0 for the default */
	struct ucred *cred;	/* credential charged for the socket */
	struct thread *td;	/* creating thread, passed to pru_attach */
{
	struct protosw *prp;
	struct socket *so;
	int error;

	/* A non-zero proto selects an exact protocol; 0 picks by type. */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
		return (EPROTONOSUPPORT);

	/*
	 * Jailed processes may optionally be restricted to local, inet,
	 * and routing sockets only.
	 */
	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	/* soalloc() returned refcount 0; take the caller's reference. */
	soref(so);
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		/*
		 * Attach failed: mark the socket as having no fd reference
		 * and drop our reference, which frees it via sofree().  The
		 * accept lock is taken first to satisfy sorele()'s lock
		 * ordering requirements.
		 */
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}
239
240 int
241 sobind(so, nam, td)
242 struct socket *so;
243 struct sockaddr *nam;
244 struct thread *td;
245 {
246
247 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
248 }
249
void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	/* Bump the generation count one last time for this slot. */
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	/* Return the socket buffer space to the owner's per-uid accounting. */
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove acccept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	/*
	 * XXXRW: Seems like a shame to grab the mutex again down here, but
	 * we don't want to decrement the socket count until after we free
	 * the socket, and we can't increment the gencnt on the socket after
	 * we free, it so...
	 */
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}
287
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;	/* requested queue limit; clamped to somaxconn */
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	/*
	 * Mark the socket as accepting connections only if no completed
	 * connections are already queued.
	 */
	if (TAILQ_EMPTY(&so->so_comp)) {
		SOCK_LOCK(so);
		so->so_options |= SO_ACCEPTCONN;
		SOCK_UNLOCK(so);
	}
	/* Negative or oversized backlogs fall back to the sysctl cap. */
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}
319
320 /*
321 * Attempt to free a socket. This should really be sotryfree().
322 *
323 * We free the socket if the protocol is no longer interested in the socket,
324 * there's no file descriptor reference, and the refcount is 0. While the
325 * calling macro sotryfree() tests the refcount, sofree() has to test it
326 * again as it's possible to race with an accept()ing thread if the socket is
327 * in an listen queue of a listen socket, as being in the listen queue
328 * doesn't elevate the reference count. sofree() acquires the accept mutex
329 * early for this test in order to avoid that race.
330 */
void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	/* Caller holds both the accept mutex and the socket lock. */
	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	/*
	 * Only free when the protocol has detached, the file descriptor
	 * reference is gone, and no other references remain.
	 */
	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
	    so->so_count != 0) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		/* A queued socket is on exactly one of the two queues. */
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible draining the completed
		 * connection queue and freeing those sockets, so
		 * we just return here if this socket is currently
		 * on the completed connection queue. Otherwise,
		 * accept(2) may hang after select(2) has indicating
		 * that a listening socket was ready. If it's an
		 * incomplete connection, we remove it from the queue
		 * and free it; otherwise, it won't be released until
		 * the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			SOCK_UNLOCK(so);
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	/*
	 * Shut down and release the send buffer.  SB_NOINTR keeps the
	 * sblock() below from being interrupted by signals.
	 */
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups. Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	/* Flush the receive side, tear down kqueue state, and free. */
	sorflush(so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}
401
402 /*
403 * Close a socket on last file table reference removal.
404 * Initiate disconnect if connected.
405 * Free socket when disconnect complete.
406 *
407 * This function will sorele() the socket. Note that soclose() may be
408 * called prior to the ref count reaching zero. The actual socket
409 * structure will not be freed until the ref count reaches zero.
410 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		/*
		 * Drain both accept queues, aborting each queued
		 * connection.  The accept mutex is dropped around each
		 * soabort() call, since soabort() must be called without
		 * socket locks held (see its header comment), so the
		 * queue heads are re-read each iteration.
		 */
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket mid-disconnect: don't wait. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * Wait for the disconnect to complete, up to the
			 * configured linger time (so_linger seconds).
			 */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	/* Detach the protocol; preserve any earlier error over its result. */
	if (so->so_pcb != NULL) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	/*
	 * Drop the file descriptor reference; sorele() frees the socket
	 * once the reference count reaches zero.
	 */
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}
477
478 /*
479 * soabort() must not be called with any socket locks held, as it calls
480 * into the protocol, which will call back into the socket code causing
481 * it to acquire additional socket locks that may cause recursion or lock
482 * order reversals.
483 */
484 int
485 soabort(so)
486 struct socket *so;
487 {
488 int error;
489
490 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
491 if (error) {
492 ACCEPT_LOCK();
493 SOCK_LOCK(so);
494 sotryfree(so); /* note: does not decrement the ref count */
495 return error;
496 }
497 return (0);
498 }
499
500 int
501 soaccept(so, nam)
502 struct socket *so;
503 struct sockaddr **nam;
504 {
505 int error;
506
507 SOCK_LOCK(so);
508 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
509 so->so_state &= ~SS_NOFDREF;
510 SOCK_UNLOCK(so);
511 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
512 return (error);
513 }
514
int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	/* A listening socket cannot initiate a connection. */
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 *
	 * NOTE(review): the condition below assigns into 'error' as a
	 * side effect; if sodisconnect() fails, that error is overwritten
	 * with EISCONN on the next line.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	return (error);
}
539
540 int
541 soconnect2(so1, so2)
542 struct socket *so1;
543 struct socket *so2;
544 {
545
546 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
547 }
548
549 int
550 sodisconnect(so)
551 struct socket *so;
552 {
553 int error;
554
555 if ((so->so_state & SS_ISCONNECTED) == 0)
556 return (ENOTCONN);
557 if (so->so_state & SS_ISDISCONNECTING)
558 return (EALREADY);
559 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
560 return (error);
561 }
562
/* Sleep-flag selector for sblock(): don't sleep when MSG_DONTWAIT is set. */
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not). Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
/* Counters recording how often zero-copy send prerequisites were met. */
struct so_zerocopy_stats{
	int size_ok;	/* resid, space, and iov were all >= PAGE_SIZE */
	int align_ok;	/* user buffer was page-aligned */
	int found_ifp;	/* not updated in this file's visible code */
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/
596
int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;	/* destination, or NULL on connected socket */
	struct uio *uio;	/* user data; NULL if 'top' supplies the data */
	struct mbuf *top;	/* prepackaged mbuf chain, or NULL */
	struct mbuf *control;	/* control (ancillary) data, or NULL */
	int flags;		/* MSG_* flags */
	struct thread *td;
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len = 0, resid;
	int clen = 0, error, dontroute;
	/* 'atomic' means the record must be handed to the protocol whole. */
	int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif /* ZERO_COPY_SOCKETS */

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid. On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;
/* Bail to the release label with the given errno. */
#define snderr(errno) { error = (errno); goto release; }

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	/* Serialize against other senders on this socket buffer. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* OOB data is allowed to slightly overcommit the buffer. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Not enough room now: either fail or wait for space. */
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		/* Drop the buffer lock while copying in user data. */
		SOCKBUF_UNLOCK(&so->so_snd);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
#ifdef ZERO_COPY_SOCKETS
				cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
				if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
					/* First mbuf of a chain gets a pkthdr. */
					if (top == NULL) {
						MGETHDR(m, M_TRYWAIT, MT_DATA);
						if (m == NULL) {
							error = ENOBUFS;
							SOCKBUF_LOCK(&so->so_snd);
							goto release;
						}
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;
					} else {
						MGET(m, M_TRYWAIT, MT_DATA);
						if (m == NULL) {
							error = ENOBUFS;
							SOCKBUF_LOCK(&so->so_snd);
							goto release;
						}
					}
					/*
					 * Zero-copy is attempted only for
					 * page-sized, page-aligned chunks.
					 */
					if (so_zero_copy_send &&
					    resid>=PAGE_SIZE &&
					    space>=PAGE_SIZE &&
					    uio->uio_iov->iov_len>=PAGE_SIZE) {
						so_zerocp_stats.size_ok++;
						if (!((vm_offset_t)
						    uio->uio_iov->iov_base & PAGE_MASK)){
							so_zerocp_stats.align_ok++;
							cow_send = socow_setup(m, uio);
						}
					}
					/* Fall back to an ordinary cluster copy. */
					if (!cow_send) {
						MCLGET(m, M_TRYWAIT);
						if ((m->m_flags & M_EXT) == 0) {
							m_free(m);
							m = NULL;
						} else {
							len = min(min(MCLBYTES, resid), space);
						}
					} else
						len = PAGE_SIZE;
#else /* ZERO_COPY_SOCKETS */
					if (top == NULL) {
						m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;
					} else
						m = m_getcl(M_TRYWAIT, MT_DATA, 0);
					len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
				} else {
					if (top == NULL) {
						m = m_gethdr(M_TRYWAIT, MT_DATA);
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;

						len = min(min(MHLEN, resid), space);
						/*
						 * For datagram protocols, leave room
						 * for protocol headers in first mbuf.
						 */
						if (atomic && m && len < MHLEN)
							MH_ALIGN(m, len);
					} else {
						m = m_get(M_TRYWAIT, MT_DATA);
						len = min(min(MLEN, resid), space);
					}
				}
				if (m == NULL) {
					error = ENOBUFS;
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}

				space -= len;
#ifdef ZERO_COPY_SOCKETS
				/* COW setup already mapped the data; no copy. */
				if (cow_send)
					error = 0;
				else
#endif /* ZERO_COPY_SOCKETS */
				error = uiomove(mtod(m, void *), (int)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error) {
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date. We could have recieved
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc. We could
			 * probably recheck again inside the locking protection
			 * here, but there are probably other places that this
			 * also happens. We must rethink this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/* Ownership of top/control passed to the protocol. */
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	/* Free any data/control mbufs not consumed by the protocol. */
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
859
860 /*
861 * The part of soreceive() that implements reading non-inline out-of-band
862 * data from a socket. For more complete comments, see soreceive(), from
863 * which this code originated.
864 *
865 * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreiceve(),
866 * is unable to return an mbuf chain to the caller.
867 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;	/* destination for the out-of-band data */
	int flags;		/* must include MSG_OOB; MSG_PEEK honored */
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	/* The protocol fills this mbuf with the out-of-band data. */
	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	/* Copy each returned mbuf out to userland until chain or uio runs out. */
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			vm_page_t pg;
			int disposable;

			/* EXT_DISPOSABLE storage may be donated to the pmap. */
			if ((m->m_flags & M_EXT)
			    && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
			if (uio->uio_offset == -1)
				uio->uio_offset =IDX_TO_OFF(pg->pindex);

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len),
			    uio, pg->object,
			    disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}
917
918 /*
919 * Following replacement or removal of the first mbuf on the first mbuf chain
920 * of a socket buffer, push necessary state changes back into the socket
921 * buffer so that other consumers see the values consistently. 'nextrecord'
922 * is the callers locally stored value of the original value of
923 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
924 * NOTE: 'nextrecord' may be NULL.
925 */
926 static __inline void
927 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
928 {
929
930 SOCKBUF_LOCK_ASSERT(sb);
931 /*
932 * First, update for the new value of nextrecord. If necessary, make
933 * it the first record.
934 */
935 if (sb->sb_mb != NULL)
936 sb->sb_mb->m_nextpkt = nextrecord;
937 else
938 sb->sb_mb = nextrecord;
939
940 /*
941 * Now update any dependent socket buffer fields to reflect the new
942 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
943 * addition of a second clause that takes care of the case where
944 * sb_mb has been updated, but remains the last record.
945 */
946 if (sb->sb_mb == NULL) {
947 sb->sb_mbtail = NULL;
948 sb->sb_lastrecord = NULL;
949 } else if (sb->sb_mb->m_nextpkt == NULL)
950 sb->sb_lastrecord = sb->sb_mb;
951 }
952
953
954 /*
955 * Implement receive operations on a socket.
956 * We depend on the way that records are added to the sockbuf
957 * by sbappend*. In particular, each record (mbufs linked through m_next)
958 * must begin with an address if the protocol so specifies,
959 * followed by an optional mbuf or mbufs containing ancillary data,
960 * and then zero or more mbufs of data.
961 * In order to avoid blocking network interrupts for the entire time here,
962 * we splx() while doing the actual copy to user space.
963 * Although the sockbuf is locked, new data may still be appended,
964 * and thus we must maintain consistency of the sockbuf during that time.
965 *
966 * The caller may receive the data as a single mbuf chain by supplying
967 * an mbuf **mp0 for use in returning the chain. The uio is then used
968 * only for the count in uio_resid.
969 */
970 int
971 soreceive(so, psa, uio, mp0, controlp, flagsp)
972 struct socket *so;
973 struct sockaddr **psa;
974 struct uio *uio;
975 struct mbuf **mp0;
976 struct mbuf **controlp;
977 int *flagsp;
978 {
979 struct mbuf *m, **mp;
980 int flags, len, error, offset;
981 struct protosw *pr = so->so_proto;
982 struct mbuf *nextrecord;
983 int moff, type = 0;
984 int orig_resid = uio->uio_resid;
985
986 mp = mp0;
987 if (psa != NULL)
988 *psa = NULL;
989 if (controlp != NULL)
990 *controlp = NULL;
991 if (flagsp != NULL)
992 flags = *flagsp &~ MSG_EOR;
993 else
994 flags = 0;
995 if (flags & MSG_OOB)
996 return (soreceive_rcvoob(so, uio, flags));
997 if (mp != NULL)
998 *mp = NULL;
999 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
1000 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1001
1002 SOCKBUF_LOCK(&so->so_rcv);
1003 restart:
1004 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1005 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1006 if (error)
1007 goto out;
1008
1009 m = so->so_rcv.sb_mb;
1010 /*
1011 * If we have less data than requested, block awaiting more
1012 * (subject to any timeout) if:
1013 * 1. the current count is less than the low water mark, or
1014 * 2. MSG_WAITALL is set, and it is possible to do the entire
1015 * receive operation at once if we block (resid <= hiwat).
1016 * 3. MSG_DONTWAIT is not set
1017 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1018 * we have to do the receive in sections, and thus risk returning
1019 * a short count if a timeout or signal occurs after we start.
1020 */
1021 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1022 so->so_rcv.sb_cc < uio->uio_resid) &&
1023 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1024 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1025 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1026 KASSERT(m != NULL || !so->so_rcv.sb_cc,
1027 ("receive: m == %p so->so_rcv.sb_cc == %u",
1028 m, so->so_rcv.sb_cc));
1029 if (so->so_error) {
1030 if (m != NULL)
1031 goto dontblock;
1032 error = so->so_error;
1033 if ((flags & MSG_PEEK) == 0)
1034 so->so_error = 0;
1035 goto release;
1036 }
1037 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1038 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1039 if (m)
1040 goto dontblock;
1041 else
1042 goto release;
1043 }
1044 for (; m != NULL; m = m->m_next)
1045 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1046 m = so->so_rcv.sb_mb;
1047 goto dontblock;
1048 }
1049 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1050 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1051 error = ENOTCONN;
1052 goto release;
1053 }
1054 if (uio->uio_resid == 0)
1055 goto release;
1056 if ((so->so_state & SS_NBIO) ||
1057 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1058 error = EWOULDBLOCK;
1059 goto release;
1060 }
1061 SBLASTRECORDCHK(&so->so_rcv);
1062 SBLASTMBUFCHK(&so->so_rcv);
1063 sbunlock(&so->so_rcv);
1064 error = sbwait(&so->so_rcv);
1065 if (error)
1066 goto out;
1067 goto restart;
1068 }
1069 dontblock:
1070 /*
1071 * From this point onward, we maintain 'nextrecord' as a cache of the
1072 * pointer to the next record in the socket buffer. We must keep the
1073 * various socket buffer pointers and local stack versions of the
1074 * pointers in sync, pushing out modifications before dropping the
1075 * socket buffer mutex, and re-reading them when picking it up.
1076 *
1077 * Otherwise, we will race with the network stack appending new data
1078 * or records onto the socket buffer by using inconsistent/stale
1079 * versions of the field, possibly resulting in socket buffer
1080 * corruption.
1081 *
1082 * By holding the high-level sblock(), we prevent simultaneous
1083 * readers from pulling off the front of the socket buffer.
1084 */
1085 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1086 if (uio->uio_td)
1087 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
1088 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1089 SBLASTRECORDCHK(&so->so_rcv);
1090 SBLASTMBUFCHK(&so->so_rcv);
1091 nextrecord = m->m_nextpkt;
1092 if (pr->pr_flags & PR_ADDR) {
1093 KASSERT(m->m_type == MT_SONAME,
1094 ("m->m_type == %d", m->m_type));
1095 orig_resid = 0;
1096 if (psa != NULL)
1097 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
1098 M_NOWAIT);
1099 if (flags & MSG_PEEK) {
1100 m = m->m_next;
1101 } else {
1102 sbfree(&so->so_rcv, m);
1103 so->so_rcv.sb_mb = m_free(m);
1104 m = so->so_rcv.sb_mb;
1105 sockbuf_pushsync(&so->so_rcv, nextrecord);
1106 }
1107 }
1108
1109 /*
1110 * Process one or more MT_CONTROL mbufs present before any data mbufs
1111 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
1112 * just copy the data; if !MSG_PEEK, we call into the protocol to
1113 * perform externalization (or freeing if controlp == NULL).
1114 */
1115 if (m != NULL && m->m_type == MT_CONTROL) {
1116 struct mbuf *cm = NULL, *cmn;
1117 struct mbuf **cme = &cm;
1118
1119 do {
1120 if (flags & MSG_PEEK) {
1121 if (controlp != NULL) {
1122 *controlp = m_copy(m, 0, m->m_len);
1123 controlp = &(*controlp)->m_next;
1124 }
1125 m = m->m_next;
1126 } else {
1127 sbfree(&so->so_rcv, m);
1128 so->so_rcv.sb_mb = m->m_next;
1129 m->m_next = NULL;
1130 *cme = m;
1131 cme = &(*cme)->m_next;
1132 m = so->so_rcv.sb_mb;
1133 }
1134 } while (m != NULL && m->m_type == MT_CONTROL);
1135 if ((flags & MSG_PEEK) == 0)
1136 sockbuf_pushsync(&so->so_rcv, nextrecord);
1137 while (cm != NULL) {
1138 cmn = cm->m_next;
1139 cm->m_next = NULL;
1140 if (pr->pr_domain->dom_externalize != NULL) {
1141 SOCKBUF_UNLOCK(&so->so_rcv);
1142 error = (*pr->pr_domain->dom_externalize)
1143 (cm, controlp);
1144 SOCKBUF_LOCK(&so->so_rcv);
1145 } else if (controlp != NULL)
1146 *controlp = cm;
1147 else
1148 m_freem(cm);
1149 if (controlp != NULL) {
1150 orig_resid = 0;
1151 while (*controlp != NULL)
1152 controlp = &(*controlp)->m_next;
1153 }
1154 cm = cmn;
1155 }
1156 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1157 orig_resid = 0;
1158 }
1159 if (m != NULL) {
1160 if ((flags & MSG_PEEK) == 0) {
1161 KASSERT(m->m_nextpkt == nextrecord,
1162 ("soreceive: post-control, nextrecord !sync"));
1163 if (nextrecord == NULL) {
1164 KASSERT(so->so_rcv.sb_mb == m,
1165 ("soreceive: post-control, sb_mb!=m"));
1166 KASSERT(so->so_rcv.sb_lastrecord == m,
1167 ("soreceive: post-control, lastrecord!=m"));
1168 }
1169 }
1170 type = m->m_type;
1171 if (type == MT_OOBDATA)
1172 flags |= MSG_OOB;
1173 } else {
1174 if ((flags & MSG_PEEK) == 0) {
1175 KASSERT(so->so_rcv.sb_mb == nextrecord,
1176 ("soreceive: sb_mb != nextrecord"));
1177 if (so->so_rcv.sb_mb == NULL) {
1178 KASSERT(so->so_rcv.sb_lastrecord == NULL,
1179 ("soreceive: sb_lastercord != NULL"));
1180 }
1181 }
1182 }
1183 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1184 SBLASTRECORDCHK(&so->so_rcv);
1185 SBLASTMBUFCHK(&so->so_rcv);
1186
1187 /*
1188 * Now continue to read any data mbufs off of the head of the socket
1189 * buffer until the read request is satisfied. Note that 'type' is
1190 * used to store the type of any mbuf reads that have happened so far
1191 * such that soreceive() can stop reading if the type changes, which
1192 * causes soreceive() to return only one of regular data and inline
1193 * out-of-band data in a single socket receive operation.
1194 */
1195 moff = 0;
1196 offset = 0;
1197 while (m != NULL && uio->uio_resid > 0 && error == 0) {
1198 /*
1199 * If the type of mbuf has changed since the last mbuf
1200 * examined ('type'), end the receive operation.
1201 */
1202 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1203 if (m->m_type == MT_OOBDATA) {
1204 if (type != MT_OOBDATA)
1205 break;
1206 } else if (type == MT_OOBDATA)
1207 break;
1208 else
1209 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1210 ("m->m_type == %d", m->m_type));
1211 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
1212 len = uio->uio_resid;
1213 if (so->so_oobmark && len > so->so_oobmark - offset)
1214 len = so->so_oobmark - offset;
1215 if (len > m->m_len - moff)
1216 len = m->m_len - moff;
1217 /*
1218 * If mp is set, just pass back the mbufs.
1219 * Otherwise copy them out via the uio, then free.
1220 * Sockbuf must be consistent here (points to current mbuf,
1221 * it points to next record) when we drop priority;
1222 * we must note any additions to the sockbuf when we
1223 * block interrupts again.
1224 */
1225 if (mp == NULL) {
1226 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1227 SBLASTRECORDCHK(&so->so_rcv);
1228 SBLASTMBUFCHK(&so->so_rcv);
1229 SOCKBUF_UNLOCK(&so->so_rcv);
1230 #ifdef ZERO_COPY_SOCKETS
1231 if (so_zero_copy_receive) {
1232 vm_page_t pg;
1233 int disposable;
1234
1235 if ((m->m_flags & M_EXT)
1236 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1237 disposable = 1;
1238 else
1239 disposable = 0;
1240
1241 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
1242 moff));
1243
1244 if (uio->uio_offset == -1)
1245 uio->uio_offset =IDX_TO_OFF(pg->pindex);
1246
1247 error = uiomoveco(mtod(m, char *) + moff,
1248 (int)len, uio,pg->object,
1249 disposable);
1250 } else
1251 #endif /* ZERO_COPY_SOCKETS */
1252 error = uiomove(mtod(m, char *) + moff, (int)len, uio);
1253 SOCKBUF_LOCK(&so->so_rcv);
1254 if (error)
1255 goto release;
1256 } else
1257 uio->uio_resid -= len;
1258 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1259 if (len == m->m_len - moff) {
1260 if (m->m_flags & M_EOR)
1261 flags |= MSG_EOR;
1262 if (flags & MSG_PEEK) {
1263 m = m->m_next;
1264 moff = 0;
1265 } else {
1266 nextrecord = m->m_nextpkt;
1267 sbfree(&so->so_rcv, m);
1268 if (mp != NULL) {
1269 *mp = m;
1270 mp = &m->m_next;
1271 so->so_rcv.sb_mb = m = m->m_next;
1272 *mp = NULL;
1273 } else {
1274 so->so_rcv.sb_mb = m_free(m);
1275 m = so->so_rcv.sb_mb;
1276 }
1277 if (m != NULL) {
1278 m->m_nextpkt = nextrecord;
1279 if (nextrecord == NULL)
1280 so->so_rcv.sb_lastrecord = m;
1281 } else {
1282 so->so_rcv.sb_mb = nextrecord;
1283 SB_EMPTY_FIXUP(&so->so_rcv);
1284 }
1285 SBLASTRECORDCHK(&so->so_rcv);
1286 SBLASTMBUFCHK(&so->so_rcv);
1287 }
1288 } else {
1289 if (flags & MSG_PEEK)
1290 moff += len;
1291 else {
1292 if (mp != NULL) {
1293 SOCKBUF_UNLOCK(&so->so_rcv);
1294 *mp = m_copym(m, 0, len, M_TRYWAIT);
1295 SOCKBUF_LOCK(&so->so_rcv);
1296 }
1297 m->m_data += len;
1298 m->m_len -= len;
1299 so->so_rcv.sb_cc -= len;
1300 }
1301 }
1302 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1303 if (so->so_oobmark) {
1304 if ((flags & MSG_PEEK) == 0) {
1305 so->so_oobmark -= len;
1306 if (so->so_oobmark == 0) {
1307 so->so_rcv.sb_state |= SBS_RCVATMARK;
1308 break;
1309 }
1310 } else {
1311 offset += len;
1312 if (offset == so->so_oobmark)
1313 break;
1314 }
1315 }
1316 if (flags & MSG_EOR)
1317 break;
1318 /*
1319 * If the MSG_WAITALL flag is set (for non-atomic socket),
1320 * we must not quit until "uio->uio_resid == 0" or an error
1321 * termination. If a signal/timeout occurs, return
1322 * with a short count but without error.
1323 * Keep sockbuf locked against other readers.
1324 */
1325 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1326 !sosendallatonce(so) && nextrecord == NULL) {
1327 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1328 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1329 break;
1330 /*
1331 * Notify the protocol that some data has been
1332 * drained before blocking.
1333 */
1334 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
1335 SOCKBUF_UNLOCK(&so->so_rcv);
1336 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1337 SOCKBUF_LOCK(&so->so_rcv);
1338 }
1339 SBLASTRECORDCHK(&so->so_rcv);
1340 SBLASTMBUFCHK(&so->so_rcv);
1341 error = sbwait(&so->so_rcv);
1342 if (error)
1343 goto release;
1344 m = so->so_rcv.sb_mb;
1345 if (m != NULL)
1346 nextrecord = m->m_nextpkt;
1347 }
1348 }
1349
1350 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1351 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1352 flags |= MSG_TRUNC;
1353 if ((flags & MSG_PEEK) == 0)
1354 (void) sbdroprecord_locked(&so->so_rcv);
1355 }
1356 if ((flags & MSG_PEEK) == 0) {
1357 if (m == NULL) {
1358 /*
1359 * First part is an inline SB_EMPTY_FIXUP(). Second
1360 * part makes sure sb_lastrecord is up-to-date if
1361 * there is still data in the socket buffer.
1362 */
1363 so->so_rcv.sb_mb = nextrecord;
1364 if (so->so_rcv.sb_mb == NULL) {
1365 so->so_rcv.sb_mbtail = NULL;
1366 so->so_rcv.sb_lastrecord = NULL;
1367 } else if (nextrecord->m_nextpkt == NULL)
1368 so->so_rcv.sb_lastrecord = nextrecord;
1369 }
1370 SBLASTRECORDCHK(&so->so_rcv);
1371 SBLASTMBUFCHK(&so->so_rcv);
1372 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
1373 SOCKBUF_UNLOCK(&so->so_rcv);
1374 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1375 SOCKBUF_LOCK(&so->so_rcv);
1376 }
1377 }
1378 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1379 if (orig_resid == uio->uio_resid && orig_resid &&
1380 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1381 sbunlock(&so->so_rcv);
1382 goto restart;
1383 }
1384
1385 if (flagsp != NULL)
1386 *flagsp |= flags;
1387 release:
1388 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1389 sbunlock(&so->so_rcv);
1390 out:
1391 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1392 SOCKBUF_UNLOCK(&so->so_rcv);
1393 return (error);
1394 }
1395
1396 int
1397 soshutdown(so, how)
1398 struct socket *so;
1399 int how;
1400 {
1401 struct protosw *pr = so->so_proto;
1402
1403 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1404 return (EINVAL);
1405
1406 if (how != SHUT_WR)
1407 sorflush(so);
1408 if (how != SHUT_RD)
1409 return ((*pr->pr_usrreqs->pru_shutdown)(so));
1410 return (0);
1411 }
1412
/*
 * Flush a socket's receive buffer: mark the socket as unable to receive
 * more data, detach the buffered mbuf chain into a local copy of the
 * sockbuf, and release that copy (invoking dom_dispose() on any
 * rights-bearing data first).  Callers use this on shutdown(SHUT_RD)
 * and close paths.
 */
void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  The existing code made a copy of the
	 * socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	/* SB_NOINTR: don't let a signal interrupt the sblock() below. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave
	 * selinfo and mutex data unchanged.  'asb' takes over the mbuf
	 * chain; 'sb' is reset to an empty state.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	/* Give the copy its own mutex so dispose/release can lock it. */
	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}
1457
1458 #ifdef INET
1459 static int
1460 do_setopt_accept_filter(so, sopt)
1461 struct socket *so;
1462 struct sockopt *sopt;
1463 {
1464 struct accept_filter_arg *afap;
1465 struct accept_filter *afp;
1466 struct so_accf *newaf;
1467 int error = 0;
1468
1469 newaf = NULL;
1470 afap = NULL;
1471
1472 /*
1473 * XXXRW: Configuring accept filters should be an atomic test-and-set
1474 * operation to prevent races during setup and attach. There may be
1475 * more general issues of racing and ordering here that are not yet
1476 * addressed by locking.
1477 */
1478 /* do not set/remove accept filters on non listen sockets */
1479 SOCK_LOCK(so);
1480 if ((so->so_options & SO_ACCEPTCONN) == 0) {
1481 SOCK_UNLOCK(so);
1482 return (EINVAL);
1483 }
1484
1485 /* removing the filter */
1486 if (sopt == NULL) {
1487 if (so->so_accf != NULL) {
1488 struct so_accf *af = so->so_accf;
1489 if (af->so_accept_filter != NULL &&
1490 af->so_accept_filter->accf_destroy != NULL) {
1491 af->so_accept_filter->accf_destroy(so);
1492 }
1493 if (af->so_accept_filter_str != NULL) {
1494 FREE(af->so_accept_filter_str, M_ACCF);
1495 }
1496 FREE(af, M_ACCF);
1497 so->so_accf = NULL;
1498 }
1499 so->so_options &= ~SO_ACCEPTFILTER;
1500 SOCK_UNLOCK(so);
1501 return (0);
1502 }
1503 SOCK_UNLOCK(so);
1504
1505 /*-
1506 * Adding a filter.
1507 *
1508 * Do memory allocation, copyin, and filter lookup now while we're
1509 * not holding any locks. Avoids sleeping with a mutex, as well as
1510 * introducing a lock order between accept filter locks and socket
1511 * locks here.
1512 */
1513 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP,
1514 M_WAITOK);
1515 /* don't put large objects on the kernel stack */
1516 error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
1517 afap->af_name[sizeof(afap->af_name)-1] = '\0';
1518 afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
1519 if (error) {
1520 FREE(afap, M_TEMP);
1521 return (error);
1522 }
1523 afp = accept_filt_get(afap->af_name);
1524 if (afp == NULL) {
1525 FREE(afap, M_TEMP);
1526 return (ENOENT);
1527 }
1528
1529 /*
1530 * Allocate the new accept filter instance storage. We may have to
1531 * free it again later if we fail to attach it. If attached
1532 * properly, 'newaf' is NULLed to avoid a free() while in use.
1533 */
1534 MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK |
1535 M_ZERO);
1536 if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
1537 int len = strlen(afap->af_name) + 1;
1538 MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF,
1539 M_WAITOK);
1540 strcpy(newaf->so_accept_filter_str, afap->af_name);
1541 }
1542
1543 SOCK_LOCK(so);
1544 /* must remove previous filter first */
1545 if (so->so_accf != NULL) {
1546 error = EINVAL;
1547 goto out;
1548 }
1549 /*
1550 * Invoke the accf_create() method of the filter if required.
1551 * XXXRW: the socket mutex is held over this call, so the create
1552 * method cannot block. This may be something we have to change, but
1553 * it would require addressing possible races.
1554 */
1555 if (afp->accf_create != NULL) {
1556 newaf->so_accept_filter_arg =
1557 afp->accf_create(so, afap->af_arg);
1558 if (newaf->so_accept_filter_arg == NULL) {
1559 error = EINVAL;
1560 goto out;
1561 }
1562 }
1563 newaf->so_accept_filter = afp;
1564 so->so_accf = newaf;
1565 so->so_options |= SO_ACCEPTFILTER;
1566 newaf = NULL;
1567 out:
1568 SOCK_UNLOCK(so);
1569 if (newaf != NULL) {
1570 if (newaf->so_accept_filter_str != NULL)
1571 FREE(newaf->so_accept_filter_str, M_ACCF);
1572 FREE(newaf, M_ACCF);
1573 }
1574 if (afap != NULL)
1575 FREE(afap, M_TEMP);
1576 return (error);
1577 }
1578 #endif /* INET */
1579
1580 /*
1581 * Perhaps this routine, and sooptcopyout(), below, ought to come in
1582 * an additional variant to handle the case where the option value needs
1583 * to be some kind of integer, but not a specific size.
1584 * In addition to their use here, these functions are also called by the
1585 * protocol-level pr_ctloutput() routines.
1586 */
1587 int
1588 sooptcopyin(sopt, buf, len, minlen)
1589 struct sockopt *sopt;
1590 void *buf;
1591 size_t len;
1592 size_t minlen;
1593 {
1594 size_t valsize;
1595
1596 /*
1597 * If the user gives us more than we wanted, we ignore it,
1598 * but if we don't get the minimum length the caller
1599 * wants, we return EINVAL. On success, sopt->sopt_valsize
1600 * is set to however much we actually retrieved.
1601 */
1602 if ((valsize = sopt->sopt_valsize) < minlen)
1603 return EINVAL;
1604 if (valsize > len)
1605 sopt->sopt_valsize = valsize = len;
1606
1607 if (sopt->sopt_td != NULL)
1608 return (copyin(sopt->sopt_val, buf, valsize));
1609
1610 bcopy(sopt->sopt_val, buf, valsize);
1611 return 0;
1612 }
1613
1614 /*
1615 * Kernel version of setsockopt(2)/
1616 * XXX: optlen is size_t, not socklen_t
1617 */
1618 int
1619 so_setsockopt(struct socket *so, int level, int optname, void *optval,
1620 size_t optlen)
1621 {
1622 struct sockopt sopt;
1623
1624 sopt.sopt_level = level;
1625 sopt.sopt_name = optname;
1626 sopt.sopt_dir = SOPT_SET;
1627 sopt.sopt_val = optval;
1628 sopt.sopt_valsize = optlen;
1629 sopt.sopt_td = NULL;
1630 return (sosetopt(so, &sopt));
1631 }
1632
/*
 * Set a socket option.  Options at levels other than SOL_SOCKET are
 * passed straight to the protocol's pr_ctloutput(); SOL_SOCKET options
 * are handled here.  On success with a SOL_SOCKET option, the protocol
 * is additionally notified via pr_ctloutput() so it can react to the
 * change (its return value is deliberately ignored).
 */
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		/* Non-socket-level option: protocol handles it entirely. */
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			/* Update linger time and on/off flag atomically. */
			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			/*
			 * Simple boolean options: the option name doubles
			 * as the bit to set or clear in so_options.
			 */
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				/* Resize the buffer; may fail on limits. */
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/*
			 * Convert the timeval to ticks, rejecting values
			 * that would overflow the short tick counter.
			 */
			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			/* Round a nonzero sub-tick timeout up to one tick. */
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		/*
		 * Notify the protocol of a successful SOL_SOCKET change;
		 * its return value is intentionally discarded.
		 */
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}
1805
1806 /* Helper routine for getsockopt */
1807 int
1808 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
1809 {
1810 int error;
1811 size_t valsize;
1812
1813 error = 0;
1814
1815 /*
1816 * Documented get behavior is that we always return a value,
1817 * possibly truncated to fit in the user's buffer.
1818 * Traditional behavior is that we always tell the user
1819 * precisely how much we copied, rather than something useful
1820 * like the total amount we had available for her.
1821 * Note that this interface is not idempotent; the entire answer must
1822 * generated ahead of time.
1823 */
1824 valsize = min(len, sopt->sopt_valsize);
1825 sopt->sopt_valsize = valsize;
1826 if (sopt->sopt_val != NULL) {
1827 if (sopt->sopt_td != NULL)
1828 error = copyout(buf, sopt->sopt_val, valsize);
1829 else
1830 bcopy(buf, sopt->sopt_val, valsize);
1831 }
1832 return error;
1833 }
1834
/*
 * Get a socket option.  Options at levels other than SOL_SOCKET are
 * passed to the protocol's pr_ctloutput(); SOL_SOCKET options are
 * answered here.  Integer-valued options funnel through the 'integer:'
 * label, which copies 'optval' out to the caller.
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			/* Unlocked read. */
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			/* M_ZERO: report empty strings if no filter set. */
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
			    M_TEMP, M_WAITOK | M_ZERO);
			SOCK_LOCK(so);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
			}
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;
#endif

		case SO_LINGER:
			/*
			 * XXXRW: We grab the lock here to get a consistent
			 * snapshot of both fields.  This may not really
			 * be necessary.
			 */
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			/* Boolean options: mask the bit out of so_options. */
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Shared exit for all integer-valued options. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			/* Reading the error clears it (unlocked). */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			/* Convert the stored tick count back to a timeval. */
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
1978
1979 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
1980 int
1981 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1982 {
1983 struct mbuf *m, *m_prev;
1984 int sopt_size = sopt->sopt_valsize;
1985
1986 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
1987 if (m == NULL)
1988 return ENOBUFS;
1989 if (sopt_size > MLEN) {
1990 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
1991 if ((m->m_flags & M_EXT) == 0) {
1992 m_free(m);
1993 return ENOBUFS;
1994 }
1995 m->m_len = min(MCLBYTES, sopt_size);
1996 } else {
1997 m->m_len = min(MLEN, sopt_size);
1998 }
1999 sopt_size -= m->m_len;
2000 *mp = m;
2001 m_prev = m;
2002
2003 while (sopt_size) {
2004 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2005 if (m == NULL) {
2006 m_freem(*mp);
2007 return ENOBUFS;
2008 }
2009 if (sopt_size > MLEN) {
2010 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2011 M_DONTWAIT);
2012 if ((m->m_flags & M_EXT) == 0) {
2013 m_freem(m);
2014 m_freem(*mp);
2015 return ENOBUFS;
2016 }
2017 m->m_len = min(MCLBYTES, sopt_size);
2018 } else {
2019 m->m_len = min(MLEN, sopt_size);
2020 }
2021 sopt_size -= m->m_len;
2022 m_prev->m_next = m;
2023 m_prev = m;
2024 }
2025 return 0;
2026 }
2027
2028 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2029 int
2030 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2031 {
2032 struct mbuf *m0 = m;
2033
2034 if (sopt->sopt_val == NULL)
2035 return 0;
2036 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2037 if (sopt->sopt_td != NULL) {
2038 int error;
2039
2040 error = copyin(sopt->sopt_val, mtod(m, char *),
2041 m->m_len);
2042 if (error != 0) {
2043 m_freem(m0);
2044 return(error);
2045 }
2046 } else
2047 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2048 sopt->sopt_valsize -= m->m_len;
2049 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2050 m = m->m_next;
2051 }
2052 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
2053 panic("ip6_sooptmcopyin");
2054 return 0;
2055 }
2056
2057 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2058 int
2059 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2060 {
2061 struct mbuf *m0 = m;
2062 size_t valsize = 0;
2063
2064 if (sopt->sopt_val == NULL)
2065 return 0;
2066 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2067 if (sopt->sopt_td != NULL) {
2068 int error;
2069
2070 error = copyout(mtod(m, char *), sopt->sopt_val,
2071 m->m_len);
2072 if (error != 0) {
2073 m_freem(m0);
2074 return(error);
2075 }
2076 } else
2077 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2078 sopt->sopt_valsize -= m->m_len;
2079 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2080 valsize += m->m_len;
2081 m = m->m_next;
2082 }
2083 if (m != NULL) {
2084 /* enough soopt buffer should be given from user-land */
2085 m_freem(m0);
2086 return(EINVAL);
2087 }
2088 sopt->sopt_valsize = valsize;
2089 return 0;
2090 }
2091
2092 void
2093 sohasoutofband(so)
2094 struct socket *so;
2095 {
2096 if (so->so_sigio != NULL)
2097 pgsigio(&so->so_sigio, SIGURG, 0);
2098 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2099 }
2100
/*
 * poll(2)/select(2) backend for sockets: return the subset of 'events'
 * that is currently true for the socket, or record the polling thread
 * on the relevant selinfo(s) when nothing is ready yet.
 *
 * Both sockbuf locks are held across the checks so the readiness
 * snapshot is consistent; so_snd is taken first and released last.
 */
int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	/* POLLINIGNEOF: like POLLIN but without treating EOF as readable. */
	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	/* Urgent data: pending OOB mark or positioned at the mark. */
	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		/*
		 * Nothing ready: register with the selinfo(s) matching the
		 * requested events so the thread is woken on a change.
		 */
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		     POLLRDBAND)) {
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
		}
	}

	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (revents);
}
2144
2145 int
2146 soo_kqfilter(struct file *fp, struct knote *kn)
2147 {
2148 struct socket *so = kn->kn_fp->f_data;
2149 struct sockbuf *sb;
2150
2151 switch (kn->kn_filter) {
2152 case EVFILT_READ:
2153 if (so->so_options & SO_ACCEPTCONN)
2154 kn->kn_fop = &solisten_filtops;
2155 else
2156 kn->kn_fop = &soread_filtops;
2157 sb = &so->so_rcv;
2158 break;
2159 case EVFILT_WRITE:
2160 kn->kn_fop = &sowrite_filtops;
2161 sb = &so->so_snd;
2162 break;
2163 default:
2164 return (EINVAL);
2165 }
2166
2167 SOCKBUF_LOCK(sb);
2168 knlist_add(&sb->sb_sel.si_note, kn, 1);
2169 sb->sb_flags |= SB_KNOTE;
2170 SOCKBUF_UNLOCK(sb);
2171 return (0);
2172 }
2173
2174 static void
2175 filt_sordetach(struct knote *kn)
2176 {
2177 struct socket *so = kn->kn_fp->f_data;
2178
2179 SOCKBUF_LOCK(&so->so_rcv);
2180 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
2181 if (knlist_empty(&so->so_rcv.sb_sel.si_note))
2182 so->so_rcv.sb_flags &= ~SB_KNOTE;
2183 SOCKBUF_UNLOCK(&so->so_rcv);
2184 }
2185
2186 /*ARGSUSED*/
2187 static int
2188 filt_soread(struct knote *kn, long hint)
2189 {
2190 struct socket *so;
2191
2192 so = kn->kn_fp->f_data;
2193 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2194
2195 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
2196 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2197 kn->kn_flags |= EV_EOF;
2198 kn->kn_fflags = so->so_error;
2199 return (1);
2200 } else if (so->so_error) /* temporary udp error */
2201 return (1);
2202 else if (kn->kn_sfflags & NOTE_LOWAT)
2203 return (kn->kn_data >= kn->kn_sdata);
2204 else
2205 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
2206 }
2207
2208 static void
2209 filt_sowdetach(struct knote *kn)
2210 {
2211 struct socket *so = kn->kn_fp->f_data;
2212
2213 SOCKBUF_LOCK(&so->so_snd);
2214 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
2215 if (knlist_empty(&so->so_snd.sb_sel.si_note))
2216 so->so_snd.sb_flags &= ~SB_KNOTE;
2217 SOCKBUF_UNLOCK(&so->so_snd);
2218 }
2219
2220 /*ARGSUSED*/
2221 static int
2222 filt_sowrite(struct knote *kn, long hint)
2223 {
2224 struct socket *so;
2225
2226 so = kn->kn_fp->f_data;
2227 SOCKBUF_LOCK_ASSERT(&so->so_snd);
2228 kn->kn_data = sbspace(&so->so_snd);
2229 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2230 kn->kn_flags |= EV_EOF;
2231 kn->kn_fflags = so->so_error;
2232 return (1);
2233 } else if (so->so_error) /* temporary udp error */
2234 return (1);
2235 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2236 (so->so_proto->pr_flags & PR_CONNREQUIRED))
2237 return (0);
2238 else if (kn->kn_sfflags & NOTE_LOWAT)
2239 return (kn->kn_data >= kn->kn_sdata);
2240 else
2241 return (kn->kn_data >= so->so_snd.sb_lowat);
2242 }
2243
2244 /*ARGSUSED*/
2245 static int
2246 filt_solisten(struct knote *kn, long hint)
2247 {
2248 struct socket *so = kn->kn_fp->f_data;
2249
2250 kn->kn_data = so->so_qlen;
2251 return (! TAILQ_EMPTY(&so->so_comp));
2252 }
2253
2254 int
2255 socheckuid(struct socket *so, uid_t uid)
2256 {
2257
2258 if (so == NULL)
2259 return (EPERM);
2260 if (so->so_cred->cr_uid == uid)
2261 return (0);
2262 return (EPERM);
2263 }
Cache object: 75df33231327010f0fa0be07d342c547
|