/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn(). Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn(). Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation. This is called
 * from socreate() and sonewconn(). Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called. If pru_attach() returned an error,
 * pru_detach() will not be called. Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection. Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state. This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state. This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected). This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required. Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation. This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used by the socket
 * layer to attempt to free a socket when a reference is removed. This is a
 * socket layer private interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref() and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment. For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * NOTE: With regard to VNETs the general rule is that callers do not set
 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 * and sorflush(), which are usually called from a pre-set VNET context.
 * sopoll() currently does not need a VNET context to be set.
 */
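
/*
 * Illustrative sketch (not compiled, hence the #if 0): how an in-kernel
 * consumer might use the public life cycle interfaces described above.
 * The function name example_kern_socket() is hypothetical. socreate()
 * returns the socket with a single reference, which the consumer later
 * releases with soclose().
 */
#if 0
static int
example_kern_socket(struct thread *td)
{
	struct socket *so;
	int error;

	/* Create a UDP/IPv4 socket; on success we own one reference. */
	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
	    td->td_ucred, td);
	if (error != 0)
		return (error);

	/* ... use the socket via sobind()/sosend()/soreceive() ... */

	/* Release our reference; the socket is freed once unreferenced. */
	return (soclose(so));
}
#endif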

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: releng/10.0/sys/kern/uipc_socket.c 255608 2013-09-16 06:25:54Z kib $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <sys/syslog.h>
#include <netinet/in.h>

#include <net/vnet.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/sysent.h>
#include <compat/freebsd32/freebsd32.h>
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_solisten,
};
static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};
static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};

so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

/*
 * Limit on the number of connections in the listen queue waiting
 * for accept(2).
 * NB: The original sysctl somaxconn is still available but hidden
 * to prevent confusion about the actual purpose of this number.
 */
static int somaxconn = SOMAXCONN;

static int
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = somaxconn;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	if (val < 1 || val > USHRT_MAX)
		return (EINVAL);

	somaxconn = val;
	return (0);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size");
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
    0, sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size (compat)");

static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");

/*
 * accept_mtx locks down per-socket fields relating to accept queues. See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * Initialize the socket subsystem and set up the socket
 * memory allocator.
 */
static uma_zone_t socket_zone;
int	maxsockets;

static void
socket_zone_change(void *tag)
{

	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
}

static void
socket_init(void *tag)
{

	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
	uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
	    EVENTHANDLER_PRI_FIRST);
}
SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);

/*
 * Initialize maxsockets. This SYSINIT must be run after
 * tunable_mbinit().
 */
static void
init_maxsockets(void *ignored)
{

	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, maxfiles);
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);

/*
 * Sysctl to get and set the maximum global sockets limit. Notify protocols
 * of the change so that they can update their dependent limits as required.
 */
static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
	if (error == 0 && req->newptr) {
		if (newmaxsockets > maxsockets &&
		    newmaxsockets <= maxfiles) {
			maxsockets = newmaxsockets;
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
    &maxsockets, 0, sysctl_maxsockets, "IU",
306 "Maximum number of sockets avaliable");

/*
 * Socket operation routines. These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it. Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
static struct socket *
soalloc(struct vnet *vnet)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_socket_init(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
	TAILQ_INIT(&so->so_aiojobq);
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
#ifdef VIMAGE
	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	vnet->vnet_sockcnt++;
	so->so_vnet = vnet;
#endif
	mtx_unlock(&so_global_mtx);
	return (so);
}

/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc. All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
#ifdef VIMAGE
	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet->vnet_sockcnt--;
#endif
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* Remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_socket_destroy(so);
#endif
	crfree(so->so_cred);
	sx_destroy(&so->so_snd.sb_sx);
	sx_destroy(&so->so_rcv.sb_sx);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	uma_zfree(socket_zone, so);
}

/*
 * socreate returns a socket with a ref count of 1. The socket should be
 * closed with soclose().
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL) {
		/* No support for domain. */
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		/* No support for socket type. */
		if (proto == 0 && type != 0)
			return (EPROTOTYPE);
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_INET6) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	so->so_count = 1;
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	CURVNET_SET(so->so_vnet);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	CURVNET_RESTORE();
	if (error) {
		KASSERT(so->so_count == 1, ("socreate: so_count %d",
		    so->so_count));
		so->so_count = 0;
		sodealloc(so);
		return (error);
	}
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called. If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int over;

	ACCEPT_LOCK();
	over = (head->so_qlen > 3 * head->so_qlimit / 2);
	ACCEPT_UNLOCK();
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over) {
#else
	if (over) {
#endif
		log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
		    "%i already in queue awaiting acceptance\n",
		    __func__, head->so_pcb, head->so_qlen);
		return (NULL);
	}
	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
	    __func__, __LINE__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
	so->so_state |= connstatus;
	ACCEPT_LOCK();
	/*
	 * The accept socket may be tearing down but we just
	 * won a race on the ACCEPT_LOCK.
	 * However, if sctp_peeloff() is called on a 1-to-many
	 * style socket, the SO_ACCEPTCONN doesn't need to be set.
	 */
	if (!(head->so_options & SO_ACCEPTCONN) &&
	    ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
	     (head->so_type != SOCK_SEQPACKET))) {
		SOCK_LOCK(so);
		so->so_head = NULL;
		sofree(so);		/* NB: returns ACCEPT_UNLOCK'ed. */
		return (NULL);
	}
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_qstate |= SQ_COMP;
		head->so_qlen++;
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail. In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->so_incqlen > head->so_qlimit) {
			struct socket *sp;
			sp = TAILQ_FIRST(&head->so_incomp);
			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
			head->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_qstate |= SQ_INCOMP;
		head->so_incqlen++;
	}
	ACCEPT_UNLOCK();
	if (connstatus) {
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	}
	return (so);
}
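
/*
 * Illustrative sketch (not compiled): the pattern protocols use with
 * sonewconn(). A transport such as TCP calls sonewconn(head, 0) when a
 * connection request arrives, placing the new socket in the incomplete
 * queue, and later calls soisconnected() on it, which moves it to the
 * complete queue where accept() can find it. example_proto_input() is a
 * hypothetical name.
 */
#if 0
static void
example_proto_input(struct socket *head)
{
	struct socket *so;

	so = sonewconn(head, 0);
	if (so == NULL)
		return;		/* Listen queue overflow or out of memory. */

	/* ... protocol handshake completes later ... */
	soisconnected(so);	/* Moves so from so_incomp to so_comp. */
}
#endif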

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

int
sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket. The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state. Call backs are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
	CURVNET_RESTORE();
	return (error);
}

int
solisten_proto_check(struct socket *so)
{

	SOCK_LOCK_ASSERT(so);

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	return (0);
}

void
solisten_proto(struct socket *so, int backlog)
{

	SOCK_LOCK_ASSERT(so);

	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	so->so_options |= SO_ACCEPTCONN;
}
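
/*
 * Illustrative sketch (not compiled): the callback pattern described in the
 * solisten() comment above, as a protocol's pru_listen might use it, holding
 * the socket lock across the test and set. The locking shown is a minimal
 * sketch; real protocols acquire their own pcb locks as well, and
 * example_pru_listen() is a hypothetical name.
 */
#if 0
static int
example_pru_listen(struct socket *so, int backlog, struct thread *td)
{
	int error;

	SOCK_LOCK(so);
	error = solisten_proto_check(so);
	if (error == 0)
		solisten_proto(so, backlog);
	SOCK_UNLOCK(so);
	return (error);
}
#endif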

/*
 * Evaluate the reference count and named references on a socket; if no
 * references remain, free it. This should be called whenever a reference is
 * released, such as in sorele(), but also when named reference flags are
 * cleared in socket or protocol code.
 *
 * sofree() will free the socket if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, so a process has been
 *   notified that it is present. If it is removed, the user process may
 *   block in accept() despite select() saying the socket was ready.
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	if (so->so_options & SO_ACCEPTCONN) {
		KASSERT((TAILQ_EMPTY(&so->so_comp)),
		    ("sofree: so_comp populated"));
		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
		    ("sofree: so_incomp populated"));
	}
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	VNET_SO_ASSERT(so);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack. Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups. The direct calls to
	 * dom_dispose() and sbrelease_internal() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach. This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 */
	sbdestroy(&so->so_snd, so);
	sbdestroy(&so->so_rcv, so);
	seldrain(&so->so_snd.sb_sel);
	seldrain(&so->so_rcv.sb_sel);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal. Initiate disconnect
 * if connected. Free socket when disconnect complete.
 *
 * This function will sorele() the socket. Note that soclose() may be called
 * prior to the ref count reaching zero. The actual socket structure will
 * not be freed until the ref count reaches zero.
 */
int
soclose(struct socket *so)
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);
	ACCEPT_LOCK();
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		/*
		 * Prevent new additions to the accept queues due
		 * to ACCEPT_LOCK races while we are draining them.
		 */
		so->so_options &= ~SO_ACCEPTCONN;
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		KASSERT((TAILQ_EMPTY(&so->so_comp)),
		    ("%s: so_comp populated", __func__));
		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
		    ("%s: so_incomp populated", __func__));
	}
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);			/* NB: Returns with ACCEPT_UNLOCK(). */
	CURVNET_RESTORE();
	return (error);
}

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on, or races with other threads are risked.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held. Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 */
void
soabort(struct socket *so)
{

	/*
	 * In as much as is possible, assert that no references to this
	 * socket are held. This is not quite the same as asserting that the
	 * current thread is responsible for arranging for no references, but
	 * is as close as we can get for now.
	 */
	KASSERT(so->so_count == 0, ("soabort: so_count"));
	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
	VNET_SO_ASSERT(so);

	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
		(*so->so_proto->pr_usrreqs->pru_abort)(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sofree(so);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	CURVNET_RESTORE();
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (soconnectat(AT_FDCWD, so, nam, td));
}

int
soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);

	CURVNET_SET(so->so_vnet);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first. This allows
	 * user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection from
		 * biting us.
		 */
		so->so_error = 0;
		if (fd == AT_FDCWD) {
			error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
			    nam, td);
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
			    so, nam, td);
		}
	}
	CURVNET_RESTORE();

	return (error);
}
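
/*
 * Illustrative sketch (not compiled): an in-kernel caller connecting a
 * socket. The address values are arbitrary and example_kern_connect() is a
 * hypothetical name; soconnect() simply forwards to soconnectat() with
 * AT_FDCWD.
 */
#if 0
static int
example_kern_connect(struct socket *so, struct thread *td)
{
	struct sockaddr_in sin;

	bzero(&sin, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(80);
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	return (soconnect(so, (struct sockaddr *)&sin, td));
}
#endif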

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	CURVNET_SET(so1->so_vnet);
	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
	CURVNET_RESTORE();
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned. However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid. On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
		 * socket if it supports implied connect. Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date. We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc. We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens. We must
	 * rethink this.
	 */
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	/*
	 * If the user set MSG_EOF, the protocol understands this flag and
	 * there is nothing left to send, then use PRU_SEND_EOF instead of
	 * PRU_SEND.
	 */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
	    /* If there is more to send set PRUS_MORETOCOME */
	    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send on a socket. If send must go all at once and message is larger than
 * send buffering, then hard error. Lock against other senders. If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing. Otherwise, if nonblocking, send as much as
 * possible. The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not). Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
 * on return.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned. However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid. On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain. If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				top = m_uiotombuf(uio, M_WAITOK, space,
				    (atomic ? max_hdr : 0),
				    (atomic ? M_PKTHDR : 0) |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT; /* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date. We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc. We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens. We must rethink
			 * this.
			 */
			VNET_SO_ASSERT(so);
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag and there is nothing left to send, then
			 * use PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
	    control, flags, td);
	CURVNET_RESTORE();
	return (error);
}
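
/*
 * Illustrative sketch (not compiled): sending a kernel buffer through
 * sosend() by wrapping it in a struct uio with UIO_SYSSPACE, the usual
 * pattern for in-kernel senders. example_kern_send() is a hypothetical
 * name; on a partial send, uio_resid reports how much was left unsent.
 */
#if 0
static int
example_kern_send(struct socket *so, void *buf, size_t len,
    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;	/* Data lives in kernel space. */
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}
#endif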

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket. For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
	VNET_SO_ASSERT(so);

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently. 'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord. If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket. We depend on the way that
 * records are added to the sockbuf by sbappend. In particular, each record
 * (mbufs linked through m_next) must begin with an address if the protocol
 * so specifies, followed by an optional mbuf or mbufs containing ancillary
 * data, and then zero or more mbufs of data. In order to allow parallelism
 * between network receive and copying to user space, as well as avoid
 * sleeping with a mutex held, we release the socket buffer mutex during the
 * user space copy. Although the sockbuf is locked, new data may still be
 * appended, and thus we must maintain consistency of the sockbuf during that
 * time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain. The uio is then used only for
 * the count in uio_resid.
 */
int
soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, error, offset;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	ssize_t orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
	    && uio->uio_resid) {
		VNET_SO_ASSERT(so);
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);

restart:
	SOCKBUF_LOCK(&so->so_rcv);
	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_DONTWAIT is not set
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m == NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
			} else
				goto dontblock;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		SOCKBUF_UNLOCK(&so->so_rcv);
		if (error)
			goto release;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer. We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				VNET_SO_ASSERT(so);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
1616 ("soreceive: sb_lastercord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied. Note that 'type' is
	 * used to store the type of any mbuf reads that have happened so far
	 * such that soreceive() can stop reading if the type changes, which
	 * causes soreceive() to return only one of regular data and inline
	 * out-of-band data in a single socket receive operation.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
			if (type != m->m_type)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA,
			    ("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs. Otherwise copy
		 * them out via the uio, then free. Sockbuf must be
		 * consistent here (points to current mbuf, it points to next
		 * record) when we drop priority; we must note any additions
		 * to the sockbuf when we block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error) {
				/*
				 * The MT_SONAME mbuf has already been removed
				 * from the record, so it is necessary to
				 * remove the data mbufs, if any, to preserve
				 * the invariant in the case of PR_ADDR that
				 * requires MT_SONAME mbufs at the head of
				 * each record.
				 */
				if (m && pr->pr_flags & PR_ATOMIC &&
				    ((flags & MSG_PEEK) == 0))
					(void)sbdroprecord_locked(&so->so_rcv);
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
			}
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					m->m_nextpkt = NULL;
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				sockbuf_pushsync(&so->so_rcv, nextrecord);
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT)
						copy_flag = M_NOWAIT;
					else
						copy_flag = M_WAITOK;
					if (copy_flag == M_WAITOK)
						SOCKBUF_UNLOCK(&so->so_rcv);
					*mp = m_copym(m, 0, len, copy_flag);
					if (copy_flag == M_WAITOK)
						SOCKBUF_LOCK(&so->so_rcv);
					if (*mp == NULL) {
						/*
						 * m_copym() couldn't
						 * allocate an mbuf. Adjust
						 * uio_resid back (it was
						 * adjusted down by len
						 * bytes, which we didn't end
						 * up "copying" over).
						 */
						uio->uio_resid += len;
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
		 * must not quit until "uio->uio_resid == 0" or an error
		 * termination. If a signal/timeout occurs, return with a
		 * short count but without error. Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error ||
			    so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				VNET_SO_ASSERT(so);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
1784 /*
			 * We could have received some data while we were
			 * notifying the protocol.  Skip blocking in this
			 * case.
1787 */
1788 if (so->so_rcv.sb_mb == NULL) {
1789 error = sbwait(&so->so_rcv);
1790 if (error) {
1791 SOCKBUF_UNLOCK(&so->so_rcv);
1792 goto release;
1793 }
1794 }
1795 m = so->so_rcv.sb_mb;
1796 if (m != NULL)
1797 nextrecord = m->m_nextpkt;
1798 }
1799 }
1800
1801 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1802 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1803 flags |= MSG_TRUNC;
1804 if ((flags & MSG_PEEK) == 0)
1805 (void) sbdroprecord_locked(&so->so_rcv);
1806 }
1807 if ((flags & MSG_PEEK) == 0) {
1808 if (m == NULL) {
1809 /*
1810 * First part is an inline SB_EMPTY_FIXUP(). Second
1811 * part makes sure sb_lastrecord is up-to-date if
1812 * there is still data in the socket buffer.
1813 */
1814 so->so_rcv.sb_mb = nextrecord;
1815 if (so->so_rcv.sb_mb == NULL) {
1816 so->so_rcv.sb_mbtail = NULL;
1817 so->so_rcv.sb_lastrecord = NULL;
1818 } else if (nextrecord->m_nextpkt == NULL)
1819 so->so_rcv.sb_lastrecord = nextrecord;
1820 }
1821 SBLASTRECORDCHK(&so->so_rcv);
1822 SBLASTMBUFCHK(&so->so_rcv);
1823 /*
1824 * If soreceive() is being done from the socket callback,
		 * then we don't need to generate an ACK to the peer to
		 * update the window, since the ACK will be generated upon
		 * return to TCP.
1827 */
1828 if (!(flags & MSG_SOCALLBCK) &&
1829 (pr->pr_flags & PR_WANTRCVD)) {
1830 SOCKBUF_UNLOCK(&so->so_rcv);
1831 VNET_SO_ASSERT(so);
1832 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1833 SOCKBUF_LOCK(&so->so_rcv);
1834 }
1835 }
1836 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1837 if (orig_resid == uio->uio_resid && orig_resid &&
1838 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1839 SOCKBUF_UNLOCK(&so->so_rcv);
1840 goto restart;
1841 }
1842 SOCKBUF_UNLOCK(&so->so_rcv);
1843
1844 if (flagsp != NULL)
1845 *flagsp |= flags;
1846 release:
1847 sbunlock(&so->so_rcv);
1848 return (error);
1849 }
1850
1851 /*
1852 * Optimized version of soreceive() for stream (TCP) sockets.
1853 * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
1854 */
1855 int
1856 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1857 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1858 {
1859 int len = 0, error = 0, flags, oresid;
1860 struct sockbuf *sb;
1861 struct mbuf *m, *n = NULL;
1862
1863 /* We only do stream sockets. */
1864 if (so->so_type != SOCK_STREAM)
1865 return (EINVAL);
1866 if (psa != NULL)
1867 *psa = NULL;
1868 if (controlp != NULL)
1869 return (EINVAL);
1870 if (flagsp != NULL)
1871 flags = *flagsp &~ MSG_EOR;
1872 else
1873 flags = 0;
1874 if (flags & MSG_OOB)
1875 return (soreceive_rcvoob(so, uio, flags));
1876 if (mp0 != NULL)
1877 *mp0 = NULL;
1878
1879 sb = &so->so_rcv;
1880
1881 /* Prevent other readers from entering the socket. */
1882 error = sblock(sb, SBLOCKWAIT(flags));
1883 if (error)
1884 goto out;
1885 SOCKBUF_LOCK(sb);
1886
1887 /* Easy one, no space to copyout anything. */
1888 if (uio->uio_resid == 0) {
1889 error = EINVAL;
1890 goto out;
1891 }
1892 oresid = uio->uio_resid;
1893
1894 /* We will never ever get anything unless we are or were connected. */
1895 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1896 error = ENOTCONN;
1897 goto out;
1898 }
1899
1900 restart:
1901 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1902
1903 /* Abort if socket has reported problems. */
1904 if (so->so_error) {
1905 if (sb->sb_cc > 0)
1906 goto deliver;
1907 if (oresid > uio->uio_resid)
1908 goto out;
1909 error = so->so_error;
1910 if (!(flags & MSG_PEEK))
1911 so->so_error = 0;
1912 goto out;
1913 }
1914
1915 /* Door is closed. Deliver what is left, if any. */
1916 if (sb->sb_state & SBS_CANTRCVMORE) {
1917 if (sb->sb_cc > 0)
1918 goto deliver;
1919 else
1920 goto out;
1921 }
1922
1923 /* Socket buffer is empty and we shall not block. */
1924 if (sb->sb_cc == 0 &&
1925 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1926 error = EAGAIN;
1927 goto out;
1928 }
1929
1930 /* Socket buffer got some data that we shall deliver now. */
1931 if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	    (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	    sb->sb_cc >= sb->sb_lowat ||
	    sb->sb_cc >= uio->uio_resid ||
	    sb->sb_cc >= sb->sb_hiwat)) {
1937 goto deliver;
1938 }
1939
1940 /* On MSG_WAITALL we must wait until all data or error arrives. */
1941 if ((flags & MSG_WAITALL) &&
1942 (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
1943 goto deliver;
1944
1945 /*
1946 * Wait and block until (more) data comes in.
1947 * NB: Drops the sockbuf lock during wait.
1948 */
1949 error = sbwait(sb);
1950 if (error)
1951 goto out;
1952 goto restart;
1953
1954 deliver:
1955 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1956 KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
1957 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1958
1959 /* Statistics. */
1960 if (uio->uio_td)
1961 uio->uio_td->td_ru.ru_msgrcv++;
1962
1963 /* Fill uio until full or current end of socket buffer is reached. */
1964 len = min(uio->uio_resid, sb->sb_cc);
1965 if (mp0 != NULL) {
1966 /* Dequeue as many mbufs as possible. */
1967 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1968 if (*mp0 == NULL)
1969 *mp0 = sb->sb_mb;
1970 else
1971 m_cat(*mp0, sb->sb_mb);
1972 for (m = sb->sb_mb;
1973 m != NULL && m->m_len <= len;
1974 m = m->m_next) {
1975 len -= m->m_len;
1976 uio->uio_resid -= m->m_len;
1977 sbfree(sb, m);
1978 n = m;
1979 }
1980 n->m_next = NULL;
1981 sb->sb_mb = m;
1982 sb->sb_lastrecord = sb->sb_mb;
1983 if (sb->sb_mb == NULL)
1984 SB_EMPTY_FIXUP(sb);
1985 }
1986 /* Copy the remainder. */
1987 if (len > 0) {
1988 KASSERT(sb->sb_mb != NULL,
1989 ("%s: len > 0 && sb->sb_mb empty", __func__));
1990
1991 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
1992 if (m == NULL)
1993 len = 0; /* Don't flush data from sockbuf. */
1994 else
1995 uio->uio_resid -= len;
1996 if (*mp0 != NULL)
1997 m_cat(*mp0, m);
1998 else
1999 *mp0 = m;
2000 if (*mp0 == NULL) {
2001 error = ENOBUFS;
2002 goto out;
2003 }
2004 }
2005 } else {
2006 /* NB: Must unlock socket buffer as uiomove may sleep. */
2007 SOCKBUF_UNLOCK(sb);
2008 error = m_mbuftouio(uio, sb->sb_mb, len);
2009 SOCKBUF_LOCK(sb);
2010 if (error)
2011 goto out;
2012 }
2013 SBLASTRECORDCHK(sb);
2014 SBLASTMBUFCHK(sb);
2015
2016 /*
2017 * Remove the delivered data from the socket buffer unless we
2018 * were only peeking.
2019 */
2020 if (!(flags & MSG_PEEK)) {
2021 if (len > 0)
2022 sbdrop_locked(sb, len);
2023
2024 /* Notify protocol that we drained some data. */
2025 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2026 (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2027 !(flags & MSG_SOCALLBCK))) {
2028 SOCKBUF_UNLOCK(sb);
2029 VNET_SO_ASSERT(so);
2030 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2031 SOCKBUF_LOCK(sb);
2032 }
2033 }
2034
2035 /*
2036 * For MSG_WAITALL we may have to loop again and wait for
2037 * more data to come in.
2038 */
2039 if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2040 goto restart;
2041 out:
2042 SOCKBUF_LOCK_ASSERT(sb);
2043 SBLASTRECORDCHK(sb);
2044 SBLASTMBUFCHK(sb);
2045 SOCKBUF_UNLOCK(sb);
2046 sbunlock(sb);
2047 return (error);
2048 }
2049
2050 /*
2051 * Optimized version of soreceive() for simple datagram cases from userspace.
2052 * Unlike in the stream case, we're able to drop a datagram if copyout()
2053 * fails, and because we handle datagrams atomically, we don't need to use a
2054 * sleep lock to prevent I/O interlacing.
2055 */
2056 int
2057 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2058 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2059 {
2060 struct mbuf *m, *m2;
2061 int flags, error;
2062 ssize_t len;
2063 struct protosw *pr = so->so_proto;
2064 struct mbuf *nextrecord;
2065
2066 if (psa != NULL)
2067 *psa = NULL;
2068 if (controlp != NULL)
2069 *controlp = NULL;
2070 if (flagsp != NULL)
2071 flags = *flagsp &~ MSG_EOR;
2072 else
2073 flags = 0;
2074
2075 /*
2076 * For any complicated cases, fall back to the full
2077 * soreceive_generic().
2078 */
2079 if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2080 return (soreceive_generic(so, psa, uio, mp0, controlp,
2081 flagsp));
2082
2083 /*
2084 * Enforce restrictions on use.
2085 */
2086 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2087 ("soreceive_dgram: wantrcvd"));
2088 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2089 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2090 ("soreceive_dgram: SBS_RCVATMARK"));
2091 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: PR_CONNREQUIRED"));
2093
2094 /*
2095 * Loop blocking while waiting for a datagram.
2096 */
2097 SOCKBUF_LOCK(&so->so_rcv);
2098 while ((m = so->so_rcv.sb_mb) == NULL) {
2099 KASSERT(so->so_rcv.sb_cc == 0,
2100 ("soreceive_dgram: sb_mb NULL but sb_cc %u",
2101 so->so_rcv.sb_cc));
2102 if (so->so_error) {
2103 error = so->so_error;
2104 so->so_error = 0;
2105 SOCKBUF_UNLOCK(&so->so_rcv);
2106 return (error);
2107 }
2108 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2109 uio->uio_resid == 0) {
2110 SOCKBUF_UNLOCK(&so->so_rcv);
2111 return (0);
2112 }
2113 if ((so->so_state & SS_NBIO) ||
2114 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2115 SOCKBUF_UNLOCK(&so->so_rcv);
2116 return (EWOULDBLOCK);
2117 }
2118 SBLASTRECORDCHK(&so->so_rcv);
2119 SBLASTMBUFCHK(&so->so_rcv);
2120 error = sbwait(&so->so_rcv);
2121 if (error) {
2122 SOCKBUF_UNLOCK(&so->so_rcv);
2123 return (error);
2124 }
2125 }
2126 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2127
2128 if (uio->uio_td)
2129 uio->uio_td->td_ru.ru_msgrcv++;
2130 SBLASTRECORDCHK(&so->so_rcv);
2131 SBLASTMBUFCHK(&so->so_rcv);
2132 nextrecord = m->m_nextpkt;
2133 if (nextrecord == NULL) {
2134 KASSERT(so->so_rcv.sb_lastrecord == m,
2135 ("soreceive_dgram: lastrecord != m"));
2136 }
2137
2138 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
2139 ("soreceive_dgram: m_nextpkt != nextrecord"));
2140
2141 /*
2142 * Pull 'm' and its chain off the front of the packet queue.
2143 */
2144 so->so_rcv.sb_mb = NULL;
2145 sockbuf_pushsync(&so->so_rcv, nextrecord);
2146
2147 /*
2148 * Walk 'm's chain and free that many bytes from the socket buffer.
2149 */
2150 for (m2 = m; m2 != NULL; m2 = m2->m_next)
2151 sbfree(&so->so_rcv, m2);
2152
2153 /*
2154 * Do a few last checks before we let go of the lock.
2155 */
2156 SBLASTRECORDCHK(&so->so_rcv);
2157 SBLASTMBUFCHK(&so->so_rcv);
2158 SOCKBUF_UNLOCK(&so->so_rcv);
2159
2160 if (pr->pr_flags & PR_ADDR) {
2161 KASSERT(m->m_type == MT_SONAME,
2162 ("m->m_type == %d", m->m_type));
2163 if (psa != NULL)
2164 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2165 M_NOWAIT);
2166 m = m_free(m);
2167 }
2168 if (m == NULL) {
2169 /* XXXRW: Can this happen? */
2170 return (0);
2171 }
2172
2173 /*
2174 * Packet to copyout() is now in 'm' and it is disconnected from the
2175 * queue.
2176 *
2177 * Process one or more MT_CONTROL mbufs present before any data mbufs
2178 * in the first mbuf chain on the socket buffer. We call into the
2179 * protocol to perform externalization (or freeing if controlp ==
2180 * NULL).
2181 */
2182 if (m->m_type == MT_CONTROL) {
2183 struct mbuf *cm = NULL, *cmn;
2184 struct mbuf **cme = &cm;
2185
2186 do {
2187 m2 = m->m_next;
2188 m->m_next = NULL;
2189 *cme = m;
2190 cme = &(*cme)->m_next;
2191 m = m2;
2192 } while (m != NULL && m->m_type == MT_CONTROL);
2193 while (cm != NULL) {
2194 cmn = cm->m_next;
2195 cm->m_next = NULL;
2196 if (pr->pr_domain->dom_externalize != NULL) {
2197 error = (*pr->pr_domain->dom_externalize)
2198 (cm, controlp, flags);
2199 } else if (controlp != NULL)
2200 *controlp = cm;
2201 else
2202 m_freem(cm);
2203 if (controlp != NULL) {
2204 while (*controlp != NULL)
2205 controlp = &(*controlp)->m_next;
2206 }
2207 cm = cmn;
2208 }
2209 }
2210 KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
2211
2212 while (m != NULL && uio->uio_resid > 0) {
2213 len = uio->uio_resid;
2214 if (len > m->m_len)
2215 len = m->m_len;
2216 error = uiomove(mtod(m, char *), (int)len, uio);
2217 if (error) {
2218 m_freem(m);
2219 return (error);
2220 }
2221 if (len == m->m_len)
2222 m = m_free(m);
2223 else {
2224 m->m_data += len;
2225 m->m_len -= len;
2226 }
2227 }
2228 if (m != NULL)
2229 flags |= MSG_TRUNC;
2230 m_freem(m);
2231 if (flagsp != NULL)
2232 *flagsp |= flags;
2233 return (0);
2234 }
2235
2236 int
2237 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2238 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2239 {
2240 int error;
2241
2242 CURVNET_SET(so->so_vnet);
2243 error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
2244 controlp, flagsp));
2245 CURVNET_RESTORE();
2246 return (error);
2247 }
2248
2249 int
2250 soshutdown(struct socket *so, int how)
2251 {
2252 struct protosw *pr = so->so_proto;
2253 int error;
2254
2255 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2256 return (EINVAL);
2257
2258 CURVNET_SET(so->so_vnet);
2259 if (pr->pr_usrreqs->pru_flush != NULL)
2260 (*pr->pr_usrreqs->pru_flush)(so, how);
2261 if (how != SHUT_WR)
2262 sorflush(so);
2263 if (how != SHUT_RD) {
2264 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2265 wakeup(&so->so_timeo);
2266 CURVNET_RESTORE();
2267 return (error);
2268 }
2269 wakeup(&so->so_timeo);
2270 CURVNET_RESTORE();
2271 return (0);
2272 }
2273
2274 void
2275 sorflush(struct socket *so)
2276 {
2277 struct sockbuf *sb = &so->so_rcv;
2278 struct protosw *pr = so->so_proto;
2279 struct sockbuf asb;
2280
2281 VNET_SO_ASSERT(so);
2282
2283 /*
2284 * In order to avoid calling dom_dispose with the socket buffer mutex
2285 * held, and in order to generally avoid holding the lock for a long
2286 * time, we make a copy of the socket buffer and clear the original
2287 * (except locks, state). The new socket buffer copy won't have
2288 * initialized locks so we can only call routines that won't use or
2289 * assert those locks.
2290 *
2291 * Dislodge threads currently blocked in receive and wait to acquire
2292 * a lock against other simultaneous readers before clearing the
2293 * socket buffer. Don't let our acquire be interrupted by a signal
	 * despite any existing socket disposition on interruptible waiting.
2295 */
2296 socantrcvmore(so);
2297 (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
2298
2299 /*
2300 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2301 * and mutex data unchanged.
2302 */
2303 SOCKBUF_LOCK(sb);
2304 bzero(&asb, offsetof(struct sockbuf, sb_startzero));
2305 bcopy(&sb->sb_startzero, &asb.sb_startzero,
2306 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2307 bzero(&sb->sb_startzero,
2308 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2309 SOCKBUF_UNLOCK(sb);
2310 sbunlock(sb);
2311
2312 /*
2313 * Dispose of special rights and flush the socket buffer. Don't call
2314 * any unsafe routines (that rely on locks being initialized) on asb.
2315 */
2316 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2317 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2318 sbrelease_internal(&asb, so);
2319 }
2320
2321 /*
2322 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2323 * additional variant to handle the case where the option value needs to be
2324 * some kind of integer, but not a specific size. In addition to their use
2325 * here, these functions are also called by the protocol-level pr_ctloutput()
2326 * routines.
2327 */
2328 int
2329 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2330 {
2331 size_t valsize;
2332
2333 /*
2334 * If the user gives us more than we wanted, we ignore it, but if we
2335 * don't get the minimum length the caller wants, we return EINVAL.
2336 * On success, sopt->sopt_valsize is set to however much we actually
2337 * retrieved.
2338 */
2339 if ((valsize = sopt->sopt_valsize) < minlen)
2340 return EINVAL;
2341 if (valsize > len)
2342 sopt->sopt_valsize = valsize = len;
2343
2344 if (sopt->sopt_td != NULL)
2345 return (copyin(sopt->sopt_val, buf, valsize));
2346
2347 bcopy(sopt->sopt_val, buf, valsize);
2348 return (0);
2349 }
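
/*
 * Example usage (a sketch, not compiled in): a protocol's pr_ctloutput
 * handler for SOPT_SET typically lands an integer option value with
 * sooptcopyin(), requiring and consuming exactly sizeof(int).
 * MYPROTO_OPTION and the mypcb->myproto_option field are hypothetical
 * names, not part of this file.
 */
#if 0
	case MYPROTO_OPTION:
		error = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);
		if (error)
			break;
		mypcb->myproto_option = optval;	/* hypothetical pcb field */
		break;
#endif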
2350
2351 /*
2352 * Kernel version of setsockopt(2).
2353 *
2354 * XXX: optlen is size_t, not socklen_t
2355 */
2356 int
2357 so_setsockopt(struct socket *so, int level, int optname, void *optval,
2358 size_t optlen)
2359 {
2360 struct sockopt sopt;
2361
2362 sopt.sopt_level = level;
2363 sopt.sopt_name = optname;
2364 sopt.sopt_dir = SOPT_SET;
2365 sopt.sopt_val = optval;
2366 sopt.sopt_valsize = optlen;
2367 sopt.sopt_td = NULL;
2368 return (sosetopt(so, &sopt));
2369 }
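
/*
 * Example usage (a sketch, not compiled in): in-kernel consumers can use
 * so_setsockopt() to set an option without filling in a struct sockopt
 * by hand; here an in-kernel socket enables SO_KEEPALIVE.
 */
#if 0
	int one = 1;

	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
	    sizeof(one));
#endif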
2370
2371 int
2372 sosetopt(struct socket *so, struct sockopt *sopt)
2373 {
2374 int error, optval;
2375 struct linger l;
2376 struct timeval tv;
2377 sbintime_t val;
2378 uint32_t val32;
2379 #ifdef MAC
2380 struct mac extmac;
2381 #endif
2382
2383 CURVNET_SET(so->so_vnet);
2384 error = 0;
2385 if (sopt->sopt_level != SOL_SOCKET) {
2386 if (so->so_proto->pr_ctloutput != NULL) {
2387 error = (*so->so_proto->pr_ctloutput)(so, sopt);
2388 CURVNET_RESTORE();
2389 return (error);
2390 }
2391 error = ENOPROTOOPT;
2392 } else {
2393 switch (sopt->sopt_name) {
2394 #ifdef INET
2395 case SO_ACCEPTFILTER:
2396 error = do_setopt_accept_filter(so, sopt);
2397 if (error)
2398 goto bad;
2399 break;
2400 #endif
2401 case SO_LINGER:
2402 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2403 if (error)
2404 goto bad;
2405
2406 SOCK_LOCK(so);
2407 so->so_linger = l.l_linger;
2408 if (l.l_onoff)
2409 so->so_options |= SO_LINGER;
2410 else
2411 so->so_options &= ~SO_LINGER;
2412 SOCK_UNLOCK(so);
2413 break;
2414
2415 case SO_DEBUG:
2416 case SO_KEEPALIVE:
2417 case SO_DONTROUTE:
2418 case SO_USELOOPBACK:
2419 case SO_BROADCAST:
2420 case SO_REUSEADDR:
2421 case SO_REUSEPORT:
2422 case SO_OOBINLINE:
2423 case SO_TIMESTAMP:
2424 case SO_BINTIME:
2425 case SO_NOSIGPIPE:
2426 case SO_NO_DDP:
2427 case SO_NO_OFFLOAD:
2428 error = sooptcopyin(sopt, &optval, sizeof optval,
2429 sizeof optval);
2430 if (error)
2431 goto bad;
2432 SOCK_LOCK(so);
2433 if (optval)
2434 so->so_options |= sopt->sopt_name;
2435 else
2436 so->so_options &= ~sopt->sopt_name;
2437 SOCK_UNLOCK(so);
2438 break;
2439
2440 case SO_SETFIB:
2441 error = sooptcopyin(sopt, &optval, sizeof optval,
2442 sizeof optval);
2443 if (error)
2444 goto bad;
2445
2446 if (optval < 0 || optval >= rt_numfibs) {
2447 error = EINVAL;
2448 goto bad;
2449 }
2450 if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
2451 (so->so_proto->pr_domain->dom_family == PF_INET6) ||
2452 (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
2453 so->so_fibnum = optval;
2454 else
2455 so->so_fibnum = 0;
2456 break;
2457
2458 case SO_USER_COOKIE:
2459 error = sooptcopyin(sopt, &val32, sizeof val32,
2460 sizeof val32);
2461 if (error)
2462 goto bad;
2463 so->so_user_cookie = val32;
2464 break;
2465
2466 case SO_SNDBUF:
2467 case SO_RCVBUF:
2468 case SO_SNDLOWAT:
2469 case SO_RCVLOWAT:
2470 error = sooptcopyin(sopt, &optval, sizeof optval,
2471 sizeof optval);
2472 if (error)
2473 goto bad;
2474
2475 /*
2476 * Values < 1 make no sense for any of these options,
2477 * so disallow them.
2478 */
2479 if (optval < 1) {
2480 error = EINVAL;
2481 goto bad;
2482 }
2483
2484 switch (sopt->sopt_name) {
2485 case SO_SNDBUF:
2486 case SO_RCVBUF:
2487 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
2488 &so->so_snd : &so->so_rcv, (u_long)optval,
2489 so, curthread) == 0) {
2490 error = ENOBUFS;
2491 goto bad;
2492 }
2493 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
2494 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
2495 break;
2496
2497 /*
2498 * Make sure the low-water is never greater than the
2499 * high-water.
2500 */
2501 case SO_SNDLOWAT:
2502 SOCKBUF_LOCK(&so->so_snd);
2503 so->so_snd.sb_lowat =
2504 (optval > so->so_snd.sb_hiwat) ?
2505 so->so_snd.sb_hiwat : optval;
2506 SOCKBUF_UNLOCK(&so->so_snd);
2507 break;
2508 case SO_RCVLOWAT:
2509 SOCKBUF_LOCK(&so->so_rcv);
2510 so->so_rcv.sb_lowat =
2511 (optval > so->so_rcv.sb_hiwat) ?
2512 so->so_rcv.sb_hiwat : optval;
2513 SOCKBUF_UNLOCK(&so->so_rcv);
2514 break;
2515 }
2516 break;
2517
2518 case SO_SNDTIMEO:
2519 case SO_RCVTIMEO:
2520 #ifdef COMPAT_FREEBSD32
2521 if (SV_CURPROC_FLAG(SV_ILP32)) {
2522 struct timeval32 tv32;
2523
2524 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2525 sizeof tv32);
2526 CP(tv32, tv, tv_sec);
2527 CP(tv32, tv, tv_usec);
2528 } else
2529 #endif
2530 error = sooptcopyin(sopt, &tv, sizeof tv,
2531 sizeof tv);
2532 if (error)
2533 goto bad;
2534 if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
2535 tv.tv_usec >= 1000000) {
2536 error = EDOM;
2537 goto bad;
2538 }
2539 val = tvtosbt(tv);
2540
2541 switch (sopt->sopt_name) {
2542 case SO_SNDTIMEO:
2543 so->so_snd.sb_timeo = val;
2544 break;
2545 case SO_RCVTIMEO:
2546 so->so_rcv.sb_timeo = val;
2547 break;
2548 }
2549 break;
2550
2551 case SO_LABEL:
2552 #ifdef MAC
2553 error = sooptcopyin(sopt, &extmac, sizeof extmac,
2554 sizeof extmac);
2555 if (error)
2556 goto bad;
2557 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2558 so, &extmac);
2559 #else
2560 error = EOPNOTSUPP;
2561 #endif
2562 break;
2563
2564 default:
2565 error = ENOPROTOOPT;
2566 break;
2567 }
2568 if (error == 0 && so->so_proto->pr_ctloutput != NULL)
2569 (void)(*so->so_proto->pr_ctloutput)(so, sopt);
2570 }
2571 bad:
2572 CURVNET_RESTORE();
2573 return (error);
2574 }
2575
2576 /*
2577 * Helper routine for getsockopt.
2578 */
2579 int
2580 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2581 {
2582 int error;
2583 size_t valsize;
2584
2585 error = 0;
2586
2587 /*
2588 * Documented get behavior is that we always return a value, possibly
2589 * truncated to fit in the user's buffer. Traditional behavior is
2590 * that we always tell the user precisely how much we copied, rather
2591 * than something useful like the total amount we had available for
2592 * her. Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
2594 */
2595 valsize = min(len, sopt->sopt_valsize);
2596 sopt->sopt_valsize = valsize;
2597 if (sopt->sopt_val != NULL) {
2598 if (sopt->sopt_td != NULL)
2599 error = copyout(buf, sopt->sopt_val, valsize);
2600 else
2601 bcopy(buf, sopt->sopt_val, valsize);
2602 }
2603 return (error);
2604 }
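
/*
 * Example usage (a sketch, not compiled in): the SOPT_GET side mirrors
 * sooptcopyin(); a handler fills a local variable and copies it back
 * with sooptcopyout(), which silently truncates to the caller's buffer
 * size.  MYPROTO_OPTION and myproto_option remain hypothetical.
 */
#if 0
	case MYPROTO_OPTION:
		optval = mypcb->myproto_option;
		error = sooptcopyout(sopt, &optval, sizeof optval);
		break;
#endif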
2605
2606 int
2607 sogetopt(struct socket *so, struct sockopt *sopt)
2608 {
2609 int error, optval;
2610 struct linger l;
2611 struct timeval tv;
2612 #ifdef MAC
2613 struct mac extmac;
2614 #endif
2615
2616 CURVNET_SET(so->so_vnet);
2617 error = 0;
2618 if (sopt->sopt_level != SOL_SOCKET) {
2619 if (so->so_proto->pr_ctloutput != NULL)
2620 error = (*so->so_proto->pr_ctloutput)(so, sopt);
2621 else
2622 error = ENOPROTOOPT;
2623 CURVNET_RESTORE();
2624 return (error);
2625 } else {
2626 switch (sopt->sopt_name) {
2627 #ifdef INET
2628 case SO_ACCEPTFILTER:
2629 error = do_getopt_accept_filter(so, sopt);
2630 break;
2631 #endif
2632 case SO_LINGER:
2633 SOCK_LOCK(so);
2634 l.l_onoff = so->so_options & SO_LINGER;
2635 l.l_linger = so->so_linger;
2636 SOCK_UNLOCK(so);
2637 error = sooptcopyout(sopt, &l, sizeof l);
2638 break;
2639
2640 case SO_USELOOPBACK:
2641 case SO_DONTROUTE:
2642 case SO_DEBUG:
2643 case SO_KEEPALIVE:
2644 case SO_REUSEADDR:
2645 case SO_REUSEPORT:
2646 case SO_BROADCAST:
2647 case SO_OOBINLINE:
2648 case SO_ACCEPTCONN:
2649 case SO_TIMESTAMP:
2650 case SO_BINTIME:
2651 case SO_NOSIGPIPE:
2652 optval = so->so_options & sopt->sopt_name;
2653 integer:
2654 error = sooptcopyout(sopt, &optval, sizeof optval);
2655 break;
2656
2657 case SO_TYPE:
2658 optval = so->so_type;
2659 goto integer;
2660
2661 case SO_PROTOCOL:
2662 optval = so->so_proto->pr_protocol;
2663 goto integer;
2664
2665 case SO_ERROR:
2666 SOCK_LOCK(so);
2667 optval = so->so_error;
2668 so->so_error = 0;
2669 SOCK_UNLOCK(so);
2670 goto integer;
2671
2672 case SO_SNDBUF:
2673 optval = so->so_snd.sb_hiwat;
2674 goto integer;
2675
2676 case SO_RCVBUF:
2677 optval = so->so_rcv.sb_hiwat;
2678 goto integer;
2679
2680 case SO_SNDLOWAT:
2681 optval = so->so_snd.sb_lowat;
2682 goto integer;
2683
2684 case SO_RCVLOWAT:
2685 optval = so->so_rcv.sb_lowat;
2686 goto integer;
2687
2688 case SO_SNDTIMEO:
2689 case SO_RCVTIMEO:
			/* sb_timeo is sbintime_t; don't truncate through int. */
			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2694 #ifdef COMPAT_FREEBSD32
2695 if (SV_CURPROC_FLAG(SV_ILP32)) {
2696 struct timeval32 tv32;
2697
2698 CP(tv, tv32, tv_sec);
2699 CP(tv, tv32, tv_usec);
2700 error = sooptcopyout(sopt, &tv32, sizeof tv32);
2701 } else
2702 #endif
2703 error = sooptcopyout(sopt, &tv, sizeof tv);
2704 break;
2705
2706 case SO_LABEL:
2707 #ifdef MAC
2708 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2709 sizeof(extmac));
2710 if (error)
2711 goto bad;
2712 error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2713 so, &extmac);
2714 if (error)
2715 goto bad;
2716 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2717 #else
2718 error = EOPNOTSUPP;
2719 #endif
2720 break;
2721
2722 case SO_PEERLABEL:
2723 #ifdef MAC
2724 error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2725 sizeof(extmac));
2726 if (error)
2727 goto bad;
2728 error = mac_getsockopt_peerlabel(
2729 sopt->sopt_td->td_ucred, so, &extmac);
2730 if (error)
2731 goto bad;
2732 error = sooptcopyout(sopt, &extmac, sizeof extmac);
2733 #else
2734 error = EOPNOTSUPP;
2735 #endif
2736 break;
2737
2738 case SO_LISTENQLIMIT:
2739 optval = so->so_qlimit;
2740 goto integer;
2741
2742 case SO_LISTENQLEN:
2743 optval = so->so_qlen;
2744 goto integer;
2745
2746 case SO_LISTENINCQLEN:
2747 optval = so->so_incqlen;
2748 goto integer;
2749
2750 default:
2751 error = ENOPROTOOPT;
2752 break;
2753 }
2754 }
2755 #ifdef MAC
2756 bad:
2757 #endif
2758 CURVNET_RESTORE();
2759 return (error);
2760 }
2761
2762 int
2763 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2764 {
2765 struct mbuf *m, *m_prev;
2766 int sopt_size = sopt->sopt_valsize;
2767
2768 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
2769 if (m == NULL)
2770 return ENOBUFS;
2771 if (sopt_size > MLEN) {
2772 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
2773 if ((m->m_flags & M_EXT) == 0) {
2774 m_free(m);
2775 return ENOBUFS;
2776 }
2777 m->m_len = min(MCLBYTES, sopt_size);
2778 } else {
2779 m->m_len = min(MLEN, sopt_size);
2780 }
2781 sopt_size -= m->m_len;
2782 *mp = m;
2783 m_prev = m;
2784
2785 while (sopt_size) {
2786 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
2787 if (m == NULL) {
2788 m_freem(*mp);
2789 return ENOBUFS;
2790 }
2791 if (sopt_size > MLEN) {
2792 MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
2793 M_NOWAIT);
2794 if ((m->m_flags & M_EXT) == 0) {
2795 m_freem(m);
2796 m_freem(*mp);
2797 return ENOBUFS;
2798 }
2799 m->m_len = min(MCLBYTES, sopt_size);
2800 } else {
2801 m->m_len = min(MLEN, sopt_size);
2802 }
2803 sopt_size -= m->m_len;
2804 m_prev->m_next = m;
2805 m_prev = m;
2806 }
2807 return (0);
2808 }
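
/*
 * Example usage (a sketch, not compiled in): soopt_getm() and
 * soopt_mcopyin() are used as a pair to land option bytes in an mbuf
 * chain rather than a flat buffer; on a copyin failure soopt_mcopyin()
 * frees the chain itself.
 */
#if 0
	struct mbuf *m = NULL;

	error = soopt_getm(sopt, &m);		/* chain sized to sopt_valsize */
	if (error == 0)
		error = soopt_mcopyin(sopt, m);	/* fill chain from sopt_val */
#endif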
2809
2810 int
2811 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2812 {
2813 struct mbuf *m0 = m;
2814
2815 if (sopt->sopt_val == NULL)
2816 return (0);
2817 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2818 if (sopt->sopt_td != NULL) {
2819 int error;
2820
2821 error = copyin(sopt->sopt_val, mtod(m, char *),
2822 m->m_len);
2823 if (error != 0) {
2824 m_freem(m0);
2825 return(error);
2826 }
2827 } else
2828 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2829 sopt->sopt_valsize -= m->m_len;
2830 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2831 m = m->m_next;
2832 }
	/* The chain from soopt_getm() should always be large enough. */
	if (m != NULL)
		panic("ip6_sooptmcopyin");
2835 return (0);
2836 }
2837
2838 int
2839 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2840 {
2841 struct mbuf *m0 = m;
2842 size_t valsize = 0;
2843
2844 if (sopt->sopt_val == NULL)
2845 return (0);
2846 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2847 if (sopt->sopt_td != NULL) {
2848 int error;
2849
2850 error = copyout(mtod(m, char *), sopt->sopt_val,
2851 m->m_len);
2852 if (error != 0) {
2853 m_freem(m0);
2854 return(error);
2855 }
2856 } else
2857 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2858 sopt->sopt_valsize -= m->m_len;
2859 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2860 valsize += m->m_len;
2861 m = m->m_next;
2862 }
2863 if (m != NULL) {
		/* The user should have supplied a large enough buffer. */
2865 m_freem(m0);
2866 return(EINVAL);
2867 }
2868 sopt->sopt_valsize = valsize;
2869 return (0);
2870 }
2871
2872 /*
2873 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2874 * out-of-band data, which will then notify socket consumers.
2875 */
2876 void
2877 sohasoutofband(struct socket *so)
2878 {
2879
2880 if (so->so_sigio != NULL)
2881 pgsigio(&so->so_sigio, SIGURG, 0);
2882 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2883 }
2884
2885 int
2886 sopoll(struct socket *so, int events, struct ucred *active_cred,
2887 struct thread *td)
2888 {
2889
2890 /*
2891 * We do not need to set or assert curvnet as long as everyone uses
2892 * sopoll_generic().
2893 */
2894 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
2895 td));
2896 }
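
/*
 * Example usage (a sketch, not compiled in): the file-descriptor layer
 * calls sopoll() from its fo_poll method with the caller's credentials
 * and thread:
 */
#if 0
	revents = sopoll(so, events, fp->f_cred, td);
#endif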
2897
2898 int
2899 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
2900 struct thread *td)
2901 {
2902 int revents = 0;
2903
2904 SOCKBUF_LOCK(&so->so_snd);
2905 SOCKBUF_LOCK(&so->so_rcv);
2906 if (events & (POLLIN | POLLRDNORM))
2907 if (soreadabledata(so))
2908 revents |= events & (POLLIN | POLLRDNORM);
2909
2910 if (events & (POLLOUT | POLLWRNORM))
2911 if (sowriteable(so))
2912 revents |= events & (POLLOUT | POLLWRNORM);
2913
2914 if (events & (POLLPRI | POLLRDBAND))
2915 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
2916 revents |= events & (POLLPRI | POLLRDBAND);
2917
2918 if ((events & POLLINIGNEOF) == 0) {
2919 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2920 revents |= events & (POLLIN | POLLRDNORM);
2921 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
2922 revents |= POLLHUP;
2923 }
2924 }
2925
2926 if (revents == 0) {
2927 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2928 selrecord(td, &so->so_rcv.sb_sel);
2929 so->so_rcv.sb_flags |= SB_SEL;
2930 }
2931
2932 if (events & (POLLOUT | POLLWRNORM)) {
2933 selrecord(td, &so->so_snd.sb_sel);
2934 so->so_snd.sb_flags |= SB_SEL;
2935 }
2936 }
2937
2938 SOCKBUF_UNLOCK(&so->so_rcv);
2939 SOCKBUF_UNLOCK(&so->so_snd);
2940 return (revents);
2941 }
2942
2943 int
2944 soo_kqfilter(struct file *fp, struct knote *kn)
2945 {
2946 struct socket *so = kn->kn_fp->f_data;
2947 struct sockbuf *sb;
2948
2949 switch (kn->kn_filter) {
2950 case EVFILT_READ:
2951 if (so->so_options & SO_ACCEPTCONN)
2952 kn->kn_fop = &solisten_filtops;
2953 else
2954 kn->kn_fop = &soread_filtops;
2955 sb = &so->so_rcv;
2956 break;
2957 case EVFILT_WRITE:
2958 kn->kn_fop = &sowrite_filtops;
2959 sb = &so->so_snd;
2960 break;
2961 default:
2962 return (EINVAL);
2963 }
2964
2965 SOCKBUF_LOCK(sb);
2966 knlist_add(&sb->sb_sel.si_note, kn, 1);
2967 sb->sb_flags |= SB_KNOTE;
2968 SOCKBUF_UNLOCK(sb);
2969 return (0);
2970 }
2971
2972 /*
2973 * Some routines that return EOPNOTSUPP for entry points that are not
2974 * supported by a protocol. Fill in as needed.
2975 */
2976 int
2977 pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2978 {
2979
2980 return EOPNOTSUPP;
2981 }
2982
2983 int
2984 pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
2985 {
2986
2987 return EOPNOTSUPP;
2988 }
2989
2990 int
2991 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
2992 {
2993
2994 return EOPNOTSUPP;
2995 }
2996
2997 int
2998 pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
2999 struct thread *td)
3000 {
3001
3002 return EOPNOTSUPP;
3003 }
3004
3005 int
3006 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3007 {
3008
3009 return EOPNOTSUPP;
3010 }
3011
3012 int
3013 pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3014 struct thread *td)
3015 {
3016
3017 return EOPNOTSUPP;
3018 }
3019
3020 int
3021 pru_connect2_notsupp(struct socket *so1, struct socket *so2)
3022 {
3023
3024 return EOPNOTSUPP;
3025 }
3026
3027 int
3028 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3029 struct ifnet *ifp, struct thread *td)
3030 {
3031
3032 return EOPNOTSUPP;
3033 }
3034
3035 int
3036 pru_disconnect_notsupp(struct socket *so)
3037 {
3038
3039 return EOPNOTSUPP;
3040 }
3041
3042 int
3043 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3044 {
3045
3046 return EOPNOTSUPP;
3047 }
3048
3049 int
3050 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3051 {
3052
3053 return EOPNOTSUPP;
3054 }
3055
3056 int
3057 pru_rcvd_notsupp(struct socket *so, int flags)
3058 {
3059
3060 return EOPNOTSUPP;
3061 }
3062
3063 int
3064 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3065 {
3066
3067 return EOPNOTSUPP;
3068 }
3069
3070 int
3071 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3072 struct sockaddr *addr, struct mbuf *control, struct thread *td)
3073 {
3074
3075 return EOPNOTSUPP;
3076 }
3077
3078 /*
3079 * This isn't really a ``null'' operation, but it's the default one and
3080 * doesn't do anything destructive.
3081 */
3082 int
3083 pru_sense_null(struct socket *so, struct stat *sb)
3084 {
3085
3086 sb->st_blksize = so->so_snd.sb_hiwat;
3087 return 0;
3088 }
3089
3090 int
3091 pru_shutdown_notsupp(struct socket *so)
3092 {
3093
3094 return EOPNOTSUPP;
3095 }
3096
3097 int
3098 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3099 {
3100
3101 return EOPNOTSUPP;
3102 }
3103
3104 int
3105 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3106 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3107 {
3108
3109 return EOPNOTSUPP;
3110 }
3111
3112 int
3113 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3114 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3115 {
3116
3117 return EOPNOTSUPP;
3118 }
3119
3120 int
3121 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3122 struct thread *td)
3123 {
3124
3125 return EOPNOTSUPP;
3126 }
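
/*
 * Example usage (a sketch, not compiled in): a protocol points the entry
 * points it does not implement at these stubs when initializing its
 * struct pr_usrreqs, e.g.:
 */
#if 0
static struct pr_usrreqs myproto_usrreqs = {
	.pru_accept =		pru_accept_notsupp,
	.pru_connect2 =		pru_connect2_notsupp,
	.pru_rcvoob =		pru_rcvoob_notsupp,
	.pru_sense =		pru_sense_null,
	/* ...implemented entry points follow... */
};
#endif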
3127
3128 static void
3129 filt_sordetach(struct knote *kn)
3130 {
3131 struct socket *so = kn->kn_fp->f_data;
3132
3133 SOCKBUF_LOCK(&so->so_rcv);
3134 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
3135 if (knlist_empty(&so->so_rcv.sb_sel.si_note))
3136 so->so_rcv.sb_flags &= ~SB_KNOTE;
3137 SOCKBUF_UNLOCK(&so->so_rcv);
3138 }
3139
3140 /*ARGSUSED*/
3141 static int
3142 filt_soread(struct knote *kn, long hint)
3143 {
3144 struct socket *so;
3145
3146 so = kn->kn_fp->f_data;
3147 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3148
3149 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3150 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3151 kn->kn_flags |= EV_EOF;
3152 kn->kn_fflags = so->so_error;
3153 return (1);
3154 } else if (so->so_error) /* temporary udp error */
3155 return (1);
3156 else if (kn->kn_sfflags & NOTE_LOWAT)
3157 return (kn->kn_data >= kn->kn_sdata);
3158 else
3159 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
3160 }
3161
3162 static void
3163 filt_sowdetach(struct knote *kn)
3164 {
3165 struct socket *so = kn->kn_fp->f_data;
3166
3167 SOCKBUF_LOCK(&so->so_snd);
3168 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
3169 if (knlist_empty(&so->so_snd.sb_sel.si_note))
3170 so->so_snd.sb_flags &= ~SB_KNOTE;
3171 SOCKBUF_UNLOCK(&so->so_snd);
3172 }
3173
3174 /*ARGSUSED*/
3175 static int
3176 filt_sowrite(struct knote *kn, long hint)
3177 {
3178 struct socket *so;
3179
3180 so = kn->kn_fp->f_data;
3181 SOCKBUF_LOCK_ASSERT(&so->so_snd);
3182 kn->kn_data = sbspace(&so->so_snd);
3183 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3184 kn->kn_flags |= EV_EOF;
3185 kn->kn_fflags = so->so_error;
3186 return (1);
3187 } else if (so->so_error) /* temporary udp error */
3188 return (1);
3189 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3190 (so->so_proto->pr_flags & PR_CONNREQUIRED))
3191 return (0);
3192 else if (kn->kn_sfflags & NOTE_LOWAT)
3193 return (kn->kn_data >= kn->kn_sdata);
3194 else
3195 return (kn->kn_data >= so->so_snd.sb_lowat);
3196 }
3197
3198 /*ARGSUSED*/
3199 static int
3200 filt_solisten(struct knote *kn, long hint)
3201 {
3202 struct socket *so = kn->kn_fp->f_data;
3203
3204 kn->kn_data = so->so_qlen;
3205 return (!TAILQ_EMPTY(&so->so_comp));
3206 }
3207
3208 int
3209 socheckuid(struct socket *so, uid_t uid)
3210 {
3211
3212 if (so == NULL)
3213 return (EPERM);
3214 if (so->so_cred->cr_uid != uid)
3215 return (EPERM);
3216 return (0);
3217 }
3218
3219 /*
3220 * These functions are used by protocols to notify the socket layer (and its
3221 * consumers) of state changes in the sockets driven by protocol-side events.
3222 */
3223
3224 /*
3225 * Procedures to manipulate state flags of socket and do appropriate wakeups.
3226 *
3227 * Normal sequence from the active (originating) side is that
3228 * soisconnecting() is called during processing of connect() call, resulting
3229 * in an eventual call to soisconnected() if/when the connection is
3230 * established. When the connection is torn down soisdisconnecting() is
3231 * called during processing of disconnect() call, and soisdisconnected() is
3232 * called when the connection to the peer is totally severed. The semantics
3233 * of these routines are such that connectionless protocols can call
3234 * soisconnected() and soisdisconnected() only, bypassing the in-progress
3235 * calls when setting up a ``connection'' takes no time.
3236 *
3237 * From the passive side, a socket is created with two queues of sockets:
3238 * so_incomp for connections in progress and so_comp for connections already
3239 * made and awaiting user acceptance. As a protocol is preparing incoming
3240 * connections, it creates a socket structure queued on so_incomp by calling
3241 * sonewconn(). When the connection is established, soisconnected() is
3242 * called, and transfers the socket structure to so_comp, making it available
3243 * to accept().
3244 *
3245 * If a socket is closed with sockets on either so_incomp or so_comp, these
3246 * sockets are dropped.
3247 *
3248 * If higher-level protocols are implemented in the kernel, the wakeups done
3249 * here will sometimes cause software-interrupt process scheduling.
3250 */
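
/*
 * A sketch of the usual call order for a connection-oriented protocol:
 *
 *	soisconnecting(so);		during pru_connect()
 *	soisconnected(so);		when the handshake completes
 *	soisdisconnecting(so);		during pru_disconnect()
 *	soisdisconnected(so);		when the peer is fully gone
 *
 * A connectionless protocol would call only soisconnected() and
 * soisdisconnected(), as noted above.
 */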
3251 void
3252 soisconnecting(struct socket *so)
3253 {
3254
3255 SOCK_LOCK(so);
3256 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
3257 so->so_state |= SS_ISCONNECTING;
3258 SOCK_UNLOCK(so);
3259 }
3260
3261 void
3262 soisconnected(struct socket *so)
3263 {
3264 struct socket *head;
3265 int ret;
3266
3267 restart:
3268 ACCEPT_LOCK();
3269 SOCK_LOCK(so);
3270 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
3271 so->so_state |= SS_ISCONNECTED;
3272 head = so->so_head;
3273 if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
3274 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
3275 SOCK_UNLOCK(so);
3276 TAILQ_REMOVE(&head->so_incomp, so, so_list);
3277 head->so_incqlen--;
3278 so->so_qstate &= ~SQ_INCOMP;
3279 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
3280 head->so_qlen++;
3281 so->so_qstate |= SQ_COMP;
3282 ACCEPT_UNLOCK();
3283 sorwakeup(head);
3284 wakeup_one(&head->so_timeo);
3285 } else {
3286 ACCEPT_UNLOCK();
3287 soupcall_set(so, SO_RCV,
3288 head->so_accf->so_accept_filter->accf_callback,
3289 head->so_accf->so_accept_filter_arg);
3290 so->so_options &= ~SO_ACCEPTFILTER;
3291 ret = head->so_accf->so_accept_filter->accf_callback(so,
3292 head->so_accf->so_accept_filter_arg, M_NOWAIT);
3293 if (ret == SU_ISCONNECTED)
3294 soupcall_clear(so, SO_RCV);
3295 SOCK_UNLOCK(so);
3296 if (ret == SU_ISCONNECTED)
3297 goto restart;
3298 }
3299 return;
3300 }
3301 SOCK_UNLOCK(so);
3302 ACCEPT_UNLOCK();
3303 wakeup(&so->so_timeo);
3304 sorwakeup(so);
3305 sowwakeup(so);
3306 }
3307
3308 void
3309 soisdisconnecting(struct socket *so)
3310 {
3311
3312 /*
3313 * Note: This code assumes that SOCK_LOCK(so) and
3314 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3315 */
3316 SOCKBUF_LOCK(&so->so_rcv);
3317 so->so_state &= ~SS_ISCONNECTING;
3318 so->so_state |= SS_ISDISCONNECTING;
3319 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3320 sorwakeup_locked(so);
3321 SOCKBUF_LOCK(&so->so_snd);
3322 so->so_snd.sb_state |= SBS_CANTSENDMORE;
3323 sowwakeup_locked(so);
3324 wakeup(&so->so_timeo);
3325 }
3326
3327 void
3328 soisdisconnected(struct socket *so)
3329 {
3330
3331 /*
3332 * Note: This code assumes that SOCK_LOCK(so) and
3333 * SOCKBUF_LOCK(&so->so_rcv) are the same.
3334 */
3335 SOCKBUF_LOCK(&so->so_rcv);
3336 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
3337 so->so_state |= SS_ISDISCONNECTED;
3338 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
3339 sorwakeup_locked(so);
3340 SOCKBUF_LOCK(&so->so_snd);
3341 so->so_snd.sb_state |= SBS_CANTSENDMORE;
3342 sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
3343 sowwakeup_locked(so);
3344 wakeup(&so->so_timeo);
3345 }
3346
3347 /*
3348 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
3349 */
3350 struct sockaddr *
3351 sodupsockaddr(const struct sockaddr *sa, int mflags)
3352 {
3353 struct sockaddr *sa2;
3354
3355 sa2 = malloc(sa->sa_len, M_SONAME, mflags);
3356 if (sa2)
3357 bcopy(sa, sa2, sa->sa_len);
3358 return sa2;
3359 }
3360
3361 /*
3362 * Register per-socket buffer upcalls.
3363 */
3364 void
3365 soupcall_set(struct socket *so, int which,
3366 int (*func)(struct socket *, void *, int), void *arg)
3367 {
3368 struct sockbuf *sb;
3369
3370 switch (which) {
3371 case SO_RCV:
3372 sb = &so->so_rcv;
3373 break;
3374 case SO_SND:
3375 sb = &so->so_snd;
3376 break;
3377 default:
3378 panic("soupcall_set: bad which");
3379 }
3380 SOCKBUF_LOCK_ASSERT(sb);
3381 #if 0
3382 /* XXX: accf_http actually wants to do this on purpose. */
3383 KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
3384 #endif
3385 sb->sb_upcall = func;
3386 sb->sb_upcallarg = arg;
3387 sb->sb_flags |= SB_UPCALL;
3388 }
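
/*
 * Example usage (a sketch, not compiled in): a kernel consumer registers
 * a receive upcall while holding the sockbuf lock; my_rcv_upcall and
 * my_softc are hypothetical.  Upcalls run with the sockbuf lock held and
 * must not sleep, so defer real work elsewhere and return SU_OK.
 */
#if 0
static int
my_rcv_upcall(struct socket *so, void *arg, int waitflag)
{
	struct my_softc *sc = arg;

	wakeup(sc);		/* kick a worker thread; do not block here */
	return (SU_OK);
}

	SOCKBUF_LOCK(&so->so_rcv);
	soupcall_set(so, SO_RCV, my_rcv_upcall, sc);
	SOCKBUF_UNLOCK(&so->so_rcv);
#endif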
3389
3390 void
3391 soupcall_clear(struct socket *so, int which)
3392 {
3393 struct sockbuf *sb;
3394
3395 switch (which) {
3396 case SO_RCV:
3397 sb = &so->so_rcv;
3398 break;
3399 case SO_SND:
3400 sb = &so->so_snd;
3401 break;
3402 default:
3403 panic("soupcall_clear: bad which");
3404 }
3405 SOCKBUF_LOCK_ASSERT(sb);
3406 KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
3407 sb->sb_upcall = NULL;
3408 sb->sb_upcallarg = NULL;
3409 sb->sb_flags &= ~SB_UPCALL;
3410 }
3411
3412 /*
3413 * Create an external-format (``xsocket'') structure using the information in
3414 * the kernel-format socket structure pointed to by so. This is done to
3415 * reduce the spew of irrelevant information over this interface, to isolate
3416 * user code from changes in the kernel structure, and potentially to provide
3417 * information-hiding if we decide that some of this information should be
3418 * hidden from users.
3419 */
3420 void
3421 sotoxsocket(struct socket *so, struct xsocket *xso)
3422 {
3423
3424 xso->xso_len = sizeof *xso;
3425 xso->xso_so = so;
3426 xso->so_type = so->so_type;
3427 xso->so_options = so->so_options;
3428 xso->so_linger = so->so_linger;
3429 xso->so_state = so->so_state;
3430 xso->so_pcb = so->so_pcb;
3431 xso->xso_protocol = so->so_proto->pr_protocol;
3432 xso->xso_family = so->so_proto->pr_domain->dom_family;
3433 xso->so_qlen = so->so_qlen;
3434 xso->so_incqlen = so->so_incqlen;
3435 xso->so_qlimit = so->so_qlimit;
3436 xso->so_timeo = so->so_timeo;
3437 xso->so_error = so->so_error;
3438 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
3439 xso->so_oobmark = so->so_oobmark;
3440 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
3441 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
3442 xso->so_uid = so->so_cred->cr_uid;
3443 }
3444
3445
/*
 * Socket accessor functions to provide external consumers with a safe
 * interface to socket state.
 */
3451
3452 void
3453 so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
3454 void *arg)
3455 {
3456
3457 TAILQ_FOREACH(so, &so->so_comp, so_list)
3458 func(so, arg);
3459 }
3460
3461 struct sockbuf *
3462 so_sockbuf_rcv(struct socket *so)
3463 {
3464
3465 return (&so->so_rcv);
3466 }
3467
3468 struct sockbuf *
3469 so_sockbuf_snd(struct socket *so)
3470 {
3471
3472 return (&so->so_snd);
3473 }
3474
3475 int
3476 so_state_get(const struct socket *so)
3477 {
3478
3479 return (so->so_state);
3480 }
3481
3482 void
3483 so_state_set(struct socket *so, int val)
3484 {
3485
3486 so->so_state = val;
3487 }
3488
3489 int
3490 so_options_get(const struct socket *so)
3491 {
3492
3493 return (so->so_options);
3494 }
3495
3496 void
3497 so_options_set(struct socket *so, int val)
3498 {
3499
3500 so->so_options = val;
3501 }
3502
3503 int
3504 so_error_get(const struct socket *so)
3505 {
3506
3507 return (so->so_error);
3508 }
3509
3510 void
3511 so_error_set(struct socket *so, int val)
3512 {
3513
3514 so->so_error = val;
3515 }
3516
3517 int
3518 so_linger_get(const struct socket *so)
3519 {
3520
3521 return (so->so_linger);
3522 }
3523
3524 void
3525 so_linger_set(struct socket *so, int val)
3526 {
3527
3528 so->so_linger = val;
3529 }
3530
3531 struct protosw *
3532 so_protosw_get(const struct socket *so)
3533 {
3534
3535 return (so->so_proto);
3536 }
3537
3538 void
3539 so_protosw_set(struct socket *so, struct protosw *val)
3540 {
3541
3542 so->so_proto = val;
3543 }
3544
3545 void
3546 so_sorwakeup(struct socket *so)
3547 {
3548
3549 sorwakeup(so);
3550 }
3551
3552 void
3553 so_sowwakeup(struct socket *so)
3554 {
3555
3556 sowwakeup(so);
3557 }
3558
3559 void
3560 so_sorwakeup_locked(struct socket *so)
3561 {
3562
3563 sorwakeup_locked(so);
3564 }
3565
3566 void
3567 so_sowwakeup_locked(struct socket *so)
3568 {
3569
3570 sowwakeup_locked(so);
3571 }
3572
3573 void
3574 so_lock(struct socket *so)
3575 {
3576
3577 SOCK_LOCK(so);
3578 }
3579
3580 void
3581 so_unlock(struct socket *so)
3582 {
3583
3584 SOCK_UNLOCK(so);
3585 }