The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_socket.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2004 The FreeBSD Foundation
    3  * Copyright (c) 2004-2005 Robert N. M. Watson
    4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  * 4. Neither the name of the University nor the names of its contributors
   16  *    may be used to endorse or promote products derived from this software
   17  *    without specific prior written permission.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  *
   31  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
   32  */
   33 
   34 #include <sys/cdefs.h>
   35 __FBSDID("$FreeBSD: releng/6.4/sys/kern/uipc_socket.c 171938 2007-08-23 18:17:08Z jinmei $");
   36 
   37 #include "opt_inet.h"
   38 #include "opt_mac.h"
   39 #include "opt_zero.h"
   40 #include "opt_compat.h"
   41 
   42 #include <sys/param.h>
   43 #include <sys/systm.h>
   44 #include <sys/fcntl.h>
   45 #include <sys/limits.h>
   46 #include <sys/lock.h>
   47 #include <sys/mac.h>
   48 #include <sys/malloc.h>
   49 #include <sys/mbuf.h>
   50 #include <sys/mutex.h>
   51 #include <sys/domain.h>
   52 #include <sys/file.h>                   /* for struct knote */
   53 #include <sys/kernel.h>
   54 #include <sys/event.h>
   55 #include <sys/poll.h>
   56 #include <sys/proc.h>
   57 #include <sys/protosw.h>
   58 #include <sys/socket.h>
   59 #include <sys/socketvar.h>
   60 #include <sys/resourcevar.h>
   61 #include <sys/signalvar.h>
   62 #include <sys/sysctl.h>
   63 #include <sys/uio.h>
   64 #include <sys/jail.h>
   65 
   66 #include <vm/uma.h>
   67 
   68 #ifdef COMPAT_IA32
   69 #include <sys/mount.h>
   70 #include <compat/freebsd32/freebsd32.h>
   71 
   72 extern struct sysentvec ia32_freebsd_sysvec;
   73 #endif
   74 
   75 static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
   76                     int flags);
   77 
   78 static void     filt_sordetach(struct knote *kn);
   79 static int      filt_soread(struct knote *kn, long hint);
   80 static void     filt_sowdetach(struct knote *kn);
   81 static int      filt_sowrite(struct knote *kn, long hint);
   82 static int      filt_solisten(struct knote *kn, long hint);
   83 
   84 static struct filterops solisten_filtops =
   85         { 1, NULL, filt_sordetach, filt_solisten };
   86 static struct filterops soread_filtops =
   87         { 1, NULL, filt_sordetach, filt_soread };
   88 static struct filterops sowrite_filtops =
   89         { 1, NULL, filt_sowdetach, filt_sowrite };
   90 
   91 uma_zone_t socket_zone;
   92 so_gen_t        so_gencnt;      /* generation count for sockets */
   93 
   94 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
   95 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
   96 
   97 SYSCTL_DECL(_kern_ipc);
   98 
   99 static int somaxconn = SOMAXCONN;
  100 static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
  101 /* XXX: we dont have SYSCTL_USHORT */
  102 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
  103     0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
  104     "queue size");
  105 static int numopensockets;
  106 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
  107     &numopensockets, 0, "Number of open sockets");
  108 #ifdef ZERO_COPY_SOCKETS
  109 /* These aren't static because they're used in other files. */
  110 int so_zero_copy_send = 1;
  111 int so_zero_copy_receive = 1;
  112 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
  113     "Zero copy controls");
  114 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
  115     &so_zero_copy_receive, 0, "Enable zero copy receive");
  116 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
  117     &so_zero_copy_send, 0, "Enable zero copy send");
  118 #endif /* ZERO_COPY_SOCKETS */
  119 
  120 /*
  121  * accept_mtx locks down per-socket fields relating to accept queues.  See
  122  * socketvar.h for an annotation of the protected fields of struct socket.
  123  */
  124 struct mtx accept_mtx;
  125 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
  126 
  127 /*
  128  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
  129  * so_gencnt field.
  130  */
  131 static struct mtx so_global_mtx;
  132 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
  133 
  134 /*
  135  * Socket operation routines.
  136  * These routines are called by the routines in
  137  * sys_socket.c or from a system process, and
  138  * implement the semantics of socket operations by
  139  * switching out to the protocol specific routines.
  140  */
  141 
  142 /*
  143  * Get a socket structure from our zone, and initialize it.
  144  * Note that it would probably be better to allocate socket
  145  * and PCB at the same time, but I'm not convinced that all
  146  * the protocols can be easily modified to do this.
  147  *
  148  * soalloc() returns a socket with a ref count of 0.
  149  */
  150 struct socket *
  151 soalloc(void)
  152 {
  153         struct socket *so;
  154 
  155         so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
  156         if (so == NULL)
  157                 return (NULL);
  158 #ifdef MAC
  159         if (mac_init_socket(so, M_NOWAIT) != 0) {
  160                 uma_zfree(socket_zone, so);
  161                 return (NULL);
  162         }
  163 #endif
  164         SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
  165         SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
  166         TAILQ_INIT(&so->so_aiojobq);
  167         mtx_lock(&so_global_mtx);
  168         so->so_gencnt = ++so_gencnt;
  169         ++numopensockets;
  170         mtx_unlock(&so_global_mtx);
  171         return (so);
  172 }
  173 
  174 /*
  175  * socreate returns a socket with a ref count of 1.  The socket should be
  176  * closed with soclose().
  177  */
  178 int
  179 socreate(dom, aso, type, proto, cred, td)
  180         int dom;
  181         struct socket **aso;
  182         int type;
  183         int proto;
  184         struct ucred *cred;
  185         struct thread *td;
  186 {
  187         struct protosw *prp;
  188         struct socket *so;
  189         int error;
  190 
  191         if (proto)
  192                 prp = pffindproto(dom, proto, type);
  193         else
  194                 prp = pffindtype(dom, type);
  195 
  196         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
  197             prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
  198                 return (EPROTONOSUPPORT);
  199 
  200         if (jailed(cred) && jail_socket_unixiproute_only &&
  201             prp->pr_domain->dom_family != PF_LOCAL &&
  202             prp->pr_domain->dom_family != PF_INET &&
  203             prp->pr_domain->dom_family != PF_ROUTE) {
  204                 return (EPROTONOSUPPORT);
  205         }
  206 
  207         if (prp->pr_type != type)
  208                 return (EPROTOTYPE);
  209         so = soalloc();
  210         if (so == NULL)
  211                 return (ENOBUFS);
  212 
  213         TAILQ_INIT(&so->so_incomp);
  214         TAILQ_INIT(&so->so_comp);
  215         so->so_type = type;
  216         so->so_cred = crhold(cred);
  217         so->so_proto = prp;
  218 #ifdef MAC
  219         mac_create_socket(cred, so);
  220 #endif
  221         knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
  222             NULL, NULL, NULL);
  223         knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
  224             NULL, NULL, NULL);
  225         so->so_count = 1;
  226         error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
  227         if (error) {
  228                 ACCEPT_LOCK();
  229                 SOCK_LOCK(so);
  230                 so->so_state |= SS_NOFDREF;
  231                 sorele(so);
  232                 return (error);
  233         }
  234         *aso = so;
  235         return (0);
  236 }
  237 
  238 int
  239 sobind(so, nam, td)
  240         struct socket *so;
  241         struct sockaddr *nam;
  242         struct thread *td;
  243 {
  244 
  245         return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
  246 }
  247 
  248 void
  249 sodealloc(struct socket *so)
  250 {
  251 
  252         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
  253         mtx_lock(&so_global_mtx);
  254         so->so_gencnt = ++so_gencnt;
  255         mtx_unlock(&so_global_mtx);
  256         if (so->so_rcv.sb_hiwat)
  257                 (void)chgsbsize(so->so_cred->cr_uidinfo,
  258                     &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
  259         if (so->so_snd.sb_hiwat)
  260                 (void)chgsbsize(so->so_cred->cr_uidinfo,
  261                     &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
  262 #ifdef INET
  263         /* remove acccept filter if one is present. */
  264         if (so->so_accf != NULL)
  265                 do_setopt_accept_filter(so, NULL);
  266 #endif
  267 #ifdef MAC
  268         mac_destroy_socket(so);
  269 #endif
  270         crfree(so->so_cred);
  271         SOCKBUF_LOCK_DESTROY(&so->so_snd);
  272         SOCKBUF_LOCK_DESTROY(&so->so_rcv);
  273         uma_zfree(socket_zone, so);
  274         mtx_lock(&so_global_mtx);
  275         --numopensockets;
  276         mtx_unlock(&so_global_mtx);
  277 }
  278 
  279 /*
  280  * solisten() transitions a socket from a non-listening state to a listening
  281  * state, but can also be used to update the listen queue depth on an
  282  * existing listen socket.  The protocol will call back into the sockets
  283  * layer using solisten_proto_check() and solisten_proto() to check and set
  284  * socket-layer listen state.  Call backs are used so that the protocol can
  285  * acquire both protocol and socket layer locks in whatever order is required
  286  * by the protocol.
  287  *
  288  * Protocol implementors are advised to hold the socket lock across the
  289  * socket-layer test and set to avoid races at the socket layer.
  290  */
  291 int
  292 solisten(so, backlog, td)
  293         struct socket *so;
  294         int backlog;
  295         struct thread *td;
  296 {
  297         int error;
  298 
  299         error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
  300         if (error)
  301                 return (error);
  302 
  303         /*
  304          * XXXRW: The following state adjustment should occur in
  305          * solisten_proto(), but we don't currently pass the backlog request
  306          * to the protocol via pru_listen().
  307          */
  308         if (backlog < 0 || backlog > somaxconn)
  309                 backlog = somaxconn;
  310         so->so_qlimit = backlog;
  311         return (0);
  312 }
  313 
  314 int
  315 solisten_proto_check(so)
  316         struct socket *so;
  317 {
  318 
  319         SOCK_LOCK_ASSERT(so);
  320 
  321         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
  322             SS_ISDISCONNECTING))
  323                 return (EINVAL);
  324         return (0);
  325 }
  326 
  327 void
  328 solisten_proto(so)
  329         struct socket *so;
  330 {
  331 
  332         SOCK_LOCK_ASSERT(so);
  333 
  334         so->so_options |= SO_ACCEPTCONN;
  335 }
  336 
  337 /*
  338  * Attempt to free a socket.  This should really be sotryfree().
  339  *
  340  * We free the socket if the protocol is no longer interested in the socket,
  341  * there's no file descriptor reference, and the refcount is 0.  While the
  342  * calling macro sotryfree() tests the refcount, sofree() has to test it
  343  * again as it's possible to race with an accept()ing thread if the socket is
  344  * in an listen queue of a listen socket, as being in the listen queue
  345  * doesn't elevate the reference count.  sofree() acquires the accept mutex
  346  * early for this test in order to avoid that race.
  347  */
  348 void
  349 sofree(so)
  350         struct socket *so;
  351 {
  352         struct socket *head;
  353 
  354         ACCEPT_LOCK_ASSERT();
  355         SOCK_LOCK_ASSERT(so);
  356 
  357         if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
  358             so->so_count != 0) {
  359                 SOCK_UNLOCK(so);
  360                 ACCEPT_UNLOCK();
  361                 return;
  362         }
  363 
  364         head = so->so_head;
  365         if (head != NULL) {
  366                 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
  367                     (so->so_qstate & SQ_INCOMP) != 0,
  368                     ("sofree: so_head != NULL, but neither SQ_COMP nor "
  369                     "SQ_INCOMP"));
  370                 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
  371                     (so->so_qstate & SQ_INCOMP) == 0,
  372                     ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
  373                 /*
  374                  * accept(2) is responsible draining the completed
  375                  * connection queue and freeing those sockets, so
  376                  * we just return here if this socket is currently
  377                  * on the completed connection queue.  Otherwise,
  378                  * accept(2) may hang after select(2) has indicating
  379                  * that a listening socket was ready.  If it's an
  380                  * incomplete connection, we remove it from the queue
  381                  * and free it; otherwise, it won't be released until
  382                  * the listening socket is closed.
  383                  */
  384                 if ((so->so_qstate & SQ_COMP) != 0) {
  385                         SOCK_UNLOCK(so);
  386                         ACCEPT_UNLOCK();
  387                         return;
  388                 }
  389                 TAILQ_REMOVE(&head->so_incomp, so, so_list);
  390                 head->so_incqlen--;
  391                 so->so_qstate &= ~SQ_INCOMP;
  392                 so->so_head = NULL;
  393         }
  394         KASSERT((so->so_qstate & SQ_COMP) == 0 &&
  395             (so->so_qstate & SQ_INCOMP) == 0,
  396             ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
  397             so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
  398         SOCK_UNLOCK(so);
  399         ACCEPT_UNLOCK();
  400         SOCKBUF_LOCK(&so->so_snd);
  401         so->so_snd.sb_flags |= SB_NOINTR;
  402         (void)sblock(&so->so_snd, M_WAITOK);
  403         /*
  404          * socantsendmore_locked() drops the socket buffer mutex so that it
  405          * can safely perform wakeups.  Re-acquire the mutex before
  406          * continuing.
  407          */
  408         socantsendmore_locked(so);
  409         SOCKBUF_LOCK(&so->so_snd);
  410         sbunlock(&so->so_snd);
  411         sbrelease_locked(&so->so_snd, so);
  412         SOCKBUF_UNLOCK(&so->so_snd);
  413         sorflush(so);
  414         knlist_destroy(&so->so_rcv.sb_sel.si_note);
  415         knlist_destroy(&so->so_snd.sb_sel.si_note);
  416         sodealloc(so);
  417 }
  418 
  419 /*
  420  * Close a socket on last file table reference removal.
  421  * Initiate disconnect if connected.
  422  * Free socket when disconnect complete.
  423  *
  424  * This function will sorele() the socket.  Note that soclose() may be
  425  * called prior to the ref count reaching zero.  The actual socket
  426  * structure will not be freed until the ref count reaches zero.
  427  */
  428 int
  429 soclose(so)
  430         struct socket *so;
  431 {
  432         int error = 0;
  433 
  434         KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
  435 
  436         funsetown(&so->so_sigio);
  437         if (so->so_pcb == NULL)
  438                 goto discard;
  439         if (so->so_state & SS_ISCONNECTED) {
  440                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
  441                         error = sodisconnect(so);
  442                         if (error)
  443                                 goto drop;
  444                 }
  445                 if (so->so_options & SO_LINGER) {
  446                         if ((so->so_state & SS_ISDISCONNECTING) &&
  447                             (so->so_state & SS_NBIO))
  448                                 goto drop;
  449                         while (so->so_state & SS_ISCONNECTED) {
  450                                 error = tsleep(&so->so_timeo,
  451                                     PSOCK | PCATCH, "soclos", so->so_linger * hz);
  452                                 if (error)
  453                                         break;
  454                         }
  455                 }
  456         }
  457 drop:
  458         if (so->so_pcb != NULL) {
  459                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
  460                 if (error == 0)
  461                         error = error2;
  462         }
  463         if (so->so_options & SO_ACCEPTCONN) {
  464                 struct socket *sp;
  465                 ACCEPT_LOCK();
  466                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
  467                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
  468                         so->so_incqlen--;
  469                         sp->so_qstate &= ~SQ_INCOMP;
  470                         sp->so_head = NULL;
  471                         ACCEPT_UNLOCK();
  472                         (void) soabort(sp);
  473                         ACCEPT_LOCK();
  474                 }
  475                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
  476                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
  477                         so->so_qlen--;
  478                         sp->so_qstate &= ~SQ_COMP;
  479                         sp->so_head = NULL;
  480                         ACCEPT_UNLOCK();
  481                         (void) soabort(sp);
  482                         ACCEPT_LOCK();
  483                 }
  484                 ACCEPT_UNLOCK();
  485         }
  486 discard:
  487         ACCEPT_LOCK();
  488         SOCK_LOCK(so);
  489         KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
  490         so->so_state |= SS_NOFDREF;
  491         sorele(so);
  492         return (error);
  493 }
  494 
  495 /*
  496  * soabort() must not be called with any socket locks held, as it calls
  497  * into the protocol, which will call back into the socket code causing
  498  * it to acquire additional socket locks that may cause recursion or lock
  499  * order reversals.
  500  */
  501 int
  502 soabort(so)
  503         struct socket *so;
  504 {
  505         int error;
  506 
  507         error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
  508         if (error) {
  509                 ACCEPT_LOCK();
  510                 SOCK_LOCK(so);
  511                 sotryfree(so);  /* note: does not decrement the ref count */
  512                 return error;
  513         }
  514         return (0);
  515 }
  516 
  517 int
  518 soaccept(so, nam)
  519         struct socket *so;
  520         struct sockaddr **nam;
  521 {
  522         int error;
  523 
  524         SOCK_LOCK(so);
  525         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
  526         so->so_state &= ~SS_NOFDREF;
  527         SOCK_UNLOCK(so);
  528         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
  529         return (error);
  530 }
  531 
  532 int
  533 soconnect(so, nam, td)
  534         struct socket *so;
  535         struct sockaddr *nam;
  536         struct thread *td;
  537 {
  538         int error;
  539 
  540         if (so->so_options & SO_ACCEPTCONN)
  541                 return (EOPNOTSUPP);
  542         /*
  543          * If protocol is connection-based, can only connect once.
  544          * Otherwise, if connected, try to disconnect first.
  545          * This allows user to disconnect by connecting to, e.g.,
  546          * a null address.
  547          */
  548         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
  549             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
  550             (error = sodisconnect(so)))) {
  551                 error = EISCONN;
  552         } else {
  553                 /*
  554                  * Prevent accumulated error from previous connection
  555                  * from biting us.
  556                  */
  557                 so->so_error = 0;
  558                 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
  559         }
  560 
  561         return (error);
  562 }
  563 
  564 int
  565 soconnect2(so1, so2)
  566         struct socket *so1;
  567         struct socket *so2;
  568 {
  569 
  570         return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
  571 }
  572 
  573 int
  574 sodisconnect(so)
  575         struct socket *so;
  576 {
  577         int error;
  578 
  579         if ((so->so_state & SS_ISCONNECTED) == 0)
  580                 return (ENOTCONN);
  581         if (so->so_state & SS_ISDISCONNECTING)
  582                 return (EALREADY);
  583         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
  584         return (error);
  585 }
  586 
  587 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
  588 /*
  589  * Send on a socket.
  590  * If send must go all at once and message is larger than
  591  * send buffering, then hard error.
  592  * Lock against other senders.
  593  * If must go all at once and not enough room now, then
  594  * inform user that this would block and do nothing.
  595  * Otherwise, if nonblocking, send as much as possible.
  596  * The data to be sent is described by "uio" if nonzero,
  597  * otherwise by the mbuf chain "top" (which must be null
  598  * if uio is not).  Data provided in mbuf chain must be small
  599  * enough to send all at once.
  600  *
  601  * Returns nonzero on error, timeout or signal; callers
  602  * must check for short counts if EINTR/ERESTART are returned.
  603  * Data and control buffers are freed on return.
  604  */
  605 
  606 #ifdef ZERO_COPY_SOCKETS
  607 struct so_zerocopy_stats{
  608         int size_ok;
  609         int align_ok;
  610         int found_ifp;
  611 };
  612 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
  613 #include <netinet/in.h>
  614 #include <net/route.h>
  615 #include <netinet/in_pcb.h>
  616 #include <vm/vm.h>
  617 #include <vm/vm_page.h>
  618 #include <vm/vm_object.h>
  619 #endif /*ZERO_COPY_SOCKETS*/
  620 
  621 int
  622 sosend(so, addr, uio, top, control, flags, td)
  623         struct socket *so;
  624         struct sockaddr *addr;
  625         struct uio *uio;
  626         struct mbuf *top;
  627         struct mbuf *control;
  628         int flags;
  629         struct thread *td;
  630 {
  631         struct mbuf **mp;
  632         struct mbuf *m;
  633         long space, len = 0, resid;
  634         int clen = 0, error, dontroute;
  635         int atomic = sosendallatonce(so) || top;
  636 #ifdef ZERO_COPY_SOCKETS
  637         int cow_send;
  638 #endif /* ZERO_COPY_SOCKETS */
  639 
  640         if (uio != NULL)
  641                 resid = uio->uio_resid;
  642         else
  643                 resid = top->m_pkthdr.len;
  644         /*
  645          * In theory resid should be unsigned.
  646          * However, space must be signed, as it might be less than 0
  647          * if we over-committed, and we must use a signed comparison
  648          * of space and resid.  On the other hand, a negative resid
  649          * causes us to loop sending 0-length segments to the protocol.
  650          *
  651          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
  652          * type sockets since that's an error.
  653          */
  654         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
  655                 error = EINVAL;
  656                 goto out;
  657         }
  658 
  659         dontroute =
  660             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
  661             (so->so_proto->pr_flags & PR_ATOMIC);
  662         if (td != NULL)
  663                 td->td_proc->p_stats->p_ru.ru_msgsnd++;
  664         if (control != NULL)
  665                 clen = control->m_len;
  666 #define snderr(errno)   { error = (errno); goto release; }
  667 
  668         SOCKBUF_LOCK(&so->so_snd);
  669 restart:
  670         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  671         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
  672         if (error)
  673                 goto out_locked;
  674         do {
  675                 SOCKBUF_LOCK_ASSERT(&so->so_snd);
  676                 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
  677                         snderr(EPIPE);
  678                 if (so->so_error) {
  679                         error = so->so_error;
  680                         so->so_error = 0;
  681                         goto release;
  682                 }
  683                 if ((so->so_state & SS_ISCONNECTED) == 0) {
  684                         /*
  685                          * `sendto' and `sendmsg' are allowed on a connection-
  686                          * based socket if it supports implied connect.
  687                          * Return ENOTCONN if not connected and no address is
  688                          * supplied.
  689                          */
  690                         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
  691                             (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
  692                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
  693                                     !(resid == 0 && clen != 0))
  694                                         snderr(ENOTCONN);
  695                         } else if (addr == NULL)
  696                             snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
  697                                    ENOTCONN : EDESTADDRREQ);
  698                 }
  699                 space = sbspace(&so->so_snd);
  700                 if (flags & MSG_OOB)
  701                         space += 1024;
  702                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
  703                     clen > so->so_snd.sb_hiwat)
  704                         snderr(EMSGSIZE);
  705                 if (space < resid + clen &&
  706                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
  707                         if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
  708                                 snderr(EWOULDBLOCK);
  709                         sbunlock(&so->so_snd);
  710                         error = sbwait(&so->so_snd);
  711                         if (error)
  712                                 goto out_locked;
  713                         goto restart;
  714                 }
  715                 SOCKBUF_UNLOCK(&so->so_snd);
  716                 mp = &top;
  717                 space -= clen;
  718                 do {
  719                     if (uio == NULL) {
  720                         /*
  721                          * Data is prepackaged in "top".
  722                          */
  723                         resid = 0;
  724                         if (flags & MSG_EOR)
  725                                 top->m_flags |= M_EOR;
  726                     } else do {
  727 #ifdef ZERO_COPY_SOCKETS
  728                         cow_send = 0;
  729 #endif /* ZERO_COPY_SOCKETS */
  730                         if (resid >= MINCLSIZE) {
  731 #ifdef ZERO_COPY_SOCKETS
  732                                 if (top == NULL) {
  733                                         MGETHDR(m, M_TRYWAIT, MT_DATA);
  734                                         if (m == NULL) {
  735                                                 error = ENOBUFS;
  736                                                 SOCKBUF_LOCK(&so->so_snd);
  737                                                 goto release;
  738                                         }
  739                                         m->m_pkthdr.len = 0;
  740                                         m->m_pkthdr.rcvif = NULL; 
  741                                 } else {
  742                                         MGET(m, M_TRYWAIT, MT_DATA);
  743                                         if (m == NULL) {
  744                                                 error = ENOBUFS;
  745                                                 SOCKBUF_LOCK(&so->so_snd);
  746                                                 goto release;
  747                                         }
  748                                 }
  749                                 if (so_zero_copy_send &&
  750                                     resid>=PAGE_SIZE &&
  751                                     space>=PAGE_SIZE &&
  752                                     uio->uio_iov->iov_len>=PAGE_SIZE) {
  753                                         so_zerocp_stats.size_ok++;
  754                                         so_zerocp_stats.align_ok++;
  755                                         cow_send = socow_setup(m, uio);
  756                                         len = cow_send;
  757                                 }
  758                                 if (!cow_send) {
  759                                         MCLGET(m, M_TRYWAIT);
  760                                         if ((m->m_flags & M_EXT) == 0) {
  761                                                 m_free(m);
  762                                                 m = NULL;
  763                                         } else {
  764                                                 len = min(min(MCLBYTES, resid), space);
  765                                         }
  766                                 }
  767 #else /* ZERO_COPY_SOCKETS */
  768                                 if (top == NULL) {
  769                                         m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
  770                                         m->m_pkthdr.len = 0;
  771                                         m->m_pkthdr.rcvif = NULL;
  772                                 } else
  773                                         m = m_getcl(M_TRYWAIT, MT_DATA, 0);
  774                                 len = min(min(MCLBYTES, resid), space);
  775 #endif /* ZERO_COPY_SOCKETS */
  776                         } else {
  777                                 if (top == NULL) {
  778                                         m = m_gethdr(M_TRYWAIT, MT_DATA);
  779                                         m->m_pkthdr.len = 0;
  780                                         m->m_pkthdr.rcvif = NULL;
  781 
  782                                         len = min(min(MHLEN, resid), space);
  783                                         /*
  784                                          * For datagram protocols, leave room
  785                                          * for protocol headers in first mbuf.
  786                                          */
  787                                         if (atomic && m && len < MHLEN)
  788                                                 MH_ALIGN(m, len);
  789                                 } else {
  790                                         m = m_get(M_TRYWAIT, MT_DATA);
  791                                         len = min(min(MLEN, resid), space);
  792                                 }
  793                         }
  794                         if (m == NULL) {
  795                                 error = ENOBUFS;
  796                                 SOCKBUF_LOCK(&so->so_snd);
  797                                 goto release;
  798                         }
  799 
  800                         space -= len;
  801 #ifdef ZERO_COPY_SOCKETS
  802                         if (cow_send)
  803                                 error = 0;
  804                         else
  805 #endif /* ZERO_COPY_SOCKETS */
  806                         error = uiomove(mtod(m, void *), (int)len, uio);
  807                         resid = uio->uio_resid;
  808                         m->m_len = len;
  809                         *mp = m;
  810                         top->m_pkthdr.len += len;
  811                         if (error) {
  812                                 SOCKBUF_LOCK(&so->so_snd);
  813                                 goto release;
  814                         }
  815                         mp = &m->m_next;
  816                         if (resid <= 0) {
  817                                 if (flags & MSG_EOR)
  818                                         top->m_flags |= M_EOR;
  819                                 break;
  820                         }
  821                     } while (space > 0 && atomic);
  822                     if (dontroute) {
  823                             SOCK_LOCK(so);
  824                             so->so_options |= SO_DONTROUTE;
  825                             SOCK_UNLOCK(so);
  826                     }
  827                     /*
  828                      * XXX all the SBS_CANTSENDMORE checks previously
  829                      * done could be out of date.  We could have received
  830                      * a reset packet in an interrupt or maybe we slept
  831                      * while doing page faults in uiomove() etc. We could
  832                      * probably recheck again inside the locking protection
  833                      * here, but there are probably other places that this
  834                      * also happens.  We must rethink this.
  835                      */
  836                     error = (*so->so_proto->pr_usrreqs->pru_send)(so,
  837                         (flags & MSG_OOB) ? PRUS_OOB :
  838                         /*
  839                          * If the user set MSG_EOF, the protocol
  840                          * understands this flag and nothing left to
  841                          * send then use PRU_SEND_EOF instead of PRU_SEND.
  842                          */
  843                         ((flags & MSG_EOF) &&
  844                          (so->so_proto->pr_flags & PR_IMPLOPCL) &&
  845                          (resid <= 0)) ?
  846                                 PRUS_EOF :
  847                         /* If there is more to send set PRUS_MORETOCOME */
  848                         (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
  849                         top, addr, control, td);
  850                     if (dontroute) {
  851                             SOCK_LOCK(so);
  852                             so->so_options &= ~SO_DONTROUTE;
  853                             SOCK_UNLOCK(so);
  854                     }
  855                     clen = 0;
  856                     control = NULL;
  857                     top = NULL;
  858                     mp = &top;
  859                     if (error) {
  860                         SOCKBUF_LOCK(&so->so_snd);
  861                         goto release;
  862                     }
  863                 } while (resid && space > 0);
  864                 SOCKBUF_LOCK(&so->so_snd);
  865         } while (resid);
  866 
  867 release:
  868         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  869         sbunlock(&so->so_snd);
  870 out_locked:
  871         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  872         SOCKBUF_UNLOCK(&so->so_snd);
  873 out:
  874         if (top != NULL)
  875                 m_freem(top);
  876         if (control != NULL)
  877                 m_freem(control);
  878         return (error);
  879 }
  880 
  881 /*
  882  * The part of soreceive() that implements reading non-inline out-of-band
  883  * data from a socket.  For more complete comments, see soreceive(), from
  884  * which this code originated.
  885  *
  886  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
  887  * unable to return an mbuf chain to the caller.
  888  */
  889 static int
  890 soreceive_rcvoob(so, uio, flags)
  891         struct socket *so;
  892         struct uio *uio;
  893         int flags;
  894 {
  895         struct protosw *pr = so->so_proto;
  896         struct mbuf *m;
  897         int error;
  898 
  899         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
  900 
  901         m = m_get(M_TRYWAIT, MT_DATA);
  902         if (m == NULL)
  903                 return (ENOBUFS);
  904         error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
  905         if (error)
  906                 goto bad;
  907         do {
  908 #ifdef ZERO_COPY_SOCKETS
  909                 if (so_zero_copy_receive) {
  910                         int disposable;
  911 
  912                         if ((m->m_flags & M_EXT)
  913                          && (m->m_ext.ext_type == EXT_DISPOSABLE))
  914                                 disposable = 1;
  915                         else
  916                                 disposable = 0;
  917 
  918                         error = uiomoveco(mtod(m, void *),
  919                                           min(uio->uio_resid, m->m_len),
  920                                           uio, disposable);
  921                 } else
  922 #endif /* ZERO_COPY_SOCKETS */
  923                 error = uiomove(mtod(m, void *),
  924                     (int) min(uio->uio_resid, m->m_len), uio);
  925                 m = m_free(m);
  926         } while (uio->uio_resid && error == 0 && m);
  927 bad:
  928         if (m != NULL)
  929                 m_freem(m);
  930         return (error);
  931 }
  932 
  933 /*
  934  * Following replacement or removal of the first mbuf on the first mbuf chain
  935  * of a socket buffer, push necessary state changes back into the socket
  936  * buffer so that other consumers see the values consistently.  'nextrecord'
  937  * is the caller's locally stored value of the original value of
  938  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
  939  * NOTE: 'nextrecord' may be NULL.
  940  */
  941 static __inline void
  942 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
  943 {
  944 
  945         SOCKBUF_LOCK_ASSERT(sb);
  946         /*
  947          * First, update for the new value of nextrecord.  If necessary, make
  948          * it the first record.
  949          */
  950         if (sb->sb_mb != NULL)
  951                 sb->sb_mb->m_nextpkt = nextrecord;
  952         else
  953                 sb->sb_mb = nextrecord;
  954 
  955         /*
  956          * Now update any dependent socket buffer fields to reflect the new
  957          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
  958          * addition of a second clause that takes care of the case where
  959          * sb_mb has been updated, but remains the last record.
  960          */
  961         if (sb->sb_mb == NULL) {
  962                 sb->sb_mbtail = NULL;
  963                 sb->sb_lastrecord = NULL;
  964         } else if (sb->sb_mb->m_nextpkt == NULL)
  965                 sb->sb_lastrecord = sb->sb_mb;
  966 }
  967 
  968 
  969 /*
  970  * Implement receive operations on a socket.
  971  * We depend on the way that records are added to the sockbuf
  972  * by sbappend*.  In particular, each record (mbufs linked through m_next)
  973  * must begin with an address if the protocol so specifies,
  974  * followed by an optional mbuf or mbufs containing ancillary data,
  975  * and then zero or more mbufs of data.
  976  * In order to avoid blocking network interrupts for the entire time here,
  977  * we splx() while doing the actual copy to user space.
  978  * Although the sockbuf is locked, new data may still be appended,
  979  * and thus we must maintain consistency of the sockbuf during that time.
  980  *
  981  * The caller may receive the data as a single mbuf chain by supplying
  982  * an mbuf **mp0 for use in returning the chain.  The uio is then used
  983  * only for the count in uio_resid.
  984  */
  985 int
  986 soreceive(so, psa, uio, mp0, controlp, flagsp)
  987         struct socket *so;
  988         struct sockaddr **psa;
  989         struct uio *uio;
  990         struct mbuf **mp0;
  991         struct mbuf **controlp;
  992         int *flagsp;
  993 {
  994         struct mbuf *m, **mp;
  995         int flags, len, error, offset;
  996         struct protosw *pr = so->so_proto;
  997         struct mbuf *nextrecord;
  998         int moff, type = 0;
  999         int orig_resid = uio->uio_resid;
 1000 
 1001         mp = mp0;
 1002         if (psa != NULL)
 1003                 *psa = NULL;
 1004         if (controlp != NULL)
 1005                 *controlp = NULL;
 1006         if (flagsp != NULL)
 1007                 flags = *flagsp &~ MSG_EOR;
 1008         else
 1009                 flags = 0;
 1010         if (flags & MSG_OOB)
 1011                 return (soreceive_rcvoob(so, uio, flags));
 1012         if (mp != NULL)
 1013                 *mp = NULL;
 1014         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
 1015             && uio->uio_resid)
 1016                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
 1017 
 1018         SOCKBUF_LOCK(&so->so_rcv);
 1019 restart:
 1020         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1021         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
 1022         if (error)
 1023                 goto out;
 1024 
 1025         m = so->so_rcv.sb_mb;
 1026         /*
 1027          * If we have less data than requested, block awaiting more
 1028          * (subject to any timeout) if:
 1029          *   1. the current count is less than the low water mark, or
 1030          *   2. MSG_WAITALL is set, and it is possible to do the entire
 1031          *      receive operation at once if we block (resid <= hiwat).
 1032          *   3. MSG_DONTWAIT is not set
 1033          * If MSG_WAITALL is set but resid is larger than the receive buffer,
 1034          * we have to do the receive in sections, and thus risk returning
 1035          * a short count if a timeout or signal occurs after we start.
 1036          */
 1037         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
 1038             so->so_rcv.sb_cc < uio->uio_resid) &&
 1039             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
 1040             ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
 1041             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
 1042                 KASSERT(m != NULL || !so->so_rcv.sb_cc,
 1043                     ("receive: m == %p so->so_rcv.sb_cc == %u",
 1044                     m, so->so_rcv.sb_cc));
 1045                 if (so->so_error) {
 1046                         if (m != NULL)
 1047                                 goto dontblock;
 1048                         error = so->so_error;
 1049                         if ((flags & MSG_PEEK) == 0)
 1050                                 so->so_error = 0;
 1051                         goto release;
 1052                 }
 1053                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1054                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 1055                         if (m)
 1056                                 goto dontblock;
 1057                         else
 1058                                 goto release;
 1059                 }
 1060                 for (; m != NULL; m = m->m_next)
 1061                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
 1062                                 m = so->so_rcv.sb_mb;
 1063                                 goto dontblock;
 1064                         }
 1065                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
 1066                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
 1067                         error = ENOTCONN;
 1068                         goto release;
 1069                 }
 1070                 if (uio->uio_resid == 0)
 1071                         goto release;
 1072                 if ((so->so_state & SS_NBIO) ||
 1073                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
 1074                         error = EWOULDBLOCK;
 1075                         goto release;
 1076                 }
 1077                 SBLASTRECORDCHK(&so->so_rcv);
 1078                 SBLASTMBUFCHK(&so->so_rcv);
 1079                 sbunlock(&so->so_rcv);
 1080                 error = sbwait(&so->so_rcv);
 1081                 if (error)
 1082                         goto out;
 1083                 goto restart;
 1084         }
 1085 dontblock:
 1086         /*
 1087          * From this point onward, we maintain 'nextrecord' as a cache of the
 1088          * pointer to the next record in the socket buffer.  We must keep the
 1089          * various socket buffer pointers and local stack versions of the
 1090          * pointers in sync, pushing out modifications before dropping the
 1091          * socket buffer mutex, and re-reading them when picking it up.
 1092          *
 1093          * Otherwise, we will race with the network stack appending new data
 1094          * or records onto the socket buffer by using inconsistent/stale
 1095          * versions of the field, possibly resulting in socket buffer
 1096          * corruption.
 1097          *
 1098          * By holding the high-level sblock(), we prevent simultaneous
 1099          * readers from pulling off the front of the socket buffer.
 1100          */
 1101         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1102         if (uio->uio_td)
 1103                 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
 1104         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
 1105         SBLASTRECORDCHK(&so->so_rcv);
 1106         SBLASTMBUFCHK(&so->so_rcv);
 1107         nextrecord = m->m_nextpkt;
 1108         if (pr->pr_flags & PR_ADDR) {
 1109                 KASSERT(m->m_type == MT_SONAME,
 1110                     ("m->m_type == %d", m->m_type));
 1111                 orig_resid = 0;
 1112                 if (psa != NULL)
 1113                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
 1114                             M_NOWAIT);
 1115                 if (flags & MSG_PEEK) {
 1116                         m = m->m_next;
 1117                 } else {
 1118                         sbfree(&so->so_rcv, m);
 1119                         so->so_rcv.sb_mb = m_free(m);
 1120                         m = so->so_rcv.sb_mb;
 1121                         sockbuf_pushsync(&so->so_rcv, nextrecord);
 1122                 }
 1123         }
 1124 
 1125         /*
 1126          * Process one or more MT_CONTROL mbufs present before any data mbufs
 1127          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 1128          * just copy the data; if !MSG_PEEK, we call into the protocol to
 1129          * perform externalization (or freeing if controlp == NULL).
 1130          */
 1131         if (m != NULL && m->m_type == MT_CONTROL) {
 1132                 struct mbuf *cm = NULL, *cmn;
 1133                 struct mbuf **cme = &cm;
 1134 
 1135                 do {
 1136                         if (flags & MSG_PEEK) {
 1137                                 if (controlp != NULL) {
 1138                                         *controlp = m_copy(m, 0, m->m_len);
 1139                                         controlp = &(*controlp)->m_next;
 1140                                 }
 1141                                 m = m->m_next;
 1142                         } else {
 1143                                 sbfree(&so->so_rcv, m);
 1144                                 so->so_rcv.sb_mb = m->m_next;
 1145                                 m->m_next = NULL;
 1146                                 *cme = m;
 1147                                 cme = &(*cme)->m_next;
 1148                                 m = so->so_rcv.sb_mb;
 1149                         }
 1150                 } while (m != NULL && m->m_type == MT_CONTROL);
 1151                 if ((flags & MSG_PEEK) == 0)
 1152                         sockbuf_pushsync(&so->so_rcv, nextrecord);
 1153                 while (cm != NULL) {
 1154                         cmn = cm->m_next;
 1155                         cm->m_next = NULL;
 1156                         if (pr->pr_domain->dom_externalize != NULL) {
 1157                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1158                                 error = (*pr->pr_domain->dom_externalize)
 1159                                     (cm, controlp);
 1160                                 SOCKBUF_LOCK(&so->so_rcv);
 1161                         } else if (controlp != NULL)
 1162                                 *controlp = cm;
 1163                         else
 1164                                 m_freem(cm);
 1165                         if (controlp != NULL) {
 1166                                 orig_resid = 0;
 1167                                 while (*controlp != NULL)
 1168                                         controlp = &(*controlp)->m_next;
 1169                         }
 1170                         cm = cmn;
 1171                 }
 1172                 if (m != NULL)
 1173                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
 1174                 else
 1175                         nextrecord = so->so_rcv.sb_mb;
 1176                 orig_resid = 0;
 1177         }
 1178         if (m != NULL) {
 1179                 if ((flags & MSG_PEEK) == 0) {
 1180                         KASSERT(m->m_nextpkt == nextrecord,
 1181                             ("soreceive: post-control, nextrecord !sync"));
 1182                         if (nextrecord == NULL) {
 1183                                 KASSERT(so->so_rcv.sb_mb == m,
 1184                                     ("soreceive: post-control, sb_mb!=m"));
 1185                                 KASSERT(so->so_rcv.sb_lastrecord == m,
 1186                                     ("soreceive: post-control, lastrecord!=m"));
 1187                         }
 1188                 }
 1189                 type = m->m_type;
 1190                 if (type == MT_OOBDATA)
 1191                         flags |= MSG_OOB;
 1192         } else {
 1193                 if ((flags & MSG_PEEK) == 0) {
 1194                         KASSERT(so->so_rcv.sb_mb == nextrecord,
 1195                             ("soreceive: sb_mb != nextrecord"));
 1196                         if (so->so_rcv.sb_mb == NULL) {
 1197                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
 1198                                     ("soreceive: sb_lastercord != NULL"));
 1199                         }
 1200                 }
 1201         }
 1202         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1203         SBLASTRECORDCHK(&so->so_rcv);
 1204         SBLASTMBUFCHK(&so->so_rcv);
 1205 
 1206         /*
 1207          * Now continue to read any data mbufs off of the head of the socket
 1208          * buffer until the read request is satisfied.  Note that 'type' is
 1209          * used to store the type of any mbuf reads that have happened so far
 1210          * such that soreceive() can stop reading if the type changes, which
 1211          * causes soreceive() to return only one of regular data and inline
 1212          * out-of-band data in a single socket receive operation.
 1213          */
 1214         moff = 0;
 1215         offset = 0;
 1216         while (m != NULL && uio->uio_resid > 0 && error == 0) {
 1217                 /*
 1218                  * If the type of mbuf has changed since the last mbuf
 1219                  * examined ('type'), end the receive operation.
 1220                  */
 1221                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1222                 if (m->m_type == MT_OOBDATA) {
 1223                         if (type != MT_OOBDATA)
 1224                                 break;
 1225                 } else if (type == MT_OOBDATA)
 1226                         break;
 1227                 else
 1228                     KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
 1229                         ("m->m_type == %d", m->m_type));
 1230                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
 1231                 len = uio->uio_resid;
 1232                 if (so->so_oobmark && len > so->so_oobmark - offset)
 1233                         len = so->so_oobmark - offset;
 1234                 if (len > m->m_len - moff)
 1235                         len = m->m_len - moff;
 1236                 /*
 1237                  * If mp is set, just pass back the mbufs.
 1238                  * Otherwise copy them out via the uio, then free.
 1239                  * Sockbuf must be consistent here (points to current mbuf,
 1240                  * it points to next record) when we drop priority;
 1241                  * we must note any additions to the sockbuf when we
 1242                  * block interrupts again.
 1243                  */
 1244                 if (mp == NULL) {
 1245                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1246                         SBLASTRECORDCHK(&so->so_rcv);
 1247                         SBLASTMBUFCHK(&so->so_rcv);
 1248                         SOCKBUF_UNLOCK(&so->so_rcv);
 1249 #ifdef ZERO_COPY_SOCKETS
 1250                         if (so_zero_copy_receive) {
 1251                                 int disposable;
 1252 
 1253                                 if ((m->m_flags & M_EXT)
 1254                                  && (m->m_ext.ext_type == EXT_DISPOSABLE))
 1255                                         disposable = 1;
 1256                                 else
 1257                                         disposable = 0;
 1258 
 1259                                 error = uiomoveco(mtod(m, char *) + moff,
 1260                                                   (int)len, uio,
 1261                                                   disposable);
 1262                         } else
 1263 #endif /* ZERO_COPY_SOCKETS */
 1264                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
 1265                         SOCKBUF_LOCK(&so->so_rcv);
 1266                         if (error) {
 1267                                 /*
 1268                                  * If any part of the record has been removed
 1269                                  * (such as the MT_SONAME mbuf, which will
 1270                                  * happen when PR_ADDR, and thus also
 1271                                  * PR_ATOMIC, is set), then drop the entire
 1272                                  * record to maintain the atomicity of the
 1273                                  * receive operation.
 1274                                  */
 1275                                 if (m && pr->pr_flags & PR_ATOMIC &&
 1276                                     ((flags & MSG_PEEK) == 0))
 1277                                         (void)sbdroprecord_locked(&so->so_rcv);
 1278                                 goto release;
 1279                         }
 1280                 } else
 1281                         uio->uio_resid -= len;
 1282                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1283                 if (len == m->m_len - moff) {
 1284                         if (m->m_flags & M_EOR)
 1285                                 flags |= MSG_EOR;
 1286                         if (flags & MSG_PEEK) {
 1287                                 m = m->m_next;
 1288                                 moff = 0;
 1289                         } else {
 1290                                 nextrecord = m->m_nextpkt;
 1291                                 sbfree(&so->so_rcv, m);
 1292                                 if (mp != NULL) {
 1293                                         *mp = m;
 1294                                         mp = &m->m_next;
 1295                                         so->so_rcv.sb_mb = m = m->m_next;
 1296                                         *mp = NULL;
 1297                                 } else {
 1298                                         so->so_rcv.sb_mb = m_free(m);
 1299                                         m = so->so_rcv.sb_mb;
 1300                                 }
 1301                                 if (m != NULL) {
 1302                                         m->m_nextpkt = nextrecord;
 1303                                         if (nextrecord == NULL)
 1304                                                 so->so_rcv.sb_lastrecord = m;
 1305                                 } else {
 1306                                         so->so_rcv.sb_mb = nextrecord;
 1307                                         SB_EMPTY_FIXUP(&so->so_rcv);
 1308                                 }
 1309                                 SBLASTRECORDCHK(&so->so_rcv);
 1310                                 SBLASTMBUFCHK(&so->so_rcv);
 1311                         }
 1312                 } else {
 1313                         if (flags & MSG_PEEK)
 1314                                 moff += len;
 1315                         else {
 1316                                 if (mp != NULL) {
 1317                                         int copy_flag;
 1318 
 1319                                         if (flags & MSG_DONTWAIT)
 1320                                                 copy_flag = M_DONTWAIT;
 1321                                         else
 1322                                                 copy_flag = M_TRYWAIT;
 1323                                         if (copy_flag == M_TRYWAIT)
 1324                                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1325                                         *mp = m_copym(m, 0, len, copy_flag);
 1326                                         if (copy_flag == M_TRYWAIT)
 1327                                                 SOCKBUF_LOCK(&so->so_rcv);
 1328                                         if (*mp == NULL) {
 1329                                                 /*
 1330                                                  * m_copym() couldn't allocate an mbuf. 
 1331                                                  * Adjust uio_resid back (it was adjusted 
 1332                                                  * down by len bytes, which we didn't end 
 1333                                                  * up "copying" over).
 1334                                                  */
 1335                                                 uio->uio_resid += len;
 1336                                                 break;
 1337                                         }
 1338                                 }
 1339                                 m->m_data += len;
 1340                                 m->m_len -= len;
 1341                                 so->so_rcv.sb_cc -= len;
 1342                         }
 1343                 }
 1344                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1345                 if (so->so_oobmark) {
 1346                         if ((flags & MSG_PEEK) == 0) {
 1347                                 so->so_oobmark -= len;
 1348                                 if (so->so_oobmark == 0) {
 1349                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
 1350                                         break;
 1351                                 }
 1352                         } else {
 1353                                 offset += len;
 1354                                 if (offset == so->so_oobmark)
 1355                                         break;
 1356                         }
 1357                 }
 1358                 if (flags & MSG_EOR)
 1359                         break;
 1360                 /*
 1361                  * If the MSG_WAITALL flag is set (for non-atomic socket),
 1362                  * we must not quit until "uio->uio_resid == 0" or an error
 1363                  * termination.  If a signal/timeout occurs, return
 1364                  * with a short count but without error.
 1365                  * Keep sockbuf locked against other readers.
 1366                  */
 1367                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
 1368                     !sosendallatonce(so) && nextrecord == NULL) {
 1369                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1370                         if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
 1371                                 break;
 1372                         /*
 1373                          * Notify the protocol that some data has been
 1374                          * drained before blocking.
 1375                          */
 1376                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
 1377                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1378                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
 1379                                 SOCKBUF_LOCK(&so->so_rcv);
 1380                         }
 1381                         SBLASTRECORDCHK(&so->so_rcv);
 1382                         SBLASTMBUFCHK(&so->so_rcv);
 1383                         error = sbwait(&so->so_rcv);
 1384                         if (error)
 1385                                 goto release;
 1386                         m = so->so_rcv.sb_mb;
 1387                         if (m != NULL)
 1388                                 nextrecord = m->m_nextpkt;
 1389                 }
 1390         }
 1391 
 1392         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1393         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
 1394                 flags |= MSG_TRUNC;
 1395                 if ((flags & MSG_PEEK) == 0)
 1396                         (void) sbdroprecord_locked(&so->so_rcv);
 1397         }
 1398         if ((flags & MSG_PEEK) == 0) {
 1399                 if (m == NULL) {
 1400                         /*
 1401                          * First part is an inline SB_EMPTY_FIXUP().  Second
 1402                          * part makes sure sb_lastrecord is up-to-date if
 1403                          * there is still data in the socket buffer.
 1404                          */
 1405                         so->so_rcv.sb_mb = nextrecord;
 1406                         if (so->so_rcv.sb_mb == NULL) {
 1407                                 so->so_rcv.sb_mbtail = NULL;
 1408                                 so->so_rcv.sb_lastrecord = NULL;
 1409                         } else if (nextrecord->m_nextpkt == NULL)
 1410                                 so->so_rcv.sb_lastrecord = nextrecord;
 1411                 }
 1412                 SBLASTRECORDCHK(&so->so_rcv);
 1413                 SBLASTMBUFCHK(&so->so_rcv);
 1414                 /*
 1415                  * If soreceive() is being done from the socket callback, then 
 1416                  * don't need to generate ACK to peer to update window, since 
 1417                  * ACK will be generated on return to TCP.
 1418                  */
 1419                 if (!(flags & MSG_SOCALLBCK) && 
 1420                     (pr->pr_flags & PR_WANTRCVD) && so->so_pcb) {
 1421                         SOCKBUF_UNLOCK(&so->so_rcv);
 1422                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
 1423                         SOCKBUF_LOCK(&so->so_rcv);
 1424                 }
 1425         }
 1426         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1427         if (orig_resid == uio->uio_resid && orig_resid &&
 1428             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
 1429                 sbunlock(&so->so_rcv);
 1430                 goto restart;
 1431         }
 1432 
 1433         if (flagsp != NULL)
 1434                 *flagsp |= flags;
 1435 release:
 1436         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1437         sbunlock(&so->so_rcv);
 1438 out:
 1439         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1440         SOCKBUF_UNLOCK(&so->so_rcv);
 1441         return (error);
 1442 }
 1443 
 1444 int
 1445 soshutdown(so, how)
 1446         struct socket *so;
 1447         int how;
 1448 {
 1449         struct protosw *pr = so->so_proto;
 1450 
 1451         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
 1452                 return (EINVAL);
 1453 
 1454         if (how != SHUT_WR)
 1455                 sorflush(so);
 1456         if (how != SHUT_RD)
 1457                 return ((*pr->pr_usrreqs->pru_shutdown)(so));
 1458         return (0);
 1459 }
 1460 
 1461 void
 1462 sorflush(so)
 1463         struct socket *so;
 1464 {
 1465         struct sockbuf *sb = &so->so_rcv;
 1466         struct protosw *pr = so->so_proto;
 1467         struct sockbuf asb;
 1468 
 1469         /*
 1470          * XXXRW: This is quite ugly.  Previously, this code made a copy of
 1471          * the socket buffer, then zero'd the original to clear the buffer
 1472          * fields.  However, with mutexes in the socket buffer, this causes
 1473          * problems.  We only clear the zeroable bits of the original;
 1474          * however, we have to initialize and destroy the mutex in the copy
 1475          * so that dom_dispose() and sbrelease() can lock t as needed.
 1476          */
 1477         SOCKBUF_LOCK(sb);
 1478         sb->sb_flags |= SB_NOINTR;
 1479         (void) sblock(sb, M_WAITOK);
 1480         /*
 1481          * socantrcvmore_locked() drops the socket buffer mutex so that it
 1482          * can safely perform wakeups.  Re-acquire the mutex before
 1483          * continuing.
 1484          */
 1485         socantrcvmore_locked(so);
 1486         SOCKBUF_LOCK(sb);
 1487         sbunlock(sb);
 1488         /*
 1489          * Invalidate/clear most of the sockbuf structure, but leave
 1490          * selinfo and mutex data unchanged.
 1491          */
 1492         bzero(&asb, offsetof(struct sockbuf, sb_startzero));
 1493         bcopy(&sb->sb_startzero, &asb.sb_startzero,
 1494             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 1495         bzero(&sb->sb_startzero,
 1496             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 1497         SOCKBUF_UNLOCK(sb);
 1498 
 1499         SOCKBUF_LOCK_INIT(&asb, "so_rcv");
 1500         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
 1501                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
 1502         sbrelease(&asb, so);
 1503         SOCKBUF_LOCK_DESTROY(&asb);
 1504 }
 1505 
 1506 /*
 1507  * Perhaps this routine, and sooptcopyout(), below, ought to come in
 1508  * an additional variant to handle the case where the option value needs
 1509  * to be some kind of integer, but not a specific size.
 1510  * In addition to their use here, these functions are also called by the
 1511  * protocol-level pr_ctloutput() routines.
 1512  */
 1513 int
 1514 sooptcopyin(sopt, buf, len, minlen)
 1515         struct  sockopt *sopt;
 1516         void    *buf;
 1517         size_t  len;
 1518         size_t  minlen;
 1519 {
 1520         size_t  valsize;
 1521 
 1522         /*
 1523          * If the user gives us more than we wanted, we ignore it,
 1524          * but if we don't get the minimum length the caller
 1525          * wants, we return EINVAL.  On success, sopt->sopt_valsize
 1526          * is set to however much we actually retrieved.
 1527          */
 1528         if ((valsize = sopt->sopt_valsize) < minlen)
 1529                 return EINVAL;
 1530         if (valsize > len)
 1531                 sopt->sopt_valsize = valsize = len;
 1532 
 1533         if (sopt->sopt_td != NULL)
 1534                 return (copyin(sopt->sopt_val, buf, valsize));
 1535 
 1536         bcopy(sopt->sopt_val, buf, valsize);
 1537         return 0;
 1538 }
 1539 
 1540 /*
 1541  * Kernel version of setsockopt(2)/
 1542  * XXX: optlen is size_t, not socklen_t
 1543  */
 1544 int
 1545 so_setsockopt(struct socket *so, int level, int optname, void *optval,
 1546     size_t optlen)
 1547 {
 1548         struct sockopt sopt;
 1549 
 1550         sopt.sopt_level = level;
 1551         sopt.sopt_name = optname;
 1552         sopt.sopt_dir = SOPT_SET;
 1553         sopt.sopt_val = optval;
 1554         sopt.sopt_valsize = optlen;
 1555         sopt.sopt_td = NULL;
 1556         return (sosetopt(so, &sopt));
 1557 }
 1558 
 1559 int
 1560 sosetopt(so, sopt)
 1561         struct socket *so;
 1562         struct sockopt *sopt;
 1563 {
 1564         int     error, optval;
 1565         struct  linger l;
 1566         struct  timeval tv;
 1567         u_long  val;
 1568 #ifdef MAC
 1569         struct mac extmac;
 1570 #endif
 1571 
 1572         error = 0;
 1573         if (sopt->sopt_level != SOL_SOCKET) {
 1574                 if (so->so_proto && so->so_proto->pr_ctloutput)
 1575                         return ((*so->so_proto->pr_ctloutput)
 1576                                   (so, sopt));
 1577                 error = ENOPROTOOPT;
 1578         } else {
 1579                 switch (sopt->sopt_name) {
 1580 #ifdef INET
 1581                 case SO_ACCEPTFILTER:
 1582                         error = do_setopt_accept_filter(so, sopt);
 1583                         if (error)
 1584                                 goto bad;
 1585                         break;
 1586 #endif
 1587                 case SO_LINGER:
 1588                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
 1589                         if (error)
 1590                                 goto bad;
 1591 
 1592                         SOCK_LOCK(so);
 1593                         so->so_linger = l.l_linger;
 1594                         if (l.l_onoff)
 1595                                 so->so_options |= SO_LINGER;
 1596                         else
 1597                                 so->so_options &= ~SO_LINGER;
 1598                         SOCK_UNLOCK(so);
 1599                         break;
 1600 
 1601                 case SO_DEBUG:
 1602                 case SO_KEEPALIVE:
 1603                 case SO_DONTROUTE:
 1604                 case SO_USELOOPBACK:
 1605                 case SO_BROADCAST:
 1606                 case SO_REUSEADDR:
 1607                 case SO_REUSEPORT:
 1608                 case SO_OOBINLINE:
 1609                 case SO_TIMESTAMP:
 1610                 case SO_BINTIME:
 1611                 case SO_NOSIGPIPE:
 1612                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1613                                             sizeof optval);
 1614                         if (error)
 1615                                 goto bad;
 1616                         SOCK_LOCK(so);
 1617                         if (optval)
 1618                                 so->so_options |= sopt->sopt_name;
 1619                         else
 1620                                 so->so_options &= ~sopt->sopt_name;
 1621                         SOCK_UNLOCK(so);
 1622                         break;
 1623 
 1624                 case SO_SNDBUF:
 1625                 case SO_RCVBUF:
 1626                 case SO_SNDLOWAT:
 1627                 case SO_RCVLOWAT:
 1628                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1629                                             sizeof optval);
 1630                         if (error)
 1631                                 goto bad;
 1632 
 1633                         /*
 1634                          * Values < 1 make no sense for any of these
 1635                          * options, so disallow them.
 1636                          */
 1637                         if (optval < 1) {
 1638                                 error = EINVAL;
 1639                                 goto bad;
 1640                         }
 1641 
 1642                         switch (sopt->sopt_name) {
 1643                         case SO_SNDBUF:
 1644                         case SO_RCVBUF:
 1645                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
 1646                                     &so->so_snd : &so->so_rcv, (u_long)optval,
 1647                                     so, curthread) == 0) {
 1648                                         error = ENOBUFS;
 1649                                         goto bad;
 1650                                 }
 1651                                 break;
 1652 
 1653                         /*
 1654                          * Make sure the low-water is never greater than
 1655                          * the high-water.
 1656                          */
 1657                         case SO_SNDLOWAT:
 1658                                 SOCKBUF_LOCK(&so->so_snd);
 1659                                 so->so_snd.sb_lowat =
 1660                                     (optval > so->so_snd.sb_hiwat) ?
 1661                                     so->so_snd.sb_hiwat : optval;
 1662                                 SOCKBUF_UNLOCK(&so->so_snd);
 1663                                 break;
 1664                         case SO_RCVLOWAT:
 1665                                 SOCKBUF_LOCK(&so->so_rcv);
 1666                                 so->so_rcv.sb_lowat =
 1667                                     (optval > so->so_rcv.sb_hiwat) ?
 1668                                     so->so_rcv.sb_hiwat : optval;
 1669                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1670                                 break;
 1671                         }
 1672                         break;
 1673 
 1674                 case SO_SNDTIMEO:
 1675                 case SO_RCVTIMEO:
 1676 #ifdef COMPAT_IA32
 1677                         if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
 1678                                 struct timeval32 tv32;
 1679 
 1680                                 error = sooptcopyin(sopt, &tv32, sizeof tv32,
 1681                                     sizeof tv32);
 1682                                 CP(tv32, tv, tv_sec);
 1683                                 CP(tv32, tv, tv_usec);
 1684                         } else
 1685 #endif
 1686                                 error = sooptcopyin(sopt, &tv, sizeof tv,
 1687                                     sizeof tv);
 1688                         if (error)
 1689                                 goto bad;
 1690 
 1691                         /* assert(hz > 0); */
 1692                         if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
 1693                             tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
 1694                                 error = EDOM;
 1695                                 goto bad;
 1696                         }
 1697                         /* assert(tick > 0); */
 1698                         /* assert(ULONG_MAX - INT_MAX >= 1000000); */
 1699                         val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
 1700                         if (val > INT_MAX) {
 1701                                 error = EDOM;
 1702                                 goto bad;
 1703                         }
 1704                         if (val == 0 && tv.tv_usec != 0)
 1705                                 val = 1;
 1706 
 1707                         switch (sopt->sopt_name) {
 1708                         case SO_SNDTIMEO:
 1709                                 so->so_snd.sb_timeo = val;
 1710                                 break;
 1711                         case SO_RCVTIMEO:
 1712                                 so->so_rcv.sb_timeo = val;
 1713                                 break;
 1714                         }
 1715                         break;
 1716 
 1717                 case SO_LABEL:
 1718 #ifdef MAC
 1719                         error = sooptcopyin(sopt, &extmac, sizeof extmac,
 1720                             sizeof extmac);
 1721                         if (error)
 1722                                 goto bad;
 1723                         error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
 1724                             so, &extmac);
 1725 #else
 1726                         error = EOPNOTSUPP;
 1727 #endif
 1728                         break;
 1729 
 1730                 default:
 1731                         error = ENOPROTOOPT;
 1732                         break;
 1733                 }
 1734                 if (error == 0 && so->so_proto != NULL &&
 1735                     so->so_proto->pr_ctloutput != NULL) {
 1736                         (void) ((*so->so_proto->pr_ctloutput)
 1737                                   (so, sopt));
 1738                 }
 1739         }
 1740 bad:
 1741         return (error);
 1742 }
 1743 
 1744 /* Helper routine for getsockopt */
 1745 int
 1746 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
 1747 {
 1748         int     error;
 1749         size_t  valsize;
 1750 
 1751         error = 0;
 1752 
 1753         /*
 1754          * Documented get behavior is that we always return a value,
 1755          * possibly truncated to fit in the user's buffer.
 1756          * Traditional behavior is that we always tell the user
 1757          * precisely how much we copied, rather than something useful
 1758          * like the total amount we had available for her.
 1759          * Note that this interface is not idempotent; the entire answer must
 1760          * generated ahead of time.
 1761          */
 1762         valsize = min(len, sopt->sopt_valsize);
 1763         sopt->sopt_valsize = valsize;
 1764         if (sopt->sopt_val != NULL) {
 1765                 if (sopt->sopt_td != NULL)
 1766                         error = copyout(buf, sopt->sopt_val, valsize);
 1767                 else
 1768                         bcopy(buf, sopt->sopt_val, valsize);
 1769         }
 1770         return error;
 1771 }
 1772 
 1773 int
 1774 sogetopt(so, sopt)
 1775         struct socket *so;
 1776         struct sockopt *sopt;
 1777 {
 1778         int     error, optval;
 1779         struct  linger l;
 1780         struct  timeval tv;
 1781 #ifdef MAC
 1782         struct mac extmac;
 1783 #endif
 1784 
 1785         error = 0;
 1786         if (sopt->sopt_level != SOL_SOCKET) {
 1787                 if (so->so_proto && so->so_proto->pr_ctloutput) {
 1788                         return ((*so->so_proto->pr_ctloutput)
 1789                                   (so, sopt));
 1790                 } else
 1791                         return (ENOPROTOOPT);
 1792         } else {
 1793                 switch (sopt->sopt_name) {
 1794 #ifdef INET
 1795                 case SO_ACCEPTFILTER:
 1796                         error = do_getopt_accept_filter(so, sopt);
 1797                         break;
 1798 #endif
 1799                 case SO_LINGER:
 1800                         SOCK_LOCK(so);
 1801                         l.l_onoff = so->so_options & SO_LINGER;
 1802                         l.l_linger = so->so_linger;
 1803                         SOCK_UNLOCK(so);
 1804                         error = sooptcopyout(sopt, &l, sizeof l);
 1805                         break;
 1806 
 1807                 case SO_USELOOPBACK:
 1808                 case SO_DONTROUTE:
 1809                 case SO_DEBUG:
 1810                 case SO_KEEPALIVE:
 1811                 case SO_REUSEADDR:
 1812                 case SO_REUSEPORT:
 1813                 case SO_BROADCAST:
 1814                 case SO_OOBINLINE:
 1815                 case SO_ACCEPTCONN:
 1816                 case SO_TIMESTAMP:
 1817                 case SO_BINTIME:
 1818                 case SO_NOSIGPIPE:
 1819                         optval = so->so_options & sopt->sopt_name;
 1820 integer:
 1821                         error = sooptcopyout(sopt, &optval, sizeof optval);
 1822                         break;
 1823 
 1824                 case SO_TYPE:
 1825                         optval = so->so_type;
 1826                         goto integer;
 1827 
 1828                 case SO_ERROR:
 1829                         optval = so->so_error;
 1830                         so->so_error = 0;
 1831                         goto integer;
 1832 
 1833                 case SO_SNDBUF:
 1834                         optval = so->so_snd.sb_hiwat;
 1835                         goto integer;
 1836 
 1837                 case SO_RCVBUF:
 1838                         optval = so->so_rcv.sb_hiwat;
 1839                         goto integer;
 1840 
 1841                 case SO_SNDLOWAT:
 1842                         optval = so->so_snd.sb_lowat;
 1843                         goto integer;
 1844 
 1845                 case SO_RCVLOWAT:
 1846                         optval = so->so_rcv.sb_lowat;
 1847                         goto integer;
 1848 
 1849                 case SO_SNDTIMEO:
 1850                 case SO_RCVTIMEO:
 1851                         optval = (sopt->sopt_name == SO_SNDTIMEO ?
 1852                                   so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
 1853 
 1854                         tv.tv_sec = optval / hz;
 1855                         tv.tv_usec = (optval % hz) * tick;
 1856 #ifdef COMPAT_IA32
 1857                         if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
 1858                                 struct timeval32 tv32;
 1859 
 1860                                 CP(tv, tv32, tv_sec);
 1861                                 CP(tv, tv32, tv_usec);
 1862                                 error = sooptcopyout(sopt, &tv32, sizeof tv32);
 1863                         } else
 1864 #endif
 1865                                 error = sooptcopyout(sopt, &tv, sizeof tv);
 1866                         break;
 1867 
 1868                 case SO_LABEL:
 1869 #ifdef MAC
 1870                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
 1871                             sizeof(extmac));
 1872                         if (error)
 1873                                 return (error);
 1874                         error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
 1875                             so, &extmac);
 1876                         if (error)
 1877                                 return (error);
 1878                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
 1879 #else
 1880                         error = EOPNOTSUPP;
 1881 #endif
 1882                         break;
 1883 
 1884                 case SO_PEERLABEL:
 1885 #ifdef MAC
 1886                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
 1887                             sizeof(extmac));
 1888                         if (error)
 1889                                 return (error);
 1890                         error = mac_getsockopt_peerlabel(
 1891                             sopt->sopt_td->td_ucred, so, &extmac);
 1892                         if (error)
 1893                                 return (error);
 1894                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
 1895 #else
 1896                         error = EOPNOTSUPP;
 1897 #endif
 1898                         break;
 1899 
 1900                 case SO_LISTENQLIMIT:
 1901                         optval = so->so_qlimit;
 1902                         goto integer;
 1903 
 1904                 case SO_LISTENQLEN:
 1905                         optval = so->so_qlen;
 1906                         goto integer;
 1907 
 1908                 case SO_LISTENINCQLEN:
 1909                         optval = so->so_incqlen;
 1910                         goto integer;
 1911 
 1912                 default:
 1913                         error = ENOPROTOOPT;
 1914                         break;
 1915                 }
 1916                 return (error);
 1917         }
 1918 }
 1919 
 1920 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
 1921 int
 1922 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
 1923 {
 1924         struct mbuf *m, *m_prev;
 1925         int sopt_size = sopt->sopt_valsize;
 1926 
 1927         MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
 1928         if (m == NULL)
 1929                 return ENOBUFS;
 1930         if (sopt_size > MLEN) {
 1931                 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
 1932                 if ((m->m_flags & M_EXT) == 0) {
 1933                         m_free(m);
 1934                         return ENOBUFS;
 1935                 }
 1936                 m->m_len = min(MCLBYTES, sopt_size);
 1937         } else {
 1938                 m->m_len = min(MLEN, sopt_size);
 1939         }
 1940         sopt_size -= m->m_len;
 1941         *mp = m;
 1942         m_prev = m;
 1943 
 1944         while (sopt_size) {
 1945                 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
 1946                 if (m == NULL) {
 1947                         m_freem(*mp);
 1948                         return ENOBUFS;
 1949                 }
 1950                 if (sopt_size > MLEN) {
 1951                         MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
 1952                             M_DONTWAIT);
 1953                         if ((m->m_flags & M_EXT) == 0) {
 1954                                 m_freem(m);
 1955                                 m_freem(*mp);
 1956                                 return ENOBUFS;
 1957                         }
 1958                         m->m_len = min(MCLBYTES, sopt_size);
 1959                 } else {
 1960                         m->m_len = min(MLEN, sopt_size);
 1961                 }
 1962                 sopt_size -= m->m_len;
 1963                 m_prev->m_next = m;
 1964                 m_prev = m;
 1965         }
 1966         return 0;
 1967 }
 1968 
 1969 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
 1970 int
 1971 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 1972 {
 1973         struct mbuf *m0 = m;
 1974 
 1975         if (sopt->sopt_val == NULL)
 1976                 return 0;
 1977         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 1978                 if (sopt->sopt_td != NULL) {
 1979                         int error;
 1980 
 1981                         error = copyin(sopt->sopt_val, mtod(m, char *),
 1982                                        m->m_len);
 1983                         if (error != 0) {
 1984                                 m_freem(m0);
 1985                                 return(error);
 1986                         }
 1987                 } else
 1988                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
 1989                 sopt->sopt_valsize -= m->m_len;
 1990                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 1991                 m = m->m_next;
 1992         }
 1993         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
 1994                 panic("ip6_sooptmcopyin");
 1995         return 0;
 1996 }
 1997 
 1998 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
 1999 int
 2000 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
 2001 {
 2002         struct mbuf *m0 = m;
 2003         size_t valsize = 0;
 2004 
 2005         if (sopt->sopt_val == NULL)
 2006                 return 0;
 2007         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 2008                 if (sopt->sopt_td != NULL) {
 2009                         int error;
 2010 
 2011                         error = copyout(mtod(m, char *), sopt->sopt_val,
 2012                                        m->m_len);
 2013                         if (error != 0) {
 2014                                 m_freem(m0);
 2015                                 return(error);
 2016                         }
 2017                 } else
 2018                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
 2019                sopt->sopt_valsize -= m->m_len;
 2020                sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 2021                valsize += m->m_len;
 2022                m = m->m_next;
 2023         }
 2024         if (m != NULL) {
 2025                 /* enough soopt buffer should be given from user-land */
 2026                 m_freem(m0);
 2027                 return(EINVAL);
 2028         }
 2029         sopt->sopt_valsize = valsize;
 2030         return 0;
 2031 }
 2032 
 2033 void
 2034 sohasoutofband(so)
 2035         struct socket *so;
 2036 {
 2037         if (so->so_sigio != NULL)
 2038                 pgsigio(&so->so_sigio, SIGURG, 0);
 2039         selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
 2040 }
 2041 
 2042 int
 2043 sopoll(struct socket *so, int events, struct ucred *active_cred,
 2044     struct thread *td)
 2045 {
 2046         int revents = 0;
 2047 
 2048         SOCKBUF_LOCK(&so->so_snd);
 2049         SOCKBUF_LOCK(&so->so_rcv);
 2050         if (events & (POLLIN | POLLRDNORM))
 2051                 if (soreadable(so))
 2052                         revents |= events & (POLLIN | POLLRDNORM);
 2053 
 2054         if (events & POLLINIGNEOF)
 2055                 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
 2056                     !TAILQ_EMPTY(&so->so_comp) || so->so_error)
 2057                         revents |= POLLINIGNEOF;
 2058 
 2059         if (events & (POLLOUT | POLLWRNORM))
 2060                 if (sowriteable(so))
 2061                         revents |= events & (POLLOUT | POLLWRNORM);
 2062 
 2063         if (events & (POLLPRI | POLLRDBAND))
 2064                 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
 2065                         revents |= events & (POLLPRI | POLLRDBAND);
 2066 
 2067         if (revents == 0) {
 2068                 if (events &
 2069                     (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
 2070                      POLLRDBAND)) {
 2071                         selrecord(td, &so->so_rcv.sb_sel);
 2072                         so->so_rcv.sb_flags |= SB_SEL;
 2073                 }
 2074 
 2075                 if (events & (POLLOUT | POLLWRNORM)) {
 2076                         selrecord(td, &so->so_snd.sb_sel);
 2077                         so->so_snd.sb_flags |= SB_SEL;
 2078                 }
 2079         }
 2080 
 2081         SOCKBUF_UNLOCK(&so->so_rcv);
 2082         SOCKBUF_UNLOCK(&so->so_snd);
 2083         return (revents);
 2084 }
 2085 
 2086 int
 2087 soo_kqfilter(struct file *fp, struct knote *kn)
 2088 {
 2089         struct socket *so = kn->kn_fp->f_data;
 2090         struct sockbuf *sb;
 2091 
 2092         switch (kn->kn_filter) {
 2093         case EVFILT_READ:
 2094                 if (so->so_options & SO_ACCEPTCONN)
 2095                         kn->kn_fop = &solisten_filtops;
 2096                 else
 2097                         kn->kn_fop = &soread_filtops;
 2098                 sb = &so->so_rcv;
 2099                 break;
 2100         case EVFILT_WRITE:
 2101                 kn->kn_fop = &sowrite_filtops;
 2102                 sb = &so->so_snd;
 2103                 break;
 2104         default:
 2105                 return (EINVAL);
 2106         }
 2107 
 2108         SOCKBUF_LOCK(sb);
 2109         knlist_add(&sb->sb_sel.si_note, kn, 1);
 2110         sb->sb_flags |= SB_KNOTE;
 2111         SOCKBUF_UNLOCK(sb);
 2112         return (0);
 2113 }
 2114 
 2115 static void
 2116 filt_sordetach(struct knote *kn)
 2117 {
 2118         struct socket *so = kn->kn_fp->f_data;
 2119 
 2120         SOCKBUF_LOCK(&so->so_rcv);
 2121         knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
 2122         if (knlist_empty(&so->so_rcv.sb_sel.si_note))
 2123                 so->so_rcv.sb_flags &= ~SB_KNOTE;
 2124         SOCKBUF_UNLOCK(&so->so_rcv);
 2125 }
 2126 
 2127 /*ARGSUSED*/
 2128 static int
 2129 filt_soread(struct knote *kn, long hint)
 2130 {
 2131         struct socket *so;
 2132 
 2133         so = kn->kn_fp->f_data;
 2134         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 2135 
 2136         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
 2137         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 2138                 kn->kn_flags |= EV_EOF;
 2139                 kn->kn_fflags = so->so_error;
 2140                 return (1);
 2141         } else if (so->so_error)        /* temporary udp error */
 2142                 return (1);
 2143         else if (kn->kn_sfflags & NOTE_LOWAT)
 2144                 return (kn->kn_data >= kn->kn_sdata);
 2145         else
 2146                 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
 2147 }
 2148 
 2149 static void
 2150 filt_sowdetach(struct knote *kn)
 2151 {
 2152         struct socket *so = kn->kn_fp->f_data;
 2153 
 2154         SOCKBUF_LOCK(&so->so_snd);
 2155         knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
 2156         if (knlist_empty(&so->so_snd.sb_sel.si_note))
 2157                 so->so_snd.sb_flags &= ~SB_KNOTE;
 2158         SOCKBUF_UNLOCK(&so->so_snd);
 2159 }
 2160 
 2161 /*ARGSUSED*/
 2162 static int
 2163 filt_sowrite(struct knote *kn, long hint)
 2164 {
 2165         struct socket *so;
 2166 
 2167         so = kn->kn_fp->f_data;
 2168         SOCKBUF_LOCK_ASSERT(&so->so_snd);
 2169         kn->kn_data = sbspace(&so->so_snd);
 2170         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 2171                 kn->kn_flags |= EV_EOF;
 2172                 kn->kn_fflags = so->so_error;
 2173                 return (1);
 2174         } else if (so->so_error)        /* temporary udp error */
 2175                 return (1);
 2176         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
 2177             (so->so_proto->pr_flags & PR_CONNREQUIRED))
 2178                 return (0);
 2179         else if (kn->kn_sfflags & NOTE_LOWAT)
 2180                 return (kn->kn_data >= kn->kn_sdata);
 2181         else
 2182                 return (kn->kn_data >= so->so_snd.sb_lowat);
 2183 }
 2184 
 2185 /*ARGSUSED*/
 2186 static int
 2187 filt_solisten(struct knote *kn, long hint)
 2188 {
 2189         struct socket *so = kn->kn_fp->f_data;
 2190 
 2191         kn->kn_data = so->so_qlen;
 2192         return (! TAILQ_EMPTY(&so->so_comp));
 2193 }
 2194 
 2195 int
 2196 socheckuid(struct socket *so, uid_t uid)
 2197 {
 2198 
 2199         if (so == NULL)
 2200                 return (EPERM);
 2201         if (so->so_cred->cr_uid != uid)
 2202                 return (EPERM);
 2203         return (0);
 2204 }
 2205 
 2206 static int
 2207 somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
 2208 {
 2209         int error;
 2210         int val;
 2211 
 2212         val = somaxconn;
 2213         error = sysctl_handle_int(oidp, &val, sizeof(int), req);
 2214         if (error || !req->newptr )
 2215                 return (error);
 2216 
 2217         if (val < 1 || val > USHRT_MAX)
 2218                 return (EINVAL);
 2219 
 2220         somaxconn = val;
 2221         return (0);
 2222 }

Cache object: 5a314967918971c886190f070c0de28d


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.