The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_socket.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2004 The FreeBSD Foundation
    3  * Copyright (c) 2004-2005 Robert N. M. Watson
    4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  * 4. Neither the name of the University nor the names of its contributors
   16  *    may be used to endorse or promote products derived from this software
   17  *    without specific prior written permission.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  *
   31  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
   32  */
   33 
   34 #include <sys/cdefs.h>
   35 __FBSDID("$FreeBSD: releng/6.0/sys/kern/uipc_socket.c 151016 2005-10-06 18:31:38Z delphij $");
   36 
   37 #include "opt_inet.h"
   38 #include "opt_mac.h"
   39 #include "opt_zero.h"
   40 
   41 #include <sys/param.h>
   42 #include <sys/systm.h>
   43 #include <sys/fcntl.h>
   44 #include <sys/limits.h>
   45 #include <sys/lock.h>
   46 #include <sys/mac.h>
   47 #include <sys/malloc.h>
   48 #include <sys/mbuf.h>
   49 #include <sys/mutex.h>
   50 #include <sys/domain.h>
   51 #include <sys/file.h>                   /* for struct knote */
   52 #include <sys/kernel.h>
   53 #include <sys/event.h>
   54 #include <sys/poll.h>
   55 #include <sys/proc.h>
   56 #include <sys/protosw.h>
   57 #include <sys/socket.h>
   58 #include <sys/socketvar.h>
   59 #include <sys/resourcevar.h>
   60 #include <sys/signalvar.h>
   61 #include <sys/sysctl.h>
   62 #include <sys/uio.h>
   63 #include <sys/jail.h>
   64 
   65 #include <vm/uma.h>
   66 
   67 
   68 static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
   69                     int flags);
   70 
   71 static void     filt_sordetach(struct knote *kn);
   72 static int      filt_soread(struct knote *kn, long hint);
   73 static void     filt_sowdetach(struct knote *kn);
   74 static int      filt_sowrite(struct knote *kn, long hint);
   75 static int      filt_solisten(struct knote *kn, long hint);
   76 
   77 static struct filterops solisten_filtops =
   78         { 1, NULL, filt_sordetach, filt_solisten };
   79 static struct filterops soread_filtops =
   80         { 1, NULL, filt_sordetach, filt_soread };
   81 static struct filterops sowrite_filtops =
   82         { 1, NULL, filt_sowdetach, filt_sowrite };
   83 
   84 uma_zone_t socket_zone;
   85 so_gen_t        so_gencnt;      /* generation count for sockets */
   86 
   87 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
   88 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
   89 
   90 SYSCTL_DECL(_kern_ipc);
   91 
   92 static int somaxconn = SOMAXCONN;
   93 static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
   94 /* XXX: we dont have SYSCTL_USHORT */
   95 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
   96     0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
   97     "queue size");
   98 static int numopensockets;
   99 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
  100     &numopensockets, 0, "Number of open sockets");
  101 #ifdef ZERO_COPY_SOCKETS
  102 /* These aren't static because they're used in other files. */
  103 int so_zero_copy_send = 1;
  104 int so_zero_copy_receive = 1;
  105 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
  106     "Zero copy controls");
  107 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
  108     &so_zero_copy_receive, 0, "Enable zero copy receive");
  109 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
  110     &so_zero_copy_send, 0, "Enable zero copy send");
  111 #endif /* ZERO_COPY_SOCKETS */
  112 
  113 /*
  114  * accept_mtx locks down per-socket fields relating to accept queues.  See
  115  * socketvar.h for an annotation of the protected fields of struct socket.
  116  */
  117 struct mtx accept_mtx;
  118 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
  119 
  120 /*
  121  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
  122  * so_gencnt field.
  123  */
  124 static struct mtx so_global_mtx;
  125 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
  126 
  127 /*
  128  * Socket operation routines.
  129  * These routines are called by the routines in
  130  * sys_socket.c or from a system process, and
  131  * implement the semantics of socket operations by
  132  * switching out to the protocol specific routines.
  133  */
  134 
  135 /*
  136  * Get a socket structure from our zone, and initialize it.
  137  * Note that it would probably be better to allocate socket
  138  * and PCB at the same time, but I'm not convinced that all
  139  * the protocols can be easily modified to do this.
  140  *
  141  * soalloc() returns a socket with a ref count of 0.
  142  */
  143 struct socket *
  144 soalloc(int mflags)
  145 {
  146         struct socket *so;
  147 
  148         so = uma_zalloc(socket_zone, mflags | M_ZERO);
  149         if (so != NULL) {
  150 #ifdef MAC
  151                 if (mac_init_socket(so, mflags) != 0) {
  152                         uma_zfree(socket_zone, so);
  153                         return (NULL);
  154                 }
  155 #endif
  156                 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
  157                 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
  158                 TAILQ_INIT(&so->so_aiojobq);
  159                 mtx_lock(&so_global_mtx);
  160                 so->so_gencnt = ++so_gencnt;
  161                 ++numopensockets;
  162                 mtx_unlock(&so_global_mtx);
  163         }
  164         return (so);
  165 }
  166 
  167 /*
  168  * socreate returns a socket with a ref count of 1.  The socket should be
  169  * closed with soclose().
  170  */
  171 int
  172 socreate(dom, aso, type, proto, cred, td)
  173         int dom;
  174         struct socket **aso;
  175         int type;
  176         int proto;
  177         struct ucred *cred;
  178         struct thread *td;
  179 {
  180         struct protosw *prp;
  181         struct socket *so;
  182         int error;
  183 
  184         if (proto)
  185                 prp = pffindproto(dom, proto, type);
  186         else
  187                 prp = pffindtype(dom, type);
  188 
  189         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
  190             prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
  191                 return (EPROTONOSUPPORT);
  192 
  193         if (jailed(cred) && jail_socket_unixiproute_only &&
  194             prp->pr_domain->dom_family != PF_LOCAL &&
  195             prp->pr_domain->dom_family != PF_INET &&
  196             prp->pr_domain->dom_family != PF_ROUTE) {
  197                 return (EPROTONOSUPPORT);
  198         }
  199 
  200         if (prp->pr_type != type)
  201                 return (EPROTOTYPE);
  202         so = soalloc(M_WAITOK);
  203         if (so == NULL)
  204                 return (ENOBUFS);
  205 
  206         TAILQ_INIT(&so->so_incomp);
  207         TAILQ_INIT(&so->so_comp);
  208         so->so_type = type;
  209         so->so_cred = crhold(cred);
  210         so->so_proto = prp;
  211 #ifdef MAC
  212         mac_create_socket(cred, so);
  213 #endif
  214         knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
  215             NULL, NULL, NULL);
  216         knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
  217             NULL, NULL, NULL);
  218         so->so_count = 1;
  219         error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
  220         if (error) {
  221                 ACCEPT_LOCK();
  222                 SOCK_LOCK(so);
  223                 so->so_state |= SS_NOFDREF;
  224                 sorele(so);
  225                 return (error);
  226         }
  227         *aso = so;
  228         return (0);
  229 }
  230 
  231 int
  232 sobind(so, nam, td)
  233         struct socket *so;
  234         struct sockaddr *nam;
  235         struct thread *td;
  236 {
  237 
  238         return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
  239 }
  240 
  241 void
  242 sodealloc(struct socket *so)
  243 {
  244 
  245         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
  246         mtx_lock(&so_global_mtx);
  247         so->so_gencnt = ++so_gencnt;
  248         mtx_unlock(&so_global_mtx);
  249         if (so->so_rcv.sb_hiwat)
  250                 (void)chgsbsize(so->so_cred->cr_uidinfo,
  251                     &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
  252         if (so->so_snd.sb_hiwat)
  253                 (void)chgsbsize(so->so_cred->cr_uidinfo,
  254                     &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
  255 #ifdef INET
  256         /* remove acccept filter if one is present. */
  257         if (so->so_accf != NULL)
  258                 do_setopt_accept_filter(so, NULL);
  259 #endif
  260 #ifdef MAC
  261         mac_destroy_socket(so);
  262 #endif
  263         crfree(so->so_cred);
  264         SOCKBUF_LOCK_DESTROY(&so->so_snd);
  265         SOCKBUF_LOCK_DESTROY(&so->so_rcv);
  266         uma_zfree(socket_zone, so);
  267         mtx_lock(&so_global_mtx);
  268         --numopensockets;
  269         mtx_unlock(&so_global_mtx);
  270 }
  271 
  272 /*
  273  * solisten() transitions a socket from a non-listening state to a listening
  274  * state, but can also be used to update the listen queue depth on an
  275  * existing listen socket.  The protocol will call back into the sockets
  276  * layer using solisten_proto_check() and solisten_proto() to check and set
  277  * socket-layer listen state.  Call backs are used so that the protocol can
  278  * acquire both protocol and socket layer locks in whatever order is required
  279  * by the protocol.
  280  *
  281  * Protocol implementors are advised to hold the socket lock across the
  282  * socket-layer test and set to avoid races at the socket layer.
  283  */
  284 int
  285 solisten(so, backlog, td)
  286         struct socket *so;
  287         int backlog;
  288         struct thread *td;
  289 {
  290         int error;
  291 
  292         error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
  293         if (error)
  294                 return (error);
  295 
  296         /*
  297          * XXXRW: The following state adjustment should occur in
  298          * solisten_proto(), but we don't currently pass the backlog request
  299          * to the protocol via pru_listen().
  300          */
  301         if (backlog < 0 || backlog > somaxconn)
  302                 backlog = somaxconn;
  303         so->so_qlimit = backlog;
  304         return (0);
  305 }
  306 
  307 int
  308 solisten_proto_check(so)
  309         struct socket *so;
  310 {
  311 
  312         SOCK_LOCK_ASSERT(so);
  313 
  314         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
  315             SS_ISDISCONNECTING))
  316                 return (EINVAL);
  317         return (0);
  318 }
  319 
  320 void
  321 solisten_proto(so)
  322         struct socket *so;
  323 {
  324 
  325         SOCK_LOCK_ASSERT(so);
  326 
  327         so->so_options |= SO_ACCEPTCONN;
  328 }
  329 
  330 /*
  331  * Attempt to free a socket.  This should really be sotryfree().
  332  *
  333  * We free the socket if the protocol is no longer interested in the socket,
  334  * there's no file descriptor reference, and the refcount is 0.  While the
  335  * calling macro sotryfree() tests the refcount, sofree() has to test it
  336  * again as it's possible to race with an accept()ing thread if the socket is
  337  * in an listen queue of a listen socket, as being in the listen queue
  338  * doesn't elevate the reference count.  sofree() acquires the accept mutex
  339  * early for this test in order to avoid that race.
  340  */
  341 void
  342 sofree(so)
  343         struct socket *so;
  344 {
  345         struct socket *head;
  346 
  347         ACCEPT_LOCK_ASSERT();
  348         SOCK_LOCK_ASSERT(so);
  349 
  350         if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
  351             so->so_count != 0) {
  352                 SOCK_UNLOCK(so);
  353                 ACCEPT_UNLOCK();
  354                 return;
  355         }
  356 
  357         head = so->so_head;
  358         if (head != NULL) {
  359                 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
  360                     (so->so_qstate & SQ_INCOMP) != 0,
  361                     ("sofree: so_head != NULL, but neither SQ_COMP nor "
  362                     "SQ_INCOMP"));
  363                 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
  364                     (so->so_qstate & SQ_INCOMP) == 0,
  365                     ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
  366                 /*
  367                  * accept(2) is responsible draining the completed
  368                  * connection queue and freeing those sockets, so
  369                  * we just return here if this socket is currently
  370                  * on the completed connection queue.  Otherwise,
  371                  * accept(2) may hang after select(2) has indicating
  372                  * that a listening socket was ready.  If it's an
  373                  * incomplete connection, we remove it from the queue
  374                  * and free it; otherwise, it won't be released until
  375                  * the listening socket is closed.
  376                  */
  377                 if ((so->so_qstate & SQ_COMP) != 0) {
  378                         SOCK_UNLOCK(so);
  379                         ACCEPT_UNLOCK();
  380                         return;
  381                 }
  382                 TAILQ_REMOVE(&head->so_incomp, so, so_list);
  383                 head->so_incqlen--;
  384                 so->so_qstate &= ~SQ_INCOMP;
  385                 so->so_head = NULL;
  386         }
  387         KASSERT((so->so_qstate & SQ_COMP) == 0 &&
  388             (so->so_qstate & SQ_INCOMP) == 0,
  389             ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
  390             so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
  391         SOCK_UNLOCK(so);
  392         ACCEPT_UNLOCK();
  393         SOCKBUF_LOCK(&so->so_snd);
  394         so->so_snd.sb_flags |= SB_NOINTR;
  395         (void)sblock(&so->so_snd, M_WAITOK);
  396         /*
  397          * socantsendmore_locked() drops the socket buffer mutex so that it
  398          * can safely perform wakeups.  Re-acquire the mutex before
  399          * continuing.
  400          */
  401         socantsendmore_locked(so);
  402         SOCKBUF_LOCK(&so->so_snd);
  403         sbunlock(&so->so_snd);
  404         sbrelease_locked(&so->so_snd, so);
  405         SOCKBUF_UNLOCK(&so->so_snd);
  406         sorflush(so);
  407         knlist_destroy(&so->so_rcv.sb_sel.si_note);
  408         knlist_destroy(&so->so_snd.sb_sel.si_note);
  409         sodealloc(so);
  410 }
  411 
  412 /*
  413  * Close a socket on last file table reference removal.
  414  * Initiate disconnect if connected.
  415  * Free socket when disconnect complete.
  416  *
  417  * This function will sorele() the socket.  Note that soclose() may be
  418  * called prior to the ref count reaching zero.  The actual socket
  419  * structure will not be freed until the ref count reaches zero.
  420  */
  421 int
  422 soclose(so)
  423         struct socket *so;
  424 {
  425         int error = 0;
  426 
  427         KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
  428 
  429         funsetown(&so->so_sigio);
  430         if (so->so_options & SO_ACCEPTCONN) {
  431                 struct socket *sp;
  432                 ACCEPT_LOCK();
  433                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
  434                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
  435                         so->so_incqlen--;
  436                         sp->so_qstate &= ~SQ_INCOMP;
  437                         sp->so_head = NULL;
  438                         ACCEPT_UNLOCK();
  439                         (void) soabort(sp);
  440                         ACCEPT_LOCK();
  441                 }
  442                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
  443                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
  444                         so->so_qlen--;
  445                         sp->so_qstate &= ~SQ_COMP;
  446                         sp->so_head = NULL;
  447                         ACCEPT_UNLOCK();
  448                         (void) soabort(sp);
  449                         ACCEPT_LOCK();
  450                 }
  451                 ACCEPT_UNLOCK();
  452         }
  453         if (so->so_pcb == NULL)
  454                 goto discard;
  455         if (so->so_state & SS_ISCONNECTED) {
  456                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
  457                         error = sodisconnect(so);
  458                         if (error)
  459                                 goto drop;
  460                 }
  461                 if (so->so_options & SO_LINGER) {
  462                         if ((so->so_state & SS_ISDISCONNECTING) &&
  463                             (so->so_state & SS_NBIO))
  464                                 goto drop;
  465                         while (so->so_state & SS_ISCONNECTED) {
  466                                 error = tsleep(&so->so_timeo,
  467                                     PSOCK | PCATCH, "soclos", so->so_linger * hz);
  468                                 if (error)
  469                                         break;
  470                         }
  471                 }
  472         }
  473 drop:
  474         if (so->so_pcb != NULL) {
  475                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
  476                 if (error == 0)
  477                         error = error2;
  478         }
  479 discard:
  480         ACCEPT_LOCK();
  481         SOCK_LOCK(so);
  482         KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
  483         so->so_state |= SS_NOFDREF;
  484         sorele(so);
  485         return (error);
  486 }
  487 
  488 /*
  489  * soabort() must not be called with any socket locks held, as it calls
  490  * into the protocol, which will call back into the socket code causing
  491  * it to acquire additional socket locks that may cause recursion or lock
  492  * order reversals.
  493  */
  494 int
  495 soabort(so)
  496         struct socket *so;
  497 {
  498         int error;
  499 
  500         error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
  501         if (error) {
  502                 ACCEPT_LOCK();
  503                 SOCK_LOCK(so);
  504                 sotryfree(so);  /* note: does not decrement the ref count */
  505                 return error;
  506         }
  507         return (0);
  508 }
  509 
  510 int
  511 soaccept(so, nam)
  512         struct socket *so;
  513         struct sockaddr **nam;
  514 {
  515         int error;
  516 
  517         SOCK_LOCK(so);
  518         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
  519         so->so_state &= ~SS_NOFDREF;
  520         SOCK_UNLOCK(so);
  521         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
  522         return (error);
  523 }
  524 
  525 int
  526 soconnect(so, nam, td)
  527         struct socket *so;
  528         struct sockaddr *nam;
  529         struct thread *td;
  530 {
  531         int error;
  532 
  533         if (so->so_options & SO_ACCEPTCONN)
  534                 return (EOPNOTSUPP);
  535         /*
  536          * If protocol is connection-based, can only connect once.
  537          * Otherwise, if connected, try to disconnect first.
  538          * This allows user to disconnect by connecting to, e.g.,
  539          * a null address.
  540          */
  541         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
  542             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
  543             (error = sodisconnect(so)))) {
  544                 error = EISCONN;
  545         } else {
  546                 /*
  547                  * Prevent accumulated error from previous connection
  548                  * from biting us.
  549                  */
  550                 so->so_error = 0;
  551                 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
  552         }
  553 
  554         return (error);
  555 }
  556 
  557 int
  558 soconnect2(so1, so2)
  559         struct socket *so1;
  560         struct socket *so2;
  561 {
  562 
  563         return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
  564 }
  565 
  566 int
  567 sodisconnect(so)
  568         struct socket *so;
  569 {
  570         int error;
  571 
  572         if ((so->so_state & SS_ISCONNECTED) == 0)
  573                 return (ENOTCONN);
  574         if (so->so_state & SS_ISDISCONNECTING)
  575                 return (EALREADY);
  576         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
  577         return (error);
  578 }
  579 
  /*
   * Translate message flags into the wait argument given to sblock():
   * M_NOWAIT when MSG_DONTWAIT is set in f, M_WAITOK otherwise.
   */
  #define SBLOCKWAIT(f)   ((((f) & MSG_DONTWAIT) != 0) ? M_NOWAIT : M_WAITOK)
  581 /*
  582  * Send on a socket.
  583  * If send must go all at once and message is larger than
  584  * send buffering, then hard error.
  585  * Lock against other senders.
  586  * If must go all at once and not enough room now, then
  587  * inform user that this would block and do nothing.
  588  * Otherwise, if nonblocking, send as much as possible.
  589  * The data to be sent is described by "uio" if nonzero,
  590  * otherwise by the mbuf chain "top" (which must be null
  591  * if uio is not).  Data provided in mbuf chain must be small
  592  * enough to send all at once.
  593  *
  594  * Returns nonzero on error, timeout or signal; callers
  595  * must check for short counts if EINTR/ERESTART are returned.
  596  * Data and control buffers are freed on return.
  597  */
  598 
  #ifdef ZERO_COPY_SOCKETS
  /*
   * Counters for the zero-copy send path.  sosend() increments size_ok and
   * align_ok just before it calls socow_setup().
   */
  struct so_zerocopy_stats {
          int size_ok;
          int align_ok;
          int found_ifp;
  };
  struct so_zerocopy_stats so_zerocp_stats = { 0, 0, 0 };

  /* Headers needed only when zero-copy sockets are configured. */
  #include <netinet/in.h>
  #include <net/route.h>
  #include <netinet/in_pcb.h>
  #include <vm/vm.h>
  #include <vm/vm_page.h>
  #include <vm/vm_object.h>
  #endif /* ZERO_COPY_SOCKETS */
  613 
  614 int
  615 sosend(so, addr, uio, top, control, flags, td)
  616         struct socket *so;
  617         struct sockaddr *addr;
  618         struct uio *uio;
  619         struct mbuf *top;
  620         struct mbuf *control;
  621         int flags;
  622         struct thread *td;
  623 {
  624         struct mbuf **mp;
  625         struct mbuf *m;
  626         long space, len = 0, resid;
  627         int clen = 0, error, dontroute;
  628         int atomic = sosendallatonce(so) || top;
  629 #ifdef ZERO_COPY_SOCKETS
  630         int cow_send;
  631 #endif /* ZERO_COPY_SOCKETS */
  632 
  633         if (uio != NULL)
  634                 resid = uio->uio_resid;
  635         else
  636                 resid = top->m_pkthdr.len;
  637         /*
  638          * In theory resid should be unsigned.
  639          * However, space must be signed, as it might be less than 0
  640          * if we over-committed, and we must use a signed comparison
  641          * of space and resid.  On the other hand, a negative resid
  642          * causes us to loop sending 0-length segments to the protocol.
  643          *
  644          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
  645          * type sockets since that's an error.
  646          */
  647         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
  648                 error = EINVAL;
  649                 goto out;
  650         }
  651 
  652         dontroute =
  653             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
  654             (so->so_proto->pr_flags & PR_ATOMIC);
  655         if (td != NULL)
  656                 td->td_proc->p_stats->p_ru.ru_msgsnd++;
  657         if (control != NULL)
  658                 clen = control->m_len;
  659 #define snderr(errno)   { error = (errno); goto release; }
  660 
  661         SOCKBUF_LOCK(&so->so_snd);
  662 restart:
  663         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  664         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
  665         if (error)
  666                 goto out_locked;
  667         do {
  668                 SOCKBUF_LOCK_ASSERT(&so->so_snd);
  669                 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
  670                         snderr(EPIPE);
  671                 if (so->so_error) {
  672                         error = so->so_error;
  673                         so->so_error = 0;
  674                         goto release;
  675                 }
  676                 if ((so->so_state & SS_ISCONNECTED) == 0) {
  677                         /*
  678                          * `sendto' and `sendmsg' is allowed on a connection-
  679                          * based socket if it supports implied connect.
  680                          * Return ENOTCONN if not connected and no address is
  681                          * supplied.
  682                          */
  683                         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
  684                             (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
  685                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
  686                                     !(resid == 0 && clen != 0))
  687                                         snderr(ENOTCONN);
  688                         } else if (addr == NULL)
  689                             snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
  690                                    ENOTCONN : EDESTADDRREQ);
  691                 }
  692                 space = sbspace(&so->so_snd);
  693                 if (flags & MSG_OOB)
  694                         space += 1024;
  695                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
  696                     clen > so->so_snd.sb_hiwat)
  697                         snderr(EMSGSIZE);
  698                 if (space < resid + clen &&
  699                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
  700                         if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
  701                                 snderr(EWOULDBLOCK);
  702                         sbunlock(&so->so_snd);
  703                         error = sbwait(&so->so_snd);
  704                         if (error)
  705                                 goto out_locked;
  706                         goto restart;
  707                 }
  708                 SOCKBUF_UNLOCK(&so->so_snd);
  709                 mp = &top;
  710                 space -= clen;
  711                 do {
  712                     if (uio == NULL) {
  713                         /*
  714                          * Data is prepackaged in "top".
  715                          */
  716                         resid = 0;
  717                         if (flags & MSG_EOR)
  718                                 top->m_flags |= M_EOR;
  719                     } else do {
  720 #ifdef ZERO_COPY_SOCKETS
  721                         cow_send = 0;
  722 #endif /* ZERO_COPY_SOCKETS */
  723                         if (resid >= MINCLSIZE) {
  724 #ifdef ZERO_COPY_SOCKETS
  725                                 if (top == NULL) {
  726                                         MGETHDR(m, M_TRYWAIT, MT_DATA);
  727                                         if (m == NULL) {
  728                                                 error = ENOBUFS;
  729                                                 SOCKBUF_LOCK(&so->so_snd);
  730                                                 goto release;
  731                                         }
  732                                         m->m_pkthdr.len = 0;
  733                                         m->m_pkthdr.rcvif = NULL; 
  734                                 } else {
  735                                         MGET(m, M_TRYWAIT, MT_DATA);
  736                                         if (m == NULL) {
  737                                                 error = ENOBUFS;
  738                                                 SOCKBUF_LOCK(&so->so_snd);
  739                                                 goto release;
  740                                         }
  741                                 }
  742                                 if (so_zero_copy_send &&
  743                                     resid>=PAGE_SIZE &&
  744                                     space>=PAGE_SIZE &&
  745                                     uio->uio_iov->iov_len>=PAGE_SIZE) {
  746                                         so_zerocp_stats.size_ok++;
  747                                         so_zerocp_stats.align_ok++;
  748                                         cow_send = socow_setup(m, uio);
  749                                         len = cow_send;
  750                                 }
  751                                 if (!cow_send) {
  752                                         MCLGET(m, M_TRYWAIT);
  753                                         if ((m->m_flags & M_EXT) == 0) {
  754                                                 m_free(m);
  755                                                 m = NULL;
  756                                         } else {
  757                                                 len = min(min(MCLBYTES, resid), space);
  758                                         }
  759                                 }
  760 #else /* ZERO_COPY_SOCKETS */
  761                                 if (top == NULL) {
  762                                         m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
  763                                         m->m_pkthdr.len = 0;
  764                                         m->m_pkthdr.rcvif = NULL;
  765                                 } else
  766                                         m = m_getcl(M_TRYWAIT, MT_DATA, 0);
  767                                 len = min(min(MCLBYTES, resid), space);
  768 #endif /* ZERO_COPY_SOCKETS */
  769                         } else {
  770                                 if (top == NULL) {
  771                                         m = m_gethdr(M_TRYWAIT, MT_DATA);
  772                                         m->m_pkthdr.len = 0;
  773                                         m->m_pkthdr.rcvif = NULL;
  774 
  775                                         len = min(min(MHLEN, resid), space);
  776                                         /*
  777                                          * For datagram protocols, leave room
  778                                          * for protocol headers in first mbuf.
  779                                          */
  780                                         if (atomic && m && len < MHLEN)
  781                                                 MH_ALIGN(m, len);
  782                                 } else {
  783                                         m = m_get(M_TRYWAIT, MT_DATA);
  784                                         len = min(min(MLEN, resid), space);
  785                                 }
  786                         }
  787                         if (m == NULL) {
  788                                 error = ENOBUFS;
  789                                 SOCKBUF_LOCK(&so->so_snd);
  790                                 goto release;
  791                         }
  792 
  793                         space -= len;
  794 #ifdef ZERO_COPY_SOCKETS
  795                         if (cow_send)
  796                                 error = 0;
  797                         else
  798 #endif /* ZERO_COPY_SOCKETS */
  799                         error = uiomove(mtod(m, void *), (int)len, uio);
  800                         resid = uio->uio_resid;
  801                         m->m_len = len;
  802                         *mp = m;
  803                         top->m_pkthdr.len += len;
  804                         if (error) {
  805                                 SOCKBUF_LOCK(&so->so_snd);
  806                                 goto release;
  807                         }
  808                         mp = &m->m_next;
  809                         if (resid <= 0) {
  810                                 if (flags & MSG_EOR)
  811                                         top->m_flags |= M_EOR;
  812                                 break;
  813                         }
  814                     } while (space > 0 && atomic);
  815                     if (dontroute) {
  816                             SOCK_LOCK(so);
  817                             so->so_options |= SO_DONTROUTE;
  818                             SOCK_UNLOCK(so);
  819                     }
  820                     /*
  821                      * XXX all the SBS_CANTSENDMORE checks previously
  822                      * done could be out of date.  We could have received
  823                      * a reset packet in an interrupt or maybe we slept
  824                      * while doing page faults in uiomove() etc. We could
  825                      * probably recheck again inside the locking protection
  826                      * here, but there are probably other places that this
  827                      * also happens.  We must rethink this.
  828                      */
  829                     error = (*so->so_proto->pr_usrreqs->pru_send)(so,
  830                         (flags & MSG_OOB) ? PRUS_OOB :
  831                         /*
  832                          * If the user set MSG_EOF, the protocol
  833                          * understands this flag and nothing left to
  834                          * send then use PRU_SEND_EOF instead of PRU_SEND.
  835                          */
  836                         ((flags & MSG_EOF) &&
  837                          (so->so_proto->pr_flags & PR_IMPLOPCL) &&
  838                          (resid <= 0)) ?
  839                                 PRUS_EOF :
  840                         /* If there is more to send set PRUS_MORETOCOME */
  841                         (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
  842                         top, addr, control, td);
  843                     if (dontroute) {
  844                             SOCK_LOCK(so);
  845                             so->so_options &= ~SO_DONTROUTE;
  846                             SOCK_UNLOCK(so);
  847                     }
  848                     clen = 0;
  849                     control = NULL;
  850                     top = NULL;
  851                     mp = &top;
  852                     if (error) {
  853                         SOCKBUF_LOCK(&so->so_snd);
  854                         goto release;
  855                     }
  856                 } while (resid && space > 0);
  857                 SOCKBUF_LOCK(&so->so_snd);
  858         } while (resid);
  859 
  860 release:
  861         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  862         sbunlock(&so->so_snd);
  863 out_locked:
  864         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  865         SOCKBUF_UNLOCK(&so->so_snd);
  866 out:
  867         if (top != NULL)
  868                 m_freem(top);
  869         if (control != NULL)
  870                 m_freem(control);
  871         return (error);
  872 }
  873 
  874 /*
  875  * The part of soreceive() that implements reading non-inline out-of-band
  876  * data from a socket.  For more complete comments, see soreceive(), from
  877  * which this code originated.
  878  *
  879  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
  880  * unable to return an mbuf chain to the caller.
  881  */
  882 static int
  883 soreceive_rcvoob(so, uio, flags)
  884         struct socket *so;
  885         struct uio *uio;
  886         int flags;
  887 {
  888         struct protosw *pr = so->so_proto;
  889         struct mbuf *m;
  890         int error;
  891 
  892         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
  893 
  894         m = m_get(M_TRYWAIT, MT_DATA);
  895         if (m == NULL)
  896                 return (ENOBUFS);
  897         error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
  898         if (error)
  899                 goto bad;
  900         do {
  901 #ifdef ZERO_COPY_SOCKETS
  902                 if (so_zero_copy_receive) {
  903                         int disposable;
  904 
  905                         if ((m->m_flags & M_EXT)
  906                          && (m->m_ext.ext_type == EXT_DISPOSABLE))
  907                                 disposable = 1;
  908                         else
  909                                 disposable = 0;
  910 
  911                         error = uiomoveco(mtod(m, void *),
  912                                           min(uio->uio_resid, m->m_len),
  913                                           uio, disposable);
  914                 } else
  915 #endif /* ZERO_COPY_SOCKETS */
  916                 error = uiomove(mtod(m, void *),
  917                     (int) min(uio->uio_resid, m->m_len), uio);
  918                 m = m_free(m);
  919         } while (uio->uio_resid && error == 0 && m);
  920 bad:
  921         if (m != NULL)
  922                 m_freem(m);
  923         return (error);
  924 }
  925 
  926 /*
  927  * Following replacement or removal of the first mbuf on the first mbuf chain
  928  * of a socket buffer, push necessary state changes back into the socket
  929  * buffer so that other consumers see the values consistently.  'nextrecord'
  930  * is the callers locally stored value of the original value of
  931  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
  932  * NOTE: 'nextrecord' may be NULL.
  933  */
  934 static __inline void
  935 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
  936 {
  937 
  938         SOCKBUF_LOCK_ASSERT(sb);
  939         /*
  940          * First, update for the new value of nextrecord.  If necessary, make
  941          * it the first record.
  942          */
  943         if (sb->sb_mb != NULL)
  944                 sb->sb_mb->m_nextpkt = nextrecord;
  945         else
  946                 sb->sb_mb = nextrecord;
  947 
  948         /*
  949          * Now update any dependent socket buffer fields to reflect the new
  950          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
  951          * addition of a second clause that takes care of the case where
  952          * sb_mb has been updated, but remains the last record.
  953          */
  954         if (sb->sb_mb == NULL) {
  955                 sb->sb_mbtail = NULL;
  956                 sb->sb_lastrecord = NULL;
  957         } else if (sb->sb_mb->m_nextpkt == NULL)
  958                 sb->sb_lastrecord = sb->sb_mb;
  959 }
  960 
  961 
  962 /*
  963  * Implement receive operations on a socket.
  964  * We depend on the way that records are added to the sockbuf
  965  * by sbappend*.  In particular, each record (mbufs linked through m_next)
  966  * must begin with an address if the protocol so specifies,
  967  * followed by an optional mbuf or mbufs containing ancillary data,
  968  * and then zero or more mbufs of data.
  969  * In order to avoid blocking network interrupts for the entire time here,
  970  * we splx() while doing the actual copy to user space.
  971  * Although the sockbuf is locked, new data may still be appended,
  972  * and thus we must maintain consistency of the sockbuf during that time.
  973  *
  974  * The caller may receive the data as a single mbuf chain by supplying
  975  * an mbuf **mp0 for use in returning the chain.  The uio is then used
  976  * only for the count in uio_resid.
  977  */
  978 int
  979 soreceive(so, psa, uio, mp0, controlp, flagsp)
  980         struct socket *so;
  981         struct sockaddr **psa;
  982         struct uio *uio;
  983         struct mbuf **mp0;
  984         struct mbuf **controlp;
  985         int *flagsp;
  986 {
  987         struct mbuf *m, **mp;
  988         int flags, len, error, offset;
  989         struct protosw *pr = so->so_proto;
  990         struct mbuf *nextrecord;
  991         int moff, type = 0;
  992         int orig_resid = uio->uio_resid;
  993 
  994         mp = mp0;
  995         if (psa != NULL)
  996                 *psa = NULL;
  997         if (controlp != NULL)
  998                 *controlp = NULL;
  999         if (flagsp != NULL)
 1000                 flags = *flagsp &~ MSG_EOR;
 1001         else
 1002                 flags = 0;
 1003         if (flags & MSG_OOB)
 1004                 return (soreceive_rcvoob(so, uio, flags));
 1005         if (mp != NULL)
 1006                 *mp = NULL;
 1007         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
 1008             && uio->uio_resid)
 1009                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
 1010 
 1011         SOCKBUF_LOCK(&so->so_rcv);
 1012 restart:
 1013         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1014         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
 1015         if (error)
 1016                 goto out;
 1017 
 1018         m = so->so_rcv.sb_mb;
 1019         /*
 1020          * If we have less data than requested, block awaiting more
 1021          * (subject to any timeout) if:
 1022          *   1. the current count is less than the low water mark, or
 1023          *   2. MSG_WAITALL is set, and it is possible to do the entire
 1024          *      receive operation at once if we block (resid <= hiwat).
 1025          *   3. MSG_DONTWAIT is not set
 1026          * If MSG_WAITALL is set but resid is larger than the receive buffer,
 1027          * we have to do the receive in sections, and thus risk returning
 1028          * a short count if a timeout or signal occurs after we start.
 1029          */
 1030         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
 1031             so->so_rcv.sb_cc < uio->uio_resid) &&
 1032             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
 1033             ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
 1034             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
 1035                 KASSERT(m != NULL || !so->so_rcv.sb_cc,
 1036                     ("receive: m == %p so->so_rcv.sb_cc == %u",
 1037                     m, so->so_rcv.sb_cc));
 1038                 if (so->so_error) {
 1039                         if (m != NULL)
 1040                                 goto dontblock;
 1041                         error = so->so_error;
 1042                         if ((flags & MSG_PEEK) == 0)
 1043                                 so->so_error = 0;
 1044                         goto release;
 1045                 }
 1046                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1047                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 1048                         if (m)
 1049                                 goto dontblock;
 1050                         else
 1051                                 goto release;
 1052                 }
 1053                 for (; m != NULL; m = m->m_next)
 1054                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
 1055                                 m = so->so_rcv.sb_mb;
 1056                                 goto dontblock;
 1057                         }
 1058                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
 1059                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
 1060                         error = ENOTCONN;
 1061                         goto release;
 1062                 }
 1063                 if (uio->uio_resid == 0)
 1064                         goto release;
 1065                 if ((so->so_state & SS_NBIO) ||
 1066                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
 1067                         error = EWOULDBLOCK;
 1068                         goto release;
 1069                 }
 1070                 SBLASTRECORDCHK(&so->so_rcv);
 1071                 SBLASTMBUFCHK(&so->so_rcv);
 1072                 sbunlock(&so->so_rcv);
 1073                 error = sbwait(&so->so_rcv);
 1074                 if (error)
 1075                         goto out;
 1076                 goto restart;
 1077         }
 1078 dontblock:
 1079         /*
 1080          * From this point onward, we maintain 'nextrecord' as a cache of the
 1081          * pointer to the next record in the socket buffer.  We must keep the
 1082          * various socket buffer pointers and local stack versions of the
 1083          * pointers in sync, pushing out modifications before dropping the
 1084          * socket buffer mutex, and re-reading them when picking it up.
 1085          *
 1086          * Otherwise, we will race with the network stack appending new data
 1087          * or records onto the socket buffer by using inconsistent/stale
 1088          * versions of the field, possibly resulting in socket buffer
 1089          * corruption.
 1090          *
 1091          * By holding the high-level sblock(), we prevent simultaneous
 1092          * readers from pulling off the front of the socket buffer.
 1093          */
 1094         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1095         if (uio->uio_td)
 1096                 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
 1097         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
 1098         SBLASTRECORDCHK(&so->so_rcv);
 1099         SBLASTMBUFCHK(&so->so_rcv);
 1100         nextrecord = m->m_nextpkt;
 1101         if (pr->pr_flags & PR_ADDR) {
 1102                 KASSERT(m->m_type == MT_SONAME,
 1103                     ("m->m_type == %d", m->m_type));
 1104                 orig_resid = 0;
 1105                 if (psa != NULL)
 1106                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
 1107                             M_NOWAIT);
 1108                 if (flags & MSG_PEEK) {
 1109                         m = m->m_next;
 1110                 } else {
 1111                         sbfree(&so->so_rcv, m);
 1112                         so->so_rcv.sb_mb = m_free(m);
 1113                         m = so->so_rcv.sb_mb;
 1114                         sockbuf_pushsync(&so->so_rcv, nextrecord);
 1115                 }
 1116         }
 1117 
 1118         /*
 1119          * Process one or more MT_CONTROL mbufs present before any data mbufs
 1120          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 1121          * just copy the data; if !MSG_PEEK, we call into the protocol to
 1122          * perform externalization (or freeing if controlp == NULL).
 1123          */
 1124         if (m != NULL && m->m_type == MT_CONTROL) {
 1125                 struct mbuf *cm = NULL, *cmn;
 1126                 struct mbuf **cme = &cm;
 1127 
 1128                 do {
 1129                         if (flags & MSG_PEEK) {
 1130                                 if (controlp != NULL) {
 1131                                         *controlp = m_copy(m, 0, m->m_len);
 1132                                         controlp = &(*controlp)->m_next;
 1133                                 }
 1134                                 m = m->m_next;
 1135                         } else {
 1136                                 sbfree(&so->so_rcv, m);
 1137                                 so->so_rcv.sb_mb = m->m_next;
 1138                                 m->m_next = NULL;
 1139                                 *cme = m;
 1140                                 cme = &(*cme)->m_next;
 1141                                 m = so->so_rcv.sb_mb;
 1142                         }
 1143                 } while (m != NULL && m->m_type == MT_CONTROL);
 1144                 if ((flags & MSG_PEEK) == 0)
 1145                         sockbuf_pushsync(&so->so_rcv, nextrecord);
 1146                 while (cm != NULL) {
 1147                         cmn = cm->m_next;
 1148                         cm->m_next = NULL;
 1149                         if (pr->pr_domain->dom_externalize != NULL) {
 1150                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1151                                 error = (*pr->pr_domain->dom_externalize)
 1152                                     (cm, controlp);
 1153                                 SOCKBUF_LOCK(&so->so_rcv);
 1154                         } else if (controlp != NULL)
 1155                                 *controlp = cm;
 1156                         else
 1157                                 m_freem(cm);
 1158                         if (controlp != NULL) {
 1159                                 orig_resid = 0;
 1160                                 while (*controlp != NULL)
 1161                                         controlp = &(*controlp)->m_next;
 1162                         }
 1163                         cm = cmn;
 1164                 }
 1165                 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
 1166                 orig_resid = 0;
 1167         }
 1168         if (m != NULL) {
 1169                 if ((flags & MSG_PEEK) == 0) {
 1170                         KASSERT(m->m_nextpkt == nextrecord,
 1171                             ("soreceive: post-control, nextrecord !sync"));
 1172                         if (nextrecord == NULL) {
 1173                                 KASSERT(so->so_rcv.sb_mb == m,
 1174                                     ("soreceive: post-control, sb_mb!=m"));
 1175                                 KASSERT(so->so_rcv.sb_lastrecord == m,
 1176                                     ("soreceive: post-control, lastrecord!=m"));
 1177                         }
 1178                 }
 1179                 type = m->m_type;
 1180                 if (type == MT_OOBDATA)
 1181                         flags |= MSG_OOB;
 1182         } else {
 1183                 if ((flags & MSG_PEEK) == 0) {
 1184                         KASSERT(so->so_rcv.sb_mb == nextrecord,
 1185                             ("soreceive: sb_mb != nextrecord"));
 1186                         if (so->so_rcv.sb_mb == NULL) {
 1187                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
 1188                                     ("soreceive: sb_lastercord != NULL"));
 1189                         }
 1190                 }
 1191         }
 1192         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1193         SBLASTRECORDCHK(&so->so_rcv);
 1194         SBLASTMBUFCHK(&so->so_rcv);
 1195 
 1196         /*
 1197          * Now continue to read any data mbufs off of the head of the socket
 1198          * buffer until the read request is satisfied.  Note that 'type' is
 1199          * used to store the type of any mbuf reads that have happened so far
 1200          * such that soreceive() can stop reading if the type changes, which
 1201          * causes soreceive() to return only one of regular data and inline
 1202          * out-of-band data in a single socket receive operation.
 1203          */
 1204         moff = 0;
 1205         offset = 0;
 1206         while (m != NULL && uio->uio_resid > 0 && error == 0) {
 1207                 /*
 1208                  * If the type of mbuf has changed since the last mbuf
 1209                  * examined ('type'), end the receive operation.
 1210                  */
 1211                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1212                 if (m->m_type == MT_OOBDATA) {
 1213                         if (type != MT_OOBDATA)
 1214                                 break;
 1215                 } else if (type == MT_OOBDATA)
 1216                         break;
 1217                 else
 1218                     KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
 1219                         ("m->m_type == %d", m->m_type));
 1220                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
 1221                 len = uio->uio_resid;
 1222                 if (so->so_oobmark && len > so->so_oobmark - offset)
 1223                         len = so->so_oobmark - offset;
 1224                 if (len > m->m_len - moff)
 1225                         len = m->m_len - moff;
 1226                 /*
 1227                  * If mp is set, just pass back the mbufs.
 1228                  * Otherwise copy them out via the uio, then free.
 1229                  * Sockbuf must be consistent here (points to current mbuf,
 1230                  * it points to next record) when we drop priority;
 1231                  * we must note any additions to the sockbuf when we
 1232                  * block interrupts again.
 1233                  */
 1234                 if (mp == NULL) {
 1235                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1236                         SBLASTRECORDCHK(&so->so_rcv);
 1237                         SBLASTMBUFCHK(&so->so_rcv);
 1238                         SOCKBUF_UNLOCK(&so->so_rcv);
 1239 #ifdef ZERO_COPY_SOCKETS
 1240                         if (so_zero_copy_receive) {
 1241                                 int disposable;
 1242 
 1243                                 if ((m->m_flags & M_EXT)
 1244                                  && (m->m_ext.ext_type == EXT_DISPOSABLE))
 1245                                         disposable = 1;
 1246                                 else
 1247                                         disposable = 0;
 1248 
 1249                                 error = uiomoveco(mtod(m, char *) + moff,
 1250                                                   (int)len, uio,
 1251                                                   disposable);
 1252                         } else
 1253 #endif /* ZERO_COPY_SOCKETS */
 1254                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
 1255                         SOCKBUF_LOCK(&so->so_rcv);
 1256                         if (error)
 1257                                 goto release;
 1258                 } else
 1259                         uio->uio_resid -= len;
 1260                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1261                 if (len == m->m_len - moff) {
 1262                         if (m->m_flags & M_EOR)
 1263                                 flags |= MSG_EOR;
 1264                         if (flags & MSG_PEEK) {
 1265                                 m = m->m_next;
 1266                                 moff = 0;
 1267                         } else {
 1268                                 nextrecord = m->m_nextpkt;
 1269                                 sbfree(&so->so_rcv, m);
 1270                                 if (mp != NULL) {
 1271                                         *mp = m;
 1272                                         mp = &m->m_next;
 1273                                         so->so_rcv.sb_mb = m = m->m_next;
 1274                                         *mp = NULL;
 1275                                 } else {
 1276                                         so->so_rcv.sb_mb = m_free(m);
 1277                                         m = so->so_rcv.sb_mb;
 1278                                 }
 1279                                 if (m != NULL) {
 1280                                         m->m_nextpkt = nextrecord;
 1281                                         if (nextrecord == NULL)
 1282                                                 so->so_rcv.sb_lastrecord = m;
 1283                                 } else {
 1284                                         so->so_rcv.sb_mb = nextrecord;
 1285                                         SB_EMPTY_FIXUP(&so->so_rcv);
 1286                                 }
 1287                                 SBLASTRECORDCHK(&so->so_rcv);
 1288                                 SBLASTMBUFCHK(&so->so_rcv);
 1289                         }
 1290                 } else {
 1291                         if (flags & MSG_PEEK)
 1292                                 moff += len;
 1293                         else {
 1294                                 if (mp != NULL) {
 1295                                         int copy_flag;
 1296 
 1297                                         if (flags & MSG_DONTWAIT)
 1298                                                 copy_flag = M_DONTWAIT;
 1299                                         else
 1300                                                 copy_flag = M_TRYWAIT;
 1301                                         if (copy_flag == M_TRYWAIT)
 1302                                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1303                                         *mp = m_copym(m, 0, len, copy_flag);
 1304                                         if (copy_flag == M_TRYWAIT)
 1305                                                 SOCKBUF_LOCK(&so->so_rcv);
 1306                                         if (*mp == NULL) {
 1307                                                 /*
 1308                                                  * m_copym() couldn't allocate an mbuf. 
 1309                                                  * Adjust uio_resid back (it was adjusted 
 1310                                                  * down by len bytes, which we didn't end 
 1311                                                  * up "copying" over).
 1312                                                  */
 1313                                                 uio->uio_resid += len;
 1314                                                 break;
 1315                                         }
 1316                                 }
 1317                                 m->m_data += len;
 1318                                 m->m_len -= len;
 1319                                 so->so_rcv.sb_cc -= len;
 1320                         }
 1321                 }
 1322                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1323                 if (so->so_oobmark) {
 1324                         if ((flags & MSG_PEEK) == 0) {
 1325                                 so->so_oobmark -= len;
 1326                                 if (so->so_oobmark == 0) {
 1327                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
 1328                                         break;
 1329                                 }
 1330                         } else {
 1331                                 offset += len;
 1332                                 if (offset == so->so_oobmark)
 1333                                         break;
 1334                         }
 1335                 }
 1336                 if (flags & MSG_EOR)
 1337                         break;
 1338                 /*
 1339                  * If the MSG_WAITALL flag is set (for non-atomic socket),
 1340                  * we must not quit until "uio->uio_resid == 0" or an error
 1341                  * termination.  If a signal/timeout occurs, return
 1342                  * with a short count but without error.
 1343                  * Keep sockbuf locked against other readers.
 1344                  */
 1345                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
 1346                     !sosendallatonce(so) && nextrecord == NULL) {
 1347                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1348                         if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
 1349                                 break;
 1350                         /*
 1351                          * Notify the protocol that some data has been
 1352                          * drained before blocking.
 1353                          */
 1354                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
 1355                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1356                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
 1357                                 SOCKBUF_LOCK(&so->so_rcv);
 1358                         }
 1359                         SBLASTRECORDCHK(&so->so_rcv);
 1360                         SBLASTMBUFCHK(&so->so_rcv);
 1361                         error = sbwait(&so->so_rcv);
 1362                         if (error)
 1363                                 goto release;
 1364                         m = so->so_rcv.sb_mb;
 1365                         if (m != NULL)
 1366                                 nextrecord = m->m_nextpkt;
 1367                 }
 1368         }
 1369 
 1370         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1371         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
 1372                 flags |= MSG_TRUNC;
 1373                 if ((flags & MSG_PEEK) == 0)
 1374                         (void) sbdroprecord_locked(&so->so_rcv);
 1375         }
 1376         if ((flags & MSG_PEEK) == 0) {
 1377                 if (m == NULL) {
 1378                         /*
 1379                          * First part is an inline SB_EMPTY_FIXUP().  Second
 1380                          * part makes sure sb_lastrecord is up-to-date if
 1381                          * there is still data in the socket buffer.
 1382                          */
 1383                         so->so_rcv.sb_mb = nextrecord;
 1384                         if (so->so_rcv.sb_mb == NULL) {
 1385                                 so->so_rcv.sb_mbtail = NULL;
 1386                                 so->so_rcv.sb_lastrecord = NULL;
 1387                         } else if (nextrecord->m_nextpkt == NULL)
 1388                                 so->so_rcv.sb_lastrecord = nextrecord;
 1389                 }
 1390                 SBLASTRECORDCHK(&so->so_rcv);
 1391                 SBLASTMBUFCHK(&so->so_rcv);
 1392                 /*
 1393                  * If soreceive() is being done from the socket callback, then 
 1394                  * don't need to generate ACK to peer to update window, since 
 1395                  * ACK will be generated on return to TCP.
 1396                  */
 1397                 if (!(flags & MSG_SOCALLBCK) && 
 1398                     (pr->pr_flags & PR_WANTRCVD) && so->so_pcb) {
 1399                         SOCKBUF_UNLOCK(&so->so_rcv);
 1400                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
 1401                         SOCKBUF_LOCK(&so->so_rcv);
 1402                 }
 1403         }
 1404         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1405         if (orig_resid == uio->uio_resid && orig_resid &&
 1406             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
 1407                 sbunlock(&so->so_rcv);
 1408                 goto restart;
 1409         }
 1410 
 1411         if (flagsp != NULL)
 1412                 *flagsp |= flags;
 1413 release:
 1414         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1415         sbunlock(&so->so_rcv);
 1416 out:
 1417         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1418         SOCKBUF_UNLOCK(&so->so_rcv);
 1419         return (error);
 1420 }
 1421 
 1422 int
 1423 soshutdown(so, how)
 1424         struct socket *so;
 1425         int how;
 1426 {
 1427         struct protosw *pr = so->so_proto;
 1428 
 1429         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
 1430                 return (EINVAL);
 1431 
 1432         if (how != SHUT_WR)
 1433                 sorflush(so);
 1434         if (how != SHUT_RD)
 1435                 return ((*pr->pr_usrreqs->pru_shutdown)(so));
 1436         return (0);
 1437 }
 1438 
 1439 void
 1440 sorflush(so)
 1441         struct socket *so;
 1442 {
 1443         struct sockbuf *sb = &so->so_rcv;
 1444         struct protosw *pr = so->so_proto;
 1445         struct sockbuf asb;
 1446 
 1447         /*
 1448          * XXXRW: This is quite ugly.  Previously, this code made a copy of
 1449          * the socket buffer, then zero'd the original to clear the buffer
 1450          * fields.  However, with mutexes in the socket buffer, this causes
 1451          * problems.  We only clear the zeroable bits of the original;
 1452          * however, we have to initialize and destroy the mutex in the copy
 1453          * so that dom_dispose() and sbrelease() can lock t as needed.
 1454          */
 1455         SOCKBUF_LOCK(sb);
 1456         sb->sb_flags |= SB_NOINTR;
 1457         (void) sblock(sb, M_WAITOK);
 1458         /*
 1459          * socantrcvmore_locked() drops the socket buffer mutex so that it
 1460          * can safely perform wakeups.  Re-acquire the mutex before
 1461          * continuing.
 1462          */
 1463         socantrcvmore_locked(so);
 1464         SOCKBUF_LOCK(sb);
 1465         sbunlock(sb);
 1466         /*
 1467          * Invalidate/clear most of the sockbuf structure, but leave
 1468          * selinfo and mutex data unchanged.
 1469          */
 1470         bzero(&asb, offsetof(struct sockbuf, sb_startzero));
 1471         bcopy(&sb->sb_startzero, &asb.sb_startzero,
 1472             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 1473         bzero(&sb->sb_startzero,
 1474             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 1475         SOCKBUF_UNLOCK(sb);
 1476 
 1477         SOCKBUF_LOCK_INIT(&asb, "so_rcv");
 1478         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
 1479                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
 1480         sbrelease(&asb, so);
 1481         SOCKBUF_LOCK_DESTROY(&asb);
 1482 }
 1483 
 1484 /*
 1485  * Perhaps this routine, and sooptcopyout(), below, ought to come in
 1486  * an additional variant to handle the case where the option value needs
 1487  * to be some kind of integer, but not a specific size.
 1488  * In addition to their use here, these functions are also called by the
 1489  * protocol-level pr_ctloutput() routines.
 1490  */
 1491 int
 1492 sooptcopyin(sopt, buf, len, minlen)
 1493         struct  sockopt *sopt;
 1494         void    *buf;
 1495         size_t  len;
 1496         size_t  minlen;
 1497 {
 1498         size_t  valsize;
 1499 
 1500         /*
 1501          * If the user gives us more than we wanted, we ignore it,
 1502          * but if we don't get the minimum length the caller
 1503          * wants, we return EINVAL.  On success, sopt->sopt_valsize
 1504          * is set to however much we actually retrieved.
 1505          */
 1506         if ((valsize = sopt->sopt_valsize) < minlen)
 1507                 return EINVAL;
 1508         if (valsize > len)
 1509                 sopt->sopt_valsize = valsize = len;
 1510 
 1511         if (sopt->sopt_td != NULL)
 1512                 return (copyin(sopt->sopt_val, buf, valsize));
 1513 
 1514         bcopy(sopt->sopt_val, buf, valsize);
 1515         return 0;
 1516 }
 1517 
 1518 /*
 1519  * Kernel version of setsockopt(2)/
 1520  * XXX: optlen is size_t, not socklen_t
 1521  */
 1522 int
 1523 so_setsockopt(struct socket *so, int level, int optname, void *optval,
 1524     size_t optlen)
 1525 {
 1526         struct sockopt sopt;
 1527 
 1528         sopt.sopt_level = level;
 1529         sopt.sopt_name = optname;
 1530         sopt.sopt_dir = SOPT_SET;
 1531         sopt.sopt_val = optval;
 1532         sopt.sopt_valsize = optlen;
 1533         sopt.sopt_td = NULL;
 1534         return (sosetopt(so, &sopt));
 1535 }
 1536 
 1537 int
 1538 sosetopt(so, sopt)
 1539         struct socket *so;
 1540         struct sockopt *sopt;
 1541 {
 1542         int     error, optval;
 1543         struct  linger l;
 1544         struct  timeval tv;
 1545         u_long  val;
 1546 #ifdef MAC
 1547         struct mac extmac;
 1548 #endif
 1549 
 1550         error = 0;
 1551         if (sopt->sopt_level != SOL_SOCKET) {
 1552                 if (so->so_proto && so->so_proto->pr_ctloutput)
 1553                         return ((*so->so_proto->pr_ctloutput)
 1554                                   (so, sopt));
 1555                 error = ENOPROTOOPT;
 1556         } else {
 1557                 switch (sopt->sopt_name) {
 1558 #ifdef INET
 1559                 case SO_ACCEPTFILTER:
 1560                         error = do_setopt_accept_filter(so, sopt);
 1561                         if (error)
 1562                                 goto bad;
 1563                         break;
 1564 #endif
 1565                 case SO_LINGER:
 1566                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
 1567                         if (error)
 1568                                 goto bad;
 1569 
 1570                         SOCK_LOCK(so);
 1571                         so->so_linger = l.l_linger;
 1572                         if (l.l_onoff)
 1573                                 so->so_options |= SO_LINGER;
 1574                         else
 1575                                 so->so_options &= ~SO_LINGER;
 1576                         SOCK_UNLOCK(so);
 1577                         break;
 1578 
 1579                 case SO_DEBUG:
 1580                 case SO_KEEPALIVE:
 1581                 case SO_DONTROUTE:
 1582                 case SO_USELOOPBACK:
 1583                 case SO_BROADCAST:
 1584                 case SO_REUSEADDR:
 1585                 case SO_REUSEPORT:
 1586                 case SO_OOBINLINE:
 1587                 case SO_TIMESTAMP:
 1588                 case SO_BINTIME:
 1589                 case SO_NOSIGPIPE:
 1590                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1591                                             sizeof optval);
 1592                         if (error)
 1593                                 goto bad;
 1594                         SOCK_LOCK(so);
 1595                         if (optval)
 1596                                 so->so_options |= sopt->sopt_name;
 1597                         else
 1598                                 so->so_options &= ~sopt->sopt_name;
 1599                         SOCK_UNLOCK(so);
 1600                         break;
 1601 
 1602                 case SO_SNDBUF:
 1603                 case SO_RCVBUF:
 1604                 case SO_SNDLOWAT:
 1605                 case SO_RCVLOWAT:
 1606                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1607                                             sizeof optval);
 1608                         if (error)
 1609                                 goto bad;
 1610 
 1611                         /*
 1612                          * Values < 1 make no sense for any of these
 1613                          * options, so disallow them.
 1614                          */
 1615                         if (optval < 1) {
 1616                                 error = EINVAL;
 1617                                 goto bad;
 1618                         }
 1619 
 1620                         switch (sopt->sopt_name) {
 1621                         case SO_SNDBUF:
 1622                         case SO_RCVBUF:
 1623                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
 1624                                     &so->so_snd : &so->so_rcv, (u_long)optval,
 1625                                     so, curthread) == 0) {
 1626                                         error = ENOBUFS;
 1627                                         goto bad;
 1628                                 }
 1629                                 break;
 1630 
 1631                         /*
 1632                          * Make sure the low-water is never greater than
 1633                          * the high-water.
 1634                          */
 1635                         case SO_SNDLOWAT:
 1636                                 SOCKBUF_LOCK(&so->so_snd);
 1637                                 so->so_snd.sb_lowat =
 1638                                     (optval > so->so_snd.sb_hiwat) ?
 1639                                     so->so_snd.sb_hiwat : optval;
 1640                                 SOCKBUF_UNLOCK(&so->so_snd);
 1641                                 break;
 1642                         case SO_RCVLOWAT:
 1643                                 SOCKBUF_LOCK(&so->so_rcv);
 1644                                 so->so_rcv.sb_lowat =
 1645                                     (optval > so->so_rcv.sb_hiwat) ?
 1646                                     so->so_rcv.sb_hiwat : optval;
 1647                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1648                                 break;
 1649                         }
 1650                         break;
 1651 
 1652                 case SO_SNDTIMEO:
 1653                 case SO_RCVTIMEO:
 1654                         error = sooptcopyin(sopt, &tv, sizeof tv,
 1655                                             sizeof tv);
 1656                         if (error)
 1657                                 goto bad;
 1658 
 1659                         /* assert(hz > 0); */
 1660                         if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
 1661                             tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
 1662                                 error = EDOM;
 1663                                 goto bad;
 1664                         }
 1665                         /* assert(tick > 0); */
 1666                         /* assert(ULONG_MAX - INT_MAX >= 1000000); */
 1667                         val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
 1668                         if (val > INT_MAX) {
 1669                                 error = EDOM;
 1670                                 goto bad;
 1671                         }
 1672                         if (val == 0 && tv.tv_usec != 0)
 1673                                 val = 1;
 1674 
 1675                         switch (sopt->sopt_name) {
 1676                         case SO_SNDTIMEO:
 1677                                 so->so_snd.sb_timeo = val;
 1678                                 break;
 1679                         case SO_RCVTIMEO:
 1680                                 so->so_rcv.sb_timeo = val;
 1681                                 break;
 1682                         }
 1683                         break;
 1684 
 1685                 case SO_LABEL:
 1686 #ifdef MAC
 1687                         error = sooptcopyin(sopt, &extmac, sizeof extmac,
 1688                             sizeof extmac);
 1689                         if (error)
 1690                                 goto bad;
 1691                         error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
 1692                             so, &extmac);
 1693 #else
 1694                         error = EOPNOTSUPP;
 1695 #endif
 1696                         break;
 1697 
 1698                 default:
 1699                         error = ENOPROTOOPT;
 1700                         break;
 1701                 }
 1702                 if (error == 0 && so->so_proto != NULL &&
 1703                     so->so_proto->pr_ctloutput != NULL) {
 1704                         (void) ((*so->so_proto->pr_ctloutput)
 1705                                   (so, sopt));
 1706                 }
 1707         }
 1708 bad:
 1709         return (error);
 1710 }
 1711 
 1712 /* Helper routine for getsockopt */
 1713 int
 1714 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
 1715 {
 1716         int     error;
 1717         size_t  valsize;
 1718 
 1719         error = 0;
 1720 
 1721         /*
 1722          * Documented get behavior is that we always return a value,
 1723          * possibly truncated to fit in the user's buffer.
 1724          * Traditional behavior is that we always tell the user
 1725          * precisely how much we copied, rather than something useful
 1726          * like the total amount we had available for her.
 1727          * Note that this interface is not idempotent; the entire answer must
 1728          * generated ahead of time.
 1729          */
 1730         valsize = min(len, sopt->sopt_valsize);
 1731         sopt->sopt_valsize = valsize;
 1732         if (sopt->sopt_val != NULL) {
 1733                 if (sopt->sopt_td != NULL)
 1734                         error = copyout(buf, sopt->sopt_val, valsize);
 1735                 else
 1736                         bcopy(buf, sopt->sopt_val, valsize);
 1737         }
 1738         return error;
 1739 }
 1740 
 1741 int
 1742 sogetopt(so, sopt)
 1743         struct socket *so;
 1744         struct sockopt *sopt;
 1745 {
 1746         int     error, optval;
 1747         struct  linger l;
 1748         struct  timeval tv;
 1749 #ifdef MAC
 1750         struct mac extmac;
 1751 #endif
 1752 
 1753         error = 0;
 1754         if (sopt->sopt_level != SOL_SOCKET) {
 1755                 if (so->so_proto && so->so_proto->pr_ctloutput) {
 1756                         return ((*so->so_proto->pr_ctloutput)
 1757                                   (so, sopt));
 1758                 } else
 1759                         return (ENOPROTOOPT);
 1760         } else {
 1761                 switch (sopt->sopt_name) {
 1762 #ifdef INET
 1763                 case SO_ACCEPTFILTER:
 1764                         error = do_getopt_accept_filter(so, sopt);
 1765                         break;
 1766 #endif
 1767                 case SO_LINGER:
 1768                         SOCK_LOCK(so);
 1769                         l.l_onoff = so->so_options & SO_LINGER;
 1770                         l.l_linger = so->so_linger;
 1771                         SOCK_UNLOCK(so);
 1772                         error = sooptcopyout(sopt, &l, sizeof l);
 1773                         break;
 1774 
 1775                 case SO_USELOOPBACK:
 1776                 case SO_DONTROUTE:
 1777                 case SO_DEBUG:
 1778                 case SO_KEEPALIVE:
 1779                 case SO_REUSEADDR:
 1780                 case SO_REUSEPORT:
 1781                 case SO_BROADCAST:
 1782                 case SO_OOBINLINE:
 1783                 case SO_ACCEPTCONN:
 1784                 case SO_TIMESTAMP:
 1785                 case SO_BINTIME:
 1786                 case SO_NOSIGPIPE:
 1787                         optval = so->so_options & sopt->sopt_name;
 1788 integer:
 1789                         error = sooptcopyout(sopt, &optval, sizeof optval);
 1790                         break;
 1791 
 1792                 case SO_TYPE:
 1793                         optval = so->so_type;
 1794                         goto integer;
 1795 
 1796                 case SO_ERROR:
 1797                         optval = so->so_error;
 1798                         so->so_error = 0;
 1799                         goto integer;
 1800 
 1801                 case SO_SNDBUF:
 1802                         optval = so->so_snd.sb_hiwat;
 1803                         goto integer;
 1804 
 1805                 case SO_RCVBUF:
 1806                         optval = so->so_rcv.sb_hiwat;
 1807                         goto integer;
 1808 
 1809                 case SO_SNDLOWAT:
 1810                         optval = so->so_snd.sb_lowat;
 1811                         goto integer;
 1812 
 1813                 case SO_RCVLOWAT:
 1814                         optval = so->so_rcv.sb_lowat;
 1815                         goto integer;
 1816 
 1817                 case SO_SNDTIMEO:
 1818                 case SO_RCVTIMEO:
 1819                         optval = (sopt->sopt_name == SO_SNDTIMEO ?
 1820                                   so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
 1821 
 1822                         tv.tv_sec = optval / hz;
 1823                         tv.tv_usec = (optval % hz) * tick;
 1824                         error = sooptcopyout(sopt, &tv, sizeof tv);
 1825                         break;
 1826 
 1827                 case SO_LABEL:
 1828 #ifdef MAC
 1829                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
 1830                             sizeof(extmac));
 1831                         if (error)
 1832                                 return (error);
 1833                         error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
 1834                             so, &extmac);
 1835                         if (error)
 1836                                 return (error);
 1837                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
 1838 #else
 1839                         error = EOPNOTSUPP;
 1840 #endif
 1841                         break;
 1842 
 1843                 case SO_PEERLABEL:
 1844 #ifdef MAC
 1845                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
 1846                             sizeof(extmac));
 1847                         if (error)
 1848                                 return (error);
 1849                         error = mac_getsockopt_peerlabel(
 1850                             sopt->sopt_td->td_ucred, so, &extmac);
 1851                         if (error)
 1852                                 return (error);
 1853                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
 1854 #else
 1855                         error = EOPNOTSUPP;
 1856 #endif
 1857                         break;
 1858 
 1859                 case SO_LISTENQLIMIT:
 1860                         optval = so->so_qlimit;
 1861                         goto integer;
 1862 
 1863                 case SO_LISTENQLEN:
 1864                         optval = so->so_qlen;
 1865                         goto integer;
 1866 
 1867                 case SO_LISTENINCQLEN:
 1868                         optval = so->so_incqlen;
 1869                         goto integer;
 1870 
 1871                 default:
 1872                         error = ENOPROTOOPT;
 1873                         break;
 1874                 }
 1875                 return (error);
 1876         }
 1877 }
 1878 
 1879 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
 1880 int
 1881 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
 1882 {
 1883         struct mbuf *m, *m_prev;
 1884         int sopt_size = sopt->sopt_valsize;
 1885 
 1886         MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
 1887         if (m == NULL)
 1888                 return ENOBUFS;
 1889         if (sopt_size > MLEN) {
 1890                 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
 1891                 if ((m->m_flags & M_EXT) == 0) {
 1892                         m_free(m);
 1893                         return ENOBUFS;
 1894                 }
 1895                 m->m_len = min(MCLBYTES, sopt_size);
 1896         } else {
 1897                 m->m_len = min(MLEN, sopt_size);
 1898         }
 1899         sopt_size -= m->m_len;
 1900         *mp = m;
 1901         m_prev = m;
 1902 
 1903         while (sopt_size) {
 1904                 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
 1905                 if (m == NULL) {
 1906                         m_freem(*mp);
 1907                         return ENOBUFS;
 1908                 }
 1909                 if (sopt_size > MLEN) {
 1910                         MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
 1911                             M_DONTWAIT);
 1912                         if ((m->m_flags & M_EXT) == 0) {
 1913                                 m_freem(m);
 1914                                 m_freem(*mp);
 1915                                 return ENOBUFS;
 1916                         }
 1917                         m->m_len = min(MCLBYTES, sopt_size);
 1918                 } else {
 1919                         m->m_len = min(MLEN, sopt_size);
 1920                 }
 1921                 sopt_size -= m->m_len;
 1922                 m_prev->m_next = m;
 1923                 m_prev = m;
 1924         }
 1925         return 0;
 1926 }
 1927 
 1928 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
 1929 int
 1930 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 1931 {
 1932         struct mbuf *m0 = m;
 1933 
 1934         if (sopt->sopt_val == NULL)
 1935                 return 0;
 1936         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 1937                 if (sopt->sopt_td != NULL) {
 1938                         int error;
 1939 
 1940                         error = copyin(sopt->sopt_val, mtod(m, char *),
 1941                                        m->m_len);
 1942                         if (error != 0) {
 1943                                 m_freem(m0);
 1944                                 return(error);
 1945                         }
 1946                 } else
 1947                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
 1948                 sopt->sopt_valsize -= m->m_len;
 1949                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 1950                 m = m->m_next;
 1951         }
 1952         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
 1953                 panic("ip6_sooptmcopyin");
 1954         return 0;
 1955 }
 1956 
 1957 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
 1958 int
 1959 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
 1960 {
 1961         struct mbuf *m0 = m;
 1962         size_t valsize = 0;
 1963 
 1964         if (sopt->sopt_val == NULL)
 1965                 return 0;
 1966         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 1967                 if (sopt->sopt_td != NULL) {
 1968                         int error;
 1969 
 1970                         error = copyout(mtod(m, char *), sopt->sopt_val,
 1971                                        m->m_len);
 1972                         if (error != 0) {
 1973                                 m_freem(m0);
 1974                                 return(error);
 1975                         }
 1976                 } else
 1977                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
 1978                sopt->sopt_valsize -= m->m_len;
 1979                sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 1980                valsize += m->m_len;
 1981                m = m->m_next;
 1982         }
 1983         if (m != NULL) {
 1984                 /* enough soopt buffer should be given from user-land */
 1985                 m_freem(m0);
 1986                 return(EINVAL);
 1987         }
 1988         sopt->sopt_valsize = valsize;
 1989         return 0;
 1990 }
 1991 
 1992 void
 1993 sohasoutofband(so)
 1994         struct socket *so;
 1995 {
 1996         if (so->so_sigio != NULL)
 1997                 pgsigio(&so->so_sigio, SIGURG, 0);
 1998         selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
 1999 }
 2000 
 2001 int
 2002 sopoll(struct socket *so, int events, struct ucred *active_cred,
 2003     struct thread *td)
 2004 {
 2005         int revents = 0;
 2006 
 2007         SOCKBUF_LOCK(&so->so_snd);
 2008         SOCKBUF_LOCK(&so->so_rcv);
 2009         if (events & (POLLIN | POLLRDNORM))
 2010                 if (soreadable(so))
 2011                         revents |= events & (POLLIN | POLLRDNORM);
 2012 
 2013         if (events & POLLINIGNEOF)
 2014                 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
 2015                     !TAILQ_EMPTY(&so->so_comp) || so->so_error)
 2016                         revents |= POLLINIGNEOF;
 2017 
 2018         if (events & (POLLOUT | POLLWRNORM))
 2019                 if (sowriteable(so))
 2020                         revents |= events & (POLLOUT | POLLWRNORM);
 2021 
 2022         if (events & (POLLPRI | POLLRDBAND))
 2023                 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
 2024                         revents |= events & (POLLPRI | POLLRDBAND);
 2025 
 2026         if (revents == 0) {
 2027                 if (events &
 2028                     (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
 2029                      POLLRDBAND)) {
 2030                         selrecord(td, &so->so_rcv.sb_sel);
 2031                         so->so_rcv.sb_flags |= SB_SEL;
 2032                 }
 2033 
 2034                 if (events & (POLLOUT | POLLWRNORM)) {
 2035                         selrecord(td, &so->so_snd.sb_sel);
 2036                         so->so_snd.sb_flags |= SB_SEL;
 2037                 }
 2038         }
 2039 
 2040         SOCKBUF_UNLOCK(&so->so_rcv);
 2041         SOCKBUF_UNLOCK(&so->so_snd);
 2042         return (revents);
 2043 }
 2044 
 2045 int
 2046 soo_kqfilter(struct file *fp, struct knote *kn)
 2047 {
 2048         struct socket *so = kn->kn_fp->f_data;
 2049         struct sockbuf *sb;
 2050 
 2051         switch (kn->kn_filter) {
 2052         case EVFILT_READ:
 2053                 if (so->so_options & SO_ACCEPTCONN)
 2054                         kn->kn_fop = &solisten_filtops;
 2055                 else
 2056                         kn->kn_fop = &soread_filtops;
 2057                 sb = &so->so_rcv;
 2058                 break;
 2059         case EVFILT_WRITE:
 2060                 kn->kn_fop = &sowrite_filtops;
 2061                 sb = &so->so_snd;
 2062                 break;
 2063         default:
 2064                 return (EINVAL);
 2065         }
 2066 
 2067         SOCKBUF_LOCK(sb);
 2068         knlist_add(&sb->sb_sel.si_note, kn, 1);
 2069         sb->sb_flags |= SB_KNOTE;
 2070         SOCKBUF_UNLOCK(sb);
 2071         return (0);
 2072 }
 2073 
 2074 static void
 2075 filt_sordetach(struct knote *kn)
 2076 {
 2077         struct socket *so = kn->kn_fp->f_data;
 2078 
 2079         SOCKBUF_LOCK(&so->so_rcv);
 2080         knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
 2081         if (knlist_empty(&so->so_rcv.sb_sel.si_note))
 2082                 so->so_rcv.sb_flags &= ~SB_KNOTE;
 2083         SOCKBUF_UNLOCK(&so->so_rcv);
 2084 }
 2085 
 2086 /*ARGSUSED*/
 2087 static int
 2088 filt_soread(struct knote *kn, long hint)
 2089 {
 2090         struct socket *so;
 2091 
 2092         so = kn->kn_fp->f_data;
 2093         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 2094 
 2095         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
 2096         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 2097                 kn->kn_flags |= EV_EOF;
 2098                 kn->kn_fflags = so->so_error;
 2099                 return (1);
 2100         } else if (so->so_error)        /* temporary udp error */
 2101                 return (1);
 2102         else if (kn->kn_sfflags & NOTE_LOWAT)
 2103                 return (kn->kn_data >= kn->kn_sdata);
 2104         else
 2105                 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
 2106 }
 2107 
 2108 static void
 2109 filt_sowdetach(struct knote *kn)
 2110 {
 2111         struct socket *so = kn->kn_fp->f_data;
 2112 
 2113         SOCKBUF_LOCK(&so->so_snd);
 2114         knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
 2115         if (knlist_empty(&so->so_snd.sb_sel.si_note))
 2116                 so->so_snd.sb_flags &= ~SB_KNOTE;
 2117         SOCKBUF_UNLOCK(&so->so_snd);
 2118 }
 2119 
 2120 /*ARGSUSED*/
 2121 static int
 2122 filt_sowrite(struct knote *kn, long hint)
 2123 {
 2124         struct socket *so;
 2125 
 2126         so = kn->kn_fp->f_data;
 2127         SOCKBUF_LOCK_ASSERT(&so->so_snd);
 2128         kn->kn_data = sbspace(&so->so_snd);
 2129         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 2130                 kn->kn_flags |= EV_EOF;
 2131                 kn->kn_fflags = so->so_error;
 2132                 return (1);
 2133         } else if (so->so_error)        /* temporary udp error */
 2134                 return (1);
 2135         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
 2136             (so->so_proto->pr_flags & PR_CONNREQUIRED))
 2137                 return (0);
 2138         else if (kn->kn_sfflags & NOTE_LOWAT)
 2139                 return (kn->kn_data >= kn->kn_sdata);
 2140         else
 2141                 return (kn->kn_data >= so->so_snd.sb_lowat);
 2142 }
 2143 
 2144 /*ARGSUSED*/
 2145 static int
 2146 filt_solisten(struct knote *kn, long hint)
 2147 {
 2148         struct socket *so = kn->kn_fp->f_data;
 2149 
 2150         kn->kn_data = so->so_qlen;
 2151         return (! TAILQ_EMPTY(&so->so_comp));
 2152 }
 2153 
 2154 int
 2155 socheckuid(struct socket *so, uid_t uid)
 2156 {
 2157 
 2158         if (so == NULL)
 2159                 return (EPERM);
 2160         if (so->so_cred->cr_uid != uid)
 2161                 return (EPERM);
 2162         return (0);
 2163 }
 2164 
 2165 static int
 2166 somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
 2167 {
 2168         int error;
 2169         int val;
 2170 
 2171         val = somaxconn;
 2172         error = sysctl_handle_int(oidp, &val, sizeof(int), req);
 2173         if (error || !req->newptr )
 2174                 return (error);
 2175 
 2176         if (val < 1 || val > USHRT_MAX)
 2177                 return (EINVAL);
 2178 
 2179         somaxconn = val;
 2180         return (0);
 2181 }

Cache object: 8b1551852cf0f9a3a4161165409fa45d


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.