The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_socket.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2004 The FreeBSD Foundation
    3  * Copyright (c) 2004 Robert Watson
    4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  * 4. Neither the name of the University nor the names of its contributors
   16  *    may be used to endorse or promote products derived from this software
   17  *    without specific prior written permission.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  *
   31  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
   32  */
   33 
   34 #include <sys/cdefs.h>
   35 __FBSDID("$FreeBSD: releng/5.3/sys/kern/uipc_socket.c 137175 2004-11-04 01:17:31Z rwatson $");
   36 
   37 #include "opt_inet.h"
   38 #include "opt_mac.h"
   39 #include "opt_zero.h"
   40 
   41 #include <sys/param.h>
   42 #include <sys/systm.h>
   43 #include <sys/fcntl.h>
   44 #include <sys/limits.h>
   45 #include <sys/lock.h>
   46 #include <sys/mac.h>
   47 #include <sys/malloc.h>
   48 #include <sys/mbuf.h>
   49 #include <sys/mutex.h>
   50 #include <sys/domain.h>
   51 #include <sys/file.h>                   /* for struct knote */
   52 #include <sys/kernel.h>
   53 #include <sys/event.h>
   54 #include <sys/poll.h>
   55 #include <sys/proc.h>
   56 #include <sys/protosw.h>
   57 #include <sys/socket.h>
   58 #include <sys/socketvar.h>
   59 #include <sys/resourcevar.h>
   60 #include <sys/signalvar.h>
   61 #include <sys/sysctl.h>
   62 #include <sys/uio.h>
   63 #include <sys/jail.h>
   64 
   65 #include <vm/uma.h>
   66 
   67 
/* Forward declarations for routines private to this file. */
static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

#ifdef INET
static int	 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

/* kqueue filter callbacks for socket read, write, and listen events. */
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

/*
 * Filter operation tables registered with kqueue; the leading 1
 * initializes the first filterops field (f_isfd in this era -- the
 * filter attaches via a file descriptor).
 */
static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };
   87 
/* UMA zone from which all struct socket allocations are made. */
uma_zone_t socket_zone;
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

/* Upper bound on listen(2) backlog; solisten() clamps requests to this. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
/* Count of currently open sockets; maintained under so_global_mtx. */
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */
  113 
/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
/*
 * NOTE(review): the lock name "so_glabel" appears to be a typo for
 * "so_global".  It is only a human-readable witness/debug name, but
 * confirm no tooling matches on it before renaming.
 */
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
  130 
  131 /*
  132  * Socket operation routines.
  133  * These routines are called by the routines in
  134  * sys_socket.c or from a system process, and
  135  * implement the semantics of socket operations by
  136  * switching out to the protocol specific routines.
  137  */
  138 
  139 /*
  140  * Get a socket structure from our zone, and initialize it.
  141  * Note that it would probably be better to allocate socket
  142  * and PCB at the same time, but I'm not convinced that all
  143  * the protocols can be easily modified to do this.
  144  *
  145  * soalloc() returns a socket with a ref count of 0.
  146  */
  147 struct socket *
  148 soalloc(int mflags)
  149 {
  150         struct socket *so;
  151 #ifdef MAC
  152         int error;
  153 #endif
  154 
  155         so = uma_zalloc(socket_zone, mflags | M_ZERO);
  156         if (so != NULL) {
  157 #ifdef MAC
  158                 error = mac_init_socket(so, mflags);
  159                 if (error != 0) {
  160                         uma_zfree(socket_zone, so);
  161                         so = NULL;
  162                         return so;
  163                 }
  164 #endif
  165                 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
  166                 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
  167                 /* sx_init(&so->so_sxlock, "socket sxlock"); */
  168                 TAILQ_INIT(&so->so_aiojobq);
  169                 mtx_lock(&so_global_mtx);
  170                 so->so_gencnt = ++so_gencnt;
  171                 ++numopensockets;
  172                 mtx_unlock(&so_global_mtx);
  173         }
  174         return so;
  175 }
  176 
/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 *
 * Looks up the protocol switch entry for (dom, type, proto), enforces the
 * jail address-family policy, allocates and initializes a socket, and
 * hands it to the protocol's pru_attach routine.  On attach failure the
 * socket is released via sorele() under the accept and socket locks.
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	/* An explicit protocol number overrides lookup by (domain, type). */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
		return (EPROTONOSUPPORT);

	/*
	 * Jailed processes may optionally be restricted to UNIX-domain,
	 * IPv4, and routing sockets.
	 */
	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);	/* socket holds a credential ref */
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	soref(so);	/* ref count 0 -> 1; dropped by soclose() */
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		/*
		 * sorele() requires the accept mutex and socket lock; mark
		 * the socket as having no file descriptor reference so it
		 * can be freed once the refcount drops.
		 */
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}
  239 
  240 int
  241 sobind(so, nam, td)
  242         struct socket *so;
  243         struct sockaddr *nam;
  244         struct thread *td;
  245 {
  246 
  247         return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
  248 }
  249 
/*
 * Release a socket structure whose reference count has reached zero:
 * return accounted socket buffer space, detach any accept filter and MAC
 * label, drop the credential reference, and free the structure back to
 * the socket zone.
 */
void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	/* Retire this socket's generation number under the global lock. */
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	/* Return reserved send/receive buffer space to the owning uid. */
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* Remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	/*
	 * XXXRW: Seems like a shame to grab the mutex again down here, but
	 * we don't want to decrement the socket count until after we free
	 * the socket, and we can't increment the gencnt on the socket after
	 * we free it, so...
	 */
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}
  287 
/*
 * Mark a socket as willing to accept connections.  Rejects sockets that
 * are connected, connecting, or disconnecting; otherwise lets the
 * protocol perform its listen setup via pru_listen, then sets
 * SO_ACCEPTCONN and the (clamped) backlog under the accept mutex.
 */
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
			    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	/* Only mark SO_ACCEPTCONN when no completed connections queued. */
	if (TAILQ_EMPTY(&so->so_comp)) {
		SOCK_LOCK(so);
		so->so_options |= SO_ACCEPTCONN;
		SOCK_UNLOCK(so);
	}
	/* Clamp the requested backlog to the kern.ipc.somaxconn limit. */
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}
  319 
/*
 * Attempt to free a socket.  This should really be sotryfree().
 *
 * We free the socket if the protocol is no longer interested in the socket,
 * there's no file descriptor reference, and the refcount is 0.  While the
 * calling macro sotryfree() tests the refcount, sofree() has to test it
 * again as it's possible to race with an accept()ing thread if the socket
 * is in the listen queue of a listen socket, as being in the listen queue
 * doesn't elevate the reference count.  sofree() acquires the accept mutex
 * early for this test in order to avoid that race.
 */
void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	/* Caller must hold both the accept mutex and the socket lock. */
	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	/*
	 * Still referenced by the protocol, a file descriptor, or an
	 * explicit reference: do nothing.  Both locks are dropped on
	 * every return path below.
	 */
	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
	    so->so_count != 0) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so we just
		 * return here if this socket is currently on the completed
		 * connection queue.  Otherwise, accept(2) may hang after
		 * select(2) has indicated that a listening socket was
		 * ready.  If it's an incomplete connection, we remove it
		 * from the queue and free it; otherwise, it won't be
		 * released until the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			SOCK_UNLOCK(so);
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();
	/*
	 * Shut down the send side: SB_NOINTR is set before taking the
	 * buffer lock (presumably so the sblock() sleep is not
	 * interruptible -- NOTE(review): confirm against sblock()).
	 */
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	/* Tear down kqueue attachment points before freeing the socket. */
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}
  401 
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		/*
		 * Listening socket: abort every connection still sitting
		 * on the incomplete and completed queues.  The accept
		 * mutex is dropped around each soabort() call -- soabort()
		 * must run without socket locks held -- so the queue head
		 * is re-fetched on every iteration.
		 */
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket mid-disconnect: don't wait. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * Linger: sleep (signal-interruptible, bounded by
			 * so_linger ticks-worth of seconds) until the
			 * connection fully drops.
			 */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb != NULL) {
		/* Detach the protocol; keep the first error encountered. */
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	/* sorele() drops both locks and may free the socket via sofree(). */
	sorele(so);
	return (error);
}
  477 
/*
 * soabort() must not be called with any socket locks held, as it calls
 * into the protocol, which will call back into the socket code causing
 * it to acquire additional socket locks that may cause recursion or lock
 * order reversals.
 */
int
soabort(so)
	struct socket *so;
{
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	if (error) {
		/* Protocol abort failed: attempt to free the socket here. */
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		sotryfree(so);	/* note: does not decrement the ref count */
		return error;
	}
	return (0);
}
  499 
  500 int
  501 soaccept(so, nam)
  502         struct socket *so;
  503         struct sockaddr **nam;
  504 {
  505         int error;
  506 
  507         SOCK_LOCK(so);
  508         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
  509         so->so_state &= ~SS_NOFDREF;
  510         SOCK_UNLOCK(so);
  511         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
  512         return (error);
  513 }
  514 
  515 int
  516 soconnect(so, nam, td)
  517         struct socket *so;
  518         struct sockaddr *nam;
  519         struct thread *td;
  520 {
  521         int error;
  522 
  523         if (so->so_options & SO_ACCEPTCONN)
  524                 return (EOPNOTSUPP);
  525         /*
  526          * If protocol is connection-based, can only connect once.
  527          * Otherwise, if connected, try to disconnect first.
  528          * This allows user to disconnect by connecting to, e.g.,
  529          * a null address.
  530          */
  531         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
  532             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
  533             (error = sodisconnect(so))))
  534                 error = EISCONN;
  535         else
  536                 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
  537         return (error);
  538 }
  539 
  540 int
  541 soconnect2(so1, so2)
  542         struct socket *so1;
  543         struct socket *so2;
  544 {
  545 
  546         return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
  547 }
  548 
  549 int
  550 sodisconnect(so)
  551         struct socket *so;
  552 {
  553         int error;
  554 
  555         if ((so->so_state & SS_ISCONNECTED) == 0)
  556                 return (ENOTCONN);
  557         if (so->so_state & SS_ISDISCONNECTING)
  558                 return (EALREADY);
  559         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
  560         return (error);
  561 }
  562 
/* Socket-buffer lock mode: don't sleep when the caller passed MSG_DONTWAIT. */
#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
/* Counters recording how often the zero-copy send prerequisites held. */
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
/* Headers needed only by the zero-copy send path. */
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/
  596 
  597 int
  598 sosend(so, addr, uio, top, control, flags, td)
  599         struct socket *so;
  600         struct sockaddr *addr;
  601         struct uio *uio;
  602         struct mbuf *top;
  603         struct mbuf *control;
  604         int flags;
  605         struct thread *td;
  606 {
  607         struct mbuf **mp;
  608         struct mbuf *m;
  609         long space, len = 0, resid;
  610         int clen = 0, error, dontroute;
  611         int atomic = sosendallatonce(so) || top;
  612 #ifdef ZERO_COPY_SOCKETS
  613         int cow_send;
  614 #endif /* ZERO_COPY_SOCKETS */
  615 
  616         if (uio != NULL)
  617                 resid = uio->uio_resid;
  618         else
  619                 resid = top->m_pkthdr.len;
  620         /*
  621          * In theory resid should be unsigned.
  622          * However, space must be signed, as it might be less than 0
  623          * if we over-committed, and we must use a signed comparison
  624          * of space and resid.  On the other hand, a negative resid
  625          * causes us to loop sending 0-length segments to the protocol.
  626          *
  627          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
  628          * type sockets since that's an error.
  629          */
  630         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
  631                 error = EINVAL;
  632                 goto out;
  633         }
  634 
  635         dontroute =
  636             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
  637             (so->so_proto->pr_flags & PR_ATOMIC);
  638         if (td != NULL)
  639                 td->td_proc->p_stats->p_ru.ru_msgsnd++;
  640         if (control != NULL)
  641                 clen = control->m_len;
  642 #define snderr(errno)   { error = (errno); goto release; }
  643 
  644         SOCKBUF_LOCK(&so->so_snd);
  645 restart:
  646         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  647         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
  648         if (error)
  649                 goto out_locked;
  650         do {
  651                 SOCKBUF_LOCK_ASSERT(&so->so_snd);
  652                 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
  653                         snderr(EPIPE);
  654                 if (so->so_error) {
  655                         error = so->so_error;
  656                         so->so_error = 0;
  657                         goto release;
  658                 }
  659                 if ((so->so_state & SS_ISCONNECTED) == 0) {
  660                         /*
  661                          * `sendto' and `sendmsg' is allowed on a connection-
  662                          * based socket if it supports implied connect.
  663                          * Return ENOTCONN if not connected and no address is
  664                          * supplied.
  665                          */
  666                         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
  667                             (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
  668                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
  669                                     !(resid == 0 && clen != 0))
  670                                         snderr(ENOTCONN);
  671                         } else if (addr == NULL)
  672                             snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
  673                                    ENOTCONN : EDESTADDRREQ);
  674                 }
  675                 space = sbspace(&so->so_snd);
  676                 if (flags & MSG_OOB)
  677                         space += 1024;
  678                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
  679                     clen > so->so_snd.sb_hiwat)
  680                         snderr(EMSGSIZE);
  681                 if (space < resid + clen &&
  682                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
  683                         if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
  684                                 snderr(EWOULDBLOCK);
  685                         sbunlock(&so->so_snd);
  686                         error = sbwait(&so->so_snd);
  687                         if (error)
  688                                 goto out_locked;
  689                         goto restart;
  690                 }
  691                 SOCKBUF_UNLOCK(&so->so_snd);
  692                 mp = &top;
  693                 space -= clen;
  694                 do {
  695                     if (uio == NULL) {
  696                         /*
  697                          * Data is prepackaged in "top".
  698                          */
  699                         resid = 0;
  700                         if (flags & MSG_EOR)
  701                                 top->m_flags |= M_EOR;
  702                     } else do {
  703 #ifdef ZERO_COPY_SOCKETS
  704                         cow_send = 0;
  705 #endif /* ZERO_COPY_SOCKETS */
  706                         if (resid >= MINCLSIZE) {
  707 #ifdef ZERO_COPY_SOCKETS
  708                                 if (top == NULL) {
  709                                         MGETHDR(m, M_TRYWAIT, MT_DATA);
  710                                         if (m == NULL) {
  711                                                 error = ENOBUFS;
  712                                                 SOCKBUF_LOCK(&so->so_snd);
  713                                                 goto release;
  714                                         }
  715                                         m->m_pkthdr.len = 0;
  716                                         m->m_pkthdr.rcvif = (struct ifnet *)0;
  717                                 } else {
  718                                         MGET(m, M_TRYWAIT, MT_DATA);
  719                                         if (m == NULL) {
  720                                                 error = ENOBUFS;
  721                                                 SOCKBUF_LOCK(&so->so_snd);
  722                                                 goto release;
  723                                         }
  724                                 }
  725                                 if (so_zero_copy_send &&
  726                                     resid>=PAGE_SIZE &&
  727                                     space>=PAGE_SIZE &&
  728                                     uio->uio_iov->iov_len>=PAGE_SIZE) {
  729                                         so_zerocp_stats.size_ok++;
  730                                         if (!((vm_offset_t)
  731                                           uio->uio_iov->iov_base & PAGE_MASK)){
  732                                                 so_zerocp_stats.align_ok++;
  733                                                 cow_send = socow_setup(m, uio);
  734                                         }
  735                                 }
  736                                 if (!cow_send) {
  737                                         MCLGET(m, M_TRYWAIT);
  738                                         if ((m->m_flags & M_EXT) == 0) {
  739                                                 m_free(m);
  740                                                 m = NULL;
  741                                         } else {
  742                                                 len = min(min(MCLBYTES, resid), space);
  743                                         }
  744                                 } else
  745                                         len = PAGE_SIZE;
  746 #else /* ZERO_COPY_SOCKETS */
  747                                 if (top == NULL) {
  748                                         m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
  749                                         m->m_pkthdr.len = 0;
  750                                         m->m_pkthdr.rcvif = (struct ifnet *)0;
  751                                 } else
  752                                         m = m_getcl(M_TRYWAIT, MT_DATA, 0);
  753                                 len = min(min(MCLBYTES, resid), space);
  754 #endif /* ZERO_COPY_SOCKETS */
  755                         } else {
  756                                 if (top == NULL) {
  757                                         m = m_gethdr(M_TRYWAIT, MT_DATA);
  758                                         m->m_pkthdr.len = 0;
  759                                         m->m_pkthdr.rcvif = (struct ifnet *)0;
  760 
  761                                         len = min(min(MHLEN, resid), space);
  762                                         /*
  763                                          * For datagram protocols, leave room
  764                                          * for protocol headers in first mbuf.
  765                                          */
  766                                         if (atomic && m && len < MHLEN)
  767                                                 MH_ALIGN(m, len);
  768                                 } else {
  769                                         m = m_get(M_TRYWAIT, MT_DATA);
  770                                         len = min(min(MLEN, resid), space);
  771                                 }
  772                         }
  773                         if (m == NULL) {
  774                                 error = ENOBUFS;
  775                                 SOCKBUF_LOCK(&so->so_snd);
  776                                 goto release;
  777                         }
  778 
  779                         space -= len;
  780 #ifdef ZERO_COPY_SOCKETS
  781                         if (cow_send)
  782                                 error = 0;
  783                         else
  784 #endif /* ZERO_COPY_SOCKETS */
  785                         error = uiomove(mtod(m, void *), (int)len, uio);
  786                         resid = uio->uio_resid;
  787                         m->m_len = len;
  788                         *mp = m;
  789                         top->m_pkthdr.len += len;
  790                         if (error) {
  791                                 SOCKBUF_LOCK(&so->so_snd);
  792                                 goto release;
  793                         }
  794                         mp = &m->m_next;
  795                         if (resid <= 0) {
  796                                 if (flags & MSG_EOR)
  797                                         top->m_flags |= M_EOR;
  798                                 break;
  799                         }
  800                     } while (space > 0 && atomic);
  801                     if (dontroute) {
  802                             SOCK_LOCK(so);
  803                             so->so_options |= SO_DONTROUTE;
  804                             SOCK_UNLOCK(so);
  805                     }
  806                     /*
  807                      * XXX all the SBS_CANTSENDMORE checks previously
  808                      * done could be out of date.  We could have recieved
  809                      * a reset packet in an interrupt or maybe we slept
  810                      * while doing page faults in uiomove() etc. We could
  811                      * probably recheck again inside the locking protection
  812                      * here, but there are probably other places that this
  813                      * also happens.  We must rethink this.
  814                      */
  815                     error = (*so->so_proto->pr_usrreqs->pru_send)(so,
  816                         (flags & MSG_OOB) ? PRUS_OOB :
  817                         /*
  818                          * If the user set MSG_EOF, the protocol
  819                          * understands this flag and nothing left to
  820                          * send then use PRU_SEND_EOF instead of PRU_SEND.
  821                          */
  822                         ((flags & MSG_EOF) &&
  823                          (so->so_proto->pr_flags & PR_IMPLOPCL) &&
  824                          (resid <= 0)) ?
  825                                 PRUS_EOF :
  826                         /* If there is more to send set PRUS_MORETOCOME */
  827                         (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
  828                         top, addr, control, td);
  829                     if (dontroute) {
  830                             SOCK_LOCK(so);
  831                             so->so_options &= ~SO_DONTROUTE;
  832                             SOCK_UNLOCK(so);
  833                     }
  834                     clen = 0;
  835                     control = NULL;
  836                     top = NULL;
  837                     mp = &top;
  838                     if (error) {
  839                         SOCKBUF_LOCK(&so->so_snd);
  840                         goto release;
  841                     }
  842                 } while (resid && space > 0);
  843                 SOCKBUF_LOCK(&so->so_snd);
  844         } while (resid);
  845 
  846 release:
  847         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  848         sbunlock(&so->so_snd);
  849 out_locked:
  850         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  851         SOCKBUF_UNLOCK(&so->so_snd);
  852 out:
  853         if (top != NULL)
  854                 m_freem(top);
  855         if (control != NULL)
  856                 m_freem(control);
  857         return (error);
  858 }
  859 
  860 /*
  861  * The part of soreceive() that implements reading non-inline out-of-band
  862  * data from a socket.  For more complete comments, see soreceive(), from
  863  * which this code originated.
  864  *
 * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreceive(),
  866  * is unable to return an mbuf chain to the caller.
  867  */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	/*
	 * Allocate a single mbuf and ask the protocol to fill it with the
	 * pending out-of-band data; MSG_PEEK tells the protocol not to
	 * consume it.  The protocol may chain additional mbufs onto m.
	 */
	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	/*
	 * Copy the returned chain out to userspace one mbuf at a time,
	 * freeing each mbuf as it is drained (m_free returns the next
	 * mbuf in the chain, or NULL at the end).
	 */
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			vm_page_t pg;
			int disposable;

			/*
			 * An EXT_DISPOSABLE external buffer may be handed
			 * directly to the VM system instead of being copied.
			 */
			if ((m->m_flags & M_EXT)
			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
			if (uio->uio_offset == -1)
				uio->uio_offset =IDX_TO_OFF(pg->pindex);

			error = uiomoveco(mtod(m, void *),
					  min(uio->uio_resid, m->m_len),
					  uio, pg->object,
					  disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	/* On error or short copy, discard whatever remains of the chain. */
	if (m != NULL)
		m_freem(m);
	return (error);
}
  917 
  918 /*
  919  * Following replacement or removal of the first mbuf on the first mbuf chain
  920  * of a socket buffer, push necessary state changes back into the socket
  921  * buffer so that other consumers see the values consistently.  'nextrecord'
  922  * is the callers locally stored value of the original value of
  923  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
  924  * NOTE: 'nextrecord' may be NULL.
  925  */
  926 static __inline void
  927 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
  928 {
  929 
  930         SOCKBUF_LOCK_ASSERT(sb);
  931         /*
  932          * First, update for the new value of nextrecord.  If necessary, make
  933          * it the first record.
  934          */
  935         if (sb->sb_mb != NULL)
  936                 sb->sb_mb->m_nextpkt = nextrecord;
  937         else
  938                 sb->sb_mb = nextrecord;
  939 
  940         /*
  941          * Now update any dependent socket buffer fields to reflect the new
  942          * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
  943          * addition of a second clause that takes care of the case where
  944          * sb_mb has been updated, but remains the last record.
  945          */
  946         if (sb->sb_mb == NULL) {
  947                 sb->sb_mbtail = NULL;
  948                 sb->sb_lastrecord = NULL;
  949         } else if (sb->sb_mb->m_nextpkt == NULL)
  950                 sb->sb_lastrecord = sb->sb_mb;
  951 }
  952 
  953 
  954 /*
  955  * Implement receive operations on a socket.
  956  * We depend on the way that records are added to the sockbuf
  957  * by sbappend*.  In particular, each record (mbufs linked through m_next)
  958  * must begin with an address if the protocol so specifies,
  959  * followed by an optional mbuf or mbufs containing ancillary data,
  960  * and then zero or more mbufs of data.
  961  * In order to avoid blocking network interrupts for the entire time here,
  962  * we splx() while doing the actual copy to user space.
  963  * Although the sockbuf is locked, new data may still be appended,
  964  * and thus we must maintain consistency of the sockbuf during that time.
  965  *
  966  * The caller may receive the data as a single mbuf chain by supplying
  967  * an mbuf **mp0 for use in returning the chain.  The uio is then used
  968  * only for the count in uio_resid.
  969  */
  970 int
  971 soreceive(so, psa, uio, mp0, controlp, flagsp)
  972         struct socket *so;
  973         struct sockaddr **psa;
  974         struct uio *uio;
  975         struct mbuf **mp0;
  976         struct mbuf **controlp;
  977         int *flagsp;
  978 {
  979         struct mbuf *m, **mp;
  980         int flags, len, error, offset;
  981         struct protosw *pr = so->so_proto;
  982         struct mbuf *nextrecord;
  983         int moff, type = 0;
  984         int orig_resid = uio->uio_resid;
  985 
  986         mp = mp0;
  987         if (psa != NULL)
  988                 *psa = NULL;
  989         if (controlp != NULL)
  990                 *controlp = NULL;
  991         if (flagsp != NULL)
  992                 flags = *flagsp &~ MSG_EOR;
  993         else
  994                 flags = 0;
  995         if (flags & MSG_OOB)
  996                 return (soreceive_rcvoob(so, uio, flags));
  997         if (mp != NULL)
  998                 *mp = NULL;
  999         if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
 1000                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
 1001 
 1002         SOCKBUF_LOCK(&so->so_rcv);
 1003 restart:
 1004         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1005         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
 1006         if (error)
 1007                 goto out;
 1008 
 1009         m = so->so_rcv.sb_mb;
 1010         /*
 1011          * If we have less data than requested, block awaiting more
 1012          * (subject to any timeout) if:
 1013          *   1. the current count is less than the low water mark, or
 1014          *   2. MSG_WAITALL is set, and it is possible to do the entire
 1015          *      receive operation at once if we block (resid <= hiwat).
 1016          *   3. MSG_DONTWAIT is not set
 1017          * If MSG_WAITALL is set but resid is larger than the receive buffer,
 1018          * we have to do the receive in sections, and thus risk returning
 1019          * a short count if a timeout or signal occurs after we start.
 1020          */
 1021         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
 1022             so->so_rcv.sb_cc < uio->uio_resid) &&
 1023             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
 1024             ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
 1025             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
 1026                 KASSERT(m != NULL || !so->so_rcv.sb_cc,
 1027                     ("receive: m == %p so->so_rcv.sb_cc == %u",
 1028                     m, so->so_rcv.sb_cc));
 1029                 if (so->so_error) {
 1030                         if (m != NULL)
 1031                                 goto dontblock;
 1032                         error = so->so_error;
 1033                         if ((flags & MSG_PEEK) == 0)
 1034                                 so->so_error = 0;
 1035                         goto release;
 1036                 }
 1037                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1038                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 1039                         if (m)
 1040                                 goto dontblock;
 1041                         else
 1042                                 goto release;
 1043                 }
 1044                 for (; m != NULL; m = m->m_next)
 1045                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
 1046                                 m = so->so_rcv.sb_mb;
 1047                                 goto dontblock;
 1048                         }
 1049                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
 1050                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
 1051                         error = ENOTCONN;
 1052                         goto release;
 1053                 }
 1054                 if (uio->uio_resid == 0)
 1055                         goto release;
 1056                 if ((so->so_state & SS_NBIO) ||
 1057                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
 1058                         error = EWOULDBLOCK;
 1059                         goto release;
 1060                 }
 1061                 SBLASTRECORDCHK(&so->so_rcv);
 1062                 SBLASTMBUFCHK(&so->so_rcv);
 1063                 sbunlock(&so->so_rcv);
 1064                 error = sbwait(&so->so_rcv);
 1065                 if (error)
 1066                         goto out;
 1067                 goto restart;
 1068         }
 1069 dontblock:
 1070         /*
 1071          * From this point onward, we maintain 'nextrecord' as a cache of the
 1072          * pointer to the next record in the socket buffer.  We must keep the
 1073          * various socket buffer pointers and local stack versions of the
 1074          * pointers in sync, pushing out modifications before dropping the
 1075          * socket buffer mutex, and re-reading them when picking it up.
 1076          *
 1077          * Otherwise, we will race with the network stack appending new data
 1078          * or records onto the socket buffer by using inconsistent/stale
 1079          * versions of the field, possibly resulting in socket buffer
 1080          * corruption.
 1081          *
 1082          * By holding the high-level sblock(), we prevent simultaneous
 1083          * readers from pulling off the front of the socket buffer.
 1084          */
 1085         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1086         if (uio->uio_td)
 1087                 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
 1088         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
 1089         SBLASTRECORDCHK(&so->so_rcv);
 1090         SBLASTMBUFCHK(&so->so_rcv);
 1091         nextrecord = m->m_nextpkt;
 1092         if (pr->pr_flags & PR_ADDR) {
 1093                 KASSERT(m->m_type == MT_SONAME,
 1094                     ("m->m_type == %d", m->m_type));
 1095                 orig_resid = 0;
 1096                 if (psa != NULL)
 1097                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
 1098                             M_NOWAIT);
 1099                 if (flags & MSG_PEEK) {
 1100                         m = m->m_next;
 1101                 } else {
 1102                         sbfree(&so->so_rcv, m);
 1103                         so->so_rcv.sb_mb = m_free(m);
 1104                         m = so->so_rcv.sb_mb;
 1105                         sockbuf_pushsync(&so->so_rcv, nextrecord);
 1106                 }
 1107         }
 1108 
 1109         /*
 1110          * Process one or more MT_CONTROL mbufs present before any data mbufs
 1111          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 1112          * just copy the data; if !MSG_PEEK, we call into the protocol to
 1113          * perform externalization (or freeing if controlp == NULL).
 1114          */
 1115         if (m != NULL && m->m_type == MT_CONTROL) {
 1116                 struct mbuf *cm = NULL, *cmn;
 1117                 struct mbuf **cme = &cm;
 1118 
 1119                 do {
 1120                         if (flags & MSG_PEEK) {
 1121                                 if (controlp != NULL) {
 1122                                         *controlp = m_copy(m, 0, m->m_len);
 1123                                         controlp = &(*controlp)->m_next;
 1124                                 }
 1125                                 m = m->m_next;
 1126                         } else {
 1127                                 sbfree(&so->so_rcv, m);
 1128                                 so->so_rcv.sb_mb = m->m_next;
 1129                                 m->m_next = NULL;
 1130                                 *cme = m;
 1131                                 cme = &(*cme)->m_next;
 1132                                 m = so->so_rcv.sb_mb;
 1133                         }
 1134                 } while (m != NULL && m->m_type == MT_CONTROL);
 1135                 if ((flags & MSG_PEEK) == 0)
 1136                         sockbuf_pushsync(&so->so_rcv, nextrecord);
 1137                 while (cm != NULL) {
 1138                         cmn = cm->m_next;
 1139                         cm->m_next = NULL;
 1140                         if (pr->pr_domain->dom_externalize != NULL) {
 1141                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1142                                 error = (*pr->pr_domain->dom_externalize)
 1143                                     (cm, controlp);
 1144                                 SOCKBUF_LOCK(&so->so_rcv);
 1145                         } else if (controlp != NULL)
 1146                                 *controlp = cm;
 1147                         else
 1148                                 m_freem(cm);
 1149                         if (controlp != NULL) {
 1150                                 orig_resid = 0;
 1151                                 while (*controlp != NULL)
 1152                                         controlp = &(*controlp)->m_next;
 1153                         }
 1154                         cm = cmn;
 1155                 }
 1156                 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
 1157                 orig_resid = 0;
 1158         }
 1159         if (m != NULL) {
 1160                 if ((flags & MSG_PEEK) == 0) {
 1161                         KASSERT(m->m_nextpkt == nextrecord,
 1162                             ("soreceive: post-control, nextrecord !sync"));
 1163                         if (nextrecord == NULL) {
 1164                                 KASSERT(so->so_rcv.sb_mb == m,
 1165                                     ("soreceive: post-control, sb_mb!=m"));
 1166                                 KASSERT(so->so_rcv.sb_lastrecord == m,
 1167                                     ("soreceive: post-control, lastrecord!=m"));
 1168                         }
 1169                 }
 1170                 type = m->m_type;
 1171                 if (type == MT_OOBDATA)
 1172                         flags |= MSG_OOB;
 1173         } else {
 1174                 if ((flags & MSG_PEEK) == 0) {
 1175                         KASSERT(so->so_rcv.sb_mb == nextrecord,
 1176                             ("soreceive: sb_mb != nextrecord"));
 1177                         if (so->so_rcv.sb_mb == NULL) {
 1178                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
 1179                                     ("soreceive: sb_lastercord != NULL"));
 1180                         }
 1181                 }
 1182         }
 1183         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1184         SBLASTRECORDCHK(&so->so_rcv);
 1185         SBLASTMBUFCHK(&so->so_rcv);
 1186 
 1187         /*
 1188          * Now continue to read any data mbufs off of the head of the socket
 1189          * buffer until the read request is satisfied.  Note that 'type' is
 1190          * used to store the type of any mbuf reads that have happened so far
 1191          * such that soreceive() can stop reading if the type changes, which
 1192          * causes soreceive() to return only one of regular data and inline
 1193          * out-of-band data in a single socket receive operation.
 1194          */
 1195         moff = 0;
 1196         offset = 0;
 1197         while (m != NULL && uio->uio_resid > 0 && error == 0) {
 1198                 /*
 1199                  * If the type of mbuf has changed since the last mbuf
 1200                  * examined ('type'), end the receive operation.
 1201                  */
 1202                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1203                 if (m->m_type == MT_OOBDATA) {
 1204                         if (type != MT_OOBDATA)
 1205                                 break;
 1206                 } else if (type == MT_OOBDATA)
 1207                         break;
 1208                 else
 1209                     KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
 1210                         ("m->m_type == %d", m->m_type));
 1211                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
 1212                 len = uio->uio_resid;
 1213                 if (so->so_oobmark && len > so->so_oobmark - offset)
 1214                         len = so->so_oobmark - offset;
 1215                 if (len > m->m_len - moff)
 1216                         len = m->m_len - moff;
 1217                 /*
 1218                  * If mp is set, just pass back the mbufs.
 1219                  * Otherwise copy them out via the uio, then free.
 1220                  * Sockbuf must be consistent here (points to current mbuf,
 1221                  * it points to next record) when we drop priority;
 1222                  * we must note any additions to the sockbuf when we
 1223                  * block interrupts again.
 1224                  */
 1225                 if (mp == NULL) {
 1226                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1227                         SBLASTRECORDCHK(&so->so_rcv);
 1228                         SBLASTMBUFCHK(&so->so_rcv);
 1229                         SOCKBUF_UNLOCK(&so->so_rcv);
 1230 #ifdef ZERO_COPY_SOCKETS
 1231                         if (so_zero_copy_receive) {
 1232                                 vm_page_t pg;
 1233                                 int disposable;
 1234 
 1235                                 if ((m->m_flags & M_EXT)
 1236                                  && (m->m_ext.ext_type == EXT_DISPOSABLE))
 1237                                         disposable = 1;
 1238                                 else
 1239                                         disposable = 0;
 1240 
 1241                                 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
 1242                                         moff));
 1243 
 1244                                 if (uio->uio_offset == -1)
 1245                                         uio->uio_offset =IDX_TO_OFF(pg->pindex);
 1246 
 1247                                 error = uiomoveco(mtod(m, char *) + moff,
 1248                                                   (int)len, uio,pg->object,
 1249                                                   disposable);
 1250                         } else
 1251 #endif /* ZERO_COPY_SOCKETS */
 1252                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
 1253                         SOCKBUF_LOCK(&so->so_rcv);
 1254                         if (error)
 1255                                 goto release;
 1256                 } else
 1257                         uio->uio_resid -= len;
 1258                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1259                 if (len == m->m_len - moff) {
 1260                         if (m->m_flags & M_EOR)
 1261                                 flags |= MSG_EOR;
 1262                         if (flags & MSG_PEEK) {
 1263                                 m = m->m_next;
 1264                                 moff = 0;
 1265                         } else {
 1266                                 nextrecord = m->m_nextpkt;
 1267                                 sbfree(&so->so_rcv, m);
 1268                                 if (mp != NULL) {
 1269                                         *mp = m;
 1270                                         mp = &m->m_next;
 1271                                         so->so_rcv.sb_mb = m = m->m_next;
 1272                                         *mp = NULL;
 1273                                 } else {
 1274                                         so->so_rcv.sb_mb = m_free(m);
 1275                                         m = so->so_rcv.sb_mb;
 1276                                 }
 1277                                 if (m != NULL) {
 1278                                         m->m_nextpkt = nextrecord;
 1279                                         if (nextrecord == NULL)
 1280                                                 so->so_rcv.sb_lastrecord = m;
 1281                                 } else {
 1282                                         so->so_rcv.sb_mb = nextrecord;
 1283                                         SB_EMPTY_FIXUP(&so->so_rcv);
 1284                                 }
 1285                                 SBLASTRECORDCHK(&so->so_rcv);
 1286                                 SBLASTMBUFCHK(&so->so_rcv);
 1287                         }
 1288                 } else {
 1289                         if (flags & MSG_PEEK)
 1290                                 moff += len;
 1291                         else {
 1292                                 if (mp != NULL) {
 1293                                         SOCKBUF_UNLOCK(&so->so_rcv);
 1294                                         *mp = m_copym(m, 0, len, M_TRYWAIT);
 1295                                         SOCKBUF_LOCK(&so->so_rcv);
 1296                                 }
 1297                                 m->m_data += len;
 1298                                 m->m_len -= len;
 1299                                 so->so_rcv.sb_cc -= len;
 1300                         }
 1301                 }
 1302                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1303                 if (so->so_oobmark) {
 1304                         if ((flags & MSG_PEEK) == 0) {
 1305                                 so->so_oobmark -= len;
 1306                                 if (so->so_oobmark == 0) {
 1307                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
 1308                                         break;
 1309                                 }
 1310                         } else {
 1311                                 offset += len;
 1312                                 if (offset == so->so_oobmark)
 1313                                         break;
 1314                         }
 1315                 }
 1316                 if (flags & MSG_EOR)
 1317                         break;
 1318                 /*
 1319                  * If the MSG_WAITALL flag is set (for non-atomic socket),
 1320                  * we must not quit until "uio->uio_resid == 0" or an error
 1321                  * termination.  If a signal/timeout occurs, return
 1322                  * with a short count but without error.
 1323                  * Keep sockbuf locked against other readers.
 1324                  */
 1325                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
 1326                     !sosendallatonce(so) && nextrecord == NULL) {
 1327                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1328                         if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
 1329                                 break;
 1330                         /*
 1331                          * Notify the protocol that some data has been
 1332                          * drained before blocking.
 1333                          */
 1334                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
 1335                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1336                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
 1337                                 SOCKBUF_LOCK(&so->so_rcv);
 1338                         }
 1339                         SBLASTRECORDCHK(&so->so_rcv);
 1340                         SBLASTMBUFCHK(&so->so_rcv);
 1341                         error = sbwait(&so->so_rcv);
 1342                         if (error)
 1343                                 goto release;
 1344                         m = so->so_rcv.sb_mb;
 1345                         if (m != NULL)
 1346                                 nextrecord = m->m_nextpkt;
 1347                 }
 1348         }
 1349 
 1350         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1351         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
 1352                 flags |= MSG_TRUNC;
 1353                 if ((flags & MSG_PEEK) == 0)
 1354                         (void) sbdroprecord_locked(&so->so_rcv);
 1355         }
 1356         if ((flags & MSG_PEEK) == 0) {
 1357                 if (m == NULL) {
 1358                         /*
 1359                          * First part is an inline SB_EMPTY_FIXUP().  Second
 1360                          * part makes sure sb_lastrecord is up-to-date if
 1361                          * there is still data in the socket buffer.
 1362                          */
 1363                         so->so_rcv.sb_mb = nextrecord;
 1364                         if (so->so_rcv.sb_mb == NULL) {
 1365                                 so->so_rcv.sb_mbtail = NULL;
 1366                                 so->so_rcv.sb_lastrecord = NULL;
 1367                         } else if (nextrecord->m_nextpkt == NULL)
 1368                                 so->so_rcv.sb_lastrecord = nextrecord;
 1369                 }
 1370                 SBLASTRECORDCHK(&so->so_rcv);
 1371                 SBLASTMBUFCHK(&so->so_rcv);
 1372                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
 1373                         SOCKBUF_UNLOCK(&so->so_rcv);
 1374                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
 1375                         SOCKBUF_LOCK(&so->so_rcv);
 1376                 }
 1377         }
 1378         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1379         if (orig_resid == uio->uio_resid && orig_resid &&
 1380             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
 1381                 sbunlock(&so->so_rcv);
 1382                 goto restart;
 1383         }
 1384 
 1385         if (flagsp != NULL)
 1386                 *flagsp |= flags;
 1387 release:
 1388         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1389         sbunlock(&so->so_rcv);
 1390 out:
 1391         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1392         SOCKBUF_UNLOCK(&so->so_rcv);
 1393         return (error);
 1394 }
 1395 
 1396 int
 1397 soshutdown(so, how)
 1398         struct socket *so;
 1399         int how;
 1400 {
 1401         struct protosw *pr = so->so_proto;
 1402 
 1403         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
 1404                 return (EINVAL);
 1405 
 1406         if (how != SHUT_WR)
 1407                 sorflush(so);
 1408         if (how != SHUT_RD)
 1409                 return ((*pr->pr_usrreqs->pru_shutdown)(so));
 1410         return (0);
 1411 }
 1412 
/*
 * Flush and discard everything queued on a socket's receive buffer and
 * mark the socket unable to receive more data.  Any rights held in the
 * buffer (e.g. file descriptors in transit) are released through the
 * domain's dom_dispose() routine before the buffer storage is freed.
 */
void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  The existing code made a copy of the
	 * socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	/* SB_NOINTR keeps the sblock() acquisition from being interrupted. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave
	 * selinfo and mutex data unchanged.  'asb' takes over the mbuf
	 * chain and accounting so they can be torn down after the real
	 * buffer's lock is released.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	/* Give the copy its own mutex so dispose/release can lock it. */
	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}
 1457 
 1458 #ifdef INET
 1459 static int
 1460 do_setopt_accept_filter(so, sopt)
 1461         struct  socket *so;
 1462         struct  sockopt *sopt;
 1463 {
 1464         struct accept_filter_arg        *afap;
 1465         struct accept_filter    *afp;
 1466         struct so_accf  *newaf;
 1467         int     error = 0;
 1468 
 1469         newaf = NULL;
 1470         afap = NULL;
 1471 
 1472         /*
 1473          * XXXRW: Configuring accept filters should be an atomic test-and-set
 1474          * operation to prevent races during setup and attach.  There may be
 1475          * more general issues of racing and ordering here that are not yet
 1476          * addressed by locking.
 1477          */
 1478         /* do not set/remove accept filters on non listen sockets */
 1479         SOCK_LOCK(so);
 1480         if ((so->so_options & SO_ACCEPTCONN) == 0) {
 1481                 SOCK_UNLOCK(so);
 1482                 return (EINVAL);
 1483         }
 1484 
 1485         /* removing the filter */
 1486         if (sopt == NULL) {
 1487                 if (so->so_accf != NULL) {
 1488                         struct so_accf *af = so->so_accf;
 1489                         if (af->so_accept_filter != NULL &&
 1490                                 af->so_accept_filter->accf_destroy != NULL) {
 1491                                 af->so_accept_filter->accf_destroy(so);
 1492                         }
 1493                         if (af->so_accept_filter_str != NULL) {
 1494                                 FREE(af->so_accept_filter_str, M_ACCF);
 1495                         }
 1496                         FREE(af, M_ACCF);
 1497                         so->so_accf = NULL;
 1498                 }
 1499                 so->so_options &= ~SO_ACCEPTFILTER;
 1500                 SOCK_UNLOCK(so);
 1501                 return (0);
 1502         }
 1503         SOCK_UNLOCK(so);
 1504 
 1505         /*-
 1506          * Adding a filter.
 1507          *
 1508          * Do memory allocation, copyin, and filter lookup now while we're
 1509          * not holding any locks.  Avoids sleeping with a mutex, as well as
 1510          * introducing a lock order between accept filter locks and socket
 1511          * locks here.
 1512          */
 1513         MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP,
 1514             M_WAITOK);
 1515         /* don't put large objects on the kernel stack */
 1516         error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
 1517         afap->af_name[sizeof(afap->af_name)-1] = '\0';
 1518         afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
 1519         if (error) {
 1520                 FREE(afap, M_TEMP);
 1521                 return (error);
 1522         }
 1523         afp = accept_filt_get(afap->af_name);
 1524         if (afp == NULL) {
 1525                 FREE(afap, M_TEMP);
 1526                 return (ENOENT);
 1527         }
 1528 
 1529         /*
 1530          * Allocate the new accept filter instance storage.  We may have to
 1531          * free it again later if we fail to attach it.  If attached
 1532          * properly, 'newaf' is NULLed to avoid a free() while in use.
 1533          */
 1534         MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK |
 1535             M_ZERO);
 1536         if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
 1537                 int len = strlen(afap->af_name) + 1;
 1538                 MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF,
 1539                     M_WAITOK);
 1540                 strcpy(newaf->so_accept_filter_str, afap->af_name);
 1541         }
 1542 
 1543         SOCK_LOCK(so);
 1544         /* must remove previous filter first */
 1545         if (so->so_accf != NULL) {
 1546                 error = EINVAL;
 1547                 goto out;
 1548         }
 1549         /*
 1550          * Invoke the accf_create() method of the filter if required.
 1551          * XXXRW: the socket mutex is held over this call, so the create
 1552          * method cannot block.  This may be something we have to change, but
 1553          * it would require addressing possible races.
 1554          */
 1555         if (afp->accf_create != NULL) {
 1556                 newaf->so_accept_filter_arg =
 1557                     afp->accf_create(so, afap->af_arg);
 1558                 if (newaf->so_accept_filter_arg == NULL) {
 1559                         error = EINVAL;
 1560                         goto out;
 1561                 }
 1562         }
 1563         newaf->so_accept_filter = afp;
 1564         so->so_accf = newaf;
 1565         so->so_options |= SO_ACCEPTFILTER;
 1566         newaf = NULL;
 1567 out:
 1568         SOCK_UNLOCK(so);
 1569         if (newaf != NULL) {
 1570                 if (newaf->so_accept_filter_str != NULL)
 1571                         FREE(newaf->so_accept_filter_str, M_ACCF);
 1572                 FREE(newaf, M_ACCF);
 1573         }
 1574         if (afap != NULL)
 1575                 FREE(afap, M_TEMP);
 1576         return (error);
 1577 }
 1578 #endif /* INET */
 1579 
 1580 /*
 1581  * Perhaps this routine, and sooptcopyout(), below, ought to come in
 1582  * an additional variant to handle the case where the option value needs
 1583  * to be some kind of integer, but not a specific size.
 1584  * In addition to their use here, these functions are also called by the
 1585  * protocol-level pr_ctloutput() routines.
 1586  */
 1587 int
 1588 sooptcopyin(sopt, buf, len, minlen)
 1589         struct  sockopt *sopt;
 1590         void    *buf;
 1591         size_t  len;
 1592         size_t  minlen;
 1593 {
 1594         size_t  valsize;
 1595 
 1596         /*
 1597          * If the user gives us more than we wanted, we ignore it,
 1598          * but if we don't get the minimum length the caller
 1599          * wants, we return EINVAL.  On success, sopt->sopt_valsize
 1600          * is set to however much we actually retrieved.
 1601          */
 1602         if ((valsize = sopt->sopt_valsize) < minlen)
 1603                 return EINVAL;
 1604         if (valsize > len)
 1605                 sopt->sopt_valsize = valsize = len;
 1606 
 1607         if (sopt->sopt_td != NULL)
 1608                 return (copyin(sopt->sopt_val, buf, valsize));
 1609 
 1610         bcopy(sopt->sopt_val, buf, valsize);
 1611         return 0;
 1612 }
 1613 
 1614 /*
 1615  * Kernel version of setsockopt(2)/
 1616  * XXX: optlen is size_t, not socklen_t
 1617  */
 1618 int
 1619 so_setsockopt(struct socket *so, int level, int optname, void *optval,
 1620     size_t optlen)
 1621 {
 1622         struct sockopt sopt;
 1623 
 1624         sopt.sopt_level = level;
 1625         sopt.sopt_name = optname;
 1626         sopt.sopt_dir = SOPT_SET;
 1627         sopt.sopt_val = optval;
 1628         sopt.sopt_valsize = optlen;
 1629         sopt.sopt_td = NULL;
 1630         return (sosetopt(so, &sopt));
 1631 }
 1632 
/*
 * Kernel implementation of setsockopt(2).  Options at a level other than
 * SOL_SOCKET are passed straight through to the protocol's pr_ctloutput
 * routine.  SOL_SOCKET options are handled here; after one is processed
 * successfully, pr_ctloutput is additionally invoked so the protocol can
 * observe the change (its return value is intentionally discarded).
 */
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long	val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		/*
		 * Simple boolean options: each option's bit value equals
		 * its sopt_name, so the name doubles as the mask.
		 */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				/* sbreserve() enforces resource limits. */
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/*
			 * Convert the timeval to clock ticks; values that
			 * exceed SHRT_MAX ticks are rejected with EDOM.
			 */
			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			/* A non-zero timeout must round up to >= 1 tick. */
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			/*
			 * NOTE(review): sb_timeo is written without the
			 * sockbuf lock here — presumably a benign race;
			 * verify against the locking protocol.
			 */
			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		/*
		 * Give the protocol a chance to react to the socket-level
		 * change; any error it reports is deliberately ignored.
		 */
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}
 1805 
 1806 /* Helper routine for getsockopt */
 1807 int
 1808 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
 1809 {
 1810         int     error;
 1811         size_t  valsize;
 1812 
 1813         error = 0;
 1814 
 1815         /*
 1816          * Documented get behavior is that we always return a value,
 1817          * possibly truncated to fit in the user's buffer.
 1818          * Traditional behavior is that we always tell the user
 1819          * precisely how much we copied, rather than something useful
 1820          * like the total amount we had available for her.
 1821          * Note that this interface is not idempotent; the entire answer must
 1822          * generated ahead of time.
 1823          */
 1824         valsize = min(len, sopt->sopt_valsize);
 1825         sopt->sopt_valsize = valsize;
 1826         if (sopt->sopt_val != NULL) {
 1827                 if (sopt->sopt_td != NULL)
 1828                         error = copyout(buf, sopt->sopt_val, valsize);
 1829                 else
 1830                         bcopy(buf, sopt->sopt_val, valsize);
 1831         }
 1832         return error;
 1833 }
 1834 
/*
 * Kernel implementation of getsockopt(2).  Options at a level other than
 * SOL_SOCKET are delegated to the protocol's pr_ctloutput routine.
 * SOL_SOCKET options are answered here from the socket structure; the
 * shared 'integer:' label copies out any option whose value is a plain
 * int.
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			/* Unlocked read. */
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			/* M_ZERO so an unset filter reports empty strings. */
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
				M_TEMP, M_WAITOK | M_ZERO);
			SOCK_LOCK(so);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
			}
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;
#endif

		case SO_LINGER:
			/*
			 * XXXRW: We grab the lock here to get a consistent
			 * snapshot of both fields.  This may not really
			 * be necessary.
			 */
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/*
		 * Boolean options: report the raw option bit (non-zero
		 * means set).
		 */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Common exit for every option returning an int. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			/*
			 * Reading SO_ERROR clears the pending error.
			 * NOTE(review): read-and-clear is done without the
			 * socket lock — confirm this race is acceptable.
			 */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* Convert the tick count back into a timeval. */
			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
 1978 
/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
/*
 * Allocate an mbuf chain with room for sopt->sopt_valsize bytes of option
 * data and store its head in *mp.  Clusters are used for segments larger
 * than MLEN.  With a thread context (sopt_td != NULL) allocation may sleep
 * (M_TRYWAIT); without one it must not (M_DONTWAIT).  Returns 0 on
 * success or ENOBUFS, freeing any partially built chain on failure.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	/* Allocate and size the first mbuf of the chain. */
	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	/* Append further mbufs until the requested size is covered. */
	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
			    M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				/* 'm' is not linked yet; free it separately. */
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}
 2027 
 2028 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
 2029 int
 2030 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 2031 {
 2032         struct mbuf *m0 = m;
 2033 
 2034         if (sopt->sopt_val == NULL)
 2035                 return 0;
 2036         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 2037                 if (sopt->sopt_td != NULL) {
 2038                         int error;
 2039 
 2040                         error = copyin(sopt->sopt_val, mtod(m, char *),
 2041                                        m->m_len);
 2042                         if (error != 0) {
 2043                                 m_freem(m0);
 2044                                 return(error);
 2045                         }
 2046                 } else
 2047                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
 2048                 sopt->sopt_valsize -= m->m_len;
 2049                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 2050                 m = m->m_next;
 2051         }
 2052         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
 2053                 panic("ip6_sooptmcopyin");
 2054         return 0;
 2055 }
 2056 
 2057 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
 2058 int
 2059 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
 2060 {
 2061         struct mbuf *m0 = m;
 2062         size_t valsize = 0;
 2063 
 2064         if (sopt->sopt_val == NULL)
 2065                 return 0;
 2066         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 2067                 if (sopt->sopt_td != NULL) {
 2068                         int error;
 2069 
 2070                         error = copyout(mtod(m, char *), sopt->sopt_val,
 2071                                        m->m_len);
 2072                         if (error != 0) {
 2073                                 m_freem(m0);
 2074                                 return(error);
 2075                         }
 2076                 } else
 2077                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
 2078                sopt->sopt_valsize -= m->m_len;
 2079                sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 2080                valsize += m->m_len;
 2081                m = m->m_next;
 2082         }
 2083         if (m != NULL) {
 2084                 /* enough soopt buffer should be given from user-land */
 2085                 m_freem(m0);
 2086                 return(EINVAL);
 2087         }
 2088         sopt->sopt_valsize = valsize;
 2089         return 0;
 2090 }
 2091 
 2092 void
 2093 sohasoutofband(so)
 2094         struct socket *so;
 2095 {
 2096         if (so->so_sigio != NULL)
 2097                 pgsigio(&so->so_sigio, SIGURG, 0);
 2098         selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
 2099 }
 2100 
 2101 int
 2102 sopoll(struct socket *so, int events, struct ucred *active_cred,
 2103     struct thread *td)
 2104 {
 2105         int revents = 0;
 2106 
 2107         SOCKBUF_LOCK(&so->so_snd);
 2108         SOCKBUF_LOCK(&so->so_rcv);
 2109         if (events & (POLLIN | POLLRDNORM))
 2110                 if (soreadable(so))
 2111                         revents |= events & (POLLIN | POLLRDNORM);
 2112 
 2113         if (events & POLLINIGNEOF)
 2114                 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
 2115                     !TAILQ_EMPTY(&so->so_comp) || so->so_error)
 2116                         revents |= POLLINIGNEOF;
 2117 
 2118         if (events & (POLLOUT | POLLWRNORM))
 2119                 if (sowriteable(so))
 2120                         revents |= events & (POLLOUT | POLLWRNORM);
 2121 
 2122         if (events & (POLLPRI | POLLRDBAND))
 2123                 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
 2124                         revents |= events & (POLLPRI | POLLRDBAND);
 2125 
 2126         if (revents == 0) {
 2127                 if (events &
 2128                     (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
 2129                      POLLRDBAND)) {
 2130                         selrecord(td, &so->so_rcv.sb_sel);
 2131                         so->so_rcv.sb_flags |= SB_SEL;
 2132                 }
 2133 
 2134                 if (events & (POLLOUT | POLLWRNORM)) {
 2135                         selrecord(td, &so->so_snd.sb_sel);
 2136                         so->so_snd.sb_flags |= SB_SEL;
 2137                 }
 2138         }
 2139 
 2140         SOCKBUF_UNLOCK(&so->so_rcv);
 2141         SOCKBUF_UNLOCK(&so->so_snd);
 2142         return (revents);
 2143 }
 2144 
 2145 int
 2146 soo_kqfilter(struct file *fp, struct knote *kn)
 2147 {
 2148         struct socket *so = kn->kn_fp->f_data;
 2149         struct sockbuf *sb;
 2150 
 2151         switch (kn->kn_filter) {
 2152         case EVFILT_READ:
 2153                 if (so->so_options & SO_ACCEPTCONN)
 2154                         kn->kn_fop = &solisten_filtops;
 2155                 else
 2156                         kn->kn_fop = &soread_filtops;
 2157                 sb = &so->so_rcv;
 2158                 break;
 2159         case EVFILT_WRITE:
 2160                 kn->kn_fop = &sowrite_filtops;
 2161                 sb = &so->so_snd;
 2162                 break;
 2163         default:
 2164                 return (EINVAL);
 2165         }
 2166 
 2167         SOCKBUF_LOCK(sb);
 2168         knlist_add(&sb->sb_sel.si_note, kn, 1);
 2169         sb->sb_flags |= SB_KNOTE;
 2170         SOCKBUF_UNLOCK(sb);
 2171         return (0);
 2172 }
 2173 
 2174 static void
 2175 filt_sordetach(struct knote *kn)
 2176 {
 2177         struct socket *so = kn->kn_fp->f_data;
 2178 
 2179         SOCKBUF_LOCK(&so->so_rcv);
 2180         knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
 2181         if (knlist_empty(&so->so_rcv.sb_sel.si_note))
 2182                 so->so_rcv.sb_flags &= ~SB_KNOTE;
 2183         SOCKBUF_UNLOCK(&so->so_rcv);
 2184 }
 2185 
 2186 /*ARGSUSED*/
 2187 static int
 2188 filt_soread(struct knote *kn, long hint)
 2189 {
 2190         struct socket *so;
 2191 
 2192         so = kn->kn_fp->f_data;
 2193         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 2194 
 2195         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
 2196         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 2197                 kn->kn_flags |= EV_EOF;
 2198                 kn->kn_fflags = so->so_error;
 2199                 return (1);
 2200         } else if (so->so_error)        /* temporary udp error */
 2201                 return (1);
 2202         else if (kn->kn_sfflags & NOTE_LOWAT)
 2203                 return (kn->kn_data >= kn->kn_sdata);
 2204         else
 2205                 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
 2206 }
 2207 
 2208 static void
 2209 filt_sowdetach(struct knote *kn)
 2210 {
 2211         struct socket *so = kn->kn_fp->f_data;
 2212 
 2213         SOCKBUF_LOCK(&so->so_snd);
 2214         knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
 2215         if (knlist_empty(&so->so_snd.sb_sel.si_note))
 2216                 so->so_snd.sb_flags &= ~SB_KNOTE;
 2217         SOCKBUF_UNLOCK(&so->so_snd);
 2218 }
 2219 
 2220 /*ARGSUSED*/
 2221 static int
 2222 filt_sowrite(struct knote *kn, long hint)
 2223 {
 2224         struct socket *so;
 2225 
 2226         so = kn->kn_fp->f_data;
 2227         SOCKBUF_LOCK_ASSERT(&so->so_snd);
 2228         kn->kn_data = sbspace(&so->so_snd);
 2229         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 2230                 kn->kn_flags |= EV_EOF;
 2231                 kn->kn_fflags = so->so_error;
 2232                 return (1);
 2233         } else if (so->so_error)        /* temporary udp error */
 2234                 return (1);
 2235         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
 2236             (so->so_proto->pr_flags & PR_CONNREQUIRED))
 2237                 return (0);
 2238         else if (kn->kn_sfflags & NOTE_LOWAT)
 2239                 return (kn->kn_data >= kn->kn_sdata);
 2240         else
 2241                 return (kn->kn_data >= so->so_snd.sb_lowat);
 2242 }
 2243 
/*ARGSUSED*/
/*
 * kqueue read filter for listening sockets: report the pending
 * connection count in kn_data and fire when the completed-connection
 * queue (so_comp) is non-empty, i.e. accept() would find a connection.
 *
 * NOTE(review): unlike filt_soread/filt_sowrite, no sockbuf lock is
 * asserted here; presumably the caller's knlist locking covers so_qlen
 * and so_comp -- confirm against the kqueue dispatch path.
 */
static int
filt_solisten(struct knote *kn, long hint)
{
        struct socket *so = kn->kn_fp->f_data;

        /* so_qlen counts queued (incomplete + completed) connections. */
        kn->kn_data = so->so_qlen;
        return (! TAILQ_EMPTY(&so->so_comp));
}
 2253 
 2254 int
 2255 socheckuid(struct socket *so, uid_t uid)
 2256 {
 2257 
 2258         if (so == NULL)
 2259                 return (EPERM);
 2260         if (so->so_cred->cr_uid == uid)
 2261                 return (0);
 2262         return (EPERM);
 2263 }

Cache object: 75df33231327010f0fa0be07d342c547


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.