The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_socket.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2004 The FreeBSD Foundation
    3  * Copyright (c) 2004-2005 Robert N. M. Watson
    4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  * 4. Neither the name of the University nor the names of its contributors
   16  *    may be used to endorse or promote products derived from this software
   17  *    without specific prior written permission.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  *
   31  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
   32  */
   33 
   34 #include <sys/cdefs.h>
   35 __FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.208.2.24 2005/09/27 21:54:02 rwatson Exp $");
   36 
   37 #include "opt_inet.h"
   38 #include "opt_mac.h"
   39 #include "opt_zero.h"
   40 
   41 #include <sys/param.h>
   42 #include <sys/systm.h>
   43 #include <sys/fcntl.h>
   44 #include <sys/limits.h>
   45 #include <sys/lock.h>
   46 #include <sys/mac.h>
   47 #include <sys/malloc.h>
   48 #include <sys/mbuf.h>
   49 #include <sys/mutex.h>
   50 #include <sys/domain.h>
   51 #include <sys/file.h>                   /* for struct knote */
   52 #include <sys/kernel.h>
   53 #include <sys/event.h>
   54 #include <sys/poll.h>
   55 #include <sys/proc.h>
   56 #include <sys/protosw.h>
   57 #include <sys/socket.h>
   58 #include <sys/socketvar.h>
   59 #include <sys/resourcevar.h>
   60 #include <sys/signalvar.h>
   61 #include <sys/sysctl.h>
   62 #include <sys/uio.h>
   63 #include <sys/jail.h>
   64 
   65 #include <vm/uma.h>
   66 
   67 
   68 static int      soreceive_rcvoob(struct socket *so, struct uio *uio,
   69                     int flags);
   70 
      /*
       * Event-filter callbacks for sockets.  Only the prototypes appear here;
       * the bodies are not in this part of the file.
       */
   71 static void     filt_sordetach(struct knote *kn);
   72 static int      filt_soread(struct knote *kn, long hint);
   73 static void     filt_sowdetach(struct knote *kn);
   74 static int      filt_sowrite(struct knote *kn, long hint);
   75 static int      filt_solisten(struct knote *kn, long hint);
   76 
      /*
       * Filter operation tables built from the callbacks declared just before
       * them.  The listen and read tables share filt_sordetach and differ only
       * in their last callback; the write table uses the filt_sow* pair.
       *
       * NOTE(review): the leading 1 and the NULL are positional initializers;
       * their field names live in the struct filterops declaration, which is
       * not visible here -- confirm before reordering or extending.
       */
   77 static struct filterops solisten_filtops =
   78         { 1, NULL, filt_sordetach, filt_solisten };
   79 static struct filterops soread_filtops =
   80         { 1, NULL, filt_sordetach, filt_soread };
   81 static struct filterops sowrite_filtops =
   82         { 1, NULL, filt_sowdetach, filt_sowrite };
   83 
   84 uma_zone_t socket_zone;         /* zone soalloc()/sodealloc() draw sockets from */
   85 so_gen_t        so_gencnt;      /* generation count for sockets */
   86 
   87 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
   88 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
   89 
   90 SYSCTL_DECL(_kern_ipc);
   91 
      /*
       * somaxconn caps the backlog that solisten() will accept.  It is exported
       * read/write under the _kern_ipc sysctl node through a custom handler.
       *
       * NOTE(review): the variable is a signed int but the node is declared
       * CTLTYPE_UINT; range checking is presumably done in
       * somaxconn_sysctl(), which is not visible here -- confirm.
       */
   92 static int somaxconn = SOMAXCONN;
   93 static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
   94 /* XXX: we don't have SYSCTL_SHORT */
   95 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
   96     0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
   97     "queue size");
      /*
       * Count of live sockets: incremented in soalloc(), decremented in
       * sodealloc(), and exported read-only.
       */
   98 static int numopensockets;
   99 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
  100     &numopensockets, 0, "Number of open sockets");
  101 #ifdef ZERO_COPY_SOCKETS
  102 /* These aren't static because they're used in other files. */
      /* Both knobs default to enabled (1) and are writable at run time. */
  103 int so_zero_copy_send = 1;
  104 int so_zero_copy_receive = 1;
  105 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
  106     "Zero copy controls");
  107 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
  108     &so_zero_copy_receive, 0, "Enable zero copy receive");
  109 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
  110     &so_zero_copy_send, 0, "Enable zero copy send");
  111 #endif /* ZERO_COPY_SOCKETS */
  112 
  113 /*
  114  * accept_mtx locks down per-socket fields relating to accept queues.  See
  115  * socketvar.h for an annotation of the protected fields of struct socket.
  116  */
  117 struct mtx accept_mtx;
  118 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
  119 
  120 /*
  121  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
  122  * so_gencnt field.
       *
       * NOTE(review): the lock name string "so_glabel" looks like a typo for
       * "so_global".  It is left as is because the name is visible at run
       * time; confirm nothing depends on it before correcting.
  123  */
  124 static struct mtx so_global_mtx;
  125 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
  126 
  127 /*
  128  * Socket operation routines.
  129  * These routines are called by the routines in
  130  * sys_socket.c or from a system process, and
  131  * implement the semantics of socket operations by
  132  * switching out to the protocol specific routines.
  133  */
  134 
  135 /*
  136  * Get a socket structure from our zone, and initialize it.
  137  * Note that it would probably be better to allocate socket
  138  * and PCB at the same time, but I'm not convinced that all
  139  * the protocols can be easily modified to do this.
  140  *
  141  * soalloc() returns a socket with a ref count of 0.
  142  */
  143 struct socket *
  144 soalloc(int mflags)
  145 {
  146         struct socket *so;
  147 
  148         so = uma_zalloc(socket_zone, mflags | M_ZERO);
  149         if (so != NULL) {
  150 #ifdef MAC
  151                 if (mac_init_socket(so, mflags) != 0) {
  152                         uma_zfree(socket_zone, so);
  153                         return (NULL);
  154                 }
  155 #endif
  156                 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
  157                 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
  158                 /* sx_init(&so->so_sxlock, "socket sxlock"); */
  159                 TAILQ_INIT(&so->so_aiojobq);
  160                 mtx_lock(&so_global_mtx);
  161                 so->so_gencnt = ++so_gencnt;
  162                 ++numopensockets;
  163                 mtx_unlock(&so_global_mtx);
  164         }
  165         return (so);
  166 }
  167 
  168 /*
  169  * socreate returns a socket with a ref count of 1.  The socket should be
  170  * closed with soclose().
  171  */
  172 int
  173 socreate(dom, aso, type, proto, cred, td)
  174         int dom;
  175         struct socket **aso;
  176         int type;
  177         int proto;
  178         struct ucred *cred;
  179         struct thread *td;
  180 {
  181         struct protosw *prp;
  182         struct socket *so;
  183         int error;
  184 
  185         if (proto)
  186                 prp = pffindproto(dom, proto, type);
  187         else
  188                 prp = pffindtype(dom, type);
  189 
  190         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
  191                 return (EPROTONOSUPPORT);
  192 
  193         if (jailed(cred) && jail_socket_unixiproute_only &&
  194             prp->pr_domain->dom_family != PF_LOCAL &&
  195             prp->pr_domain->dom_family != PF_INET &&
  196             prp->pr_domain->dom_family != PF_ROUTE) {
  197                 return (EPROTONOSUPPORT);
  198         }
  199 
  200         if (prp->pr_type != type)
  201                 return (EPROTOTYPE);
  202         so = soalloc(M_WAITOK);
  203         if (so == NULL)
  204                 return (ENOBUFS);
  205 
  206         TAILQ_INIT(&so->so_incomp);
  207         TAILQ_INIT(&so->so_comp);
  208         so->so_type = type;
  209         so->so_cred = crhold(cred);
  210         so->so_proto = prp;
  211 #ifdef MAC
  212         mac_create_socket(cred, so);
  213 #endif
  214         SOCK_LOCK(so);
  215         knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
  216         knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
  217         soref(so);
  218         SOCK_UNLOCK(so);
  219         error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
  220         if (error) {
  221                 ACCEPT_LOCK();
  222                 SOCK_LOCK(so);
  223                 so->so_state |= SS_NOFDREF;
  224                 sorele(so);
  225                 return (error);
  226         }
  227         *aso = so;
  228         return (0);
  229 }
  230 
  231 int
  232 sobind(so, nam, td)
  233         struct socket *so;
  234         struct sockaddr *nam;
  235         struct thread *td;
  236 {
  237 
  238         return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
  239 }
  240 
  241 void
  242 sodealloc(struct socket *so)
  243 {
  244 
  245         KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
  246         mtx_lock(&so_global_mtx);
  247         so->so_gencnt = ++so_gencnt;
  248         mtx_unlock(&so_global_mtx);
  249         if (so->so_rcv.sb_hiwat)
  250                 (void)chgsbsize(so->so_cred->cr_uidinfo,
  251                     &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
  252         if (so->so_snd.sb_hiwat)
  253                 (void)chgsbsize(so->so_cred->cr_uidinfo,
  254                     &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
  255 #ifdef INET
  256         /* remove acccept filter if one is present. */
  257         if (so->so_accf != NULL)
  258                 do_setopt_accept_filter(so, NULL);
  259 #endif
  260 #ifdef MAC
  261         mac_destroy_socket(so);
  262 #endif
  263         crfree(so->so_cred);
  264         SOCKBUF_LOCK_DESTROY(&so->so_snd);
  265         SOCKBUF_LOCK_DESTROY(&so->so_rcv);
  266         /* sx_destroy(&so->so_sxlock); */
  267         uma_zfree(socket_zone, so);
  268         mtx_lock(&so_global_mtx);
  269         --numopensockets;
  270         mtx_unlock(&so_global_mtx);
  271 }
  272 
  273 /*
  274  * solisten() transitions a socket from a non-listening state to a listening
  275  * state, but can also be used to update the listen queue depth on an
  276  * existing listen socket.  The protocol will call back into the sockets
  277  * layer using solisten_proto_check() and solisten_proto() to check and set
  278  * socket-layer listen state.  Call backs are used so that the protocol can
  279  * acquire both protocol and socket layer locks in whatever order is required
  280  * by the protocol.
  281  *
  282  * Protocol implementors are advised to hold the socket lock across the
  283  * socket-layer test and set to avoid races at the socket layer.
  284  */
  285 int
  286 solisten(so, backlog, td)
  287         struct socket *so;
  288         int backlog;
  289         struct thread *td;
  290 {
  291         int error;
  292 
  293         error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
  294         if (error)
  295                 return (error);
  296 
  297         /*
  298          * XXXRW: The following state adjustment should occur in
  299          * solisten_proto(), but we don't currently pass the backlog request
  300          * to the protocol via pru_listen().
  301          */
  302         if (backlog < 0 || backlog > somaxconn)
  303                 backlog = somaxconn;
  304         so->so_qlimit = backlog;
  305         return (0);
  306 }
  307 
  308 int
  309 solisten_proto_check(so)
  310         struct socket *so;
  311 {
  312 
  313         SOCK_LOCK_ASSERT(so);
  314 
  315         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
  316             SS_ISDISCONNECTING))
  317                 return (EINVAL);
  318         return (0);
  319 }
  320 
  321 void
  322 solisten_proto(so)
  323         struct socket *so;
  324 {
  325 
  326         SOCK_LOCK_ASSERT(so);
  327 
  328         so->so_options |= SO_ACCEPTCONN;
  329 }
  330 
  331 /*
  332  * Attempt to free a socket.  This should really be sotryfree().
  333  *
  334  * We free the socket if the protocol is no longer interested in the socket,
  335  * there's no file descriptor reference, and the refcount is 0.  While the
  336  * calling macro sotryfree() tests the refcount, sofree() has to test it
  337  * again as it's possible to race with an accept()ing thread if the socket is
  338  * in a listen queue of a listen socket, as being in the listen queue
  339  * doesn't elevate the reference count.  sofree() acquires the accept mutex
  340  * early for this test in order to avoid that race.
       *
       * Both the accept mutex and the socket lock must be held on entry (they
       * are asserted first thing); every path through the function releases
       * both before returning.
  341  */
  342 void
  343 sofree(so)
  344         struct socket *so;
  345 {
  346         struct socket *head;
  347 
  348         ACCEPT_LOCK_ASSERT();
  349         SOCK_LOCK_ASSERT(so);
  350 
              /*
               * Not freeable yet: the protocol still has a PCB attached, a file
               * descriptor still refers to the socket, or references remain.
               */
  351         if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
  352             so->so_count != 0) {
  353                 SOCK_UNLOCK(so);
  354                 ACCEPT_UNLOCK();
  355                 return;
  356         }
  357 
  358         head = so->so_head;
  359         if (head != NULL) {
                      /* A queued socket is on exactly one of the two queues. */
  360                 KASSERT((so->so_qstate & SQ_COMP) != 0 ||
  361                     (so->so_qstate & SQ_INCOMP) != 0,
  362                     ("sofree: so_head != NULL, but neither SQ_COMP nor "
  363                     "SQ_INCOMP"));
  364                 KASSERT((so->so_qstate & SQ_COMP) == 0 ||
  365                     (so->so_qstate & SQ_INCOMP) == 0,
  366                     ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
  367                 /*
  368                  * accept(2) is responsible for draining the completed
  369                  * connection queue and freeing those sockets, so
  370                  * we just return here if this socket is currently
  371                  * on the completed connection queue.  Otherwise,
  372                  * accept(2) may hang after select(2) has indicated
  373                  * that a listening socket was ready.  If it's an
  374                  * incomplete connection, we remove it from the queue
  375                  * and free it; otherwise, it won't be released until
  376                  * the listening socket is closed.
  377                  */
  378                 if ((so->so_qstate & SQ_COMP) != 0) {
  379                         SOCK_UNLOCK(so);
  380                         ACCEPT_UNLOCK();
  381                         return;
  382                 }
                      /* Unlink from the listener's incomplete queue. */
  383                 TAILQ_REMOVE(&head->so_incomp, so, so_list);
  384                 head->so_incqlen--;
  385                 so->so_qstate &= ~SQ_INCOMP;
  386                 so->so_head = NULL;
  387         }
  388         KASSERT((so->so_qstate & SQ_COMP) == 0 &&
  389             (so->so_qstate & SQ_INCOMP) == 0,
  390             ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
  391             so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
              /*
               * The socket is off every queue now; drop both locks before
               * tearing down the buffers.
               */
  392         SOCK_UNLOCK(so);
  393         ACCEPT_UNLOCK();
              /*
               * Send side: set SB_NOINTR on the buffer, take the sblock, mark
               * the socket unable to send, then release the buffer contents.
               */
  394         SOCKBUF_LOCK(&so->so_snd);
  395         so->so_snd.sb_flags |= SB_NOINTR;
  396         (void)sblock(&so->so_snd, M_WAITOK);
  397         /*
  398          * socantsendmore_locked() drops the socket buffer mutex so that it
  399          * can safely perform wakeups.  Re-acquire the mutex before
  400          * continuing.
  401          */
  402         socantsendmore_locked(so);
  403         SOCKBUF_LOCK(&so->so_snd);
  404         sbunlock(&so->so_snd);
  405         sbrelease_locked(&so->so_snd, so);
  406         SOCKBUF_UNLOCK(&so->so_snd);
              /* Receive side, then the knote lists, then the socket itself. */
  407         sorflush(so);
  408         knlist_destroy(&so->so_rcv.sb_sel.si_note);
  409         knlist_destroy(&so->so_snd.sb_sel.si_note);
  410         sodealloc(so);
  411 }
  412 
  413 /*
  414  * Close a socket on last file table reference removal.
  415  * Initiate disconnect if connected.
  416  * Free socket when disconnect complete.
  417  *
  418  * This function will sorele() the socket.  Note that soclose() may be
  419  * called prior to the ref count reaching zero.  The actual socket
  420  * structure will not be freed until the ref count reaches zero.
       *
       * Returns 0, or the first error reported by sodisconnect(), the linger
       * sleep, or the protocol's pru_detach handler.
  421  */
  422 int
  423 soclose(so)
  424         struct socket *so;
  425 {
  426         int error = 0;
  427 
  428         KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
  429 
  430         funsetown(&so->so_sigio);
  431         if (so->so_options & SO_ACCEPTCONN) {
  432                 struct socket *sp;
                      /*
                       * Listening socket: empty the incomplete queue and then the
                       * completed queue, aborting every connection found.  The
                       * accept lock is dropped around each soabort() call because
                       * soabort() must be entered with no socket locks held.
                       */
  433                 ACCEPT_LOCK();
  434                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
  435                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
  436                         so->so_incqlen--;
  437                         sp->so_qstate &= ~SQ_INCOMP;
  438                         sp->so_head = NULL;
  439                         ACCEPT_UNLOCK();
  440                         (void) soabort(sp);
  441                         ACCEPT_LOCK();
  442                 }
  443                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
  444                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
  445                         so->so_qlen--;
  446                         sp->so_qstate &= ~SQ_COMP;
  447                         sp->so_head = NULL;
  448                         ACCEPT_UNLOCK();
  449                         (void) soabort(sp);
  450                         ACCEPT_LOCK();
  451                 }
  452                 ACCEPT_UNLOCK();
  453         }
              /* Nothing attached at the protocol level: skip straight to release. */
  454         if (so->so_pcb == NULL)
  455                 goto discard;
  456         if (so->so_state & SS_ISCONNECTED) {
                      /* Start a disconnect unless one is already under way. */
  457                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
  458                         error = sodisconnect(so);
  459                         if (error)
  460                                 goto drop;
  461                 }
  462                 if (so->so_options & SO_LINGER) {
                              /* A non-blocking socket does not wait for the linger. */
  463                         if ((so->so_state & SS_ISDISCONNECTING) &&
  464                             (so->so_state & SS_NBIO))
  465                                 goto drop;
                              /*
                               * Sleep on so_timeo, so_linger * hz ticks at a time,
                               * until the socket is no longer connected; any
                               * non-zero tsleep() result ends the wait.
                               */
  466                         while (so->so_state & SS_ISCONNECTED) {
  467                                 error = tsleep(&so->so_timeo,
  468                                     PSOCK | PCATCH, "soclos", so->so_linger * hz);
  469                                 if (error)
  470                                         break;
  471                         }
  472                 }
  473         }
  474 drop:
              /* Detach from the protocol, keeping the first error seen. */
  475         if (so->so_pcb != NULL) {
  476                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
  477                 if (error == 0)
  478                         error = error2;
  479         }
  480 discard:
              /*
               * Record that no file descriptor refers to the socket any more and
               * drop the reference.  Both locks are taken here and not released
               * in this function.
               * NOTE(review): sorele() is presumably what releases them, via
               * sofree() -- confirm against the macro in socketvar.h.
               */
  481         ACCEPT_LOCK();
  482         SOCK_LOCK(so);
  483         KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
  484         so->so_state |= SS_NOFDREF;
  485         sorele(so);
  486         return (error);
  487 }
  488 
  489 /*
  490  * soabort() must not be called with any socket locks held, as it calls
  491  * into the protocol, which will call back into the socket code causing
  492  * it to acquire additional socket locks that may cause recursion or lock
  493  * order reversals.
  494  */
  495 int
  496 soabort(so)
  497         struct socket *so;
  498 {
  499         int error;
  500 
  501         error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
  502         if (error) {
  503                 ACCEPT_LOCK();
  504                 SOCK_LOCK(so);
  505                 sotryfree(so);  /* note: does not decrement the ref count */
  506                 return error;
  507         }
  508         return (0);
  509 }
  510 
  511 int
  512 soaccept(so, nam)
  513         struct socket *so;
  514         struct sockaddr **nam;
  515 {
  516         int error;
  517 
  518         SOCK_LOCK(so);
  519         KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
  520         so->so_state &= ~SS_NOFDREF;
  521         SOCK_UNLOCK(so);
  522         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
  523         return (error);
  524 }
  525 
  526 int
  527 soconnect(so, nam, td)
  528         struct socket *so;
  529         struct sockaddr *nam;
  530         struct thread *td;
  531 {
  532         int error;
  533 
  534         if (so->so_options & SO_ACCEPTCONN)
  535                 return (EOPNOTSUPP);
  536         /*
  537          * If protocol is connection-based, can only connect once.
  538          * Otherwise, if connected, try to disconnect first.
  539          * This allows user to disconnect by connecting to, e.g.,
  540          * a null address.
  541          */
  542         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
  543             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
  544             (error = sodisconnect(so)))) {
  545                 error = EISCONN;
  546         } else {
  547                 SOCK_LOCK(so);
  548                 /*
  549                  * Prevent accumulated error from previous connection
  550                  * from biting us.
  551                  */
  552                 so->so_error = 0;
  553                 SOCK_UNLOCK(so);
  554                 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
  555         }
  556 
  557         return (error);
  558 }
  559 
  560 int
  561 soconnect2(so1, so2)
  562         struct socket *so1;
  563         struct socket *so2;
  564 {
  565 
  566         return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
  567 }
  568 
  569 int
  570 sodisconnect(so)
  571         struct socket *so;
  572 {
  573         int error;
  574 
  575         if ((so->so_state & SS_ISCONNECTED) == 0)
  576                 return (ENOTCONN);
  577         if (so->so_state & SS_ISDISCONNECTING)
  578                 return (EALREADY);
  579         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
  580         return (error);
  581 }
  582 
  /* Map send/receive flags to the wait mode passed to sblock(). */
  #define SBLOCKWAIT(f)   ((((f) & MSG_DONTWAIT) != 0) ? M_NOWAIT : M_WAITOK)
  584 /*
  585  * Send on a socket.
  586  * If send must go all at once and message is larger than
  587  * send buffering, then hard error.
  588  * Lock against other senders.
  589  * If must go all at once and not enough room now, then
  590  * inform user that this would block and do nothing.
  591  * Otherwise, if nonblocking, send as much as possible.
  592  * The data to be sent is described by "uio" if nonzero,
  593  * otherwise by the mbuf chain "top" (which must be null
  594  * if uio is not).  Data provided in mbuf chain must be small
  595  * enough to send all at once.
  596  *
  597  * Returns nonzero on error, timeout or signal; callers
  598  * must check for short counts if EINTR/ERESTART are returned.
  599  * Data and control buffers are freed on return.
  600  */
  601 
  602 #ifdef ZERO_COPY_SOCKETS
      /*
       * Counters for the zero-copy send path in sosend().  size_ok is bumped
       * when the remaining data, the buffer space and the current iovec are
       * each at least PAGE_SIZE; align_ok is bumped when, in addition, the
       * iovec base is page aligned.  found_ifp is not updated in the part of
       * the file visible here.
       */
  603 struct so_zerocopy_stats{
  604         int size_ok;
  605         int align_ok;
  606         int found_ifp;
  607 };
  608 struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
      /* Extra headers needed only when zero-copy sockets are compiled in. */
  609 #include <netinet/in.h>
  610 #include <net/route.h>
  611 #include <netinet/in_pcb.h>
  612 #include <vm/vm.h>
  613 #include <vm/vm_page.h>
  614 #include <vm/vm_object.h>
  615 #endif /*ZERO_COPY_SOCKETS*/
  616 
  617 int
  618 sosend(so, addr, uio, top, control, flags, td)
  619         struct socket *so;
  620         struct sockaddr *addr;
  621         struct uio *uio;
  622         struct mbuf *top;
  623         struct mbuf *control;
  624         int flags;
  625         struct thread *td;
  626 {
  627         struct mbuf **mp;
  628         struct mbuf *m;
  629         long space, len = 0, resid;
  630         int clen = 0, error, dontroute;
  631         int atomic = sosendallatonce(so) || top;
  632 #ifdef ZERO_COPY_SOCKETS
  633         int cow_send;
  634 #endif /* ZERO_COPY_SOCKETS */
  635 
  636         if (uio != NULL)
  637                 resid = uio->uio_resid;
  638         else
  639                 resid = top->m_pkthdr.len;
  640         /*
  641          * In theory resid should be unsigned.
  642          * However, space must be signed, as it might be less than 0
  643          * if we over-committed, and we must use a signed comparison
  644          * of space and resid.  On the other hand, a negative resid
  645          * causes us to loop sending 0-length segments to the protocol.
  646          *
  647          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
  648          * type sockets since that's an error.
  649          */
  650         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
  651                 error = EINVAL;
  652                 goto out;
  653         }
  654 
  655         dontroute =
  656             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
  657             (so->so_proto->pr_flags & PR_ATOMIC);
  658         if (td != NULL)
  659                 td->td_proc->p_stats->p_ru.ru_msgsnd++;
  660         if (control != NULL)
  661                 clen = control->m_len;
  662 #define snderr(errno)   { error = (errno); goto release; }
  663 
  664         SOCKBUF_LOCK(&so->so_snd);
  665 restart:
  666         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  667         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
  668         if (error)
  669                 goto out_locked;
  670         do {
  671                 SOCKBUF_LOCK_ASSERT(&so->so_snd);
  672                 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
  673                         snderr(EPIPE);
  674                 if (so->so_error) {
  675                         error = so->so_error;
  676                         so->so_error = 0;
  677                         goto release;
  678                 }
  679                 if ((so->so_state & SS_ISCONNECTED) == 0) {
  680                         /*
  681                          * `sendto' and `sendmsg' is allowed on a connection-
  682                          * based socket if it supports implied connect.
  683                          * Return ENOTCONN if not connected and no address is
  684                          * supplied.
  685                          */
  686                         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
  687                             (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
  688                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
  689                                     !(resid == 0 && clen != 0))
  690                                         snderr(ENOTCONN);
  691                         } else if (addr == NULL)
  692                             snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
  693                                    ENOTCONN : EDESTADDRREQ);
  694                 }
  695                 space = sbspace(&so->so_snd);
  696                 if (flags & MSG_OOB)
  697                         space += 1024;
  698                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
  699                     clen > so->so_snd.sb_hiwat)
  700                         snderr(EMSGSIZE);
  701                 if (space < resid + clen &&
  702                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
  703                         if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
  704                                 snderr(EWOULDBLOCK);
  705                         sbunlock(&so->so_snd);
  706                         error = sbwait(&so->so_snd);
  707                         if (error)
  708                                 goto out_locked;
  709                         goto restart;
  710                 }
  711                 SOCKBUF_UNLOCK(&so->so_snd);
  712                 mp = &top;
  713                 space -= clen;
  714                 do {
  715                     if (uio == NULL) {
  716                         /*
  717                          * Data is prepackaged in "top".
  718                          */
  719                         resid = 0;
  720                         if (flags & MSG_EOR)
  721                                 top->m_flags |= M_EOR;
  722                     } else do {
  723 #ifdef ZERO_COPY_SOCKETS
  724                         cow_send = 0;
  725 #endif /* ZERO_COPY_SOCKETS */
  726                         if (resid >= MINCLSIZE) {
  727 #ifdef ZERO_COPY_SOCKETS
  728                                 if (top == NULL) {
  729                                         MGETHDR(m, M_TRYWAIT, MT_DATA);
  730                                         if (m == NULL) {
  731                                                 error = ENOBUFS;
  732                                                 SOCKBUF_LOCK(&so->so_snd);
  733                                                 goto release;
  734                                         }
  735                                         m->m_pkthdr.len = 0;
  736                                         m->m_pkthdr.rcvif = (struct ifnet *)0;
  737                                 } else {
  738                                         MGET(m, M_TRYWAIT, MT_DATA);
  739                                         if (m == NULL) {
  740                                                 error = ENOBUFS;
  741                                                 SOCKBUF_LOCK(&so->so_snd);
  742                                                 goto release;
  743                                         }
  744                                 }
  745                                 if (so_zero_copy_send &&
  746                                     resid>=PAGE_SIZE &&
  747                                     space>=PAGE_SIZE &&
  748                                     uio->uio_iov->iov_len>=PAGE_SIZE) {
  749                                         so_zerocp_stats.size_ok++;
  750                                         if (!((vm_offset_t)
  751                                           uio->uio_iov->iov_base & PAGE_MASK)){
  752                                                 so_zerocp_stats.align_ok++;
  753                                                 cow_send = socow_setup(m, uio);
  754                                         }
  755                                 }
  756                                 if (!cow_send) {
  757                                         MCLGET(m, M_TRYWAIT);
  758                                         if ((m->m_flags & M_EXT) == 0) {
  759                                                 m_free(m);
  760                                                 m = NULL;
  761                                         } else {
  762                                                 len = min(min(MCLBYTES, resid), space);
  763                                         }
  764                                 } else
  765                                         len = PAGE_SIZE;
  766 #else /* ZERO_COPY_SOCKETS */
  767                                 if (top == NULL) {
  768                                         m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
  769                                         m->m_pkthdr.len = 0;
  770                                         m->m_pkthdr.rcvif = (struct ifnet *)0;
  771                                 } else
  772                                         m = m_getcl(M_TRYWAIT, MT_DATA, 0);
  773                                 len = min(min(MCLBYTES, resid), space);
  774 #endif /* ZERO_COPY_SOCKETS */
  775                         } else {
  776                                 if (top == NULL) {
  777                                         m = m_gethdr(M_TRYWAIT, MT_DATA);
  778                                         m->m_pkthdr.len = 0;
  779                                         m->m_pkthdr.rcvif = (struct ifnet *)0;
  780 
  781                                         len = min(min(MHLEN, resid), space);
  782                                         /*
  783                                          * For datagram protocols, leave room
  784                                          * for protocol headers in first mbuf.
  785                                          */
  786                                         if (atomic && m && len < MHLEN)
  787                                                 MH_ALIGN(m, len);
  788                                 } else {
  789                                         m = m_get(M_TRYWAIT, MT_DATA);
  790                                         len = min(min(MLEN, resid), space);
  791                                 }
  792                         }
  793                         if (m == NULL) {
  794                                 error = ENOBUFS;
  795                                 SOCKBUF_LOCK(&so->so_snd);
  796                                 goto release;
  797                         }
  798 
  799                         space -= len;
  800 #ifdef ZERO_COPY_SOCKETS
  801                         if (cow_send)
  802                                 error = 0;
  803                         else
  804 #endif /* ZERO_COPY_SOCKETS */
  805                         error = uiomove(mtod(m, void *), (int)len, uio);
  806                         resid = uio->uio_resid;
  807                         m->m_len = len;
  808                         *mp = m;
  809                         top->m_pkthdr.len += len;
  810                         if (error) {
  811                                 SOCKBUF_LOCK(&so->so_snd);
  812                                 goto release;
  813                         }
  814                         mp = &m->m_next;
  815                         if (resid <= 0) {
  816                                 if (flags & MSG_EOR)
  817                                         top->m_flags |= M_EOR;
  818                                 break;
  819                         }
  820                     } while (space > 0 && atomic);
  821                     if (dontroute) {
  822                             SOCK_LOCK(so);
  823                             so->so_options |= SO_DONTROUTE;
  824                             SOCK_UNLOCK(so);
  825                     }
  826                     /*
  827                      * XXX all the SBS_CANTSENDMORE checks previously
  828                      * done could be out of date.  We could have received
  829                      * a reset packet in an interrupt or maybe we slept
  830                      * while doing page faults in uiomove() etc. We could
  831                      * probably recheck again inside the locking protection
  832                      * here, but there are probably other places that this
  833                      * also happens.  We must rethink this.
  834                      */
  835                     error = (*so->so_proto->pr_usrreqs->pru_send)(so,
  836                         (flags & MSG_OOB) ? PRUS_OOB :
  837                         /*
  838                          * If the user set MSG_EOF, the protocol
  839                          * understands this flag and nothing left to
  840                          * send then use PRU_SEND_EOF instead of PRU_SEND.
  841                          */
  842                         ((flags & MSG_EOF) &&
  843                          (so->so_proto->pr_flags & PR_IMPLOPCL) &&
  844                          (resid <= 0)) ?
  845                                 PRUS_EOF :
  846                         /* If there is more to send set PRUS_MORETOCOME */
  847                         (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
  848                         top, addr, control, td);
  849                     if (dontroute) {
  850                             SOCK_LOCK(so);
  851                             so->so_options &= ~SO_DONTROUTE;
  852                             SOCK_UNLOCK(so);
  853                     }
  854                     clen = 0;
  855                     control = NULL;
  856                     top = NULL;
  857                     mp = &top;
  858                     if (error) {
  859                         SOCKBUF_LOCK(&so->so_snd);
  860                         goto release;
  861                     }
  862                 } while (resid && space > 0);
  863                 SOCKBUF_LOCK(&so->so_snd);
  864         } while (resid);
  865 
  866 release:
  867         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  868         sbunlock(&so->so_snd);
  869 out_locked:
  870         SOCKBUF_LOCK_ASSERT(&so->so_snd);
  871         SOCKBUF_UNLOCK(&so->so_snd);
  872 out:
  873         if (top != NULL)
  874                 m_freem(top);
  875         if (control != NULL)
  876                 m_freem(control);
  877         return (error);
  878 }
  879 
  880 /*
  881  * The part of soreceive() that implements reading non-inline out-of-band
  882  * data from a socket.  For more complete comments, see soreceive(), from
  883  * which this code originated.
  884  *
  885  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
  886  * unable to return an mbuf chain to the caller.
       *
       * 'so' is the socket to read from, 'uio' describes the destination, and
       * 'flags' must include MSG_OOB (asserted); only its MSG_PEEK bit is
       * passed on to the protocol.
       *
       * Returns 0 on success, ENOBUFS if no mbuf could be allocated, or the
       * error reported by the protocol's pru_rcvoob() or by the copy routine.
       * Every mbuf obtained here is freed before returning.
  887  */
  888 static int
  889 soreceive_rcvoob(so, uio, flags)
  890         struct socket *so;
  891         struct uio *uio;
  892         int flags;
  893 {
  894         struct protosw *pr = so->so_proto;
  895         struct mbuf *m;
  896         int error;
  897 
  898         KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
  899 
              /* Allocate the mbuf that is handed to the protocol's pru_rcvoob(). */
  900         m = m_get(M_TRYWAIT, MT_DATA);
  901         if (m == NULL)
  902                 return (ENOBUFS);
  903         error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
  904         if (error)
  905                 goto bad;
              /*
               * Copy out at most uio_resid bytes, one mbuf at a time, freeing
               * each mbuf once it has been copied.
               */
  906         do {
  907 #ifdef ZERO_COPY_SOCKETS
  908                 if (so_zero_copy_receive) {
  909                         vm_page_t pg;
  910                         int disposable;
  911 
                              /* Tell uiomoveco() whether the external buffer is EXT_DISPOSABLE. */
  912                         if ((m->m_flags & M_EXT)
  913                          && (m->m_ext.ext_type == EXT_DISPOSABLE))
  914                                 disposable = 1;
  915                         else
  916                                 disposable = 0;
  917 
  918                         pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
                              /* An offset of -1 is replaced by the offset of the page's index. */
  919                         if (uio->uio_offset == -1)
  920                                 uio->uio_offset =IDX_TO_OFF(pg->pindex);
  921 
  922                         error = uiomoveco(mtod(m, void *),
  923                                           min(uio->uio_resid, m->m_len),
  924                                           uio, pg->object,
  925                                           disposable);
  926                 } else
  927 #endif /* ZERO_COPY_SOCKETS */
  928                 error = uiomove(mtod(m, void *),
  929                     (int) min(uio->uio_resid, m->m_len), uio);
                      /* The value m_free() returns becomes the next mbuf to copy, if any. */
  930                 m = m_free(m);
  931         } while (uio->uio_resid && error == 0 && m);
  932 bad:
              /* Release whatever is left of the chain (error or short read). */
  933         if (m != NULL)
  934                 m_freem(m);
  935         return (error);
  936 }
  937 
  938 /*
  939  * Re-synchronize a socket buffer after the first mbuf of its first record
  940  * has been replaced or removed, so that other consumers see consistent
  941  * values.  'nextrecord' is the caller's saved copy of the original
  942  * sb->sb_mb->m_nextpkt and may be NULL; it is re-attached behind the new
  943  * lead mbuf, or becomes the head of the buffer when no lead mbuf is left.
  944  * The socket buffer lock must be held (asserted).
  945  */
  946 static __inline void
  947 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
  948 {
  949         struct mbuf *head;
  950 
  951         SOCKBUF_LOCK_ASSERT(sb);
  952         /* Re-attach 'nextrecord', promoting it if the first record is gone. */
  953         head = sb->sb_mb;
  954         if (head == NULL)
  955                 sb->sb_mb = head = nextrecord;
  956         else
  957                 head->m_nextpkt = nextrecord;
  958 
  959         /*
  960          * Bring the dependent fields up to date: an empty buffer has no
  961          * tail mbuf and no last record, while a buffer whose first record
  962          * is also its only one must have sb_lastrecord pointing at it.
  963          */
  964         if (head == NULL) {
  965                 sb->sb_mbtail = NULL;
  966                 sb->sb_lastrecord = NULL;
  967                 return;
  968         }
  969         if (head->m_nextpkt == NULL)
  970                 sb->sb_lastrecord = head;
  971 }
  972 
  973 
  974 /*
  975  * Implement receive operations on a socket.
  976  * We depend on the way that records are added to the sockbuf
  977  * by sbappend*.  In particular, each record (mbufs linked through m_next)
  978  * must begin with an address if the protocol so specifies,
  979  * followed by an optional mbuf or mbufs containing ancillary data,
  980  * and then zero or more mbufs of data.
  981  * So as not to hold the socket buffer mutex for the entire time here,
  982  * we drop it while doing the actual copy out through the uio.
  983  * Although the sockbuf is locked, new data may still be appended,
  984  * and thus we must maintain consistency of the sockbuf during that time.
  985  *
  986  * The caller may receive the data as a single mbuf chain by supplying
  987  * an mbuf **mp0 for use in returning the chain.  The uio is then used
  988  * only for the count in uio_resid.
       *
       * Arguments:
       *   so        socket to receive from.
       *   psa       if non-NULL, cleared on entry; for PR_ADDR protocols it is
       *             set to a copy of the record's address made with
       *             sodupsockaddr(..., M_NOWAIT).
       *   uio       destination of the data and the requested byte count.
       *   mp0       if non-NULL, data mbufs are handed back through it
       *             instead of being copied out.
       *   controlp  if non-NULL, receives the record's control mbufs.
       *   flagsp    if non-NULL, supplies the MSG_* request flags (MSG_EOR is
       *             masked off on input) and has the resulting flags OR-ed in
       *             when the function completes without taking an error exit.
       *
       * Returns 0 or an errno value: a pending so_error, ENOTCONN,
       * EWOULDBLOCK, or whatever sblock(), sbwait(), the domain's
       * dom_externalize() or the copy routine report.  MSG_OOB requests are
       * passed straight to soreceive_rcvoob().
  989  */
  990 int
  991 soreceive(so, psa, uio, mp0, controlp, flagsp)
  992         struct socket *so;
  993         struct sockaddr **psa;
  994         struct uio *uio;
  995         struct mbuf **mp0;
  996         struct mbuf **controlp;
  997         int *flagsp;
  998 {
  999         struct mbuf *m, **mp;
 1000         int flags, len, error, offset;
 1001         struct protosw *pr = so->so_proto;
 1002         struct mbuf *nextrecord;
 1003         int moff, type = 0;
 1004         int orig_resid = uio->uio_resid;
 1005 
 1006         mp = mp0;
 1007         if (psa != NULL)
 1008                 *psa = NULL;
 1009         if (controlp != NULL)
 1010                 *controlp = NULL;
 1011         if (flagsp != NULL)
 1012                 flags = *flagsp &~ MSG_EOR;
 1013         else
 1014                 flags = 0;
              /* Out-of-band reads are handled entirely by soreceive_rcvoob(). */
 1015         if (flags & MSG_OOB)
 1016                 return (soreceive_rcvoob(so, uio, flags));
 1017         if (mp != NULL)
 1018                 *mp = NULL;
 1019         if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
 1020             && uio->uio_resid)
 1021                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
 1022 
              /*
               * The so_rcv mutex is held from here to the 'out' label; it is
               * dropped only temporarily around dom_externalize(), the copy
               * through the uio, m_copym() and pru_rcvd().
               */
 1023         SOCKBUF_LOCK(&so->so_rcv);
 1024 restart:
 1025         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
              /* On failure we leave through 'out', skipping the sbunlock() at 'release'. */
 1026         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
 1027         if (error)
 1028                 goto out;
 1029 
 1030         m = so->so_rcv.sb_mb;
 1031         /*
 1032          * If we have less data than requested, block awaiting more
 1033          * (subject to any timeout) if:
 1034          *   1. the current count is less than the low water mark, or
 1035          *   2. MSG_WAITALL is set, and it is possible to do the entire
 1036          *      receive operation at once if we block (resid <= hiwat).
 1037          *   3. MSG_DONTWAIT is not set
 1038          * If MSG_WAITALL is set but resid is larger than the receive buffer,
 1039          * we have to do the receive in sections, and thus risk returning
 1040          * a short count if a timeout or signal occurs after we start.
 1041          */
 1042         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
 1043             so->so_rcv.sb_cc < uio->uio_resid) &&
 1044             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
 1045             ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
 1046             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
 1047                 KASSERT(m != NULL || !so->so_rcv.sb_cc,
 1048                     ("receive: m == %p so->so_rcv.sb_cc == %u",
 1049                     m, so->so_rcv.sb_cc));
                      /*
                       * A pending socket error is reported only when there is no
                       * buffered data to return first; MSG_PEEK leaves it set.
                       */
 1050                 if (so->so_error) {
 1051                         if (m != NULL)
 1052                                 goto dontblock;
 1053                         error = so->so_error;
 1054                         if ((flags & MSG_PEEK) == 0)
 1055                                 so->so_error = 0;
 1056                         goto release;
 1057                 }
 1058                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
                      /*
                       * No more data can arrive: return what is buffered, or
                       * nothing at all (error is still 0 here).
                       */
 1059                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 1060                         if (m)
 1061                                 goto dontblock;
 1062                         else
 1063                                 goto release;
 1064                 }
                      /*
                       * Do not wait if the chain already holds out-of-band data
                       * or an end-of-record mark; restart from the head mbuf.
                       */
 1065                 for (; m != NULL; m = m->m_next)
 1066                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
 1067                                 m = so->so_rcv.sb_mb;
 1068                                 goto dontblock;
 1069                         }
 1070                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
 1071                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
 1072                         error = ENOTCONN;
 1073                         goto release;
 1074                 }
                      /* A zero-length request never waits. */
 1075                 if (uio->uio_resid == 0)
 1076                         goto release;
                      /* Non-blocking socket or request: fail instead of waiting. */
 1077                 if ((so->so_state & SS_NBIO) ||
 1078                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
 1079                         error = EWOULDBLOCK;
 1080                         goto release;
 1081                 }
 1082                 SBLASTRECORDCHK(&so->so_rcv);
 1083                 SBLASTMBUFCHK(&so->so_rcv);
                      /*
                       * Give up the sblock, wait in sbwait(), then re-evaluate
                       * everything from 'restart'.
                       */
 1084                 sbunlock(&so->so_rcv);
 1085                 error = sbwait(&so->so_rcv);
 1086                 if (error)
 1087                         goto out;
 1088                 goto restart;
 1089         }
 1090 dontblock:
 1091         /*
 1092          * From this point onward, we maintain 'nextrecord' as a cache of the
 1093          * pointer to the next record in the socket buffer.  We must keep the
 1094          * various socket buffer pointers and local stack versions of the
 1095          * pointers in sync, pushing out modifications before dropping the
 1096          * socket buffer mutex, and re-reading them when picking it up.
 1097          *
 1098          * Otherwise, we will race with the network stack appending new data
 1099          * or records onto the socket buffer by using inconsistent/stale
 1100          * versions of the field, possibly resulting in socket buffer
 1101          * corruption.
 1102          *
 1103          * By holding the high-level sblock(), we prevent simultaneous
 1104          * readers from pulling off the front of the socket buffer.
 1105          */
 1106         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1107         if (uio->uio_td)
 1108                 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
 1109         KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
 1110         SBLASTRECORDCHK(&so->so_rcv);
 1111         SBLASTMBUFCHK(&so->so_rcv);
 1112         nextrecord = m->m_nextpkt;
              /*
               * For PR_ADDR protocols the record starts with an MT_SONAME mbuf
               * holding the address.  Copy it out if the caller asked for it,
               * then step over it (MSG_PEEK) or unlink and free it.
               */
 1113         if (pr->pr_flags & PR_ADDR) {
 1114                 KASSERT(m->m_type == MT_SONAME,
 1115                     ("m->m_type == %d", m->m_type));
                      /* Zeroing orig_resid disables the retry test near the end. */
 1116                 orig_resid = 0;
 1117                 if (psa != NULL)
 1118                         *psa = sodupsockaddr(mtod(m, struct sockaddr *),
 1119                             M_NOWAIT);
 1120                 if (flags & MSG_PEEK) {
 1121                         m = m->m_next;
 1122                 } else {
 1123                         sbfree(&so->so_rcv, m);
 1124                         so->so_rcv.sb_mb = m_free(m);
 1125                         m = so->so_rcv.sb_mb;
 1126                         sockbuf_pushsync(&so->so_rcv, nextrecord);
 1127                 }
 1128         }
 1129 
 1130         /*
 1131          * Process one or more MT_CONTROL mbufs present before any data mbufs
 1132          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 1133          * just copy the data; if !MSG_PEEK, we call into the protocol to
 1134          * perform externalization (or freeing if controlp == NULL).
 1135          */
 1136         if (m != NULL && m->m_type == MT_CONTROL) {
                      /* 'cm' collects the detached control mbufs; 'cme' is its tail link. */
 1137                 struct mbuf *cm = NULL, *cmn;
 1138                 struct mbuf **cme = &cm;
 1139 
 1140                 do {
 1141                         if (flags & MSG_PEEK) {
 1142                                 if (controlp != NULL) {
                                              /* NOTE(review): the m_copy() result is used without a NULL check. */
 1143                                         *controlp = m_copy(m, 0, m->m_len);
 1144                                         controlp = &(*controlp)->m_next;
 1145                                 }
 1146                                 m = m->m_next;
 1147                         } else {
 1148                                 sbfree(&so->so_rcv, m);
 1149                                 so->so_rcv.sb_mb = m->m_next;
 1150                                 m->m_next = NULL;
 1151                                 *cme = m;
 1152                                 cme = &(*cme)->m_next;
 1153                                 m = so->so_rcv.sb_mb;
 1154                         }
 1155                 } while (m != NULL && m->m_type == MT_CONTROL);
 1156                 if ((flags & MSG_PEEK) == 0)
 1157                         sockbuf_pushsync(&so->so_rcv, nextrecord);
                      /*
                       * Dispose of each detached control mbuf: through the domain's
                       * externalize hook (called with the mutex dropped) when one
                       * exists, otherwise straight to the caller, otherwise freed.
                       */
 1158                 while (cm != NULL) {
 1159                         cmn = cm->m_next;
 1160                         cm->m_next = NULL;
 1161                         if (pr->pr_domain->dom_externalize != NULL) {
 1162                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1163                                 error = (*pr->pr_domain->dom_externalize)
 1164                                     (cm, controlp);
 1165                                 SOCKBUF_LOCK(&so->so_rcv);
 1166                         } else if (controlp != NULL)
 1167                                 *controlp = cm;
 1168                         else
 1169                                 m_freem(cm);
                              /* Advance controlp to the end of what has been returned so far. */
 1170                         if (controlp != NULL) {
 1171                                 orig_resid = 0;
 1172                                 while (*controlp != NULL)
 1173                                         controlp = &(*controlp)->m_next;
 1174                         }
 1175                         cm = cmn;
 1176                 }
                      /* The mutex may have been dropped above, so re-read nextrecord. */
 1177                 if (so->so_rcv.sb_mb)
 1178                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
 1179                 else
 1180                         nextrecord = NULL;
 1181                 orig_resid = 0;
 1182         }
              /*
               * Consistency checks after the address/control phase, and note the
               * type of the first data mbuf.
               */
 1183         if (m != NULL) {
 1184                 if ((flags & MSG_PEEK) == 0) {
 1185                         KASSERT(m->m_nextpkt == nextrecord,
 1186                             ("soreceive: post-control, nextrecord !sync"));
 1187                         if (nextrecord == NULL) {
 1188                                 KASSERT(so->so_rcv.sb_mb == m,
 1189                                     ("soreceive: post-control, sb_mb!=m"));
 1190                                 KASSERT(so->so_rcv.sb_lastrecord == m,
 1191                                     ("soreceive: post-control, lastrecord!=m"));
 1192                         }
 1193                 }
 1194                 type = m->m_type;
 1195                 if (type == MT_OOBDATA)
 1196                         flags |= MSG_OOB;
 1197         } else {
 1198                 if ((flags & MSG_PEEK) == 0) {
 1199                         KASSERT(so->so_rcv.sb_mb == nextrecord,
 1200                             ("soreceive: sb_mb != nextrecord"));
 1201                         if (so->so_rcv.sb_mb == NULL) {
 1202                                 KASSERT(so->so_rcv.sb_lastrecord == NULL,
 1203                                     ("soreceive: sb_lastercord != NULL"));
 1204                         }
 1205                 }
 1206         }
 1207         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1208         SBLASTRECORDCHK(&so->so_rcv);
 1209         SBLASTMBUFCHK(&so->so_rcv);
 1210 
 1211         /*
 1212          * Now continue to read any data mbufs off of the head of the socket
 1213          * buffer until the read request is satisfied.  Note that 'type' is
 1214          * used to store the type of any mbuf reads that have happened so far
 1215          * such that soreceive() can stop reading if the type changes, which
 1216          * causes soreceive() to return only one of regular data and inline
 1217          * out-of-band data in a single socket receive operation.
 1218          */
              /*
               * 'moff' is the read offset inside the current mbuf and 'offset'
               * the number of bytes already looked at; both only advance for
               * MSG_PEEK, where nothing is removed from the buffer.
               */
 1219         moff = 0;
 1220         offset = 0;
 1221         while (m != NULL && uio->uio_resid > 0 && error == 0) {
 1222                 /*
 1223                  * If the type of mbuf has changed since the last mbuf
 1224                  * examined ('type'), end the receive operation.
 1225                  */
 1226                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1227                 if (m->m_type == MT_OOBDATA) {
 1228                         if (type != MT_OOBDATA)
 1229                                 break;
 1230                 } else if (type == MT_OOBDATA)
 1231                         break;
 1232                 else
 1233                     KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
 1234                         ("m->m_type == %d", m->m_type));
 1235                 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
                      /*
                       * Transfer length: what the caller still wants, limited by
                       * the distance to the out-of-band mark and by what is left
                       * in this mbuf.
                       */
 1236                 len = uio->uio_resid;
 1237                 if (so->so_oobmark && len > so->so_oobmark - offset)
 1238                         len = so->so_oobmark - offset;
 1239                 if (len > m->m_len - moff)
 1240                         len = m->m_len - moff;
 1241                 /*
 1242                  * If mp is set, just pass back the mbufs.
 1243                  * Otherwise copy them out via the uio, then free.
 1244                  * Sockbuf must be consistent here (points to current mbuf,
 1245                  * it points to next record) when we drop priority;
 1246                  * we must note any additions to the sockbuf when we
 1247                  * block interrupts again.
 1248                  */
 1249                 if (mp == NULL) {
 1250                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1251                         SBLASTRECORDCHK(&so->so_rcv);
 1252                         SBLASTMBUFCHK(&so->so_rcv);
                              /* The copy runs with the mutex released. */
 1253                         SOCKBUF_UNLOCK(&so->so_rcv);
 1254 #ifdef ZERO_COPY_SOCKETS
 1255                         if (so_zero_copy_receive) {
 1256                                 vm_page_t pg;
 1257                                 int disposable;
 1258 
 1259                                 if ((m->m_flags & M_EXT)
 1260                                  && (m->m_ext.ext_type == EXT_DISPOSABLE))
 1261                                         disposable = 1;
 1262                                 else
 1263                                         disposable = 0;
 1264 
 1265                                 pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
 1266                                         moff));
 1267 
 1268                                 if (uio->uio_offset == -1)
 1269                                         uio->uio_offset =IDX_TO_OFF(pg->pindex);
 1270 
 1271                                 error = uiomoveco(mtod(m, char *) + moff,
 1272                                                   (int)len, uio,pg->object,
 1273                                                   disposable);
 1274                         } else
 1275 #endif /* ZERO_COPY_SOCKETS */
 1276                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
 1277                         SOCKBUF_LOCK(&so->so_rcv);
 1278                         if (error)
 1279                                 goto release;
 1280                 } else
                              /* mbuf-chain mode: nothing is copied, the bytes are only counted. */
 1281                         uio->uio_resid -= len;
 1282                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
                      /* Case 1: the rest of this mbuf was consumed. */
 1283                 if (len == m->m_len - moff) {
 1284                         if (m->m_flags & M_EOR)
 1285                                 flags |= MSG_EOR;
 1286                         if (flags & MSG_PEEK) {
 1287                                 m = m->m_next;
 1288                                 moff = 0;
 1289                         } else {
 1290                                 nextrecord = m->m_nextpkt;
 1291                                 sbfree(&so->so_rcv, m);
                                      /*
                                       * Unlink the mbuf: hand it to the caller's
                                       * chain or free it; 'm' moves to its successor.
                                       */
 1292                                 if (mp != NULL) {
 1293                                         *mp = m;
 1294                                         mp = &m->m_next;
 1295                                         so->so_rcv.sb_mb = m = m->m_next;
 1296                                         *mp = NULL;
 1297                                 } else {
 1298                                         so->so_rcv.sb_mb = m_free(m);
 1299                                         m = so->so_rcv.sb_mb;
 1300                                 }
                                      /* Re-link the record list around the removed mbuf. */
 1301                                 if (m != NULL) {
 1302                                         m->m_nextpkt = nextrecord;
 1303                                         if (nextrecord == NULL)
 1304                                                 so->so_rcv.sb_lastrecord = m;
 1305                                 } else {
 1306                                         so->so_rcv.sb_mb = nextrecord;
 1307                                         SB_EMPTY_FIXUP(&so->so_rcv);
 1308                                 }
 1309                                 SBLASTRECORDCHK(&so->so_rcv);
 1310                                 SBLASTMBUFCHK(&so->so_rcv);
 1311                         }
 1312                 } else {
                              /* Case 2: only the first 'len' bytes of this mbuf were consumed. */
 1313                         if (flags & MSG_PEEK)
 1314                                 moff += len;
 1315                         else {
                                      /*
                                       * The caller gets a copy of the consumed prefix,
                                       * which is then trimmed off the original.
                                       * NOTE(review): the m_copym() result is not
                                       * checked for NULL here.
                                       */
 1316                                 if (mp != NULL) {
 1317                                         SOCKBUF_UNLOCK(&so->so_rcv);
 1318                                         *mp = m_copym(m, 0, len, M_TRYWAIT);
 1319                                         SOCKBUF_LOCK(&so->so_rcv);
 1320                                 }
 1321                                 m->m_data += len;
 1322                                 m->m_len -= len;
 1323                                 so->so_rcv.sb_cc -= len;
 1324                         }
 1325                 }
 1326                 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
                      /*
                       * Track progress toward the out-of-band mark and stop exactly
                       * at it; a real read also sets SBS_RCVATMARK.
                       */
 1327                 if (so->so_oobmark) {
 1328                         if ((flags & MSG_PEEK) == 0) {
 1329                                 so->so_oobmark -= len;
 1330                                 if (so->so_oobmark == 0) {
 1331                                         so->so_rcv.sb_state |= SBS_RCVATMARK;
 1332                                         break;
 1333                                 }
 1334                         } else {
 1335                                 offset += len;
 1336                                 if (offset == so->so_oobmark)
 1337                                         break;
 1338                         }
 1339                 }
                      /* An end-of-record mark terminates the receive. */
 1340                 if (flags & MSG_EOR)
 1341                         break;
 1342                 /*
 1343                  * If the MSG_WAITALL flag is set (for non-atomic socket),
 1344                  * we must not quit until "uio->uio_resid == 0" or an error
 1345                  * termination.  If a signal/timeout occurs, return
 1346                  * with a short count but without error.
 1347                  * Keep sockbuf locked against other readers.
 1348                  */
 1349                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
 1350                     !sosendallatonce(so) && nextrecord == NULL) {
 1351                         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1352                         if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
 1353                                 break;
 1354                         /*
 1355                          * Notify the protocol that some data has been
 1356                          * drained before blocking.
 1357                          */
 1358                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
 1359                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1360                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
 1361                                 SOCKBUF_LOCK(&so->so_rcv);
 1362                         }
 1363                         SBLASTRECORDCHK(&so->so_rcv);
 1364                         SBLASTMBUFCHK(&so->so_rcv);
 1365                         error = sbwait(&so->so_rcv);
 1366                         if (error)
 1367                                 goto release;
                              /* Pick up whatever was appended while we waited. */
 1368                         m = so->so_rcv.sb_mb;
 1369                         if (m != NULL)
 1370                                 nextrecord = m->m_nextpkt;
 1371                 }
 1372         }
 1373 
 1374         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
              /*
               * Data left over in the record of a PR_ATOMIC protocol: flag the
               * truncation and, unless peeking, drop the rest of the record.
               */
 1375         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
 1376                 flags |= MSG_TRUNC;
 1377                 if ((flags & MSG_PEEK) == 0)
 1378                         (void) sbdroprecord_locked(&so->so_rcv);
 1379         }
 1380         if ((flags & MSG_PEEK) == 0) {
 1381                 if (m == NULL) {
 1382                         /*
 1383                          * First part is an inline SB_EMPTY_FIXUP().  Second
 1384                          * part makes sure sb_lastrecord is up-to-date if
 1385                          * there is still data in the socket buffer.
 1386                          */
 1387                         so->so_rcv.sb_mb = nextrecord;
 1388                         if (so->so_rcv.sb_mb == NULL) {
 1389                                 so->so_rcv.sb_mbtail = NULL;
 1390                                 so->so_rcv.sb_lastrecord = NULL;
 1391                         } else if (nextrecord->m_nextpkt == NULL)
 1392                                 so->so_rcv.sb_lastrecord = nextrecord;
 1393                 }
 1394                 SBLASTRECORDCHK(&so->so_rcv);
 1395                 SBLASTMBUFCHK(&so->so_rcv);
                      /* Final pru_rcvd() notification, made with the mutex dropped. */
 1396                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
 1397                         SOCKBUF_UNLOCK(&so->so_rcv);
 1398                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
 1399                         SOCKBUF_LOCK(&so->so_rcv);
 1400                 }
 1401         }
 1402         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
              /*
               * Nothing was transferred even though data was requested, no
               * address or control data was processed (orig_resid is still
               * non-zero), no end of record was seen and the receive side is
               * still open: release the sblock and try again from the top.
               */
 1403         if (orig_resid == uio->uio_resid && orig_resid &&
 1404             (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
 1405                 sbunlock(&so->so_rcv);
 1406                 goto restart;
 1407         }
 1408 
 1409         if (flagsp != NULL)
 1410                 *flagsp |= flags;
              /* 'release' gives up the sblock; 'out' then drops the so_rcv mutex. */
 1411 release:
 1412         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1413         sbunlock(&so->so_rcv);
 1414 out:
 1415         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 1416         SOCKBUF_UNLOCK(&so->so_rcv);
 1417         return (error);
 1418 }
 1419 
 1420 int
 1421 soshutdown(so, how)
 1422         struct socket *so;
 1423         int how;
 1424 {
 1425         struct protosw *pr = so->so_proto;
 1426 
 1427         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
 1428                 return (EINVAL);
 1429 
 1430         if (how != SHUT_WR)
 1431                 sorflush(so);
 1432         if (how != SHUT_RD)
 1433                 return ((*pr->pr_usrreqs->pru_shutdown)(so));
 1434         return (0);
 1435 }
 1436 
 1437 void
 1438 sorflush(so)
 1439         struct socket *so;
 1440 {
 1441         struct sockbuf *sb = &so->so_rcv;
 1442         struct protosw *pr = so->so_proto;
 1443         struct sockbuf asb;
 1444         /*
 1445          * XXX: This variable is for an ugly workaround to fix problem,
 1446          * that was fixed in rev. 1.137 of sys/socketvar.h, and keep ABI
 1447          * compatibility.
 1448          */
 1449         short save_sb_state;
 1450 
 1451         /*
 1452          * XXXRW: This is quite ugly.  Previously, this code made a copy of
 1453          * the socket buffer, then zero'd the original to clear the buffer
 1454          * fields.  However, with mutexes in the socket buffer, this causes
 1455          * problems.  We only clear the zeroable bits of the original;
 1456          * however, we have to initialize and destroy the mutex in the copy
 1457          * so that dom_dispose() and sbrelease() can lock t as needed.
 1458          */
 1459         SOCKBUF_LOCK(sb);
 1460         sb->sb_flags |= SB_NOINTR;
 1461         (void) sblock(sb, M_WAITOK);
 1462         /*
 1463          * socantrcvmore_locked() drops the socket buffer mutex so that it
 1464          * can safely perform wakeups.  Re-acquire the mutex before
 1465          * continuing.
 1466          */
 1467         socantrcvmore_locked(so);
 1468         SOCKBUF_LOCK(sb);
 1469         sbunlock(sb);
 1470         /*
 1471          * Invalidate/clear most of the sockbuf structure, but leave
 1472          * selinfo and mutex data unchanged.
 1473          */
 1474         save_sb_state = sb->sb_state;
 1475         bzero(&asb, offsetof(struct sockbuf, sb_startzero));
 1476         bcopy(&sb->sb_startzero, &asb.sb_startzero,
 1477             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 1478         bzero(&sb->sb_startzero,
 1479             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 1480         sb->sb_state = save_sb_state;
 1481         SOCKBUF_UNLOCK(sb);
 1482 
 1483         SOCKBUF_LOCK_INIT(&asb, "so_rcv");
 1484         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
 1485                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
 1486         sbrelease(&asb, so);
 1487         SOCKBUF_LOCK_DESTROY(&asb);
 1488 }
 1489 
 1490 /*
 1491  * Perhaps this routine, and sooptcopyout(), below, ought to come in
 1492  * an additional variant to handle the case where the option value needs
 1493  * to be some kind of integer, but not a specific size.
 1494  * In addition to their use here, these functions are also called by the
 1495  * protocol-level pr_ctloutput() routines.
 1496  */
 1497 int
 1498 sooptcopyin(sopt, buf, len, minlen)
 1499         struct  sockopt *sopt;
 1500         void    *buf;
 1501         size_t  len;
 1502         size_t  minlen;
 1503 {
 1504         size_t  valsize;
 1505 
 1506         /*
 1507          * If the user gives us more than we wanted, we ignore it,
 1508          * but if we don't get the minimum length the caller
 1509          * wants, we return EINVAL.  On success, sopt->sopt_valsize
 1510          * is set to however much we actually retrieved.
 1511          */
 1512         if ((valsize = sopt->sopt_valsize) < minlen)
 1513                 return EINVAL;
 1514         if (valsize > len)
 1515                 sopt->sopt_valsize = valsize = len;
 1516 
 1517         if (sopt->sopt_td != NULL)
 1518                 return (copyin(sopt->sopt_val, buf, valsize));
 1519 
 1520         bcopy(sopt->sopt_val, buf, valsize);
 1521         return 0;
 1522 }
 1523 
 1524 /*
 1525  * Kernel version of setsockopt(2)/
 1526  * XXX: optlen is size_t, not socklen_t
 1527  */
 1528 int
 1529 so_setsockopt(struct socket *so, int level, int optname, void *optval,
 1530     size_t optlen)
 1531 {
 1532         struct sockopt sopt;
 1533 
 1534         sopt.sopt_level = level;
 1535         sopt.sopt_name = optname;
 1536         sopt.sopt_dir = SOPT_SET;
 1537         sopt.sopt_val = optval;
 1538         sopt.sopt_valsize = optlen;
 1539         sopt.sopt_td = NULL;
 1540         return (sosetopt(so, &sopt));
 1541 }
 1542 
 1543 int
 1544 sosetopt(so, sopt)
 1545         struct socket *so;
 1546         struct sockopt *sopt;
 1547 {
 1548         int     error, optval;
 1549         struct  linger l;
 1550         struct  timeval tv;
 1551         u_long  val;
 1552 #ifdef MAC
 1553         struct mac extmac;
 1554 #endif
 1555 
 1556         error = 0;
 1557         if (sopt->sopt_level != SOL_SOCKET) {
 1558                 if (so->so_proto && so->so_proto->pr_ctloutput)
 1559                         return ((*so->so_proto->pr_ctloutput)
 1560                                   (so, sopt));
 1561                 error = ENOPROTOOPT;
 1562         } else {
 1563                 switch (sopt->sopt_name) {
 1564 #ifdef INET
 1565                 case SO_ACCEPTFILTER:
 1566                         error = do_setopt_accept_filter(so, sopt);
 1567                         if (error)
 1568                                 goto bad;
 1569                         break;
 1570 #endif
 1571                 case SO_LINGER:
 1572                         error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
 1573                         if (error)
 1574                                 goto bad;
 1575 
 1576                         SOCK_LOCK(so);
 1577                         so->so_linger = l.l_linger;
 1578                         if (l.l_onoff)
 1579                                 so->so_options |= SO_LINGER;
 1580                         else
 1581                                 so->so_options &= ~SO_LINGER;
 1582                         SOCK_UNLOCK(so);
 1583                         break;
 1584 
 1585                 case SO_DEBUG:
 1586                 case SO_KEEPALIVE:
 1587                 case SO_DONTROUTE:
 1588                 case SO_USELOOPBACK:
 1589                 case SO_BROADCAST:
 1590                 case SO_REUSEADDR:
 1591                 case SO_REUSEPORT:
 1592                 case SO_OOBINLINE:
 1593                 case SO_TIMESTAMP:
 1594                 case SO_BINTIME:
 1595                 case SO_NOSIGPIPE:
 1596                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1597                                             sizeof optval);
 1598                         if (error)
 1599                                 goto bad;
 1600                         SOCK_LOCK(so);
 1601                         if (optval)
 1602                                 so->so_options |= sopt->sopt_name;
 1603                         else
 1604                                 so->so_options &= ~sopt->sopt_name;
 1605                         SOCK_UNLOCK(so);
 1606                         break;
 1607 
 1608                 case SO_SNDBUF:
 1609                 case SO_RCVBUF:
 1610                 case SO_SNDLOWAT:
 1611                 case SO_RCVLOWAT:
 1612                         error = sooptcopyin(sopt, &optval, sizeof optval,
 1613                                             sizeof optval);
 1614                         if (error)
 1615                                 goto bad;
 1616 
 1617                         /*
 1618                          * Values < 1 make no sense for any of these
 1619                          * options, so disallow them.
 1620                          */
 1621                         if (optval < 1) {
 1622                                 error = EINVAL;
 1623                                 goto bad;
 1624                         }
 1625 
 1626                         switch (sopt->sopt_name) {
 1627                         case SO_SNDBUF:
 1628                         case SO_RCVBUF:
 1629                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
 1630                                     &so->so_snd : &so->so_rcv, (u_long)optval,
 1631                                     so, curthread) == 0) {
 1632                                         error = ENOBUFS;
 1633                                         goto bad;
 1634                                 }
 1635                                 break;
 1636 
 1637                         /*
 1638                          * Make sure the low-water is never greater than
 1639                          * the high-water.
 1640                          */
 1641                         case SO_SNDLOWAT:
 1642                                 SOCKBUF_LOCK(&so->so_snd);
 1643                                 so->so_snd.sb_lowat =
 1644                                     (optval > so->so_snd.sb_hiwat) ?
 1645                                     so->so_snd.sb_hiwat : optval;
 1646                                 SOCKBUF_UNLOCK(&so->so_snd);
 1647                                 break;
 1648                         case SO_RCVLOWAT:
 1649                                 SOCKBUF_LOCK(&so->so_rcv);
 1650                                 so->so_rcv.sb_lowat =
 1651                                     (optval > so->so_rcv.sb_hiwat) ?
 1652                                     so->so_rcv.sb_hiwat : optval;
 1653                                 SOCKBUF_UNLOCK(&so->so_rcv);
 1654                                 break;
 1655                         }
 1656                         break;
 1657 
 1658                 case SO_SNDTIMEO:
 1659                 case SO_RCVTIMEO:
 1660                         error = sooptcopyin(sopt, &tv, sizeof tv,
 1661                                             sizeof tv);
 1662                         if (error)
 1663                                 goto bad;
 1664 
 1665                         /* assert(hz > 0); */
 1666                         if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
 1667                             tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
 1668                                 error = EDOM;
 1669                                 goto bad;
 1670                         }
 1671                         /* assert(tick > 0); */
 1672                         /* assert(ULONG_MAX - INT_MAX >= 1000000); */
 1673                         val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
 1674                         if (val > INT_MAX) {
 1675                                 error = EDOM;
 1676                                 goto bad;
 1677                         }
 1678                         if (val == 0 && tv.tv_usec != 0)
 1679                                 val = 1;
 1680 
 1681                         switch (sopt->sopt_name) {
 1682                         case SO_SNDTIMEO:
 1683                                 so->so_snd.sb_timeo = val;
 1684                                 break;
 1685                         case SO_RCVTIMEO:
 1686                                 so->so_rcv.sb_timeo = val;
 1687                                 break;
 1688                         }
 1689                         break;
 1690 
 1691                 case SO_LABEL:
 1692 #ifdef MAC
 1693                         error = sooptcopyin(sopt, &extmac, sizeof extmac,
 1694                             sizeof extmac);
 1695                         if (error)
 1696                                 goto bad;
 1697                         error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
 1698                             so, &extmac);
 1699 #else
 1700                         error = EOPNOTSUPP;
 1701 #endif
 1702                         break;
 1703 
 1704                 default:
 1705                         error = ENOPROTOOPT;
 1706                         break;
 1707                 }
 1708                 if (error == 0 && so->so_proto != NULL &&
 1709                     so->so_proto->pr_ctloutput != NULL) {
 1710                         (void) ((*so->so_proto->pr_ctloutput)
 1711                                   (so, sopt));
 1712                 }
 1713         }
 1714 bad:
 1715         return (error);
 1716 }
 1717 
 1718 /* Helper routine for getsockopt */
 1719 int
 1720 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
 1721 {
 1722         int     error;
 1723         size_t  valsize;
 1724 
 1725         error = 0;
 1726 
 1727         /*
 1728          * Documented get behavior is that we always return a value,
 1729          * possibly truncated to fit in the user's buffer.
 1730          * Traditional behavior is that we always tell the user
 1731          * precisely how much we copied, rather than something useful
 1732          * like the total amount we had available for her.
 1733          * Note that this interface is not idempotent; the entire answer must
 1734          * generated ahead of time.
 1735          */
 1736         valsize = min(len, sopt->sopt_valsize);
 1737         sopt->sopt_valsize = valsize;
 1738         if (sopt->sopt_val != NULL) {
 1739                 if (sopt->sopt_td != NULL)
 1740                         error = copyout(buf, sopt->sopt_val, valsize);
 1741                 else
 1742                         bcopy(buf, sopt->sopt_val, valsize);
 1743         }
 1744         return error;
 1745 }
 1746 
 1747 int
 1748 sogetopt(so, sopt)
 1749         struct socket *so;
 1750         struct sockopt *sopt;
 1751 {
 1752         int     error, optval;
 1753         struct  linger l;
 1754         struct  timeval tv;
 1755 #ifdef INET
 1756         struct accept_filter_arg *afap;
 1757 #endif
 1758 #ifdef MAC
 1759         struct mac extmac;
 1760 #endif
 1761 
 1762         error = 0;
 1763         if (sopt->sopt_level != SOL_SOCKET) {
 1764                 if (so->so_proto && so->so_proto->pr_ctloutput) {
 1765                         return ((*so->so_proto->pr_ctloutput)
 1766                                   (so, sopt));
 1767                 } else
 1768                         return (ENOPROTOOPT);
 1769         } else {
 1770                 switch (sopt->sopt_name) {
 1771 #ifdef INET
 1772                 case SO_ACCEPTFILTER:
 1773                         /* Unlocked read. */
 1774                         if ((so->so_options & SO_ACCEPTCONN) == 0)
 1775                                 return (EINVAL);
 1776                         if ((so->so_options & SO_ACCEPTFILTER) == 0)
 1777                                 return (EINVAL);
 1778                         MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
 1779                                 M_TEMP, M_WAITOK | M_ZERO);
 1780                         SOCK_LOCK(so);
 1781                         if ((so->so_options & SO_ACCEPTFILTER) != 0) {
 1782                                 strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
 1783                                 if (so->so_accf->so_accept_filter_str != NULL)
 1784                                         strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
 1785                         }
 1786                         SOCK_UNLOCK(so);
 1787                         error = sooptcopyout(sopt, afap, sizeof(*afap));
 1788                         FREE(afap, M_TEMP);
 1789                         break;
 1790 #endif
 1791 
 1792                 case SO_LINGER:
 1793                         SOCK_LOCK(so);
 1794                         l.l_onoff = so->so_options & SO_LINGER;
 1795                         l.l_linger = so->so_linger;
 1796                         SOCK_UNLOCK(so);
 1797                         error = sooptcopyout(sopt, &l, sizeof l);
 1798                         break;
 1799 
 1800                 case SO_USELOOPBACK:
 1801                 case SO_DONTROUTE:
 1802                 case SO_DEBUG:
 1803                 case SO_KEEPALIVE:
 1804                 case SO_REUSEADDR:
 1805                 case SO_REUSEPORT:
 1806                 case SO_BROADCAST:
 1807                 case SO_OOBINLINE:
 1808                 case SO_TIMESTAMP:
 1809                 case SO_BINTIME:
 1810                 case SO_NOSIGPIPE:
 1811                         optval = so->so_options & sopt->sopt_name;
 1812 integer:
 1813                         error = sooptcopyout(sopt, &optval, sizeof optval);
 1814                         break;
 1815 
 1816                 case SO_TYPE:
 1817                         optval = so->so_type;
 1818                         goto integer;
 1819 
 1820                 case SO_ERROR:
 1821                         optval = so->so_error;
 1822                         so->so_error = 0;
 1823                         goto integer;
 1824 
 1825                 case SO_SNDBUF:
 1826                         optval = so->so_snd.sb_hiwat;
 1827                         goto integer;
 1828 
 1829                 case SO_RCVBUF:
 1830                         optval = so->so_rcv.sb_hiwat;
 1831                         goto integer;
 1832 
 1833                 case SO_SNDLOWAT:
 1834                         optval = so->so_snd.sb_lowat;
 1835                         goto integer;
 1836 
 1837                 case SO_RCVLOWAT:
 1838                         optval = so->so_rcv.sb_lowat;
 1839                         goto integer;
 1840 
 1841                 case SO_SNDTIMEO:
 1842                 case SO_RCVTIMEO:
 1843                         optval = (sopt->sopt_name == SO_SNDTIMEO ?
 1844                                   so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
 1845 
 1846                         tv.tv_sec = optval / hz;
 1847                         tv.tv_usec = (optval % hz) * tick;
 1848                         error = sooptcopyout(sopt, &tv, sizeof tv);
 1849                         break;
 1850 
 1851                 case SO_LABEL:
 1852 #ifdef MAC
 1853                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
 1854                             sizeof(extmac));
 1855                         if (error)
 1856                                 return (error);
 1857                         error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
 1858                             so, &extmac);
 1859                         if (error)
 1860                                 return (error);
 1861                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
 1862 #else
 1863                         error = EOPNOTSUPP;
 1864 #endif
 1865                         break;
 1866 
 1867                 case SO_PEERLABEL:
 1868 #ifdef MAC
 1869                         error = sooptcopyin(sopt, &extmac, sizeof(extmac),
 1870                             sizeof(extmac));
 1871                         if (error)
 1872                                 return (error);
 1873                         error = mac_getsockopt_peerlabel(
 1874                             sopt->sopt_td->td_ucred, so, &extmac);
 1875                         if (error)
 1876                                 return (error);
 1877                         error = sooptcopyout(sopt, &extmac, sizeof extmac);
 1878 #else
 1879                         error = EOPNOTSUPP;
 1880 #endif
 1881                         break;
 1882 
 1883                 case SO_LISTENQLIMIT:
 1884                         optval = so->so_qlimit;
 1885                         goto integer;
 1886 
 1887                 case SO_LISTENQLEN:
 1888                         optval = so->so_qlen;
 1889                         goto integer;
 1890 
 1891                 case SO_LISTENINCQLEN:
 1892                         optval = so->so_incqlen;
 1893                         goto integer;
 1894 
 1895                 default:
 1896                         error = ENOPROTOOPT;
 1897                         break;
 1898                 }
 1899                 return (error);
 1900         }
 1901 }
 1902 
 1903 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
 1904 int
 1905 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
 1906 {
 1907         struct mbuf *m, *m_prev;
 1908         int sopt_size = sopt->sopt_valsize;
 1909 
 1910         MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
 1911         if (m == NULL)
 1912                 return ENOBUFS;
 1913         if (sopt_size > MLEN) {
 1914                 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
 1915                 if ((m->m_flags & M_EXT) == 0) {
 1916                         m_free(m);
 1917                         return ENOBUFS;
 1918                 }
 1919                 m->m_len = min(MCLBYTES, sopt_size);
 1920         } else {
 1921                 m->m_len = min(MLEN, sopt_size);
 1922         }
 1923         sopt_size -= m->m_len;
 1924         *mp = m;
 1925         m_prev = m;
 1926 
 1927         while (sopt_size) {
 1928                 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
 1929                 if (m == NULL) {
 1930                         m_freem(*mp);
 1931                         return ENOBUFS;
 1932                 }
 1933                 if (sopt_size > MLEN) {
 1934                         MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
 1935                             M_DONTWAIT);
 1936                         if ((m->m_flags & M_EXT) == 0) {
 1937                                 m_freem(m);
 1938                                 m_freem(*mp);
 1939                                 return ENOBUFS;
 1940                         }
 1941                         m->m_len = min(MCLBYTES, sopt_size);
 1942                 } else {
 1943                         m->m_len = min(MLEN, sopt_size);
 1944                 }
 1945                 sopt_size -= m->m_len;
 1946                 m_prev->m_next = m;
 1947                 m_prev = m;
 1948         }
 1949         return 0;
 1950 }
 1951 
 1952 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
 1953 int
 1954 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 1955 {
 1956         struct mbuf *m0 = m;
 1957 
 1958         if (sopt->sopt_val == NULL)
 1959                 return 0;
 1960         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 1961                 if (sopt->sopt_td != NULL) {
 1962                         int error;
 1963 
 1964                         error = copyin(sopt->sopt_val, mtod(m, char *),
 1965                                        m->m_len);
 1966                         if (error != 0) {
 1967                                 m_freem(m0);
 1968                                 return(error);
 1969                         }
 1970                 } else
 1971                         bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
 1972                 sopt->sopt_valsize -= m->m_len;
 1973                 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 1974                 m = m->m_next;
 1975         }
 1976         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
 1977                 panic("ip6_sooptmcopyin");
 1978         return 0;
 1979 }
 1980 
 1981 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
 1982 int
 1983 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
 1984 {
 1985         struct mbuf *m0 = m;
 1986         size_t valsize = 0;
 1987 
 1988         if (sopt->sopt_val == NULL)
 1989                 return 0;
 1990         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 1991                 if (sopt->sopt_td != NULL) {
 1992                         int error;
 1993 
 1994                         error = copyout(mtod(m, char *), sopt->sopt_val,
 1995                                        m->m_len);
 1996                         if (error != 0) {
 1997                                 m_freem(m0);
 1998                                 return(error);
 1999                         }
 2000                 } else
 2001                         bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
 2002                sopt->sopt_valsize -= m->m_len;
 2003                sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 2004                valsize += m->m_len;
 2005                m = m->m_next;
 2006         }
 2007         if (m != NULL) {
 2008                 /* enough soopt buffer should be given from user-land */
 2009                 m_freem(m0);
 2010                 return(EINVAL);
 2011         }
 2012         sopt->sopt_valsize = valsize;
 2013         return 0;
 2014 }
 2015 
 2016 void
 2017 sohasoutofband(so)
 2018         struct socket *so;
 2019 {
 2020         if (so->so_sigio != NULL)
 2021                 pgsigio(&so->so_sigio, SIGURG, 0);
 2022         selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
 2023 }
 2024 
 2025 int
 2026 sopoll(struct socket *so, int events, struct ucred *active_cred,
 2027     struct thread *td)
 2028 {
 2029         int revents = 0;
 2030 
 2031         SOCKBUF_LOCK(&so->so_snd);
 2032         SOCKBUF_LOCK(&so->so_rcv);
 2033         if (events & (POLLIN | POLLRDNORM))
 2034                 if (soreadable(so))
 2035                         revents |= events & (POLLIN | POLLRDNORM);
 2036 
 2037         if (events & POLLINIGNEOF)
 2038                 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
 2039                     !TAILQ_EMPTY(&so->so_comp) || so->so_error)
 2040                         revents |= POLLINIGNEOF;
 2041 
 2042         if (events & (POLLOUT | POLLWRNORM))
 2043                 if (sowriteable(so))
 2044                         revents |= events & (POLLOUT | POLLWRNORM);
 2045 
 2046         if (events & (POLLPRI | POLLRDBAND))
 2047                 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
 2048                         revents |= events & (POLLPRI | POLLRDBAND);
 2049 
 2050         if (revents == 0) {
 2051                 if (events &
 2052                     (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
 2053                      POLLRDBAND)) {
 2054                         selrecord(td, &so->so_rcv.sb_sel);
 2055                         so->so_rcv.sb_flags |= SB_SEL;
 2056                 }
 2057 
 2058                 if (events & (POLLOUT | POLLWRNORM)) {
 2059                         selrecord(td, &so->so_snd.sb_sel);
 2060                         so->so_snd.sb_flags |= SB_SEL;
 2061                 }
 2062         }
 2063 
 2064         SOCKBUF_UNLOCK(&so->so_rcv);
 2065         SOCKBUF_UNLOCK(&so->so_snd);
 2066         return (revents);
 2067 }
 2068 
 2069 int
 2070 soo_kqfilter(struct file *fp, struct knote *kn)
 2071 {
 2072         struct socket *so = kn->kn_fp->f_data;
 2073         struct sockbuf *sb;
 2074 
 2075         switch (kn->kn_filter) {
 2076         case EVFILT_READ:
 2077                 if (so->so_options & SO_ACCEPTCONN)
 2078                         kn->kn_fop = &solisten_filtops;
 2079                 else
 2080                         kn->kn_fop = &soread_filtops;
 2081                 sb = &so->so_rcv;
 2082                 break;
 2083         case EVFILT_WRITE:
 2084                 kn->kn_fop = &sowrite_filtops;
 2085                 sb = &so->so_snd;
 2086                 break;
 2087         default:
 2088                 return (EINVAL);
 2089         }
 2090 
 2091         SOCKBUF_LOCK(sb);
 2092         knlist_add(&sb->sb_sel.si_note, kn, 1);
 2093         sb->sb_flags |= SB_KNOTE;
 2094         SOCKBUF_UNLOCK(sb);
 2095         return (0);
 2096 }
 2097 
 2098 static void
 2099 filt_sordetach(struct knote *kn)
 2100 {
 2101         struct socket *so = kn->kn_fp->f_data;
 2102 
 2103         SOCKBUF_LOCK(&so->so_rcv);
 2104         knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
 2105         if (knlist_empty(&so->so_rcv.sb_sel.si_note))
 2106                 so->so_rcv.sb_flags &= ~SB_KNOTE;
 2107         SOCKBUF_UNLOCK(&so->so_rcv);
 2108 }
 2109 
 2110 /*ARGSUSED*/
 2111 static int
 2112 filt_soread(struct knote *kn, long hint)
 2113 {
 2114         struct socket *so;
 2115 
 2116         so = kn->kn_fp->f_data;
 2117         SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 2118 
 2119         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
 2120         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 2121                 kn->kn_flags |= EV_EOF;
 2122                 kn->kn_fflags = so->so_error;
 2123                 return (1);
 2124         } else if (so->so_error)        /* temporary udp error */
 2125                 return (1);
 2126         else if (kn->kn_sfflags & NOTE_LOWAT)
 2127                 return (kn->kn_data >= kn->kn_sdata);
 2128         else
 2129                 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
 2130 }
 2131 
 2132 static void
 2133 filt_sowdetach(struct knote *kn)
 2134 {
 2135         struct socket *so = kn->kn_fp->f_data;
 2136 
 2137         SOCKBUF_LOCK(&so->so_snd);
 2138         knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
 2139         if (knlist_empty(&so->so_snd.sb_sel.si_note))
 2140                 so->so_snd.sb_flags &= ~SB_KNOTE;
 2141         SOCKBUF_UNLOCK(&so->so_snd);
 2142 }
 2143 
 2144 /*ARGSUSED*/
 2145 static int
 2146 filt_sowrite(struct knote *kn, long hint)
 2147 {
 2148         struct socket *so;
 2149 
 2150         so = kn->kn_fp->f_data;
 2151         SOCKBUF_LOCK_ASSERT(&so->so_snd);
 2152         kn->kn_data = sbspace(&so->so_snd);
 2153         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 2154                 kn->kn_flags |= EV_EOF;
 2155                 kn->kn_fflags = so->so_error;
 2156                 return (1);
 2157         } else if (so->so_error)        /* temporary udp error */
 2158                 return (1);
 2159         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
 2160             (so->so_proto->pr_flags & PR_CONNREQUIRED))
 2161                 return (0);
 2162         else if (kn->kn_sfflags & NOTE_LOWAT)
 2163                 return (kn->kn_data >= kn->kn_sdata);
 2164         else
 2165                 return (kn->kn_data >= so->so_snd.sb_lowat);
 2166 }
 2167 
 2168 /*ARGSUSED*/
 2169 static int
 2170 filt_solisten(struct knote *kn, long hint)
 2171 {
 2172         struct socket *so = kn->kn_fp->f_data;
 2173 
 2174         kn->kn_data = so->so_qlen;
 2175         return (! TAILQ_EMPTY(&so->so_comp));
 2176 }
 2177 
 2178 int
 2179 socheckuid(struct socket *so, uid_t uid)
 2180 {
 2181 
 2182         if (so == NULL)
 2183                 return (EPERM);
 2184         if (so->so_cred->cr_uid != uid)
 2185                 return (EPERM);
 2186         return (0);
 2187 }
 2188 
 2189 static int
 2190 somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
 2191 {
 2192         int error;
 2193         int val;
 2194 
 2195         val = somaxconn;
 2196         error = sysctl_handle_int(oidp, &val, sizeof(int), req);
 2197         if (error || !req->newptr )
 2198                 return (error);
 2199 
 2200         if (val < 1 || val > SHRT_MAX)
 2201                 return (EINVAL);
 2202 
 2203         somaxconn = val;
 2204         return (0);
 2205 }

Cache object: bb7393369947f37042f9af1a13c36990


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.