FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_socket.c


    1 /*      $NetBSD: uipc_socket.c,v 1.108.2.3 2006/10/25 12:58:56 ghen Exp $       */
    2 
    3 /*-
    4  * Copyright (c) 2002 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Jason R. Thorpe of Wasabi Systems, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the NetBSD
   21  *      Foundation, Inc. and its contributors.
   22  * 4. Neither the name of The NetBSD Foundation nor the names of its
   23  *    contributors may be used to endorse or promote products derived
   24  *    from this software without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36  * POSSIBILITY OF SUCH DAMAGE.
   37  */
   38 
   39 /*
   40  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   41  *      The Regents of the University of California.  All rights reserved.
   42  *
   43  * Redistribution and use in source and binary forms, with or without
   44  * modification, are permitted provided that the following conditions
   45  * are met:
   46  * 1. Redistributions of source code must retain the above copyright
   47  *    notice, this list of conditions and the following disclaimer.
   48  * 2. Redistributions in binary form must reproduce the above copyright
   49  *    notice, this list of conditions and the following disclaimer in the
   50  *    documentation and/or other materials provided with the distribution.
   51  * 3. Neither the name of the University nor the names of its contributors
   52  *    may be used to endorse or promote products derived from this software
   53  *    without specific prior written permission.
   54  *
   55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   65  * SUCH DAMAGE.
   66  *
   67  *      @(#)uipc_socket.c       8.6 (Berkeley) 5/2/95
   68  */
   69 
   70 #include <sys/cdefs.h>
   71 __KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.108.2.3 2006/10/25 12:58:56 ghen Exp $");
   72 
   73 #include "opt_sock_counters.h"
   74 #include "opt_sosend_loan.h"
   75 #include "opt_mbuftrace.h"
   76 #include "opt_somaxkva.h"
   77 
   78 #include <sys/param.h>
   79 #include <sys/systm.h>
   80 #include <sys/proc.h>
   81 #include <sys/file.h>
   82 #include <sys/malloc.h>
   83 #include <sys/mbuf.h>
   84 #include <sys/domain.h>
   85 #include <sys/kernel.h>
   86 #include <sys/protosw.h>
   87 #include <sys/socket.h>
   88 #include <sys/socketvar.h>
   89 #include <sys/signalvar.h>
   90 #include <sys/resourcevar.h>
   91 #include <sys/pool.h>
   92 #include <sys/event.h>
   93 #include <sys/poll.h>
   94 
   95 #include <uvm/uvm.h>
   96 
   97 POOL_INIT(socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL);
   98 
   99 MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options");
  100 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
  101 
  102 extern int      somaxconn;                      /* patchable (XXX sysctl) */
  103 int             somaxconn = SOMAXCONN;
  104 
  105 #ifdef SOSEND_COUNTERS
  106 #include <sys/device.h>
  107 
  108 struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  109     NULL, "sosend", "loan big");
  110 struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  111     NULL, "sosend", "copy big");
  112 struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  113     NULL, "sosend", "copy small");
  114 struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  115     NULL, "sosend", "kva limit");
  116 
  117 #define SOSEND_COUNTER_INCR(ev)         (ev)->ev_count++
  118 
  119 EVCNT_ATTACH_STATIC(sosend_loan_big);
  120 EVCNT_ATTACH_STATIC(sosend_copy_big);
  121 EVCNT_ATTACH_STATIC(sosend_copy_small);
  122 EVCNT_ATTACH_STATIC(sosend_kvalimit);
  123 #else
  124 
  125 #define SOSEND_COUNTER_INCR(ev)         /* nothing */
  126 
  127 #endif /* SOSEND_COUNTERS */
  128 
  129 void
  130 soinit(void)
  131 {
  132 
  133         /* Set the initial adjusted socket buffer size. */
  134         if (sb_max_set(sb_max))
  135                 panic("bad initial sb_max value: %lu\n", sb_max);
  136 
  137 }
  138 
  139 #ifdef SOSEND_NO_LOAN
  140 int use_sosend_loan = 0;
  141 #else
  142 int use_sosend_loan = 1;
  143 #endif
  144 
  145 struct simplelock so_pendfree_slock = SIMPLELOCK_INITIALIZER;
  146 struct mbuf *so_pendfree;
  147 
  148 #ifndef SOMAXKVA
  149 #define SOMAXKVA (16 * 1024 * 1024)
  150 #endif
  151 int somaxkva = SOMAXKVA;
  152 int socurkva;
  153 int sokvawaiters;
  154 
  155 #define SOCK_LOAN_THRESH        4096
  156 #define SOCK_LOAN_CHUNK         65536
  157 
  158 static size_t sodopendfree(struct socket *);
  159 static size_t sodopendfreel(struct socket *);
  160 static __inline vsize_t sokvareserve(struct socket *, vsize_t);
  161 static __inline void sokvaunreserve(vsize_t);
  162 
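/*
 * sokvareserve: reserve "len" bytes of loan kva against the somaxkva
 * limit.  If the limit would be exceeded, first try to reclaim kva by
 * freeing the pendfree list, then sleep until a loan is returned.
 * Returns the amount reserved, or 0 if the sleep was interrupted.
 */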
  163 static __inline vsize_t
  164 sokvareserve(struct socket *so, vsize_t len)
  165 {
  166         int s;
  167         int error;
  168 
  169         s = splvm();
  170         simple_lock(&so_pendfree_slock);
  171         while (socurkva + len > somaxkva) {
  172                 size_t freed;
  173 
  174                 /*
  175                  * try to do pendfree.
  176                  */
  177 
  178                 freed = sodopendfreel(so);
  179 
  180                 /*
  181                  * if some kva was freed, try again.
  182                  */
  183 
  184                 if (freed)
  185                         continue;
  186 
  187                 SOSEND_COUNTER_INCR(&sosend_kvalimit);
  188                 sokvawaiters++;
  189                 error = ltsleep(&socurkva, PVM | PCATCH, "sokva", 0,
  190                     &so_pendfree_slock);
  191                 sokvawaiters--;
  192                 if (error) {
  193                         len = 0;
  194                         break;
  195                 }
  196         }
  197         socurkva += len;
  198         simple_unlock(&so_pendfree_slock);
  199         splx(s);
  200         return len;
  201 }
  202 
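/*
 * sokvaunreserve: return "len" bytes to the loan kva accounting and
 * wake up any threads waiting in sokvareserve().
 */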
  203 static __inline void
  204 sokvaunreserve(vsize_t len)
  205 {
  206         int s;
  207 
  208         s = splvm();
  209         simple_lock(&so_pendfree_slock);
  210         socurkva -= len;
  211         if (sokvawaiters)
  212                 wakeup(&socurkva);
  213         simple_unlock(&so_pendfree_slock);
  214         splx(s);
  215 }
  216 
  217 /*
  218  * sokvaalloc: allocate kva for loan.
  219  */
  220 
  221 vaddr_t
  222 sokvaalloc(vsize_t len, struct socket *so)
  223 {
  224         vaddr_t lva;
  225 
  226         /*
  227          * reserve kva.
  228          */
  229 
  230         if (sokvareserve(so, len) == 0)
  231                 return 0;
  232 
  233         /*
  234          * allocate kva.
  235          */
  236 
  237         lva = uvm_km_valloc_wait(kernel_map, len);
  238         if (lva == 0) {
  239                 sokvaunreserve(len);
  240                 return (0);
  241         }
  242 
  243         return lva;
  244 }
  245 
  246 /*
  247  * sokvafree: free kva for loan.
  248  */
  249 
  250 void
  251 sokvafree(vaddr_t sva, vsize_t len)
  252 {
  253 
  254         /*
  255          * free kva.
  256          */
  257 
  258         uvm_km_free(kernel_map, sva, len);
  259 
  260         /*
  261          * unreserve kva.
  262          */
  263 
  264         sokvaunreserve(len);
  265 }
  266 
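/*
 * sodoloanfree: tear down a completed loan: unmap the pages from
 * kernel va, hand them back to uvm with uvm_unloan(), and release the
 * kva and its reservation.  If "pgs" is NULL, the page array is
 * reconstructed from the mappings with pmap_extract().
 */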
  267 static void
  268 sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size)
  269 {
  270         vaddr_t va, sva, eva;
  271         vsize_t len;
  272         paddr_t pa;
  273         int i, npgs;
  274 
  275         eva = round_page((vaddr_t) buf + size);
  276         sva = trunc_page((vaddr_t) buf);
  277         len = eva - sva;
  278         npgs = len >> PAGE_SHIFT;
  279 
  280         if (__predict_false(pgs == NULL)) {
  281                 pgs = alloca(npgs * sizeof(*pgs));
  282 
  283                 for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) {
  284                         if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
  285                                 panic("sodoloanfree: va 0x%lx not mapped", va);
  286                         pgs[i] = PHYS_TO_VM_PAGE(pa);
  287                 }
  288         }
  289 
  290         pmap_kremove(sva, len);
  291         pmap_update(pmap_kernel());
  292         uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
  293         sokvafree(sva, len);
  294 }
  295 
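/*
 * sodopendfree: wrapper around sodopendfreel() that raises spl and
 * takes so_pendfree_slock itself.
 */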
  296 static size_t
  297 sodopendfree(struct socket *so)
  298 {
  299         int s;
  300         size_t rv;
  301 
  302         s = splvm();
  303         simple_lock(&so_pendfree_slock);
  304         rv = sodopendfreel(so);
  305         simple_unlock(&so_pendfree_slock);
  306         splx(s);
  307 
  308         return rv;
  309 }
  310 
  311 /*
  312  * sodopendfreel: free mbufs on "pendfree" list.
  313  * unlock and relock so_pendfree_slock when freeing mbufs.
  314  *
  315  * => called with so_pendfree_slock held.
  316  * => called at splvm.
  317  */
  318 
  319 static size_t
  320 sodopendfreel(struct socket *so)
  321 {
  322         size_t rv = 0;
  323 
  324         LOCK_ASSERT(simple_lock_held(&so_pendfree_slock));
  325 
  326         for (;;) {
  327                 struct mbuf *m;
  328                 struct mbuf *next;
  329 
  330                 m = so_pendfree;
  331                 if (m == NULL)
  332                         break;
  333                 so_pendfree = NULL;
  334                 simple_unlock(&so_pendfree_slock);
  335                 /* XXX splx */
  336 
  337                 for (; m != NULL; m = next) {
  338                         next = m->m_next;
  339 
  340                         rv += m->m_ext.ext_size;
  341                         sodoloanfree((m->m_flags & M_EXT_PAGES) ?
  342                             m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf,
  343                             m->m_ext.ext_size);
  344                         pool_cache_put(&mbpool_cache, m);
  345                 }
  346 
  347                 /* XXX splvm */
  348                 simple_lock(&so_pendfree_slock);
  349         }
  350 
  351         return (rv);
  352 }
  353 
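/*
 * soloanfree: external-storage free callback installed by MEXTADD()
 * in sosend_loan().  Outside of the MEXTREMOVE case, the real cleanup
 * is deferred to the pendfree list because this may run in interrupt
 * context.
 */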
  354 void
  355 soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg)
  356 {
  357         int s;
  358 
  359         if (m == NULL) {
  360 
  361                 /*
  362                  * called from MEXTREMOVE.
  363                  */
  364 
  365                 sodoloanfree(NULL, buf, size);
  366                 return;
  367         }
  368 
  369         /*
  370          * postpone freeing mbuf.
  371          *
  372          * we can't do it in interrupt context
  373          * because we need to put kva back to kernel_map.
  374          */
  375 
  376         s = splvm();
  377         simple_lock(&so_pendfree_slock);
  378         m->m_next = so_pendfree;
  379         so_pendfree = m;
  380         if (sokvawaiters)
  381                 wakeup(&socurkva);
  382         simple_unlock(&so_pendfree_slock);
  383         splx(s);
  384 }
  385 
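/*
 * sosend_loan: attempt a zero-copy send by loaning the pages backing
 * the user's iovec (uvm_loan), mapping them read-only into kernel va,
 * and attaching them to mbuf "m" as external storage.  Returns the
 * number of bytes loaned, or 0 if loaning is not possible, in which
 * case the caller falls back to copying the data.
 */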
  386 static long
  387 sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
  388 {
  389         struct iovec *iov = uio->uio_iov;
  390         vaddr_t sva, eva;
  391         vsize_t len;
  392         vaddr_t lva, va;
  393         int npgs, i, error;
  394 
  395         if (uio->uio_segflg != UIO_USERSPACE)
  396                 return (0);
  397 
  398         if (iov->iov_len < (size_t) space)
  399                 space = iov->iov_len;
  400         if (space > SOCK_LOAN_CHUNK)
  401                 space = SOCK_LOAN_CHUNK;
  402 
  403         eva = round_page((vaddr_t) iov->iov_base + space);
  404         sva = trunc_page((vaddr_t) iov->iov_base);
  405         len = eva - sva;
  406         npgs = len >> PAGE_SHIFT;
  407 
  408         /* XXX KDASSERT */
  409         KASSERT(npgs <= M_EXT_MAXPAGES);
  410         KASSERT(uio->uio_procp != NULL);
  411 
  412         lva = sokvaalloc(len, so);
  413         if (lva == 0)
  414                 return 0;
  415 
  416         error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
  417             m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
  418         if (error) {
  419                 sokvafree(lva, len);
  420                 return (0);
  421         }
  422 
  423         for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
  424                 pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
  425                     VM_PROT_READ);
  426         pmap_update(pmap_kernel());
  427 
  428         lva += (vaddr_t) iov->iov_base & PAGE_MASK;
  429 
  430         MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);
  431         m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
  432 
  433         uio->uio_resid -= space;
  434         /* uio_offset not updated, not set/used for write(2) */
  435         uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space;
  436         uio->uio_iov->iov_len -= space;
  437         if (uio->uio_iov->iov_len == 0) {
  438                 uio->uio_iov++;
  439                 uio->uio_iovcnt--;
  440         }
  441 
  442         return (space);
  443 }
  444 
  445 /*
  446  * Socket operation routines.
  447  * These routines are called by the routines in
  448  * sys_socket.c or from a system process, and
  449  * implement the semantics of socket operations by
  450  * switching out to the protocol specific routines.
  451  */
  452 /*ARGSUSED*/
  453 int
  454 socreate(int dom, struct socket **aso, int type, int proto, struct proc *p)
  455 {
  456         const struct protosw    *prp;
  457         struct socket   *so;
  458         int             error, s;
  459 
  460         if (proto)
  461                 prp = pffindproto(dom, proto, type);
  462         else
  463                 prp = pffindtype(dom, type);
  464         if (prp == 0 || prp->pr_usrreq == 0)
  465                 return (EPROTONOSUPPORT);
  466         if (prp->pr_type != type)
  467                 return (EPROTOTYPE);
  468         s = splsoftnet();
  469         so = pool_get(&socket_pool, PR_WAITOK);
  470         memset((caddr_t)so, 0, sizeof(*so));
  471         TAILQ_INIT(&so->so_q0);
  472         TAILQ_INIT(&so->so_q);
  473         so->so_type = type;
  474         so->so_proto = prp;
  475         so->so_send = sosend;
  476         so->so_receive = soreceive;
  477 #ifdef MBUFTRACE
  478         so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
  479         so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
  480         so->so_mowner = &prp->pr_domain->dom_mowner;
  481 #endif
  482         if (p != 0)
  483                 so->so_uid = p->p_ucred->cr_uid;
  484         else
  485                 so->so_uid = UID_MAX;
  486         error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
  487             (struct mbuf *)(long)proto, (struct mbuf *)0, p);
  488         if (error) {
  489                 so->so_state |= SS_NOFDREF;
  490                 sofree(so);
  491                 splx(s);
  492                 return (error);
  493         }
  494         splx(s);
  495         *aso = so;
  496         return (0);
  497 }
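
/*
 * Example (sketch): a caller such as the socket(2) system call
 * typically creates a socket roughly as follows; the file-descriptor
 * bookkeeping itself lives in uipc_syscalls.c:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0, p);
 *	if (error)
 *		return (error);
 *	(then wrap "so" in a struct file and return its descriptor)
 */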
  498 
  499 int
  500 sobind(struct socket *so, struct mbuf *nam, struct proc *p)
  501 {
  502         int     s, error;
  503 
  504         s = splsoftnet();
  505         error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
  506             nam, (struct mbuf *)0, p);
  507         splx(s);
  508         return (error);
  509 }
  510 
  511 int
  512 solisten(struct socket *so, int backlog)
  513 {
  514         int     s, error;
  515 
  516         s = splsoftnet();
  517         error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
  518             (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
  519         if (error) {
  520                 splx(s);
  521                 return (error);
  522         }
  523         if (TAILQ_EMPTY(&so->so_q))
  524                 so->so_options |= SO_ACCEPTCONN;
  525         if (backlog < 0)
  526                 backlog = 0;
  527         so->so_qlimit = min(backlog, somaxconn);
  528         splx(s);
  529         return (0);
  530 }
  531 
  532 void
  533 sofree(struct socket *so)
  534 {
  535 
  536         if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
  537                 return;
  538         if (so->so_head) {
  539                 /*
  540                  * We must not decommission a socket that's on the accept(2)
  541                  * queue.  If we do, then accept(2) may hang after select(2)
  542                  * indicated that the listening socket was ready.
  543                  */
  544                 if (!soqremque(so, 0))
  545                         return;
  546         }
  547         if (so->so_rcv.sb_hiwat)
  548                 (void)chgsbsize(so->so_uid, &so->so_rcv.sb_hiwat, 0,
  549                     RLIM_INFINITY);
  550         if (so->so_snd.sb_hiwat)
  551                 (void)chgsbsize(so->so_uid, &so->so_snd.sb_hiwat, 0,
  552                     RLIM_INFINITY);
  553         sbrelease(&so->so_snd, so);
  554         sorflush(so);
  555         pool_put(&socket_pool, so);
  556 }
  557 
  558 /*
  559  * Close a socket on last file table reference removal.
  560  * Initiate disconnect if connected.
  561  * Free socket when disconnect complete.
  562  */
  563 int
  564 soclose(struct socket *so)
  565 {
  566         struct socket   *so2;
  567         int             s, error;
  568 
  569         error = 0;
  570         s = splsoftnet();               /* conservative */
  571         if (so->so_options & SO_ACCEPTCONN) {
  572                 while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
  573                         (void) soqremque(so2, 0);
  574                         (void) soabort(so2);
  575                 }
  576                 while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
  577                         (void) soqremque(so2, 1);
  578                         (void) soabort(so2);
  579                 }
  580         }
  581         if (so->so_pcb == 0)
  582                 goto discard;
  583         if (so->so_state & SS_ISCONNECTED) {
  584                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
  585                         error = sodisconnect(so);
  586                         if (error)
  587                                 goto drop;
  588                 }
  589                 if (so->so_options & SO_LINGER) {
  590                         if ((so->so_state & SS_ISDISCONNECTING) &&
  591                             (so->so_state & SS_NBIO))
  592                                 goto drop;
  593                         while (so->so_state & SS_ISCONNECTED) {
  594                                 error = tsleep((caddr_t)&so->so_timeo,
  595                                                PSOCK | PCATCH, netcls,
  596                                                so->so_linger * hz);
  597                                 if (error)
  598                                         break;
  599                         }
  600                 }
  601         }
  602  drop:
  603         if (so->so_pcb) {
  604                 int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
  605                     (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
  606                     (struct proc *)0);
  607                 if (error == 0)
  608                         error = error2;
  609         }
  610  discard:
  611         if (so->so_state & SS_NOFDREF)
  612                 panic("soclose: NOFDREF");
  613         so->so_state |= SS_NOFDREF;
  614         sofree(so);
  615         splx(s);
  616         return (error);
  617 }
  618 
  619 /*
  620  * Must be called at splsoftnet...
  621  */
  622 int
  623 soabort(struct socket *so)
  624 {
  625 
  626         return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
  627             (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
  628 }
  629 
  630 int
  631 soaccept(struct socket *so, struct mbuf *nam)
  632 {
  633         int     s, error;
  634 
  635         error = 0;
  636         s = splsoftnet();
  637         if ((so->so_state & SS_NOFDREF) == 0)
  638                 panic("soaccept: !NOFDREF");
  639         so->so_state &= ~SS_NOFDREF;
  640         if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
  641             (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
  642                 error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
  643                     (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0);
  644         else
  645                 error = ECONNABORTED;
  646 
  647         splx(s);
  648         return (error);
  649 }
  650 
  651 int
  652 soconnect(struct socket *so, struct mbuf *nam, struct proc *p)
  653 {
  654         int             s, error;
  655 
  656         if (so->so_options & SO_ACCEPTCONN)
  657                 return (EOPNOTSUPP);
  658         s = splsoftnet();
  659         /*
  660          * If protocol is connection-based, can only connect once.
  661          * Otherwise, if connected, try to disconnect first.
  662          * This allows user to disconnect by connecting to, e.g.,
  663          * a null address.
  664          */
  665         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
  666             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
  667             (error = sodisconnect(so))))
  668                 error = EISCONN;
  669         else
  670                 error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
  671                     (struct mbuf *)0, nam, (struct mbuf *)0, p);
  672         splx(s);
  673         return (error);
  674 }
  675 
  676 int
  677 soconnect2(struct socket *so1, struct socket *so2)
  678 {
  679         int     s, error;
  680 
  681         s = splsoftnet();
  682         error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
  683             (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
  684             (struct proc *)0);
  685         splx(s);
  686         return (error);
  687 }
  688 
  689 int
  690 sodisconnect(struct socket *so)
  691 {
  692         int     s, error;
  693 
  694         s = splsoftnet();
  695         if ((so->so_state & SS_ISCONNECTED) == 0) {
  696                 error = ENOTCONN;
  697                 goto bad;
  698         }
  699         if (so->so_state & SS_ISDISCONNECTING) {
  700                 error = EALREADY;
  701                 goto bad;
  702         }
  703         error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
  704             (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
  705             (struct proc *)0);
  706  bad:
  707         splx(s);
  708         sodopendfree(so);
  709         return (error);
  710 }
  711 
  712 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
  713 /*
  714  * Send on a socket.
  715  * If send must go all at once and message is larger than
  716  * send buffering, then hard error.
  717  * Lock against other senders.
  718  * If must go all at once and not enough room now, then
  719  * inform user that this would block and do nothing.
  720  * Otherwise, if nonblocking, send as much as possible.
  721  * The data to be sent is described by "uio" if nonzero,
  722  * otherwise by the mbuf chain "top" (which must be null
  723  * if uio is not).  Data provided in mbuf chain must be small
  724  * enough to send all at once.
  725  *
  726  * Returns nonzero on error, timeout or signal; callers
  727  * must check for short counts if EINTR/ERESTART are returned.
  728  * Data and control buffers are freed on return.
  729  */
  730 int
  731 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
  732         struct mbuf *control, int flags, struct proc *p)
  733 {
  734         struct mbuf     **mp, *m;
  735         long            space, len, resid, clen, mlen;
  736         int             error, s, dontroute, atomic;
  737 
  738         sodopendfree(so);
  739 
  740         clen = 0;
  741         atomic = sosendallatonce(so) || top;
  742         if (uio)
  743                 resid = uio->uio_resid;
  744         else
  745                 resid = top->m_pkthdr.len;
  746         /*
  747          * In theory resid should be unsigned.
  748          * However, space must be signed, as it might be less than 0
  749          * if we over-committed, and we must use a signed comparison
  750          * of space and resid.  On the other hand, a negative resid
  751          * causes us to loop sending 0-length segments to the protocol.
  752          */
  753         if (resid < 0) {
  754                 error = EINVAL;
  755                 goto out;
  756         }
  757         dontroute =
  758             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
  759             (so->so_proto->pr_flags & PR_ATOMIC);
  760         if (p)
  761                 p->p_stats->p_ru.ru_msgsnd++;
  762         if (control)
  763                 clen = control->m_len;
  764 #define snderr(errno)   { error = errno; splx(s); goto release; }
  765 
  766  restart:
  767         if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
  768                 goto out;
  769         do {
  770                 s = splsoftnet();
  771                 if (so->so_state & SS_CANTSENDMORE)
  772                         snderr(EPIPE);
  773                 if (so->so_error) {
  774                         error = so->so_error;
  775                         so->so_error = 0;
  776                         splx(s);
  777                         goto release;
  778                 }
  779                 if ((so->so_state & SS_ISCONNECTED) == 0) {
  780                         if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
  781                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
  782                                     !(resid == 0 && clen != 0))
  783                                         snderr(ENOTCONN);
  784                         } else if (addr == 0)
  785                                 snderr(EDESTADDRREQ);
  786                 }
  787                 space = sbspace(&so->so_snd);
  788                 if (flags & MSG_OOB)
  789                         space += 1024;
  790                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
  791                     clen > so->so_snd.sb_hiwat)
  792                         snderr(EMSGSIZE);
  793                 if (space < resid + clen &&
  794                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
  795                         if (so->so_state & SS_NBIO)
  796                                 snderr(EWOULDBLOCK);
  797                         sbunlock(&so->so_snd);
  798                         error = sbwait(&so->so_snd);
  799                         splx(s);
  800                         if (error)
  801                                 goto out;
  802                         goto restart;
  803                 }
  804                 splx(s);
  805                 mp = &top;
  806                 space -= clen;
  807                 do {
  808                         if (uio == NULL) {
  809                                 /*
  810                                  * Data is prepackaged in "top".
  811                                  */
  812                                 resid = 0;
  813                                 if (flags & MSG_EOR)
  814                                         top->m_flags |= M_EOR;
  815                         } else do {
  816                                 if (top == 0) {
  817                                         m = m_gethdr(M_WAIT, MT_DATA);
  818                                         mlen = MHLEN;
  819                                         m->m_pkthdr.len = 0;
  820                                         m->m_pkthdr.rcvif = (struct ifnet *)0;
  821                                 } else {
  822                                         m = m_get(M_WAIT, MT_DATA);
  823                                         mlen = MLEN;
  824                                 }
  825                                 MCLAIM(m, so->so_snd.sb_mowner);
  826                                 if (use_sosend_loan &&
  827                                     uio->uio_iov->iov_len >= SOCK_LOAN_THRESH &&
  828                                     space >= SOCK_LOAN_THRESH &&
  829                                     (len = sosend_loan(so, uio, m,
  830                                                        space)) != 0) {
  831                                         SOSEND_COUNTER_INCR(&sosend_loan_big);
  832                                         space -= len;
  833                                         goto have_data;
  834                                 }
  835                                 if (resid >= MINCLSIZE && space >= MCLBYTES) {
  836                                         SOSEND_COUNTER_INCR(&sosend_copy_big);
  837                                         m_clget(m, M_WAIT);
  838                                         if ((m->m_flags & M_EXT) == 0)
  839                                                 goto nopages;
  840                                         mlen = MCLBYTES;
  841                                         if (atomic && top == 0) {
  842                                                 len = lmin(MCLBYTES - max_hdr,
  843                                                     resid);
  844                                                 m->m_data += max_hdr;
  845                                         } else
  846                                                 len = lmin(MCLBYTES, resid);
  847                                         space -= len;
  848                                 } else {
  849  nopages:
  850                                         SOSEND_COUNTER_INCR(&sosend_copy_small);
  851                                         len = lmin(lmin(mlen, resid), space);
  852                                         space -= len;
  853                                         /*
  854                                          * For datagram protocols, leave room
  855                                          * for protocol headers in first mbuf.
  856                                          */
  857                                         if (atomic && top == 0 && len < mlen)
  858                                                 MH_ALIGN(m, len);
  859                                 }
  860                                 error = uiomove(mtod(m, caddr_t), (int)len,
  861                                     uio);
  862  have_data:
  863                                 resid = uio->uio_resid;
  864                                 m->m_len = len;
  865                                 *mp = m;
  866                                 top->m_pkthdr.len += len;
  867                                 if (error)
  868                                         goto release;
  869                                 mp = &m->m_next;
  870                                 if (resid <= 0) {
  871                                         if (flags & MSG_EOR)
  872                                                 top->m_flags |= M_EOR;
  873                                         break;
  874                                 }
  875                         } while (space > 0 && atomic);
  876 
  877                         s = splsoftnet();
  878 
  879                         if (so->so_state & SS_CANTSENDMORE)
  880                                 snderr(EPIPE);
  881 
  882                         if (dontroute)
  883                                 so->so_options |= SO_DONTROUTE;
  884                         if (resid > 0)
  885                                 so->so_state |= SS_MORETOCOME;
  886                         error = (*so->so_proto->pr_usrreq)(so,
  887                             (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
  888                             top, addr, control, p);
  889                         if (dontroute)
  890                                 so->so_options &= ~SO_DONTROUTE;
  891                         if (resid > 0)
  892                                 so->so_state &= ~SS_MORETOCOME;
  893                         splx(s);
  894 
  895                         clen = 0;
  896                         control = 0;
  897                         top = 0;
  898                         mp = &top;
  899                         if (error)
  900                                 goto release;
  901                 } while (resid && space > 0);
  902         } while (resid);
  903 
  904  release:
  905         sbunlock(&so->so_snd);
  906  out:
  907         if (top)
  908                 m_freem(top);
  909         if (control)
  910                 m_freem(control);
  911         return (error);
  912 }
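
/*
 * Note: sosend() is installed as so->so_send by socreate() above and
 * is normally reached through that hook from the sendmsg(2)/sendto(2)
 * path in uipc_syscalls.c, roughly:
 *
 *	error = (*so->so_send)(so, addr, &auio, NULL, control, flags, p);
 */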
  913 
  914 /*
  915  * Implement receive operations on a socket.
  916  * We depend on the way that records are added to the sockbuf
  917  * by sbappend*.  In particular, each record (mbufs linked through m_next)
  918  * must begin with an address if the protocol so specifies,
  919  * followed by an optional mbuf or mbufs containing ancillary data,
  920  * and then zero or more mbufs of data.
  921  * In order to avoid blocking network interrupts for the entire time here,
  922  * we splx() while doing the actual copy to user space.
  923  * Although the sockbuf is locked, new data may still be appended,
  924  * and thus we must maintain consistency of the sockbuf during that time.
  925  *
  926  * The caller may receive the data as a single mbuf chain by supplying
  927  * an mbuf **mp0 for use in returning the chain.  The uio is then used
  928  * only for the count in uio_resid.
  929  */
  930 int
  931 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
  932         struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
  933 {
  934         struct proc * p;
  935         struct mbuf     *m, **mp;
  936         int             flags, len, error, s, offset, moff, type, orig_resid;
  937         const struct protosw    *pr;
  938         struct mbuf     *nextrecord;
  939         int             mbuf_removed = 0;
  940 
  941         pr = so->so_proto;
  942         mp = mp0;
  943         type = 0;
  944         orig_resid = uio->uio_resid;
  945         p = uio->uio_procp;
  946 
  947         if (paddr)
  948                 *paddr = 0;
  949         if (controlp)
  950                 *controlp = 0;
  951         if (flagsp)
  952                 flags = *flagsp &~ MSG_EOR;
  953         else
  954                 flags = 0;
  955 
  956         if ((flags & MSG_DONTWAIT) == 0)
  957                 sodopendfree(so);
  958 
  959         if (flags & MSG_OOB) {
  960                 m = m_get(M_WAIT, MT_DATA);
  961                 error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
  962                     (struct mbuf *)(long)(flags & MSG_PEEK),
  963                     (struct mbuf *)0, p);
  964                 if (error)
  965                         goto bad;
  966                 do {
  967                         error = uiomove(mtod(m, caddr_t),
  968                             (int) min(uio->uio_resid, m->m_len), uio);
  969                         m = m_free(m);
  970                 } while (uio->uio_resid && error == 0 && m);
  971  bad:
  972                 if (m)
  973                         m_freem(m);
  974                 return (error);
  975         }
  976         if (mp)
  977                 *mp = (struct mbuf *)0;
  978         if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
  979                 (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
  980                     (struct mbuf *)0, (struct mbuf *)0, p);
  981 
  982  restart:
  983         if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
  984                 return (error);
  985         s = splsoftnet();
  986 
  987         m = so->so_rcv.sb_mb;
  988         /*
  989          * If we have less data than requested, block awaiting more
  990          * (subject to any timeout) if:
  991          *   1. the current count is less than the low water mark,
  992          *   2. MSG_WAITALL is set, and it is possible to do the entire
  993          *      receive operation at once if we block (resid <= hiwat), or
  994          *   3. MSG_DONTWAIT is not set.
  995          * If MSG_WAITALL is set but resid is larger than the receive buffer,
  996          * we have to do the receive in sections, and thus risk returning
  997          * a short count if a timeout or signal occurs after we start.
  998          */
  999         if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
 1000             so->so_rcv.sb_cc < uio->uio_resid) &&
 1001             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
 1002             ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
 1003             m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
 1004 #ifdef DIAGNOSTIC
 1005                 if (m == 0 && so->so_rcv.sb_cc)
 1006                         panic("receive 1");
 1007 #endif
 1008                 if (so->so_error) {
 1009                         if (m)
 1010                                 goto dontblock;
 1011                         error = so->so_error;
 1012                         if ((flags & MSG_PEEK) == 0)
 1013                                 so->so_error = 0;
 1014                         goto release;
 1015                 }
 1016                 if (so->so_state & SS_CANTRCVMORE) {
 1017                         if (m)
 1018                                 goto dontblock;
 1019                         else
 1020                                 goto release;
 1021                 }
 1022                 for (; m; m = m->m_next)
 1023                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
 1024                                 m = so->so_rcv.sb_mb;
 1025                                 goto dontblock;
 1026                         }
 1027                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
 1028                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
 1029                         error = ENOTCONN;
 1030                         goto release;
 1031                 }
 1032                 if (uio->uio_resid == 0)
 1033                         goto release;
 1034                 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
 1035                         error = EWOULDBLOCK;
 1036                         goto release;
 1037                 }
 1038                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
 1039                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
 1040                 sbunlock(&so->so_rcv);
 1041                 error = sbwait(&so->so_rcv);
 1042                 splx(s);
 1043                 if (error)
 1044                         return (error);
 1045                 goto restart;
 1046         }
 1047  dontblock:
 1048         /*
 1049          * On entry here, m points to the first record of the socket buffer.
 1050          * While we process the initial mbufs containing address and control
 1051          * info, we save a copy of m->m_nextpkt into nextrecord.
 1052          */
 1053         if (p)
 1054                 p->p_stats->p_ru.ru_msgrcv++;
 1055         KASSERT(m == so->so_rcv.sb_mb);
 1056         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
 1057         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
 1058         nextrecord = m->m_nextpkt;
 1059         if (pr->pr_flags & PR_ADDR) {
 1060 #ifdef DIAGNOSTIC
 1061                 if (m->m_type != MT_SONAME)
 1062                         panic("receive 1a");
 1063 #endif
 1064                 orig_resid = 0;
 1065                 if (flags & MSG_PEEK) {
 1066                         if (paddr)
 1067                                 *paddr = m_copy(m, 0, m->m_len);
 1068                         m = m->m_next;
 1069                 } else {
 1070                         sbfree(&so->so_rcv, m);
 1071                         mbuf_removed = 1;
 1072                         if (paddr) {
 1073                                 *paddr = m;
 1074                                 so->so_rcv.sb_mb = m->m_next;
 1075                                 m->m_next = 0;
 1076                                 m = so->so_rcv.sb_mb;
 1077                         } else {
 1078                                 MFREE(m, so->so_rcv.sb_mb);
 1079                                 m = so->so_rcv.sb_mb;
 1080                         }
 1081                 }
 1082         }
 1083         while (m && m->m_type == MT_CONTROL && error == 0) {
 1084                 if (flags & MSG_PEEK) {
 1085                         if (controlp)
 1086                                 *controlp = m_copy(m, 0, m->m_len);
 1087                         m = m->m_next;
 1088                 } else {
 1089                         sbfree(&so->so_rcv, m);
 1090                         mbuf_removed = 1;
 1091                         if (controlp) {
 1092                                 struct domain *dom = pr->pr_domain;
 1093                                 if (dom->dom_externalize && p &&
 1094                                     mtod(m, struct cmsghdr *)->cmsg_type ==
 1095                                     SCM_RIGHTS)
 1096                                         error = (*dom->dom_externalize)(m, p);
 1097                                 *controlp = m;
 1098                                 so->so_rcv.sb_mb = m->m_next;
 1099                                 m->m_next = 0;
 1100                                 m = so->so_rcv.sb_mb;
 1101                         } else {
 1102                                 /*
 1103                                  * Dispose of any SCM_RIGHTS message that went
 1104                                  * through the read path rather than recv.
 1105                                  */
 1106                                 if (pr->pr_domain->dom_dispose &&
 1107                                     mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
 1108                                         (*pr->pr_domain->dom_dispose)(m);
 1109                                 MFREE(m, so->so_rcv.sb_mb);
 1110                                 m = so->so_rcv.sb_mb;
 1111                         }
 1112                 }
 1113                 if (controlp) {
 1114                         orig_resid = 0;
 1115                         controlp = &(*controlp)->m_next;
 1116                 }
 1117         }
 1118 
 1119         /*
 1120          * If m is non-NULL, we have some data to read.  From now on,
 1121          * make sure to keep sb_lastrecord consistent when working on
 1122          * the last packet on the chain (nextrecord == NULL) and we
 1123          * change m->m_nextpkt.
 1124          */
 1125         if (m) {
 1126                 if ((flags & MSG_PEEK) == 0) {
 1127                         m->m_nextpkt = nextrecord;
 1128                         /*
 1129                          * If nextrecord == NULL (this is a single chain),
 1130                          * then sb_lastrecord may not be valid here if m
 1131                          * was changed earlier.
 1132                          */
 1133                         if (nextrecord == NULL) {
 1134                                 KASSERT(so->so_rcv.sb_mb == m);
 1135                                 so->so_rcv.sb_lastrecord = m;
 1136                         }
 1137                 }
 1138                 type = m->m_type;
 1139                 if (type == MT_OOBDATA)
 1140                         flags |= MSG_OOB;
 1141         } else {
 1142                 if ((flags & MSG_PEEK) == 0) {
 1143                         KASSERT(so->so_rcv.sb_mb == m);
 1144                         so->so_rcv.sb_mb = nextrecord;
 1145                         SB_EMPTY_FIXUP(&so->so_rcv);
 1146                 }
 1147         }
 1148         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
 1149         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
 1150 
 1151         moff = 0;
 1152         offset = 0;
 1153         while (m && uio->uio_resid > 0 && error == 0) {
 1154                 if (m->m_type == MT_OOBDATA) {
 1155                         if (type != MT_OOBDATA)
 1156                                 break;
 1157                 } else if (type == MT_OOBDATA)
 1158                         break;
 1159 #ifdef DIAGNOSTIC
 1160                 else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
 1161                         panic("receive 3");
 1162 #endif
 1163                 so->so_state &= ~SS_RCVATMARK;
 1164                 len = uio->uio_resid;
 1165                 if (so->so_oobmark && len > so->so_oobmark - offset)
 1166                         len = so->so_oobmark - offset;
 1167                 if (len > m->m_len - moff)
 1168                         len = m->m_len - moff;
 1169                 /*
 1170                  * If mp is set, just pass back the mbufs.
 1171                  * Otherwise copy them out via the uio, then free.
  1172                  * Sockbuf must be consistent here (sb_mb points to the
  1173                  * current mbuf, nextrecord to the next record) when we
  1174                  * drop priority; we must note any additions to the
  1175                  * sockbuf when we block interrupts again.
 1176                  */
 1177                 if (mp == 0) {
 1178                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
 1179                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
 1180                         splx(s);
 1181                         error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
 1182                         s = splsoftnet();
 1183                         if (error) {
 1184                                 /*
 1185                                  * If any part of the record has been removed
 1186                                  * (such as the MT_SONAME mbuf, which will
 1187                                  * happen when PR_ADDR, and thus also
 1188                                  * PR_ATOMIC, is set), then drop the entire
 1189                                  * record to maintain the atomicity of the
 1190                                  * receive operation.
 1191                                  *
 1192                                  * This avoids a later panic("receive 1a")
 1193                                  * when compiled with DIAGNOSTIC.
 1194                                  */
 1195                                 if (m && mbuf_removed
 1196                                     && (pr->pr_flags & PR_ATOMIC))
 1197                                         (void) sbdroprecord(&so->so_rcv);
 1198 
 1199                                 goto release;
 1200                         }
 1201                 } else
 1202                         uio->uio_resid -= len;
 1203                 if (len == m->m_len - moff) {
 1204                         if (m->m_flags & M_EOR)
 1205                                 flags |= MSG_EOR;
 1206                         if (flags & MSG_PEEK) {
 1207                                 m = m->m_next;
 1208                                 moff = 0;
 1209                         } else {
 1210                                 nextrecord = m->m_nextpkt;
 1211                                 sbfree(&so->so_rcv, m);
 1212                                 if (mp) {
 1213                                         *mp = m;
 1214                                         mp = &m->m_next;
 1215                                         so->so_rcv.sb_mb = m = m->m_next;
 1216                                         *mp = (struct mbuf *)0;
 1217                                 } else {
 1218                                         MFREE(m, so->so_rcv.sb_mb);
 1219                                         m = so->so_rcv.sb_mb;
 1220                                 }
 1221                                 /*
 1222                                  * If m != NULL, we also know that
 1223                                  * so->so_rcv.sb_mb != NULL.
 1224                                  */
 1225                                 KASSERT(so->so_rcv.sb_mb == m);
 1226                                 if (m) {
 1227                                         m->m_nextpkt = nextrecord;
 1228                                         if (nextrecord == NULL)
 1229                                                 so->so_rcv.sb_lastrecord = m;
 1230                                 } else {
 1231                                         so->so_rcv.sb_mb = nextrecord;
 1232                                         SB_EMPTY_FIXUP(&so->so_rcv);
 1233                                 }
 1234                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
 1235                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
 1236                         }
 1237                 } else {
 1238                         if (flags & MSG_PEEK)
 1239                                 moff += len;
 1240                         else {
 1241                                 if (mp)
 1242                                         *mp = m_copym(m, 0, len, M_WAIT);
 1243                                 m->m_data += len;
 1244                                 m->m_len -= len;
 1245                                 so->so_rcv.sb_cc -= len;
 1246                         }
 1247                 }
 1248                 if (so->so_oobmark) {
 1249                         if ((flags & MSG_PEEK) == 0) {
 1250                                 so->so_oobmark -= len;
 1251                                 if (so->so_oobmark == 0) {
 1252                                         so->so_state |= SS_RCVATMARK;
 1253                                         break;
 1254                                 }
 1255                         } else {
 1256                                 offset += len;
 1257                                 if (offset == so->so_oobmark)
 1258                                         break;
 1259                         }
 1260                 }
 1261                 if (flags & MSG_EOR)
 1262                         break;
 1263                 /*
 1264                  * If the MSG_WAITALL flag is set (for non-atomic socket),
 1265                  * we must not quit until "uio->uio_resid == 0" or an error
 1266                  * termination.  If a signal/timeout occurs, return
 1267                  * with a short count but without error.
 1268                  * Keep sockbuf locked against other readers.
 1269                  */
 1270                 while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
 1271                     !sosendallatonce(so) && !nextrecord) {
 1272                         if (so->so_error || so->so_state & SS_CANTRCVMORE)
 1273                                 break;
 1274                         /*
 1275                          * If we are peeking and the socket receive buffer is
 1276                          * full, stop since we can't get more data to peek at.
 1277                          */
 1278                         if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
 1279                                 break;
 1280                         /*
 1281                          * If we've drained the socket buffer, tell the
 1282                          * protocol in case it needs to do something to
 1283                          * get it filled again.
 1284                          */
 1285                         if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
 1286                                 (*pr->pr_usrreq)(so, PRU_RCVD,
 1287                                     (struct mbuf *)0,
 1288                                     (struct mbuf *)(long)flags,
 1289                                     (struct mbuf *)0, p);
 1290                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
 1291                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
 1292                         error = sbwait(&so->so_rcv);
 1293                         if (error) {
 1294                                 sbunlock(&so->so_rcv);
 1295                                 splx(s);
 1296                                 return (0);
 1297                         }
 1298                         if ((m = so->so_rcv.sb_mb) != NULL)
 1299                                 nextrecord = m->m_nextpkt;
 1300                 }
 1301         }
 1302 
 1303         if (m && pr->pr_flags & PR_ATOMIC) {
 1304                 flags |= MSG_TRUNC;
 1305                 if ((flags & MSG_PEEK) == 0)
 1306                         (void) sbdroprecord(&so->so_rcv);
 1307         }
 1308         if ((flags & MSG_PEEK) == 0) {
 1309                 if (m == 0) {
 1310                         /*
 1311                          * First part is an inline SB_EMPTY_FIXUP().  Second
 1312                          * part makes sure sb_lastrecord is up-to-date if
 1313                          * there is still data in the socket buffer.
 1314                          */
 1315                         so->so_rcv.sb_mb = nextrecord;
 1316                         if (so->so_rcv.sb_mb == NULL) {
 1317                                 so->so_rcv.sb_mbtail = NULL;
 1318                                 so->so_rcv.sb_lastrecord = NULL;
 1319                         } else if (nextrecord->m_nextpkt == NULL)
 1320                                 so->so_rcv.sb_lastrecord = nextrecord;
 1321                 }
 1322                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
 1323                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
 1324                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
 1325                         (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
 1326                             (struct mbuf *)(long)flags, (struct mbuf *)0, p);
 1327         }
 1328         if (orig_resid == uio->uio_resid && orig_resid &&
 1329             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
 1330                 sbunlock(&so->so_rcv);
 1331                 splx(s);
 1332                 goto restart;
 1333         }
 1334 
 1335         if (flagsp)
 1336                 *flagsp |= flags;
 1337  release:
 1338         sbunlock(&so->so_rcv);
 1339         splx(s);
 1340         return (error);
 1341 }
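
The MSG_WAITALL loop above is what lets a stream receive block until the full request is satisfied, while still returning a short count without error on EOF, a caught signal, or a receive timeout. The following is an editor-added userland sketch of that contract, not part of uipc_socket.c; the descriptor fd and the helper name read_exact are illustrative.

#include <sys/types.h>
#include <sys/socket.h>

/*
 * Illustrative sketch: request an exact amount from a connected stream
 * socket.  MSG_WAITALL drives the loop in soreceive() above; a short
 * return is still possible on EOF, a signal, or a receive timeout,
 * matching the "short count but without error" comment there.
 */
static ssize_t
read_exact(int fd, void *buf, size_t want)
{

        return (recv(fd, buf, want, MSG_WAITALL));
}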
 1342 
 1343 int
 1344 soshutdown(struct socket *so, int how)
 1345 {
 1346         const struct protosw    *pr;
 1347 
 1348         pr = so->so_proto;
 1349         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
 1350                 return (EINVAL);
 1351 
 1352         if (how == SHUT_RD || how == SHUT_RDWR)
 1353                 sorflush(so);
 1354         if (how == SHUT_WR || how == SHUT_RDWR)
 1355                 return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
 1356                     (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
 1357         return (0);
 1358 }
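
soshutdown() is the kernel side of shutdown(2): SHUT_RD (or SHUT_RDWR) flushes the receive side through sorflush(), SHUT_WR (or SHUT_RDWR) is handed to the protocol as PRU_SHUTDOWN, and any other value fails with EINVAL. A hedged userland sketch follows; the descriptor fd and the helper name are illustrative, not taken from this file.

#include <sys/socket.h>

/*
 * Illustrative sketch: half-close the sending side so the peer reads
 * EOF.  This reaches the code above as PRU_SHUTDOWN; the receive side
 * of fd remains usable afterwards.
 */
static int
half_close_write(int fd)
{

        return (shutdown(fd, SHUT_WR));
}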
 1359 
 1360 void
 1361 sorflush(struct socket *so)
 1362 {
 1363         struct sockbuf  *sb, asb;
 1364         const struct protosw    *pr;
 1365         int             s;
 1366 
 1367         sb = &so->so_rcv;
 1368         pr = so->so_proto;
 1369         sb->sb_flags |= SB_NOINTR;
 1370         (void) sblock(sb, M_WAITOK);
 1371         s = splnet();
 1372         socantrcvmore(so);
 1373         sbunlock(sb);
 1374         asb = *sb;
 1375         /*
 1376          * Clear most of the sockbuf structure, but leave some of the
 1377          * fields valid.
 1378          */
 1379         memset(&sb->sb_startzero, 0,
 1380             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 1381         splx(s);
 1382         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
 1383                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
 1384         sbrelease(&asb, so);
 1385 }
 1386 
 1387 int
 1388 sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
 1389 {
 1390         int             error;
 1391         struct mbuf     *m;
 1392 
 1393         error = 0;
 1394         m = m0;
 1395         if (level != SOL_SOCKET) {
 1396                 if (so->so_proto && so->so_proto->pr_ctloutput)
 1397                         return ((*so->so_proto->pr_ctloutput)
 1398                                   (PRCO_SETOPT, so, level, optname, &m0));
 1399                 error = ENOPROTOOPT;
 1400         } else {
 1401                 switch (optname) {
 1402 
 1403                 case SO_LINGER:
 1404                         if (m == NULL || m->m_len != sizeof(struct linger)) {
 1405                                 error = EINVAL;
 1406                                 goto bad;
 1407                         }
 1408                         if (mtod(m, struct linger *)->l_linger < 0 ||
 1409                             mtod(m, struct linger *)->l_linger > (INT_MAX / hz)) {
 1410                                 error = EDOM;
 1411                                 goto bad;
 1412                         }
 1413                         so->so_linger = mtod(m, struct linger *)->l_linger;
 1414                         /* fall thru... */
 1415 
 1416                 case SO_DEBUG:
 1417                 case SO_KEEPALIVE:
 1418                 case SO_DONTROUTE:
 1419                 case SO_USELOOPBACK:
 1420                 case SO_BROADCAST:
 1421                 case SO_REUSEADDR:
 1422                 case SO_REUSEPORT:
 1423                 case SO_OOBINLINE:
 1424                 case SO_TIMESTAMP:
 1425                         if (m == NULL || m->m_len < sizeof(int)) {
 1426                                 error = EINVAL;
 1427                                 goto bad;
 1428                         }
 1429                         if (*mtod(m, int *))
 1430                                 so->so_options |= optname;
 1431                         else
 1432                                 so->so_options &= ~optname;
 1433                         break;
 1434 
 1435                 case SO_SNDBUF:
 1436                 case SO_RCVBUF:
 1437                 case SO_SNDLOWAT:
 1438                 case SO_RCVLOWAT:
 1439                     {
 1440                         int optval;
 1441 
 1442                         if (m == NULL || m->m_len < sizeof(int)) {
 1443                                 error = EINVAL;
 1444                                 goto bad;
 1445                         }
 1446 
 1447                         /*
 1448                          * Values < 1 make no sense for any of these
 1449                          * options, so disallow them.
 1450                          */
 1451                         optval = *mtod(m, int *);
 1452                         if (optval < 1) {
 1453                                 error = EINVAL;
 1454                                 goto bad;
 1455                         }
 1456 
 1457                         switch (optname) {
 1458 
 1459                         case SO_SNDBUF:
 1460                         case SO_RCVBUF:
 1461                                 if (sbreserve(optname == SO_SNDBUF ?
 1462                                     &so->so_snd : &so->so_rcv,
 1463                                     (u_long) optval, so) == 0) {
 1464                                         error = ENOBUFS;
 1465                                         goto bad;
 1466                                 }
 1467                                 break;
 1468 
 1469                         /*
 1470                          * Make sure the low-water is never greater than
 1471                          * the high-water.
 1472                          */
 1473                         case SO_SNDLOWAT:
 1474                                 so->so_snd.sb_lowat =
 1475                                     (optval > so->so_snd.sb_hiwat) ?
 1476                                     so->so_snd.sb_hiwat : optval;
 1477                                 break;
 1478                         case SO_RCVLOWAT:
 1479                                 so->so_rcv.sb_lowat =
 1480                                     (optval > so->so_rcv.sb_hiwat) ?
 1481                                     so->so_rcv.sb_hiwat : optval;
 1482                                 break;
 1483                         }
 1484                         break;
 1485                     }
 1486 
 1487                 case SO_SNDTIMEO:
 1488                 case SO_RCVTIMEO:
 1489                     {
 1490                         struct timeval *tv;
 1491                         int val;
 1492 
 1493                         if (m == NULL || m->m_len < sizeof(*tv)) {
 1494                                 error = EINVAL;
 1495                                 goto bad;
 1496                         }
 1497                         tv = mtod(m, struct timeval *);
 1498                         if (tv->tv_sec > (INT_MAX - tv->tv_usec / tick) / hz) {
 1499                                 error = EDOM;
 1500                                 goto bad;
 1501                         }
 1502                         val = tv->tv_sec * hz + tv->tv_usec / tick;
 1503                         if (val == 0 && tv->tv_usec != 0)
 1504                                 val = 1;
 1505 
 1506                         switch (optname) {
 1507 
 1508                         case SO_SNDTIMEO:
 1509                                 so->so_snd.sb_timeo = val;
 1510                                 break;
 1511                         case SO_RCVTIMEO:
 1512                                 so->so_rcv.sb_timeo = val;
 1513                                 break;
 1514                         }
 1515                         break;
 1516                     }
 1517 
 1518                 default:
 1519                         error = ENOPROTOOPT;
 1520                         break;
 1521                 }
 1522                 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
 1523                         (void) ((*so->so_proto->pr_ctloutput)
 1524                                   (PRCO_SETOPT, so, level, optname, &m0));
 1525                         m = NULL;       /* freed by protocol */
 1526                 }
 1527         }
 1528  bad:
 1529         if (m)
 1530                 (void) m_free(m);
 1531         return (error);
 1532 }
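
The SO_SNDTIMEO/SO_RCVTIMEO case in sosetopt() above converts a struct timeval into scheduler ticks as tv_sec * hz + tv_usec / tick, rejects values that would overflow an int with EDOM, and rounds any nonzero sub-tick timeout up to one tick so it cannot silently turn into "no timeout". The standalone sketch below is editor-added and assumes hz = 100 (hence tick = 10000 microseconds), which is only a common configuration, not something this file fixes; the helper name tv_to_ticks is likewise illustrative.

#include <limits.h>

#define HZ_ASSUMED      100                     /* hypothetical hz value */
#define TICK_ASSUMED    (1000000 / HZ_ASSUMED)  /* microseconds per tick */

/*
 * Illustrative mirror of the timeval-to-ticks logic in sosetopt();
 * returns -1 where the kernel path above returns EDOM.
 * Example: 2.5 s -> 2 * 100 + 500000 / 10000 = 250 ticks, and a 1 us
 * timeout rounds up to 1 tick instead of 0.
 */
static int
tv_to_ticks(long sec, long usec)
{
        int val;

        if (sec > (INT_MAX - usec / TICK_ASSUMED) / HZ_ASSUMED)
                return (-1);
        val = sec * HZ_ASSUMED + usec / TICK_ASSUMED;
        if (val == 0 && usec != 0)
                val = 1;
        return (val);
}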
 1533 
 1534 int
 1535 sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
 1536 {
 1537         struct mbuf     *m;
 1538 
 1539         if (level != SOL_SOCKET) {
 1540                 if (so->so_proto && so->so_proto->pr_ctloutput) {
 1541                         return ((*so->so_proto->pr_ctloutput)
 1542                                   (PRCO_GETOPT, so, level, optname, mp));
 1543                 } else
 1544                         return (ENOPROTOOPT);
 1545         } else {
 1546                 m = m_get(M_WAIT, MT_SOOPTS);
 1547                 m->m_len = sizeof(int);
 1548 
 1549                 switch (optname) {
 1550 
 1551                 case SO_LINGER:
 1552                         m->m_len = sizeof(struct linger);
 1553                         mtod(m, struct linger *)->l_onoff =
 1554                                 so->so_options & SO_LINGER;
 1555                         mtod(m, struct linger *)->l_linger = so->so_linger;
 1556                         break;
 1557 
 1558                 case SO_USELOOPBACK:
 1559                 case SO_DONTROUTE:
 1560                 case SO_DEBUG:
 1561                 case SO_KEEPALIVE:
 1562                 case SO_REUSEADDR:
 1563                 case SO_REUSEPORT:
 1564                 case SO_BROADCAST:
 1565                 case SO_OOBINLINE:
 1566                 case SO_TIMESTAMP:
 1567                         *mtod(m, int *) = so->so_options & optname;
 1568                         break;
 1569 
 1570                 case SO_TYPE:
 1571                         *mtod(m, int *) = so->so_type;
 1572                         break;
 1573 
 1574                 case SO_ERROR:
 1575                         *mtod(m, int *) = so->so_error;
 1576                         so->so_error = 0;
 1577                         break;
 1578 
 1579                 case SO_SNDBUF:
 1580                         *mtod(m, int *) = so->so_snd.sb_hiwat;
 1581                         break;
 1582 
 1583                 case SO_RCVBUF:
 1584                         *mtod(m, int *) = so->so_rcv.sb_hiwat;
 1585                         break;
 1586 
 1587                 case SO_SNDLOWAT:
 1588                         *mtod(m, int *) = so->so_snd.sb_lowat;
 1589                         break;
 1590 
 1591                 case SO_RCVLOWAT:
 1592                         *mtod(m, int *) = so->so_rcv.sb_lowat;
 1593                         break;
 1594 
 1595                 case SO_SNDTIMEO:
 1596                 case SO_RCVTIMEO:
 1597                     {
 1598                         int val = (optname == SO_SNDTIMEO ?
 1599                              so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
 1600 
 1601                         m->m_len = sizeof(struct timeval);
 1602                         mtod(m, struct timeval *)->tv_sec = val / hz;
 1603                         mtod(m, struct timeval *)->tv_usec =
 1604                             (val % hz) * tick;
 1605                         break;
 1606                     }
 1607 
 1608                 case SO_OVERFLOWED:
 1609                         *mtod(m, int *) = so->so_rcv.sb_overflowed;
 1610                         break;
 1611 
 1612                 default:
 1613                         (void)m_free(m);
 1614                         return (ENOPROTOOPT);
 1615                 }
 1616                 *mp = m;
 1617                 return (0);
 1618         }
 1619 }
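
sogetopt() reverses that conversion when reporting the timeouts (val / hz seconds plus (val % hz) * tick microseconds) and treats SO_ERROR as read-and-clear, zeroing so_error after copying it out. A minimal editor-added userland sketch of the SO_ERROR behaviour follows; the descriptor fd and the helper name are illustrative.

#include <sys/types.h>
#include <sys/socket.h>

/*
 * Illustrative sketch: fetch and clear a pending socket error.  The
 * kernel side is the SO_ERROR case in sogetopt() above; because it
 * clears so_error, a second call returns 0 unless a new error arrived.
 */
static int
pending_socket_error(int fd)
{
        int err = 0;
        socklen_t len = sizeof(err);

        if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == -1)
                return (-1);
        return (err);
}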
 1620 
 1621 void
 1622 sohasoutofband(struct socket *so)
 1623 {
 1624         fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
 1625         selwakeup(&so->so_rcv.sb_sel);
 1626 }
 1627 
 1628 static void
 1629 filt_sordetach(struct knote *kn)
 1630 {
 1631         struct socket   *so;
 1632 
 1633         so = (struct socket *)kn->kn_fp->f_data;
 1634         SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext);
 1635         if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist))
 1636                 so->so_rcv.sb_flags &= ~SB_KNOTE;
 1637 }
 1638 
 1639 /*ARGSUSED*/
 1640 static int
 1641 filt_soread(struct knote *kn, long hint)
 1642 {
 1643         struct socket   *so;
 1644 
 1645         so = (struct socket *)kn->kn_fp->f_data;
 1646         kn->kn_data = so->so_rcv.sb_cc;
 1647         if (so->so_state & SS_CANTRCVMORE) {
 1648                 kn->kn_flags |= EV_EOF;
 1649                 kn->kn_fflags = so->so_error;
 1650                 return (1);
 1651         }
 1652         if (so->so_error)       /* temporary udp error */
 1653                 return (1);
 1654         if (kn->kn_sfflags & NOTE_LOWAT)
 1655                 return (kn->kn_data >= kn->kn_sdata);
 1656         return (kn->kn_data >= so->so_rcv.sb_lowat);
 1657 }
 1658 
 1659 static void
 1660 filt_sowdetach(struct knote *kn)
 1661 {
 1662         struct socket   *so;
 1663 
 1664         so = (struct socket *)kn->kn_fp->f_data;
 1665         SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext);
 1666         if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist))
 1667                 so->so_snd.sb_flags &= ~SB_KNOTE;
 1668 }
 1669 
 1670 /*ARGSUSED*/
 1671 static int
 1672 filt_sowrite(struct knote *kn, long hint)
 1673 {
 1674         struct socket   *so;
 1675 
 1676         so = (struct socket *)kn->kn_fp->f_data;
 1677         kn->kn_data = sbspace(&so->so_snd);
 1678         if (so->so_state & SS_CANTSENDMORE) {
 1679                 kn->kn_flags |= EV_EOF;
 1680                 kn->kn_fflags = so->so_error;
 1681                 return (1);
 1682         }
 1683         if (so->so_error)       /* temporary udp error */
 1684                 return (1);
 1685         if (((so->so_state & SS_ISCONNECTED) == 0) &&
 1686             (so->so_proto->pr_flags & PR_CONNREQUIRED))
 1687                 return (0);
 1688         if (kn->kn_sfflags & NOTE_LOWAT)
 1689                 return (kn->kn_data >= kn->kn_sdata);
 1690         return (kn->kn_data >= so->so_snd.sb_lowat);
 1691 }
 1692 
 1693 /*ARGSUSED*/
 1694 static int
 1695 filt_solisten(struct knote *kn, long hint)
 1696 {
 1697         struct socket   *so;
 1698 
 1699         so = (struct socket *)kn->kn_fp->f_data;
 1700 
 1701         /*
 1702          * Set kn_data to number of incoming connections, not
 1703          * counting partial (incomplete) connections.
 1704          */
 1705         kn->kn_data = so->so_qlen;
 1706         return (kn->kn_data > 0);
 1707 }
 1708 
 1709 static const struct filterops solisten_filtops =
 1710         { 1, NULL, filt_sordetach, filt_solisten };
 1711 static const struct filterops soread_filtops =
 1712         { 1, NULL, filt_sordetach, filt_soread };
 1713 static const struct filterops sowrite_filtops =
 1714         { 1, NULL, filt_sowdetach, filt_sowrite };
 1715 
 1716 int
 1717 soo_kqfilter(struct file *fp, struct knote *kn)
 1718 {
 1719         struct socket   *so;
 1720         struct sockbuf  *sb;
 1721 
 1722         so = (struct socket *)kn->kn_fp->f_data;
 1723         switch (kn->kn_filter) {
 1724         case EVFILT_READ:
 1725                 if (so->so_options & SO_ACCEPTCONN)
 1726                         kn->kn_fop = &solisten_filtops;
 1727                 else
 1728                         kn->kn_fop = &soread_filtops;
 1729                 sb = &so->so_rcv;
 1730                 break;
 1731         case EVFILT_WRITE:
 1732                 kn->kn_fop = &sowrite_filtops;
 1733                 sb = &so->so_snd;
 1734                 break;
 1735         default:
 1736                 return (1);
 1737         }
 1738         SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext);
 1739         sb->sb_flags |= SB_KNOTE;
 1740         return (0);
 1741 }
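
soo_kqfilter() attaches the listen filter when SO_ACCEPTCONN is set and otherwise the read filter on so_rcv, or the write filter on so_snd; filt_soread() and filt_sowrite() honour NOTE_LOWAT, comparing the buffered byte count against the threshold the caller supplied in the kevent's data field. The editor-added sketch below shows that from userland; the kqueue descriptor kq, the socket fd, and the 512-byte threshold are all illustrative.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

/*
 * Illustrative sketch: block until at least 512 bytes are readable on
 * fd.  NOTE_LOWAT in fflags and the threshold in data are what
 * filt_soread() above stores in kn_sdata and compares against kn_data.
 */
static int
wait_for_lowat(int kq, int fd)
{
        struct kevent kev;

        EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, 0);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
                return (-1);
        return (kevent(kq, NULL, 0, &kev, 1, NULL));
}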
 1742 
 1743 #include <sys/sysctl.h>
 1744 
 1745 static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);
 1746 
 1747 /*
 1748  * sysctl helper routine for kern.somaxkva.  ensures that the given
 1749  * value is not too small.
 1750  * (XXX should we maybe make sure it's not too large as well?)
 1751  */
 1752 static int
 1753 sysctl_kern_somaxkva(SYSCTLFN_ARGS)
 1754 {
 1755         int error, new_somaxkva;
 1756         struct sysctlnode node;
 1757         int s;
 1758 
 1759         new_somaxkva = somaxkva;
 1760         node = *rnode;
 1761         node.sysctl_data = &new_somaxkva;
 1762         error = sysctl_lookup(SYSCTLFN_CALL(&node));
 1763         if (error || newp == NULL)
 1764                 return (error);
 1765 
 1766         if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
 1767                 return (EINVAL);
 1768 
 1769         s = splvm();
 1770         simple_lock(&so_pendfree_slock);
 1771         somaxkva = new_somaxkva;
 1772         wakeup(&socurkva);
 1773         simple_unlock(&so_pendfree_slock);
 1774         splx(s);
 1775 
 1776         return (error);
 1777 }
 1778 
 1779 SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup")
 1780 {
 1781 
 1782         sysctl_createv(clog, 0, NULL, NULL,
 1783                        CTLFLAG_PERMANENT,
 1784                        CTLTYPE_NODE, "kern", NULL,
 1785                        NULL, 0, NULL, 0,
 1786                        CTL_KERN, CTL_EOL);
 1787 
 1788         sysctl_createv(clog, 0, NULL, NULL,
 1789                        CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 1790                        CTLTYPE_INT, "somaxkva",
 1791                        SYSCTL_DESCR("Maximum amount of kernel memory to be "
 1792                                     "used for socket buffers"),
 1793                        sysctl_kern_somaxkva, 0, NULL, 0,
 1794                        CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
 1795 }
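
The setup routine registers kern.somaxkva as a writable integer node; the handler above rejects values below 16 MB with EINVAL and wakes any thread sleeping on socurkva so it can re-check against the new limit. The editor-added sketch below sets the knob from userland with sysctlbyname(3); the helper name and the example value are illustrative only.

#include <sys/param.h>
#include <sys/sysctl.h>

/*
 * Illustrative sketch: raise kern.somaxkva from userland, e.g.
 * set_somaxkva(64 * 1024 * 1024).  Values below 16 MB are rejected
 * with EINVAL by sysctl_kern_somaxkva() above.
 */
static int
set_somaxkva(int new_kva)
{

        return (sysctlbyname("kern.somaxkva", NULL, NULL,
            &new_kva, sizeof(new_kva)));
}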

This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.