The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_socket.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: uipc_socket.c,v 1.97.2.2 2005/10/31 13:37:33 tron Exp $        */
    2 
    3 /*-
    4  * Copyright (c) 2002 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Jason R. Thorpe of Wasabi Systems, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the NetBSD
   21  *      Foundation, Inc. and its contributors.
   22  * 4. Neither the name of The NetBSD Foundation nor the names of its
   23  *    contributors may be used to endorse or promote products derived
   24  *    from this software without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36  * POSSIBILITY OF SUCH DAMAGE.
   37  */
   38 
   39 /*
   40  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   41  *      The Regents of the University of California.  All rights reserved.
   42  *
   43  * Redistribution and use in source and binary forms, with or without
   44  * modification, are permitted provided that the following conditions
   45  * are met:
   46  * 1. Redistributions of source code must retain the above copyright
   47  *    notice, this list of conditions and the following disclaimer.
   48  * 2. Redistributions in binary form must reproduce the above copyright
   49  *    notice, this list of conditions and the following disclaimer in the
   50  *    documentation and/or other materials provided with the distribution.
   51  * 3. Neither the name of the University nor the names of its contributors
   52  *    may be used to endorse or promote products derived from this software
   53  *    without specific prior written permission.
   54  *
   55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   65  * SUCH DAMAGE.
   66  *
   67  *      @(#)uipc_socket.c       8.6 (Berkeley) 5/2/95
   68  */
   69 
   70 #include <sys/cdefs.h>
   71 __KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.97.2.2 2005/10/31 13:37:33 tron Exp $");
   72 
   73 #include "opt_sock_counters.h"
   74 #include "opt_sosend_loan.h"
   75 #include "opt_mbuftrace.h"
   76 #include "opt_somaxkva.h"
   77 
   78 #include <sys/param.h>
   79 #include <sys/systm.h>
   80 #include <sys/proc.h>
   81 #include <sys/file.h>
   82 #include <sys/malloc.h>
   83 #include <sys/mbuf.h>
   84 #include <sys/domain.h>
   85 #include <sys/kernel.h>
   86 #include <sys/protosw.h>
   87 #include <sys/socket.h>
   88 #include <sys/socketvar.h>
   89 #include <sys/signalvar.h>
   90 #include <sys/resourcevar.h>
   91 #include <sys/pool.h>
   92 #include <sys/event.h>
   93 #include <sys/poll.h>
   94 
   95 #include <uvm/uvm.h>
   96 
   97 struct pool     socket_pool;    /* pool of struct socket: set up in soinit(), used by socreate()/sofree() */
   98 
   99 MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options");
  100 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
  101 
      /*
       * somaxconn: the upper bound solisten() applies when it derives
       * so_qlimit from the caller's backlog.
       */
  102 extern int      somaxconn;                      /* patchable (XXX sysctl) */
  103 int             somaxconn = SOMAXCONN;
  104 
  105 #ifdef SOSEND_COUNTERS
      /*
       * Optional event counters for the send path.  They are attached in
       * soinit() and bumped through SOSEND_COUNTER_INCR().  Without
       * SOSEND_COUNTERS the macro expands to nothing, so call sites need
       * no #ifdef of their own.
       */
  106 #include <sys/device.h>
  107 
  108 struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  109     NULL, "sosend", "loan big");
  110 struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  111     NULL, "sosend", "copy big");
  112 struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  113     NULL, "sosend", "copy small");
      /* Bumped by sokvareserve() each time it is about to sleep. */
  114 struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  115     NULL, "sosend", "kva limit");
  116 
  117 #define SOSEND_COUNTER_INCR(ev)         (ev)->ev_count++
  118 
  119 #else
  120 
  121 #define SOSEND_COUNTER_INCR(ev)         /* nothing */
  122 
  123 #endif /* SOSEND_COUNTERS */
  124 
  125 void
  126 soinit(void)
  127 {
              /*
               * soinit: set up this file's global state.  Applies the
               * initial sb_max through sb_max_set() (a nonzero return is
               * fatal), initialises the pool that backs struct socket and,
               * when SOSEND_COUNTERS is defined, attaches the send-path
               * event counters.
               */
  128 
  129         /* Set the initial adjusted socket buffer size. */
  130         if (sb_max_set(sb_max))
  131                 panic("bad initial sb_max value: %lu\n", sb_max);
  132 
  133         pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0,
  134             "sockpl", NULL);
  135 
  136 #ifdef SOSEND_COUNTERS
  137         evcnt_attach_static(&sosend_loan_big);
  138         evcnt_attach_static(&sosend_copy_big);
  139         evcnt_attach_static(&sosend_copy_small);
  140         evcnt_attach_static(&sosend_kvalimit);
  141 #endif /* SOSEND_COUNTERS */
  142 }
  143 
  144 #ifdef SOSEND_NO_LOAN
  145 int use_sosend_loan = 0;
  146 #else
  147 int use_sosend_loan = 1;
  148 #endif
      /* use_sosend_loan: sosend() only attempts sosend_loan() when this is nonzero. */
  149 
      /*
       * so_pendfree is a list, linked through m_next, of loan mbufs whose
       * release soloanfree() deferred; sodopendfreel() drains it.  In the
       * code below the list, socurkva and sokvawaiters are only touched
       * with so_pendfree_slock held.
       */
  150 struct simplelock so_pendfree_slock = SIMPLELOCK_INITIALIZER;
  151 struct mbuf *so_pendfree;
  152 
  153 #ifndef SOMAXKVA
  154 #define SOMAXKVA (16 * 1024 * 1024)
  155 #endif
  156 int somaxkva = SOMAXKVA;        /* ceiling on socurkva, enforced by sokvareserve() */
  157 int socurkva;                   /* sum of the lengths currently reserved via sokvareserve() */
  158 int sokvawaiters;               /* number of callers currently sleeping in sokvareserve() */
  159 
  160 #define SOCK_LOAN_THRESH        4096    /* sosend(): minimum iov_len and space to try a loan */
  161 #define SOCK_LOAN_CHUNK         65536   /* sosend_loan(): maximum bytes handled per loan */
  162 
  163 static size_t sodopendfree(struct socket *);
  164 static size_t sodopendfreel(struct socket *);
  165 static __inline void sokvareserve(struct socket *, vsize_t);
  166 static __inline void sokvaunreserve(vsize_t);
  167 
  168 static __inline void
  169 sokvareserve(struct socket *so, vsize_t len)
  170 {
              /*
               * sokvareserve: account for len more units of loan kva in
               * socurkva, waiting until that fits under somaxkva.
               *
               * While the reservation would exceed the ceiling, first try to
               * reclaim space by draining the pending-free list.  If nothing
               * was freed, sleep on &socurkva; sokvaunreserve() and
               * soloanfree() issue the matching wakeup().
               *
               * so is only passed through to sodopendfreel().
               */
  171         int s;
  172 
  173         s = splvm();
  174         simple_lock(&so_pendfree_slock);
  175         while (socurkva + len > somaxkva) {
  176                 size_t freed;
  177 
  178                 /*
  179                  * try to do pendfree.
  180                  */
  181 
  182                 freed = sodopendfreel(so);
  183 
  184                 /*
  185                  * if some kva was freed, try again.
  186                  */
  187 
  188                 if (freed)
  189                         continue;
  190 
  191                 SOSEND_COUNTER_INCR(&sosend_kvalimit);
  192                 sokvawaiters++;
                      /*
                       * No timeout (0).  so_pendfree_slock is handed to
                       * ltsleep() as the interlock; presumably it is dropped
                       * while asleep and re-taken before returning -- confirm
                       * against ltsleep(9).
                       */
  193                 (void) ltsleep(&socurkva, PVM, "sokva", 0, &so_pendfree_slock);
  194                 sokvawaiters--;
  195         }
  196         socurkva += len;
  197         simple_unlock(&so_pendfree_slock);
  198         splx(s);
  199 }
  200 
  201 static __inline void
  202 sokvaunreserve(vsize_t len)
  203 {
              /*
               * sokvaunreserve: give back len units previously added to
               * socurkva by sokvareserve() and, if anyone is recorded as
               * waiting, wake the sleepers on &socurkva so they re-check
               * the limit.
               */
  204         int s;
  205 
  206         s = splvm();
  207         simple_lock(&so_pendfree_slock);
  208         socurkva -= len;
  209         if (sokvawaiters)
  210                 wakeup(&socurkva);
  211         simple_unlock(&so_pendfree_slock);
  212         splx(s);
  213 }
  214 
  215 /*
  216  * sokvaalloc: allocate kva for loan.
       *
       * => reserves len against the somaxkva limit first (may sleep in
       *    sokvareserve()), then takes the address range from kernel_map.
       * => returns the start address, or 0 after undoing the reservation
       *    if the allocator returned 0.
       * => so is only forwarded to sokvareserve().
  217  */
  218 
  219 vaddr_t
  220 sokvaalloc(vsize_t len, struct socket *so)
  221 {
  222         vaddr_t lva;
  223 
  224         /*
  225          * reserve kva.
  226          */
  227 
  228         sokvareserve(so, len);
  229 
  230         /*
  231          * allocate kva.
  232          */
  233 
  234         lva = uvm_km_valloc_wait(kernel_map, len);
              /*
               * NOTE(review): the "_wait" name suggests this call sleeps
               * rather than fails; confirm whether the 0 return handled
               * below can actually occur.
               */
  235         if (lva == 0) {
  236                 sokvaunreserve(len);
  237                 return (0);
  238         }
  239 
  240         return lva;
  241 }
  242 
  243 /*
  244  * sokvafree: free kva for loan.
       *
       * => inverse of sokvaalloc(): returns [sva, sva + len) to kernel_map
       *    and then drops the matching reservation, which may wake callers
       *    sleeping in sokvareserve().
  245  */
  246 
  247 void
  248 sokvafree(vaddr_t sva, vsize_t len)
  249 {
  250 
  251         /*
  252          * free kva.
  253          */
  254 
  255         uvm_km_free(kernel_map, sva, len);
  256 
  257         /*
  258          * unreserve kva.
  259          */
  260 
  261         sokvaunreserve(len);
  262 }
  263 
  264 static void
  265 sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size)
  266 {
              /*
               * sodoloanfree: tear down one loan.  Removes the kernel
               * mappings covering [buf, buf + size), hands the pages back
               * with uvm_unloan() and frees the kva with sokvafree().
               *
               * pgs is the page array of the loan; when it is NULL the
               * array is rebuilt here from the current kernel mappings.
               */
  267         vaddr_t va, sva, eva;
  268         vsize_t len;
  269         paddr_t pa;
  270         int i, npgs;
  271 
              /* Widen the buffer to whole pages: [sva, eva), npgs pages. */
  272         eva = round_page((vaddr_t) buf + size);
  273         sva = trunc_page((vaddr_t) buf);
  274         len = eva - sva;
  275         npgs = len >> PAGE_SHIFT;
  276 
  277         if (__predict_false(pgs == NULL)) {
                      /* Temporary array from alloca(), npgs entries. */
  278                 pgs = alloca(npgs * sizeof(*pgs));
  279 
                      /*
                       * Look up each page through its kernel mapping; this
                       * has to happen before pmap_kremove() below.
                       */
  280                 for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) {
  281                         if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
  282                                 panic("sodoloanfree: va 0x%lx not mapped", va);
  283                         pgs[i] = PHYS_TO_VM_PAGE(pa);
  284                 }
  285         }
  286 
  287         pmap_kremove(sva, len);
  288         pmap_update(pmap_kernel());
  289         uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
  290         sokvafree(sva, len);
  291 }
  292 
  293 static size_t
  294 sodopendfree(struct socket *so)
  295 {
              /*
               * sodopendfree: wrapper for callers that do not already hold
               * so_pendfree_slock.  Raises to splvm, takes the lock, runs
               * sodopendfreel() and returns its result (the total ext_size
               * of the mbufs it released).
               */
  296         int s;
  297         size_t rv;
  298 
  299         s = splvm();
  300         simple_lock(&so_pendfree_slock);
  301         rv = sodopendfreel(so);
  302         simple_unlock(&so_pendfree_slock);
  303         splx(s);
  304 
  305         return rv;
  306 }
  307 
  308 /*
  309  * sodopendfreel: free mbufs on "pendfree" list.
  310  * unlock and relock so_pendfree_slock when freeing mbufs.
  311  *
  312  * => called with so_pendfree_slock held.
  313  * => called at splvm.
       * => returns with so_pendfree_slock held again.
       * => return value is the sum of ext_size over all mbufs released;
       *    0 means the list was empty.
       * => the so argument is not used in the body.
  314  */
  315 
  316 static size_t
  317 sodopendfreel(struct socket *so)
  318 {
  319         size_t rv = 0;
  320 
  321         LOCK_ASSERT(simple_lock_held(&so_pendfree_slock));
  322 
              /*
               * Outer loop: re-check the list after every batch, since
               * entries can be added while the lock is dropped.
               */
  323         for (;;) {
  324                 struct mbuf *m;
  325                 struct mbuf *next;
  326 
                      /* Detach the whole list before dropping the lock. */
  327                 m = so_pendfree;
  328                 if (m == NULL)
  329                         break;
  330                 so_pendfree = NULL;
  331                 simple_unlock(&so_pendfree_slock);
  332                 /* XXX splx */
  333 
  334                 for (; m != NULL; m = next) {
                              /* Save the link: m goes back to the pool below. */
  335                         next = m->m_next;
  336 
  337                         rv += m->m_ext.ext_size;
  338                         sodoloanfree((m->m_flags & M_EXT_PAGES) ?
  339                             m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf,
  340                             m->m_ext.ext_size);
  341                         pool_cache_put(&mbpool_cache, m);
  342                 }
  343 
  344                 /* XXX splvm */
  345                 simple_lock(&so_pendfree_slock);
  346         }
  347 
  348         return (rv);
  349 }
  350 
  351 void
  352 soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg)
  353 {
              /*
               * soloanfree: free routine that sosend_loan() registers via
               * MEXTADD() for loaned buffers.
               *
               * With m == NULL the loan described by buf/size is torn down
               * immediately.  Otherwise m is pushed onto so_pendfree for a
               * later sodopendfreel() pass, and anyone sleeping in
               * sokvareserve() is woken so it can run that pass.
               *
               * arg is not used; buf and size are only used when m is NULL.
               */
  354         int s;
  355 
  356         if (m == NULL) {
  357 
  358                 /*
  359                  * called from MEXTREMOVE.
  360                  */
  361 
  362                 sodoloanfree(NULL, buf, size);
  363                 return;
  364         }
  365 
  366         /*
  367          * postpone freeing mbuf.
  368          *
  369          * we can't do it in interrupt context
  370          * because we need to put kva back to kernel_map.
  371          */
  372 
  373         s = splvm();
  374         simple_lock(&so_pendfree_slock);
  375         m->m_next = so_pendfree;
  376         so_pendfree = m;
  377         if (sokvawaiters)
  378                 wakeup(&socurkva);
  379         simple_unlock(&so_pendfree_slock);
  380         splx(s);
  381 }
  382 
  383 static long
  384 sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
  385 {
              /*
               * sosend_loan: try to attach up to space bytes of the current
               * user iovec to m by loaning the user's pages and mapping them
               * read-only into kernel va, instead of copying the data.
               *
               * Returns the number of bytes attached (uio is advanced by
               * that amount), or 0 if no loan was made; sosend() then falls
               * through to its copy paths.  Only the iovec uio->uio_iov
               * currently points at is considered.
               */
  386         struct iovec *iov = uio->uio_iov;
  387         vaddr_t sva, eva;
  388         vsize_t len;
  389         vaddr_t lva, va;
  390         int npgs, i, error;
  391 
  392         if (uio->uio_segflg != UIO_USERSPACE)
  393                 return (0);
  394 
              /* Clamp to the iovec length and to one loan chunk. */
  395         if (iov->iov_len < (size_t) space)
  396                 space = iov->iov_len;
  397         if (space > SOCK_LOAN_CHUNK)
  398                 space = SOCK_LOAN_CHUNK;
  399 
              /* Page-aligned user range covering the bytes to send. */
  400         eva = round_page((vaddr_t) iov->iov_base + space);
  401         sva = trunc_page((vaddr_t) iov->iov_base);
  402         len = eva - sva;
  403         npgs = len >> PAGE_SHIFT;
  404 
  405         /* XXX KDASSERT */
  406         KASSERT(npgs <= M_EXT_MAXPAGES);
  407 
  408         lva = sokvaalloc(len, so);
  409         if (lva == 0)
  410                 return 0;
  411 
              /* The loaned pages are recorded in m->m_ext.ext_pgs. */
  412         error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
  413             m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
  414         if (error) {
  415                 sokvafree(lva, len);
  416                 return (0);
  417         }
  418 
  419         for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
  420                 pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
  421                     VM_PROT_READ);
  422         pmap_update(pmap_kernel());
  423 
              /* Carry the offset within the first page over to the kernel address. */
  424         lva += (vaddr_t) iov->iov_base & PAGE_MASK;
  425 
  426         MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);
  427         m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
  428 
  429         uio->uio_resid -= space;
  430         /* uio_offset not updated, not set/used for write(2) */
  431         uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space;
  432         uio->uio_iov->iov_len -= space;
  433         if (uio->uio_iov->iov_len == 0) {
  434                 uio->uio_iov++;
  435                 uio->uio_iovcnt--;
  436         }
  437 
  438         return (space);
  439 }
  440 
  441 /*
  442  * Socket operation routines.
  443  * These routines are called by the routines in
  444  * sys_socket.c or from a system process, and
  445  * implement the semantics of socket operations by
  446  * switching out to the protocol specific routines.
  447  */
  448 /*ARGSUSED*/
  449 int
  450 socreate(int dom, struct socket **aso, int type, int proto)
  451 {
              /*
               * socreate: allocate and attach a new socket.
               *
               * The protocol switch entry is looked up by (dom, proto, type)
               * when proto is nonzero, otherwise by (dom, type).  Returns
               * EPROTONOSUPPORT if no usable entry is found, EPROTOTYPE if
               * the entry's type differs from the requested one, or the
               * error from the protocol's PRU_ATTACH request.  *aso is
               * written only on success (return value 0).
               */
  452         struct proc     *p;
  453         struct protosw  *prp;
  454         struct socket   *so;
  455         int             error, s;
  456 
  457         p = curproc;            /* XXX */
  458         if (proto)
  459                 prp = pffindproto(dom, proto, type);
  460         else
  461                 prp = pffindtype(dom, type);
  462         if (prp == 0 || prp->pr_usrreq == 0)
  463                 return (EPROTONOSUPPORT);
  464         if (prp->pr_type != type)
  465                 return (EPROTOTYPE);
  466         s = splsoftnet();
  467         so = pool_get(&socket_pool, PR_WAITOK);
  468         memset((caddr_t)so, 0, sizeof(*so));
  469         TAILQ_INIT(&so->so_q0);
  470         TAILQ_INIT(&so->so_q);
  471         so->so_type = type;
  472         so->so_proto = prp;
  473         so->so_send = sosend;
  474         so->so_receive = soreceive;
  475 #ifdef MBUFTRACE
  476         so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
  477         so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
  478         so->so_mowner = &prp->pr_domain->dom_mowner;
  479 #endif
  480         if (p != 0)
  481                 so->so_uid = p->p_ucred->cr_uid;
              /* proto travels to the protocol in the "nam" slot, cast to a pointer. */
  482         error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
  483             (struct mbuf *)(long)proto, (struct mbuf *)0, p);
  484         if (error) {
                      /* sofree() returns early unless SS_NOFDREF is set. */
  485                 so->so_state |= SS_NOFDREF;
  486                 sofree(so);
  487                 splx(s);
  488                 return (error);
  489         }
  490         splx(s);
  491         *aso = so;
  492         return (0);
  493 }
  494 
  495 int
  496 sobind(struct socket *so, struct mbuf *nam, struct proc *p)
  497 {
              /*
               * sobind: forward a PRU_BIND request, with nam and p, to the
               * socket's protocol at splsoftnet and return its result.
               */
  498         int     s, error;
  499 
  500         s = splsoftnet();
  501         error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
  502             nam, (struct mbuf *)0, p);
  503         splx(s);
  504         return (error);
  505 }
  506 
  507 int
  508 solisten(struct socket *so, int backlog)
  509 {
              /*
               * solisten: issue PRU_LISTEN to the protocol and, if that
               * succeeds, set the queue limit.  Returns the protocol's
               * error, or 0.
               *
               * SO_ACCEPTCONN is only set when so_q is empty.  A negative
               * backlog is treated as 0 and the limit is capped at
               * somaxconn.
               */
  510         int     s, error;
  511 
  512         s = splsoftnet();
  513         error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
  514             (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
  515         if (error) {
  516                 splx(s);
  517                 return (error);
  518         }
  519         if (TAILQ_EMPTY(&so->so_q))
  520                 so->so_options |= SO_ACCEPTCONN;
  521         if (backlog < 0)
  522                 backlog = 0;
  523         so->so_qlimit = min(backlog, somaxconn);
  524         splx(s);
  525         return (0);
  526 }
  527 
  528 void
  529 sofree(struct socket *so)
  530 {
              /*
               * sofree: release the socket structure if nothing refers to it
               * any more.  Does nothing while a protocol control block is
               * still attached (so_pcb) or while SS_NOFDREF is clear.
               * Otherwise releases the send buffer, flushes the receive
               * side and returns so to socket_pool.
               */
  531 
  532         if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
  533                 return;
  534         if (so->so_head) {
  535                 /*
  536                  * We must not decommission a socket that's on the accept(2)
  537                  * queue.  If we do, then accept(2) may hang after select(2)
  538                  * indicated that the listening socket was ready.
  539                  */
                      /* Keep the socket if soqremque(so, 0) reports failure. */
  540                 if (!soqremque(so, 0))
  541                         return;
  542         }
  543         sbrelease(&so->so_snd);
  544         sorflush(so);
  545         pool_put(&socket_pool, so);
  546 }
  547 
  548 /*
  549  * Close a socket on last file table reference removal.
  550  * Initiate disconnect if connected.
  551  * Free socket when disconnect complete.
       *
       * For a listening socket, every socket still on so_q0 and so_q is
       * dequeued and aborted first.  Returns the first error encountered
       * (disconnect, interrupted linger sleep, or PRU_DETACH), or 0.
       * Panics if SS_NOFDREF is already set.
  552  */
  553 int
  554 soclose(struct socket *so)
  555 {
  556         struct socket   *so2;
  557         int             s, error;
  558 
  559         error = 0;
  560         s = splsoftnet();               /* conservative */
  561         if (so->so_options & SO_ACCEPTCONN) {
  562                 while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
  563                         (void) soqremque(so2, 0);
  564                         (void) soabort(so2);
  565                 }
  566                 while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
  567                         (void) soqremque(so2, 1);
  568                         (void) soabort(so2);
  569                 }
  570         }
  571         if (so->so_pcb == 0)
  572                 goto discard;
  573         if (so->so_state & SS_ISCONNECTED) {
  574                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
  575                         error = sodisconnect(so);
  576                         if (error)
  577                                 goto drop;
  578                 }
  579                 if (so->so_options & SO_LINGER) {
                              /* Non-blocking sockets do not wait out the linger. */
  580                         if ((so->so_state & SS_ISDISCONNECTING) &&
  581                             (so->so_state & SS_NBIO))
  582                                 goto drop;
                              /*
                               * Sleep in so_linger * hz tick slices until the
                               * connection is gone; any nonzero tsleep()
                               * result ends the wait and becomes the return
                               * value unless overwritten below.
                               */
  583                         while (so->so_state & SS_ISCONNECTED) {
  584                                 error = tsleep((caddr_t)&so->so_timeo,
  585                                                PSOCK | PCATCH, netcls,
  586                                                so->so_linger * hz);
  587                                 if (error)
  588                                         break;
  589                         }
  590                 }
  591         }
  592  drop:
  593         if (so->so_pcb) {
  594                 int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
  595                     (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
  596                     (struct proc *)0);
                      /* An earlier error takes precedence over the detach result. */
  597                 if (error == 0)
  598                         error = error2;
  599         }
  600  discard:
  601         if (so->so_state & SS_NOFDREF)
  602                 panic("soclose: NOFDREF");
  603         so->so_state |= SS_NOFDREF;
  604         sofree(so);
  605         splx(s);
  606         return (error);
  607 }
  608 
  609 /*
  610  * Must be called at splsoftnet...
       *
       * soabort: forward a PRU_ABORT request to the socket's protocol and
       * return its result.  The spl level is not changed here.
  611  */
  612 int
  613 soabort(struct socket *so)
  614 {
  615 
  616         return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
  617             (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
  618 }
  619 
  620 int
  621 soaccept(struct socket *so, struct mbuf *nam)
  622 {
              /*
               * soaccept: complete acceptance of so.  The socket must still
               * have SS_NOFDREF set (anything else panics); the flag is
               * cleared here.
               *
               * PRU_ACCEPT is issued with nam unless the socket is already
               * SS_ISDISCONNECTED and the protocol sets PR_ABRTACPTDIS, in
               * which case ECONNABORTED is returned instead.
               */
  623         int     s, error;
  624 
  625         error = 0;
  626         s = splsoftnet();
  627         if ((so->so_state & SS_NOFDREF) == 0)
  628                 panic("soaccept: !NOFDREF");
  629         so->so_state &= ~SS_NOFDREF;
  630         if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
  631             (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
  632                 error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
  633                     (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0);
  634         else
  635                 error = ECONNABORTED;
  636 
  637         splx(s);
  638         return (error);
  639 }
  640 
  641 int
  642 soconnect(struct socket *so, struct mbuf *nam)
  643 {
              /*
               * soconnect: issue PRU_CONNECT to nam on behalf of curproc.
               * Returns EOPNOTSUPP for a socket with SO_ACCEPTCONN set,
               * EISCONN when an existing connection cannot be replaced,
               * otherwise the protocol's result.
               */
  644         struct proc     *p;
  645         int             s, error;
  646 
  647         p = curproc;            /* XXX */
  648         if (so->so_options & SO_ACCEPTCONN)
  649                 return (EOPNOTSUPP);
  650         s = splsoftnet();
  651         /*
  652          * If protocol is connection-based, can only connect once.
  653          * Otherwise, if connected, try to disconnect first.
  654          * This allows user to disconnect by connecting to, e.g.,
  655          * a null address.
  656          */
              /*
               * NOTE(review): when sodisconnect() fails, its error code is
               * replaced by EISCONN on the next line.
               */
  657         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
  658             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
  659             (error = sodisconnect(so))))
  660                 error = EISCONN;
  661         else
  662                 error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
  663                     (struct mbuf *)0, nam, (struct mbuf *)0, p);
  664         splx(s);
  665         return (error);
  666 }
  667 
  668 int
  669 soconnect2(struct socket *so1, struct socket *so2)
  670 {
              /*
               * soconnect2: forward a PRU_CONNECT2 request to so1's protocol
               * at splsoftnet and return its result.  so2 is passed in the
               * "nam" argument slot, cast to a struct mbuf pointer.
               */
  671         int     s, error;
  672 
  673         s = splsoftnet();
  674         error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
  675             (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
  676             (struct proc *)0);
  677         splx(s);
  678         return (error);
  679 }
  680 
  681 int
  682 sodisconnect(struct socket *so)
  683 {
              /*
               * sodisconnect: issue PRU_DISCONNECT to the protocol.
               * Returns ENOTCONN if the socket is not connected, EALREADY
               * if a disconnect is already in progress, otherwise the
               * protocol's result.
               */
  684         int     s, error;
  685 
  686         s = splsoftnet();
  687         if ((so->so_state & SS_ISCONNECTED) == 0) {
  688                 error = ENOTCONN;
  689                 goto bad;
  690         }
  691         if (so->so_state & SS_ISDISCONNECTING) {
  692                 error = EALREADY;
  693                 goto bad;
  694         }
  695         error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
  696             (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
  697             (struct proc *)0);
  698  bad:
  699         splx(s);
              /* Every path, including the error paths, drains the pending-free list. */
  700         sodopendfree(so);
  701         return (error);
  702 }
  703 
  704 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
      /* SBLOCKWAIT: turns the send flags into the wait argument sosend() gives sblock(). */
  705 /*
  706  * Send on a socket.
  707  * If send must go all at once and message is larger than
  708  * send buffering, then hard error.
  709  * Lock against other senders.
  710  * If must go all at once and not enough room now, then
  711  * inform user that this would block and do nothing.
  712  * Otherwise, if nonblocking, send as much as possible.
  713  * The data to be sent is described by "uio" if nonzero,
  714  * otherwise by the mbuf chain "top" (which must be null
  715  * if uio is not).  Data provided in mbuf chain must be small
  716  * enough to send all at once.
  717  *
  718  * Returns nonzero on error, timeout or signal; callers
  719  * must check for short counts if EINTR/ERESTART are returned.
  720  * Data and control buffers are freed on return.
  721  */
  722 int
  723 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
  724         struct mbuf *control, int flags)
  725 {
  726         struct proc     *p;
  727         struct mbuf     **mp, *m;
  728         long            space, len, resid, clen, mlen;
  729         int             error, s, dontroute, atomic;
  730 
  731         sodopendfree(so);
  732 
  733         p = curproc;            /* XXX */
  734         clen = 0;
  735         atomic = sosendallatonce(so) || top;
  736         if (uio)
  737                 resid = uio->uio_resid;
  738         else
  739                 resid = top->m_pkthdr.len;
  740         /*
  741          * In theory resid should be unsigned.
  742          * However, space must be signed, as it might be less than 0
  743          * if we over-committed, and we must use a signed comparison
  744          * of space and resid.  On the other hand, a negative resid
  745          * causes us to loop sending 0-length segments to the protocol.
  746          */
  747         if (resid < 0) {
  748                 error = EINVAL;
  749                 goto out;
  750         }
  751         dontroute =
  752             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
  753             (so->so_proto->pr_flags & PR_ATOMIC);
  754         p->p_stats->p_ru.ru_msgsnd++;
  755         if (control)
  756                 clen = control->m_len;
  757 #define snderr(errno)   { error = errno; splx(s); goto release; }
  758 
  759  restart:
  760         if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
  761                 goto out;
  762         do {
  763                 s = splsoftnet();
  764                 if (so->so_state & SS_CANTSENDMORE)
  765                         snderr(EPIPE);
  766                 if (so->so_error) {
  767                         error = so->so_error;
  768                         so->so_error = 0;
  769                         splx(s);
  770                         goto release;
  771                 }
  772                 if ((so->so_state & SS_ISCONNECTED) == 0) {
  773                         if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
  774                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
  775                                     !(resid == 0 && clen != 0))
  776                                         snderr(ENOTCONN);
  777                         } else if (addr == 0)
  778                                 snderr(EDESTADDRREQ);
  779                 }
  780                 space = sbspace(&so->so_snd);
  781                 if (flags & MSG_OOB)
  782                         space += 1024;
  783                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
  784                     clen > so->so_snd.sb_hiwat)
  785                         snderr(EMSGSIZE);
  786                 if (space < resid + clen &&
  787                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
  788                         if (so->so_state & SS_NBIO)
  789                                 snderr(EWOULDBLOCK);
  790                         sbunlock(&so->so_snd);
  791                         error = sbwait(&so->so_snd);
  792                         splx(s);
  793                         if (error)
  794                                 goto out;
  795                         goto restart;
  796                 }
  797                 splx(s);
  798                 mp = &top;
  799                 space -= clen;
  800                 do {
  801                         if (uio == NULL) {
  802                                 /*
  803                                  * Data is prepackaged in "top".
  804                                  */
  805                                 resid = 0;
  806                                 if (flags & MSG_EOR)
  807                                         top->m_flags |= M_EOR;
  808                         } else do {
  809                                 if (top == 0) {
  810                                         m = m_gethdr(M_WAIT, MT_DATA);
  811                                         mlen = MHLEN;
  812                                         m->m_pkthdr.len = 0;
  813                                         m->m_pkthdr.rcvif = (struct ifnet *)0;
  814                                 } else {
  815                                         m = m_get(M_WAIT, MT_DATA);
  816                                         mlen = MLEN;
  817                                 }
  818                                 MCLAIM(m, so->so_snd.sb_mowner);
  819                                 if (use_sosend_loan &&
  820                                     uio->uio_iov->iov_len >= SOCK_LOAN_THRESH &&
  821                                     space >= SOCK_LOAN_THRESH &&
  822                                     (len = sosend_loan(so, uio, m,
  823                                                        space)) != 0) {
  824                                         SOSEND_COUNTER_INCR(&sosend_loan_big);
  825                                         space -= len;
  826                                         goto have_data;
  827                                 }
  828                                 if (resid >= MINCLSIZE && space >= MCLBYTES) {
  829                                         SOSEND_COUNTER_INCR(&sosend_copy_big);
  830                                         m_clget(m, M_WAIT);
  831                                         if ((m->m_flags & M_EXT) == 0)
  832                                                 goto nopages;
  833                                         mlen = MCLBYTES;
  834                                         if (atomic && top == 0) {
  835                                                 len = lmin(MCLBYTES - max_hdr,
  836                                                     resid);
  837                                                 m->m_data += max_hdr;
  838                                         } else
  839                                                 len = lmin(MCLBYTES, resid);
  840                                         space -= len;
  841                                 } else {
  842  nopages:
  843                                         SOSEND_COUNTER_INCR(&sosend_copy_small);
  844                                         len = lmin(lmin(mlen, resid), space);
  845                                         space -= len;
  846                                         /*
  847                                          * For datagram protocols, leave room
  848                                          * for protocol headers in first mbuf.
  849                                          */
  850                                         if (atomic && top == 0 && len < mlen)
  851                                                 MH_ALIGN(m, len);
  852                                 }
  853                                 error = uiomove(mtod(m, caddr_t), (int)len,
  854                                     uio);
  855  have_data:
  856                                 resid = uio->uio_resid;
  857                                 m->m_len = len;
  858                                 *mp = m;
  859                                 top->m_pkthdr.len += len;
  860                                 if (error)
  861                                         goto release;
  862                                 mp = &m->m_next;
  863                                 if (resid <= 0) {
  864                                         if (flags & MSG_EOR)
  865                                                 top->m_flags |= M_EOR;
  866                                         break;
  867                                 }
  868                         } while (space > 0 && atomic);
  869                         
  870                         s = splsoftnet();
  871 
  872                         if (so->so_state & SS_CANTSENDMORE)
  873                                 snderr(EPIPE);
  874 
  875                         if (dontroute)
  876                                 so->so_options |= SO_DONTROUTE;
  877                         if (resid > 0)
  878                                 so->so_state |= SS_MORETOCOME;
  879                         error = (*so->so_proto->pr_usrreq)(so,
  880                             (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
  881                             top, addr, control, p);
  882                         if (dontroute)
  883                                 so->so_options &= ~SO_DONTROUTE;
  884                         if (resid > 0)
  885                                 so->so_state &= ~SS_MORETOCOME;
  886                         splx(s);
  887 
  888                         clen = 0;
  889                         control = 0;
  890                         top = 0;
  891                         mp = &top;
  892                         if (error)
  893                                 goto release;
  894                 } while (resid && space > 0);
  895         } while (resid);
  896 
  897  release:
  898         sbunlock(&so->so_snd);
  899  out:
  900         if (top)
  901                 m_freem(top);
  902         if (control)
  903                 m_freem(control);
  904         return (error);
  905 }
  906 
  907 /*
  908  * Implement receive operations on a socket.
  909  * We depend on the way that records are added to the sockbuf
  910  * by sbappend*.  In particular, each record (mbufs linked through m_next)
  911  * must begin with an address if the protocol so specifies,
  912  * followed by an optional mbuf or mbufs containing ancillary data,
  913  * and then zero or more mbufs of data.
  914  * In order to avoid blocking network interrupts for the entire time here,
  915  * we splx() while doing the actual copy to user space.
  916  * Although the sockbuf is locked, new data may still be appended,
  917  * and thus we must maintain consistency of the sockbuf during that time.
  918  *
  919  * The caller may receive the data as a single mbuf chain by supplying
  920  * an mbuf **mp0 for use in returning the chain.  The uio is then used
  921  * only for the count in uio_resid.
       *
       * Arguments, as used by the code below:
       *   so        socket whose receive buffer (so_rcv) is read.
       *   paddr     if non-NULL, cleared on entry; for PR_ADDR protocols it
       *             receives the leading MT_SONAME mbuf (a copy when peeking).
       *   uio       destination of the data; it is dereferenced unconditionally
       *             and uio_resid is the number of bytes wanted.
       *   mp0       if non-NULL, data mbufs are handed back through it instead
       *             of being copied out with uiomove().
       *   controlp  if non-NULL, cleared on entry; receives the MT_CONTROL
       *             mbufs of the record (copies when peeking).
       *   flagsp    if non-NULL, supplies the MSG_* input flags (MSG_EOR is
       *             masked off) and, on the normal exit path only, has the
       *             resulting flags OR'ed back in.
       *
       * Returns 0 or an error number.
  922  */
  923 int
  924 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
  925         struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
  926 {
  927         struct mbuf     *m, **mp;
  928         int             flags, len, error, s, offset, moff, type, orig_resid;
  929         struct protosw  *pr;
  930         struct mbuf     *nextrecord;
  931         int             mbuf_removed = 0;
  932 
  933         pr = so->so_proto;
  934         mp = mp0;
  935         type = 0;
              /*
               * Remember the requested size; the tail of the function compares
               * it with uio_resid to see whether anything was transferred.
               */
  936         orig_resid = uio->uio_resid;
  937         if (paddr)
  938                 *paddr = 0;
  939         if (controlp)
  940                 *controlp = 0;
  941         if (flagsp)
  942                 flags = *flagsp &~ MSG_EOR;
  943         else
  944                 flags = 0;
  945 
              /*
               * NOTE(review): sodopendfree() is defined elsewhere in this file;
               * it is only called when the caller did not ask for MSG_DONTWAIT,
               * presumably because it may sleep -- confirm.
               */
  946         if ((flags & MSG_DONTWAIT) == 0)
  947                 sodopendfree(so);
  948 
              /*
               * Out-of-band receive: the protocol fills a fresh mbuf via
               * PRU_RCVOOB (the MSG_PEEK bit is passed in the second mbuf
               * argument slot) and the result is copied out.  This path
               * returns before the receive sockbuf lock is taken.
               */
  949         if (flags & MSG_OOB) {
  950                 m = m_get(M_WAIT, MT_DATA);
  951                 error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
  952                     (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0,
  953                     (struct proc *)0);
  954                 if (error)
  955                         goto bad;
  956                 do {
  957                         error = uiomove(mtod(m, caddr_t),
  958                             (int) min(uio->uio_resid, m->m_len), uio);
  959                         m = m_free(m);
  960                 } while (uio->uio_resid && error == 0 && m);
  961  bad:
  962                 if (m)
  963                         m_freem(m);
  964                 return (error);
  965         }
  966         if (mp)
  967                 *mp = (struct mbuf *)0;
  968         if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
  969                 (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
  970                     (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
  971 
              /*
               * Every retry re-enters here: take the receive sockbuf lock and
               * raise to splsoftnet before looking at the buffer.
               */
  972  restart:
  973         if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
  974                 return (error);
  975         s = splsoftnet();
  976 
  977         m = so->so_rcv.sb_mb;
  978         /*
  979          * If we have less data than requested, block awaiting more
  980          * (subject to any timeout) if:
  981          *   1. the current count is less than the low water mark,
  982          *   2. MSG_WAITALL is set, and it is possible to do the entire
  983          *      receive operation at once if we block (resid <= hiwat), or
  984          *   3. MSG_DONTWAIT is not set.
  985          * If MSG_WAITALL is set but resid is larger than the receive buffer,
  986          * we have to do the receive in sections, and thus risk returning
  987          * a short count if a timeout or signal occurs after we start.
  988          */
  989         if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
  990             so->so_rcv.sb_cc < uio->uio_resid) &&
  991             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
  992             ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
  993             m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
  994 #ifdef DIAGNOSTIC
  995                 if (m == 0 && so->so_rcv.sb_cc)
  996                         panic("receive 1");
  997 #endif
                      /*
                       * A pending socket error: hand over queued data first if
                       * there is any, otherwise return the error, clearing it
                       * unless the caller is only peeking.
                       */
  998                 if (so->so_error) {
  999                         if (m)
 1000                                 goto dontblock;
 1001                         error = so->so_error;
 1002                         if ((flags & MSG_PEEK) == 0)
 1003                                 so->so_error = 0;
 1004                         goto release;
 1005                 }
                      /*
                       * No more data can arrive: return what is queued, or
                       * return with error still 0 when nothing is queued.
                       */
 1006                 if (so->so_state & SS_CANTRCVMORE) {
 1007                         if (m)
 1008                                 goto dontblock;
 1009                         else
 1010                                 goto release;
 1011                 }
                      /*
                       * An out-of-band mbuf or an end-of-record mark in the
                       * first record means there is something to return now;
                       * m is reset to the head of the buffer before jumping.
                       */
 1012                 for (; m; m = m->m_next)
 1013                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
 1014                                 m = so->so_rcv.sb_mb;
 1015                                 goto dontblock;
 1016                         }
 1017                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
 1018                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
 1019                         error = ENOTCONN;
 1020                         goto release;
 1021                 }
 1022                 if (uio->uio_resid == 0)
 1023                         goto release;
 1024                 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
 1025                         error = EWOULDBLOCK;
 1026                         goto release;
 1027                 }
 1028                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
 1029                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
                      /*
                       * Give up the sockbuf lock, sleep in sbwait(), and start
                       * over from the top unless the wait failed.
                       */
 1030                 sbunlock(&so->so_rcv);
 1031                 error = sbwait(&so->so_rcv);
 1032                 splx(s);
 1033                 if (error)
 1034                         return (error);
 1035                 goto restart;
 1036         }
 1037  dontblock:
 1038         /*
 1039          * On entry here, m points to the first record of the socket buffer.
 1040          * While we process the initial mbufs containing address and control
 1041          * info, we save a copy of m->m_nextpkt into nextrecord.
 1042          */
 1043 #ifdef notyet /* XXXX */
 1044         if (uio->uio_procp)
 1045                 uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
 1046 #endif
 1047         KASSERT(m == so->so_rcv.sb_mb);
 1048         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
 1049         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
 1050         nextrecord = m->m_nextpkt;
              /*
               * Protocols that prepend an address: the first mbuf of the record
               * is the MT_SONAME mbuf.  When peeking it is copied and skipped;
               * otherwise it is unlinked from the sockbuf and either handed to
               * the caller or freed.
               */
 1051         if (pr->pr_flags & PR_ADDR) {
 1052 #ifdef DIAGNOSTIC
 1053                 if (m->m_type != MT_SONAME)
 1054                         panic("receive 1a");
 1055 #endif
                      /*
                       * Zeroing orig_resid disables the "nothing transferred,
                       * restart" test at the end of the function.
                       */
 1056                 orig_resid = 0;
 1057                 if (flags & MSG_PEEK) {
 1058                         if (paddr)
 1059                                 *paddr = m_copy(m, 0, m->m_len);
 1060                         m = m->m_next;
 1061                 } else {
 1062                         sbfree(&so->so_rcv, m);
 1063                         mbuf_removed = 1;
 1064                         if (paddr) {
 1065                                 *paddr = m;
 1066                                 so->so_rcv.sb_mb = m->m_next;
 1067                                 m->m_next = 0;
 1068                                 m = so->so_rcv.sb_mb;
 1069                         } else {
 1070                                 MFREE(m, so->so_rcv.sb_mb);
 1071                                 m = so->so_rcv.sb_mb;
 1072                         }
 1073                 }
 1074         }
              /*
               * Peel off the control mbufs that lead the record.  When they are
               * being removed and the caller wants them, SCM_RIGHTS messages are
               * first passed through the domain's dom_externalize hook, whose
               * error (if any) ends this loop and suppresses the data loop.
               */
 1075         while (m && m->m_type == MT_CONTROL && error == 0) {
 1076                 if (flags & MSG_PEEK) {
 1077                         if (controlp)
 1078                                 *controlp = m_copy(m, 0, m->m_len);
 1079                         m = m->m_next;
 1080                 } else {
 1081                         sbfree(&so->so_rcv, m);
 1082                         mbuf_removed = 1;
 1083                         if (controlp) {
 1084                                 if (pr->pr_domain->dom_externalize &&
 1085                                     mtod(m, struct cmsghdr *)->cmsg_type ==
 1086                                     SCM_RIGHTS)
 1087                                         error = (*pr->pr_domain->dom_externalize)(m);
 1088                                 *controlp = m;
 1089                                 so->so_rcv.sb_mb = m->m_next;
 1090                                 m->m_next = 0;
 1091                                 m = so->so_rcv.sb_mb;
 1092                         } else {
 1093                                 MFREE(m, so->so_rcv.sb_mb);
 1094                                 m = so->so_rcv.sb_mb;
 1095                         }
 1096                 }
                      /*
                       * Advance the caller's chain pointer so the next control
                       * mbuf is linked after this one; returning control data
                       * also disables the restart test at the end.
                       */
 1097                 if (controlp) {
 1098                         orig_resid = 0;
 1099                         controlp = &(*controlp)->m_next;
 1100                 }
 1101         }
 1102 
 1103         /*
 1104          * If m is non-NULL, we have some data to read.  From now on,
 1105          * make sure to keep sb_lastrecord consistent when working on
 1106          * the last packet on the chain (nextrecord == NULL) and we
 1107          * change m->m_nextpkt.
 1108          */
 1109         if (m) {
 1110                 if ((flags & MSG_PEEK) == 0) {
 1111                         m->m_nextpkt = nextrecord;
 1112                         /*
 1113                          * If nextrecord == NULL (this is a single chain),
 1114                          * then sb_lastrecord may not be valid here if m
 1115                          * was changed earlier.
 1116                          */
 1117                         if (nextrecord == NULL) {
 1118                                 KASSERT(so->so_rcv.sb_mb == m);
 1119                                 so->so_rcv.sb_lastrecord = m;
 1120                         }
 1121                 }
                      /*
                       * Remember the type of the first data mbuf; the copy loop
                       * stops where out-of-band and normal data meet.
                       */
 1122                 type = m->m_type;
 1123                 if (type == MT_OOBDATA)
 1124                         flags |= MSG_OOB;
 1125         } else {
                      /*
                       * The record held no data mbufs: unless peeking, move the
                       * sockbuf on to the next record.
                       */
 1126                 if ((flags & MSG_PEEK) == 0) {
 1127                         KASSERT(so->so_rcv.sb_mb == m);
 1128                         so->so_rcv.sb_mb = nextrecord;
 1129                         SB_EMPTY_FIXUP(&so->so_rcv);
 1130                 }
 1131         }
 1132         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
 1133         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
 1134 
              /*
               * Data transfer loop.  moff is the offset into the current mbuf
               * (it only advances when peeking); offset counts the bytes peeked
               * so far and is compared against so_oobmark.
               */
 1135         moff = 0;
 1136         offset = 0;
 1137         while (m && uio->uio_resid > 0 && error == 0) {
 1138                 if (m->m_type == MT_OOBDATA) {
 1139                         if (type != MT_OOBDATA)
 1140                                 break;
 1141                 } else if (type == MT_OOBDATA)
 1142                         break;
 1143 #ifdef DIAGNOSTIC
 1144                 else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
 1145                         panic("receive 3");
 1146 #endif
 1147                 so->so_state &= ~SS_RCVATMARK;
                      /*
                       * Transfer at most: what the caller still wants, what
                       * remains before the out-of-band mark, and what is left
                       * in this mbuf.
                       */
 1148                 len = uio->uio_resid;
 1149                 if (so->so_oobmark && len > so->so_oobmark - offset)
 1150                         len = so->so_oobmark - offset;
 1151                 if (len > m->m_len - moff)
 1152                         len = m->m_len - moff;
 1153                 /*
 1154                  * If mp is set, just pass back the mbufs.
 1155                  * Otherwise copy them out via the uio, then free.
 1156                  * Sockbuf must be consistent here (points to current mbuf,
 1157                  * it points to next record) when we drop priority;
 1158                  * we must note any additions to the sockbuf when we
 1159                  * block interrupts again.
 1160                  */
 1161                 if (mp == 0) {
 1162                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
 1163                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
 1164                         splx(s);
 1165                         error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
 1166                         s = splsoftnet();
 1167                         if (error) {
 1168                                 /*
 1169                                  * If any part of the record has been removed
 1170                                  * (such as the MT_SONAME mbuf, which will
 1171                                  * happen when PR_ADDR, and thus also
 1172                                  * PR_ATOMIC, is set), then drop the entire
 1173                                  * record to maintain the atomicity of the
 1174                                  * receive operation.
 1175                                  *
 1176                                  * This avoids a later panic("receive 1a")
 1177                                  * when compiled with DIAGNOSTIC.
 1178                                  */
 1179                                 if (m && mbuf_removed
 1180                                     && (pr->pr_flags & PR_ATOMIC))
 1181                                         (void) sbdroprecord(&so->so_rcv);
 1182 
 1183                                 goto release;
 1184                         }
 1185                 } else
 1186                         uio->uio_resid -= len;
                      /*
                       * Either the remainder of this mbuf was consumed (advance
                       * to the next one, unlinking it unless peeking) or only
                       * part of it was (see the else branch).
                       */
 1187                 if (len == m->m_len - moff) {
 1188                         if (m->m_flags & M_EOR)
 1189                                 flags |= MSG_EOR;
 1190                         if (flags & MSG_PEEK) {
 1191                                 m = m->m_next;
 1192                                 moff = 0;
 1193                         } else {
 1194                                 nextrecord = m->m_nextpkt;
 1195                                 sbfree(&so->so_rcv, m);
 1196                                 if (mp) {
 1197                                         *mp = m;
 1198                                         mp = &m->m_next;
 1199                                         so->so_rcv.sb_mb = m = m->m_next;
 1200                                         *mp = (struct mbuf *)0;
 1201                                 } else {
 1202                                         MFREE(m, so->so_rcv.sb_mb);
 1203                                         m = so->so_rcv.sb_mb;
 1204                                 }
 1205                                 /*
 1206                                  * If m != NULL, we also know that
 1207                                  * so->so_rcv.sb_mb != NULL.
 1208                                  */
 1209                                 KASSERT(so->so_rcv.sb_mb == m);
 1210                                 if (m) {
 1211                                         m->m_nextpkt = nextrecord;
 1212                                         if (nextrecord == NULL)
 1213                                                 so->so_rcv.sb_lastrecord = m;
 1214                                 } else {
 1215                                         so->so_rcv.sb_mb = nextrecord;
 1216                                         SB_EMPTY_FIXUP(&so->so_rcv);
 1217                                 }
 1218                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
 1219                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
 1220                         }
 1221                 } else {
                              /*
                               * Partial mbuf: when peeking only the offset
                               * moves; otherwise the consumed bytes are trimmed
                               * from the front of the mbuf (a copy of them goes
                               * to the caller when mp is set).
                               */
 1222                         if (flags & MSG_PEEK)
 1223                                 moff += len;
 1224                         else {
 1225                                 if (mp)
 1226                                         *mp = m_copym(m, 0, len, M_WAIT);
 1227                                 m->m_data += len;
 1228                                 m->m_len -= len;
 1229                                 so->so_rcv.sb_cc -= len;
 1230                         }
 1231                 }
                      /*
                       * Track progress toward the out-of-band mark and stop
                       * once it is reached; a real read also moves the mark and
                       * sets SS_RCVATMARK when it hits zero.
                       */
 1232                 if (so->so_oobmark) {
 1233                         if ((flags & MSG_PEEK) == 0) {
 1234                                 so->so_oobmark -= len;
 1235                                 if (so->so_oobmark == 0) {
 1236                                         so->so_state |= SS_RCVATMARK;
 1237                                         break;
 1238                                 }
 1239                         } else {
 1240                                 offset += len;
 1241                                 if (offset == so->so_oobmark)
 1242                                         break;
 1243                         }
 1244                 }
 1245                 if (flags & MSG_EOR)
 1246                         break;
 1247                 /*
 1248                  * If the MSG_WAITALL flag is set (for non-atomic socket),
 1249                  * we must not quit until "uio->uio_resid == 0" or an error
 1250                  * termination.  If a signal/timeout occurs, return
 1251                  * with a short count but without error.
 1252                  * Keep sockbuf locked against other readers.
 1253                  */
 1254                 while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
 1255                     !sosendallatonce(so) && !nextrecord) {
 1256                         if (so->so_error || so->so_state & SS_CANTRCVMORE)
 1257                                 break;
 1258                         /*
 1259                          * If we are peeking and the socket receive buffer is
 1260                          * full, stop since we can't get more data to peek at.
 1261                          */
 1262                         if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
 1263                                 break;
 1264                         /*
 1265                          * If we've drained the socket buffer, tell the
 1266                          * protocol in case it needs to do something to
 1267                          * get it filled again.
 1268                          */
 1269                         if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
 1270                                 (*pr->pr_usrreq)(so, PRU_RCVD,
 1271                                     (struct mbuf *)0,
 1272                                     (struct mbuf *)(long)flags,
 1273                                     (struct mbuf *)0,
 1274                                     (struct proc *)0);
 1275                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
 1276                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
 1277                         error = sbwait(&so->so_rcv);
                              /*
                               * The wait failed: unlock and return success with
                               * whatever was transferred so far.  Note that
                               * *flagsp is not updated on this path.
                               */
 1278                         if (error) {
 1279                                 sbunlock(&so->so_rcv);
 1280                                 splx(s);
 1281                                 return (0);
 1282                         }
 1283                         if ((m = so->so_rcv.sb_mb) != NULL)
 1284                                 nextrecord = m->m_nextpkt;
 1285                 }
 1286         }
 1287 
              /*
               * Data left over in a record of an atomic protocol is not kept:
               * report truncation and, unless peeking, drop the rest of the
               * record.
               */
 1288         if (m && pr->pr_flags & PR_ATOMIC) {
 1289                 flags |= MSG_TRUNC;
 1290                 if ((flags & MSG_PEEK) == 0)
 1291                         (void) sbdroprecord(&so->so_rcv);
 1292         }
 1293         if ((flags & MSG_PEEK) == 0) {
 1294                 if (m == 0) {
 1295                         /*
 1296                          * First part is an inline SB_EMPTY_FIXUP().  Second
 1297                          * part makes sure sb_lastrecord is up-to-date if
 1298                          * there is still data in the socket buffer.
 1299                          */
 1300                         so->so_rcv.sb_mb = nextrecord;
 1301                         if (so->so_rcv.sb_mb == NULL) {
 1302                                 so->so_rcv.sb_mbtail = NULL;
 1303                                 so->so_rcv.sb_lastrecord = NULL;
 1304                         } else if (nextrecord->m_nextpkt == NULL)
 1305                                 so->so_rcv.sb_lastrecord = nextrecord;
 1306                 }
 1307                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
 1308                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
                      /*
                       * Tell the protocol that data was taken from the buffer,
                       * if it asked to be told and still has a pcb attached.
                       */
 1309                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
 1310                         (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
 1311                             (struct mbuf *)(long)flags, (struct mbuf *)0,
 1312                             (struct proc *)0);
 1313         }
              /*
               * Nothing was transferred, no address or control information was
               * returned (either would have zeroed orig_resid), no record ended,
               * and more data may still arrive: unlock and try again.
               */
 1314         if (orig_resid == uio->uio_resid && orig_resid &&
 1315             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
 1316                 sbunlock(&so->so_rcv);
 1317                 splx(s);
 1318                 goto restart;
 1319         }
 1320                 
              /*
               * Output flags are reported only on this normal path; jumps to
               * "release" skip the update.
               */
 1321         if (flagsp)
 1322                 *flagsp |= flags;
 1323  release:
 1324         sbunlock(&so->so_rcv);
 1325         splx(s);
 1326         return (error);
 1327 }
 1328 
 /*
  * Shut down the receive side, the send side, or both sides of a socket.
  *
  * how must be SHUT_RD, SHUT_WR or SHUT_RDWR; anything else yields EINVAL.
  * The receive side is shut down with sorflush(); the send side by issuing
  * PRU_SHUTDOWN to the protocol, whose result becomes the return value.
  * A receive-only shutdown returns 0.
  */
 1329 int
 1330 soshutdown(struct socket *so, int how)
 1331 {
 1332         struct protosw  *pr;
 1333 
 1334         pr = so->so_proto;
 1335         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
 1336                 return (EINVAL);
 1337 
              /* For SHUT_RDWR the receive side is flushed before the send side. */
 1338         if (how == SHUT_RD || how == SHUT_RDWR)
 1339                 sorflush(so);
 1340         if (how == SHUT_WR || how == SHUT_RDWR)
 1341                 return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
 1342                     (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
 1343         return (0);
 1344 }
 1345 
 /*
  * Flush the receive buffer of a socket and mark it as unable to receive
  * any more data.
  *
  * The live sockbuf is copied into a local snapshot and then cleared from
  * sb_startzero to the end of the structure; the queued data is disposed of
  * and released through the snapshot after the interrupt level is restored.
  */
 1346 void
 1347 sorflush(struct socket *so)
 1348 {
 1349         struct sockbuf  *sb, asb;
 1350         struct protosw  *pr;
 1351         int             s;
 1352 
 1353         sb = &so->so_rcv;
 1354         pr = so->so_proto;
              /*
               * NOTE(review): SB_NOINTR is set before taking the lock, presumably
               * so the sleep in sblock() cannot be interrupted -- confirm.  The
               * return value of sblock() is deliberately ignored.
               */
 1355         sb->sb_flags |= SB_NOINTR;
 1356         (void) sblock(sb, M_WAITOK);
 1357         s = splnet();
 1358         socantrcvmore(so);
 1359         sbunlock(sb);
              /* Snapshot the sockbuf (after unlocking) so it can be torn down below. */
 1360         asb = *sb;
 1361         /*
 1362          * Clear most of the sockbuf structure, but leave some of the
 1363          * fields valid.
 1364          */
 1365         memset(&sb->sb_startzero, 0,
 1366             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 1367         splx(s);
              /*
               * Protocols flagged PR_RIGHTS get a chance to dispose of the
               * queued mbufs through the domain's dom_dispose hook before the
               * snapshot's resources are released.
               */
 1368         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
 1369                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
 1370         sbrelease(&asb);
 1371 }
 1372 
 /*
  * Set a socket option.
  *
  * For any level other than SOL_SOCKET the request is handed straight to
  * the protocol's pr_ctloutput and its result is returned; if the protocol
  * has no pr_ctloutput the result is ENOPROTOOPT.  SOL_SOCKET options are
  * validated and applied here; on success the protocol's pr_ctloutput, if
  * any, is also called with the same option and its result is ignored.
  *
  * m0 carries the option value.  On every path where pr_ctloutput is not
  * called, the mbuf (if any) is freed here before returning.
  *
  * Returns 0 or an error number: EINVAL for a missing or short value (or a
  * buffer/low-water size below 1), EDOM for an out-of-range linger time or
  * timeout, ENOBUFS when sbreserve() fails, ENOPROTOOPT for an unknown
  * option.
  */
 1373 int
 1374 sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
 1375 {
 1376         int             error;
 1377         struct mbuf     *m;
 1378 
 1379         error = 0;
 1380         m = m0;
 1381         if (level != SOL_SOCKET) {
 1382                 if (so->so_proto && so->so_proto->pr_ctloutput)
 1383                         return ((*so->so_proto->pr_ctloutput)
 1384                                   (PRCO_SETOPT, so, level, optname, &m0));
 1385                 error = ENOPROTOOPT;
 1386         } else {
 1387                 switch (optname) {
 1388 
 1389                 case SO_LINGER:
 1390                         if (m == NULL || m->m_len != sizeof(struct linger)) {
 1391                                 error = EINVAL;
 1392                                 goto bad;
 1393                         }
                              /*
                               * The linger time must be non-negative and small
                               * enough that multiplying it by hz cannot
                               * overflow an int.
                               */
 1394                         if (mtod(m, struct linger *)->l_linger < 0 ||
 1395                             mtod(m, struct linger *)->l_linger > (INT_MAX / hz)) {
 1396                                 error = EDOM;
 1397                                 goto bad;
 1398                         }
 1399                         so->so_linger = mtod(m, struct linger *)->l_linger;
 1400                         /* fall thru... */
 1401 
                      /*
                       * Boolean options: the first int of the value turns the
                       * option's bit in so_options on or off.
                       */
 1402                 case SO_DEBUG:
 1403                 case SO_KEEPALIVE:
 1404                 case SO_DONTROUTE:
 1405                 case SO_USELOOPBACK:
 1406                 case SO_BROADCAST:
 1407                 case SO_REUSEADDR:
 1408                 case SO_REUSEPORT:
 1409                 case SO_OOBINLINE:
 1410                 case SO_TIMESTAMP:
 1411                         if (m == NULL || m->m_len < sizeof(int)) {
 1412                                 error = EINVAL;
 1413                                 goto bad;
 1414                         }
 1415                         if (*mtod(m, int *))
 1416                                 so->so_options |= optname;
 1417                         else
 1418                                 so->so_options &= ~optname;
 1419                         break;
 1420 
 1421                 case SO_SNDBUF:
 1422                 case SO_RCVBUF:
 1423                 case SO_SNDLOWAT:
 1424                 case SO_RCVLOWAT:
 1425                     {
 1426                         int optval;
 1427 
 1428                         if (m == NULL || m->m_len < sizeof(int)) {
 1429                                 error = EINVAL;
 1430                                 goto bad;
 1431                         }
 1432 
 1433                         /*
 1434                          * Values < 1 make no sense for any of these
 1435                          * options, so disallow them.
 1436                          */
 1437                         optval = *mtod(m, int *);
 1438                         if (optval < 1) {
 1439                                 error = EINVAL;
 1440                                 goto bad;
 1441                         }
 1442 
 1443                         switch (optname) {
 1444 
 1445                         case SO_SNDBUF:
 1446                         case SO_RCVBUF:
 1447                                 if (sbreserve(optname == SO_SNDBUF ?
 1448                                     &so->so_snd : &so->so_rcv,
 1449                                     (u_long) optval) == 0) {
 1450                                         error = ENOBUFS;
 1451                                         goto bad;
 1452                                 }
 1453                                 break;
 1454 
 1455                         /*
 1456                          * Make sure the low-water is never greater than
 1457                          * the high-water.
 1458                          */
 1459                         case SO_SNDLOWAT:
 1460                                 so->so_snd.sb_lowat =
 1461                                     (optval > so->so_snd.sb_hiwat) ?
 1462                                     so->so_snd.sb_hiwat : optval;
 1463                                 break;
 1464                         case SO_RCVLOWAT:
 1465                                 so->so_rcv.sb_lowat =
 1466                                     (optval > so->so_rcv.sb_hiwat) ?
 1467                                     so->so_rcv.sb_hiwat : optval;
 1468                                 break;
 1469                         }
 1470                         break;
 1471                     }
 1472 
 1473                 case SO_SNDTIMEO:
 1474                 case SO_RCVTIMEO:
 1475                     {
 1476                         struct timeval *tv;
 1477                         short val;
 1478 
 1479                         if (m == NULL || m->m_len < sizeof(*tv)) {
 1480                                 error = EINVAL;
 1481                                 goto bad;
 1482                         }
 1483                         tv = mtod(m, struct timeval *);
                              /*
                               * Reject negative or un-normalized timeouts.  A
                               * negative component would produce a negative
                               * tick count, and a tv_usec of a second or more
                               * may slip past the range check below and
                               * overflow val.
                               */
                              if (tv->tv_sec < 0 || tv->tv_usec < 0 ||
                                  tv->tv_usec >= 1000000) {
                                      error = EDOM;
                                      goto bad;
                              }
                              /*
                               * The timeout is kept in ticks in a short, so the
                               * converted value must not exceed SHRT_MAX.
                               */
 1484                         if (tv->tv_sec > (SHRT_MAX - tv->tv_usec / tick) / hz) {
 1485                                 error = EDOM;
 1486                                 goto bad;
 1487                         }
 1488                         val = tv->tv_sec * hz + tv->tv_usec / tick;
                              /*
                               * A non-zero timeout shorter than one tick is
                               * rounded up to one tick rather than becoming 0.
                               */
 1489                         if (val == 0 && tv->tv_usec != 0)
 1490                                 val = 1;
 1491 
 1492                         switch (optname) {
 1493 
 1494                         case SO_SNDTIMEO:
 1495                                 so->so_snd.sb_timeo = val;
 1496                                 break;
 1497                         case SO_RCVTIMEO:
 1498                                 so->so_rcv.sb_timeo = val;
 1499                                 break;
 1500                         }
 1501                         break;
 1502                     }
 1503 
 1504                 default:
 1505                         error = ENOPROTOOPT;
 1506                         break;
 1507                 }
                      /*
                       * Let the protocol see the option as well.  Its return
                       * value is ignored, and the mbuf is now its to free.
                       */
 1508                 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
 1509                         (void) ((*so->so_proto->pr_ctloutput)
 1510                                   (PRCO_SETOPT, so, level, optname, &m0));
 1511                         m = NULL;       /* freed by protocol */
 1512                 }
 1513         }
 1514  bad:
 1515         if (m)
 1516                 (void) m_free(m);
 1517         return (error);
 1518 }
 1519 
 1520 int
 1521 sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
 1522 {
 1523         struct mbuf     *m;
 1524 
 1525         if (level != SOL_SOCKET) {
 1526                 if (so->so_proto && so->so_proto->pr_ctloutput) {
 1527                         return ((*so->so_proto->pr_ctloutput)
 1528                                   (PRCO_GETOPT, so, level, optname, mp));
 1529                 } else
 1530                         return (ENOPROTOOPT);
 1531         } else {
 1532                 m = m_get(M_WAIT, MT_SOOPTS);
 1533                 m->m_len = sizeof(int);
 1534 
 1535                 switch (optname) {
 1536 
 1537                 case SO_LINGER:
 1538                         m->m_len = sizeof(struct linger);
 1539                         mtod(m, struct linger *)->l_onoff =
 1540                                 so->so_options & SO_LINGER;
 1541                         mtod(m, struct linger *)->l_linger = so->so_linger;
 1542                         break;
 1543 
 1544                 case SO_USELOOPBACK:
 1545                 case SO_DONTROUTE:
 1546                 case SO_DEBUG:
 1547                 case SO_KEEPALIVE:
 1548                 case SO_REUSEADDR:
 1549                 case SO_REUSEPORT:
 1550                 case SO_BROADCAST:
 1551                 case SO_OOBINLINE:
 1552                 case SO_TIMESTAMP:
 1553                         *mtod(m, int *) = so->so_options & optname;
 1554                         break;
 1555 
 1556                 case SO_TYPE:
 1557                         *mtod(m, int *) = so->so_type;
 1558                         break;
 1559 
 1560                 case SO_ERROR:
 1561                         *mtod(m, int *) = so->so_error;
 1562                         so->so_error = 0;
 1563                         break;
 1564 
 1565                 case SO_SNDBUF:
 1566                         *mtod(m, int *) = so->so_snd.sb_hiwat;
 1567                         break;
 1568 
 1569                 case SO_RCVBUF:
 1570                         *mtod(m, int *) = so->so_rcv.sb_hiwat;
 1571                         break;
 1572 
 1573                 case SO_SNDLOWAT:
 1574                         *mtod(m, int *) = so->so_snd.sb_lowat;
 1575                         break;
 1576 
 1577                 case SO_RCVLOWAT:
 1578                         *mtod(m, int *) = so->so_rcv.sb_lowat;
 1579                         break;
 1580 
 1581                 case SO_SNDTIMEO:
 1582                 case SO_RCVTIMEO:
 1583                     {
 1584                         int val = (optname == SO_SNDTIMEO ?
 1585                              so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
 1586 
 1587                         m->m_len = sizeof(struct timeval);
 1588                         mtod(m, struct timeval *)->tv_sec = val / hz;
 1589                         mtod(m, struct timeval *)->tv_usec =
 1590                             (val % hz) * tick;
 1591                         break;
 1592                     }
 1593 
 1594                 default:
 1595                         (void)m_free(m);
 1596                         return (ENOPROTOOPT);
 1597                 }
 1598                 *mp = m;
 1599                 return (0);
 1600         }
 1601 }
 1602 
 1603 void
 1604 sohasoutofband(struct socket *so)
 1605 {
 1606         fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
 1607         selwakeup(&so->so_rcv.sb_sel);
 1608 }
 1609 
 1610 static void
 1611 filt_sordetach(struct knote *kn)
 1612 {
 1613         struct socket   *so;
 1614 
 1615         so = (struct socket *)kn->kn_fp->f_data;
 1616         SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext);
 1617         if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist))
 1618                 so->so_rcv.sb_flags &= ~SB_KNOTE;
 1619 }
 1620 
 1621 /*ARGSUSED*/
 1622 static int
 1623 filt_soread(struct knote *kn, long hint)
 1624 {
 1625         struct socket   *so;
 1626 
 1627         so = (struct socket *)kn->kn_fp->f_data;
 1628         kn->kn_data = so->so_rcv.sb_cc;
 1629         if (so->so_state & SS_CANTRCVMORE) {
 1630                 kn->kn_flags |= EV_EOF; 
 1631                 kn->kn_fflags = so->so_error;
 1632                 return (1);
 1633         }
 1634         if (so->so_error)       /* temporary udp error */
 1635                 return (1);
 1636         if (kn->kn_sfflags & NOTE_LOWAT)
 1637                 return (kn->kn_data >= kn->kn_sdata);
 1638         return (kn->kn_data >= so->so_rcv.sb_lowat);
 1639 }
 1640 
 1641 static void
 1642 filt_sowdetach(struct knote *kn)
 1643 {
 1644         struct socket   *so;
 1645 
 1646         so = (struct socket *)kn->kn_fp->f_data;
 1647         SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext);
 1648         if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist))
 1649                 so->so_snd.sb_flags &= ~SB_KNOTE;
 1650 }
 1651 
 1652 /*ARGSUSED*/
 1653 static int
 1654 filt_sowrite(struct knote *kn, long hint)
 1655 {
 1656         struct socket   *so;
 1657 
 1658         so = (struct socket *)kn->kn_fp->f_data;
 1659         kn->kn_data = sbspace(&so->so_snd);
 1660         if (so->so_state & SS_CANTSENDMORE) {
 1661                 kn->kn_flags |= EV_EOF; 
 1662                 kn->kn_fflags = so->so_error;
 1663                 return (1);
 1664         }
 1665         if (so->so_error)       /* temporary udp error */
 1666                 return (1);
 1667         if (((so->so_state & SS_ISCONNECTED) == 0) &&
 1668             (so->so_proto->pr_flags & PR_CONNREQUIRED))
 1669                 return (0);
 1670         if (kn->kn_sfflags & NOTE_LOWAT)
 1671                 return (kn->kn_data >= kn->kn_sdata);
 1672         return (kn->kn_data >= so->so_snd.sb_lowat);
 1673 }
 1674 
 1675 /*ARGSUSED*/
 1676 static int
 1677 filt_solisten(struct knote *kn, long hint)
 1678 {
 1679         struct socket   *so;
 1680 
 1681         so = (struct socket *)kn->kn_fp->f_data;
 1682 
 1683         /*
 1684          * Set kn_data to number of incoming connections, not
 1685          * counting partial (incomplete) connections.
 1686          */ 
 1687         kn->kn_data = so->so_qlen;
 1688         return (kn->kn_data > 0);
 1689 }
 1690 
 1691 static const struct filterops solisten_filtops =
 1692         { 1, NULL, filt_sordetach, filt_solisten };
 1693 static const struct filterops soread_filtops =
 1694         { 1, NULL, filt_sordetach, filt_soread };
 1695 static const struct filterops sowrite_filtops =
 1696         { 1, NULL, filt_sowdetach, filt_sowrite };
 1697 
 1698 int
 1699 soo_kqfilter(struct file *fp, struct knote *kn)
 1700 {
 1701         struct socket   *so;
 1702         struct sockbuf  *sb;
 1703 
 1704         so = (struct socket *)kn->kn_fp->f_data;
 1705         switch (kn->kn_filter) {
 1706         case EVFILT_READ:
 1707                 if (so->so_options & SO_ACCEPTCONN)
 1708                         kn->kn_fop = &solisten_filtops;
 1709                 else
 1710                         kn->kn_fop = &soread_filtops;
 1711                 sb = &so->so_rcv;
 1712                 break;
 1713         case EVFILT_WRITE:
 1714                 kn->kn_fop = &sowrite_filtops;
 1715                 sb = &so->so_snd;
 1716                 break;
 1717         default:
 1718                 return (1);
 1719         }
 1720         SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext);
 1721         sb->sb_flags |= SB_KNOTE;
 1722         return (0);
 1723 }
 1724 
 1725 #include <sys/sysctl.h>
 1726 
 1727 static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);
 1728 
 1729 /*
 1730  * sysctl helper routine for kern.somaxkva.  ensures that the given
 1731  * value is not too small.
 1732  * (XXX should we maybe make sure it's not too large as well?)
 1733  */
 1734 static int
 1735 sysctl_kern_somaxkva(SYSCTLFN_ARGS)
 1736 {
 1737         int error, new_somaxkva;
 1738         struct sysctlnode node;
 1739         int s;
 1740 
 1741         new_somaxkva = somaxkva;
 1742         node = *rnode;
 1743         node.sysctl_data = &new_somaxkva;
 1744         error = sysctl_lookup(SYSCTLFN_CALL(&node));
 1745         if (error || newp == NULL)
 1746                 return (error);
 1747 
 1748         if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
 1749                 return (EINVAL);
 1750 
 1751         s = splvm();
 1752         simple_lock(&so_pendfree_slock);
 1753         somaxkva = new_somaxkva;
 1754         wakeup(&socurkva);
 1755         simple_unlock(&so_pendfree_slock);
 1756         splx(s);
 1757 
 1758         return (error);
 1759 }
 1760 
 1761 SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup")
 1762 {
 1763 
 1764         sysctl_createv(clog, 0, NULL, NULL,
 1765                        CTLFLAG_PERMANENT,
 1766                        CTLTYPE_NODE, "kern", NULL,
 1767                        NULL, 0, NULL, 0,
 1768                        CTL_KERN, CTL_EOL);
 1769 
 1770         sysctl_createv(clog, 0, NULL, NULL,
 1771                        CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 1772                        CTLTYPE_INT, "somaxkva",
 1773                        SYSCTL_DESCR("Maximum amount of kernel memory to be "
 1774                                     "used for socket buffers"),
 1775                        sysctl_kern_somaxkva, 0, NULL, 0,
 1776                        CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
 1777 }

Cache object: 2ce4b8fa8588285aebf742b51a7d1759


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.