The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_socket.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: uipc_socket.c,v 1.177.4.3 2009/05/03 13:18:55 bouyer Exp $     */
    2 
    3 /*-
    4  * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   29  * POSSIBILITY OF SUCH DAMAGE.
   30  */
   31 
   32 /*
   33  * Copyright (c) 2004 The FreeBSD Foundation
   34  * Copyright (c) 2004 Robert Watson
   35  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   36  *      The Regents of the University of California.  All rights reserved.
   37  *
   38  * Redistribution and use in source and binary forms, with or without
   39  * modification, are permitted provided that the following conditions
   40  * are met:
   41  * 1. Redistributions of source code must retain the above copyright
   42  *    notice, this list of conditions and the following disclaimer.
   43  * 2. Redistributions in binary form must reproduce the above copyright
   44  *    notice, this list of conditions and the following disclaimer in the
   45  *    documentation and/or other materials provided with the distribution.
   46  * 3. Neither the name of the University nor the names of its contributors
   47  *    may be used to endorse or promote products derived from this software
   48  *    without specific prior written permission.
   49  *
   50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   60  * SUCH DAMAGE.
   61  *
   62  *      @(#)uipc_socket.c       8.6 (Berkeley) 5/2/95
   63  */
   64 
   65 #include <sys/cdefs.h>
   66 __KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.177.4.3 2009/05/03 13:18:55 bouyer Exp $");
   67 
   68 #include "opt_sock_counters.h"
   69 #include "opt_sosend_loan.h"
   70 #include "opt_mbuftrace.h"
   71 #include "opt_somaxkva.h"
   72 #include "opt_multiprocessor.h" /* XXX */
   73 
   74 #include <sys/param.h>
   75 #include <sys/systm.h>
   76 #include <sys/proc.h>
   77 #include <sys/file.h>
   78 #include <sys/filedesc.h>
   79 #include <sys/kmem.h>
   80 #include <sys/mbuf.h>
   81 #include <sys/domain.h>
   82 #include <sys/kernel.h>
   83 #include <sys/protosw.h>
   84 #include <sys/socket.h>
   85 #include <sys/socketvar.h>
   86 #include <sys/signalvar.h>
   87 #include <sys/resourcevar.h>
   88 #include <sys/uidinfo.h>
   89 #include <sys/event.h>
   90 #include <sys/poll.h>
   91 #include <sys/kauth.h>
   92 #include <sys/mutex.h>
   93 #include <sys/condvar.h>
   94 
   95 #include <uvm/uvm.h>
   96 
   97 MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options");
   98 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
   99 
  100 extern const struct fileops socketops;
  101 
  102 extern int      somaxconn;                      /* patchable (XXX sysctl) */
  103 int             somaxconn = SOMAXCONN;
  104 kmutex_t        *softnet_lock;
  105 
  106 #ifdef SOSEND_COUNTERS
  107 #include <sys/device.h>
  108 
  109 static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  110     NULL, "sosend", "loan big");
  111 static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  112     NULL, "sosend", "copy big");
  113 static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  114     NULL, "sosend", "copy small");
  115 static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  116     NULL, "sosend", "kva limit");
  117 
  118 #define SOSEND_COUNTER_INCR(ev)         (ev)->ev_count++
  119 
  120 EVCNT_ATTACH_STATIC(sosend_loan_big);
  121 EVCNT_ATTACH_STATIC(sosend_copy_big);
  122 EVCNT_ATTACH_STATIC(sosend_copy_small);
  123 EVCNT_ATTACH_STATIC(sosend_kvalimit);
  124 #else
  125 
  126 #define SOSEND_COUNTER_INCR(ev)         /* nothing */
  127 
  128 #endif /* SOSEND_COUNTERS */
  129 
  130 static struct callback_entry sokva_reclaimerentry;
  131 
  132 #if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
  133 int sock_loan_thresh = -1;
  134 #else
  135 int sock_loan_thresh = 4096;
  136 #endif
  137 
  138 static kmutex_t so_pendfree_lock;
  139 static struct mbuf *so_pendfree;
  140 
  141 #ifndef SOMAXKVA
  142 #define SOMAXKVA (16 * 1024 * 1024)
  143 #endif
  144 int somaxkva = SOMAXKVA;
  145 static int socurkva;
  146 static kcondvar_t socurkva_cv;
  147 
  148 #define SOCK_LOAN_CHUNK         65536
  149 
  150 static size_t sodopendfree(void);
  151 static size_t sodopendfreel(void);
  152 
  153 static vsize_t
  154 sokvareserve(struct socket *so, vsize_t len)
  155 {
  156         int error;
  157 
  158         mutex_enter(&so_pendfree_lock);
  159         while (socurkva + len > somaxkva) {
  160                 size_t freed;
  161 
  162                 /*
  163                  * try to do pendfree.
  164                  */
  165 
  166                 freed = sodopendfreel();
  167 
  168                 /*
  169                  * if some kva was freed, try again.
  170                  */
  171 
  172                 if (freed)
  173                         continue;
  174 
  175                 SOSEND_COUNTER_INCR(&sosend_kvalimit);
  176                 error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock);
  177                 if (error) {
  178                         len = 0;
  179                         break;
  180                 }
  181         }
  182         socurkva += len;
  183         mutex_exit(&so_pendfree_lock);
  184         return len;
  185 }
  186 
  187 static void
  188 sokvaunreserve(vsize_t len)
  189 {
  190 
  191         mutex_enter(&so_pendfree_lock);
  192         socurkva -= len;
  193         cv_broadcast(&socurkva_cv);
  194         mutex_exit(&so_pendfree_lock);
  195 }
  196 
  197 /*
  198  * sokvaalloc: allocate kva for loan.
  199  */
  200 
  201 vaddr_t
  202 sokvaalloc(vsize_t len, struct socket *so)
  203 {
  204         vaddr_t lva;
  205 
  206         /*
  207          * reserve kva.
  208          */
  209 
  210         if (sokvareserve(so, len) == 0)
  211                 return 0;
  212 
  213         /*
  214          * allocate kva.
  215          */
  216 
  217         lva = uvm_km_alloc(kernel_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA);
  218         if (lva == 0) {
  219                 sokvaunreserve(len);
  220                 return (0);
  221         }
  222 
  223         return lva;
  224 }
  225 
  226 /*
  227  * sokvafree: free kva for loan.
  228  */
  229 
  230 void
  231 sokvafree(vaddr_t sva, vsize_t len)
  232 {
  233 
  234         /*
  235          * free kva.
  236          */
  237 
  238         uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);
  239 
  240         /*
  241          * unreserve kva.
  242          */
  243 
  244         sokvaunreserve(len);
  245 }
  246 
  247 static void
  248 sodoloanfree(struct vm_page **pgs, void *buf, size_t size)
  249 {
  250         vaddr_t sva, eva;
  251         vsize_t len;
  252         int npgs;
  253 
  254         KASSERT(pgs != NULL);
  255 
  256         eva = round_page((vaddr_t) buf + size);
  257         sva = trunc_page((vaddr_t) buf);
  258         len = eva - sva;
  259         npgs = len >> PAGE_SHIFT;
  260 
  261         pmap_kremove(sva, len);
  262         pmap_update(pmap_kernel());
  263         uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
  264         sokvafree(sva, len);
  265 }
  266 
  267 static size_t
  268 sodopendfree(void)
  269 {
  270         size_t rv;
  271 
  272         if (__predict_true(so_pendfree == NULL))
  273                 return 0;
  274 
  275         mutex_enter(&so_pendfree_lock);
  276         rv = sodopendfreel();
  277         mutex_exit(&so_pendfree_lock);
  278 
  279         return rv;
  280 }
  281 
  282 /*
  283  * sodopendfreel: free mbufs on "pendfree" list.
  284  * unlock and relock so_pendfree_lock when freeing mbufs.
  285  *
  286  * => called with so_pendfree_lock held.
  287  */
  288 
  289 static size_t
  290 sodopendfreel(void)
  291 {
  292         struct mbuf *m, *next;
  293         size_t rv = 0;
  294 
  295         KASSERT(mutex_owned(&so_pendfree_lock));
  296 
  297         while (so_pendfree != NULL) {
  298                 m = so_pendfree;
  299                 so_pendfree = NULL;
  300                 mutex_exit(&so_pendfree_lock);
  301 
  302                 for (; m != NULL; m = next) {
  303                         next = m->m_next;
  304                         KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0);
  305                         KASSERT(m->m_ext.ext_refcnt == 0);
  306 
  307                         rv += m->m_ext.ext_size;
  308                         sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
  309                             m->m_ext.ext_size);
  310                         pool_cache_put(mb_cache, m);
  311                 }
  312 
  313                 mutex_enter(&so_pendfree_lock);
  314         }
  315 
  316         return (rv);
  317 }
  318 
  319 void
  320 soloanfree(struct mbuf *m, void *buf, size_t size, void *arg)
  321 {
  322 
  323         KASSERT(m != NULL);
  324 
  325         /*
  326          * postpone freeing mbuf.
  327          *
  328          * we can't do it in interrupt context
  329          * because we need to put kva back to kernel_map.
  330          */
  331 
  332         mutex_enter(&so_pendfree_lock);
  333         m->m_next = so_pendfree;
  334         so_pendfree = m;
  335         cv_broadcast(&socurkva_cv);
  336         mutex_exit(&so_pendfree_lock);
  337 }
  338 
  339 static long
  340 sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
  341 {
  342         struct iovec *iov = uio->uio_iov;
  343         vaddr_t sva, eva;
  344         vsize_t len;
  345         vaddr_t lva;
  346         int npgs, error;
  347         vaddr_t va;
  348         int i;
  349 
  350         if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
  351                 return (0);
  352 
  353         if (iov->iov_len < (size_t) space)
  354                 space = iov->iov_len;
  355         if (space > SOCK_LOAN_CHUNK)
  356                 space = SOCK_LOAN_CHUNK;
  357 
  358         eva = round_page((vaddr_t) iov->iov_base + space);
  359         sva = trunc_page((vaddr_t) iov->iov_base);
  360         len = eva - sva;
  361         npgs = len >> PAGE_SHIFT;
  362 
  363         KASSERT(npgs <= M_EXT_MAXPAGES);
  364 
  365         lva = sokvaalloc(len, so);
  366         if (lva == 0)
  367                 return 0;
  368 
  369         error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
  370             m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
  371         if (error) {
  372                 sokvafree(lva, len);
  373                 return (0);
  374         }
  375 
  376         for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
  377                 pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
  378                     VM_PROT_READ);
  379         pmap_update(pmap_kernel());
  380 
  381         lva += (vaddr_t) iov->iov_base & PAGE_MASK;
  382 
  383         MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so);
  384         m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
  385 
  386         uio->uio_resid -= space;
  387         /* uio_offset not updated, not set/used for write(2) */
  388         uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space;
  389         uio->uio_iov->iov_len -= space;
  390         if (uio->uio_iov->iov_len == 0) {
  391                 uio->uio_iov++;
  392                 uio->uio_iovcnt--;
  393         }
  394 
  395         return (space);
  396 }
  397 
  398 static int
  399 sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
  400 {
  401 
  402         KASSERT(ce == &sokva_reclaimerentry);
  403         KASSERT(obj == NULL);
  404 
  405         sodopendfree();
  406         if (!vm_map_starved_p(kernel_map)) {
  407                 return CALLBACK_CHAIN_ABORT;
  408         }
  409         return CALLBACK_CHAIN_CONTINUE;
  410 }
  411 
  412 struct mbuf *
  413 getsombuf(struct socket *so, int type)
  414 {
  415         struct mbuf *m;
  416 
  417         m = m_get(M_WAIT, type);
  418         MCLAIM(m, so->so_mowner);
  419         return m;
  420 }
  421 
  422 void
  423 soinit(void)
  424 {
  425 
  426         mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM);
  427         softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
  428         cv_init(&socurkva_cv, "sokva");
  429         soinit2();
  430 
  431         /* Set the initial adjusted socket buffer size. */
  432         if (sb_max_set(sb_max))
  433                 panic("bad initial sb_max value: %lu", sb_max);
  434 
  435         callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
  436             &sokva_reclaimerentry, NULL, sokva_reclaim_callback);
  437 }
  438 
  439 /*
  440  * Socket operation routines.
  441  * These routines are called by the routines in
  442  * sys_socket.c or from a system process, and
  443  * implement the semantics of socket operations by
  444  * switching out to the protocol specific routines.
  445  */
  446 /*ARGSUSED*/
  447 int
  448 socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l,
  449          struct socket *lockso)
  450 {
  451         const struct protosw    *prp;
  452         struct socket   *so;
  453         uid_t           uid;
  454         int             error;
  455         kmutex_t        *lock;
  456 
  457         error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
  458             KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type),
  459             KAUTH_ARG(proto));
  460         if (error != 0)
  461                 return error;
  462 
  463         if (proto)
  464                 prp = pffindproto(dom, proto, type);
  465         else
  466                 prp = pffindtype(dom, type);
  467         if (prp == NULL) {
  468                 /* no support for domain */
  469                 if (pffinddomain(dom) == 0)
  470                         return EAFNOSUPPORT;
  471                 /* no support for socket type */
  472                 if (proto == 0 && type != 0)
  473                         return EPROTOTYPE;
  474                 return EPROTONOSUPPORT;
  475         }
  476         if (prp->pr_usrreq == NULL)
  477                 return EPROTONOSUPPORT;
  478         if (prp->pr_type != type)
  479                 return EPROTOTYPE;
  480 
  481         so = soget(true);
  482         so->so_type = type;
  483         so->so_proto = prp;
  484         so->so_send = sosend;
  485         so->so_receive = soreceive;
  486 #ifdef MBUFTRACE
  487         so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
  488         so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
  489         so->so_mowner = &prp->pr_domain->dom_mowner;
  490 #endif
  491         uid = kauth_cred_geteuid(l->l_cred);
  492         so->so_uidinfo = uid_find(uid);
  493         so->so_egid = kauth_cred_getegid(l->l_cred);
  494         so->so_cpid = l->l_proc->p_pid;
  495         if (lockso != NULL) {
  496                 /* Caller wants us to share a lock. */
  497                 lock = lockso->so_lock;
  498                 so->so_lock = lock;
  499                 mutex_obj_hold(lock);
  500                 mutex_enter(lock);
  501         } else {
  502                 /* Lock assigned and taken during PRU_ATTACH. */
  503         }
  504         error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL,
  505             (struct mbuf *)(long)proto, NULL, l);
  506         KASSERT(solocked(so));
  507         if (error != 0) {
  508                 so->so_state |= SS_NOFDREF;
  509                 sofree(so);
  510                 return error;
  511         }
  512         sounlock(so);
  513         *aso = so;
  514         return 0;
  515 }
  516 
  517 /* On success, write file descriptor to fdout and return zero.  On
  518  * failure, return non-zero; *fdout will be undefined.
  519  */
  520 int
  521 fsocreate(int domain, struct socket **sop, int type, int protocol,
  522     struct lwp *l, int *fdout)
  523 {
  524         struct socket   *so;
  525         struct file     *fp;
  526         int             fd, error;
  527 
  528         if ((error = fd_allocfile(&fp, &fd)) != 0)
  529                 return (error);
  530         fp->f_flag = FREAD|FWRITE;
  531         fp->f_type = DTYPE_SOCKET;
  532         fp->f_ops = &socketops;
  533         error = socreate(domain, &so, type, protocol, l, NULL);
  534         if (error != 0) {
  535                 fd_abort(curproc, fp, fd);
  536         } else {
  537                 if (sop != NULL)
  538                         *sop = so;
  539                 fp->f_data = so;
  540                 fd_affix(curproc, fp, fd);
  541                 *fdout = fd;
  542         }
  543         return error;
  544 }
  545 
  546 int
  547 sobind(struct socket *so, struct mbuf *nam, struct lwp *l)
  548 {
  549         int     error;
  550 
  551         solock(so);
  552         error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, l);
  553         sounlock(so);
  554         return error;
  555 }
  556 
  557 int
  558 solisten(struct socket *so, int backlog, struct lwp *l)
  559 {
  560         int     error;
  561 
  562         solock(so);
  563         if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 
  564             SS_ISDISCONNECTING)) != 0) {
  565                 sounlock(so);
  566                 return (EOPNOTSUPP);
  567         }
  568         error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL,
  569             NULL, NULL, l);
  570         if (error != 0) {
  571                 sounlock(so);
  572                 return error;
  573         }
  574         if (TAILQ_EMPTY(&so->so_q))
  575                 so->so_options |= SO_ACCEPTCONN;
  576         if (backlog < 0)
  577                 backlog = 0;
  578         so->so_qlimit = min(backlog, somaxconn);
  579         sounlock(so);
  580         return 0;
  581 }
  582 
  583 void
  584 sofree(struct socket *so)
  585 {
  586         u_int refs;
  587 
  588         KASSERT(solocked(so));
  589 
  590         if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
  591                 sounlock(so);
  592                 return;
  593         }
  594         if (so->so_head) {
  595                 /*
  596                  * We must not decommission a socket that's on the accept(2)
  597                  * queue.  If we do, then accept(2) may hang after select(2)
  598                  * indicated that the listening socket was ready.
  599                  */
  600                 if (!soqremque(so, 0)) {
  601                         sounlock(so);
  602                         return;
  603                 }
  604         }
  605         if (so->so_rcv.sb_hiwat)
  606                 (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
  607                     RLIM_INFINITY);
  608         if (so->so_snd.sb_hiwat)
  609                 (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
  610                     RLIM_INFINITY);
  611         sbrelease(&so->so_snd, so);
  612         KASSERT(!cv_has_waiters(&so->so_cv));
  613         KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
  614         KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
  615         sorflush(so);
  616         refs = so->so_aborting; /* XXX */
  617         /* Remove acccept filter if one is present. */
  618         if (so->so_accf != NULL)
  619                 (void)accept_filt_clear(so);
  620         sounlock(so);
  621         if (refs == 0)          /* XXX */
  622                 soput(so);
  623 }
  624 
  625 /*
  626  * Close a socket on last file table reference removal.
  627  * Initiate disconnect if connected.
  628  * Free socket when disconnect complete.
  629  */
  630 int
  631 soclose(struct socket *so)
  632 {
  633         struct socket   *so2;
  634         int             error;
  635         int             error2;
  636 
  637         error = 0;
  638         solock(so);
  639         if (so->so_options & SO_ACCEPTCONN) {
  640                 for (;;) {
  641                         if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
  642                                 KASSERT(solocked2(so, so2));
  643                                 (void) soqremque(so2, 0);
  644                                 /* soabort drops the lock. */
  645                                 (void) soabort(so2);
  646                                 solock(so);
  647                                 continue;
  648                         }
  649                         if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
  650                                 KASSERT(solocked2(so, so2));
  651                                 (void) soqremque(so2, 1);
  652                                 /* soabort drops the lock. */
  653                                 (void) soabort(so2);
  654                                 solock(so);
  655                                 continue;
  656                         }
  657                         break;
  658                 }
  659         }
  660         if (so->so_pcb == 0)
  661                 goto discard;
  662         if (so->so_state & SS_ISCONNECTED) {
  663                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
  664                         error = sodisconnect(so);
  665                         if (error)
  666                                 goto drop;
  667                 }
  668                 if (so->so_options & SO_LINGER) {
  669                         if ((so->so_state & SS_ISDISCONNECTING) && so->so_nbio)
  670                                 goto drop;
  671                         while (so->so_state & SS_ISCONNECTED) {
  672                                 error = sowait(so, true, so->so_linger * hz);
  673                                 if (error)
  674                                         break;
  675                         }
  676                 }
  677         }
  678  drop:
  679         if (so->so_pcb) {
  680                 error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
  681                     NULL, NULL, NULL, NULL);
  682                 if (error == 0)
  683                         error = error2;
  684         }
  685  discard:
  686         if (so->so_state & SS_NOFDREF)
  687                 panic("soclose: NOFDREF");
  688         so->so_state |= SS_NOFDREF;
  689         sofree(so);
  690         return (error);
  691 }
  692 
  693 /*
  694  * Must be called with the socket locked..  Will return with it unlocked.
  695  */
  696 int
  697 soabort(struct socket *so)
  698 {
  699         u_int refs;
  700         int error;
  701         
  702         KASSERT(solocked(so));
  703         KASSERT(so->so_head == NULL);
  704 
  705         so->so_aborting++;              /* XXX */
  706         error = (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL,
  707             NULL, NULL, NULL);
  708         refs = --so->so_aborting;       /* XXX */
  709         if (error || (refs == 0)) {
  710                 sofree(so);
  711         } else {
  712                 sounlock(so);
  713         }
  714         return error;
  715 }
  716 
  717 int
  718 soaccept(struct socket *so, struct mbuf *nam)
  719 {
  720         int     error;
  721 
  722         KASSERT(solocked(so));
  723 
  724         error = 0;
  725         if ((so->so_state & SS_NOFDREF) == 0)
  726                 panic("soaccept: !NOFDREF");
  727         so->so_state &= ~SS_NOFDREF;
  728         if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
  729             (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
  730                 error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
  731                     NULL, nam, NULL, NULL);
  732         else
  733                 error = ECONNABORTED;
  734 
  735         return (error);
  736 }
  737 
  738 int
  739 soconnect(struct socket *so, struct mbuf *nam, struct lwp *l)
  740 {
  741         int             error;
  742 
  743         KASSERT(solocked(so));
  744 
  745         if (so->so_options & SO_ACCEPTCONN)
  746                 return (EOPNOTSUPP);
  747         /*
  748          * If protocol is connection-based, can only connect once.
  749          * Otherwise, if connected, try to disconnect first.
  750          * This allows user to disconnect by connecting to, e.g.,
  751          * a null address.
  752          */
  753         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
  754             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
  755             (error = sodisconnect(so))))
  756                 error = EISCONN;
  757         else
  758                 error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
  759                     NULL, nam, NULL, l);
  760         return (error);
  761 }
  762 
  763 int
  764 soconnect2(struct socket *so1, struct socket *so2)
  765 {
  766         int     error;
  767 
  768         KASSERT(solocked2(so1, so2));
  769 
  770         error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
  771             NULL, (struct mbuf *)so2, NULL, NULL);
  772         return (error);
  773 }
  774 
  775 int
  776 sodisconnect(struct socket *so)
  777 {
  778         int     error;
  779 
  780         KASSERT(solocked(so));
  781 
  782         if ((so->so_state & SS_ISCONNECTED) == 0) {
  783                 error = ENOTCONN;
  784         } else if (so->so_state & SS_ISDISCONNECTING) {
  785                 error = EALREADY;
  786         } else {
  787                 error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
  788                     NULL, NULL, NULL, NULL);
  789         }
  790         sodopendfree();
  791         return (error);
  792 }
  793 
  794 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
  795 /*
  796  * Send on a socket.
  797  * If send must go all at once and message is larger than
  798  * send buffering, then hard error.
  799  * Lock against other senders.
  800  * If must go all at once and not enough room now, then
  801  * inform user that this would block and do nothing.
  802  * Otherwise, if nonblocking, send as much as possible.
  803  * The data to be sent is described by "uio" if nonzero,
  804  * otherwise by the mbuf chain "top" (which must be null
  805  * if uio is not).  Data provided in mbuf chain must be small
  806  * enough to send all at once.
  807  *
  808  * Returns nonzero on error, timeout or signal; callers
  809  * must check for short counts if EINTR/ERESTART are returned.
  810  * Data and control buffers are freed on return.
  811  */
  812 int
  813 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
  814         struct mbuf *control, int flags, struct lwp *l)
  815 {
  816         struct mbuf     **mp, *m;
  817         struct proc     *p;
  818         long            space, len, resid, clen, mlen;
  819         int             error, s, dontroute, atomic;
  820 
  821         p = l->l_proc;
  822         sodopendfree();
  823         clen = 0;
  824 
  825         /*
  826          * solock() provides atomicity of access.  splsoftnet() prevents
  827          * protocol processing soft interrupts from interrupting us and
  828          * blocking (expensive).
  829          */
  830         s = splsoftnet();
  831         solock(so);
  832         atomic = sosendallatonce(so) || top;
  833         if (uio)
  834                 resid = uio->uio_resid;
  835         else
  836                 resid = top->m_pkthdr.len;
  837         /*
  838          * In theory resid should be unsigned.
  839          * However, space must be signed, as it might be less than 0
  840          * if we over-committed, and we must use a signed comparison
  841          * of space and resid.  On the other hand, a negative resid
  842          * causes us to loop sending 0-length segments to the protocol.
  843          */
  844         if (resid < 0) {
  845                 error = EINVAL;
  846                 goto out;
  847         }
  848         dontroute =
  849             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
  850             (so->so_proto->pr_flags & PR_ATOMIC);
  851         l->l_ru.ru_msgsnd++;
  852         if (control)
  853                 clen = control->m_len;
  854  restart:
  855         if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
  856                 goto out;
  857         do {
  858                 if (so->so_state & SS_CANTSENDMORE) {
  859                         error = EPIPE;
  860                         goto release;
  861                 }
  862                 if (so->so_error) {
  863                         error = so->so_error;
  864                         so->so_error = 0;
  865                         goto release;
  866                 }
  867                 if ((so->so_state & SS_ISCONNECTED) == 0) {
  868                         if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
  869                                 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
  870                                     !(resid == 0 && clen != 0)) {
  871                                         error = ENOTCONN;
  872                                         goto release;
  873                                 }
  874                         } else if (addr == 0) {
  875                                 error = EDESTADDRREQ;
  876                                 goto release;
  877                         }
  878                 }
  879                 space = sbspace(&so->so_snd);
  880                 if (flags & MSG_OOB)
  881                         space += 1024;
  882                 if ((atomic && resid > so->so_snd.sb_hiwat) ||
  883                     clen > so->so_snd.sb_hiwat) {
  884                         error = EMSGSIZE;
  885                         goto release;
  886                 }
  887                 if (space < resid + clen &&
  888                     (atomic || space < so->so_snd.sb_lowat || space < clen)) {
  889                         if (so->so_nbio) {
  890                                 error = EWOULDBLOCK;
  891                                 goto release;
  892                         }
  893                         sbunlock(&so->so_snd);
  894                         error = sbwait(&so->so_snd);
  895                         if (error)
  896                                 goto out;
  897                         goto restart;
  898                 }
  899                 mp = &top;
  900                 space -= clen;
  901                 do {
  902                         if (uio == NULL) {
  903                                 /*
  904                                  * Data is prepackaged in "top".
  905                                  */
  906                                 resid = 0;
  907                                 if (flags & MSG_EOR)
  908                                         top->m_flags |= M_EOR;
  909                         } else do {
  910                                 sounlock(so);
  911                                 splx(s);
  912                                 if (top == NULL) {
  913                                         m = m_gethdr(M_WAIT, MT_DATA);
  914                                         mlen = MHLEN;
  915                                         m->m_pkthdr.len = 0;
  916                                         m->m_pkthdr.rcvif = NULL;
  917                                 } else {
  918                                         m = m_get(M_WAIT, MT_DATA);
  919                                         mlen = MLEN;
  920                                 }
  921                                 MCLAIM(m, so->so_snd.sb_mowner);
  922                                 if (sock_loan_thresh >= 0 &&
  923                                     uio->uio_iov->iov_len >= sock_loan_thresh &&
  924                                     space >= sock_loan_thresh &&
  925                                     (len = sosend_loan(so, uio, m,
  926                                                        space)) != 0) {
  927                                         SOSEND_COUNTER_INCR(&sosend_loan_big);
  928                                         space -= len;
  929                                         goto have_data;
  930                                 }
  931                                 if (resid >= MINCLSIZE && space >= MCLBYTES) {
  932                                         SOSEND_COUNTER_INCR(&sosend_copy_big);
  933                                         m_clget(m, M_WAIT);
  934                                         if ((m->m_flags & M_EXT) == 0)
  935                                                 goto nopages;
  936                                         mlen = MCLBYTES;
  937                                         if (atomic && top == 0) {
  938                                                 len = lmin(MCLBYTES - max_hdr,
  939                                                     resid);
  940                                                 m->m_data += max_hdr;
  941                                         } else
  942                                                 len = lmin(MCLBYTES, resid);
  943                                         space -= len;
  944                                 } else {
  945  nopages:
  946                                         SOSEND_COUNTER_INCR(&sosend_copy_small);
  947                                         len = lmin(lmin(mlen, resid), space);
  948                                         space -= len;
  949                                         /*
  950                                          * For datagram protocols, leave room
  951                                          * for protocol headers in first mbuf.
  952                                          */
  953                                         if (atomic && top == 0 && len < mlen)
  954                                                 MH_ALIGN(m, len);
  955                                 }
  956                                 error = uiomove(mtod(m, void *), (int)len, uio);
  957  have_data:
  958                                 resid = uio->uio_resid;
  959                                 m->m_len = len;
  960                                 *mp = m;
  961                                 top->m_pkthdr.len += len;
  962                                 s = splsoftnet();
  963                                 solock(so);
  964                                 if (error != 0)
  965                                         goto release;
  966                                 mp = &m->m_next;
  967                                 if (resid <= 0) {
  968                                         if (flags & MSG_EOR)
  969                                                 top->m_flags |= M_EOR;
  970                                         break;
  971                                 }
  972                         } while (space > 0 && atomic);
  973 
  974                         if (so->so_state & SS_CANTSENDMORE) {
  975                                 error = EPIPE;
  976                                 goto release;
  977                         }
  978                         if (dontroute)
  979                                 so->so_options |= SO_DONTROUTE;
  980                         if (resid > 0)
  981                                 so->so_state |= SS_MORETOCOME;
  982                         error = (*so->so_proto->pr_usrreq)(so,
  983                             (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
  984                             top, addr, control, curlwp);
  985                         if (dontroute)
  986                                 so->so_options &= ~SO_DONTROUTE;
  987                         if (resid > 0)
  988                                 so->so_state &= ~SS_MORETOCOME;
  989                         clen = 0;
  990                         control = NULL;
  991                         top = NULL;
  992                         mp = &top;
  993                         if (error != 0)
  994                                 goto release;
  995                 } while (resid && space > 0);
  996         } while (resid);
  997 
  998  release:
  999         sbunlock(&so->so_snd);
 1000  out:
 1001         sounlock(so);
 1002         splx(s);
 1003         if (top)
 1004                 m_freem(top);
 1005         if (control)
 1006                 m_freem(control);
 1007         return (error);
 1008 }
 1009 
 1010 /*
 1011  * Re-establish the socket buffer's bookkeeping after the lead mbuf of
 1012  * its first record has been replaced or unlinked.  'nextrecord' is the
 1013  * caller's saved copy of the old sb->sb_mb->m_nextpkt and may be NULL;
 1014  * it is chained behind the current lead mbuf or, when no lead mbuf is
 1015  * left, promoted to be the first record.  The tail pointers are then
 1016  * brought in line.  The socket lock must be held (asserted below).
 1017  */
 1018 static void
 1019 sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
 1020 {
 1021         struct mbuf *head;
 1022 
 1023         KASSERT(solocked(sb->sb_so));
 1024 
 1025         /*
 1026          * Link 'nextrecord' after the surviving lead mbuf; with no
 1027          * lead mbuf left, 'nextrecord' becomes the head of the buffer.
 1028          */
 1029         head = sb->sb_mb;
 1030         if (head == NULL)
 1031                 sb->sb_mb = head = nextrecord;
 1032         else
 1033                 head->m_nextpkt = nextrecord;
 1034 
 1035         /*
 1036          * Same effect as SB_EMPTY_FIXUP for an empty buffer; in
 1037          * addition, a head with no following record is recorded as
 1038          * the last record.
 1039          */
 1040         if (head == NULL) {
 1041                 sb->sb_mbtail = NULL;
 1042                 sb->sb_lastrecord = NULL;
 1043         } else if (head->m_nextpkt == NULL)
 1044                 sb->sb_lastrecord = head;
 1045 }
 1046 
 1047 /*
 1048  * Implement receive operations on a socket.
 1049  * We depend on the way that records are added to the sockbuf
 1050  * by sbappend*.  In particular, each record (mbufs linked through m_next)
 1051  * must begin with an address if the protocol so specifies,
 1052  * followed by an optional mbuf or mbufs containing ancillary data,
 1053  * and then zero or more mbufs of data.
 1054  * In order to avoid blocking network interrupts for the entire time here,
 1055  * we splx() while doing the actual copy to user space.
 1056  * Although the sockbuf is locked, new data may still be appended,
 1057  * and thus we must maintain consistency of the sockbuf during that time.
 1058  *
 1059  * The caller may receive the data as a single mbuf chain by supplying
 1060  * an mbuf **mp0 for use in returning the chain.  The uio is then used
 1061  * only for the count in uio_resid.
       *
       * Arguments, as used by the code below:
       *   so       - socket to receive from.
       *   paddr    - if non-NULL, set to NULL on entry; for PR_ADDR
       *              protocols it receives the record's leading address
       *              mbuf (a copy of it when MSG_PEEK is set).
       *   uio      - dereferenced unconditionally, so it must be non-NULL;
       *              uio_resid is the number of bytes wanted and, when mp0
       *              is NULL, the data is copied out through it.
       *   mp0      - if non-NULL, set to NULL (outside the MSG_OOB path) and
       *              then filled with the received data mbufs.
       *   controlp - if non-NULL, set to NULL on entry and filled with the
       *              control mbufs; if NULL, control mbufs are freed.
       *   flagsp   - if non-NULL, supplies the MSG_* request flags (with
       *              MSG_EOR masked off); on the normal completion path the
       *              resulting flags are ORed back into it.
       *
       * Returns 0 on success, otherwise an errno-style error code (for
       * example ENOTCONN, EWOULDBLOCK, a pending so_error, or an error
       * from sblock(), sbwait() or uiomove()).
 1062  */
 1063 int
 1064 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
 1065         struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 1066 {
 1067         struct lwp *l = curlwp;
 1068         struct mbuf     *m, **mp, *mt;
 1069         int atomic, flags, len, error, s, offset, moff, type, orig_resid;
 1070         const struct protosw    *pr;
 1071         struct mbuf     *nextrecord;
              /* Set once the leading address mbuf has been taken off the record. */
 1072         int             mbuf_removed = 0;
 1073         const struct domain *dom;
 1074 
 1075         pr = so->so_proto;
 1076         atomic = pr->pr_flags & PR_ATOMIC;
 1077         dom = pr->pr_domain;
 1078         mp = mp0;
 1079         type = 0;
              /* Remembered so the end of the function can tell if anything moved. */
 1080         orig_resid = uio->uio_resid;
 1081 
 1082         if (paddr != NULL)
 1083                 *paddr = NULL;
 1084         if (controlp != NULL)
 1085                 *controlp = NULL;
 1086         if (flagsp != NULL)
 1087                 flags = *flagsp &~ MSG_EOR;
 1088         else
 1089                 flags = 0;
 1090 
              /*
               * NOTE(review): sodopendfree() is defined elsewhere; it is
               * skipped for MSG_DONTWAIT callers, presumably because it can
               * block -- confirm.
               */
 1091         if ((flags & MSG_DONTWAIT) == 0)
 1092                 sodopendfree();
 1093 
              /*
               * Out-of-band receive: have the protocol fill a freshly
               * allocated mbuf via PRU_RCVOOB and copy that chain to the
               * caller.  This path returns without this function looking at
               * so_rcv.
               */
 1094         if (flags & MSG_OOB) {
 1095                 m = m_get(M_WAIT, MT_DATA);
 1096                 solock(so);
                      /* The MSG_PEEK bit travels in the pointer-typed 4th argument. */
 1097                 error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
 1098                     (struct mbuf *)(long)(flags & MSG_PEEK), NULL, l);
 1099                 sounlock(so);
 1100                 if (error)
 1101                         goto bad;
                      /* Copy out mbuf by mbuf; m_free()'s result becomes the next m. */
 1102                 do {
 1103                         error = uiomove(mtod(m, void *),
 1104                             (int) min(uio->uio_resid, m->m_len), uio);
 1105                         m = m_free(m);
 1106                 } while (uio->uio_resid > 0 && error == 0 && m);
 1107  bad:
                      /* Free whatever part of the chain was not consumed above. */
 1108                 if (m != NULL)
 1109                         m_freem(m);
 1110                 return error;
 1111         }
 1112         if (mp != NULL)
 1113                 *mp = NULL;
 1114 
 1115         /*
 1116          * solock() provides atomicity of access.  splsoftnet() prevents
 1117          * protocol processing soft interrupts from interrupting us and
 1118          * blocking (expensive).
 1119          */
 1120         s = splsoftnet();
 1121         solock(so);
 1122         if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
 1123                 (*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, l);
 1124 
 1125  restart:
              /* On failure to take the receive sockbuf lock, undo solock/spl and bail. */
 1126         if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) {
 1127                 sounlock(so);
 1128                 splx(s);
 1129                 return error;
 1130         }
 1131 
 1132         m = so->so_rcv.sb_mb;
 1133         /*
 1134          * If we have less data than requested, block awaiting more
 1135          * (subject to any timeout) if:
 1136          *   1. the current count is less than the low water mark,
 1137          *   2. MSG_WAITALL is set, and it is possible to do the entire
 1138          *      receive operation at once if we block (resid <= hiwat), or
 1139          *   3. MSG_DONTWAIT is not set.
 1140          * If MSG_WAITALL is set but resid is larger than the receive buffer,
 1141          * we have to do the receive in sections, and thus risk returning
 1142          * a short count if a timeout or signal occurs after we start.
 1143          */
 1144         if (m == NULL ||
 1145             ((flags & MSG_DONTWAIT) == 0 &&
 1146              so->so_rcv.sb_cc < uio->uio_resid &&
 1147              (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
 1148               ((flags & MSG_WAITALL) &&
 1149                uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
 1150              m->m_nextpkt == NULL && !atomic)) {
 1151 #ifdef DIAGNOSTIC
 1152                 if (m == NULL && so->so_rcv.sb_cc)
 1153                         panic("receive 1");
 1154 #endif
                      /*
                       * A pending error is reported (and cleared unless
                       * peeking) only when no data is queued; queued data is
                       * delivered first.
                       */
 1155                 if (so->so_error) {
 1156                         if (m != NULL)
 1157                                 goto dontblock;
 1158                         error = so->so_error;
 1159                         if ((flags & MSG_PEEK) == 0)
 1160                                 so->so_error = 0;
 1161                         goto release;
 1162                 }
                      /* No more data can arrive: deliver what is queued, else return. */
 1163                 if (so->so_state & SS_CANTRCVMORE) {
 1164                         if (m != NULL)
 1165                                 goto dontblock;
 1166                         else
 1167                                 goto release;
 1168                 }
                      /*
                       * Do not wait if the first record already holds OOB data
                       * or an end-of-record mark; m is reset to the head of the
                       * buffer before jumping.
                       */
 1169                 for (; m != NULL; m = m->m_next)
 1170                         if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
 1171                                 m = so->so_rcv.sb_mb;
 1172                                 goto dontblock;
 1173                         }
 1174                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
 1175                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
 1176                         error = ENOTCONN;
 1177                         goto release;
 1178                 }
                      /* A zero-length request never blocks. */
 1179                 if (uio->uio_resid == 0)
 1180                         goto release;
 1181                 if (so->so_nbio || (flags & MSG_DONTWAIT)) {
 1182                         error = EWOULDBLOCK;
 1183                         goto release;
 1184                 }
 1185                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
 1186                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
                      /* Give up the sockbuf lock while sleeping, then start over. */
 1187                 sbunlock(&so->so_rcv);
 1188                 error = sbwait(&so->so_rcv);
 1189                 if (error != 0) {
 1190                         sounlock(so);
 1191                         splx(s);
 1192                         return error;
 1193                 }
 1194                 goto restart;
 1195         }
 1196  dontblock:
 1197         /*
 1198          * On entry here, m points to the first record of the socket buffer.
 1199          * From this point onward, we maintain 'nextrecord' as a cache of the
 1200          * pointer to the next record in the socket buffer.  We must keep the
 1201          * various socket buffer pointers and local stack versions of the
 1202          * pointers in sync, pushing out modifications before dropping the
 1203          * socket lock, and re-reading them when picking it up.
 1204          *
 1205          * Otherwise, we will race with the network stack appending new data
 1206          * or records onto the socket buffer by using inconsistent/stale
 1207          * versions of the field, possibly resulting in socket buffer
 1208          * corruption.
 1209          *
 1210          * By holding the high-level sblock(), we prevent simultaneous
 1211          * readers from pulling off the front of the socket buffer.
 1212          */
 1213         if (l != NULL)
 1214                 l->l_ru.ru_msgrcv++;
 1215         KASSERT(m == so->so_rcv.sb_mb);
 1216         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
 1217         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
 1218         nextrecord = m->m_nextpkt;
              /*
               * PR_ADDR protocols: the record starts with an address mbuf.
               * When peeking it is copied (if wanted) and skipped; otherwise
               * it is unlinked and either handed to the caller or freed.
               */
 1219         if (pr->pr_flags & PR_ADDR) {
 1220 #ifdef DIAGNOSTIC
 1221                 if (m->m_type != MT_SONAME)
 1222                         panic("receive 1a");
 1223 #endif
                      /*
                       * Zero orig_resid so the "nothing transferred" restart
                       * test at the end of the function cannot fire.
                       */
 1224                 orig_resid = 0;
 1225                 if (flags & MSG_PEEK) {
 1226                         if (paddr)
 1227                                 *paddr = m_copy(m, 0, m->m_len);
 1228                         m = m->m_next;
 1229                 } else {
 1230                         sbfree(&so->so_rcv, m);
 1231                         mbuf_removed = 1;
 1232                         if (paddr != NULL) {
 1233                                 *paddr = m;
 1234                                 so->so_rcv.sb_mb = m->m_next;
 1235                                 m->m_next = NULL;
 1236                                 m = so->so_rcv.sb_mb;
 1237                         } else {
 1238                                 MFREE(m, so->so_rcv.sb_mb);
 1239                                 m = so->so_rcv.sb_mb;
 1240                         }
 1241                         sbsync(&so->so_rcv, nextrecord);
 1242                 }
 1243         }
 1244 
 1245         /*
 1246          * Process one or more MT_CONTROL mbufs present before any data mbufs
 1247          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 1248          * just copy the data; if !MSG_PEEK, we call into the protocol to
 1249          * perform externalization (or freeing if controlp == NULL).
 1250          */
 1251         if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) {
                      /* cm heads a private list of unlinked control mbufs; cme is its tail. */
 1252                 struct mbuf *cm = NULL, *cmn;
 1253                 struct mbuf **cme = &cm;
 1254 
 1255                 do {
 1256                         if (flags & MSG_PEEK) {
 1257                                 if (controlp != NULL) {
 1258                                         *controlp = m_copy(m, 0, m->m_len);
 1259                                         controlp = &(*controlp)->m_next;
 1260                                 }
 1261                                 m = m->m_next;
 1262                         } else {
 1263                                 sbfree(&so->so_rcv, m);
 1264                                 so->so_rcv.sb_mb = m->m_next;
 1265                                 m->m_next = NULL;
 1266                                 *cme = m;
 1267                                 cme = &(*cme)->m_next;
 1268                                 m = so->so_rcv.sb_mb;
 1269                         }
 1270                 } while (m != NULL && m->m_type == MT_CONTROL);
 1271                 if ((flags & MSG_PEEK) == 0)
 1272                         sbsync(&so->so_rcv, nextrecord);
                      /* Hand each unlinked control mbuf to the caller, or dispose of it. */
 1273                 for (; cm != NULL; cm = cmn) {
 1274                         cmn = cm->m_next;
 1275                         cm->m_next = NULL;
 1276                         type = mtod(cm, struct cmsghdr *)->cmsg_type;
 1277                         if (controlp != NULL) {
                                      /*
                                       * The socket lock and spl are dropped
                                       * around the externalize callback.  Its
                                       * error is not tested here; a non-zero
                                       * value keeps the data loop below from
                                       * running (its condition includes
                                       * error == 0).
                                       */
 1278                                 if (dom->dom_externalize != NULL &&
 1279                                     type == SCM_RIGHTS) {
 1280                                         sounlock(so);
 1281                                         splx(s);
 1282                                         error = (*dom->dom_externalize)(cm, l);
 1283                                         s = splsoftnet();
 1284                                         solock(so);
 1285                                 }
 1286                                 *controlp = cm;
 1287                                 while (*controlp != NULL)
 1288                                         controlp = &(*controlp)->m_next;
 1289                         } else {
 1290                                 /*
 1291                                  * Dispose of any SCM_RIGHTS message that went
 1292                                  * through the read path rather than recv.
 1293                                  */
                                      /*
                                       * NOTE(review): unlike the externalize
                                       * path above, only the socket lock is
                                       * dropped here; the spl level is left
                                       * as it is.
                                       */
 1294                                 if (dom->dom_dispose != NULL &&
 1295                                     type == SCM_RIGHTS) {
 1296                                         sounlock(so);
 1297                                         (*dom->dom_dispose)(cm);
 1298                                         solock(so);
 1299                                 }
 1300                                 m_freem(cm);
 1301                         }
 1302                 }
                      /*
                       * Refresh the cached nextrecord from the socket buffer;
                       * the socket lock may have been dropped in the loop above.
                       */
 1303                 if (m != NULL)
 1304                         nextrecord = so->so_rcv.sb_mb->m_nextpkt;
 1305                 else
 1306                         nextrecord = so->so_rcv.sb_mb;
                      /* As above: suppress the restart test at the end. */
 1307                 orig_resid = 0;
 1308         }
 1309 
 1310         /* If m is non-NULL, we have some data to read. */
 1311         if (__predict_true(m != NULL)) {
 1312                 type = m->m_type;
 1313                 if (type == MT_OOBDATA)
 1314                         flags |= MSG_OOB;
 1315         }
 1316         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
 1317         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
 1318 
              /*
               * Main transfer loop.  Each pass moves at most the unread
               * remainder of the current mbuf.  moff is the read offset
               * within m and offset counts bytes passed over; both only
               * advance when peeking.
               */
 1319         moff = 0;
 1320         offset = 0;
 1321         while (m != NULL && uio->uio_resid > 0 && error == 0) {
                      /* Stop where OOB and ordinary data meet; the two are never mixed. */
 1322                 if (m->m_type == MT_OOBDATA) {
 1323                         if (type != MT_OOBDATA)
 1324                                 break;
 1325                 } else if (type == MT_OOBDATA)
 1326                         break;
 1327 #ifdef DIAGNOSTIC
 1328                 else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
 1329                         panic("receive 3");
 1330 #endif
 1331                 so->so_state &= ~SS_RCVATMARK;
                      /* Clamp len to the request, the OOB mark, and this mbuf. */
 1332                 len = uio->uio_resid;
 1333                 if (so->so_oobmark && len > so->so_oobmark - offset)
 1334                         len = so->so_oobmark - offset;
 1335                 if (len > m->m_len - moff)
 1336                         len = m->m_len - moff;
 1337                 /*
 1338                  * If mp is set, just pass back the mbufs.
 1339                  * Otherwise copy them out via the uio, then free.
 1340                  * Sockbuf must be consistent here (points to current mbuf,
 1341                  * it points to next record) when we drop priority;
 1342                  * we must note any additions to the sockbuf when we
 1343                  * block interrupts again.
 1344                  */
 1345                 if (mp == NULL) {
 1346                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
 1347                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
 1348                         sounlock(so);
 1349                         splx(s);
 1350                         error = uiomove(mtod(m, char *) + moff, (int)len, uio);
 1351                         s = splsoftnet();
 1352                         solock(so);
 1353                         if (error != 0) {
 1354                                 /*
 1355                                  * If any part of the record has been removed
 1356                                  * (such as the MT_SONAME mbuf, which will
 1357                                  * happen when PR_ADDR, and thus also
 1358                                  * PR_ATOMIC, is set), then drop the entire
 1359                                  * record to maintain the atomicity of the
 1360                                  * receive operation.
 1361                                  *
 1362                                  * This avoids a later panic("receive 1a")
 1363                                  * when compiled with DIAGNOSTIC.
 1364                                  */
 1365                                 if (m && mbuf_removed && atomic)
 1366                                         (void) sbdroprecord(&so->so_rcv);
 1367 
 1368                                 goto release;
 1369                         }
 1370                 } else
                              /* Data is returned as mbufs via mp; only account for the bytes. */
 1371                         uio->uio_resid -= len;
                      /* Case 1: the rest of this mbuf was consumed. */
 1372                 if (len == m->m_len - moff) {
 1373                         if (m->m_flags & M_EOR)
 1374                                 flags |= MSG_EOR;
 1375                         if (flags & MSG_PEEK) {
 1376                                 m = m->m_next;
 1377                                 moff = 0;
 1378                         } else {
 1379                                 nextrecord = m->m_nextpkt;
 1380                                 sbfree(&so->so_rcv, m);
 1381                                 if (mp) {
                                              /* Move m from the sockbuf onto the caller's chain. */
 1382                                         *mp = m;
 1383                                         mp = &m->m_next;
 1384                                         so->so_rcv.sb_mb = m = m->m_next;
 1385                                         *mp = NULL;
 1386                                 } else {
 1387                                         MFREE(m, so->so_rcv.sb_mb);
 1388                                         m = so->so_rcv.sb_mb;
 1389                                 }
 1390                                 /*
 1391                                  * If m != NULL, we also know that
 1392                                  * so->so_rcv.sb_mb != NULL.
 1393                                  */
 1394                                 KASSERT(so->so_rcv.sb_mb == m);
 1395                                 if (m) {
 1396                                         m->m_nextpkt = nextrecord;
 1397                                         if (nextrecord == NULL)
 1398                                                 so->so_rcv.sb_lastrecord = m;
 1399                                 } else {
 1400                                         so->so_rcv.sb_mb = nextrecord;
 1401                                         SB_EMPTY_FIXUP(&so->so_rcv);
 1402                                 }
 1403                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
 1404                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
 1405                         }
                      /* Case 2: partial mbuf while peeking -- just advance the offset. */
 1406                 } else if (flags & MSG_PEEK)
 1407                         moff += len;
                      /* Case 3: partial mbuf consumed -- trim it in place. */
 1408                 else {
 1409                         if (mp != NULL) {
                                      /*
                                       * Try a non-sleeping copy first; if that
                                       * fails, retry with M_WAIT with the
                                       * socket lock released (presumably
                                       * because M_WAIT may sleep -- confirm).
                                       */
 1410                                 mt = m_copym(m, 0, len, M_NOWAIT);
 1411                                 if (__predict_false(mt == NULL)) {
 1412                                         sounlock(so);
 1413                                         mt = m_copym(m, 0, len, M_WAIT);
 1414                                         solock(so);
 1415                                 }
 1416                                 *mp = mt;
 1417                         }
 1418                         m->m_data += len;
 1419                         m->m_len -= len;
 1420                         so->so_rcv.sb_cc -= len;
 1421                 }
                      /*
                       * Track progress towards the OOB mark: a real read moves
                       * the mark itself, a peek only moves the local offset.
                       * Reaching the mark ends the transfer.
                       */
 1422                 if (so->so_oobmark) {
 1423                         if ((flags & MSG_PEEK) == 0) {
 1424                                 so->so_oobmark -= len;
 1425                                 if (so->so_oobmark == 0) {
 1426                                         so->so_state |= SS_RCVATMARK;
 1427                                         break;
 1428                                 }
 1429                         } else {
 1430                                 offset += len;
 1431                                 if (offset == so->so_oobmark)
 1432                                         break;
 1433                         }
 1434                 }
 1435                 if (flags & MSG_EOR)
 1436                         break;
 1437                 /*
 1438                  * If the MSG_WAITALL flag is set (for non-atomic socket),
 1439                  * we must not quit until "uio->uio_resid == 0" or an error
 1440                  * termination.  If a signal/timeout occurs, return
 1441                  * with a short count but without error.
 1442                  * Keep sockbuf locked against other readers.
 1443                  */
 1444                 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
 1445                     !sosendallatonce(so) && !nextrecord) {
 1446                         if (so->so_error || so->so_state & SS_CANTRCVMORE)
 1447                                 break;
 1448                         /*
 1449                          * If we are peeking and the socket receive buffer is
 1450                          * full, stop since we can't get more data to peek at.
 1451                          */
 1452                         if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
 1453                                 break;
 1454                         /*
 1455                          * If we've drained the socket buffer, tell the
 1456                          * protocol in case it needs to do something to
 1457                          * get it filled again.
 1458                          */
 1459                         if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
 1460                                 (*pr->pr_usrreq)(so, PRU_RCVD,
 1461                                     NULL, (struct mbuf *)(long)flags, NULL, l);
 1462                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
 1463                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
 1464                         error = sbwait(&so->so_rcv);
                              /*
                               * Interrupted wait: release everything and report
                               * success with a short count.  Note that *flagsp
                               * is not updated on this path.
                               */
 1465                         if (error != 0) {
 1466                                 sbunlock(&so->so_rcv);
 1467                                 sounlock(so);
 1468                                 splx(s);
 1469                                 return 0;
 1470                         }
                              /* Pick up whatever was appended while we slept. */
 1471                         if ((m = so->so_rcv.sb_mb) != NULL)
 1472                                 nextrecord = m->m_nextpkt;
 1473                 }
 1474         }
 1475 
              /*
               * Atomic protocol with part of the record still unread: flag
               * the truncation and, unless peeking, discard the remainder.
               */
 1476         if (m && atomic) {
 1477                 flags |= MSG_TRUNC;
 1478                 if ((flags & MSG_PEEK) == 0)
 1479                         (void) sbdroprecord(&so->so_rcv);
 1480         }
 1481         if ((flags & MSG_PEEK) == 0) {
 1482                 if (m == NULL) {
 1483                         /*
 1484                          * First part is an inline SB_EMPTY_FIXUP().  Second
 1485                          * part makes sure sb_lastrecord is up-to-date if
 1486                          * there is still data in the socket buffer.
 1487                          */
 1488                         so->so_rcv.sb_mb = nextrecord;
 1489                         if (so->so_rcv.sb_mb == NULL) {
 1490                                 so->so_rcv.sb_mbtail = NULL;
 1491                                 so->so_rcv.sb_lastrecord = NULL;
 1492                         } else if (nextrecord->m_nextpkt == NULL)
 1493                                 so->so_rcv.sb_lastrecord = nextrecord;
 1494                 }
 1495                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
 1496                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
                      /* Notify protocols that asked for it (PR_WANTRCVD) that data was taken. */
 1497                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
 1498                         (*pr->pr_usrreq)(so, PRU_RCVD, NULL,
 1499                             (struct mbuf *)(long)flags, NULL, l);
 1500         }
              /*
               * Bytes were requested but none were transferred, no
               * end-of-record was seen and the receive side is still open:
               * drop the sockbuf lock and go back to wait for data.
               */
 1501         if (orig_resid == uio->uio_resid && orig_resid &&
 1502             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
 1503                 sbunlock(&so->so_rcv);
 1504                 goto restart;
 1505         }
 1506 
 1507         if (flagsp != NULL)
 1508                 *flagsp |= flags;
 1509  release:
 1510         sbunlock(&so->so_rcv);
 1511         sounlock(so);
 1512         splx(s);
 1513         return error;
 1514 }
 1515 
 1516 int
 1517 soshutdown(struct socket *so, int how)
 1518 {
 1519         const struct protosw    *pr;
 1520         int     error;
 1521 
 1522         KASSERT(solocked(so));
 1523 
 1524         pr = so->so_proto;
 1525         if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
 1526                 return (EINVAL);
 1527 
 1528         if (how == SHUT_RD || how == SHUT_RDWR) {
 1529                 sorflush(so);
 1530                 error = 0;
 1531         }
 1532         if (how == SHUT_WR || how == SHUT_RDWR)
 1533                 error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL,
 1534                     NULL, NULL, NULL);
 1535 
 1536         return error;
 1537 }
 1538 
 1539 int
 1540 sodrain(struct socket *so)
 1541 {
 1542         int error;
 1543 
 1544         solock(so);
 1545         so->so_state |= SS_ISDRAINING;
 1546         cv_broadcast(&so->so_cv);
 1547         error = soshutdown(so, SHUT_RDWR);
 1548         sounlock(so);
 1549 
 1550         return error;
 1551 }
 1552 
 1553 void
 1554 sorflush(struct socket *so)
 1555 {
 1556         struct sockbuf  *sb, asb;
 1557         const struct protosw    *pr;
 1558 
 1559         KASSERT(solocked(so));
 1560 
 1561         sb = &so->so_rcv;
 1562         pr = so->so_proto;
 1563         socantrcvmore(so);
 1564         sb->sb_flags |= SB_NOINTR;
 1565         (void )sblock(sb, M_WAITOK);
 1566         sbunlock(sb);
 1567         asb = *sb;
 1568         /*
 1569          * Clear most of the sockbuf structure, but leave some of the
 1570          * fields valid.
 1571          */
 1572         memset(&sb->sb_startzero, 0,
 1573             sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 1574         if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) {
 1575                 sounlock(so);
 1576                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
 1577                 solock(so);
 1578         }
 1579         sbrelease(&asb, so);
 1580 }
 1581 
 1582 /*
 1583  * internal set SOL_SOCKET options
 1584  */
 1585 static int
 1586 sosetopt1(struct socket *so, const struct sockopt *sopt)
 1587 {
 1588         int error, optval;
 1589         struct linger l;
 1590         struct timeval tv;
 1591 
 1592         switch (sopt->sopt_name) {
 1593 
 1594         case SO_ACCEPTFILTER:
 1595                 error = accept_filt_setopt(so, sopt);
 1596                 KASSERT(solocked(so));
 1597                 break;
 1598 
 1599         case SO_LINGER:
 1600                 error = sockopt_get(sopt, &l, sizeof(l));
 1601                 solock(so);
 1602                 if (error)
 1603                         break;
 1604                 if (l.l_linger < 0 || l.l_linger > USHRT_MAX ||
 1605                     l.l_linger > (INT_MAX / hz)) {
 1606                         error = EDOM;
 1607                         break;
 1608                 }
 1609                 so->so_linger = l.l_linger;
 1610                 if (l.l_onoff)
 1611                         so->so_options |= SO_LINGER;
 1612                 else
 1613                         so->so_options &= ~SO_LINGER;
 1614                 break;
 1615 
 1616         case SO_DEBUG:
 1617         case SO_KEEPALIVE:
 1618         case SO_DONTROUTE:
 1619         case SO_USELOOPBACK:
 1620         case SO_BROADCAST:
 1621         case SO_REUSEADDR:
 1622         case SO_REUSEPORT:
 1623         case SO_OOBINLINE:
 1624         case SO_TIMESTAMP:
 1625                 error = sockopt_getint(sopt, &optval);
 1626                 solock(so);
 1627                 if (error)
 1628                         break;
 1629                 if (optval)
 1630                         so->so_options |= sopt->sopt_name;
 1631                 else
 1632                         so->so_options &= ~sopt->sopt_name;
 1633                 break;
 1634 
 1635         case SO_SNDBUF:
 1636         case SO_RCVBUF:
 1637         case SO_SNDLOWAT:
 1638         case SO_RCVLOWAT:
 1639                 error = sockopt_getint(sopt, &optval);
 1640                 solock(so);
 1641                 if (error)
 1642                         break;
 1643 
 1644                 /*
 1645                  * Values < 1 make no sense for any of these
 1646                  * options, so disallow them.
 1647                  */
 1648                 if (optval < 1) {
 1649                         error = EINVAL;
 1650                         break;
 1651                 }
 1652 
 1653                 switch (sopt->sopt_name) {
 1654                 case SO_SNDBUF:
 1655                         if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) {
 1656                                 error = ENOBUFS;
 1657                                 break;
 1658                         }
 1659                         so->so_snd.sb_flags &= ~SB_AUTOSIZE;
 1660                         break;
 1661 
 1662                 case SO_RCVBUF:
 1663                         if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) {
 1664                                 error = ENOBUFS;
 1665                                 break;
 1666                         }
 1667                         so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 1668                         break;
 1669 
 1670                 /*
 1671                  * Make sure the low-water is never greater than
 1672                  * the high-water.
 1673                  */
 1674                 case SO_SNDLOWAT:
 1675                         if (optval > so->so_snd.sb_hiwat)
 1676                                 optval = so->so_snd.sb_hiwat;
 1677 
 1678                         so->so_snd.sb_lowat = optval;
 1679                         break;
 1680 
 1681                 case SO_RCVLOWAT:
 1682                         if (optval > so->so_rcv.sb_hiwat)
 1683                                 optval = so->so_rcv.sb_hiwat;
 1684 
 1685                         so->so_rcv.sb_lowat = optval;
 1686                         break;
 1687                 }
 1688                 break;
 1689 
 1690         case SO_SNDTIMEO:
 1691         case SO_RCVTIMEO:
 1692                 error = sockopt_get(sopt, &tv, sizeof(tv));
 1693                 solock(so);
 1694                 if (error)
 1695                         break;
 1696 
 1697                 if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
 1698                         error = EDOM;
 1699                         break;
 1700                 }
 1701 
 1702                 optval = tv.tv_sec * hz + tv.tv_usec / tick;
 1703                 if (optval == 0 && tv.tv_usec != 0)
 1704                         optval = 1;
 1705 
 1706                 switch (sopt->sopt_name) {
 1707                 case SO_SNDTIMEO:
 1708                         so->so_snd.sb_timeo = optval;
 1709                         break;
 1710                 case SO_RCVTIMEO:
 1711                         so->so_rcv.sb_timeo = optval;
 1712                         break;
 1713                 }
 1714                 break;
 1715 
 1716         default:
 1717                 solock(so);
 1718                 error = ENOPROTOOPT;
 1719                 break;
 1720         }
 1721         KASSERT(solocked(so));
 1722         return error;
 1723 }
 1724 
 1725 int
 1726 sosetopt(struct socket *so, struct sockopt *sopt)
 1727 {
 1728         int error, prerr;
 1729 
 1730         if (sopt->sopt_level == SOL_SOCKET) {
 1731                 error = sosetopt1(so, sopt);
 1732                 KASSERT(solocked(so));
 1733         } else {
 1734                 error = ENOPROTOOPT;
 1735                 solock(so);
 1736         }
 1737 
 1738         if ((error == 0 || error == ENOPROTOOPT) &&
 1739             so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) {
 1740                 /* give the protocol stack a shot */
 1741                 prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt);
 1742                 if (prerr == 0)
 1743                         error = 0;
 1744                 else if (prerr != ENOPROTOOPT)
 1745                         error = prerr;
 1746         }
 1747         sounlock(so);
 1748         return error;
 1749 }
 1750 
 1751 /*
 1752  * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
 1753  */
 1754 int
 1755 so_setsockopt(struct lwp *l, struct socket *so, int level, int name,
 1756     const void *val, size_t valsize)
 1757 {
 1758         struct sockopt sopt;
 1759         int error;
 1760 
 1761         KASSERT(valsize == 0 || val != NULL);
 1762 
 1763         sockopt_init(&sopt, level, name, valsize);
 1764         sockopt_set(&sopt, val, valsize);
 1765 
 1766         error = sosetopt(so, &sopt);
 1767 
 1768         sockopt_destroy(&sopt);
 1769 
 1770         return error;
 1771 }
 1772  
 1773 /*
 1774  * internal get SOL_SOCKET options
 1775  */
 1776 static int
 1777 sogetopt1(struct socket *so, struct sockopt *sopt)
 1778 {
 1779         int error, optval;
 1780         struct linger l;
 1781         struct timeval tv;
 1782 
 1783         switch (sopt->sopt_name) {
 1784 
 1785         case SO_ACCEPTFILTER:
 1786                 error = accept_filt_getopt(so, sopt);
 1787                 break;
 1788 
 1789         case SO_LINGER:
 1790                 l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0;
 1791                 l.l_linger = so->so_linger;
 1792 
 1793                 error = sockopt_set(sopt, &l, sizeof(l));
 1794                 break;
 1795 
 1796         case SO_USELOOPBACK:
 1797         case SO_DONTROUTE:
 1798         case SO_DEBUG:
 1799         case SO_KEEPALIVE:
 1800         case SO_REUSEADDR:
 1801         case SO_REUSEPORT:
 1802         case SO_BROADCAST:
 1803         case SO_OOBINLINE:
 1804         case SO_TIMESTAMP:
 1805                 error = sockopt_setint(sopt,
 1806                     (so->so_options & sopt->sopt_name) ? 1 : 0);
 1807                 break;
 1808 
 1809         case SO_TYPE:
 1810                 error = sockopt_setint(sopt, so->so_type);
 1811                 break;
 1812 
 1813         case SO_ERROR:
 1814                 error = sockopt_setint(sopt, so->so_error);
 1815                 so->so_error = 0;
 1816                 break;
 1817 
 1818         case SO_SNDBUF:
 1819                 error = sockopt_setint(sopt, so->so_snd.sb_hiwat);
 1820                 break;
 1821 
 1822         case SO_RCVBUF:
 1823                 error = sockopt_setint(sopt, so->so_rcv.sb_hiwat);
 1824                 break;
 1825 
 1826         case SO_SNDLOWAT:
 1827                 error = sockopt_setint(sopt, so->so_snd.sb_lowat);
 1828                 break;
 1829 
 1830         case SO_RCVLOWAT:
 1831                 error = sockopt_setint(sopt, so->so_rcv.sb_lowat);
 1832                 break;
 1833 
 1834         case SO_SNDTIMEO:
 1835         case SO_RCVTIMEO:
 1836                 optval = (sopt->sopt_name == SO_SNDTIMEO ?
 1837                      so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
 1838 
 1839                 tv.tv_sec = optval / hz;
 1840                 tv.tv_usec = (optval % hz) * tick;
 1841 
 1842                 error = sockopt_set(sopt, &tv, sizeof(tv));
 1843                 break;
 1844 
 1845         case SO_OVERFLOWED:
 1846                 error = sockopt_setint(sopt, so->so_rcv.sb_overflowed);
 1847                 break;
 1848 
 1849         default:
 1850                 error = ENOPROTOOPT;
 1851                 break;
 1852         }
 1853 
 1854         return (error);
 1855 }
 1856 
 1857 int
 1858 sogetopt(struct socket *so, struct sockopt *sopt)
 1859 {
 1860         int             error;
 1861 
 1862         solock(so);
 1863         if (sopt->sopt_level != SOL_SOCKET) {
 1864                 if (so->so_proto && so->so_proto->pr_ctloutput) {
 1865                         error = ((*so->so_proto->pr_ctloutput)
 1866                             (PRCO_GETOPT, so, sopt));
 1867                 } else
 1868                         error = (ENOPROTOOPT);
 1869         } else {
 1870                 error = sogetopt1(so, sopt);
 1871         }
 1872         sounlock(so);
 1873         return (error);
 1874 }
 1875 
 1876 /*
 1877  * alloc sockopt data buffer buffer
 1878  *      - will be released at destroy
 1879  */
 1880 static int
 1881 sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag)
 1882 {
 1883 
 1884         KASSERT(sopt->sopt_size == 0);
 1885 
 1886         if (len > sizeof(sopt->sopt_buf)) {
 1887                 sopt->sopt_data = kmem_zalloc(len, kmflag);
 1888                 if (sopt->sopt_data == NULL)
 1889                         return ENOMEM;
 1890         } else
 1891                 sopt->sopt_data = sopt->sopt_buf;
 1892 
 1893         sopt->sopt_size = len;
 1894         return 0;
 1895 }
 1896 
 1897 /*
 1898  * initialise sockopt storage
 1899  *      - MAY sleep during allocation
 1900  */
 1901 void
 1902 sockopt_init(struct sockopt *sopt, int level, int name, size_t size)
 1903 {
 1904 
 1905         memset(sopt, 0, sizeof(*sopt));
 1906 
 1907         sopt->sopt_level = level;
 1908         sopt->sopt_name = name;
 1909         (void)sockopt_alloc(sopt, size, KM_SLEEP);
 1910 }
 1911 
 1912 /*
 1913  * destroy sockopt storage
 1914  *      - will release any held memory references
 1915  */
 1916 void
 1917 sockopt_destroy(struct sockopt *sopt)
 1918 {
 1919 
 1920         if (sopt->sopt_data != sopt->sopt_buf)
 1921                 kmem_free(sopt->sopt_data, sopt->sopt_size);
 1922 
 1923         memset(sopt, 0, sizeof(*sopt));
 1924 }
 1925 
 1926 /*
 1927  * set sockopt value
 1928  *      - value is copied into sockopt
 1929  *      - memory is allocated when necessary, will not sleep
 1930  */
 1931 int
 1932 sockopt_set(struct sockopt *sopt, const void *buf, size_t len)
 1933 {
 1934         int error;
 1935 
 1936         if (sopt->sopt_size == 0) {
 1937                 error = sockopt_alloc(sopt, len, KM_NOSLEEP);
 1938                 if (error)
 1939                         return error;
 1940         }
 1941 
 1942         KASSERT(sopt->sopt_size == len);
 1943         memcpy(sopt->sopt_data, buf, len);
 1944         return 0;
 1945 }
 1946 
 1947 /*
 1948  * common case of set sockopt integer value
 1949  */
 1950 int
 1951 sockopt_setint(struct sockopt *sopt, int val)
 1952 {
 1953 
 1954         return sockopt_set(sopt, &val, sizeof(int));
 1955 }
 1956 
 1957 /*
 1958  * get sockopt value
 1959  *      - correct size must be given
 1960  */
 1961 int
 1962 sockopt_get(const struct sockopt *sopt, void *buf, size_t len)
 1963 {
 1964 
 1965         if (sopt->sopt_size != len)
 1966                 return EINVAL;
 1967 
 1968         memcpy(buf, sopt->sopt_data, len);
 1969         return 0;
 1970 }
 1971 
 1972 /*
 1973  * common case of get sockopt integer value
 1974  */
 1975 int
 1976 sockopt_getint(const struct sockopt *sopt, int *valp)
 1977 {
 1978 
 1979         return sockopt_get(sopt, valp, sizeof(int));
 1980 }
 1981 
 1982 /*
 1983  * set sockopt value from mbuf
 1984  *      - ONLY for legacy code
 1985  *      - mbuf is released by sockopt
 1986  *      - will not sleep
 1987  */
 1988 int
 1989 sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m)
 1990 {
 1991         size_t len;
 1992         int error;
 1993 
 1994         len = m_length(m);
 1995 
 1996         if (sopt->sopt_size == 0) {
 1997                 error = sockopt_alloc(sopt, len, KM_NOSLEEP);
 1998                 if (error)
 1999                         return error;
 2000         }
 2001 
 2002         KASSERT(sopt->sopt_size == len);
 2003         m_copydata(m, 0, len, sopt->sopt_data);
 2004         m_freem(m);
 2005 
 2006         return 0;
 2007 }
 2008 
 2009 /*
 2010  * get sockopt value into mbuf
 2011  *      - ONLY for legacy code
 2012  *      - mbuf to be released by the caller
 2013  *      - will not sleep
 2014  */
 2015 struct mbuf *
 2016 sockopt_getmbuf(const struct sockopt *sopt)
 2017 {
 2018         struct mbuf *m;
 2019 
 2020         if (sopt->sopt_size > MCLBYTES)
 2021                 return NULL;
 2022 
 2023         m = m_get(M_DONTWAIT, MT_SOOPTS);
 2024         if (m == NULL)
 2025                 return NULL;
 2026 
 2027         if (sopt->sopt_size > MLEN) {
 2028                 MCLGET(m, M_DONTWAIT);
 2029                 if ((m->m_flags & M_EXT) == 0) {
 2030                         m_free(m);
 2031                         return NULL;
 2032                 }
 2033         }
 2034 
 2035         memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size);
 2036         m->m_len = sopt->sopt_size;
 2037 
 2038         return m;
 2039 }
 2040 
 2041 void
 2042 sohasoutofband(struct socket *so)
 2043 {
 2044 
 2045         fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
 2046         selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT);
 2047 }
 2048 
 2049 static void
 2050 filt_sordetach(struct knote *kn)
 2051 {
 2052         struct socket   *so;
 2053 
 2054         so = ((file_t *)kn->kn_obj)->f_data;
 2055         solock(so);
 2056         SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext);
 2057         if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist))
 2058                 so->so_rcv.sb_flags &= ~SB_KNOTE;
 2059         sounlock(so);
 2060 }
 2061 
 2062 /*ARGSUSED*/
 2063 static int
 2064 filt_soread(struct knote *kn, long hint)
 2065 {
 2066         struct socket   *so;
 2067         int rv;
 2068 
 2069         so = ((file_t *)kn->kn_obj)->f_data;
 2070         if (hint != NOTE_SUBMIT)
 2071                 solock(so);
 2072         kn->kn_data = so->so_rcv.sb_cc;
 2073         if (so->so_state & SS_CANTRCVMORE) {
 2074                 kn->kn_flags |= EV_EOF;
 2075                 kn->kn_fflags = so->so_error;
 2076                 rv = 1;
 2077         } else if (so->so_error)        /* temporary udp error */
 2078                 rv = 1;
 2079         else if (kn->kn_sfflags & NOTE_LOWAT)
 2080                 rv = (kn->kn_data >= kn->kn_sdata);
 2081         else 
 2082                 rv = (kn->kn_data >= so->so_rcv.sb_lowat);
 2083         if (hint != NOTE_SUBMIT)
 2084                 sounlock(so);
 2085         return rv;
 2086 }
 2087 
 2088 static void
 2089 filt_sowdetach(struct knote *kn)
 2090 {
 2091         struct socket   *so;
 2092 
 2093         so = ((file_t *)kn->kn_obj)->f_data;
 2094         solock(so);
 2095         SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext);
 2096         if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist))
 2097                 so->so_snd.sb_flags &= ~SB_KNOTE;
 2098         sounlock(so);
 2099 }
 2100 
 2101 /*ARGSUSED*/
 2102 static int
 2103 filt_sowrite(struct knote *kn, long hint)
 2104 {
 2105         struct socket   *so;
 2106         int rv;
 2107 
 2108         so = ((file_t *)kn->kn_obj)->f_data;
 2109         if (hint != NOTE_SUBMIT)
 2110                 solock(so);
 2111         kn->kn_data = sbspace(&so->so_snd);
 2112         if (so->so_state & SS_CANTSENDMORE) {
 2113                 kn->kn_flags |= EV_EOF;
 2114                 kn->kn_fflags = so->so_error;
 2115                 rv = 1;
 2116         } else if (so->so_error)        /* temporary udp error */
 2117                 rv = 1;
 2118         else if (((so->so_state & SS_ISCONNECTED) == 0) &&
 2119             (so->so_proto->pr_flags & PR_CONNREQUIRED))
 2120                 rv = 0;
 2121         else if (kn->kn_sfflags & NOTE_LOWAT)
 2122                 rv = (kn->kn_data >= kn->kn_sdata);
 2123         else
 2124                 rv = (kn->kn_data >= so->so_snd.sb_lowat);
 2125         if (hint != NOTE_SUBMIT)
 2126                 sounlock(so);
 2127         return rv;
 2128 }
 2129 
 2130 /*ARGSUSED*/
 2131 static int
 2132 filt_solisten(struct knote *kn, long hint)
 2133 {
 2134         struct socket   *so;
 2135         int rv;
 2136 
 2137         so = ((file_t *)kn->kn_obj)->f_data;
 2138 
 2139         /*
 2140          * Set kn_data to number of incoming connections, not
 2141          * counting partial (incomplete) connections.
 2142          */
 2143         if (hint != NOTE_SUBMIT)
 2144                 solock(so);
 2145         kn->kn_data = so->so_qlen;
 2146         rv = (kn->kn_data > 0);
 2147         if (hint != NOTE_SUBMIT)
 2148                 sounlock(so);
 2149         return rv;
 2150 }
 2151 
 2152 static const struct filterops solisten_filtops =
 2153         { 1, NULL, filt_sordetach, filt_solisten };
 2154 static const struct filterops soread_filtops =
 2155         { 1, NULL, filt_sordetach, filt_soread };
 2156 static const struct filterops sowrite_filtops =
 2157         { 1, NULL, filt_sowdetach, filt_sowrite };
 2158 
 2159 int
 2160 soo_kqfilter(struct file *fp, struct knote *kn)
 2161 {
 2162         struct socket   *so;
 2163         struct sockbuf  *sb;
 2164 
 2165         so = ((file_t *)kn->kn_obj)->f_data;
 2166         solock(so);
 2167         switch (kn->kn_filter) {
 2168         case EVFILT_READ:
 2169                 if (so->so_options & SO_ACCEPTCONN)
 2170                         kn->kn_fop = &solisten_filtops;
 2171                 else
 2172                         kn->kn_fop = &soread_filtops;
 2173                 sb = &so->so_rcv;
 2174                 break;
 2175         case EVFILT_WRITE:
 2176                 kn->kn_fop = &sowrite_filtops;
 2177                 sb = &so->so_snd;
 2178                 break;
 2179         default:
 2180                 sounlock(so);
 2181                 return (EINVAL);
 2182         }
 2183         SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext);
 2184         sb->sb_flags |= SB_KNOTE;
 2185         sounlock(so);
 2186         return (0);
 2187 }
 2188 
 2189 static int
 2190 sodopoll(struct socket *so, int events)
 2191 {
 2192         int revents;
 2193 
 2194         revents = 0;
 2195 
 2196         if (events & (POLLIN | POLLRDNORM))
 2197                 if (soreadable(so))
 2198                         revents |= events & (POLLIN | POLLRDNORM);
 2199 
 2200         if (events & (POLLOUT | POLLWRNORM))
 2201                 if (sowritable(so))
 2202                         revents |= events & (POLLOUT | POLLWRNORM);
 2203 
 2204         if (events & (POLLPRI | POLLRDBAND))
 2205                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
 2206                         revents |= events & (POLLPRI | POLLRDBAND);
 2207 
 2208         return revents;
 2209 }
 2210 
 2211 int
 2212 sopoll(struct socket *so, int events)
 2213 {
 2214         int revents = 0;
 2215 
 2216 #ifndef DIAGNOSTIC
 2217         /*
 2218          * Do a quick, unlocked check in expectation that the socket
 2219          * will be ready for I/O.  Don't do this check if DIAGNOSTIC,
 2220          * as the solocked() assertions will fail.
 2221          */
 2222         if ((revents = sodopoll(so, events)) != 0)
 2223                 return revents;
 2224 #endif
 2225 
 2226         solock(so);
 2227         if ((revents = sodopoll(so, events)) == 0) {
 2228                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
 2229                         selrecord(curlwp, &so->so_rcv.sb_sel);
 2230                         so->so_rcv.sb_flags |= SB_NOTIFY;
 2231                 }
 2232 
 2233                 if (events & (POLLOUT | POLLWRNORM)) {
 2234                         selrecord(curlwp, &so->so_snd.sb_sel);
 2235                         so->so_snd.sb_flags |= SB_NOTIFY;
 2236                 }
 2237         }
 2238         sounlock(so);
 2239 
 2240         return revents;
 2241 }
 2242 
 2243 
 2244 #include <sys/sysctl.h>
 2245 
 2246 static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);
 2247 
 2248 /*
 2249  * sysctl helper routine for kern.somaxkva.  ensures that the given
 2250  * value is not too small.
 2251  * (XXX should we maybe make sure it's not too large as well?)
 2252  */
 2253 static int
 2254 sysctl_kern_somaxkva(SYSCTLFN_ARGS)
 2255 {
 2256         int error, new_somaxkva;
 2257         struct sysctlnode node;
 2258 
 2259         new_somaxkva = somaxkva;
 2260         node = *rnode;
 2261         node.sysctl_data = &new_somaxkva;
 2262         error = sysctl_lookup(SYSCTLFN_CALL(&node));
 2263         if (error || newp == NULL)
 2264                 return (error);
 2265 
 2266         if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
 2267                 return (EINVAL);
 2268 
 2269         mutex_enter(&so_pendfree_lock);
 2270         somaxkva = new_somaxkva;
 2271         cv_broadcast(&socurkva_cv);
 2272         mutex_exit(&so_pendfree_lock);
 2273 
 2274         return (error);
 2275 }
 2276 
 2277 SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup")
 2278 {
 2279 
 2280         sysctl_createv(clog, 0, NULL, NULL,
 2281                        CTLFLAG_PERMANENT,
 2282                        CTLTYPE_NODE, "kern", NULL,
 2283                        NULL, 0, NULL, 0,
 2284                        CTL_KERN, CTL_EOL);
 2285 
 2286         sysctl_createv(clog, 0, NULL, NULL,
 2287                        CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 2288                        CTLTYPE_INT, "somaxkva",
 2289                        SYSCTL_DESCR("Maximum amount of kernel memory to be "
 2290                                     "used for socket buffers"),
 2291                        sysctl_kern_somaxkva, 0, NULL, 0,
 2292                        CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
 2293 }

Cache object: 358963ef43d5800958f6a07c9e4d71d5


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.