The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_syscalls.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  *
    5  * sendfile(2) and related extensions:
    6  * Copyright (c) 1998, David Greenman. All rights reserved.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  * 4. Neither the name of the University nor the names of its contributors
   17  *    may be used to endorse or promote products derived from this software
   18  *    without specific prior written permission.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   30  * SUCH DAMAGE.
   31  *
   32  *      @(#)uipc_syscalls.c     8.4 (Berkeley) 2/21/94
   33  */
   34 
   35 #include <sys/cdefs.h>
   36 __FBSDID("$FreeBSD: releng/10.4/sys/kern/uipc_syscalls.c 321021 2017-07-15 17:28:03Z dchagin $");
   37 
   38 #include "opt_capsicum.h"
   39 #include "opt_inet.h"
   40 #include "opt_inet6.h"
   41 #include "opt_compat.h"
   42 #include "opt_ktrace.h"
   43 
   44 #include <sys/param.h>
   45 #include <sys/systm.h>
   46 #include <sys/capsicum.h>
   47 #include <sys/condvar.h>
   48 #include <sys/kernel.h>
   49 #include <sys/lock.h>
   50 #include <sys/mutex.h>
   51 #include <sys/sysproto.h>
   52 #include <sys/malloc.h>
   53 #include <sys/filedesc.h>
   54 #include <sys/event.h>
   55 #include <sys/proc.h>
   56 #include <sys/fcntl.h>
   57 #include <sys/file.h>
   58 #include <sys/filio.h>
   59 #include <sys/jail.h>
   60 #include <sys/mman.h>
   61 #include <sys/mount.h>
   62 #include <sys/mbuf.h>
   63 #include <sys/protosw.h>
   64 #include <sys/rwlock.h>
   65 #include <sys/sf_buf.h>
   66 #include <sys/sysent.h>
   67 #include <sys/socket.h>
   68 #include <sys/socketvar.h>
   69 #include <sys/signalvar.h>
   70 #include <sys/syscallsubr.h>
   71 #include <sys/sysctl.h>
   72 #include <sys/uio.h>
   73 #include <sys/vnode.h>
   74 #ifdef KTRACE
   75 #include <sys/ktrace.h>
   76 #endif
   77 #ifdef COMPAT_FREEBSD32
   78 #include <compat/freebsd32/freebsd32_util.h>
   79 #endif
   80 
   81 #include <net/vnet.h>
   82 
   83 #include <security/audit/audit.h>
   84 #include <security/mac/mac_framework.h>
   85 
   86 #include <vm/vm.h>
   87 #include <vm/vm_param.h>
   88 #include <vm/vm_object.h>
   89 #include <vm/vm_page.h>
   90 #include <vm/vm_pager.h>
   91 #include <vm/vm_kern.h>
   92 #include <vm/vm_extern.h>
   93 
   94 /*
   95  * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
   96  * and SOCK_NONBLOCK.
   97  */
   98 #define ACCEPT4_INHERIT 0x1
   99 #define ACCEPT4_COMPAT  0x2
  100 
  101 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
  102 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
  103 
  104 static int accept1(struct thread *td, int s, struct sockaddr *uname,
  105                    socklen_t *anamelen, int flags);
  106 static int do_sendfile(struct thread *td, struct sendfile_args *uap,
  107                    int compat);
  108 static int getsockname1(struct thread *td, struct getsockname_args *uap,
  109                         int compat);
  110 static int getpeername1(struct thread *td, struct getpeername_args *uap,
  111                         int compat);
  112 
  113 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
  114 
  115 /*
  116  * sendfile(2)-related variables and associated sysctls
  117  */
  118 static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
  119     "sendfile(2) tunables");
  120 static int sfreadahead = 1;
  121 SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
  122     &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
  123 
  124 
  125 static void
  126 sfstat_init(const void *unused)
  127 {
  128 
  129         COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
  130             M_WAITOK);
  131 }
  132 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
  133 
  134 static int
  135 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
  136 {
  137         struct sfstat s;
  138 
  139         COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
  140         if (req->newptr)
  141                 COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
  142         return (SYSCTL_OUT(req, &s, sizeof(s)));
  143 }
  144 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
  145     NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
  146 
  147 /*
  148  * Convert a user file descriptor to a kernel file entry and check if required
  149  * capability rights are present.
  150  * A reference on the file entry is held upon returning.
  151  */
  152 int
  153 getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp,
  154     struct file **fpp, u_int *fflagp)
  155 {
  156         struct file *fp;
  157         int error;
  158 
  159         error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, 0, &fp, NULL);
  160         if (error != 0)
  161                 return (error);
  162         if (fp->f_type != DTYPE_SOCKET) {
  163                 fdrop(fp, td);
  164                 return (ENOTSOCK);
  165         }
  166         if (fflagp != NULL)
  167                 *fflagp = fp->f_flag;
  168         *fpp = fp;
  169         return (0);
  170 }
  171 
  172 /*
  173  * System call interface to the socket abstraction.
  174  */
  175 #if defined(COMPAT_43)
  176 #define COMPAT_OLDSOCK
  177 #endif
  178 
  179 int
  180 sys_socket(td, uap)
  181         struct thread *td;
  182         struct socket_args /* {
  183                 int     domain;
  184                 int     type;
  185                 int     protocol;
  186         } */ *uap;
  187 {
  188         struct socket *so;
  189         struct file *fp;
  190         int fd, error, type, oflag, fflag;
  191 
  192         AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
  193 
  194         type = uap->type;
  195         oflag = 0;
  196         fflag = 0;
  197         if ((type & SOCK_CLOEXEC) != 0) {
  198                 type &= ~SOCK_CLOEXEC;
  199                 oflag |= O_CLOEXEC;
  200         }
  201         if ((type & SOCK_NONBLOCK) != 0) {
  202                 type &= ~SOCK_NONBLOCK;
  203                 fflag |= FNONBLOCK;
  204         }
  205 
  206 #ifdef MAC
  207         error = mac_socket_check_create(td->td_ucred, uap->domain, type,
  208             uap->protocol);
  209         if (error != 0)
  210                 return (error);
  211 #endif
  212         error = falloc(td, &fp, &fd, oflag);
  213         if (error != 0)
  214                 return (error);
  215         /* An extra reference on `fp' has been held for us by falloc(). */
  216         error = socreate(uap->domain, &so, type, uap->protocol,
  217             td->td_ucred, td);
  218         if (error != 0) {
  219                 fdclose(td, fp, fd);
  220         } else {
  221                 finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
  222                 if ((fflag & FNONBLOCK) != 0)
  223                         (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
  224                 td->td_retval[0] = fd;
  225         }
  226         fdrop(fp, td);
  227         return (error);
  228 }
  229 
  230 /* ARGSUSED */
  231 int
  232 sys_bind(td, uap)
  233         struct thread *td;
  234         struct bind_args /* {
  235                 int     s;
  236                 caddr_t name;
  237                 int     namelen;
  238         } */ *uap;
  239 {
  240         struct sockaddr *sa;
  241         int error;
  242 
  243         error = getsockaddr(&sa, uap->name, uap->namelen);
  244         if (error == 0) {
  245                 error = kern_bind(td, uap->s, sa);
  246                 free(sa, M_SONAME);
  247         }
  248         return (error);
  249 }
  250 
  251 static int
  252 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
  253 {
  254         struct socket *so;
  255         struct file *fp;
  256         cap_rights_t rights;
  257         int error;
  258 
  259         AUDIT_ARG_FD(fd);
  260         AUDIT_ARG_SOCKADDR(td, dirfd, sa);
  261         error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND),
  262             &fp, NULL);
  263         if (error != 0)
  264                 return (error);
  265         so = fp->f_data;
  266 #ifdef KTRACE
  267         if (KTRPOINT(td, KTR_STRUCT))
  268                 ktrsockaddr(sa);
  269 #endif
  270 #ifdef MAC
  271         error = mac_socket_check_bind(td->td_ucred, so, sa);
  272         if (error == 0) {
  273 #endif
  274                 if (dirfd == AT_FDCWD)
  275                         error = sobind(so, sa, td);
  276                 else
  277                         error = sobindat(dirfd, so, sa, td);
  278 #ifdef MAC
  279         }
  280 #endif
  281         fdrop(fp, td);
  282         return (error);
  283 }
  284 
  285 int
  286 kern_bind(struct thread *td, int fd, struct sockaddr *sa)
  287 {
  288 
  289         return (kern_bindat(td, AT_FDCWD, fd, sa));
  290 }
  291 
  292 /* ARGSUSED */
  293 int
  294 sys_bindat(td, uap)
  295         struct thread *td;
  296         struct bindat_args /* {
  297                 int     fd;
  298                 int     s;
  299                 caddr_t name;
  300                 int     namelen;
  301         } */ *uap;
  302 {
  303         struct sockaddr *sa;
  304         int error;
  305 
  306         error = getsockaddr(&sa, uap->name, uap->namelen);
  307         if (error == 0) {
  308                 error = kern_bindat(td, uap->fd, uap->s, sa);
  309                 free(sa, M_SONAME);
  310         }
  311         return (error);
  312 }
  313 
  314 /* ARGSUSED */
  315 int
  316 sys_listen(td, uap)
  317         struct thread *td;
  318         struct listen_args /* {
  319                 int     s;
  320                 int     backlog;
  321         } */ *uap;
  322 {
  323         struct socket *so;
  324         struct file *fp;
  325         cap_rights_t rights;
  326         int error;
  327 
  328         AUDIT_ARG_FD(uap->s);
  329         error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN),
  330             &fp, NULL);
  331         if (error == 0) {
  332                 so = fp->f_data;
  333 #ifdef MAC
  334                 error = mac_socket_check_listen(td->td_ucred, so);
  335                 if (error == 0)
  336 #endif
  337                         error = solisten(so, uap->backlog, td);
  338                 fdrop(fp, td);
  339         }
  340         return(error);
  341 }
  342 
  343 /*
  344  * accept1()
  345  */
  346 static int
  347 accept1(td, s, uname, anamelen, flags)
  348         struct thread *td;
  349         int s;
  350         struct sockaddr *uname;
  351         socklen_t *anamelen;
  352         int flags;
  353 {
  354         struct sockaddr *name;
  355         socklen_t namelen;
  356         struct file *fp;
  357         int error;
  358 
  359         if (uname == NULL)
  360                 return (kern_accept4(td, s, NULL, NULL, flags, NULL));
  361 
  362         error = copyin(anamelen, &namelen, sizeof (namelen));
  363         if (error != 0)
  364                 return (error);
  365 
  366         error = kern_accept4(td, s, &name, &namelen, flags, &fp);
  367 
  368         /*
  369          * return a namelen of zero for older code which might
  370          * ignore the return value from accept.
  371          */
  372         if (error != 0) {
  373                 (void) copyout(&namelen, anamelen, sizeof(*anamelen));
  374                 return (error);
  375         }
  376 
  377         if (error == 0 && uname != NULL) {
  378 #ifdef COMPAT_OLDSOCK
  379                 if (flags & ACCEPT4_COMPAT)
  380                         ((struct osockaddr *)name)->sa_family =
  381                             name->sa_family;
  382 #endif
  383                 error = copyout(name, uname, namelen);
  384         }
  385         if (error == 0)
  386                 error = copyout(&namelen, anamelen,
  387                     sizeof(namelen));
  388         if (error != 0)
  389                 fdclose(td, fp, td->td_retval[0]);
  390         fdrop(fp, td);
  391         free(name, M_SONAME);
  392         return (error);
  393 }
  394 
  395 int
  396 kern_accept(struct thread *td, int s, struct sockaddr **name,
  397     socklen_t *namelen, struct file **fp)
  398 {
  399         return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
  400 }
  401 
  402 int
  403 kern_accept4(struct thread *td, int s, struct sockaddr **name,
  404     socklen_t *namelen, int flags, struct file **fp)
  405 {
  406         struct file *headfp, *nfp = NULL;
  407         struct sockaddr *sa = NULL;
  408         struct socket *head, *so;
  409         cap_rights_t rights;
  410         u_int fflag;
  411         pid_t pgid;
  412         int error, fd, tmp;
  413 
  414         if (name != NULL)
  415                 *name = NULL;
  416 
  417         AUDIT_ARG_FD(s);
  418         error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT),
  419             &headfp, &fflag);
  420         if (error != 0)
  421                 return (error);
  422         head = headfp->f_data;
  423         if ((head->so_options & SO_ACCEPTCONN) == 0) {
  424                 error = EINVAL;
  425                 goto done;
  426         }
  427 #ifdef MAC
  428         error = mac_socket_check_accept(td->td_ucred, head);
  429         if (error != 0)
  430                 goto done;
  431 #endif
  432         error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
  433         if (error != 0)
  434                 goto done;
  435         ACCEPT_LOCK();
  436         if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
  437                 ACCEPT_UNLOCK();
  438                 error = EWOULDBLOCK;
  439                 goto noconnection;
  440         }
  441         while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
  442                 if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
  443                         head->so_error = ECONNABORTED;
  444                         break;
  445                 }
  446                 error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
  447                     "accept", 0);
  448                 if (error != 0) {
  449                         ACCEPT_UNLOCK();
  450                         goto noconnection;
  451                 }
  452         }
  453         if (head->so_error) {
  454                 error = head->so_error;
  455                 head->so_error = 0;
  456                 ACCEPT_UNLOCK();
  457                 goto noconnection;
  458         }
  459         so = TAILQ_FIRST(&head->so_comp);
  460         KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
  461         KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
  462 
  463         /*
  464          * Before changing the flags on the socket, we have to bump the
  465          * reference count.  Otherwise, if the protocol calls sofree(),
  466          * the socket will be released due to a zero refcount.
  467          */
  468         SOCK_LOCK(so);                  /* soref() and so_state update */
  469         soref(so);                      /* file descriptor reference */
  470 
  471         TAILQ_REMOVE(&head->so_comp, so, so_list);
  472         head->so_qlen--;
  473         if (flags & ACCEPT4_INHERIT)
  474                 so->so_state |= (head->so_state & SS_NBIO);
  475         else
  476                 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
  477         so->so_qstate &= ~SQ_COMP;
  478         so->so_head = NULL;
  479 
  480         SOCK_UNLOCK(so);
  481         ACCEPT_UNLOCK();
  482 
  483         /* An extra reference on `nfp' has been held for us by falloc(). */
  484         td->td_retval[0] = fd;
  485 
  486         /* connection has been removed from the listen queue */
  487         KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
  488 
  489         if (flags & ACCEPT4_INHERIT) {
  490                 pgid = fgetown(&head->so_sigio);
  491                 if (pgid != 0)
  492                         fsetown(pgid, &so->so_sigio);
  493         } else {
  494                 fflag &= ~(FNONBLOCK | FASYNC);
  495                 if (flags & SOCK_NONBLOCK)
  496                         fflag |= FNONBLOCK;
  497         }
  498 
  499         finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
  500         /* Sync socket nonblocking/async state with file flags */
  501         tmp = fflag & FNONBLOCK;
  502         (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
  503         tmp = fflag & FASYNC;
  504         (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
  505         sa = 0;
  506         error = soaccept(so, &sa);
  507         if (error != 0) {
  508                 /*
  509                  * return a namelen of zero for older code which might
  510                  * ignore the return value from accept.
  511                  */
  512                 if (name)
  513                         *namelen = 0;
  514                 goto noconnection;
  515         }
  516         if (sa == NULL) {
  517                 if (name)
  518                         *namelen = 0;
  519                 goto done;
  520         }
  521         AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
  522         if (name) {
  523                 /* check sa_len before it is destroyed */
  524                 if (*namelen > sa->sa_len)
  525                         *namelen = sa->sa_len;
  526 #ifdef KTRACE
  527                 if (KTRPOINT(td, KTR_STRUCT))
  528                         ktrsockaddr(sa);
  529 #endif
  530                 *name = sa;
  531                 sa = NULL;
  532         }
  533 noconnection:
  534         free(sa, M_SONAME);
  535 
  536         /*
  537          * close the new descriptor, assuming someone hasn't ripped it
  538          * out from under us.
  539          */
  540         if (error != 0)
  541                 fdclose(td, nfp, fd);
  542 
  543         /*
  544          * Release explicitly held references before returning.  We return
  545          * a reference on nfp to the caller on success if they request it.
  546          */
  547 done:
  548         if (fp != NULL) {
  549                 if (error == 0) {
  550                         *fp = nfp;
  551                         nfp = NULL;
  552                 } else
  553                         *fp = NULL;
  554         }
  555         if (nfp != NULL)
  556                 fdrop(nfp, td);
  557         fdrop(headfp, td);
  558         return (error);
  559 }
  560 
  561 int
  562 sys_accept(td, uap)
  563         struct thread *td;
  564         struct accept_args *uap;
  565 {
  566 
  567         return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
  568 }
  569 
  570 int
  571 sys_accept4(td, uap)
  572         struct thread *td;
  573         struct accept4_args *uap;
  574 {
  575 
  576         if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
  577                 return (EINVAL);
  578 
  579         return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
  580 }
  581 
  582 #ifdef COMPAT_OLDSOCK
  583 int
  584 oaccept(td, uap)
  585         struct thread *td;
  586         struct accept_args *uap;
  587 {
  588 
  589         return (accept1(td, uap->s, uap->name, uap->anamelen,
  590             ACCEPT4_INHERIT | ACCEPT4_COMPAT));
  591 }
  592 #endif /* COMPAT_OLDSOCK */
  593 
  594 /* ARGSUSED */
  595 int
  596 sys_connect(td, uap)
  597         struct thread *td;
  598         struct connect_args /* {
  599                 int     s;
  600                 caddr_t name;
  601                 int     namelen;
  602         } */ *uap;
  603 {
  604         struct sockaddr *sa;
  605         int error;
  606 
  607         error = getsockaddr(&sa, uap->name, uap->namelen);
  608         if (error == 0) {
  609                 error = kern_connect(td, uap->s, sa);
  610                 free(sa, M_SONAME);
  611         }
  612         return (error);
  613 }
  614 
  615 static int
  616 kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
  617 {
  618         struct socket *so;
  619         struct file *fp;
  620         cap_rights_t rights;
  621         int error, interrupted = 0;
  622 
  623         AUDIT_ARG_FD(fd);
  624         AUDIT_ARG_SOCKADDR(td, dirfd, sa);
  625         error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT),
  626             &fp, NULL);
  627         if (error != 0)
  628                 return (error);
  629         so = fp->f_data;
  630         if (so->so_state & SS_ISCONNECTING) {
  631                 error = EALREADY;
  632                 goto done1;
  633         }
  634 #ifdef KTRACE
  635         if (KTRPOINT(td, KTR_STRUCT))
  636                 ktrsockaddr(sa);
  637 #endif
  638 #ifdef MAC
  639         error = mac_socket_check_connect(td->td_ucred, so, sa);
  640         if (error != 0)
  641                 goto bad;
  642 #endif
  643         if (dirfd == AT_FDCWD)
  644                 error = soconnect(so, sa, td);
  645         else
  646                 error = soconnectat(dirfd, so, sa, td);
  647         if (error != 0)
  648                 goto bad;
  649         if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
  650                 error = EINPROGRESS;
  651                 goto done1;
  652         }
  653         SOCK_LOCK(so);
  654         while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
  655                 error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
  656                     "connec", 0);
  657                 if (error != 0) {
  658                         if (error == EINTR || error == ERESTART)
  659                                 interrupted = 1;
  660                         break;
  661                 }
  662         }
  663         if (error == 0) {
  664                 error = so->so_error;
  665                 so->so_error = 0;
  666         }
  667         SOCK_UNLOCK(so);
  668 bad:
  669         if (!interrupted)
  670                 so->so_state &= ~SS_ISCONNECTING;
  671         if (error == ERESTART)
  672                 error = EINTR;
  673 done1:
  674         fdrop(fp, td);
  675         return (error);
  676 }
  677 
  678 int
  679 kern_connect(struct thread *td, int fd, struct sockaddr *sa)
  680 {
  681 
  682         return (kern_connectat(td, AT_FDCWD, fd, sa));
  683 }
  684 
  685 /* ARGSUSED */
  686 int
  687 sys_connectat(td, uap)
  688         struct thread *td;
  689         struct connectat_args /* {
  690                 int     fd;
  691                 int     s;
  692                 caddr_t name;
  693                 int     namelen;
  694         } */ *uap;
  695 {
  696         struct sockaddr *sa;
  697         int error;
  698 
  699         error = getsockaddr(&sa, uap->name, uap->namelen);
  700         if (error == 0) {
  701                 error = kern_connectat(td, uap->fd, uap->s, sa);
  702                 free(sa, M_SONAME);
  703         }
  704         return (error);
  705 }
  706 
  707 int
  708 kern_socketpair(struct thread *td, int domain, int type, int protocol,
  709     int *rsv)
  710 {
  711         struct file *fp1, *fp2;
  712         struct socket *so1, *so2;
  713         int fd, error, oflag, fflag;
  714 
  715         AUDIT_ARG_SOCKET(domain, type, protocol);
  716 
  717         oflag = 0;
  718         fflag = 0;
  719         if ((type & SOCK_CLOEXEC) != 0) {
  720                 type &= ~SOCK_CLOEXEC;
  721                 oflag |= O_CLOEXEC;
  722         }
  723         if ((type & SOCK_NONBLOCK) != 0) {
  724                 type &= ~SOCK_NONBLOCK;
  725                 fflag |= FNONBLOCK;
  726         }
  727 #ifdef MAC
  728         /* We might want to have a separate check for socket pairs. */
  729         error = mac_socket_check_create(td->td_ucred, domain, type,
  730             protocol);
  731         if (error != 0)
  732                 return (error);
  733 #endif
  734         error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
  735         if (error != 0)
  736                 return (error);
  737         error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
  738         if (error != 0)
  739                 goto free1;
  740         /* On success extra reference to `fp1' and 'fp2' is set by falloc. */
  741         error = falloc(td, &fp1, &fd, oflag);
  742         if (error != 0)
  743                 goto free2;
  744         rsv[0] = fd;
  745         fp1->f_data = so1;      /* so1 already has ref count */
  746         error = falloc(td, &fp2, &fd, oflag);
  747         if (error != 0)
  748                 goto free3;
  749         fp2->f_data = so2;      /* so2 already has ref count */
  750         rsv[1] = fd;
  751         error = soconnect2(so1, so2);
  752         if (error != 0)
  753                 goto free4;
  754         if (type == SOCK_DGRAM) {
  755                 /*
  756                  * Datagram socket connection is asymmetric.
  757                  */
  758                  error = soconnect2(so2, so1);
  759                  if (error != 0)
  760                         goto free4;
  761         }
  762         finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
  763             &socketops);
  764         finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
  765             &socketops);
  766         if ((fflag & FNONBLOCK) != 0) {
  767                 (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
  768                 (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
  769         }
  770         fdrop(fp1, td);
  771         fdrop(fp2, td);
  772         return (0);
  773 free4:
  774         fdclose(td, fp2, rsv[1]);
  775         fdrop(fp2, td);
  776 free3:
  777         fdclose(td, fp1, rsv[0]);
  778         fdrop(fp1, td);
  779 free2:
  780         if (so2 != NULL)
  781                 (void)soclose(so2);
  782 free1:
  783         if (so1 != NULL)
  784                 (void)soclose(so1);
  785         return (error);
  786 }
  787 
  788 int
  789 sys_socketpair(struct thread *td, struct socketpair_args *uap)
  790 {
  791         int error, sv[2];
  792 
  793         error = kern_socketpair(td, uap->domain, uap->type,
  794             uap->protocol, sv);
  795         if (error != 0)
  796                 return (error);
  797         error = copyout(sv, uap->rsv, 2 * sizeof(int));
  798         if (error != 0) {
  799                 (void)kern_close(td, sv[0]);
  800                 (void)kern_close(td, sv[1]);
  801         }
  802         return (error);
  803 }
  804 
  805 static int
  806 sendit(td, s, mp, flags)
  807         struct thread *td;
  808         int s;
  809         struct msghdr *mp;
  810         int flags;
  811 {
  812         struct mbuf *control;
  813         struct sockaddr *to;
  814         int error;
  815 
  816 #ifdef CAPABILITY_MODE
  817         if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
  818                 return (ECAPMODE);
  819 #endif
  820 
  821         if (mp->msg_name != NULL) {
  822                 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
  823                 if (error != 0) {
  824                         to = NULL;
  825                         goto bad;
  826                 }
  827                 mp->msg_name = to;
  828         } else {
  829                 to = NULL;
  830         }
  831 
  832         if (mp->msg_control) {
  833                 if (mp->msg_controllen < sizeof(struct cmsghdr)
  834 #ifdef COMPAT_OLDSOCK
  835                     && mp->msg_flags != MSG_COMPAT
  836 #endif
  837                 ) {
  838                         error = EINVAL;
  839                         goto bad;
  840                 }
  841                 error = sockargs(&control, mp->msg_control,
  842                     mp->msg_controllen, MT_CONTROL);
  843                 if (error != 0)
  844                         goto bad;
  845 #ifdef COMPAT_OLDSOCK
  846                 if (mp->msg_flags == MSG_COMPAT) {
  847                         struct cmsghdr *cm;
  848 
  849                         M_PREPEND(control, sizeof(*cm), M_WAITOK);
  850                         cm = mtod(control, struct cmsghdr *);
  851                         cm->cmsg_len = control->m_len;
  852                         cm->cmsg_level = SOL_SOCKET;
  853                         cm->cmsg_type = SCM_RIGHTS;
  854                 }
  855 #endif
  856         } else {
  857                 control = NULL;
  858         }
  859 
  860         error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
  861 
  862 bad:
  863         free(to, M_SONAME);
  864         return (error);
  865 }
  866 
  867 int
  868 kern_sendit(td, s, mp, flags, control, segflg)
  869         struct thread *td;
  870         int s;
  871         struct msghdr *mp;
  872         int flags;
  873         struct mbuf *control;
  874         enum uio_seg segflg;
  875 {
  876         struct file *fp;
  877         struct uio auio;
  878         struct iovec *iov;
  879         struct socket *so;
  880         cap_rights_t rights;
  881 #ifdef KTRACE
  882         struct uio *ktruio = NULL;
  883 #endif
  884         ssize_t len;
  885         int i, error;
  886 
  887         AUDIT_ARG_FD(s);
  888         cap_rights_init(&rights, CAP_SEND);
  889         if (mp->msg_name != NULL) {
  890                 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
  891                 cap_rights_set(&rights, CAP_CONNECT);
  892         }
  893         error = getsock_cap(td, s, &rights, &fp, NULL);
  894         if (error != 0)
  895                 return (error);
  896         so = (struct socket *)fp->f_data;
  897 
  898 #ifdef KTRACE
  899         if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
  900                 ktrsockaddr(mp->msg_name);
  901 #endif
  902 #ifdef MAC
  903         if (mp->msg_name != NULL) {
  904                 error = mac_socket_check_connect(td->td_ucred, so,
  905                     mp->msg_name);
  906                 if (error != 0)
  907                         goto bad;
  908         }
  909         error = mac_socket_check_send(td->td_ucred, so);
  910         if (error != 0)
  911                 goto bad;
  912 #endif
  913 
  914         auio.uio_iov = mp->msg_iov;
  915         auio.uio_iovcnt = mp->msg_iovlen;
  916         auio.uio_segflg = segflg;
  917         auio.uio_rw = UIO_WRITE;
  918         auio.uio_td = td;
  919         auio.uio_offset = 0;                    /* XXX */
  920         auio.uio_resid = 0;
  921         iov = mp->msg_iov;
  922         for (i = 0; i < mp->msg_iovlen; i++, iov++) {
  923                 if ((auio.uio_resid += iov->iov_len) < 0) {
  924                         error = EINVAL;
  925                         goto bad;
  926                 }
  927         }
  928 #ifdef KTRACE
  929         if (KTRPOINT(td, KTR_GENIO))
  930                 ktruio = cloneuio(&auio);
  931 #endif
  932         len = auio.uio_resid;
  933         error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
  934         if (error != 0) {
  935                 if (auio.uio_resid != len && (error == ERESTART ||
  936                     error == EINTR || error == EWOULDBLOCK))
  937                         error = 0;
  938                 /* Generation of SIGPIPE can be controlled per socket */
  939                 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
  940                     !(flags & MSG_NOSIGNAL)) {
  941                         PROC_LOCK(td->td_proc);
  942                         tdsignal(td, SIGPIPE);
  943                         PROC_UNLOCK(td->td_proc);
  944                 }
  945         }
  946         if (error == 0)
  947                 td->td_retval[0] = len - auio.uio_resid;
  948 #ifdef KTRACE
  949         if (ktruio != NULL) {
  950                 ktruio->uio_resid = td->td_retval[0];
  951                 ktrgenio(s, UIO_WRITE, ktruio, error);
  952         }
  953 #endif
  954 bad:
  955         fdrop(fp, td);
  956         return (error);
  957 }
  958 
  959 int
  960 sys_sendto(td, uap)
  961         struct thread *td;
  962         struct sendto_args /* {
  963                 int     s;
  964                 caddr_t buf;
  965                 size_t  len;
  966                 int     flags;
  967                 caddr_t to;
  968                 int     tolen;
  969         } */ *uap;
  970 {
  971         struct msghdr msg;
  972         struct iovec aiov;
  973 
  974         msg.msg_name = uap->to;
  975         msg.msg_namelen = uap->tolen;
  976         msg.msg_iov = &aiov;
  977         msg.msg_iovlen = 1;
  978         msg.msg_control = 0;
  979 #ifdef COMPAT_OLDSOCK
  980         msg.msg_flags = 0;
  981 #endif
  982         aiov.iov_base = uap->buf;
  983         aiov.iov_len = uap->len;
  984         return (sendit(td, uap->s, &msg, uap->flags));
  985 }
  986 
  987 #ifdef COMPAT_OLDSOCK
  988 int
  989 osend(td, uap)
  990         struct thread *td;
  991         struct osend_args /* {
  992                 int     s;
  993                 caddr_t buf;
  994                 int     len;
  995                 int     flags;
  996         } */ *uap;
  997 {
  998         struct msghdr msg;
  999         struct iovec aiov;
 1000 
 1001         msg.msg_name = 0;
 1002         msg.msg_namelen = 0;
 1003         msg.msg_iov = &aiov;
 1004         msg.msg_iovlen = 1;
 1005         aiov.iov_base = uap->buf;
 1006         aiov.iov_len = uap->len;
 1007         msg.msg_control = 0;
 1008         msg.msg_flags = 0;
 1009         return (sendit(td, uap->s, &msg, uap->flags));
 1010 }
 1011 
 1012 int
 1013 osendmsg(td, uap)
 1014         struct thread *td;
 1015         struct osendmsg_args /* {
 1016                 int     s;
 1017                 caddr_t msg;
 1018                 int     flags;
 1019         } */ *uap;
 1020 {
 1021         struct msghdr msg;
 1022         struct iovec *iov;
 1023         int error;
 1024 
 1025         error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
 1026         if (error != 0)
 1027                 return (error);
 1028         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 1029         if (error != 0)
 1030                 return (error);
 1031         msg.msg_iov = iov;
 1032         msg.msg_flags = MSG_COMPAT;
 1033         error = sendit(td, uap->s, &msg, uap->flags);
 1034         free(iov, M_IOV);
 1035         return (error);
 1036 }
 1037 #endif
 1038 
 1039 int
 1040 sys_sendmsg(td, uap)
 1041         struct thread *td;
 1042         struct sendmsg_args /* {
 1043                 int     s;
 1044                 caddr_t msg;
 1045                 int     flags;
 1046         } */ *uap;
 1047 {
 1048         struct msghdr msg;
 1049         struct iovec *iov;
 1050         int error;
 1051 
 1052         error = copyin(uap->msg, &msg, sizeof (msg));
 1053         if (error != 0)
 1054                 return (error);
 1055         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 1056         if (error != 0)
 1057                 return (error);
 1058         msg.msg_iov = iov;
 1059 #ifdef COMPAT_OLDSOCK
 1060         msg.msg_flags = 0;
 1061 #endif
 1062         error = sendit(td, uap->s, &msg, uap->flags);
 1063         free(iov, M_IOV);
 1064         return (error);
 1065 }
 1066 
 1067 int
 1068 kern_recvit(td, s, mp, fromseg, controlp)
 1069         struct thread *td;
 1070         int s;
 1071         struct msghdr *mp;
 1072         enum uio_seg fromseg;
 1073         struct mbuf **controlp;
 1074 {
 1075         struct uio auio;
 1076         struct iovec *iov;
 1077         struct mbuf *m, *control = NULL;
 1078         caddr_t ctlbuf;
 1079         struct file *fp;
 1080         struct socket *so;
 1081         struct sockaddr *fromsa = NULL;
 1082         cap_rights_t rights;
 1083 #ifdef KTRACE
 1084         struct uio *ktruio = NULL;
 1085 #endif
 1086         ssize_t len;
 1087         int error, i;
 1088 
 1089         if (controlp != NULL)
 1090                 *controlp = NULL;
 1091 
 1092         AUDIT_ARG_FD(s);
 1093         error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV),
 1094             &fp, NULL);
 1095         if (error != 0)
 1096                 return (error);
 1097         so = fp->f_data;
 1098 
 1099 #ifdef MAC
 1100         error = mac_socket_check_receive(td->td_ucred, so);
 1101         if (error != 0) {
 1102                 fdrop(fp, td);
 1103                 return (error);
 1104         }
 1105 #endif
 1106 
 1107         auio.uio_iov = mp->msg_iov;
 1108         auio.uio_iovcnt = mp->msg_iovlen;
 1109         auio.uio_segflg = UIO_USERSPACE;
 1110         auio.uio_rw = UIO_READ;
 1111         auio.uio_td = td;
 1112         auio.uio_offset = 0;                    /* XXX */
 1113         auio.uio_resid = 0;
 1114         iov = mp->msg_iov;
 1115         for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 1116                 if ((auio.uio_resid += iov->iov_len) < 0) {
 1117                         fdrop(fp, td);
 1118                         return (EINVAL);
 1119                 }
 1120         }
 1121 #ifdef KTRACE
 1122         if (KTRPOINT(td, KTR_GENIO))
 1123                 ktruio = cloneuio(&auio);
 1124 #endif
 1125         len = auio.uio_resid;
 1126         error = soreceive(so, &fromsa, &auio, NULL,
 1127             (mp->msg_control || controlp) ? &control : NULL,
 1128             &mp->msg_flags);
 1129         if (error != 0) {
 1130                 if (auio.uio_resid != len && (error == ERESTART ||
 1131                     error == EINTR || error == EWOULDBLOCK))
 1132                         error = 0;
 1133         }
 1134         if (fromsa != NULL)
 1135                 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
 1136 #ifdef KTRACE
 1137         if (ktruio != NULL) {
 1138                 ktruio->uio_resid = len - auio.uio_resid;
 1139                 ktrgenio(s, UIO_READ, ktruio, error);
 1140         }
 1141 #endif
 1142         if (error != 0)
 1143                 goto out;
 1144         td->td_retval[0] = len - auio.uio_resid;
 1145         if (mp->msg_name) {
 1146                 len = mp->msg_namelen;
 1147                 if (len <= 0 || fromsa == NULL)
 1148                         len = 0;
 1149                 else {
 1150                         /* save sa_len before it is destroyed by MSG_COMPAT */
 1151                         len = MIN(len, fromsa->sa_len);
 1152 #ifdef COMPAT_OLDSOCK
 1153                         if (mp->msg_flags & MSG_COMPAT)
 1154                                 ((struct osockaddr *)fromsa)->sa_family =
 1155                                     fromsa->sa_family;
 1156 #endif
 1157                         if (fromseg == UIO_USERSPACE) {
 1158                                 error = copyout(fromsa, mp->msg_name,
 1159                                     (unsigned)len);
 1160                                 if (error != 0)
 1161                                         goto out;
 1162                         } else
 1163                                 bcopy(fromsa, mp->msg_name, len);
 1164                 }
 1165                 mp->msg_namelen = len;
 1166         }
 1167         if (mp->msg_control && controlp == NULL) {
 1168 #ifdef COMPAT_OLDSOCK
 1169                 /*
 1170                  * We assume that old recvmsg calls won't receive access
 1171                  * rights and other control info, esp. as control info
 1172                  * is always optional and those options didn't exist in 4.3.
 1173                  * If we receive rights, trim the cmsghdr; anything else
 1174                  * is tossed.
 1175                  */
 1176                 if (control && mp->msg_flags & MSG_COMPAT) {
 1177                         if (mtod(control, struct cmsghdr *)->cmsg_level !=
 1178                             SOL_SOCKET ||
 1179                             mtod(control, struct cmsghdr *)->cmsg_type !=
 1180                             SCM_RIGHTS) {
 1181                                 mp->msg_controllen = 0;
 1182                                 goto out;
 1183                         }
 1184                         control->m_len -= sizeof (struct cmsghdr);
 1185                         control->m_data += sizeof (struct cmsghdr);
 1186                 }
 1187 #endif
 1188                 len = mp->msg_controllen;
 1189                 m = control;
 1190                 mp->msg_controllen = 0;
 1191                 ctlbuf = mp->msg_control;
 1192 
 1193                 while (m && len > 0) {
 1194                         unsigned int tocopy;
 1195 
 1196                         if (len >= m->m_len)
 1197                                 tocopy = m->m_len;
 1198                         else {
 1199                                 mp->msg_flags |= MSG_CTRUNC;
 1200                                 tocopy = len;
 1201                         }
 1202 
 1203                         if ((error = copyout(mtod(m, caddr_t),
 1204                                         ctlbuf, tocopy)) != 0)
 1205                                 goto out;
 1206 
 1207                         ctlbuf += tocopy;
 1208                         len -= tocopy;
 1209                         m = m->m_next;
 1210                 }
 1211                 mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
 1212         }
 1213 out:
 1214         fdrop(fp, td);
 1215 #ifdef KTRACE
 1216         if (fromsa && KTRPOINT(td, KTR_STRUCT))
 1217                 ktrsockaddr(fromsa);
 1218 #endif
 1219         free(fromsa, M_SONAME);
 1220 
 1221         if (error == 0 && controlp != NULL)
 1222                 *controlp = control;
 1223         else  if (control)
 1224                 m_freem(control);
 1225 
 1226         return (error);
 1227 }
 1228 
 1229 static int
 1230 recvit(td, s, mp, namelenp)
 1231         struct thread *td;
 1232         int s;
 1233         struct msghdr *mp;
 1234         void *namelenp;
 1235 {
 1236         int error;
 1237 
 1238         error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
 1239         if (error != 0)
 1240                 return (error);
 1241         if (namelenp != NULL) {
 1242                 error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
 1243 #ifdef COMPAT_OLDSOCK
 1244                 if (mp->msg_flags & MSG_COMPAT)
 1245                         error = 0;      /* old recvfrom didn't check */
 1246 #endif
 1247         }
 1248         return (error);
 1249 }
 1250 
 1251 int
 1252 sys_recvfrom(td, uap)
 1253         struct thread *td;
 1254         struct recvfrom_args /* {
 1255                 int     s;
 1256                 caddr_t buf;
 1257                 size_t  len;
 1258                 int     flags;
 1259                 struct sockaddr * __restrict    from;
 1260                 socklen_t * __restrict fromlenaddr;
 1261         } */ *uap;
 1262 {
 1263         struct msghdr msg;
 1264         struct iovec aiov;
 1265         int error;
 1266 
 1267         if (uap->fromlenaddr) {
 1268                 error = copyin(uap->fromlenaddr,
 1269                     &msg.msg_namelen, sizeof (msg.msg_namelen));
 1270                 if (error != 0)
 1271                         goto done2;
 1272         } else {
 1273                 msg.msg_namelen = 0;
 1274         }
 1275         msg.msg_name = uap->from;
 1276         msg.msg_iov = &aiov;
 1277         msg.msg_iovlen = 1;
 1278         aiov.iov_base = uap->buf;
 1279         aiov.iov_len = uap->len;
 1280         msg.msg_control = 0;
 1281         msg.msg_flags = uap->flags;
 1282         error = recvit(td, uap->s, &msg, uap->fromlenaddr);
 1283 done2:
 1284         return (error);
 1285 }
 1286 
 1287 #ifdef COMPAT_OLDSOCK
 1288 int
 1289 orecvfrom(td, uap)
 1290         struct thread *td;
 1291         struct recvfrom_args *uap;
 1292 {
 1293 
 1294         uap->flags |= MSG_COMPAT;
 1295         return (sys_recvfrom(td, uap));
 1296 }
 1297 #endif
 1298 
 1299 #ifdef COMPAT_OLDSOCK
 1300 int
 1301 orecv(td, uap)
 1302         struct thread *td;
 1303         struct orecv_args /* {
 1304                 int     s;
 1305                 caddr_t buf;
 1306                 int     len;
 1307                 int     flags;
 1308         } */ *uap;
 1309 {
 1310         struct msghdr msg;
 1311         struct iovec aiov;
 1312 
 1313         msg.msg_name = 0;
 1314         msg.msg_namelen = 0;
 1315         msg.msg_iov = &aiov;
 1316         msg.msg_iovlen = 1;
 1317         aiov.iov_base = uap->buf;
 1318         aiov.iov_len = uap->len;
 1319         msg.msg_control = 0;
 1320         msg.msg_flags = uap->flags;
 1321         return (recvit(td, uap->s, &msg, NULL));
 1322 }
 1323 
 1324 /*
 1325  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
 1326  * overlays the new one, missing only the flags, and with the (old) access
 1327  * rights where the control fields are now.
 1328  */
 1329 int
 1330 orecvmsg(td, uap)
 1331         struct thread *td;
 1332         struct orecvmsg_args /* {
 1333                 int     s;
 1334                 struct  omsghdr *msg;
 1335                 int     flags;
 1336         } */ *uap;
 1337 {
 1338         struct msghdr msg;
 1339         struct iovec *iov;
 1340         int error;
 1341 
 1342         error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
 1343         if (error != 0)
 1344                 return (error);
 1345         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 1346         if (error != 0)
 1347                 return (error);
 1348         msg.msg_flags = uap->flags | MSG_COMPAT;
 1349         msg.msg_iov = iov;
 1350         error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
 1351         if (msg.msg_controllen && error == 0)
 1352                 error = copyout(&msg.msg_controllen,
 1353                     &uap->msg->msg_accrightslen, sizeof (int));
 1354         free(iov, M_IOV);
 1355         return (error);
 1356 }
 1357 #endif
 1358 
 1359 int
 1360 sys_recvmsg(td, uap)
 1361         struct thread *td;
 1362         struct recvmsg_args /* {
 1363                 int     s;
 1364                 struct  msghdr *msg;
 1365                 int     flags;
 1366         } */ *uap;
 1367 {
 1368         struct msghdr msg;
 1369         struct iovec *uiov, *iov;
 1370         int error;
 1371 
 1372         error = copyin(uap->msg, &msg, sizeof (msg));
 1373         if (error != 0)
 1374                 return (error);
 1375         error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 1376         if (error != 0)
 1377                 return (error);
 1378         msg.msg_flags = uap->flags;
 1379 #ifdef COMPAT_OLDSOCK
 1380         msg.msg_flags &= ~MSG_COMPAT;
 1381 #endif
 1382         uiov = msg.msg_iov;
 1383         msg.msg_iov = iov;
 1384         error = recvit(td, uap->s, &msg, NULL);
 1385         if (error == 0) {
 1386                 msg.msg_iov = uiov;
 1387                 error = copyout(&msg, uap->msg, sizeof(msg));
 1388         }
 1389         free(iov, M_IOV);
 1390         return (error);
 1391 }
 1392 
 1393 /* ARGSUSED */
 1394 int
 1395 sys_shutdown(td, uap)
 1396         struct thread *td;
 1397         struct shutdown_args /* {
 1398                 int     s;
 1399                 int     how;
 1400         } */ *uap;
 1401 {
 1402         struct socket *so;
 1403         struct file *fp;
 1404         cap_rights_t rights;
 1405         int error;
 1406 
 1407         AUDIT_ARG_FD(uap->s);
 1408         error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN),
 1409             &fp, NULL);
 1410         if (error == 0) {
 1411                 so = fp->f_data;
 1412                 error = soshutdown(so, uap->how);
 1413                 fdrop(fp, td);
 1414         }
 1415         return (error);
 1416 }
 1417 
 1418 /* ARGSUSED */
 1419 int
 1420 sys_setsockopt(td, uap)
 1421         struct thread *td;
 1422         struct setsockopt_args /* {
 1423                 int     s;
 1424                 int     level;
 1425                 int     name;
 1426                 caddr_t val;
 1427                 int     valsize;
 1428         } */ *uap;
 1429 {
 1430 
 1431         return (kern_setsockopt(td, uap->s, uap->level, uap->name,
 1432             uap->val, UIO_USERSPACE, uap->valsize));
 1433 }
 1434 
 1435 int
 1436 kern_setsockopt(td, s, level, name, val, valseg, valsize)
 1437         struct thread *td;
 1438         int s;
 1439         int level;
 1440         int name;
 1441         void *val;
 1442         enum uio_seg valseg;
 1443         socklen_t valsize;
 1444 {
 1445         struct socket *so;
 1446         struct file *fp;
 1447         struct sockopt sopt;
 1448         cap_rights_t rights;
 1449         int error;
 1450 
 1451         if (val == NULL && valsize != 0)
 1452                 return (EFAULT);
 1453         if ((int)valsize < 0)
 1454                 return (EINVAL);
 1455 
 1456         sopt.sopt_dir = SOPT_SET;
 1457         sopt.sopt_level = level;
 1458         sopt.sopt_name = name;
 1459         sopt.sopt_val = val;
 1460         sopt.sopt_valsize = valsize;
 1461         switch (valseg) {
 1462         case UIO_USERSPACE:
 1463                 sopt.sopt_td = td;
 1464                 break;
 1465         case UIO_SYSSPACE:
 1466                 sopt.sopt_td = NULL;
 1467                 break;
 1468         default:
 1469                 panic("kern_setsockopt called with bad valseg");
 1470         }
 1471 
 1472         AUDIT_ARG_FD(s);
 1473         error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT),
 1474             &fp, NULL);
 1475         if (error == 0) {
 1476                 so = fp->f_data;
 1477                 error = sosetopt(so, &sopt);
 1478                 fdrop(fp, td);
 1479         }
 1480         return(error);
 1481 }
 1482 
 1483 /* ARGSUSED */
 1484 int
 1485 sys_getsockopt(td, uap)
 1486         struct thread *td;
 1487         struct getsockopt_args /* {
 1488                 int     s;
 1489                 int     level;
 1490                 int     name;
 1491                 void * __restrict       val;
 1492                 socklen_t * __restrict avalsize;
 1493         } */ *uap;
 1494 {
 1495         socklen_t valsize;
 1496         int error;
 1497 
 1498         if (uap->val) {
 1499                 error = copyin(uap->avalsize, &valsize, sizeof (valsize));
 1500                 if (error != 0)
 1501                         return (error);
 1502         }
 1503 
 1504         error = kern_getsockopt(td, uap->s, uap->level, uap->name,
 1505             uap->val, UIO_USERSPACE, &valsize);
 1506 
 1507         if (error == 0)
 1508                 error = copyout(&valsize, uap->avalsize, sizeof (valsize));
 1509         return (error);
 1510 }
 1511 
 1512 /*
 1513  * Kernel version of getsockopt.
 1514  * optval can be a userland or userspace. optlen is always a kernel pointer.
 1515  */
 1516 int
 1517 kern_getsockopt(td, s, level, name, val, valseg, valsize)
 1518         struct thread *td;
 1519         int s;
 1520         int level;
 1521         int name;
 1522         void *val;
 1523         enum uio_seg valseg;
 1524         socklen_t *valsize;
 1525 {
 1526         struct socket *so;
 1527         struct file *fp;
 1528         struct sockopt sopt;
 1529         cap_rights_t rights;
 1530         int error;
 1531 
 1532         if (val == NULL)
 1533                 *valsize = 0;
 1534         if ((int)*valsize < 0)
 1535                 return (EINVAL);
 1536 
 1537         sopt.sopt_dir = SOPT_GET;
 1538         sopt.sopt_level = level;
 1539         sopt.sopt_name = name;
 1540         sopt.sopt_val = val;
 1541         sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
 1542         switch (valseg) {
 1543         case UIO_USERSPACE:
 1544                 sopt.sopt_td = td;
 1545                 break;
 1546         case UIO_SYSSPACE:
 1547                 sopt.sopt_td = NULL;
 1548                 break;
 1549         default:
 1550                 panic("kern_getsockopt called with bad valseg");
 1551         }
 1552 
 1553         AUDIT_ARG_FD(s);
 1554         error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT),
 1555             &fp, NULL);
 1556         if (error == 0) {
 1557                 so = fp->f_data;
 1558                 error = sogetopt(so, &sopt);
 1559                 *valsize = sopt.sopt_valsize;
 1560                 fdrop(fp, td);
 1561         }
 1562         return (error);
 1563 }
 1564 
 1565 /*
 1566  * getsockname1() - Get socket name.
 1567  */
 1568 /* ARGSUSED */
 1569 static int
 1570 getsockname1(td, uap, compat)
 1571         struct thread *td;
 1572         struct getsockname_args /* {
 1573                 int     fdes;
 1574                 struct sockaddr * __restrict asa;
 1575                 socklen_t * __restrict alen;
 1576         } */ *uap;
 1577         int compat;
 1578 {
 1579         struct sockaddr *sa;
 1580         socklen_t len;
 1581         int error;
 1582 
 1583         error = copyin(uap->alen, &len, sizeof(len));
 1584         if (error != 0)
 1585                 return (error);
 1586 
 1587         error = kern_getsockname(td, uap->fdes, &sa, &len);
 1588         if (error != 0)
 1589                 return (error);
 1590 
 1591         if (len != 0) {
 1592 #ifdef COMPAT_OLDSOCK
 1593                 if (compat)
 1594                         ((struct osockaddr *)sa)->sa_family = sa->sa_family;
 1595 #endif
 1596                 error = copyout(sa, uap->asa, (u_int)len);
 1597         }
 1598         free(sa, M_SONAME);
 1599         if (error == 0)
 1600                 error = copyout(&len, uap->alen, sizeof(len));
 1601         return (error);
 1602 }
 1603 
 1604 int
 1605 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
 1606     socklen_t *alen)
 1607 {
 1608         struct socket *so;
 1609         struct file *fp;
 1610         cap_rights_t rights;
 1611         socklen_t len;
 1612         int error;
 1613 
 1614         AUDIT_ARG_FD(fd);
 1615         error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME),
 1616             &fp, NULL);
 1617         if (error != 0)
 1618                 return (error);
 1619         so = fp->f_data;
 1620         *sa = NULL;
 1621         CURVNET_SET(so->so_vnet);
 1622         error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
 1623         CURVNET_RESTORE();
 1624         if (error != 0)
 1625                 goto bad;
 1626         if (*sa == NULL)
 1627                 len = 0;
 1628         else
 1629                 len = MIN(*alen, (*sa)->sa_len);
 1630         *alen = len;
 1631 #ifdef KTRACE
 1632         if (KTRPOINT(td, KTR_STRUCT))
 1633                 ktrsockaddr(*sa);
 1634 #endif
 1635 bad:
 1636         fdrop(fp, td);
 1637         if (error != 0 && *sa != NULL) {
 1638                 free(*sa, M_SONAME);
 1639                 *sa = NULL;
 1640         }
 1641         return (error);
 1642 }
 1643 
 1644 int
 1645 sys_getsockname(td, uap)
 1646         struct thread *td;
 1647         struct getsockname_args *uap;
 1648 {
 1649 
 1650         return (getsockname1(td, uap, 0));
 1651 }
 1652 
 1653 #ifdef COMPAT_OLDSOCK
 1654 int
 1655 ogetsockname(td, uap)
 1656         struct thread *td;
 1657         struct getsockname_args *uap;
 1658 {
 1659 
 1660         return (getsockname1(td, uap, 1));
 1661 }
 1662 #endif /* COMPAT_OLDSOCK */
 1663 
 1664 /*
 1665  * getpeername1() - Get name of peer for connected socket.
 1666  */
 1667 /* ARGSUSED */
 1668 static int
 1669 getpeername1(td, uap, compat)
 1670         struct thread *td;
 1671         struct getpeername_args /* {
 1672                 int     fdes;
 1673                 struct sockaddr * __restrict    asa;
 1674                 socklen_t * __restrict  alen;
 1675         } */ *uap;
 1676         int compat;
 1677 {
 1678         struct sockaddr *sa;
 1679         socklen_t len;
 1680         int error;
 1681 
 1682         error = copyin(uap->alen, &len, sizeof (len));
 1683         if (error != 0)
 1684                 return (error);
 1685 
 1686         error = kern_getpeername(td, uap->fdes, &sa, &len);
 1687         if (error != 0)
 1688                 return (error);
 1689 
 1690         if (len != 0) {
 1691 #ifdef COMPAT_OLDSOCK
 1692                 if (compat)
 1693                         ((struct osockaddr *)sa)->sa_family = sa->sa_family;
 1694 #endif
 1695                 error = copyout(sa, uap->asa, (u_int)len);
 1696         }
 1697         free(sa, M_SONAME);
 1698         if (error == 0)
 1699                 error = copyout(&len, uap->alen, sizeof(len));
 1700         return (error);
 1701 }
 1702 
 1703 int
 1704 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
 1705     socklen_t *alen)
 1706 {
 1707         struct socket *so;
 1708         struct file *fp;
 1709         cap_rights_t rights;
 1710         socklen_t len;
 1711         int error;
 1712 
 1713         AUDIT_ARG_FD(fd);
 1714         error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME),
 1715             &fp, NULL);
 1716         if (error != 0)
 1717                 return (error);
 1718         so = fp->f_data;
 1719         if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
 1720                 error = ENOTCONN;
 1721                 goto done;
 1722         }
 1723         *sa = NULL;
 1724         CURVNET_SET(so->so_vnet);
 1725         error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
 1726         CURVNET_RESTORE();
 1727         if (error != 0)
 1728                 goto bad;
 1729         if (*sa == NULL)
 1730                 len = 0;
 1731         else
 1732                 len = MIN(*alen, (*sa)->sa_len);
 1733         *alen = len;
 1734 #ifdef KTRACE
 1735         if (KTRPOINT(td, KTR_STRUCT))
 1736                 ktrsockaddr(*sa);
 1737 #endif
 1738 bad:
 1739         if (error != 0 && *sa != NULL) {
 1740                 free(*sa, M_SONAME);
 1741                 *sa = NULL;
 1742         }
 1743 done:
 1744         fdrop(fp, td);
 1745         return (error);
 1746 }
 1747 
 1748 int
 1749 sys_getpeername(td, uap)
 1750         struct thread *td;
 1751         struct getpeername_args *uap;
 1752 {
 1753 
 1754         return (getpeername1(td, uap, 0));
 1755 }
 1756 
 1757 #ifdef COMPAT_OLDSOCK
 1758 int
 1759 ogetpeername(td, uap)
 1760         struct thread *td;
 1761         struct ogetpeername_args *uap;
 1762 {
 1763 
 1764         /* XXX uap should have type `getpeername_args *' to begin with. */
 1765         return (getpeername1(td, (struct getpeername_args *)uap, 1));
 1766 }
 1767 #endif /* COMPAT_OLDSOCK */
 1768 
 1769 int
 1770 sockargs(mp, buf, buflen, type)
 1771         struct mbuf **mp;
 1772         caddr_t buf;
 1773         int buflen, type;
 1774 {
 1775         struct sockaddr *sa;
 1776         struct mbuf *m;
 1777         int error;
 1778 
 1779         if (buflen < 0)
 1780                 return (EINVAL);
 1781 
 1782         if (buflen > MLEN) {
 1783 #ifdef COMPAT_OLDSOCK
 1784                 if (type == MT_SONAME && buflen <= 112)
 1785                         buflen = MLEN;          /* unix domain compat. hack */
 1786                 else
 1787 #endif
 1788                         if (buflen > MCLBYTES)
 1789                                 return (EINVAL);
 1790         }
 1791         m = m_get2(buflen, M_WAITOK, type, 0);
 1792         m->m_len = buflen;
 1793         error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
 1794         if (error != 0)
 1795                 (void) m_free(m);
 1796         else {
 1797                 *mp = m;
 1798                 if (type == MT_SONAME) {
 1799                         sa = mtod(m, struct sockaddr *);
 1800 
 1801 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
 1802                         if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
 1803                                 sa->sa_family = sa->sa_len;
 1804 #endif
 1805                         sa->sa_len = buflen;
 1806                 }
 1807         }
 1808         return (error);
 1809 }
 1810 
 1811 int
 1812 getsockaddr(namp, uaddr, len)
 1813         struct sockaddr **namp;
 1814         caddr_t uaddr;
 1815         size_t len;
 1816 {
 1817         struct sockaddr *sa;
 1818         int error;
 1819 
 1820         if (len > SOCK_MAXADDRLEN)
 1821                 return (ENAMETOOLONG);
 1822         if (len < offsetof(struct sockaddr, sa_data[0]))
 1823                 return (EINVAL);
 1824         sa = malloc(len, M_SONAME, M_WAITOK);
 1825         error = copyin(uaddr, sa, len);
 1826         if (error != 0) {
 1827                 free(sa, M_SONAME);
 1828         } else {
 1829 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
 1830                 if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
 1831                         sa->sa_family = sa->sa_len;
 1832 #endif
 1833                 sa->sa_len = len;
 1834                 *namp = sa;
 1835         }
 1836         return (error);
 1837 }
 1838 
 1839 struct sendfile_sync {    /* completion state updated by sf_buf_mext() */
 1840         struct mtx      mtx;    /* held by sf_buf_mext() around the count update */
 1841         struct cv       cv;     /* signalled by sf_buf_mext() when count reaches 0 */
 1842         unsigned        count;  /* decremented once per sf_buf_mext() call; presumably outstanding sf_buf mbufs - TODO confirm against do_sendfile() */
 1843 };
 1844 
 1845 /*
 1846  * Detach mapped page and release resources back to the system.
       *
       * 'args' is the sf_buf: its page is looked up, the sf_buf is freed and
       * the page is unwired.  'addr', when not NULL, is the struct
       * sendfile_sync whose count is decremented.  'mb' is not used.
       * Always returns EXT_FREE_OK.
       *
       * vn_sendfile() hands this routine to m_extadd() and also calls it
       * directly, with addr == NULL, to undo an sf_buf allocation.
 1847  */
 1848 int
 1849 sf_buf_mext(struct mbuf *mb, void *addr, void *args)
 1850 {
 1851         vm_page_t m;
 1852         struct sendfile_sync *sfs;
 1853 
              /* Fetch the page first; the sf_buf is gone after sf_buf_free(). */
 1854         m = sf_buf_page(args);
 1855         sf_buf_free(args);
 1856         vm_page_lock(m);
 1857         vm_page_unwire(m, 0);
 1858         /*
 1859          * Check for the object going away on us. This can
 1860          * happen since we don't hold a reference to it.
 1861          * If so, we're responsible for freeing the page.
 1862          */
 1863         if (m->wire_count == 0 && m->object == NULL)
 1864                 vm_page_free(m);
 1865         vm_page_unlock(m);
              /* No sendfile_sync attached: nothing left to account for. */
 1866         if (addr == NULL)
 1867                 return (EXT_FREE_OK);
 1868         sfs = addr;
 1869         mtx_lock(&sfs->mtx);
 1870         KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
              /* Last outstanding buffer: wake the waiter in vn_sendfile(). */
 1871         if (--sfs->count == 0)
 1872                 cv_signal(&sfs->cv);
 1873         mtx_unlock(&sfs->mtx);
 1874         return (EXT_FREE_OK);
 1875 }
 1876 
 1877 /*
 1878  * sendfile(2)
 1879  *
 1880  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 1881  *       struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 1882  *
 1883  * Send a file specified by 'fd' and starting at 'offset' to a socket
 1884  * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
 1885  * 0.  Optionally add a header and/or trailer to the socket output.  If
 1886  * specified, write the total number of bytes sent into *sbytes.
       *
       * This entry point only forwards to do_sendfile() with compat == 0 and
       * returns its result.
 1887  */
 1888 int
 1889 sys_sendfile(struct thread *td, struct sendfile_args *uap)
 1890 {
 1891 
 1892         return (do_sendfile(td, uap, 0));
 1893 }
 1894 
      /*
       * Common implementation behind sys_sendfile() and freebsd4_sendfile().
       *
       * Rejects a negative uap->offset with EINVAL, copies in the optional
       * struct sf_hdtr and turns its header and trailer iovec arrays into
       * uios with copyinuio(), obtains the file for uap->fd with CAP_PREAD
       * rights and passes everything to fo_sendfile().  A non-zero 'compat'
       * makes SFK_COMPAT be passed as the kflags argument.
       *
       * Returns the first error encountered, otherwise the fo_sendfile()
       * result.  Both uio pointers start out NULL and are handed to free()
       * with M_IOV on every path that reaches the 'out' label.
       */
 1895 static int
 1896 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 1897 {
 1898         struct sf_hdtr hdtr;
 1899         struct uio *hdr_uio, *trl_uio;
 1900         struct file *fp;
 1901         cap_rights_t rights;
 1902         int error;
 1903 
 1904         /*
 1905          * File offset must not be negative.  If it goes beyond EOF
 1906          * we send only the header/trailer and no payload data.
 1907          */
 1908         if (uap->offset < 0)
 1909                 return (EINVAL);
 1910 
 1911         hdr_uio = trl_uio = NULL;
 1912 
              /* Headers and trailers are optional, individually and together. */
 1913         if (uap->hdtr != NULL) {
 1914                 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
 1915                 if (error != 0)
 1916                         goto out;
 1917                 if (hdtr.headers != NULL) {
 1918                         error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
 1919                         if (error != 0)
 1920                                 goto out;
 1921                 }
 1922                 if (hdtr.trailers != NULL) {
 1923                         error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
 1924                         if (error != 0)
 1925                                 goto out;
 1926 
 1927                 }
 1928         }
 1929 
 1930         AUDIT_ARG_FD(uap->fd);
 1931 
 1932         /*
 1933          * sendfile(2) can start at any offset within a file so we require
 1934          * CAP_READ+CAP_SEEK = CAP_PREAD.
 1935          */
 1936         if ((error = fget_read(td, uap->fd,
 1937             cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
 1938                 goto out;
 1939         }
 1940 
              /* The file reference is only needed for the duration of the call. */
 1941         error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
 1942             uap->nbytes, uap->sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
 1943         fdrop(fp, td);
 1944 
 1945 out:
 1946         free(hdr_uio, M_IOV);
 1947         free(trl_uio, M_IOV);
 1948         return (error);
 1949 }
 1950 
 1951 #ifdef COMPAT_FREEBSD4
      /*
       * FreeBSD 4 compatible sendfile entry point, built only with
       * COMPAT_FREEBSD4.  Copies each field of the old argument structure
       * into a struct sendfile_args and calls do_sendfile() with compat == 1,
       * which makes do_sendfile() pass SFK_COMPAT down.
       */
 1952 int
 1953 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
 1954 {
 1955         struct sendfile_args args;
 1956 
 1957         args.fd = uap->fd;
 1958         args.s = uap->s;
 1959         args.offset = uap->offset;
 1960         args.nbytes = uap->nbytes;
 1961         args.hdtr = uap->hdtr;
 1962         args.sbytes = uap->sbytes;
 1963         args.flags = uap->flags;
 1964 
 1965         return (do_sendfile(td, &args, 1));
 1966 }
 1967 #endif /* COMPAT_FREEBSD4 */
 1968 
      /*
       * Obtain the wired page of 'obj' that covers file offset 'off' and make
       * sure the 'xfsize' bytes starting at (off & PAGE_MASK) are valid.
       *
       * vp    - backing vnode, or NULL; selects between vn_rdwr() and the
       *         pager interface for bringing the page in.
       * nd    - if non-zero, no I/O is started for a page that is not valid
       *         and 'nd' itself is returned as the error.
       * bsize - used only to scale the sequential hint passed to vn_rdwr().
       * res   - receives the page on success.
       *
       * Returns 0 with *res set, 'nd', the vn_rdwr() error, or EIO when the
       * pager path fails.  On the error paths where the page still exists its
       * wiring is dropped again and it is freed if it is then unwired, has no
       * valid bits and is not busied.
       * The object lock is acquired and released inside this function.
       */
 1969 static int
 1970 sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
 1971     off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
 1972 {
 1973         vm_page_t m;
 1974         vm_pindex_t pindex;
 1975         ssize_t resid;
 1976         int error, readahead, rv;
 1977 
 1978         pindex = OFF_TO_IDX(off);
 1979         VM_OBJECT_WLOCK(obj);
              /*
               * With a vnode the page is grabbed with VM_ALLOC_NOBUSY |
               * VM_ALLOC_IGN_SBUSY; without one those flags are omitted and
               * each vp == NULL path below calls vm_page_xunbusy() on it.
               */
 1980         m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
 1981             VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
 1982 
 1983         /*
 1984          * Check if page is valid for what we need, otherwise initiate I/O.
 1985          *
 1986          * A non-zero nd argument prevents disk I/O; instead we
 1987          * return to the caller the value passed in nd.  In particular,
 1988          * if we already turned some pages into mbufs, nd == EAGAIN
 1989          * so that the main function sends those pages before we come
 1990          * here again and block.
 1991          */
 1992         if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
 1993                 if (vp == NULL)
 1994                         vm_page_xunbusy(m);
 1995                 VM_OBJECT_WUNLOCK(obj);
 1996                 *res = m;
 1997                 return (0);
 1998         } else if (nd != 0) {
 1999                 if (vp == NULL)
 2000                         vm_page_xunbusy(m);
 2001                 error = nd;
                      /* Jumps into the cleanup arm below; obj is still locked. */
 2002                 goto free_page;
 2003         }
 2004 
 2005         /*
 2006          * Get the page from backing store.
 2007          */
 2008         error = 0;
 2009         if (vp != NULL) {
                      /* The object lock is dropped around the vn_rdwr() call. */
 2010                 VM_OBJECT_WUNLOCK(obj);
 2011                 readahead = sfreadahead * MAXBSIZE;
 2012 
 2013                 /*
 2014                  * Use vn_rdwr() instead of the pager interface for
 2015                  * the vnode, to allow the read-ahead.
 2016                  *
 2017                  * XXXMAC: Because we don't have fp->f_cred here, we
 2018                  * pass in NOCRED.  This is probably wrong, but is
 2019                  * consistent with our original implementation.
 2020                  */
 2021                 error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
 2022                     UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
 2023                     bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
 2024                 SFSTAT_INC(sf_iocnt);
 2025                 VM_OBJECT_WLOCK(obj);
 2026         } else {
 2027                 if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
 2028                         rv = vm_pager_get_pages(obj, &m, 1, 0);
 2029                         SFSTAT_INC(sf_iocnt);
                              /* Look the page up again after the pager call. */
 2030                         m = vm_page_lookup(obj, pindex);
 2031                         if (m == NULL)
 2032                                 error = EIO;
 2033                         else if (rv != VM_PAGER_OK) {
 2034                                 vm_page_lock(m);
 2035                                 vm_page_free(m);
 2036                                 vm_page_unlock(m);
 2037                                 m = NULL;
 2038                                 error = EIO;
 2039                         }
 2040                 } else {
                              /* The pager has no copy: hand out a zeroed page. */
 2041                         pmap_zero_page(m);
 2042                         m->valid = VM_PAGE_BITS_ALL;
 2043                         m->dirty = 0;
 2044                 }
 2045                 if (m != NULL)
 2046                         vm_page_xunbusy(m);
 2047         }
 2048         if (error == 0) {
 2049                 *res = m;
 2050         } else if (m != NULL) {
 2051 free_page:
 2052                 vm_page_lock(m);
 2053                 vm_page_unwire(m, 0);
 2054 
 2055                 /*
 2056                  * See if anyone else might know about this page.  If
 2057                  * not and it is not valid, then free it.
 2058                  */
 2059                 if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
 2060                         vm_page_free(m);
 2061                 vm_page_unlock(m);
 2062         }
              /* On success the page must be wired and valid for the request. */
 2063         KASSERT(error != 0 || (m->wire_count > 0 &&
 2064             vm_page_is_valid(m, off & PAGE_MASK, xfsize)),
 2065             ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off,
 2066             xfsize));
 2067         VM_OBJECT_WUNLOCK(obj);
 2068         return (error);
 2069 }
 2070 
      /*
       * Resolve the file 'fp' to the VM object that backs it.
       *
       * Two descriptor types are accepted: DTYPE_VNODE, which must refer to
       * a VREG vnode that has a v_object, and DTYPE_SHM.  On success:
       *   *obj_res   - the object, with one extra reference taken here
       *                (vn_sendfile() drops it with vm_object_deallocate());
       *   *vp_res    - the vnode, or NULL for DTYPE_SHM;
       *   *shmfd_res - the shmfd, or NULL for DTYPE_VNODE;
       *   *obj_size  - va_size from VOP_GETATTR(), or shm_size;
       *   *bsize     - the mount's f_iosize for a vnode, 0 otherwise.
       *
       * Returns EINVAL for any other descriptor type, a non-regular vnode or
       * a vnode without an object, the VOP_GETATTR() error, or EBADF if the
       * object is marked OBJ_DEAD.  *obj_res is not written on failure.
       * The vnode is locked shared while it is examined and is unlocked
       * again before returning on every path.
       */
 2071 static int
 2072 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
 2073     struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
 2074     int *bsize)
 2075 {
 2076         struct vattr va;
 2077         vm_object_t obj;
 2078         struct vnode *vp;
 2079         struct shmfd *shmfd;
 2080         int error;
 2081 
 2082         vp = *vp_res = NULL;
 2083         obj = NULL;
 2084         shmfd = *shmfd_res = NULL;
 2085         *bsize = 0;
 2086 
 2087         /*
 2088          * The file descriptor must be a regular file and have a
 2089          * backing VM object.
 2090          */
 2091         if (fp->f_type == DTYPE_VNODE) {
 2092                 vp = fp->f_vnode;
 2093                 vn_lock(vp, LK_SHARED | LK_RETRY);
 2094                 if (vp->v_type != VREG) {
 2095                         error = EINVAL;
 2096                         goto out;
 2097                 }
 2098                 *bsize = vp->v_mount->mnt_stat.f_iosize;
 2099                 error = VOP_GETATTR(vp, &va, td->td_ucred);
 2100                 if (error != 0)
 2101                         goto out;
 2102                 *obj_size = va.va_size;
 2103                 obj = vp->v_object;
 2104                 if (obj == NULL) {
 2105                         error = EINVAL;
 2106                         goto out;
 2107                 }
 2108         } else if (fp->f_type == DTYPE_SHM) {
 2109                 error = 0;
 2110                 shmfd = fp->f_data;
 2111                 obj = shmfd->shm_object;
 2112                 *obj_size = shmfd->shm_size;
 2113         } else {
 2114                 error = EINVAL;
 2115                 goto out;
 2116         }
 2117 
 2118         VM_OBJECT_WLOCK(obj);
 2119         if ((obj->flags & OBJ_DEAD) != 0) {
 2120                 VM_OBJECT_WUNLOCK(obj);
 2121                 error = EBADF;
 2122                 goto out;
 2123         }
 2124 
 2125         /*
 2126          * Temporarily increase the backing VM object's reference
 2127          * count so that a forced reclamation of its vnode does not
 2128          * immediately destroy it.
 2129          */
 2130         vm_object_reference_locked(obj);
 2131         VM_OBJECT_WUNLOCK(obj);
 2132         *obj_res = obj;
 2133         *vp_res = vp;
 2134         *shmfd_res = shmfd;
 2135 
 2136 out:
              /* vp is non-NULL only on the DTYPE_VNODE path, which locked it. */
 2137         if (vp != NULL)
 2138                 VOP_UNLOCK(vp, 0);
 2139         return (error);
 2140 }
 2141 
      /*
       * Look up descriptor 's' as a socket with CAP_SEND rights and check
       * that it is usable as a sendfile target.
       *
       * Returns the getsock_cap() error, EINVAL if the socket is not of type
       * SOCK_STREAM, ENOTCONN if SS_ISCONNECTED is not set, or 0.
       *
       * Note that *sock_fp and *so are already filled in when EINVAL or
       * ENOTCONN is returned, so the file reference must still be dropped
       * in that case; vn_sendfile() does this whenever its 'so' is non-NULL.
       */
 2142 static int
 2143 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
 2144     struct socket **so)
 2145 {
 2146         cap_rights_t rights;
 2147         int error;
 2148 
 2149         *sock_fp = NULL;
 2150         *so = NULL;
 2151 
 2152         /*
 2153          * The socket must be a stream socket and connected.
 2154          */
 2155         error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND),
 2156             sock_fp, NULL);
 2157         if (error != 0)
 2158                 return (error);
 2159         *so = (*sock_fp)->f_data;
 2160         if ((*so)->so_type != SOCK_STREAM)
 2161                 return (EINVAL);
 2162         if (((*so)->so_state & SS_ISCONNECTED) == 0)
 2163                 return (ENOTCONN);
 2164         return (0);
 2165 }
 2166 
      /*
       * Send the contents of the file 'fp' to the socket 'sockfd'.
       *
       * hdr_uio, trl_uio - optional header and trailer data (may be NULL).
       * offset, nbytes   - starting file offset and byte count; nbytes == 0
       *                    means until end of file.
       * sent             - if not NULL, user address that receives the total
       *                    number of bytes sent.
       * flags            - SF_MNOWAIT, SF_SYNC and SF_NODISKIO are examined.
       * kflags           - with SFK_COMPAT, nbytes is taken to include the
       *                    header, whose size is subtracted from it.
       *
       * Returns 0 or an errno value; ERESTART is converted to EINTR.
       *
       * Cleanup labels: 'done' releases the send buffer lock taken with
       * sblock() and falls into 'out', which reports the byte count, drops
       * the object and socket references, frees any unsent mbuf chain and,
       * for SF_SYNC, waits for the outstanding sf_bufs.
       */
 2167 int
 2168 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
 2169     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
 2170     int kflags, struct thread *td)
 2171 {
 2172         struct file *sock_fp;
 2173         struct vnode *vp;
 2174         struct vm_object *obj;
 2175         struct socket *so;
 2176         struct mbuf *m;
 2177         struct sf_buf *sf;
 2178         struct vm_page *pg;
 2179         struct shmfd *shmfd;
 2180         struct sendfile_sync *sfs;
 2181         struct vattr va;
 2182         off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
 2183         int error, bsize, nd, hdrlen, mnw;
 2184         bool inflight_called;
 2185 
 2186         pg = NULL;
 2187         obj = NULL;
 2188         so = NULL;
 2189         m = NULL;
 2190         sfs = NULL;
 2191         fsbytes = sbytes = 0;
 2192         hdrlen = mnw = 0;
 2193         rem = nbytes;
 2194         obj_size = 0;
 2195         inflight_called = false;        /* not referenced again below */
 2196 
              /* On success obj holds an extra reference, dropped at 'out'. */
 2197         error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
 2198         if (error != 0)
 2199                 return (error);
              /*
               * nbytes == 0: start with the whole object size as the estimate
               * of what remains.  This does not subtract 'offset'; rem is
               * recomputed from obj_size - offset inside the inner loop.
               */
 2200         if (rem == 0)
 2201                 rem = obj_size;
 2202 
 2203         error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
 2204         if (error != 0)
 2205                 goto out;
 2206 
 2207         /*
 2208          * Do not wait on memory allocations but fail with EAGAIN for
 2209          * the caller to retry later.
 2210          * XXX: Experimental.
 2211          */
 2212         if (flags & SF_MNOWAIT)
 2213                 mnw = 1;
 2214 
              /*
               * SF_SYNC: set up the counter that sf_buf_mext() decrements; it
               * is waited on and torn down at the end of this function.
               */
 2215         if (flags & SF_SYNC) {
 2216                 sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
 2217                 mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
 2218                 cv_init(&sfs->cv, "sendfile");
 2219         }
 2220 
 2221 #ifdef MAC
 2222         error = mac_socket_check_send(td->td_ucred, so);
 2223         if (error != 0)
 2224                 goto out;
 2225 #endif
 2226 
 2227         /* If headers are specified copy them into mbufs. */
 2228         if (hdr_uio != NULL) {
 2229                 hdr_uio->uio_td = td;
 2230                 hdr_uio->uio_rw = UIO_WRITE;
 2231                 if (hdr_uio->uio_resid > 0) {
 2232                         /*
 2233                          * In FBSD < 5.0 the nbytes to send also included
 2234                          * the header.  If compat is specified subtract the
 2235                          * header size from nbytes.
 2236                          */
 2237                         if (kflags & SFK_COMPAT) {
 2238                                 if (nbytes > hdr_uio->uio_resid)
 2239                                         nbytes -= hdr_uio->uio_resid;
 2240                                 else
 2241                                         nbytes = 0;
 2242                         }
 2243                         m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
 2244                             0, 0, 0);
 2245                         if (m == NULL) {
 2246                                 error = mnw ? EAGAIN : ENOBUFS;
 2247                                 goto out;
 2248                         }
 2249                         hdrlen = m_length(m, NULL);
 2250                 }
 2251         }
 2252 
 2253         /*
 2254          * Protect against multiple writers to the socket.
 2255          *
 2256          * XXXRW: Historically this has assumed non-interruptibility, so now
 2257          * we implement that, but possibly shouldn't.
 2258          */
 2259         (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
 2260 
 2261         /*
 2262          * Loop through the pages of the file, starting with the requested
 2263          * offset. Get a file page (do I/O if necessary), map the file page
 2264          * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 2265          * it on the socket.
 2266          * This is done in two loops.  The inner loop turns as many pages
 2267          * as it can, up to available socket buffer space, without blocking
 2268          * into mbufs to have it bulk delivered into the socket send buffer.
 2269          * The outer loop checks the state and available space of the socket
 2270          * and takes care of the overall progress.
 2271          */
 2272         for (off = offset; ; ) {
 2273                 struct mbuf *mtail;
 2274                 int loopbytes;
 2275                 int space;
 2276                 int done;
 2277 
                      /* Stop once the requested (or whole-object) count is sent. */
 2278                 if ((nbytes != 0 && nbytes == fsbytes) ||
 2279                     (nbytes == 0 && obj_size == fsbytes))
 2280                         break;
 2281 
 2282                 mtail = NULL;
 2283                 loopbytes = 0;
 2284                 space = 0;
 2285                 done = 0;
 2286 
 2287                 /*
 2288                  * Check the socket state for ongoing connection,
 2289                  * no errors and space in socket buffer.
 2290                  * If space is low allow for the remainder of the
 2291                  * file to be processed if it fits the socket buffer.
 2292                  * Otherwise block in waiting for sufficient space
 2293                  * to proceed, or if the socket is nonblocking, return
 2294                  * to userland with EAGAIN while reporting how far
 2295                  * we've come.
 2296                  * We wait until the socket buffer has significant free
 2297                  * space to do bulk sends.  This makes good use of file
 2298                  * system read ahead and allows packet segmentation
 2299                  * offloading hardware to take over lots of work.  If
 2300                  * we were not careful here we would send off only one
 2301                  * sfbuf at a time.
 2302                  */
 2303                 SOCKBUF_LOCK(&so->so_snd);
                      /* Raise the low-water mark to at least half the high one. */
 2304                 if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
 2305                         so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
 2306 retry_space:
 2307                 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 2308                         error = EPIPE;
 2309                         SOCKBUF_UNLOCK(&so->so_snd);
 2310                         goto done;
 2311                 } else if (so->so_error) {
                              /* Report the pending socket error and clear it. */
 2312                         error = so->so_error;
 2313                         so->so_error = 0;
 2314                         SOCKBUF_UNLOCK(&so->so_snd);
 2315                         goto done;
 2316                 }
 2317                 space = sbspace(&so->so_snd);
 2318                 if (space < rem &&
 2319                     (space <= 0 ||
 2320                      space < so->so_snd.sb_lowat)) {
 2321                         if (so->so_state & SS_NBIO) {
 2322                                 SOCKBUF_UNLOCK(&so->so_snd);
 2323                                 error = EAGAIN;
 2324                                 goto done;
 2325                         }
 2326                         /*
 2327                          * sbwait drops the lock while sleeping.
 2328                          * When we loop back to retry_space the
 2329                          * state may have changed and we retest
 2330                          * for it.
 2331                          */
 2332                         error = sbwait(&so->so_snd);
 2333                         /*
 2334                          * An error from sbwait usually indicates that we've
 2335                          * been interrupted by a signal. If we've sent anything
 2336                          * then return bytes sent, otherwise return the error.
 2337                          */
 2338                         if (error != 0) {
 2339                                 SOCKBUF_UNLOCK(&so->so_snd);
 2340                                 goto done;
 2341                         }
 2342                         goto retry_space;
 2343                 }
 2344                 SOCKBUF_UNLOCK(&so->so_snd);
 2345 
 2346                 /*
 2347                  * Reduce space in the socket buffer by the size of
 2348                  * the header mbuf chain.
 2349                  * hdrlen is set to 0 after the first loop.
 2350                  */
 2351                 space -= hdrlen;
 2352 
 2353                 if (vp != NULL) {
                              /*
                               * Re-read the file size under the vnode lock on
                               * every outer iteration.
                               * NOTE(review): the off >= va.va_size exit goes
                               * to 'done', which skips the trailer writev and
                               * lets 'out' free an unsent header chain; this
                               * looks at odds with the do_sendfile() comment
                               * that header/trailer are still sent -- confirm.
                               */
 2354                         error = vn_lock(vp, LK_SHARED);
 2355                         if (error != 0)
 2356                                 goto done;
 2357                         error = VOP_GETATTR(vp, &va, td->td_ucred);
 2358                         if (error != 0 || off >= va.va_size) {
 2359                                 VOP_UNLOCK(vp, 0);
 2360                                 goto done;
 2361                         }
 2362                         obj_size = va.va_size;
 2363                 }
 2364 
 2365                 /*
 2366                  * Loop and construct maximum sized mbuf chain to be bulk
 2367                  * dumped into socket buffer.
 2368                  */
 2369                 while (space > loopbytes) {
 2370                         vm_offset_t pgoff;
 2371                         struct mbuf *m0;
 2372 
 2373                         /*
 2374                          * Calculate the amount to transfer.
 2375                          * Not to exceed a page, the EOF,
 2376                          * or the passed in nbytes.
 2377                          */
 2378                         pgoff = (vm_offset_t)(off & PAGE_MASK);
 2379                         rem = obj_size - offset;
 2380                         if (nbytes != 0)
 2381                                 rem = omin(rem, nbytes);
 2382                         rem -= fsbytes + loopbytes;
 2383                         xfsize = omin(PAGE_SIZE - pgoff, rem);
 2384                         xfsize = omin(space - loopbytes, xfsize);
 2385                         if (xfsize <= 0) {
 2386                                 done = 1;               /* all data sent */
 2387                                 break;
 2388                         }
 2389 
 2390                         /*
 2391                          * Attempt to look up the page.  Allocate
 2392                          * if not found or wait and loop if busy.
 2393                          */
 2394                         if (m != NULL)
 2395                                 nd = EAGAIN; /* send what we already got */
 2396                         else if ((flags & SF_NODISKIO) != 0)
 2397                                 nd = EBUSY;
 2398                         else
 2399                                 nd = 0;
 2400                         error = sendfile_readpage(obj, vp, nd, off,
 2401                             xfsize, bsize, td, &pg);
 2402                         if (error != 0) {
 2403                                 if (error == EAGAIN)
 2404                                         error = 0;      /* not a real error */
 2405                                 break;
 2406                         }
 2407 
 2408                         /*
 2409                          * Get a sendfile buf.  When allocating the
 2410                          * first buffer for mbuf chain, we usually
 2411                          * wait as long as necessary, but this wait
 2412                          * can be interrupted.  For subsequent
 2413                          * buffers, do not sleep, since several
 2414                          * threads might exhaust the buffers and then
 2415                          * deadlock.
 2416                          */
 2417                         sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
 2418                             SFB_CATCH);
 2419                         if (sf == NULL) {
                                      /* Undo the wiring done by sendfile_readpage(). */
 2420                                 SFSTAT_INC(sf_allocfail);
 2421                                 vm_page_lock(pg);
 2422                                 vm_page_unwire(pg, 0);
 2423                                 KASSERT(pg->object != NULL,
 2424                                     ("%s: object disappeared", __func__));
 2425                                 vm_page_unlock(pg);
                                      /* With a chain pending, just send it. */
 2426                                 if (m == NULL)
 2427                                         error = (mnw ? EAGAIN : EINTR);
 2428                                 break;
 2429                         }
 2430 
 2431                         /*
 2432                          * Get an mbuf and set it up as having
 2433                          * external storage.
 2434                          */
 2435                         m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
 2436                         if (m0 == NULL) {
 2437                                 error = (mnw ? EAGAIN : ENOBUFS);
                                      /* Releases the sf_buf and unwires its page. */
 2438                                 (void)sf_buf_mext(NULL, NULL, sf);
 2439                                 break;
 2440                         }
 2441                         if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
 2442                             sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
 2443                             (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
 2444                                 error = (mnw ? EAGAIN : ENOBUFS);
 2445                                 (void)sf_buf_mext(NULL, NULL, sf);
 2446                                 m_freem(m0);
 2447                                 break;
 2448                         }
                              /* Expose only the xfsize bytes at pgoff in the page. */
 2449                         m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
 2450                         m0->m_len = xfsize;
 2451 
 2452                         /* Append to mbuf chain. */
 2453                         if (mtail != NULL)
 2454                                 mtail->m_next = m0;
 2455                         else if (m != NULL)
 2456                                 m_last(m)->m_next = m0;
 2457                         else
 2458                                 m = m0;
 2459                         mtail = m0;
 2460 
 2461                         /* Keep track of bits processed. */
 2462                         loopbytes += xfsize;
 2463                         off += xfsize;
 2464 
                              /* One more sf_buf for sf_buf_mext() to account for. */
 2465                         if (sfs != NULL) {
 2466                                 mtx_lock(&sfs->mtx);
 2467                                 sfs->count++;
 2468                                 mtx_unlock(&sfs->mtx);
 2469                         }
 2470                 }
 2471 
 2472                 if (vp != NULL)
 2473                         VOP_UNLOCK(vp, 0);
 2474 
 2475                 /* Add the buffer chain to the socket buffer. */
 2476                 if (m != NULL) {
 2477                         int mlen, err;
 2478 
 2479                         mlen = m_length(m, NULL);
 2480                         SOCKBUF_LOCK(&so->so_snd);
 2481                         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 2482                                 error = EPIPE;
 2483                                 SOCKBUF_UNLOCK(&so->so_snd);
                                      /* m is still ours here; 'out' frees it. */
 2484                                 goto done;
 2485                         }
 2486                         SOCKBUF_UNLOCK(&so->so_snd);
 2487                         CURVNET_SET(so->so_vnet);
 2488                         /* Avoid error aliasing. */
 2489                         err = (*so->so_proto->pr_usrreqs->pru_send)
 2490                                     (so, 0, m, NULL, NULL, td);
 2491                         CURVNET_RESTORE();
 2492                         if (err == 0) {
 2493                                 /*
 2494                                  * We need two counters to get the
 2495                                  * file offset and nbytes to send
 2496                                  * right:
 2497                                  * - sbytes contains the total amount
 2498                                  *   of bytes sent, including headers.
 2499                                  * - fsbytes contains the total amount
 2500                                  *   of bytes sent from the file.
 2501                                  */
 2502                                 sbytes += mlen;
 2503                                 fsbytes += mlen;
 2504                                 if (hdrlen) {
 2505                                         fsbytes -= hdrlen;
 2506                                         hdrlen = 0;
 2507                                 }
 2508                         } else if (error == 0)
 2509                                 error = err;
 2510                         m = NULL;       /* pru_send always consumes */
 2511                 }
 2512 
 2513                 /* Quit outer loop on error or when we're done. */
 2514                 if (done)
 2515                         break;
 2516                 if (error != 0)
 2517                         goto done;
 2518         }
 2519 
 2520         /*
 2521          * Send trailers. Wimp out and use writev(2).
               * The send buffer lock is released before the call, so this path
               * jumps to 'out' rather than 'done'.
 2522          */
 2523         if (trl_uio != NULL) {
 2524                 sbunlock(&so->so_snd);
 2525                 error = kern_writev(td, sockfd, trl_uio);
 2526                 if (error == 0)
 2527                         sbytes += td->td_retval[0];
 2528                 goto out;
 2529         }
 2530 
 2531 done:
 2532         sbunlock(&so->so_snd);
 2533 out:
 2534         /*
 2535          * If there was no error we have to clear td->td_retval[0]
 2536          * because it may have been set by writev.
 2537          */
 2538         if (error == 0) {
 2539                 td->td_retval[0] = 0;
 2540         }
              /* The byte count is reported on error too; copyout() is unchecked. */
 2541         if (sent != NULL) {
 2542                 copyout(&sbytes, sent, sizeof(off_t));
 2543         }
 2544         if (obj != NULL)
 2545                 vm_object_deallocate(obj);
 2546         if (so)
 2547                 fdrop(sock_fp, td);
 2548         if (m)
 2549                 m_freem(m);
 2550 
              /*
               * SF_SYNC: sleep until sf_buf_mext() has brought the count of
               * outstanding sf_bufs down to zero, then tear the state down.
               */
 2551         if (sfs != NULL) {
 2552                 mtx_lock(&sfs->mtx);
 2553                 if (sfs->count != 0)
 2554                         cv_wait(&sfs->cv, &sfs->mtx);
 2555                 KASSERT(sfs->count == 0, ("sendfile sync still busy"));
 2556                 cv_destroy(&sfs->cv);
 2557                 mtx_destroy(&sfs->mtx);
 2558                 free(sfs, M_TEMP);
 2559         }
 2560 
 2561         if (error == ERESTART)
 2562                 error = EINTR;
 2563 
 2564         return (error);
 2565 }

Cache object: d9be4be3c222c4c9f8f17759b297e665


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.