FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_sendfile.c


    1 /*-
    2  * Copyright (c) 2013-2015 Gleb Smirnoff <glebius@FreeBSD.org>
    3  * Copyright (c) 1998, David Greenman. All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  * 4. Neither the name of the University nor the names of its contributors
   14  *    may be used to endorse or promote products derived from this software
   15  *    without specific prior written permission.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  */
   29 
   30 #include <sys/cdefs.h>
   31 __FBSDID("$FreeBSD: releng/11.1/sys/kern/kern_sendfile.c 319985 2017-06-15 18:49:46Z alc $");
   32 
   33 #include "opt_compat.h"
   34 
   35 #include <sys/param.h>
   36 #include <sys/systm.h>
   37 #include <sys/capsicum.h>
   38 #include <sys/kernel.h>
   39 #include <sys/lock.h>
   40 #include <sys/mutex.h>
   41 #include <sys/sysproto.h>
   42 #include <sys/malloc.h>
   43 #include <sys/proc.h>
   44 #include <sys/mman.h>
   45 #include <sys/mount.h>
   46 #include <sys/mbuf.h>
   47 #include <sys/protosw.h>
   48 #include <sys/rwlock.h>
   49 #include <sys/sf_buf.h>
   50 #include <sys/socket.h>
   51 #include <sys/socketvar.h>
   52 #include <sys/syscallsubr.h>
   53 #include <sys/sysctl.h>
   54 #include <sys/vnode.h>
   55 
   56 #include <net/vnet.h>
   57 
   58 #include <security/audit/audit.h>
   59 #include <security/mac/mac_framework.h>
   60 
   61 #include <vm/vm.h>
   62 #include <vm/vm_object.h>
   63 #include <vm/vm_pager.h>
   64 
   65 /*
   66  * Structure describing a single sendfile(2) I/O, which may consist of
   67  * several underlying pager I/Os.
   68  *
   69  * The syscall context allocates the structure and initializes 'nios'
   70  * to 1.  As sendfile_swapin() runs through pages and starts asynchronous
   71  * paging operations, it increments 'nios'.
   72  *
   73  * Every I/O completion calls sendfile_iodone(), which decrements 'nios';
   74  * the syscall itself also calls sendfile_iodone() after allocating all
   75  * mbufs, linking them and sending them to the socket.  Whoever brings
   76  * 'nios' to zero is responsible for calling pru_ready on the socket, to
   77  * notify it of the readiness of the data.
   78  */
   79 struct sf_io {
   80         volatile u_int  nios;           /* in-flight pager I/Os + syscall's ref */
   81         u_int           error;          /* first error from an I/O, if any */
   82         int             npages;         /* number of pages in pa[] */
   83         struct file     *sock_fp;       /* socket's file, held for async completion */
   84         struct mbuf     *m;             /* head of the mbuf chain for this I/O */
   85         vm_page_t       pa[];           /* the pages backing this sendfile(2) call */
   86 };
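/*
 * A minimal sketch (not part of the original code) of the 'nios'
 * protocol described above, using the same functions the code below
 * uses:
 *
 *      refcount_init(&sfio->nios, 1);          // syscall's own reference
 *      ...
 *      refcount_acquire(&sfio->nios);          // once per async pager I/O
 *      ...
 *      sendfile_iodone(sfio, NULL, 0, 0);      // syscall drops its reference;
 *                                              // last dropper calls pru_ready
 */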
   87 
   88 /*
   89  * Structure used to track requests with SF_SYNC flag.
   90  */
   91 struct sendfile_sync {
   92         struct mtx      mtx;            /* protects 'count' */
   93         struct cv       cv;             /* signaled when 'count' reaches zero */
   94         unsigned        count;          /* pages not yet freed by sf_ext_free*() */
   95 };
   96 
   97 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
   98 
   99 static void
  100 sfstat_init(const void *unused)
  101 {
  102 
  103         COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
  104             M_WAITOK);
  105 }
  106 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
  107 
  108 static int
  109 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
  110 {
  111         struct sfstat s;
  112 
  113         COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
  114         if (req->newptr)
  115                 COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
  116         return (SYSCTL_OUT(req, &s, sizeof(s)));
  117 }
  118 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
  119     NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
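/*
 * A minimal userland sketch (not part of the original code) showing how
 * the opaque counters exported above can be read, assuming struct sfstat
 * is available from <sys/sf_buf.h> as it is to this file:
 *
 *      #include <sys/types.h>
 *      #include <sys/sysctl.h>
 *      #include <sys/sf_buf.h>
 *      #include <stdio.h>
 *
 *      struct sfstat s;
 *      size_t len = sizeof(s);
 *
 *      if (sysctlbyname("kern.ipc.sfstat", &s, &len, NULL, 0) == 0)
 *              printf("%ju sendfile(2) syscalls\n", (uintmax_t)s.sf_syscalls);
 */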
  120 
  121 /*
  122  * Detach a mapped page and release its resources back to the system.
  123  * Called by the mbuf(9) code when the last reference to a page is freed.
  124  */
  125 void
  126 sf_ext_free(void *arg1, void *arg2)
  127 {
  128         struct sf_buf *sf = arg1;
  129         struct sendfile_sync *sfs = arg2;
  130         vm_page_t pg = sf_buf_page(sf);
  131 
  132         sf_buf_free(sf);
  133 
  134         vm_page_lock(pg);
  135         /*
  136          * Check for the object going away on us. This can
  137          * happen since we don't hold a reference to it.
  138          * If so, we're responsible for freeing the page.
  139          */
  140         if (vm_page_unwire(pg, PQ_INACTIVE) && pg->object == NULL)
  141                 vm_page_free(pg);
  142         vm_page_unlock(pg);
  143 
  144         if (sfs != NULL) {
  145                 mtx_lock(&sfs->mtx);
  146                 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
  147                 if (--sfs->count == 0)
  148                         cv_signal(&sfs->cv);
  149                 mtx_unlock(&sfs->mtx);
  150         }
  151 }
  152 
  153 /*
  154  * Same as above, but forces the page to be detached from the object
  155  * and put into the free pool.
  156  */
  157 void
  158 sf_ext_free_nocache(void *arg1, void *arg2)
  159 {
  160         struct sf_buf *sf = arg1;
  161         struct sendfile_sync *sfs = arg2;
  162         vm_page_t pg = sf_buf_page(sf);
  163 
  164         sf_buf_free(sf);
  165 
  166         vm_page_lock(pg);
  167         if (vm_page_unwire(pg, PQ_NONE)) {
  168                 vm_object_t obj;
  169 
  170                 /* Try to free the page, but only if it is cheap to. */
  171                 if ((obj = pg->object) == NULL)
  172                         vm_page_free(pg);
  173                 else if (!vm_page_xbusied(pg) && VM_OBJECT_TRYWLOCK(obj)) {
  174                         vm_page_free(pg);
  175                         VM_OBJECT_WUNLOCK(obj);
  176                 } else
  177                         vm_page_deactivate(pg);
  178         }
  179         vm_page_unlock(pg);
  180 
  181         if (sfs != NULL) {
  182                 mtx_lock(&sfs->mtx);
  183                 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
  184                 if (--sfs->count == 0)
  185                         cv_signal(&sfs->cv);
  186                 mtx_unlock(&sfs->mtx);
  187         }
  188 }
  189 
  190 /*
  191  * Helper function to calculate how much data to put into page i of n.
  192  * Only the first and last pages are special.
  193  */
  194 static inline off_t
  195 xfsize(int i, int n, off_t off, off_t len)
  196 {
  197 
  198         if (i == 0)
  199                 return (omin(PAGE_SIZE - (off & PAGE_MASK), len));
  200 
  201         if (i == n - 1 && ((off + len) & PAGE_MASK) > 0)
  202                 return ((off + len) & PAGE_MASK);
  203 
  204         return (PAGE_SIZE);
  205 }
  206 
  207 /*
  208  * Helper function to get the offset within the object for page i.
  209  */
  210 static inline vm_ooffset_t
  211 vmoff(int i, off_t off)
  212 {
  213 
  214         if (i == 0)
  215                 return ((vm_ooffset_t)off);
  216 
  217         return (trunc_page(off + i * PAGE_SIZE));
  218 }
  219 
  220 /*
  221  * Helper function used when allocation of a page or sf_buf failed.
  222  * Pretend that we didn't have enough space: subtract xfsize() of all
  223  * pages that failed.
  224  */
  225 static inline void
  226 fixspace(int old, int new, off_t off, int *space)
  227 {
  228 
  229         KASSERT(old > new, ("%s: old %d new %d", __func__, old, new));
  230 
  231         /* Subtract last one. */
  232         *space -= xfsize(old - 1, old, off, *space);
  233         old--;
  234 
  235         if (new == old)
  236                 /* There was only one page. */
  237                 return;
  238 
  239         /* Subtract first one. */
  240         if (new == 0) {
  241                 *space -= xfsize(0, old, off, *space);
  242                 new++;
  243         }
  244 
  245         /* Rest of pages are full sized. */
  246         *space -= (old - new) * PAGE_SIZE;
  247 
  248         KASSERT(*space >= 0, ("%s: space went backwards", __func__));
  249 }
  250 
  251 /*
  252  * I/O completion callback.
  253  */
  254 static void
  255 sendfile_iodone(void *arg, vm_page_t *pg, int count, int error)
  256 {
  257         struct sf_io *sfio = arg;
  258         struct socket *so;
  259 
  260         for (int i = 0; i < count; i++)
  261                 vm_page_xunbusy(pg[i]);
  262 
  263         if (error)
  264                 sfio->error = error;
  265 
  266         if (!refcount_release(&sfio->nios))
  267                 return;
  268 
  269         so = sfio->sock_fp->f_data;
  270 
  271         if (sfio->error) {
  272                 struct mbuf *m;
  273 
  274                 /*
  275                  * The I/O operation failed.  The state of data in the
  276                  * socket is now inconsistent, and all we can do is tear
  277                  * it down.  The protocol abort method will tear down
  278                  * protocol state, free all ready mbufs and detach the
  279                  * not-ready ones; we free the mbufs for this I/O manually.
  280                  *
  281                  * The socket is marked with EIO and made available for
  282                  * reading, so that the application receives EIO on the
  283                  * next syscall and eventually closes the socket.
  284                  */
  285                 so->so_proto->pr_usrreqs->pru_abort(so);
  286                 so->so_error = EIO;
  287 
  288                 m = sfio->m;
  289                 for (int i = 0; i < sfio->npages; i++)
  290                         m = m_free(m);
  291         } else {
  292                 CURVNET_SET(so->so_vnet);
  293                 (void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
  294                     sfio->npages);
  295                 CURVNET_RESTORE();
  296         }
  297 
  298         /* XXXGL: curthread */
  299         fdrop(sfio->sock_fp, curthread);
  300         free(sfio, M_TEMP);
  301 }
  302 
  303 /*
  304  * Iterate through the pages vector and request pagein for invalid pages.
  305  */
  306 static int
  307 sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len,
  308     int npages, int rhpages, int flags)
  309 {
  310         vm_page_t *pa = sfio->pa;
  311         int nios;
  312 
  313         nios = 0;
  314         flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0;
  315 
  316         /*
  317          * First grab all the pages and wire them.  Note that we grab
  318          * only the required pages; readahead pages are dealt with later.
  319          */
  320         VM_OBJECT_WLOCK(obj);
  321         for (int i = 0; i < npages; i++) {
  322                 pa[i] = vm_page_grab(obj, OFF_TO_IDX(vmoff(i, off)),
  323                     VM_ALLOC_WIRED | VM_ALLOC_NORMAL | flags);
  324                 if (pa[i] == NULL) {
  325                         npages = i;
  326                         rhpages = 0;
  327                         break;
  328                 }
  329         }
  330 
  331         for (int i = 0; i < npages;) {
  332                 int j, a, count, rv;
  333 
  334                 /* Skip valid pages. */
  335                 if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
  336                     xfsize(i, npages, off, len))) {
  337                         vm_page_xunbusy(pa[i]);
  338                         SFSTAT_INC(sf_pages_valid);
  339                         i++;
  340                         continue;
  341                 }
  342 
  343                 /*
  344                  * Now 'i' points at the first invalid page; iterate
  345                  * further to make 'j' point at the first valid page
  346                  * after a run of invalid ones.
  347                  */
  348                 for (j = i + 1; j < npages; j++)
  349                         if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
  350                             xfsize(j, npages, off, len))) {
  351                                 SFSTAT_INC(sf_pages_valid);
  352                                 break;
  353                         }
  354 
  355                 /*
  356                  * Now we have a region of invalid pages between 'i' and
  357                  * 'j'.  Check that they belong to the pager.  They may
  358                  * not be there, which is regular for the shmem pager; for
  359                  * the vnode pager it happens only with sparse files.
  360                  *
  361                  * An important feature of vm_pager_has_page() is the hint
  362                  * stored in 'a': how many pages we can page in after this
  363                  * page in a single I/O.
  364                  */
  365                 while (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)),
  366                     NULL, &a) && i < j) {
  367                         pmap_zero_page(pa[i]);
  368                         pa[i]->valid = VM_PAGE_BITS_ALL;
  369                         pa[i]->dirty = 0;
  370                         vm_page_xunbusy(pa[i]);
  371                         i++;
  372                 }
  373                 if (i == j)
  374                         continue;
  375 
  376                 /*
  377                  * We want to page in as many pages as possible, limited
  378                  * only by the 'a' hint and the actual request.
  379                  *
  380                  * We should not page into an already valid page, so if
  381                  * 'j' didn't reach the last page, trim the run at that page.
  382                  *
  383                  * When the pagein fulfills the request, also ask for readahead.
  384                  */
  385                 if (j < npages)
  386                         a = min(a, j - i - 1);
  387                 count = min(a + 1, npages - i);
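                /*
                 * For example, with i == 2, j == 5, npages == 8 and a
                 * pager hint of a == 4, the hint is trimmed to
                 * a = min(4, 5 - 2 - 1) = 2, so count = min(3, 6) = 3
                 * and pages 2..4 are paged in with a single request.
                 */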
  388 
  389                 refcount_acquire(&sfio->nios);
  390                 rv = vm_pager_get_pages_async(obj, pa + i, count, NULL,
  391                     i + count == npages ? &rhpages : NULL,
  392                     &sendfile_iodone, sfio);
  393                 KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p",
  394                     __func__, obj, pa[i]));
  395 
  396                 SFSTAT_INC(sf_iocnt);
  397                 SFSTAT_ADD(sf_pages_read, count);
  398                 if (i + count == npages)
  399                         SFSTAT_ADD(sf_rhpages_read, rhpages);
  400 
  401 #ifdef INVARIANTS
  402                 for (j = i; j < i + count && j < npages; j++)
  403                         KASSERT(pa[j] == vm_page_lookup(obj,
  404                             OFF_TO_IDX(vmoff(j, off))),
  405                             ("pa[j] %p lookup %p\n", pa[j],
  406                             vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off)))));
  407 #endif
  408                 i += count;
  409                 nios++;
  410         }
  411 
  412         VM_OBJECT_WUNLOCK(obj);
  413 
  414         if (nios == 0 && npages != 0)
  415                 SFSTAT_INC(sf_noiocnt);
  416 
  417         return (nios);
  418 }
  419 
  420 static int
  421 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
  422     struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
  423     int *bsize)
  424 {
  425         struct vattr va;
  426         vm_object_t obj;
  427         struct vnode *vp;
  428         struct shmfd *shmfd;
  429         int error;
  430 
  431         vp = *vp_res = NULL;
  432         obj = NULL;
  433         shmfd = *shmfd_res = NULL;
  434         *bsize = 0;
  435 
  436         /*
  437          * The file descriptor must refer to a regular file that has a
  438          * backing VM object.
  439          */
  440         if (fp->f_type == DTYPE_VNODE) {
  441                 vp = fp->f_vnode;
  442                 vn_lock(vp, LK_SHARED | LK_RETRY);
  443                 if (vp->v_type != VREG) {
  444                         error = EINVAL;
  445                         goto out;
  446                 }
  447                 *bsize = vp->v_mount->mnt_stat.f_iosize;
  448                 error = VOP_GETATTR(vp, &va, td->td_ucred);
  449                 if (error != 0)
  450                         goto out;
  451                 *obj_size = va.va_size;
  452                 obj = vp->v_object;
  453                 if (obj == NULL) {
  454                         error = EINVAL;
  455                         goto out;
  456                 }
  457         } else if (fp->f_type == DTYPE_SHM) {
  458                 error = 0;
  459                 shmfd = fp->f_data;
  460                 obj = shmfd->shm_object;
  461                 *obj_size = shmfd->shm_size;
  462         } else {
  463                 error = EINVAL;
  464                 goto out;
  465         }
  466 
  467         VM_OBJECT_WLOCK(obj);
  468         if ((obj->flags & OBJ_DEAD) != 0) {
  469                 VM_OBJECT_WUNLOCK(obj);
  470                 error = EBADF;
  471                 goto out;
  472         }
  473 
  474         /*
  475          * Temporarily increase the backing VM object's reference
  476          * count so that a forced reclamation of its vnode does not
  477          * immediately destroy it.
  478          */
  479         vm_object_reference_locked(obj);
  480         VM_OBJECT_WUNLOCK(obj);
  481         *obj_res = obj;
  482         *vp_res = vp;
  483         *shmfd_res = shmfd;
  484 
  485 out:
  486         if (vp != NULL)
  487                 VOP_UNLOCK(vp, 0);
  488         return (error);
  489 }
  490 
  491 static int
  492 sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
  493     struct socket **so)
  494 {
  495         cap_rights_t rights;
  496         int error;
  497 
  498         *sock_fp = NULL;
  499         *so = NULL;
  500 
  501         /*
  502          * The socket must be a stream socket and connected.
  503          */
  504         error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND),
  505             sock_fp, NULL, NULL);
  506         if (error != 0)
  507                 return (error);
  508         *so = (*sock_fp)->f_data;
  509         if ((*so)->so_type != SOCK_STREAM)
  510                 return (EINVAL);
  511         if (((*so)->so_state & SS_ISCONNECTED) == 0)
  512                 return (ENOTCONN);
  513         return (0);
  514 }
  515 
  516 int
  517 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
  518     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
  519     struct thread *td)
  520 {
  521         struct file *sock_fp;
  522         struct vnode *vp;
  523         struct vm_object *obj;
  524         struct socket *so;
  525         struct mbuf *m, *mh, *mhtail;
  526         struct sf_buf *sf;
  527         struct shmfd *shmfd;
  528         struct sendfile_sync *sfs;
  529         struct vattr va;
  530         off_t off, sbytes, rem, obj_size;
  531         int error, softerr, bsize, hdrlen;
  532 
  533         obj = NULL;
  534         so = NULL;
  535         m = mh = NULL;
  536         sfs = NULL;
  537         hdrlen = sbytes = 0;
  538         softerr = 0;
  539 
  540         error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
  541         if (error != 0)
  542                 return (error);
  543 
  544         error = sendfile_getsock(td, sockfd, &sock_fp, &so);
  545         if (error != 0)
  546                 goto out;
  547 
  548 #ifdef MAC
  549         error = mac_socket_check_send(td->td_ucred, so);
  550         if (error != 0)
  551                 goto out;
  552 #endif
  553 
  554         SFSTAT_INC(sf_syscalls);
  555         SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags));
  556 
  557         if (flags & SF_SYNC) {
  558                 sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
  559                 mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
  560                 cv_init(&sfs->cv, "sendfile");
  561         }
  562 
  563         rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset;
  564 
  565         /*
  566          * Protect against multiple writers to the socket.
  567          *
  568          * XXXRW: Historically this has assumed non-interruptibility, so now
  569          * we implement that, but possibly shouldn't.
  570          */
  571         (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
  572 
  573         /*
  574          * Loop through the pages of the file, starting at the requested
  575          * offset.  Get a file page (doing I/O if necessary), map the
  576          * file page into an sf_buf, attach an mbuf header to the sf_buf,
  577          * and queue it on the socket.
  578          * This is done in two loops.  The inner loop turns as many pages
  579          * as it can into mbufs, up to the available socket buffer space,
  580          * without blocking, so they can be bulk-delivered into the socket
  581          * send buffer.  The outer loop checks the state and available
  582          * space of the socket and takes care of the overall progress.
  583          */
  584         for (off = offset; rem > 0; ) {
  585                 struct sf_io *sfio;
  586                 vm_page_t *pa;
  587                 struct mbuf *mtail;
  588                 int nios, space, npages, rhpages;
  589 
  590                 mtail = NULL;
  591                 /*
  592                  * Check the socket state for an ongoing connection,
  593                  * no errors, and space in the socket buffer.
  594                  * If space is low, allow the remainder of the file to
  595                  * be processed if it fits in the socket buffer.
  596                  * Otherwise block waiting for sufficient space to
  597                  * proceed, or, if the socket is nonblocking, return
  598                  * to userland with EAGAIN while reporting how far
  599                  * we've come.
  600                  * We wait until the socket buffer has significant free
  601                  * space to do bulk sends.  This makes good use of file
  602                  * system read-ahead and allows packet segmentation
  603                  * offloading hardware to take over lots of work.  If
  604                  * we were not careful here we would send off only one
  605                  * sf_buf at a time.
  606                  */
  607                 SOCKBUF_LOCK(&so->so_snd);
  608                 if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
  609                         so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
  610 retry_space:
  611                 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
  612                         error = EPIPE;
  613                         SOCKBUF_UNLOCK(&so->so_snd);
  614                         goto done;
  615                 } else if (so->so_error) {
  616                         error = so->so_error;
  617                         so->so_error = 0;
  618                         SOCKBUF_UNLOCK(&so->so_snd);
  619                         goto done;
  620                 }
  621                 space = sbspace(&so->so_snd);
  622                 if (space < rem &&
  623                     (space <= 0 ||
  624                      space < so->so_snd.sb_lowat)) {
  625                         if (so->so_state & SS_NBIO) {
  626                                 SOCKBUF_UNLOCK(&so->so_snd);
  627                                 error = EAGAIN;
  628                                 goto done;
  629                         }
  630                         /*
  631                          * sbwait drops the lock while sleeping.
  632                          * When we loop back to retry_space the
  633                          * state may have changed and we retest
  634                          * for it.
  635                          */
  636                         error = sbwait(&so->so_snd);
  637                         /*
  638                          * An error from sbwait usually indicates that we've
  639                          * been interrupted by a signal. If we've sent anything
  640                          * then return bytes sent, otherwise return the error.
  641                          */
  642                         if (error != 0) {
  643                                 SOCKBUF_UNLOCK(&so->so_snd);
  644                                 goto done;
  645                         }
  646                         goto retry_space;
  647                 }
  648                 SOCKBUF_UNLOCK(&so->so_snd);
  649 
  650                 /*
  651                  * At the beginning of the first loop check if any headers
  652                  * are specified and copy them into mbufs.  Reduce space in
  653                  * the socket buffer by the size of the header mbuf chain.
  654                  * Clear hdr_uio here and hdrlen at the end of the first loop.
  655                  */
  656                 if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
  657                         hdr_uio->uio_td = td;
  658                         hdr_uio->uio_rw = UIO_WRITE;
  659                         mh = m_uiotombuf(hdr_uio, M_WAITOK, space, 0, 0);
  660                         hdrlen = m_length(mh, &mhtail);
  661                         space -= hdrlen;
  662                         /*
  663                          * If header consumed all the socket buffer space,
  664                          * don't waste CPU cycles and jump to the end.
  665                          */
  666                         if (space == 0) {
  667                                 sfio = NULL;
  668                                 nios = 0;
  669                                 goto prepend_header;
  670                         }
  671                         hdr_uio = NULL;
  672                 }
  673 
  674                 if (vp != NULL) {
  675                         error = vn_lock(vp, LK_SHARED);
  676                         if (error != 0)
  677                                 goto done;
  678                         error = VOP_GETATTR(vp, &va, td->td_ucred);
  679                         if (error != 0 || off >= va.va_size) {
  680                                 VOP_UNLOCK(vp, 0);
  681                                 goto done;
  682                         }
  683                         if (va.va_size != obj_size) {
  684                                 obj_size = va.va_size;
  685                                 rem = nbytes ?
  686                                     omin(nbytes + offset, obj_size) : obj_size;
  687                                 rem -= off;
  688                         }
  689                 }
  690 
  691                 if (space > rem)
  692                         space = rem;
  693 
  694                 npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
  695 
  696                 /*
  697                  * Calculate the maximum allowed number of readahead pages
  698                  * for this iteration.  First, we allow readahead up to "rem".
  699                  * If the application wants more, let it be, but there is no
  700                  * reason to go above MAXPHYS.  Also check against "obj_size",
  701                  * since vm_pager_has_page() can hint beyond EOF.
  702                  */
  703                 rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) - npages;
  704                 rhpages += SF_READAHEAD(flags);
  705                 rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages);
  706                 rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) -
  707                     npages, rhpages);
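                /*
                 * For example, with a page-aligned 64k of space
                 * (npages == 16), rem == 1M and no SF_READAHEAD bits,
                 * rhpages starts at 256 - 16 == 240 and is then clamped
                 * by MAXPHYS (commonly 128k, i.e. 32 pages) and by the
                 * remaining size of the object.
                 */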
  708 
  709                 sfio = malloc(sizeof(struct sf_io) +
  710                     npages * sizeof(vm_page_t), M_TEMP, M_WAITOK);
  711                 refcount_init(&sfio->nios, 1);
  712                 sfio->error = 0;
  713 
  714                 nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages,
  715                     flags);
  716 
  717                 /*
  718                  * Loop and construct maximum sized mbuf chain to be bulk
  719                  * dumped into socket buffer.
  720                  */
  721                 pa = sfio->pa;
  722                 for (int i = 0; i < npages; i++) {
  723                         struct mbuf *m0;
  724 
  725                         /*
  726                          * If a page wasn't grabbed successfully, trim
  727                          * the array.  This can happen only with SF_NODISKIO.
  728                          */
  729                         if (pa[i] == NULL) {
  730                                 SFSTAT_INC(sf_busy);
  731                                 fixspace(npages, i, off, &space);
  732                                 npages = i;
  733                                 softerr = EBUSY;
  734                                 break;
  735                         }
  736 
  737                         /*
  738                          * Get a sendfile buf.  When allocating the
  739                          * first buffer for the mbuf chain, we usually
  740                          * wait as long as necessary, but this wait
  741                          * can be interrupted.  For subsequent
  742                          * buffers, do not sleep, since several
  743                          * threads might exhaust the buffers and then
  744                          * deadlock.
  745                          */
  746                         sf = sf_buf_alloc(pa[i],
  747                             m != NULL ? SFB_NOWAIT : SFB_CATCH);
  748                         if (sf == NULL) {
  749                                 SFSTAT_INC(sf_allocfail);
  750                                 for (int j = i; j < npages; j++) {
  751                                         vm_page_lock(pa[j]);
  752                                         vm_page_unwire(pa[j], PQ_INACTIVE);
  753                                         vm_page_unlock(pa[j]);
  754                                 }
  755                                 if (m == NULL)
  756                                         softerr = ENOBUFS;
  757                                 fixspace(npages, i, off, &space);
  758                                 npages = i;
  759                                 break;
  760                         }
  761 
  762                         m0 = m_get(M_WAITOK, MT_DATA);
  763                         m0->m_ext.ext_buf = (char *)sf_buf_kva(sf);
  764                         m0->m_ext.ext_size = PAGE_SIZE;
  765                         m0->m_ext.ext_arg1 = sf;
  766                         m0->m_ext.ext_arg2 = sfs;
  767                         /*
  768                          * SF_NOCACHE causes the page to be freed upon
  769                          * send.  However, we ignore it for the last page
  770                          * in 'space' if the page is truncated and we have
  771                          * more data to send (rem > space), or if readahead
  772                          * is configured (rhpages > 0).
  773                          */
  774                         if ((flags & SF_NOCACHE) == 0 ||
  775                             (i == npages - 1 &&
  776                             ((off + space) & PAGE_MASK) &&
  777                             (rem > space || rhpages > 0)))
  778                                 m0->m_ext.ext_type = EXT_SFBUF;
  779                         else
  780                                 m0->m_ext.ext_type = EXT_SFBUF_NOCACHE;
  781                         m0->m_ext.ext_flags = EXT_FLAG_EMBREF;
  782                         m0->m_ext.ext_count = 1;
  783                         m0->m_flags |= (M_EXT | M_RDONLY);
  784                         if (nios)
  785                                 m0->m_flags |= M_NOTREADY;
  786                         m0->m_data = (char *)sf_buf_kva(sf) +
  787                             (vmoff(i, off) & PAGE_MASK);
  788                         m0->m_len = xfsize(i, npages, off, space);
  789 
  790                         if (i == 0)
  791                                 sfio->m = m0;
  792 
  793                         /* Append to mbuf chain. */
  794                         if (mtail != NULL)
  795                                 mtail->m_next = m0;
  796                         else
  797                                 m = m0;
  798                         mtail = m0;
  799 
  800                         if (sfs != NULL) {
  801                                 mtx_lock(&sfs->mtx);
  802                                 sfs->count++;
  803                                 mtx_unlock(&sfs->mtx);
  804                         }
  805                 }
  806 
  807                 if (vp != NULL)
  808                         VOP_UNLOCK(vp, 0);
  809 
  810                 /* Keep track of bytes processed. */
  811                 off += space;
  812                 rem -= space;
  813 
  814                 /* Prepend header, if any. */
  815                 if (hdrlen) {
  816 prepend_header:
  817                         mhtail->m_next = m;
  818                         m = mh;
  819                         mh = NULL;
  820                 }
  821 
  822                 if (m == NULL) {
  823                         KASSERT(softerr, ("%s: m NULL, no error", __func__));
  824                         error = softerr;
  825                         free(sfio, M_TEMP);
  826                         goto done;
  827                 }
  828 
  829                 /* Add the buffer chain to the socket buffer. */
  830                 KASSERT(m_length(m, NULL) == space + hdrlen,
  831                     ("%s: mlen %u space %d hdrlen %d",
  832                     __func__, m_length(m, NULL), space, hdrlen));
  833 
  834                 CURVNET_SET(so->so_vnet);
  835                 if (nios == 0) {
  836                         /*
  837                          * If sendfile_swapin() didn't initiate any I/Os,
  838                          * which happens if all data is cached in VM, then
  839                          * we can send data right now without the
  840                          * PRUS_NOTREADY flag.
  841                          */
  842                         free(sfio, M_TEMP);
  843                         error = (*so->so_proto->pr_usrreqs->pru_send)
  844                             (so, 0, m, NULL, NULL, td);
  845                 } else {
  846                         sfio->sock_fp = sock_fp;
  847                         sfio->npages = npages;
  848                         fhold(sock_fp);
  849                         error = (*so->so_proto->pr_usrreqs->pru_send)
  850                             (so, PRUS_NOTREADY, m, NULL, NULL, td);
  851                         sendfile_iodone(sfio, NULL, 0, 0);
  852                 }
  853                 CURVNET_RESTORE();
  854 
  855                 m = NULL;       /* pru_send always consumes */
  856                 if (error)
  857                         goto done;
  858                 sbytes += space + hdrlen;
  859                 if (hdrlen)
  860                         hdrlen = 0;
  861                 if (softerr) {
  862                         error = softerr;
  863                         goto done;
  864                 }
  865         }
  866 
  867         /*
  868          * Send trailers. Wimp out and use writev(2).
  869          */
  870         if (trl_uio != NULL) {
  871                 sbunlock(&so->so_snd);
  872                 error = kern_writev(td, sockfd, trl_uio);
  873                 if (error == 0)
  874                         sbytes += td->td_retval[0];
  875                 goto out;
  876         }
  877 
  878 done:
  879         sbunlock(&so->so_snd);
  880 out:
  881         /*
  882          * If there was no error we have to clear td->td_retval[0]
  883          * because it may have been set by writev.
  884          */
  885         if (error == 0) {
  886                 td->td_retval[0] = 0;
  887         }
  888         if (sent != NULL) {
  889                 (*sent) = sbytes;
  890         }
  891         if (obj != NULL)
  892                 vm_object_deallocate(obj);
  893         if (so)
  894                 fdrop(sock_fp, td);
  895         if (m)
  896                 m_freem(m);
  897         if (mh)
  898                 m_freem(mh);
  899 
  900         if (sfs != NULL) {
  901                 mtx_lock(&sfs->mtx);
  902                 if (sfs->count != 0)
  903                         cv_wait(&sfs->cv, &sfs->mtx);
  904                 KASSERT(sfs->count == 0, ("sendfile sync still busy"));
  905                 cv_destroy(&sfs->cv);
  906                 mtx_destroy(&sfs->mtx);
  907                 free(sfs, M_TEMP);
  908         }
  909 
  910         if (error == ERESTART)
  911                 error = EINTR;
  912 
  913         return (error);
  914 }
  915 
  916 static int
  917 sendfile(struct thread *td, struct sendfile_args *uap, int compat)
  918 {
  919         struct sf_hdtr hdtr;
  920         struct uio *hdr_uio, *trl_uio;
  921         struct file *fp;
  922         cap_rights_t rights;
  923         off_t sbytes;
  924         int error;
  925 
  926         /*
  927          * The file offset must be non-negative.  If it is beyond EOF
  928          * we send only the header/trailer and no payload data.
  929          */
  930         if (uap->offset < 0)
  931                 return (EINVAL);
  932 
  933         hdr_uio = trl_uio = NULL;
  934 
  935         if (uap->hdtr != NULL) {
  936                 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
  937                 if (error != 0)
  938                         goto out;
  939                 if (hdtr.headers != NULL) {
  940                         error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
  941                             &hdr_uio);
  942                         if (error != 0)
  943                                 goto out;
  944 #ifdef COMPAT_FREEBSD4
  945                         /*
  946                          * In FreeBSD < 5.0 the nbytes to send also included
  947                          * the header.  If compat is specified subtract the
  948                          * header size from nbytes.
  949                          */
  950                         if (compat) {
  951                                 if (uap->nbytes > hdr_uio->uio_resid)
  952                                         uap->nbytes -= hdr_uio->uio_resid;
  953                                 else
  954                                         uap->nbytes = 0;
  955                         }
  956 #endif
  957                 }
  958                 if (hdtr.trailers != NULL) {
  959                         error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
  960                             &trl_uio);
  961                         if (error != 0)
  962                                 goto out;
  963                 }
  964         }
  965 
  966         AUDIT_ARG_FD(uap->fd);
  967 
  968         /*
  969          * sendfile(2) can start at any offset within a file so we require
  970          * CAP_READ+CAP_SEEK = CAP_PREAD.
  971          */
  972         if ((error = fget_read(td, uap->fd,
  973             cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
  974                 goto out;
  975         }
  976 
  977         error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
  978             uap->nbytes, &sbytes, uap->flags, td);
  979         fdrop(fp, td);
  980 
  981         if (uap->sbytes != NULL)
  982                 copyout(&sbytes, uap->sbytes, sizeof(off_t));
  983 
  984 out:
  985         free(hdr_uio, M_IOV);
  986         free(trl_uio, M_IOV);
  987         return (error);
  988 }
  989 
  990 /*
  991  * sendfile(2)
  992  * 
  993  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
  994  *       struct sf_hdtr *hdtr, off_t *sbytes, int flags)
  995  * 
  996  * Send a file specified by 'fd' and starting at 'offset' to a socket
  997  * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
  998  * 0.  Optionally add a header and/or trailer to the socket output.  If
  999  * specified, write the total number of bytes sent into *sbytes.
 1000  */
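/*
 * A minimal userland sketch (not part of the original code) of the call
 * documented above; 'filefd' is an open regular file, 'sockfd' a
 * connected stream socket, and error handling is elided:
 *
 *      #include <sys/types.h>
 *      #include <sys/socket.h>
 *      #include <sys/uio.h>
 *      #include <errno.h>
 *
 *      off_t sbytes;
 *      if (sendfile(filefd, sockfd, 0, 0, NULL, &sbytes, 0) == -1 &&
 *          errno == EAGAIN)
 *              ;       // nonblocking socket: sbytes holds the partial count
 */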
 1001 int
 1002 sys_sendfile(struct thread *td, struct sendfile_args *uap)
 1003 {
 1004  
 1005         return (sendfile(td, uap, 0));
 1006 }
 1007 
 1008 #ifdef COMPAT_FREEBSD4
 1009 int
 1010 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
 1011 {
 1012         struct sendfile_args args;
 1013 
 1014         args.fd = uap->fd;
 1015         args.s = uap->s;
 1016         args.offset = uap->offset;
 1017         args.nbytes = uap->nbytes;
 1018         args.hdtr = uap->hdtr;
 1019         args.sbytes = uap->sbytes;
 1020         args.flags = uap->flags;
 1021 
 1022         return (sendfile(td, &args, 1));
 1023 }
 1024 #endif /* COMPAT_FREEBSD4 */

This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.