The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_physio.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: kern_physio.c,v 1.92 2011/02/10 14:46:45 pooka Exp $   */
    2 
    3 /*-
    4  * Copyright (c) 1982, 1986, 1990, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  * (c) UNIX System Laboratories, Inc.
    7  * All or some portions of this file are derived from material licensed
    8  * to the University of California by American Telephone and Telegraph
    9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   10  * the permission of UNIX System Laboratories, Inc.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)kern_physio.c       8.1 (Berkeley) 6/10/93
   37  */
   38 
   39 /*-
   40  * Copyright (c) 1994 Christopher G. Demetriou
   41  *
   42  * Redistribution and use in source and binary forms, with or without
   43  * modification, are permitted provided that the following conditions
   44  * are met:
   45  * 1. Redistributions of source code must retain the above copyright
   46  *    notice, this list of conditions and the following disclaimer.
   47  * 2. Redistributions in binary form must reproduce the above copyright
   48  *    notice, this list of conditions and the following disclaimer in the
   49  *    documentation and/or other materials provided with the distribution.
   50  * 3. All advertising materials mentioning features or use of this software
   51  *    must display the following acknowledgement:
   52  *      This product includes software developed by the University of
   53  *      California, Berkeley and its contributors.
   54  * 4. Neither the name of the University nor the names of its contributors
   55  *    may be used to endorse or promote products derived from this software
   56  *    without specific prior written permission.
   57  *
   58  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   60  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   61  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   62  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   63  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   64  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   68  * SUCH DAMAGE.
   69  *
   70  *      @(#)kern_physio.c       8.1 (Berkeley) 6/10/93
   71  */
   72 
   73 #include <sys/cdefs.h>
   74 __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.92 2011/02/10 14:46:45 pooka Exp $");
   75 
   76 #include <sys/param.h>
   77 #include <sys/systm.h>
   78 #include <sys/buf.h>
   79 #include <sys/proc.h>
   80 #include <sys/once.h>
   81 #include <sys/workqueue.h>
   82 #include <sys/kmem.h>
   83 
   84 #include <uvm/uvm_extern.h>
   85 
   /* Guards the one-time run of physio_init() via RUN_ONCE() in physio(). */
   ONCE_DECL(physio_initialized);

   /* Workqueue created by physio_init(); physio_biodone() enqueues bufs on it. */
   struct workqueue *physio_workqueue;

   /*
    * Concurrency knob for physio(): it derives its per-request limit on
    * running transfers from physio_concurrency - 1.
    */
   int physio_concurrency = 16;

   /*
    * DPRINTF() takes a parenthesized printf argument list.  It expands to a
    * printf call only when PHYSIO_DEBUG is defined, and to nothing otherwise.
    */
   /* #define      PHYSIO_DEBUG */
   #ifdef PHYSIO_DEBUG
   #define DPRINTF(a)      printf a
   #else /* !PHYSIO_DEBUG */
   #define DPRINTF(a)      /* nothing */
   #endif /* PHYSIO_DEBUG */
   97 
   98 struct physio_stat {                /* state shared by all transfers of one physio() call */
   99         int ps_running;             /* transfers counted by physio() and not yet finished by physio_done() */
  100         int ps_error;               /* b_error of the short transfer with the lowest end offset */
  101         int ps_failed;              /* number of transfers that completed short */
  102         off_t ps_endoffset;         /* byte offset where the earliest short transfer stopped; -1 if none */
  103         buf_t *ps_orig_bp;          /* the "obp" argument of physio(); may be NULL */
  104         kmutex_t ps_lock;           /* held while the counters and offsets above are updated */
  105         kcondvar_t ps_cv;           /* signalled by physio_done() after ps_running is decremented */
  106 };
  107 
  108 static void
  109 physio_done(struct work *wk, void *dummy)
  110 {
  111         struct buf *bp = (void *)wk;
  112         size_t todo = bp->b_bufsize;
  113         size_t done = bp->b_bcount - bp->b_resid;
  114         struct physio_stat *ps = bp->b_private;
  115         bool is_iobuf;
  116 
  117         KASSERT(&bp->b_work == wk);
  118         KASSERT(bp->b_bcount <= todo);
  119         KASSERT(bp->b_resid <= bp->b_bcount);
  120         KASSERT((bp->b_flags & B_PHYS) != 0);
  121         KASSERT(dummy == NULL);
  122 
  123         vunmapbuf(bp, todo);
  124         uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);
  125 
  126         mutex_enter(&ps->ps_lock);
  127         is_iobuf = (bp != ps->ps_orig_bp);
  128         if (__predict_false(done != todo)) {
  129                 off_t endoffset = dbtob(bp->b_blkno) + done;
  130 
  131                 /*
  132                  * we got an error or hit EOM.
  133                  *
  134                  * we only care about the first one.
  135                  * ie. the one at the lowest offset.
  136                  */
  137 
  138                 KASSERT(ps->ps_endoffset != endoffset);
  139                 DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
  140                     ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
  141                     __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
  142                     bp->b_blkno, bp->b_bcount, bp->b_flags));
  143 
  144                 if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
  145                         DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
  146                             " -> %" PRIu64 "\n",
  147                             __func__, ps,
  148                             ps->ps_error, bp->b_error,
  149                             ps->ps_endoffset, endoffset));
  150 
  151                         ps->ps_endoffset = endoffset;
  152                         ps->ps_error = bp->b_error;
  153                 }
  154                 ps->ps_failed++;
  155         } else {
  156                 KASSERT(bp->b_error == 0);
  157         }
  158 
  159         ps->ps_running--;
  160         cv_signal(&ps->ps_cv);
  161         mutex_exit(&ps->ps_lock);
  162 
  163         if (is_iobuf)
  164                 putiobuf(bp);
  165 }
  166 
  167 static void
  168 physio_biodone(struct buf *bp)
  169 {
  170 #if defined(DIAGNOSTIC)
  171         struct physio_stat *ps = bp->b_private;
  172         size_t todo = bp->b_bufsize;
  173         size_t done = bp->b_bcount - bp->b_resid;
  174 
  175         KASSERT(ps->ps_running > 0);
  176         KASSERT(bp->b_bcount <= todo);
  177         KASSERT(bp->b_resid <= bp->b_bcount);
  178         if (done == todo)
  179                 KASSERT(bp->b_error == 0);
  180 #endif /* defined(DIAGNOSTIC) */
  181 
  182         workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
  183 }
  184 
  185 static void
  186 physio_wait(struct physio_stat *ps, int n)
  187 {
  188 
  189         KASSERT(mutex_owned(&ps->ps_lock));
  190 
  191         while (ps->ps_running > n)
  192                 cv_wait(&ps->ps_cv, &ps->ps_lock);
  193 }
  194 
  195 static int
  196 physio_init(void)
  197 {
  198         int error;
  199 
  200         KASSERT(physio_workqueue == NULL);
  201 
  202         error = workqueue_create(&physio_workqueue, "physiod",
  203             physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);
  204 
  205         return error;
  206 }
  207 
  208 /*
  209  * Do "physical I/O" on behalf of a user.  "Physical I/O" is I/O directly
  210  * from the raw device to user buffers, and bypasses the buffer cache.
       *
       * Each iovec of "uio" is cut into chunks of at most MAXPHYS bytes,
       * further bounded by "min_phys".  For every chunk the user pages are
       * locked with uvm_vslock(), mapped with vmapbuf() and the buf is handed
       * to "strategy".  Completion is reported through physio_biodone().
       * Before returning, physio() waits until no started chunk is running.
       *
       *      strategy  called with each prepared buf to start its transfer
       *      obp       optional caller-supplied buf; when non-NULL it is
       *                marked busy and reused for every chunk, one at a time
       *      dev       stored in b_dev of every buf
       *      flags     only the B_READ and B_WRITE bits are used
       *      min_phys  called to bound b_bcount of every chunk
       *      uio       describes the user buffers; its iovecs, uio_offset and
       *                uio_resid are advanced as chunks are started
       *
       * Returns 0 or an errno value: the RUN_ONCE() result, ENOMEM, EINVAL
       * when uio_offset does not survive the btodb()/dbtob() round trip, a
       * uvm_vslock() or vmapbuf() error, or otherwise the ps_error recorded
       * for the short transfer with the lowest end offset.
  211  */
  212 int
  213 physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
  214     void (*min_phys)(struct buf *), struct uio *uio)
  215 {
  216         struct iovec *iovp;
  217         struct lwp *l = curlwp;
  218         struct proc *p = l->l_proc;
  219         int i, error;
  220         struct buf *bp = NULL;
  221         struct physio_stat *ps;
  222         int concurrency = physio_concurrency - 1;   /* physio_wait() threshold for non-sync chunks */
  223 
  224         error = RUN_ONCE(&physio_initialized, physio_init);
  225         if (__predict_false(error != 0)) {
  226                 return error;
  227         }
  228 
  229         DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
  230             __func__, uio->uio_offset, uio->uio_resid));
  231 
  232         flags &= B_READ | B_WRITE;
  233 
  234         if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL)
  235                 return ENOMEM;
              /* The zeroing allocation already set the three fields below. */
  236         /* ps->ps_running = 0; */
  237         /* ps->ps_error = 0; */
  238         /* ps->ps_failed = 0; */
  239         ps->ps_orig_bp = obp;
  240         ps->ps_endoffset = -1;          /* -1 means "no short transfer seen" */
  241         mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
  242         cv_init(&ps->ps_cv, "physio");
  243 
  244         /* Make sure we have a buffer, creating one if necessary. */
  245         if (obp != NULL) {
  246                 mutex_enter(&bufcache_lock);
  247                 /* Mark it busy, so nobody else will use it. */
  248                 while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
  249                         ;
  250                 mutex_exit(&bufcache_lock);
  251                 concurrency = 0; /* see "XXXkludge" comment below */
  252         }
  253 
  254         for (i = 0; i < uio->uio_iovcnt; i++) {
  255                 bool sync = true;       /* first chunk of an iovec waits for all earlier transfers */
  256 
  257                 iovp = &uio->uio_iov[i];
  258                 while (iovp->iov_len > 0) {
  259                         size_t todo;
  260                         vaddr_t endp;
  261 
                              /*
                               * Stop issuing chunks once any transfer has
                               * ended short.  Otherwise wait until at most
                               * "concurrency" transfers are running, or
                               * none at all when "sync" is set.
                               */
  262                         mutex_enter(&ps->ps_lock);
  263                         if (ps->ps_failed != 0) {
  264                                 goto done_locked;
  265                         }
  266                         physio_wait(ps, sync ? 0 : concurrency);
  267                         mutex_exit(&ps->ps_lock);
  268                         if (obp != NULL) {
  269                                 /*
  270                                  * XXXkludge
  271                                  * some drivers use "obp" as an identifier.
  272                                  */
  273                                 bp = obp;
  274                         } else {
  275                                 bp = getiobuf(NULL, true);
  276                                 bp->b_cflags = BC_BUSY;
  277                         }
  278                         bp->b_dev = dev;
  279                         bp->b_proc = p;
  280                         bp->b_private = ps;     /* lets the completion path find the shared state */
  281 
  282                         /*
  283                          * Mark the buffer busy for physical I/O.  Also set
  284                          * B_PHYS because it's an I/O to user memory, and
  285                          * B_RAW because B_RAW is to be "set by physio for
  286                          * raw transfers".
  287                          */
  288                         bp->b_oflags = 0;
  289                         bp->b_cflags = BC_BUSY;
  290                         bp->b_flags = flags | B_PHYS | B_RAW;
  291                         bp->b_iodone = physio_biodone;
  292 
  293                         /* Set up the buffer for a maximum-sized transfer. */
  294                         bp->b_blkno = btodb(uio->uio_offset);
                              /* The offset must convert to a block number and back unchanged. */
  295                         if (dbtob(bp->b_blkno) != uio->uio_offset) {
  296                                 error = EINVAL;
  297                                 goto done;
  298                         }
  299                         bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
  300                         bp->b_data = iovp->iov_base;
  301 
  302                         /*
  303                          * Call minphys to bound the transfer size,
  304                          * and remember the amount of data to transfer,
  305                          * for later comparison.
  306                          */
  307                         (*min_phys)(bp);
  308                         todo = bp->b_bufsize = bp->b_bcount;
  309 #if defined(DIAGNOSTIC)
  310                         if (todo > MAXPHYS)
  311                                 panic("todo(%zu) > MAXPHYS; minphys broken",
  312                                     todo);
  313 #endif /* defined(DIAGNOSTIC) */
  314 
                              /*
                               * Decide whether the NEXT chunk may overlap
                               * with this one: only when this chunk ends
                               * on a page boundary.
                               */
  315                         sync = false;
  316                         endp = (vaddr_t)bp->b_data + todo;
  317                         if (trunc_page(endp) != endp) {
  318                                 /*
  319                                  * Following requests can overlap.
  320                                  * note that uvm_vslock does round_page.
  321                                  */
  322                                 sync = true;
  323                         }
  324 
  325                         /*
  326                          * Lock the part of the user address space involved
  327                          * in the transfer.
  328                          */
  329                         error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
  330                             (flags & B_READ) ?  VM_PROT_WRITE : VM_PROT_READ);
  331                         if (error) {
  332                                 goto done;
  333                         }
  334 
  335                         /*
  336                          * Beware vmapbuf(); if successful it clobbers
  337                          * b_data and saves it in b_saveaddr.
  338                          * However, vunmapbuf() restores b_data.
  339                          */
  340                         if ((error = vmapbuf(bp, todo)) != 0) {
  341                                 uvm_vsunlock(p->p_vmspace, bp->b_data, todo);
  342                                 goto done;
  343                         }
  344 
  345                         BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
  346 
                              /* Count the transfer before starting it; physio_done() decrements. */
  347                         mutex_enter(&ps->ps_lock);
  348                         ps->ps_running++;
  349                         mutex_exit(&ps->ps_lock);
  350 
  351                         /* Call strategy to start the transfer. */
  352                         (*strategy)(bp);
  353                         bp = NULL;      /* handed off; must not be released at "done" */
  354 
                              /* Advance past the chunk that was just started. */
  355                         iovp->iov_len -= todo;
  356                         iovp->iov_base = (char *)iovp->iov_base + todo;
  357                         uio->uio_offset += todo;
  358                         uio->uio_resid -= todo;
  359                 }
  360         }
  361 
  362 done:
  363         mutex_enter(&ps->ps_lock);
  364 done_locked:
              /* Wait until no started transfer is still running. */
  365         physio_wait(ps, 0);
  366         mutex_exit(&ps->ps_lock);
  367 
  368         if (ps->ps_failed != 0) {
  369                 off_t delta;
  370 
                      /*
                       * uio_offset has advanced past the point where the
                       * earliest short transfer stopped; add the difference
                       * back to uio_resid.  uio_offset itself is left as is.
                       */
  371                 delta = uio->uio_offset - ps->ps_endoffset;
  372                 KASSERT(delta > 0);
  373                 uio->uio_resid += delta;
  374                 /* uio->uio_offset = ps->ps_endoffset; */
  375         } else {
  376                 KASSERT(ps->ps_endoffset == -1);
  377         }
              /* Release a buf obtained from getiobuf() that was never passed to strategy. */
  378         if (bp != NULL && bp != obp) {
  379                 putiobuf(bp);
  380         }
              /* A setup error takes precedence over an error recorded at completion. */
  381         if (error == 0) {
  382                 error = ps->ps_error;
  383         }
  384         mutex_destroy(&ps->ps_lock);
  385         cv_destroy(&ps->ps_cv);
  386         kmem_free(ps, sizeof(*ps));
  387 
  388         /*
  389          * Clean up the state of the buffer.  Remember if somebody wants
  390          * it, so we can wake them up below.  Also, if we had to steal it,
  391          * give it back.
  392          */
  393         if (obp != NULL) {
  394                 KASSERT((obp->b_cflags & BC_BUSY) != 0);
  395 
  396                 /*
  397                  * If another process is waiting for the raw I/O buffer,
  398                  * wake up processes waiting to do physical I/O;
  399                  */
  400                 mutex_enter(&bufcache_lock);
  401                 obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
  402                 obp->b_flags &= ~(B_PHYS | B_RAW);
  403                 obp->b_iodone = NULL;
  404                 cv_broadcast(&obp->b_busy);
  405                 mutex_exit(&bufcache_lock);
  406         }
  407 
  408         DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
  409             __func__, uio->uio_offset, uio->uio_resid));
  410 
  411         return error;
  412 }
  413 
  414 /*
  415  * A minphys() routine is called by physio() to adjust the size of each
  416  * I/O transfer before the latter is passed to the strategy routine.
  417  *
  418  * This minphys() is a default that must be called to enforce limits
  419  * that are applicable to all devices, because of limitations in the
  420  * kernel or the hardware platform.
  421  */
  422 void
  423 minphys(struct buf *bp)
  424 {
  425 
  426         if (bp->b_bcount > MAXPHYS)
  427                 bp->b_bcount = MAXPHYS;
  428 }

Cache object: 1a2ddbfeee7916d2f28315c07345167e


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.