The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_physio.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: kern_physio.c,v 1.88.4.1 2009/05/27 21:42:08 snj Exp $ */
    2 
    3 /*-
    4  * Copyright (c) 1982, 1986, 1990, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  * (c) UNIX System Laboratories, Inc.
    7  * All or some portions of this file are derived from material licensed
    8  * to the University of California by American Telephone and Telegraph
    9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   10  * the permission of UNIX System Laboratories, Inc.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)kern_physio.c       8.1 (Berkeley) 6/10/93
   37  */
   38 
   39 /*-
   40  * Copyright (c) 1994 Christopher G. Demetriou
   41  *
   42  * Redistribution and use in source and binary forms, with or without
   43  * modification, are permitted provided that the following conditions
   44  * are met:
   45  * 1. Redistributions of source code must retain the above copyright
   46  *    notice, this list of conditions and the following disclaimer.
   47  * 2. Redistributions in binary form must reproduce the above copyright
   48  *    notice, this list of conditions and the following disclaimer in the
   49  *    documentation and/or other materials provided with the distribution.
   50  * 3. All advertising materials mentioning features or use of this software
   51  *    must display the following acknowledgement:
   52  *      This product includes software developed by the University of
   53  *      California, Berkeley and its contributors.
   54  * 4. Neither the name of the University nor the names of its contributors
   55  *    may be used to endorse or promote products derived from this software
   56  *    without specific prior written permission.
   57  *
   58  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   60  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   61  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   62  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   63  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   64  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   68  * SUCH DAMAGE.
   69  *
   70  *      @(#)kern_physio.c       8.1 (Berkeley) 6/10/93
   71  */
   72 
   73 #include <sys/cdefs.h>
   74 __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.88.4.1 2009/05/27 21:42:08 snj Exp $");
   75 
   76 #include <sys/param.h>
   77 #include <sys/systm.h>
   78 #include <sys/buf.h>
   79 #include <sys/proc.h>
   80 #include <sys/once.h>
   81 #include <sys/workqueue.h>
   82 #include <sys/kmem.h>
   83 
   84 #include <uvm/uvm_extern.h>
   85 
   86 ONCE_DECL(physio_initialized);
   87 struct workqueue *physio_workqueue;
   88 
   89 /*
   90  * The routines implemented in this file are described in:
   91  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
   92  *          UNIX Operating System (Addison-Wesley, 1989)
   93  * on pages 231-233.
   94  */
   95 
   96 /* #define      PHYSIO_DEBUG */
   97 #if defined(PHYSIO_DEBUG)
   98 #define DPRINTF(a)      printf a
   99 #else /* defined(PHYSIO_DEBUG) */
  100 #define DPRINTF(a)      /* nothing */
  101 #endif /* defined(PHYSIO_DEBUG) */
  102 
  103 struct physio_stat {
  104         int ps_running;
  105         int ps_error;
  106         int ps_failed;
  107         off_t ps_endoffset;
  108         buf_t *ps_orig_bp;
  109         kmutex_t ps_lock;
  110         kcondvar_t ps_cv;
  111 };
  112 
  113 static void
  114 physio_done(struct work *wk, void *dummy)
  115 {
  116         struct buf *bp = (void *)wk;
  117         size_t todo = bp->b_bufsize;
  118         size_t done = bp->b_bcount - bp->b_resid;
  119         struct physio_stat *ps = bp->b_private;
  120         bool is_iobuf;
  121 
  122         KASSERT(&bp->b_work == wk);
  123         KASSERT(bp->b_bcount <= todo);
  124         KASSERT(bp->b_resid <= bp->b_bcount);
  125         KASSERT((bp->b_flags & B_PHYS) != 0);
  126         KASSERT(dummy == NULL);
  127 
  128         vunmapbuf(bp, todo);
  129         uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);
  130 
  131         mutex_enter(&ps->ps_lock);
  132         is_iobuf = (bp != ps->ps_orig_bp);
  133         if (__predict_false(done != todo)) {
  134                 off_t endoffset = dbtob(bp->b_blkno) + done;
  135 
  136                 /*
  137                  * we got an error or hit EOM.
  138                  *
  139                  * we only care about the first one.
  140                  * ie. the one at the lowest offset.
  141                  */
  142 
  143                 KASSERT(ps->ps_endoffset != endoffset);
  144                 DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
  145                     ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
  146                     __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
  147                     bp->b_blkno, bp->b_bcount, bp->b_flags));
  148 
  149                 if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
  150                         DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
  151                             " -> %" PRIu64 "\n",
  152                             __func__, ps,
  153                             ps->ps_error, bp->b_error,
  154                             ps->ps_endoffset, endoffset));
  155 
  156                         ps->ps_endoffset = endoffset;
  157                         ps->ps_error = bp->b_error;
  158                 }
  159                 ps->ps_failed++;
  160         } else {
  161                 KASSERT(bp->b_error == 0);
  162         }
  163 
  164         ps->ps_running--;
  165         cv_signal(&ps->ps_cv);
  166         mutex_exit(&ps->ps_lock);
  167 
  168         if (is_iobuf)
  169                 putiobuf(bp);
  170 }
  171 
  172 static void
  173 physio_biodone(struct buf *bp)
  174 {
  175 #if defined(DIAGNOSTIC)
  176         struct physio_stat *ps = bp->b_private;
  177         size_t todo = bp->b_bufsize;
  178 
  179         KASSERT(ps->ps_running > 0);
  180         KASSERT(bp->b_bcount <= todo);
  181         KASSERT(bp->b_resid <= bp->b_bcount);
  182 #endif /* defined(DIAGNOSTIC) */
  183 
  184         workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
  185 }
  186 
  187 static void
  188 physio_wait(struct physio_stat *ps, int n)
  189 {
  190 
  191         KASSERT(mutex_owned(&ps->ps_lock));
  192 
  193         while (ps->ps_running > n)
  194                 cv_wait(&ps->ps_cv, &ps->ps_lock);
  195 }
  196 
  197 static int
  198 physio_init(void)
  199 {
  200         int error;
  201 
  202         KASSERT(physio_workqueue == NULL);
  203 
  204         error = workqueue_create(&physio_workqueue, "physiod",
  205             physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);
  206 
  207         return error;
  208 }
  209 
  210 #define PHYSIO_CONCURRENCY      16      /* XXX tune */
  211 
  212 /*
  213  * Do "physical I/O" on behalf of a user.  "Physical I/O" is I/O directly
  214  * from the raw device to user buffers, and bypasses the buffer cache.
  215  *
  216  * Comments in brackets are from Leffler, et al.'s pseudo-code implementation.
  217  */
  218 int
  219 physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
  220     void (*min_phys)(struct buf *), struct uio *uio)
  221 {
  222         struct iovec *iovp;
  223         struct lwp *l = curlwp;
  224         struct proc *p = l->l_proc;
  225         int i, error;
  226         struct buf *bp = NULL;
  227         struct physio_stat *ps;
  228         int concurrency = PHYSIO_CONCURRENCY - 1;
  229 
  230         error = RUN_ONCE(&physio_initialized, physio_init);
  231         if (__predict_false(error != 0)) {
  232                 return error;
  233         }
  234 
  235         DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
  236             __func__, uio->uio_offset, uio->uio_resid));
  237 
  238         flags &= B_READ | B_WRITE;
  239 
  240         if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL)
  241                 return ENOMEM;
  242         /* ps->ps_running = 0; */
  243         /* ps->ps_error = 0; */
  244         /* ps->ps_failed = 0; */
  245         ps->ps_orig_bp = obp;
  246         ps->ps_endoffset = -1;
  247         mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
  248         cv_init(&ps->ps_cv, "physio");
  249 
  250         /* Make sure we have a buffer, creating one if necessary. */
  251         if (obp != NULL) {
  252                 /* [raise the processor priority level to splbio;] */
  253                 mutex_enter(&bufcache_lock);
  254                 /* Mark it busy, so nobody else will use it. */
  255                 while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
  256                         ;
  257                 mutex_exit(&bufcache_lock);
  258                 concurrency = 0; /* see "XXXkludge" comment below */
  259         }
  260 
  261         uvm_lwp_hold(l);
  262 
  263         for (i = 0; i < uio->uio_iovcnt; i++) {
  264                 bool sync = true;
  265 
  266                 iovp = &uio->uio_iov[i];
  267                 while (iovp->iov_len > 0) {
  268                         size_t todo;
  269                         vaddr_t endp;
  270 
  271                         mutex_enter(&ps->ps_lock);
  272                         if (ps->ps_failed != 0) {
  273                                 goto done_locked;
  274                         }
  275                         physio_wait(ps, sync ? 0 : concurrency);
  276                         mutex_exit(&ps->ps_lock);
  277                         if (obp != NULL) {
  278                                 /*
  279                                  * XXXkludge
  280                                  * some drivers use "obp" as an identifier.
  281                                  */
  282                                 bp = obp;
  283                         } else {
  284                                 bp = getiobuf(NULL, true);
  285                                 bp->b_cflags = BC_BUSY;
  286                         }
  287                         bp->b_dev = dev;
  288                         bp->b_proc = p;
  289                         bp->b_private = ps;
  290 
  291                         /*
  292                          * [mark the buffer busy for physical I/O]
  293                          * (i.e. set B_PHYS (because it's an I/O to user
  294                          * memory, and B_RAW, because B_RAW is to be
  295                          * "Set by physio for raw transfers.", in addition
  296                          * to the "busy" and read/write flag.)
  297                          */
  298                         bp->b_oflags = 0;
  299                         bp->b_cflags = BC_BUSY;
  300                         bp->b_flags = flags | B_PHYS | B_RAW;
  301                         bp->b_iodone = physio_biodone;
  302 
  303                         /* [set up the buffer for a maximum-sized transfer] */
  304                         bp->b_blkno = btodb(uio->uio_offset);
  305                         if (dbtob(bp->b_blkno) != uio->uio_offset) {
  306                                 error = EINVAL;
  307                                 goto done;
  308                         }
  309                         bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
  310                         bp->b_data = iovp->iov_base;
  311 
  312                         /*
  313                          * [call minphys to bound the transfer size]
  314                          * and remember the amount of data to transfer,
  315                          * for later comparison.
  316                          */
  317                         (*min_phys)(bp);
  318                         todo = bp->b_bufsize = bp->b_bcount;
  319 #if defined(DIAGNOSTIC)
  320                         if (todo > MAXPHYS)
  321                                 panic("todo(%zu) > MAXPHYS; minphys broken",
  322                                     todo);
  323 #endif /* defined(DIAGNOSTIC) */
  324 
  325                         sync = false;
  326                         endp = (vaddr_t)bp->b_data + todo;
  327                         if (trunc_page(endp) != endp) {
  328                                 /*
  329                                  * following requests can overlap.
  330                                  * note that uvm_vslock does round_page.
  331                                  */
  332                                 sync = true;
  333                         }
  334 
  335                         /*
  336                          * [lock the part of the user address space involved
  337                          *    in the transfer]
  338                          * Beware vmapbuf(); it clobbers b_data and
  339                          * saves it in b_saveaddr.  However, vunmapbuf()
  340                          * restores it.
  341                          */
  342                         error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
  343                             (flags & B_READ) ?  VM_PROT_WRITE : VM_PROT_READ);
  344                         if (error) {
  345                                 goto done;
  346                         }
  347                         vmapbuf(bp, todo);
  348 
  349                         BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
  350 
  351                         mutex_enter(&ps->ps_lock);
  352                         ps->ps_running++;
  353                         mutex_exit(&ps->ps_lock);
  354 
  355                         /* [call strategy to start the transfer] */
  356                         (*strategy)(bp);
  357                         bp = NULL;
  358 
  359                         iovp->iov_len -= todo;
  360                         iovp->iov_base = (char *)iovp->iov_base + todo;
  361                         uio->uio_offset += todo;
  362                         uio->uio_resid -= todo;
  363                 }
  364         }
  365 
  366 done:
  367         mutex_enter(&ps->ps_lock);
  368 done_locked:
  369         physio_wait(ps, 0);
  370         mutex_exit(&ps->ps_lock);
  371 
  372         if (ps->ps_failed != 0) {
  373                 off_t delta;
  374 
  375                 delta = uio->uio_offset - ps->ps_endoffset;
  376                 KASSERT(delta > 0);
  377                 uio->uio_resid += delta;
  378                 /* uio->uio_offset = ps->ps_endoffset; */
  379         } else {
  380                 KASSERT(ps->ps_endoffset == -1);
  381         }
  382         if (bp != NULL && bp != obp) {
  383                 putiobuf(bp);
  384         }
  385         if (error == 0) {
  386                 error = ps->ps_error;
  387         }
  388         mutex_destroy(&ps->ps_lock);
  389         cv_destroy(&ps->ps_cv);
  390         kmem_free(ps, sizeof(*ps));
  391 
  392         /*
  393          * [clean up the state of the buffer]
  394          * Remember if somebody wants it, so we can wake them up below.
  395          * Also, if we had to steal it, give it back.
  396          */
  397         if (obp != NULL) {
  398                 KASSERT((obp->b_cflags & BC_BUSY) != 0);
  399 
  400                 /*
  401                  * [if another process is waiting for the raw I/O buffer,
  402                  *    wake up processes waiting to do physical I/O;
  403                  */
  404                 mutex_enter(&bufcache_lock);
  405                 obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
  406                 obp->b_flags &= ~(B_PHYS | B_RAW);
  407                 obp->b_iodone = NULL;
  408                 cv_broadcast(&obp->b_busy);
  409                 mutex_exit(&bufcache_lock);
  410         }
  411         uvm_lwp_rele(l);
  412 
  413         DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
  414             __func__, uio->uio_offset, uio->uio_resid));
  415 
  416         return error;
  417 }
  418 
  419 /*
  420  * Leffler, et al., says on p. 231:
  421  * "The minphys() routine is called by physio() to adjust the
  422  * size of each I/O transfer before the latter is passed to
  423  * the strategy routine..."
  424  *
  425  * so, just adjust the buffer's count accounting to MAXPHYS here,
  426  * and return the new count;
  427  */
  428 void
  429 minphys(struct buf *bp)
  430 {
  431 
  432         if (bp->b_bcount > MAXPHYS)
  433                 bp->b_bcount = MAXPHYS;
  434 }

Cache object: f971b4184b5fc182ea02f3f73f9f7ecd


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.