The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_physio.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: kern_physio.c,v 1.76 2006/11/01 10:17:58 yamt Exp $    */
    2 
    3 /*-
    4  * Copyright (c) 1982, 1986, 1990, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  * (c) UNIX System Laboratories, Inc.
    7  * All or some portions of this file are derived from material licensed
    8  * to the University of California by American Telephone and Telegraph
    9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   10  * the permission of UNIX System Laboratories, Inc.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)kern_physio.c       8.1 (Berkeley) 6/10/93
   37  */
   38 
   39 /*-
   40  * Copyright (c) 1994 Christopher G. Demetriou
   41  *
   42  * Redistribution and use in source and binary forms, with or without
   43  * modification, are permitted provided that the following conditions
   44  * are met:
   45  * 1. Redistributions of source code must retain the above copyright
   46  *    notice, this list of conditions and the following disclaimer.
   47  * 2. Redistributions in binary form must reproduce the above copyright
   48  *    notice, this list of conditions and the following disclaimer in the
   49  *    documentation and/or other materials provided with the distribution.
   50  * 3. All advertising materials mentioning features or use of this software
   51  *    must display the following acknowledgement:
   52  *      This product includes software developed by the University of
   53  *      California, Berkeley and its contributors.
   54  * 4. Neither the name of the University nor the names of its contributors
   55  *    may be used to endorse or promote products derived from this software
   56  *    without specific prior written permission.
   57  *
   58  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   60  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   61  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   62  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   63  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   64  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   68  * SUCH DAMAGE.
   69  *
   70  *      @(#)kern_physio.c       8.1 (Berkeley) 6/10/93
   71  */
   72 
   73 #include <sys/cdefs.h>
   74 __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.76 2006/11/01 10:17:58 yamt Exp $");
   75 
   76 #include <sys/param.h>
   77 #include <sys/systm.h>
   78 #include <sys/buf.h>
   79 #include <sys/proc.h>
   80 #include <sys/once.h>
   81 #include <sys/workqueue.h>
   82 
   83 #include <uvm/uvm_extern.h>
   84 
   /*
    * Work queue used to finish completed physio transfers: physio_init()
    * creates it with physio_done() as its handler, and physio_biodone()
    * queues each finished buffer on it.
    */
   struct workqueue *physio_workqueue;

   /* Control variable that physio() hands to RUN_ONCE() to invoke physio_init(). */
   ONCE_DECL(physio_initialized);
   87 
   88 /*
   89  * The routines implemented in this file are described in:
   90  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
   91  *          UNIX Operating System (Addison-Wesley, 1989)
   92  * on pages 231-233.
   93  *
   94  * The routines "getphysbuf" and "putphysbuf" steal and return a swap
   95  * buffer.  Leffler, et al., says that swap buffers are used to do the
   96  * I/O, so raw I/O requests don't have to be single-threaded.  Of course,
   97  * NetBSD doesn't use "swap buffers" -- we have our own memory pool for
   98  * buffer descriptors.
   99  */
  100 
  /*
   * Debug tracing.  With PHYSIO_DEBUG defined, DPRINTF((fmt, ...)) expands
   * to a printf call; without it, DPRINTF expands to nothing and its
   * arguments are never evaluated.  The doubled parentheses are required
   * because the whole argument list is passed as one macro argument.
   */
  /* #define PHYSIO_DEBUG */
  #ifdef PHYSIO_DEBUG
  #define DPRINTF(a)      printf a
  #else /* !PHYSIO_DEBUG */
  #define DPRINTF(a)      /* nothing */
  #endif /* PHYSIO_DEBUG */
  107 
  /*
   * Fields and flags of struct buf that this file reuses under its own names.
   *
   * B_DONTFREE  - set on a caller-supplied buffer so that putphysbuf()
   *               leaves it alone instead of releasing it.
   * b_running   - on the master buffer: number of transfers handed to the
   *               strategy routine and not yet retired by physio_done().
   * b_endoffset - on the master buffer: byte offset at which the earliest
   *               failed or short transfer stopped; -1 while none has.
   */
  #define B_DONTFREE      B_AGE
  #define b_running       b_freelistindex
  #define b_endoffset     b_lblkno
  112 
  113 /*
  114  * allocate a buffer structure for use in physical I/O.
  115  */
  116 static struct buf *
  117 getphysbuf(void)
  118 {
  119         struct buf *bp;
  120 
  121         bp = getiobuf();
  122         bp->b_error = 0;
  123         bp->b_flags = B_BUSY;
  124         return(bp);
  125 }
  126 
  127 /*
  128  * get rid of a swap buffer structure which has been used in physical I/O.
  129  */
  130 static void
  131 putphysbuf(struct buf *bp)
  132 {
  133 
  134         if ((bp->b_flags & B_DONTFREE) != 0) {
  135                 return;
  136         }
  137 
  138         if (__predict_false(bp->b_flags & B_WANTED))
  139                 panic("putphysbuf: private buf B_WANTED");
  140         putiobuf(bp);
  141 }
  142 
  143 static void
  144 physio_done(struct work *wk, void *dummy)
  145 {
  146         struct buf *bp = (void *)wk;
  147         size_t todo = bp->b_bufsize;
  148         size_t done = bp->b_bcount - bp->b_resid;
  149         struct buf *mbp = bp->b_private;
  150 
  151         KASSERT(&bp->b_work == wk);
  152         KASSERT(bp->b_bcount <= todo);
  153         KASSERT(bp->b_resid <= bp->b_bcount);
  154         KASSERT((bp->b_flags & B_PHYS) != 0);
  155         KASSERT(dummy == NULL);
  156 
  157         vunmapbuf(bp, todo);
  158         uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);
  159 
  160         simple_lock(&mbp->b_interlock);
  161         if (__predict_false(done != todo)) {
  162                 off_t endoffset = dbtob(bp->b_blkno) + done;
  163 
  164                 /*
  165                  * we got an error or hit EOM.
  166                  *
  167                  * we only care about the first one.
  168                  * ie. the one at the lowest offset.
  169                  */
  170 
  171                 KASSERT(mbp->b_endoffset != endoffset);
  172                 DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
  173                     ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
  174                     __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
  175                     bp->b_blkno, bp->b_bcount, bp->b_flags));
  176 
  177                 if (mbp->b_endoffset == -1 || endoffset < mbp->b_endoffset) {
  178                         int error;
  179 
  180                         if ((bp->b_flags & B_ERROR) != 0) {
  181                                 if (bp->b_error == 0) {
  182                                         error = EIO; /* XXX */
  183                                 } else {
  184                                         error = bp->b_error;
  185                                 }
  186                         } else {
  187                                 error = 0; /* EOM */
  188                         }
  189 
  190                         DPRINTF(("%s: mbp=%p, error %d -> %d, endoff %" PRIu64
  191                             " -> %" PRIu64 "\n",
  192                             __func__, mbp,
  193                             mbp->b_error, error,
  194                             mbp->b_endoffset, endoffset));
  195 
  196                         mbp->b_endoffset = endoffset;
  197                         mbp->b_error = error;
  198                 }
  199                 mbp->b_flags |= B_ERROR;
  200         } else {
  201                 KASSERT((bp->b_flags & B_ERROR) == 0);
  202         }
  203 
  204         mbp->b_running--;
  205         if ((mbp->b_flags & B_WANTED) != 0) {
  206                 mbp->b_flags &= ~B_WANTED;
  207                 wakeup(mbp);
  208         }
  209         simple_unlock(&mbp->b_interlock);
  210 
  211         putphysbuf(bp);
  212 }
  213 
  214 static void
  215 physio_biodone(struct buf *bp)
  216 {
  217 #if defined(DIAGNOSTIC)
  218         struct buf *mbp = bp->b_private;
  219         size_t todo = bp->b_bufsize;
  220 
  221         KASSERT(mbp->b_running > 0);
  222         KASSERT(bp->b_bcount <= todo);
  223         KASSERT(bp->b_resid <= bp->b_bcount);
  224 #endif /* defined(DIAGNOSTIC) */
  225 
  226         workqueue_enqueue(physio_workqueue, &bp->b_work);
  227 }
  228 
  229 static int
  230 physio_wait(struct buf *bp, int n, const char *wchan)
  231 {
  232         int error = 0;
  233 
  234         LOCK_ASSERT(simple_lock_held(&bp->b_interlock));
  235 
  236         while (bp->b_running > n) {
  237                 bp->b_flags |= B_WANTED;
  238                 error = ltsleep(bp, PRIBIO + 1, wchan, 0, &bp->b_interlock);
  239                 if (error) {
  240                         break;
  241                 }
  242         }
  243 
  244         return error;
  245 }
  246 
  247 static int
  248 physio_init(void)
  249 {
  250         int error;
  251 
  252         KASSERT(physio_workqueue == NULL);
  253 
  254         error = workqueue_create(&physio_workqueue, "physiod",
  255             physio_done, NULL, PRIBIO, 0/* IPL_BIO notyet */, 0);
  256 
  257         return error;
  258 }
  259 
  /*
   * Upper bound on transfers that one physio() call keeps in flight when
   * it allocates its own buffers.  XXX tune
   */
  #define PHYSIO_CONCURRENCY      16
  261 
  262 /*
  263  * Do "physical I/O" on behalf of a user.  "Physical I/O" is I/O directly
  264  * from the raw device to user buffers, and bypasses the buffer cache.
  265  *
  266  * Comments in brackets are from Leffler, et al.'s pseudo-code implementation.
       *
       * Parameters:
       *   strategy - called once per request to start the transfer.
       *   obp      - optional caller-supplied buffer.  When non-NULL it is
       *              used for every request, and requests are issued one
       *              at a time.  When NULL a private buffer is allocated
       *              per request.
       *   dev      - stored in b_dev of every request.
       *   flags    - only the B_READ / B_WRITE bits are used.
       *   min_phys - called on each request to bound b_bcount.
       *   uio      - describes the user memory and device offset; its
       *              iovecs, uio_offset and uio_resid are advanced as
       *              requests are issued.
       *
       * Returns 0 on success.  Otherwise returns the first error seen:
       * from one-time initialisation, EINVAL for an offset that does not
       * survive the btodb()/dbtob() round trip, from uvm_vslock(), from
       * physio_wait(), or the error recorded on the master buffer by
       * physio_done().  After a failed or short transfer uio_resid is
       * increased again by the amount past the recorded end offset;
       * uio_offset is not rewound.
  267  */
  268 int
  269 physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
  270     void (*min_phys)(struct buf *), struct uio *uio)
  271 {
  272         struct iovec *iovp;
  273         struct lwp *l = curlwp;
  274         struct proc *p = l->l_proc;
  275         int i, s;
  276         int error;
  277         int error2;
              /* request being prepared; NULL once handed to strategy */
  278         struct buf *bp = NULL;
              /* master buffer: bookkeeping only, never passed to strategy */
  279         struct buf *mbp;
              /*
               * physio_wait() lets us continue while b_running <= concurrency
               * and one more request is then started, so at most
               * PHYSIO_CONCURRENCY requests are in flight.
               */
  280         int concurrency = PHYSIO_CONCURRENCY - 1;
  281 
              /* Set up the completion work queue through RUN_ONCE(). */
  282         error = RUN_ONCE(&physio_initialized, physio_init);
  283         if (__predict_false(error != 0)) {
  284                 return error;
  285         }
  286 
  287         DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
  288             __func__, uio->uio_offset, uio->uio_resid));
  289 
              /* Discard every caller flag except the transfer direction. */
  290         flags &= B_READ | B_WRITE;
  291 
  292         /* Make sure we have a buffer, creating one if necessary. */
  293         if (obp != NULL) {
  294                 /* [raise the processor priority level to splbio;] */
  295                 s = splbio();
  296                 simple_lock(&obp->b_interlock);
  297 
  298                 /* [while the buffer is marked busy] */
  299                 while (obp->b_flags & B_BUSY) {
  300                         /* [mark the buffer wanted] */
  301                         obp->b_flags |= B_WANTED;
  302                         /* [wait until the buffer is available] */
  303                         ltsleep(obp, PRIBIO+1, "physbuf", 0, &obp->b_interlock);
  304                 }
  305 
  306                 /* Mark it busy, so nobody else will use it. */
                      /* B_DONTFREE keeps putphysbuf() from releasing it. */
  307                 obp->b_flags = B_BUSY | B_DONTFREE;
  308 
  309                 /* [lower the priority level] */
  310                 simple_unlock(&obp->b_interlock);
  311                 splx(s);
  312 
                      /* a single shared buffer allows only one request at a time */
  313                 concurrency = 0; /* see "XXXkludge" comment below */
  314         }
  315 
              /* No requests outstanding and no failure recorded yet. */
  316         mbp = getphysbuf();
  317         mbp->b_running = 0;
  318         mbp->b_endoffset = -1;
  319 
              /* NOTE(review): presumably pins the LWP for the transfer; confirm. */
  320         PHOLD(l);
  321 
  322         for (i = 0; i < uio->uio_iovcnt; i++) {
                      /* the first request of each iovec waits for all earlier ones */
  323                 boolean_t sync = TRUE;
  324 
  325                 iovp = &uio->uio_iov[i];
  326                 while (iovp->iov_len > 0) {
  327                         size_t todo;
  328                         vaddr_t endp;
  329 
  330                         simple_lock(&mbp->b_interlock);
                              /* stop issuing requests once any transfer has failed */
  331                         if ((mbp->b_flags & B_ERROR) != 0) {
  332                                 goto done_locked;
  333                         }
  334                         error = physio_wait(mbp, sync ? 0 : concurrency,
  335                             "physio1");
  336                         if (error) {
  337                                 goto done_locked;
  338                         }
  339                         simple_unlock(&mbp->b_interlock);
  340                         if (obp != NULL) {
  341                                 /*
  342                                  * XXXkludge
  343                                  * some drivers use "obp" as an identifier.
  344                                  */
  345                                 bp = obp;
  346                         } else {
  347                                 bp = getphysbuf();
  348                         }
  349                         bp->b_dev = dev;
  350                         bp->b_proc = p;
                              /* lets physio_done() find the master buffer */
  351                         bp->b_private = mbp;
  352                         bp->b_vp = NULL;
  353 
  354                         /*
  355                          * [mark the buffer busy for physical I/O]
  356                          * (i.e. set B_PHYS (because it's an I/O to user
  357                          * memory), and B_RAW, because B_RAW is to be
  358                          * "Set by physio for raw transfers.", in addition
  359                          * to the "busy" and read/write flag.)
                               * B_DONTFREE is the only existing flag carried
                               * over; physio_biodone() is installed as the
                               * completion callback.
  360                          */
  361                         bp->b_flags = (bp->b_flags & B_DONTFREE) |
  362                             B_BUSY | B_PHYS | B_RAW | B_CALL | flags;
  363                         bp->b_iodone = physio_biodone;
  364 
  365                         /* [set up the buffer for a maximum-sized transfer] */
  366                         bp->b_blkno = btodb(uio->uio_offset);
                              /* reject offsets that btodb()/dbtob() cannot reproduce */
  367                         if (dbtob(bp->b_blkno) != uio->uio_offset) {
  368                                 error = EINVAL;
  369                                 goto done;
  370                         }
  371                         bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
  372                         bp->b_data = iovp->iov_base;
  373 
  374                         /*
  375                          * [call minphys to bound the transfer size]
  376                          * and remember the amount of data to transfer,
  377                          * for later comparison.
  378                          */
  379                         (*min_phys)(bp);
  380                         todo = bp->b_bufsize = bp->b_bcount;
  381 #if defined(DIAGNOSTIC)
  382                         if (todo > MAXPHYS)
  383                                 panic("todo(%zu) > MAXPHYS; minphys broken",
  384                                     todo);
  385 #endif /* defined(DIAGNOSTIC) */
  386 
  387                         sync = FALSE;
  388                         endp = (vaddr_t)bp->b_data + todo;
  389                         if (trunc_page(endp) != endp) {
  390                                 /*
  391                                  * following requests can overlap.
  392                                  * note that uvm_vslock does round_page.
                                       * so make the next request wait until
                                       * every outstanding one has finished.
  393                                  */
  394                                 sync = TRUE;
  395                         }
  396 
  397                         /*
  398                          * [lock the part of the user address space involved
  399                          *    in the transfer]
  400                          * Beware vmapbuf(); it clobbers b_data and
  401                          * saves it in b_saveaddr.  However, vunmapbuf()
  402                          * restores it.
                               * A device read stores into user memory, hence
                               * the VM_PROT_WRITE access for B_READ.
  403                          */
  404                         error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
  405                             (flags & B_READ) ?  VM_PROT_WRITE : VM_PROT_READ);
  406                         if (error) {
  407                                 goto done;
  408                         }
  409                         vmapbuf(bp, todo);
  410 
  411                         BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
  412 
                              /* count the request before the driver can complete it */
  413                         simple_lock(&mbp->b_interlock);
  414                         mbp->b_running++;
  415                         simple_unlock(&mbp->b_interlock);
  416 
  417                         /* [call strategy to start the transfer] */
  418                         (*strategy)(bp);
                              /* physio_done() now releases the buffer, not us */
  419                         bp = NULL;
  420 
                              /* advance the iovec and uio past the issued request */
  421                         iovp->iov_len -= todo;
  422                         iovp->iov_base = (caddr_t)iovp->iov_base + todo;
  423                         uio->uio_offset += todo;
  424                         uio->uio_resid -= todo;
  425                 }
  426         }
  427 
              /*
               * done:        entered without mbp->b_interlock held.
               * done_locked: entered with mbp->b_interlock held.
               * Either way, wait for every outstanding request to finish.
               */
  428 done:
  429         simple_lock(&mbp->b_interlock);
  430 done_locked:
  431         error2 = physio_wait(mbp, 0, "physio2");
              /* an earlier error takes precedence over one from this wait */
  432         if (error == 0) {
  433                 error = error2;
  434         }
  435         simple_unlock(&mbp->b_interlock);
  436 
  437         if ((mbp->b_flags & B_ERROR) != 0) {
  438                 off_t delta;
  439 
                      /*
                       * Some request stopped short: give back to uio_resid
                       * everything issued beyond the recorded end offset.
                       */
  440                 delta = uio->uio_offset - mbp->b_endoffset;
  441                 KASSERT(delta > 0);
  442                 uio->uio_resid += delta;
  443                 /* uio->uio_offset = mbp->b_endoffset; */
  444         } else {
  445                 KASSERT(mbp->b_endoffset == -1);
  446         }
              /* a request that was prepared but never handed to strategy */
  447         if (bp != NULL) {
  448                 putphysbuf(bp);
  449         }
              /* otherwise report the error recorded by physio_done() */
  450         if (error == 0) {
  451                 error = mbp->b_error;
  452         }
  453         putphysbuf(mbp);
  454 
  455         /*
  456          * [clean up the state of the buffer]
  457          * Remember if somebody wants it, so we can wake them up below.
  458          * Also, if we had to steal it, give it back.
  459          */
  460         if (obp != NULL) {
  461                 KASSERT((obp->b_flags & B_BUSY) != 0);
  462                 KASSERT((obp->b_flags & B_DONTFREE) != 0);
  463 
  464                 /*
  465                  * [if another process is waiting for the raw I/O buffer,
  466                  *    wake up processes waiting to do physical I/O;]
  467                  */
  468                 s = splbio();
  469                 simple_lock(&obp->b_interlock);
  470                 obp->b_flags &=
  471                     ~(B_BUSY | B_PHYS | B_RAW | B_CALL | B_DONTFREE);
  472                 if ((obp->b_flags & B_WANTED) != 0) {
  473                         obp->b_flags &= ~B_WANTED;
  474                         wakeup(obp);
  475                 }
  476                 simple_unlock(&obp->b_interlock);
  477                 splx(s);
  478         }
  479         PRELE(l);
  480 
  481         DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
  482             __func__, uio->uio_offset, uio->uio_resid));
  483 
  484         return error;
  485 }
  486 
  487 /*
  488  * Leffler, et al., says on p. 231:
  489  * "The minphys() routine is called by physio() to adjust the
  490  * size of each I/O transfer before the latter is passed to
  491  * the strategy routine..."
  492  *
  493  * so, just adjust the buffer's count accounting to MAXPHYS here,
  494  * and return the new count;
  495  */
  496 void
  497 minphys(struct buf *bp)
  498 {
  499 
  500         if (bp->b_bcount > MAXPHYS)
  501                 bp->b_bcount = MAXPHYS;
  502 }

Cache object: f2b8e8f12e8da417d6c2e28010652c55


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.