The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_physio.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: kern_physio.c,v 1.102 2022/07/10 23:11:55 riastradh Exp $      */
    2 
    3 /*-
    4  * Copyright (c) 1982, 1986, 1990, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  * (c) UNIX System Laboratories, Inc.
    7  * All or some portions of this file are derived from material licensed
    8  * to the University of California by American Telephone and Telegraph
    9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   10  * the permission of UNIX System Laboratories, Inc.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)kern_physio.c       8.1 (Berkeley) 6/10/93
   37  */
   38 
   39 /*-
   40  * Copyright (c) 1994 Christopher G. Demetriou
   41  *
   42  * Redistribution and use in source and binary forms, with or without
   43  * modification, are permitted provided that the following conditions
   44  * are met:
   45  * 1. Redistributions of source code must retain the above copyright
   46  *    notice, this list of conditions and the following disclaimer.
   47  * 2. Redistributions in binary form must reproduce the above copyright
   48  *    notice, this list of conditions and the following disclaimer in the
   49  *    documentation and/or other materials provided with the distribution.
   50  * 3. All advertising materials mentioning features or use of this software
   51  *    must display the following acknowledgement:
   52  *      This product includes software developed by the University of
   53  *      California, Berkeley and its contributors.
   54  * 4. Neither the name of the University nor the names of its contributors
   55  *    may be used to endorse or promote products derived from this software
   56  *    without specific prior written permission.
   57  *
   58  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   60  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   61  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   62  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   63  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   64  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   68  * SUCH DAMAGE.
   69  *
   70  *      @(#)kern_physio.c       8.1 (Berkeley) 6/10/93
   71  */
   72 
   73 #include <sys/cdefs.h>
   74 __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.102 2022/07/10 23:11:55 riastradh Exp $");
   75 
   76 #include <sys/param.h>
   77 #include <sys/systm.h>
   78 #include <sys/conf.h>
   79 #include <sys/buf.h>
   80 #include <sys/proc.h>
   81 #include <sys/once.h>
   82 #include <sys/workqueue.h>
   83 #include <sys/kmem.h>
   84 
   85 #include <uvm/uvm_extern.h>
   86 
   87 ONCE_DECL(physio_initialized);
   88 struct workqueue *physio_workqueue;
   89 
   90 int physio_concurrency = 16;
   91 
   92 /* #define      PHYSIO_DEBUG */
   93 #if defined(PHYSIO_DEBUG)
   94 #define DPRINTF(a)      printf a
   95 #else /* defined(PHYSIO_DEBUG) */
   96 #define DPRINTF(a)      /* nothing */
   97 #endif /* defined(PHYSIO_DEBUG) */
   98 
   99 struct physio_stat {
  100         int ps_running;
  101         int ps_error;
  102         int ps_failed;
  103         off_t ps_endoffset;
  104         size_t ps_resid;
  105         buf_t *ps_orig_bp;
  106         kmutex_t ps_lock;
  107         kcondvar_t ps_cv;
  108 };
  109 
  110 static void
  111 physio_done(struct work *wk, void *dummy)
  112 {
  113         struct buf *bp = (void *)wk;
  114         size_t todo = bp->b_bufsize;
  115         size_t done = bp->b_bcount - bp->b_resid;
  116         struct physio_stat *ps = bp->b_private;
  117         bool is_iobuf;
  118 
  119         KASSERT(&bp->b_work == wk);
  120         KASSERT(bp->b_bcount <= todo);
  121         KASSERT(bp->b_resid <= bp->b_bcount);
  122         KASSERT((bp->b_flags & B_PHYS) != 0);
  123         KASSERT(dummy == NULL);
  124 
  125         vunmapbuf(bp, todo);
  126         uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);
  127 
  128         mutex_enter(&ps->ps_lock);
  129         is_iobuf = (bp != ps->ps_orig_bp);
  130         if (__predict_false(done != todo)) {
  131                 off_t endoffset = dbtob(bp->b_blkno) + done;
  132 
  133                 /*
  134                  * we got an error or hit EOM.
  135                  *
  136                  * we only care about the first one.
  137                  * ie. the one at the lowest offset.
  138                  */
  139 
  140                 KASSERT(ps->ps_endoffset != endoffset);
  141                 DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
  142                     ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
  143                     __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
  144                     bp->b_blkno, bp->b_bcount, bp->b_flags));
  145 
  146                 if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
  147                         DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
  148                             " -> %" PRIu64 "\n",
  149                             __func__, ps,
  150                             ps->ps_error, bp->b_error,
  151                             ps->ps_endoffset, endoffset));
  152 
  153                         ps->ps_endoffset = endoffset;
  154                         ps->ps_error = bp->b_error;
  155                 }
  156                 ps->ps_failed++;
  157 
  158                 ps->ps_resid += todo - done;
  159         } else {
  160                 KASSERT(bp->b_error == 0);
  161         }
  162 
  163         ps->ps_running--;
  164         cv_signal(&ps->ps_cv);
  165         mutex_exit(&ps->ps_lock);
  166 
  167         if (is_iobuf)
  168                 putiobuf(bp);
  169 }
  170 
  171 static void
  172 physio_biodone(struct buf *bp)
  173 {
  174 #if defined(DIAGNOSTIC)
  175         struct physio_stat *ps = bp->b_private;
  176         size_t todo = bp->b_bufsize;
  177         size_t done = bp->b_bcount - bp->b_resid;
  178 
  179         KASSERT(ps->ps_running > 0);
  180         KASSERT(bp->b_bcount <= todo);
  181         KASSERT(bp->b_resid <= bp->b_bcount);
  182         if (done == todo)
  183                 KASSERTMSG(bp->b_error == 0, "error=%d", bp->b_error);
  184 #endif /* defined(DIAGNOSTIC) */
  185 
  186         workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
  187 }
  188 
  189 static void
  190 physio_wait(struct physio_stat *ps, int n)
  191 {
  192 
  193         KASSERT(mutex_owned(&ps->ps_lock));
  194 
  195         while (ps->ps_running > n)
  196                 cv_wait(&ps->ps_cv, &ps->ps_lock);
  197 }
  198 
  199 static int
  200 physio_init(void)
  201 {
  202         int error;
  203 
  204         KASSERT(physio_workqueue == NULL);
  205 
  206         error = workqueue_create(&physio_workqueue, "physiod",
  207             physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);
  208 
  209         return error;
  210 }
  211 
  212 /*
  213  * Do "physical I/O" on behalf of a user.  "Physical I/O" is I/O directly
  214  * from the raw device to user buffers, and bypasses the buffer cache.
  215  */
  216 int
  217 physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
  218     void (*min_phys)(struct buf *), struct uio *uio)
  219 {
  220         struct iovec *iovp;
  221         struct lwp *l = curlwp;
  222         struct proc *p = l->l_proc;
  223         int i, error;
  224         struct buf *bp = NULL;
  225         struct physio_stat *ps;
  226         int concurrency = physio_concurrency - 1;
  227         int isdisk;
  228 
  229         error = RUN_ONCE(&physio_initialized, physio_init);
  230         if (__predict_false(error != 0)) {
  231                 return error;
  232         }
  233 
  234         DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
  235             __func__, uio->uio_offset, uio->uio_resid));
  236 
  237         flags &= B_READ | B_WRITE;
  238 
  239         ps = kmem_zalloc(sizeof(*ps), KM_SLEEP);
  240         /* ps->ps_running = 0; */
  241         /* ps->ps_error = 0; */
  242         /* ps->ps_failed = 0; */
  243         ps->ps_orig_bp = obp;
  244         ps->ps_endoffset = -1;
  245         ps->ps_resid = 0;
  246         mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
  247         cv_init(&ps->ps_cv, "physio");
  248 
  249         /* Allow concurrent I/O only for disks */
  250         isdisk = cdev_type(dev) == D_DISK;
  251         if (!isdisk)
  252                 concurrency = 0;
  253 
  254         /* Make sure we have a buffer, creating one if necessary. */
  255         if (obp != NULL) {
  256                 mutex_enter(&bufcache_lock);
  257                 /* Mark it busy, so nobody else will use it. */
  258                 while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
  259                         ;
  260                 mutex_exit(&bufcache_lock);
  261                 concurrency = 0; /* see "XXXkludge" comment below */
  262         }
  263 
  264         for (i = 0; i < uio->uio_iovcnt; i++) {
  265                 bool sync = true;
  266 
  267                 iovp = &uio->uio_iov[i];
  268                 while (iovp->iov_len > 0) {
  269                         size_t todo;
  270                         vaddr_t endp;
  271 
  272                         mutex_enter(&ps->ps_lock);
  273                         if (ps->ps_failed != 0) {
  274                                 goto done_locked;
  275                         }
  276                         physio_wait(ps, sync ? 0 : concurrency);
  277                         mutex_exit(&ps->ps_lock);
  278                         if (obp != NULL) {
  279                                 /*
  280                                  * XXXkludge
  281                                  * some drivers use "obp" as an identifier.
  282                                  */
  283                                 bp = obp;
  284                         } else {
  285                                 bp = getiobuf(NULL, true);
  286                                 bp->b_cflags |= BC_BUSY;
  287                         }
  288                         bp->b_dev = dev;
  289                         bp->b_proc = p;
  290                         bp->b_private = ps;
  291 
  292                         /*
  293                          * Mrk the buffer busy for physical I/O.  Also set
  294                          * B_PHYS because it's an I/O to user memory, and
  295                          * B_RAW because B_RAW is to be "set by physio for
  296                          * raw transfers".
  297                          */
  298                         bp->b_oflags = 0;
  299                         bp->b_cflags |= BC_BUSY;
  300                         bp->b_flags = flags | B_PHYS | B_RAW;
  301                         bp->b_iodone = physio_biodone;
  302 
  303                         /* Set up the buffer for a maximum-sized transfer. */
  304                         bp->b_blkno = btodb(uio->uio_offset);
  305                         if (isdisk) {
  306                                 /*
  307                                  * For disks, check that offsets are at least block
  308                                  * aligned, the block addresses are used to track
  309                                  * errors of finished requests.
  310                                  */
  311                                 if (uio->uio_offset & (DEV_BSIZE - 1)) {
  312                                         error = EINVAL;
  313                                         goto done;
  314                                 }
  315                                 /*
  316                                  * Split request into MAXPHYS chunks
  317                                  */
  318                                 bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
  319                         } else {
  320                                 bp->b_bcount = MIN(INT_MAX, iovp->iov_len);
  321                         }
  322                         bp->b_data = iovp->iov_base;
  323 
  324                         /*
  325                          * Call minphys to bound the transfer size,
  326                          * and remember the amount of data to transfer,
  327                          * for later comparison.
  328                          */
  329                         (*min_phys)(bp);
  330                         todo = bp->b_bufsize = bp->b_bcount;
  331 #if defined(DIAGNOSTIC)
  332                         if (todo > MAXPHYS)
  333                                 panic("todo(%zu) > MAXPHYS; minphys broken",
  334                                     todo);
  335 #endif /* defined(DIAGNOSTIC) */
  336 
  337                         sync = false;
  338                         endp = (vaddr_t)bp->b_data + todo;
  339                         if (trunc_page(endp) != endp) {
  340                                 /*
  341                                  * Following requests can overlap.
  342                                  * note that uvm_vslock does round_page.
  343                                  */
  344                                 sync = true;
  345                         }
  346 
  347                         /*
  348                          * Lock the part of the user address space involved
  349                          * in the transfer.
  350                          */
  351                         error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
  352                             (flags & B_READ) ?  VM_PROT_WRITE : VM_PROT_READ);
  353                         if (error) {
  354                                 goto done;
  355                         }
  356 
  357                         /*
  358                          * Beware vmapbuf(); if successful it clobbers
  359                          * b_data and saves it in b_saveaddr.
  360                          * However, vunmapbuf() restores b_data.
  361                          */
  362                         if ((error = vmapbuf(bp, todo)) != 0) {
  363                                 uvm_vsunlock(p->p_vmspace, bp->b_data, todo);
  364                                 goto done;
  365                         }
  366 
  367                         BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
  368 
  369                         mutex_enter(&ps->ps_lock);
  370                         ps->ps_running++;
  371                         mutex_exit(&ps->ps_lock);
  372 
  373                         /* Call strategy to start the transfer. */
  374                         (*strategy)(bp);
  375                         bp = NULL;
  376 
  377                         iovp->iov_len -= todo;
  378                         iovp->iov_base = (char *)iovp->iov_base + todo;
  379                         uio->uio_offset += todo;
  380                         uio->uio_resid -= todo;
  381                 }
  382         }
  383 
  384 done:
  385         mutex_enter(&ps->ps_lock);
  386 done_locked:
  387         physio_wait(ps, 0);
  388         mutex_exit(&ps->ps_lock);
  389 
  390         KASSERT(ps->ps_failed || ps->ps_endoffset == -1);
  391 
  392         /*
  393          * Compute residual, for disks adjust for the
  394          * lowest numbered block that returned an error.
  395          */
  396         if (isdisk) {
  397                 if (ps->ps_failed != 0) {
  398                         off_t delta;
  399 
  400                         delta = uio->uio_offset - ps->ps_endoffset;
  401                         KASSERT(delta > 0);
  402                         uio->uio_resid += delta;
  403                         /* uio->uio_offset = ps->ps_endoffset; */
  404                 }
  405         } else {
  406                 uio->uio_resid += ps->ps_resid;
  407         }
  408 
  409         if (bp != NULL && bp != obp) {
  410                 putiobuf(bp);
  411         }
  412         if (error == 0) {
  413                 error = ps->ps_error;
  414         }
  415         mutex_destroy(&ps->ps_lock);
  416         cv_destroy(&ps->ps_cv);
  417         kmem_free(ps, sizeof(*ps));
  418 
  419         /*
  420          * Clean up the state of the buffer.  Remember if somebody wants
  421          * it, so we can wake them up below.  Also, if we had to steal it,
  422          * give it back.
  423          */
  424         if (obp != NULL) {
  425                 KASSERT((obp->b_cflags & BC_BUSY) != 0);
  426 
  427                 /*
  428                  * If another process is waiting for the raw I/O buffer,
  429                  * wake up processes waiting to do physical I/O;
  430                  */
  431                 mutex_enter(&bufcache_lock);
  432                 obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
  433                 obp->b_flags &= ~(B_PHYS | B_RAW);
  434                 obp->b_iodone = NULL;
  435                 cv_broadcast(&obp->b_busy);
  436                 mutex_exit(&bufcache_lock);
  437         }
  438 
  439         DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
  440             __func__, uio->uio_offset, uio->uio_resid));
  441 
  442         return error;
  443 }
  444 
  445 /*
  446  * A minphys() routine is called by physio() to adjust the size of each
  447  * I/O transfer before the latter is passed to the strategy routine.
  448  *
  449  * This minphys() is a default that must be called to enforce limits
  450  * that are applicable to all devices, because of limitations in the
  451  * kernel or the hardware platform.
  452  */
  453 void
  454 minphys(struct buf *bp)
  455 {
  456 
  457         if (bp->b_bcount > MAXPHYS)
  458                 bp->b_bcount = MAXPHYS;
  459 }

Cache object: 1a2ddbfeee7916d2f28315c07345167e


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.