FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_bio.c


    1 /*      $OpenBSD: vfs_bio.c,v 1.210 2022/08/14 01:58:28 jsg Exp $       */
    2 /*      $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $  */
    3 
    4 /*
    5  * Copyright (c) 1994 Christopher G. Demetriou
    6  * Copyright (c) 1982, 1986, 1989, 1993
    7  *      The Regents of the University of California.  All rights reserved.
    8  * (c) UNIX System Laboratories, Inc.
    9  * All or some portions of this file are derived from material licensed
   10  * to the University of California by American Telephone and Telegraph
   11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   12  * the permission of UNIX System Laboratories, Inc.
   13  *
   14  * Redistribution and use in source and binary forms, with or without
   15  * modification, are permitted provided that the following conditions
   16  * are met:
   17  * 1. Redistributions of source code must retain the above copyright
   18  *    notice, this list of conditions and the following disclaimer.
   19  * 2. Redistributions in binary form must reproduce the above copyright
   20  *    notice, this list of conditions and the following disclaimer in the
   21  *    documentation and/or other materials provided with the distribution.
   22  * 3. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
   39  */
   40 
   41 /*
   42  * Some references:
   43  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
   44  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
   45  *              UNIX Operating System (Addison-Wesley, 1989)
   46  */
   47 
   48 #include <sys/param.h>
   49 #include <sys/systm.h>
   50 #include <sys/proc.h>
   51 #include <sys/buf.h>
   52 #include <sys/vnode.h>
   53 #include <sys/mount.h>
   54 #include <sys/malloc.h>
   55 #include <sys/pool.h>
   56 #include <sys/specdev.h>
   57 #include <sys/tracepoint.h>
   58 #include <uvm/uvm_extern.h>
   59 
   60 /* XXX Should really be in buf.h, but for uvm_constraint_range.. */
   61 int     buf_realloc_pages(struct buf *, struct uvm_constraint_range *, int);
   62 
   63 struct uvm_constraint_range high_constraint;
   64 int fliphigh;
   65 
   66 int nobuffers;
   67 int needbuffer;
   68 struct bio_ops bioops;
   69 
   70 /* private bufcache functions */
   71 void bufcache_init(void);
   72 void bufcache_adjust(void);
   73 struct buf *bufcache_gethighcleanbuf(void);
   74 struct buf *bufcache_getdmacleanbuf(void);
   75 
   76 /*
   77  * Buffer pool for I/O buffers.
   78  */
   79 struct pool bufpool;
   80 struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead);
   81 void buf_put(struct buf *);
   82 
   83 struct buf *bio_doread(struct vnode *, daddr_t, int, int);
   84 struct buf *buf_get(struct vnode *, daddr_t, size_t);
   85 void bread_cluster_callback(struct buf *);
   86 int64_t bufcache_recover_dmapages(int discard, int64_t howmany);
   87 
   88 struct bcachestats bcstats;  /* counters */
   89 long lodirtypages;      /* dirty page count low water mark */
   90 long hidirtypages;      /* dirty page count high water mark */
   91 long targetpages;       /* target number of pages for cache size */
   92 long buflowpages;       /* smallest size cache allowed */
   93 long bufhighpages;      /* largest size cache allowed */
   94 long bufbackpages;      /* minimum number of pages we shrink when asked to */
   95 
   96 vsize_t bufkvm;
   97 
   98 struct proc *cleanerproc;
   99 int bd_req;                     /* Sleep point for cleaner daemon. */
  100 
  101 #define NUM_CACHES 2
  102 #define DMA_CACHE 0
  103 struct bufcache cleancache[NUM_CACHES];
  104 struct bufqueue dirtyqueue;
  105 
  106 void
  107 buf_put(struct buf *bp)
  108 {
  109         splassert(IPL_BIO);
  110 
  111 #ifdef DIAGNOSTIC
  112         if (bp->b_pobj != NULL)
  113                 KASSERT(bp->b_bufsize > 0);
  114         if (ISSET(bp->b_flags, B_DELWRI))
  115                 panic("buf_put: releasing dirty buffer");
  116         if (bp->b_freelist.tqe_next != NOLIST &&
  117             bp->b_freelist.tqe_next != (void *)-1)
  118                 panic("buf_put: still on the free list");
  119         if (bp->b_vnbufs.le_next != NOLIST &&
  120             bp->b_vnbufs.le_next != (void *)-1)
  121                 panic("buf_put: still on the vnode list");
  122         if (!LIST_EMPTY(&bp->b_dep))
  123                 panic("buf_put: b_dep is not empty");
  124 #endif
  125 
  126         LIST_REMOVE(bp, b_list);
  127         bcstats.numbufs--;
  128 
  129         if (buf_dealloc_mem(bp) != 0)
  130                 return;
  131         pool_put(&bufpool, bp);
  132 }
  133 
  134 /*
  135  * Initialize buffers and hash links for buffers.
  136  */
  137 void
  138 bufinit(void)
  139 {
  140         u_int64_t dmapages;
  141         u_int64_t highpages;
  142 
  143         dmapages = uvm_pagecount(&dma_constraint);
  144         /* take away a guess at how much of this the kernel will consume */
  145         dmapages -= (atop(physmem) - atop(uvmexp.free));
  146 
  147         /* See if we have memory above the dma accessible region. */
  148         high_constraint.ucr_low = dma_constraint.ucr_high;
  149         high_constraint.ucr_high = no_constraint.ucr_high;
  150         if (high_constraint.ucr_low != high_constraint.ucr_high)
  151                 high_constraint.ucr_low++;
  152         highpages = uvm_pagecount(&high_constraint);
  153 
  154         /*
  155          * Do we have any significant amount of high memory above
  156          * the DMA region? If so, enable moving buffers there; if not,
  157          * don't bother.
  158          */
  159         if (highpages > dmapages / 4)
  160                 fliphigh = 1;
  161         else
  162                 fliphigh = 0;
  163 
  164         /*
  165          * If MD code doesn't say otherwise, use up to 10% of DMA'able
  166          * memory for buffers.
  167          */
  168         if (bufcachepercent == 0)
  169                 bufcachepercent = 10;
  170 
  171         /*
  172          * XXX these values and their same use in kern_sysctl
  173          * need to move into buf.h
  174          */
  175         KASSERT(bufcachepercent <= 90);
  176         KASSERT(bufcachepercent >= 5);
  177         if (bufpages == 0)
  178                 bufpages = dmapages * bufcachepercent / 100;
  179         if (bufpages < BCACHE_MIN)
  180                 bufpages = BCACHE_MIN;
  181         KASSERT(bufpages < dmapages);
  182 
  183         bufhighpages = bufpages;
  184 
  185         /*
  186          * Set the base backoff level for the buffer cache.  We will
  187          * not allow uvm to steal back more than this number of pages.
  188          */
  189         buflowpages = dmapages * 5 / 100;
  190         if (buflowpages < BCACHE_MIN)
  191                 buflowpages = BCACHE_MIN;
  192 
  193         /*
  194          * set bufbackpages to 100 pages, or 10 percent of the low water mark
  195          * if we don't have that many pages.
  196          */
  197 
  198         bufbackpages = buflowpages * 10 / 100;
  199         if (bufbackpages > 100)
  200                 bufbackpages = 100;
  201 
  202         /*
  203          * If the MD code does not say otherwise, reserve 10% of kva
  204          * space for mapping buffers.
  205          */
  206         if (bufkvm == 0)
  207                 bufkvm = VM_KERNEL_SPACE_SIZE / 10;
  208 
  209         /*
  210          * Don't use more than twice the amount of bufpages for mappings.
  211          * It's twice since we map things sparsely.
  212          */
  213         if (bufkvm > bufpages * PAGE_SIZE)
  214                 bufkvm = bufpages * PAGE_SIZE;
  215         /*
  216          * Round bufkvm to MAXPHYS because we allocate chunks of va space
  217          * in MAXPHYS chunks.
  218          */
  219         bufkvm &= ~(MAXPHYS - 1);
  220 
  221         pool_init(&bufpool, sizeof(struct buf), 0, IPL_BIO, 0, "bufpl", NULL);
  222 
  223         bufcache_init();
  224 
  225         /*
  226          * hmm - bufkvm is an argument because it's static, while
  227          * bufpages is global because it can change while running.
  228          */
  229         buf_mem_init(bufkvm);
  230 
  231         /*
  232          * Set the dirty page high water mark to be less than the low
  233          * water mark for pages in the buffer cache. This ensures we
  234          * can always back off by throwing away clean pages, and give
  235          * ourselves a chance to write out the dirty pages eventually.
  236          */
  237         hidirtypages = (buflowpages / 4) * 3;
  238         lodirtypages = buflowpages / 2;
  239 
  240         /*
  241          * We are allowed to use up to the reserve.
  242          */
  243         targetpages = bufpages - RESERVE_PAGES;
  244 }
  245 
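To make the sizing arithmetic in bufinit() above concrete, here is a stand-alone sketch (user-space C, not part of vfs_bio.c) that redoes the same calculations for a hypothetical machine. EXAMPLE_DMAPAGES and EXAMPLE_RESERVE_PAGES are illustrative stand-ins, and the BCACHE_MIN clamps are omitted.

#include <stdio.h>

#define EXAMPLE_DMAPAGES      1000000UL /* hypothetical DMA-reachable pages */
#define EXAMPLE_CACHEPERCENT  10        /* the default used above */
#define EXAMPLE_RESERVE_PAGES 512       /* stand-in for RESERVE_PAGES */

int
main(void)
{
	unsigned long bufpages, buflowpages, bufbackpages;
	unsigned long hidirtypages, lodirtypages, targetpages;

	/* Up to bufcachepercent of DMA'able memory for buffers. */
	bufpages = EXAMPLE_DMAPAGES * EXAMPLE_CACHEPERCENT / 100;

	/* Base backoff level: uvm may not steal the cache back below this. */
	buflowpages = EXAMPLE_DMAPAGES * 5 / 100;

	/* Shrink step: 100 pages, or 10% of buflowpages if that is smaller. */
	bufbackpages = buflowpages * 10 / 100;
	if (bufbackpages > 100)
		bufbackpages = 100;

	/* Dirty-page water marks sit below the cache low water mark. */
	hidirtypages = (buflowpages / 4) * 3;
	lodirtypages = buflowpages / 2;

	/* The cache may use everything except the reserve. */
	targetpages = bufpages - EXAMPLE_RESERVE_PAGES;

	printf("bufpages %lu buflowpages %lu bufbackpages %lu\n",
	    bufpages, buflowpages, bufbackpages);
	printf("hidirtypages %lu lodirtypages %lu targetpages %lu\n",
	    hidirtypages, lodirtypages, targetpages);
	return 0;
}

With these inputs the cache may grow to 100,000 pages, is never shrunk below 50,000, backs off 100 pages at a time, and the cleaner is woken once dirty pages reach 37,500.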
  246 /*
  247  * Change cachepct: adjust the buffer cache to use newbufpages pages.
  248  */
  249 void
  250 bufadjust(int newbufpages)
  251 {
  252         int s;
  253         int64_t npages;
  254 
  255         if (newbufpages < buflowpages)
  256                 newbufpages = buflowpages;
  257 
  258         s = splbio();
  259         bufpages = newbufpages;
  260 
  261         /*
  262          * We are allowed to use up to the reserve
  263          */
  264         targetpages = bufpages - RESERVE_PAGES;
  265 
  266         npages = bcstats.dmapages - targetpages;
  267 
  268         /*
  269          * Shrinking the cache happens here only if someone has manually
  270          * adjusted bufcachepercent - or the pagedaemon has told us
  271          * to give back memory *now* - so we give it all back.
  272          */
  273         if (bcstats.dmapages > targetpages)
  274                 (void) bufcache_recover_dmapages(0, bcstats.dmapages - targetpages);
  275         bufcache_adjust();
  276 
  277         /*
  278          * Wake up the cleaner if we have lots of dirty pages,
  279          * or if we are getting low on buffer cache kva.
  280          */
  281         if ((UNCLEAN_PAGES >= hidirtypages) ||
  282             bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS)
  283                 wakeup(&bd_req);
  284 
  285         splx(s);
  286 }
  287 
  288 /*
  289  * Make the buffer cache back off from cachepct.
  290  */
  291 int
  292 bufbackoff(struct uvm_constraint_range *range, long size)
  293 {
  294         /*
  295          * Back off "size" buffer cache pages. Called by the page
  296          * daemon to consume buffer cache pages rather than scanning.
  297          *
  298          * It returns 0 to the pagedaemon to indicate that it has
  299          * succeeded in freeing enough pages. It returns -1 to
  300          * indicate that it could not and the pagedaemon should take
  301          * other measures.
  302          *
  303          */
  304         long pdelta, oldbufpages;
  305 
  306         /*
  307          * If we will accept high memory for this backoff
  308          * try to steal it from the high memory buffer cache.
  309          */
  310         if (range != NULL && range->ucr_high > dma_constraint.ucr_high) {
  311                 struct buf *bp;
  312                 int64_t start = bcstats.numbufpages, recovered = 0;
  313                 int s = splbio();
  314 
  315                 while ((recovered < size) &&
  316                     (bp = bufcache_gethighcleanbuf())) {
  317                         bufcache_take(bp);
  318                         if (bp->b_vp) {
  319                                 RBT_REMOVE(buf_rb_bufs,
  320                                     &bp->b_vp->v_bufs_tree, bp);
  321                                 brelvp(bp);
  322                         }
  323                         buf_put(bp);
  324                         recovered = start - bcstats.numbufpages;
  325                 }
  326                 bufcache_adjust();
  327                 splx(s);
  328 
  329                 /* If we got enough, return success */
  330                 if (recovered >= size)
  331                         return 0;
  332 
  333                 /*
  334                  * If we needed only memory above DMA,
  335                  * return failure
  336                  */
  337                 if (range->ucr_low > dma_constraint.ucr_high)
  338                         return -1;
  339 
  340                 /* Otherwise get the rest from DMA */
  341                 size -= recovered;
  342         }
  343 
  344         /*
  345          * XXX Otherwise do the dma memory cache dance. This needs
  346          * refactoring later to get rid of 'bufpages'
  347          */
  348 
  349         /*
  350          * Back off by at least bufbackpages. If the page daemon gave us
  351          * a larger size, back off by that much.
  352          */
  353         pdelta = (size > bufbackpages) ? size : bufbackpages;
  354 
  355         if (bufpages <= buflowpages)
  356                 return(-1);
  357         if (bufpages - pdelta < buflowpages)
  358                 pdelta = bufpages - buflowpages;
  359         oldbufpages = bufpages;
  360         bufadjust(bufpages - pdelta);
  361         if (oldbufpages - bufpages < size)
  362                 return (-1); /* we did not free what we were asked */
  363         else
  364                 return(0);
  365 }
  366 
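The 0 / -1 contract documented above is easiest to see from the caller's side. A minimal sketch of a hypothetical pagedaemon-style caller, assuming only the bufbackoff() prototype used in this file (example_reclaim and its fallback step are illustrative, not kernel code):

/*
 * Hypothetical caller of bufbackoff().
 */
void
example_reclaim(struct uvm_constraint_range *range, long want)
{
	if (bufbackoff(range, want) == 0)
		return;		/* the buffer cache freed enough pages */

	/*
	 * The cache could not shrink by 'want' pages; the caller must
	 * fall back to its other reclamation measures (scanning, etc.).
	 */
}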
  367 
  368 /*
  369  * Opportunistically flip a buffer into high memory. Will move the buffer
  370  * if memory is available without sleeping, and return 0, otherwise will
  371  * fail and return -1 with the buffer unchanged.
  372  */
  373 
  374 int
  375 buf_flip_high(struct buf *bp)
  376 {
  377         int s;
  378         int ret = -1;
  379 
  380         KASSERT(ISSET(bp->b_flags, B_BC));
  381         KASSERT(ISSET(bp->b_flags, B_DMA));
  382         KASSERT(bp->cache == DMA_CACHE);
  383         KASSERT(fliphigh);
  384 
  385         /* Attempt to move the buffer to high memory if we can */
  386         s = splbio();
  387         if (buf_realloc_pages(bp, &high_constraint, UVM_PLA_NOWAIT) == 0) {
  388                 KASSERT(!ISSET(bp->b_flags, B_DMA));
  389                 bcstats.highflips++;
  390                 ret = 0;
  391         } else
  392                 bcstats.highflops++;
  393         splx(s);
  394 
  395         return ret;
  396 }
  397 
  398 /*
  399  * Flip a buffer to dma reachable memory, when we need it there for
  400  * I/O. This can sleep, since it will wait for memory allocation in the
  401  * DMA reachable area; we have to have the buffer there to proceed.
  402  */
  403 void
  404 buf_flip_dma(struct buf *bp)
  405 {
  406         KASSERT(ISSET(bp->b_flags, B_BC));
  407         KASSERT(ISSET(bp->b_flags, B_BUSY));
  408         KASSERT(bp->cache < NUM_CACHES);
  409 
  410         if (!ISSET(bp->b_flags, B_DMA)) {
  411                 int s = splbio();
  412 
  413                 /* move buf to dma reachable memory */
  414                 (void) buf_realloc_pages(bp, &dma_constraint, UVM_PLA_WAITOK);
  415                 KASSERT(ISSET(bp->b_flags, B_DMA));
  416                 bcstats.dmaflips++;
  417                 splx(s);
  418         }
  419 
  420         if (bp->cache > DMA_CACHE) {
  421                 CLR(bp->b_flags, B_COLD);
  422                 CLR(bp->b_flags, B_WARM);
  423                 bp->cache = DMA_CACHE;
  424         }
  425 }
  426 
  427 struct buf *
  428 bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
  429 {
  430         struct buf *bp;
  431         struct mount *mp;
  432 
  433         bp = getblk(vp, blkno, size, 0, INFSLP);
  434 
  435         /*
  436          * If buffer does not have valid data, start a read.
  437          * Note that if buffer is B_INVAL, getblk() won't return it.
  438          * Therefore, it's valid if its I/O has completed or been delayed.
  439          */
  440         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
  441                 SET(bp->b_flags, B_READ | async);
  442                 bcstats.pendingreads++;
  443                 bcstats.numreads++;
  444                 VOP_STRATEGY(bp->b_vp, bp);
  445                 /* Pay for the read. */
  446                 curproc->p_ru.ru_inblock++;                     /* XXX */
  447         } else if (async) {
  448                 brelse(bp);
  449         }
  450 
  451         mp = vp->v_type == VBLK ? vp->v_specmountpoint : vp->v_mount;
  452 
  453         /*
  454          * Collect statistics on synchronous and asynchronous reads.
  455          * Reads from block devices are charged to their associated
  456          * filesystem (if any).
  457          */
  458         if (mp != NULL) {
  459                 if (async == 0)
  460                         mp->mnt_stat.f_syncreads++;
  461                 else
  462                         mp->mnt_stat.f_asyncreads++;
  463         }
  464 
  465         return (bp);
  466 }
  467 
  468 /*
  469  * Read a disk block.
  470  * This algorithm described in Bach (p.54).
  471  */
  472 int
  473 bread(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
  474 {
  475         struct buf *bp;
  476 
  477         /* Get buffer for block. */
  478         bp = *bpp = bio_doread(vp, blkno, size, 0);
  479 
  480         /* Wait for the read to complete, and return result. */
  481         return (biowait(bp));
  482 }
  483 
  484 /*
  485  * Read-ahead multiple disk blocks. The first is sync, the rest async.
  486  * Trivial modification to the breada algorithm presented in Bach (p.55).
  487  */
  488 int
  489 breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t rablks[],
  490     int rasizes[], int nrablks, struct buf **bpp)
  491 {
  492         struct buf *bp;
  493         int i;
  494 
  495         bp = *bpp = bio_doread(vp, blkno, size, 0);
  496 
  497         /*
  498          * For each of the read-ahead blocks, start a read, if necessary.
  499          */
  500         for (i = 0; i < nrablks; i++) {
  501                 /* If it's in the cache, just go on to next one. */
  502                 if (incore(vp, rablks[i]))
  503                         continue;
  504 
  505                 /* Get a buffer for the read-ahead block */
  506                 (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
  507         }
  508 
  509         /* Otherwise, we had to start a read for it; wait until it's valid. */
  510         return (biowait(bp));
  511 }
  512 
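As a usage illustration (not code from this file), a filesystem read path typically pairs bread() with brelse() as below; example_read_block(), lbn and fs_bsize are hypothetical names standing in for whatever the calling filesystem uses.

/*
 * Sketch of a typical bread()/brelse() consumer.  The buffer is
 * returned in *bpp even when bread() fails, so it is released on
 * both paths.
 */
int
example_read_block(struct vnode *vp, daddr_t lbn, int fs_bsize, void *dst)
{
	struct buf *bp;
	int error;

	error = bread(vp, lbn, fs_bsize, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	memcpy(dst, bp->b_data, fs_bsize);
	brelse(bp);		/* hand the buffer back to the cache */
	return (0);
}

breadn() and bread_cluster() follow the same pattern; only the first, synchronous buffer is returned to the caller, while the asynchronous read-ahead buffers are released from biodone().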
  513 /*
  514  * Called from interrupt context.
  515  */
  516 void
  517 bread_cluster_callback(struct buf *bp)
  518 {
  519         struct buf **xbpp = bp->b_saveaddr;
  520         int i;
  521 
  522         if (xbpp[1] != NULL) {
  523                 size_t newsize = xbpp[1]->b_bufsize;
  524 
  525                 /*
  526                  * Shrink this buffer's mapping to only cover its part of
  527                  * the total I/O.
  528                  */
  529                 buf_fix_mapping(bp, newsize);
  530                 bp->b_bcount = newsize;
  531         }
  532 
  533         /* Invalidate read-ahead buffers if read short */
  534         if (bp->b_resid > 0) {
  535                 for (i = 1; xbpp[i] != NULL; i++)
  536                         continue;
  537                 for (i = i - 1; i != 0; i--) {
  538                         if (xbpp[i]->b_bufsize <= bp->b_resid) {
  539                                 bp->b_resid -= xbpp[i]->b_bufsize;
  540                                 SET(xbpp[i]->b_flags, B_INVAL);
  541                         } else if (bp->b_resid > 0) {
  542                                 bp->b_resid = 0;
  543                                 SET(xbpp[i]->b_flags, B_INVAL);
  544                         } else
  545                                 break;
  546                 }
  547         }
  548 
  549         for (i = 1; xbpp[i] != NULL; i++) {
  550                 if (ISSET(bp->b_flags, B_ERROR))
  551                         SET(xbpp[i]->b_flags, B_INVAL | B_ERROR);
  552                 /*
  553                  * Move the pages from the master buffer's uvm object
  554                  * into the individual buffer's uvm objects.
  555                  */
  556                 struct uvm_object *newobj = &xbpp[i]->b_uobj;
  557                 struct uvm_object *oldobj = &bp->b_uobj;
  558                 int page;
  559 
  560                 uvm_obj_init(newobj, &bufcache_pager, 1);
  561                 for (page = 0; page < atop(xbpp[i]->b_bufsize); page++) {
  562                         struct vm_page *pg = uvm_pagelookup(oldobj,
  563                             xbpp[i]->b_poffs + ptoa(page));
  564                         KASSERT(pg != NULL);
  565                         KASSERT(pg->wire_count == 1);
  566                         uvm_pagerealloc(pg, newobj, xbpp[i]->b_poffs + ptoa(page));
  567                 }
  568                 xbpp[i]->b_pobj = newobj;
  569 
  570                 biodone(xbpp[i]);
  571         }
  572 
  573         free(xbpp, M_TEMP, (i + 1) * sizeof(*xbpp));
  574 
  575         if (ISSET(bp->b_flags, B_ASYNC)) {
  576                 brelse(bp);
  577         } else {
  578                 CLR(bp->b_flags, B_WANTED);
  579                 wakeup(bp);
  580         }
  581 }
  582 
  583 /*
  584  * Read-ahead multiple disk blocks, but make sure only one (big) I/O
  585  * request is sent to the disk.
  586  * XXX This should probably be dropped and breadn should instead be optimized
  587  * XXX to do fewer I/O requests.
  588  */
  589 int
  590 bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp)
  591 {
  592         struct buf *bp, **xbpp;
  593         int howmany, maxra, i, inc;
  594         daddr_t sblkno;
  595 
  596         *rbpp = bio_doread(vp, blkno, size, 0);
  597 
  598         /*
  599          * If the buffer is in the cache skip any I/O operation.
  600          */
  601         if (ISSET((*rbpp)->b_flags, B_CACHE))
  602                 goto out;
  603 
  604         if (size != round_page(size))
  605                 goto out;
  606 
  607         if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra))
  608                 goto out;
  609 
  610         maxra++;
  611         if (sblkno == -1 || maxra < 2)
  612                 goto out;
  613 
  614         howmany = MAXPHYS / size;
  615         if (howmany > maxra)
  616                 howmany = maxra;
  617 
  618         xbpp = mallocarray(howmany + 1, sizeof(*xbpp), M_TEMP, M_NOWAIT);
  619         if (xbpp == NULL)
  620                 goto out;
  621 
  622         for (i = howmany - 1; i >= 0; i--) {
  623                 size_t sz;
  624 
  625                 /*
  626                  * First buffer allocates big enough size to cover what
  627                  * all the other buffers need.
  628                  */
  629                 sz = i == 0 ? howmany * size : 0;
  630 
  631                 xbpp[i] = buf_get(vp, blkno + i + 1, sz);
  632                 if (xbpp[i] == NULL) {
  633                         for (++i; i < howmany; i++) {
  634                                 SET(xbpp[i]->b_flags, B_INVAL);
  635                                 brelse(xbpp[i]);
  636                         }
  637                         free(xbpp, M_TEMP, (howmany + 1) * sizeof(*xbpp));
  638                         goto out;
  639                 }
  640         }
  641 
  642         bp = xbpp[0];
  643 
  644         xbpp[howmany] = NULL;
  645 
  646         inc = btodb(size);
  647 
  648         for (i = 1; i < howmany; i++) {
  649                 bcstats.pendingreads++;
  650                 bcstats.numreads++;
  651                 /*
  652                  * We set B_DMA here because bp above will be B_DMA,
  653                  * and we are playing buffer slice-n-dice games from
  654                  * the memory allocated in bp.
  655                  */
  656                 SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
  657                 xbpp[i]->b_blkno = sblkno + (i * inc);
  658                 xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
  659                 xbpp[i]->b_data = NULL;
  660                 xbpp[i]->b_pobj = bp->b_pobj;
  661                 xbpp[i]->b_poffs = bp->b_poffs + (i * size);
  662         }
  663 
  664         KASSERT(bp->b_lblkno == blkno + 1);
  665         KASSERT(bp->b_vp == vp);
  666 
  667         bp->b_blkno = sblkno;
  668         SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
  669 
  670         bp->b_saveaddr = (void *)xbpp;
  671         bp->b_iodone = bread_cluster_callback;
  672 
  673         bcstats.pendingreads++;
  674         bcstats.numreads++;
  675         VOP_STRATEGY(bp->b_vp, bp);
  676         curproc->p_ru.ru_inblock++;
  677 
  678 out:
  679         return (biowait(*rbpp));
  680 }
  681 
  682 /*
  683  * Block write.  Described in Bach (p.56)
  684  */
  685 int
  686 bwrite(struct buf *bp)
  687 {
  688         int rv, async, wasdelayed, s;
  689         struct vnode *vp;
  690         struct mount *mp;
  691 
  692         vp = bp->b_vp;
  693         if (vp != NULL)
  694                 mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount;
  695         else
  696                 mp = NULL;
  697 
  698         /*
  699          * Remember buffer type, to switch on it later.  If the write was
  700          * synchronous, but the file system was mounted with MNT_ASYNC,
  701          * convert it to a delayed write.
  702          * XXX note that this relies on delayed tape writes being converted
  703          * to async, not sync writes (which is safe, but ugly).
  704          */
  705         async = ISSET(bp->b_flags, B_ASYNC);
  706         if (!async && mp && ISSET(mp->mnt_flag, MNT_ASYNC)) {
  707                 /*
  708                  * Don't convert writes from VND on async filesystems
  709                  * that already have delayed writes in the upper layer.
  710                  */
  711                 if (!ISSET(bp->b_flags, B_NOCACHE)) {
  712                         bdwrite(bp);
  713                         return (0);
  714                 }
  715         }
  716 
  717         /*
  718          * Collect statistics on synchronous and asynchronous writes.
  719          * Writes to block devices are charged to their associated
  720          * filesystem (if any).
  721          */
  722         if (mp != NULL) {
  723                 if (async)
  724                         mp->mnt_stat.f_asyncwrites++;
  725                 else
  726                         mp->mnt_stat.f_syncwrites++;
  727         }
  728         bcstats.pendingwrites++;
  729         bcstats.numwrites++;
  730 
  731         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
  732         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
  733 
  734         s = splbio();
  735 
  736         /*
  737          * If not synchronous, pay for the I/O operation and make
  738          * sure the buf is on the correct vnode queue.  We have
  739          * to do this now, because if we don't, the vnode may not
  740          * be properly notified that its I/O has completed.
  741          */
  742         if (wasdelayed) {
  743                 reassignbuf(bp);
  744         } else
  745                 curproc->p_ru.ru_oublock++;
  746 
  747 
  748         /* Initiate disk write.  Make sure the appropriate party is charged. */
  749         bp->b_vp->v_numoutput++;
  750         splx(s);
  751         buf_flip_dma(bp);
  752         SET(bp->b_flags, B_WRITEINPROG);
  753         VOP_STRATEGY(bp->b_vp, bp);
  754 
  755         /*
  756          * If the queue is above the high water mark, wait till
  757          * the number of outstanding write bufs drops below the low
  758          * water mark.
  759          */
  760         if (bp->b_bq)
  761                 bufq_wait(bp->b_bq);
  762 
  763         if (async)
  764                 return (0);
  765 
  766         /*
  767          * If I/O was synchronous, wait for it to complete.
  768          */
  769         rv = biowait(bp);
  770 
  771         /* Release the buffer. */
  772         brelse(bp);
  773 
  774         return (rv);
  775 }
  776 
  777 
  778 /*
  779  * Delayed write.
  780  *
  781  * The buffer is marked dirty, but is not queued for I/O.
  782  * This routine should be used when the buffer is expected
  783  * to be modified again soon, typically a small write that
  784  * partially fills a buffer.
  785  *
  786  * NB: magnetic tapes cannot be delayed; they must be
  787  * written in the order that the writes are requested.
  788  *
  789  * Described in Leffler, et al. (pp. 208-213).
  790  */
  791 void
  792 bdwrite(struct buf *bp)
  793 {
  794         int s;
  795 
  796         /*
  797          * If the block hasn't been seen before:
  798          *      (1) Mark it as having been seen,
  799          *      (2) Charge for the write.
  800          *      (3) Make sure it's on its vnode's correct block list,
  801          *      (4) If a buffer is rewritten, move it to end of dirty list
  802          */
  803         if (!ISSET(bp->b_flags, B_DELWRI)) {
  804                 SET(bp->b_flags, B_DELWRI);
  805                 s = splbio();
  806                 buf_flip_dma(bp);
  807                 reassignbuf(bp);
  808                 splx(s);
  809                 curproc->p_ru.ru_oublock++;             /* XXX */
  810         }
  811 
  812         /* The "write" is done, so mark and release the buffer. */
  813         CLR(bp->b_flags, B_NEEDCOMMIT);
  814         CLR(bp->b_flags, B_NOCACHE); /* Must cache delayed writes */
  815         SET(bp->b_flags, B_DONE);
  816         brelse(bp);
  817 }
  818 
  819 /*
  820  * Asynchronous block write; just an asynchronous bwrite().
  821  */
  822 void
  823 bawrite(struct buf *bp)
  824 {
  825 
  826         SET(bp->b_flags, B_ASYNC);
  827         VOP_BWRITE(bp);
  828 }
  829 
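The three write entry points above differ only in when the I/O is issued and whether the caller waits. A hedged sketch of how a hypothetical caller might choose between them (the policy flags are illustrative, not an API from this file):

/*
 * Illustrative write-strategy selection for a modified, busy buffer.
 */
void
example_write_policy(struct buf *bp, int must_be_stable, int expect_rewrite)
{
	if (expect_rewrite) {
		/* Partial fill, likely to be written again soon. */
		bdwrite(bp);		/* marks B_DELWRI; no I/O is started */
	} else if (must_be_stable) {
		/* Caller needs the data on disk before returning. */
		(void)bwrite(bp);	/* starts the I/O and biowait()s for it */
	} else {
		/* Start the I/O but do not wait for completion. */
		bawrite(bp);		/* B_ASYNC + VOP_BWRITE(); released in biodone() */
	}
}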
  830 /*
  831  * Must be called at splbio()
  832  */
  833 void
  834 buf_dirty(struct buf *bp)
  835 {
  836         splassert(IPL_BIO);
  837 
  838 #ifdef DIAGNOSTIC
  839         if (!ISSET(bp->b_flags, B_BUSY))
  840                 panic("Trying to dirty buffer on freelist!");
  841 #endif
  842 
  843         if (ISSET(bp->b_flags, B_DELWRI) == 0) {
  844                 SET(bp->b_flags, B_DELWRI);
  845                 buf_flip_dma(bp);
  846                 reassignbuf(bp);
  847         }
  848 }
  849 
  850 /*
  851  * Must be called at splbio()
  852  */
  853 void
  854 buf_undirty(struct buf *bp)
  855 {
  856         splassert(IPL_BIO);
  857 
  858 #ifdef DIAGNOSTIC
  859         if (!ISSET(bp->b_flags, B_BUSY))
  860                 panic("Trying to undirty buffer on freelist!");
  861 #endif
  862         if (ISSET(bp->b_flags, B_DELWRI)) {
  863                 CLR(bp->b_flags, B_DELWRI);
  864                 reassignbuf(bp);
  865         }
  866 }
  867 
  868 /*
  869  * Release a buffer on to the free lists.
  870  * Described in Bach (p. 46).
  871  */
  872 void
  873 brelse(struct buf *bp)
  874 {
  875         int s;
  876 
  877         s = splbio();
  878 
  879         if (bp->b_data != NULL)
  880                 KASSERT(bp->b_bufsize > 0);
  881 
  882         /*
  883          * softdep is basically incompatible with not caching buffers
  884          * that have dependencies, so this buffer must be cached
  885          */
  886         if (LIST_FIRST(&bp->b_dep) != NULL)
  887                 CLR(bp->b_flags, B_NOCACHE);
  888 
  889         /*
  890          * Determine which queue the buffer should be on, then put it there.
  891          */
  892 
  893         /* If it's not cacheable, or an error, mark it invalid. */
  894         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
  895                 SET(bp->b_flags, B_INVAL);
  896         /* If it's a write error, also mark the vnode as damaged. */
  897         if (ISSET(bp->b_flags, B_ERROR) && !ISSET(bp->b_flags, B_READ)) {
  898                 if (bp->b_vp && bp->b_vp->v_type == VREG)
  899                         SET(bp->b_vp->v_bioflag, VBIOERROR);
  900         }
  901 
  902         if (ISSET(bp->b_flags, B_INVAL)) {
  903                 /*
  904                  * If the buffer is invalid, free it now rather than leaving
  905                  * it in a queue and wasting memory.
  906                  */
  907                 if (LIST_FIRST(&bp->b_dep) != NULL)
  908                         buf_deallocate(bp);
  909 
  910                 if (ISSET(bp->b_flags, B_DELWRI)) {
  911                         CLR(bp->b_flags, B_DELWRI);
  912                 }
  913 
  914                 if (bp->b_vp) {
  915                         RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp);
  916                         brelvp(bp);
  917                 }
  918                 bp->b_vp = NULL;
  919 
  920                 /*
  921                  * Wake up any processes waiting for _this_ buffer to
  922                  * become free. They are not allowed to grab it
  923                  * since it will be freed. But the only sleeper is
  924                  * getblk and it will restart the operation after
  925                  * sleep.
  926                  */
  927                 if (ISSET(bp->b_flags, B_WANTED)) {
  928                         CLR(bp->b_flags, B_WANTED);
  929                         wakeup(bp);
  930                 }
  931                 buf_put(bp);
  932         } else {
  933                 /*
  934                  * It has valid data.  Put it on the end of the appropriate
  935                  * queue, so that it'll stick around for as long as possible.
  936                  */
  937                 bufcache_release(bp);
  938 
  939                 /* Unlock the buffer. */
  940                 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
  941                 buf_release(bp);
  942 
  943                 /* Wake up any processes waiting for _this_ buffer to
  944                  * become free. */
  945                 if (ISSET(bp->b_flags, B_WANTED)) {
  946                         CLR(bp->b_flags, B_WANTED);
  947                         wakeup(bp);
  948                 }
  949 
  950                 if (bcstats.dmapages > targetpages)
  951                         (void) bufcache_recover_dmapages(0,
  952                             bcstats.dmapages - targetpages);
  953                 bufcache_adjust();
  954         }
  955 
  956         /* Wake up syncer and cleaner processes waiting for buffers. */
  957         if (nobuffers) {
  958                 nobuffers = 0;
  959                 wakeup(&nobuffers);
  960         }
  961 
  962         /* Wake up any processes waiting for any buffer to become free. */
  963         if (needbuffer && bcstats.dmapages < targetpages &&
  964             bcstats.kvaslots_avail > RESERVE_SLOTS) {
  965                 needbuffer = 0;
  966                 wakeup(&needbuffer);
  967         }
  968 
  969         splx(s);
  970 }
  971 
  972 /*
  973  * Determine if a block is in the cache. Just look on what would be its hash
  974  * chain. If it's there, return a pointer to it, unless it's marked invalid.
  975  */
  976 struct buf *
  977 incore(struct vnode *vp, daddr_t blkno)
  978 {
  979         struct buf *bp;
  980         struct buf b;
  981         int s;
  982 
  983         s = splbio();
  984 
  985         /* Search buf lookup tree */
  986         b.b_lblkno = blkno;
  987         bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
  988         if (bp != NULL && ISSET(bp->b_flags, B_INVAL))
  989                 bp = NULL;
  990 
  991         splx(s);
  992         return (bp);
  993 }
  994 
  995 /*
  996  * Get a block of requested size that is associated with
  997  * a given vnode and block offset. If it is found in the
  998  * block cache, mark it as having been found, make it busy
  999  * and return it. Otherwise, return an empty block of the
 1000  * correct size. It is up to the caller to ensure that the
 1001  * cached blocks be of the correct size.
 1002  */
 1003 struct buf *
 1004 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag,
 1005     uint64_t slptimeo)
 1006 {
 1007         struct buf *bp;
 1008         struct buf b;
 1009         int s, error;
 1010 
 1011         /*
 1012          * XXX
 1013          * The following is an inlined version of 'incore()', but with
 1014          * the 'invalid' test moved to after the 'busy' test.  It's
 1015          * necessary because there are some cases in which the NFS
 1016          * code sets B_INVAL prior to writing data to the server, but
 1017          * in which the buffers actually contain valid data.  In this
 1018          * case, we can't allow the system to allocate a new buffer for
 1019          * the block until the write is finished.
 1020          */
 1021 start:
 1022         s = splbio();
 1023         b.b_lblkno = blkno;
 1024         bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
 1025         if (bp != NULL) {
 1026                 if (ISSET(bp->b_flags, B_BUSY)) {
 1027                         SET(bp->b_flags, B_WANTED);
 1028                         error = tsleep_nsec(bp, slpflag | (PRIBIO + 1),
 1029                             "getblk", slptimeo);
 1030                         splx(s);
 1031                         if (error)
 1032                                 return (NULL);
 1033                         goto start;
 1034                 }
 1035 
 1036                 if (!ISSET(bp->b_flags, B_INVAL)) {
 1037                         bcstats.cachehits++;
 1038                         SET(bp->b_flags, B_CACHE);
 1039                         bufcache_take(bp);
 1040                         buf_acquire(bp);
 1041                         splx(s);
 1042                         return (bp);
 1043                 }
 1044         }
 1045         splx(s);
 1046 
 1047         if ((bp = buf_get(vp, blkno, size)) == NULL)
 1048                 goto start;
 1049 
 1050         return (bp);
 1051 }
 1052 
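For the common allocate-and-overwrite case, getblk() is used directly rather than bread(), since the old contents are not needed. A minimal sketch using the same hypothetical names as the earlier read example (with slpflag 0 and INFSLP, getblk() as written above does not return NULL):

/*
 * Sketch: obtain the buffer for a known block, overwrite it completely,
 * and write it out synchronously.  If the block was already cached,
 * B_CACHE is set on the returned buffer.
 */
int
example_overwrite_block(struct vnode *vp, daddr_t lbn, int fs_bsize,
    const void *src)
{
	struct buf *bp;

	bp = getblk(vp, lbn, fs_bsize, 0, INFSLP);
	memcpy(bp->b_data, src, fs_bsize);
	return (bwrite(bp));	/* bwrite() releases the buffer when done */
}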
 1053 /*
 1054  * Get an empty, disassociated buffer of given size.
 1055  */
 1056 struct buf *
 1057 geteblk(size_t size)
 1058 {
 1059         struct buf *bp;
 1060 
 1061         while ((bp = buf_get(NULL, 0, size)) == NULL)
 1062                 continue;
 1063 
 1064         return (bp);
 1065 }
 1066 
 1067 /*
 1068  * Allocate a buffer.
 1069  * If vp is given, put it into the buffer cache for that vnode.
 1070  * If size != 0, allocate memory and call buf_map().
 1071  * If there is already a buffer for the given vnode/blkno, return NULL.
 1072  */
 1073 struct buf *
 1074 buf_get(struct vnode *vp, daddr_t blkno, size_t size)
 1075 {
 1076         struct buf *bp;
 1077         int poolwait = size == 0 ? PR_NOWAIT : PR_WAITOK;
 1078         int npages;
 1079         int s;
 1080 
 1081         s = splbio();
 1082         if (size) {
 1083                 /*
 1084                  * Wake up the cleaner if we have lots of dirty pages,
 1085                  * or if we are getting low on buffer cache kva.
 1086                  */
 1087                 if (UNCLEAN_PAGES >= hidirtypages ||
 1088                         bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS)
 1089                         wakeup(&bd_req);
 1090 
 1091                 npages = atop(round_page(size));
 1092 
 1093                 /*
 1094                  * if our cache has been previously shrunk,
 1095                  * allow it to grow again with use up to
 1096                  * bufhighpages (cachepercent)
 1097                  */
 1098                 if (bufpages < bufhighpages)
 1099                         bufadjust(bufhighpages);
 1100 
 1101                 /*
 1102                  * If we would go over the page target with our
 1103                  * new allocation, free enough buffers first
 1104                  * to stay at the target with our new allocation.
 1105                  */
 1106                 if (bcstats.dmapages + npages > targetpages) {
 1107                         (void) bufcache_recover_dmapages(0, npages);
 1108                         bufcache_adjust();
 1109                 }
 1110 
 1111                 /*
 1112                  * If we get here, we tried to free the world down
 1113          * above, and couldn't get down; wake the cleaner
 1114                  * and wait for it to push some buffers out.
 1115                  */
 1116                 if ((bcstats.dmapages + npages > targetpages ||
 1117                     bcstats.kvaslots_avail <= RESERVE_SLOTS) &&
 1118                     curproc != syncerproc && curproc != cleanerproc) {
 1119                         wakeup(&bd_req);
 1120                         needbuffer++;
 1121                         tsleep_nsec(&needbuffer, PRIBIO, "needbuffer", INFSLP);
 1122                         splx(s);
 1123                         return (NULL);
 1124                 }
 1125                 if (bcstats.dmapages + npages > bufpages) {
 1126                         /* cleaner or syncer */
 1127                         nobuffers = 1;
 1128                         tsleep_nsec(&nobuffers, PRIBIO, "nobuffers", INFSLP);
 1129                         splx(s);
 1130                         return (NULL);
 1131                 }
 1132         }
 1133 
 1134         bp = pool_get(&bufpool, poolwait|PR_ZERO);
 1135 
 1136         if (bp == NULL) {
 1137                 splx(s);
 1138                 return (NULL);
 1139         }
 1140 
 1141         bp->b_freelist.tqe_next = NOLIST;
 1142         bp->b_dev = NODEV;
 1143         LIST_INIT(&bp->b_dep);
 1144         bp->b_bcount = size;
 1145 
 1146         buf_acquire_nomap(bp);
 1147 
 1148         if (vp != NULL) {
 1149                 /*
 1150                  * We insert the buffer into the hash with B_BUSY set
 1151                  * while we allocate pages for it. This way any getblk
 1152                  * that happens while we allocate pages will wait for
 1153                  * this buffer instead of starting its own buf_get.
 1154                  *
 1155                  * But first, we check if someone beat us to it.
 1156                  */
 1157                 if (incore(vp, blkno)) {
 1158                         pool_put(&bufpool, bp);
 1159                         splx(s);
 1160                         return (NULL);
 1161                 }
 1162 
 1163                 bp->b_blkno = bp->b_lblkno = blkno;
 1164                 bgetvp(vp, bp);
 1165                 if (RBT_INSERT(buf_rb_bufs, &vp->v_bufs_tree, bp))
 1166                         panic("buf_get: dup lblk vp %p bp %p", vp, bp);
 1167         } else {
 1168                 bp->b_vnbufs.le_next = NOLIST;
 1169                 SET(bp->b_flags, B_INVAL);
 1170                 bp->b_vp = NULL;
 1171         }
 1172 
 1173         LIST_INSERT_HEAD(&bufhead, bp, b_list);
 1174         bcstats.numbufs++;
 1175 
 1176         if (size) {
 1177                 buf_alloc_pages(bp, round_page(size));
 1178                 KASSERT(ISSET(bp->b_flags, B_DMA));
 1179                 buf_map(bp);
 1180         }
 1181 
 1182         SET(bp->b_flags, B_BC);
 1183         splx(s);
 1184 
 1185         return (bp);
 1186 }
 1187 
 1188 /*
 1189  * Buffer cleaning daemon.
 1190  */
 1191 void
 1192 buf_daemon(void *arg)
 1193 {
 1194         struct buf *bp = NULL;
 1195         int s, pushed = 0;
 1196 
 1197         s = splbio();
 1198         for (;;) {
 1199                 if (bp == NULL || (pushed >= 16 &&
 1200                     UNCLEAN_PAGES < hidirtypages &&
 1201                     bcstats.kvaslots_avail > 2 * RESERVE_SLOTS)){
 1202                         pushed = 0;
 1203                         /*
 1204                          * Wake up anyone who was waiting for buffers
 1205                          * to be released.
 1206                          */
 1207                         if (needbuffer) {
 1208                                 needbuffer = 0;
 1209                                 wakeup(&needbuffer);
 1210                         }
 1211                         tsleep_nsec(&bd_req, PRIBIO - 7, "cleaner", INFSLP);
 1212                 }
 1213 
 1214                 while ((bp = bufcache_getdirtybuf())) {
 1215                         TRACEPOINT(vfs, cleaner, bp->b_flags, pushed,
 1216                             lodirtypages, hidirtypages);
 1217 
 1218                         if (UNCLEAN_PAGES < lodirtypages &&
 1219                             bcstats.kvaslots_avail > 2 * RESERVE_SLOTS &&
 1220                             pushed >= 16)
 1221                                 break;
 1222 
 1223                         bufcache_take(bp);
 1224                         buf_acquire(bp);
 1225                         splx(s);
 1226 
 1227                         if (ISSET(bp->b_flags, B_INVAL)) {
 1228                                 brelse(bp);
 1229                                 s = splbio();
 1230                                 continue;
 1231                         }
 1232 #ifdef DIAGNOSTIC
 1233                         if (!ISSET(bp->b_flags, B_DELWRI))
 1234                                 panic("Clean buffer on dirty queue");
 1235 #endif
 1236                         if (LIST_FIRST(&bp->b_dep) != NULL &&
 1237                             !ISSET(bp->b_flags, B_DEFERRED) &&
 1238                             buf_countdeps(bp, 0, 0)) {
 1239                                 SET(bp->b_flags, B_DEFERRED);
 1240                                 s = splbio();
 1241                                 bufcache_release(bp);
 1242                                 buf_release(bp);
 1243                                 continue;
 1244                         }
 1245 
 1246                         bawrite(bp);
 1247                         pushed++;
 1248 
 1249                         sched_pause(yield);
 1250 
 1251                         s = splbio();
 1252                 }
 1253         }
 1254 }
 1255 
 1256 /*
 1257  * Wait for operations on the buffer to complete.
 1258  * When they do, extract and return the I/O's error value.
 1259  */
 1260 int
 1261 biowait(struct buf *bp)
 1262 {
 1263         int s;
 1264 
 1265         KASSERT(!(bp->b_flags & B_ASYNC));
 1266 
 1267         s = splbio();
 1268         while (!ISSET(bp->b_flags, B_DONE))
 1269                 tsleep_nsec(bp, PRIBIO + 1, "biowait", INFSLP);
 1270         splx(s);
 1271 
 1272         /* check for interruption of I/O (e.g. via NFS), then errors. */
 1273         if (ISSET(bp->b_flags, B_EINTR)) {
 1274                 CLR(bp->b_flags, B_EINTR);
 1275                 return (EINTR);
 1276         }
 1277 
 1278         if (ISSET(bp->b_flags, B_ERROR))
 1279                 return (bp->b_error ? bp->b_error : EIO);
 1280         else
 1281                 return (0);
 1282 }
 1283 
 1284 /*
 1285  * Mark I/O complete on a buffer.
 1286  *
 1287  * If a callback has been requested, e.g. the pageout
 1288  * daemon, do so. Otherwise, awaken waiting processes.
 1289  *
 1290  * [ Leffler, et al., says on p.247:
 1291  *      "This routine wakes up the blocked process, frees the buffer
 1292  *      for an asynchronous write, or, for a request by the pagedaemon
 1293  *      process, invokes a procedure specified in the buffer structure" ]
 1294  *
 1295  * In real life, the pagedaemon (or other system processes) wants
 1296  * to do async stuff to, and doesn't want the buffer brelse()'d.
 1297  * to do async stuff too, and doesn't want the buffer brelse()'d.
 1298  * for the vn device, that puts malloc'd buffers on the free lists!)
 1299  *
 1300  * Must be called at splbio().
 1301  */
 1302 void
 1303 biodone(struct buf *bp)
 1304 {
 1305         splassert(IPL_BIO);
 1306 
 1307         if (ISSET(bp->b_flags, B_DONE))
 1308                 panic("biodone already");
 1309         SET(bp->b_flags, B_DONE);               /* note that it's done */
 1310 
 1311         if (bp->b_bq)
 1312                 bufq_done(bp->b_bq, bp);
 1313 
 1314         if (LIST_FIRST(&bp->b_dep) != NULL)
 1315                 buf_complete(bp);
 1316 
 1317         if (!ISSET(bp->b_flags, B_READ)) {
 1318                 CLR(bp->b_flags, B_WRITEINPROG);
 1319                 vwakeup(bp->b_vp);
 1320         }
 1321         if (bcstats.numbufs &&
 1322             (!(ISSET(bp->b_flags, B_RAW) || ISSET(bp->b_flags, B_PHYS)))) {
 1323                 if (!ISSET(bp->b_flags, B_READ)) {
 1324                         bcstats.pendingwrites--;
 1325                 } else
 1326                         bcstats.pendingreads--;
 1327         }
 1328         if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
 1329                 CLR(bp->b_flags, B_CALL);       /* but note callout done */
 1330                 (*bp->b_iodone)(bp);
 1331         } else {
 1332                 if (ISSET(bp->b_flags, B_ASYNC)) {/* if async, release it */
 1333                         brelse(bp);
 1334                 } else {                        /* or just wakeup the buffer */
 1335                         CLR(bp->b_flags, B_WANTED);
 1336                         wakeup(bp);
 1337                 }
 1338         }
 1339 }
 1340 
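The B_CALL/b_iodone branch in biodone() is the mechanism bread_cluster() uses above. A hedged sketch of that pattern follows (example_done()/example_start() are hypothetical, and the bcstats accounting done by real callers is omitted):

/*
 * Completion callback, invoked from biodone() at IPL_BIO; B_CALL has
 * already been cleared by the time it runs.
 */
void
example_done(struct buf *bp)
{
	if (ISSET(bp->b_flags, B_ERROR))
		printf("example: read failed\n");
	brelse(bp);		/* async buffer goes back to the cache here */
}

/*
 * Issue an asynchronous read whose completion is handled by the
 * callback instead of a biowait()ing caller.
 */
void
example_start(struct buf *bp)
{
	bp->b_iodone = example_done;
	SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
	VOP_STRATEGY(bp->b_vp, bp);
}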
 1341 #ifdef DDB
 1342 void    bcstats_print(int (*)(const char *, ...)
 1343     __attribute__((__format__(__kprintf__,1,2))));
 1344 /*
 1345  * bcstats_print: ddb hook to print interesting buffer cache counters
 1346  */
 1347 void
 1348 bcstats_print(
 1349     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
 1350 {
 1351         (*pr)("Current Buffer Cache status:\n");
 1352         (*pr)("numbufs %lld busymapped %lld, delwri %lld\n",
 1353             bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs);
 1354         (*pr)("kvaslots %lld avail kva slots %lld\n",
 1355             bcstats.kvaslots, bcstats.kvaslots_avail);
 1356         (*pr)("bufpages %lld, dmapages %lld, dirtypages %lld\n",
 1357             bcstats.numbufpages, bcstats.dmapages, bcstats.numdirtypages);
 1358         (*pr)("pendingreads %lld, pendingwrites %lld\n",
 1359             bcstats.pendingreads, bcstats.pendingwrites);
 1360         (*pr)("highflips %lld, highflops %lld, dmaflips %lld\n",
 1361             bcstats.highflips, bcstats.highflops, bcstats.dmaflips);
 1362 }
 1363 #endif
 1364 
 1365 void
 1366 buf_adjcnt(struct buf *bp, long ncount)
 1367 {
 1368         KASSERT(ncount <= bp->b_bufsize);
 1369         bp->b_bcount = ncount;
 1370 }
 1371 
 1372 /* bufcache freelist code below */
 1373 /*
 1374  * Copyright (c) 2014 Ted Unangst <tedu@openbsd.org>
 1375  *
 1376  * Permission to use, copy, modify, and distribute this software for any
 1377  * purpose with or without fee is hereby granted, provided that the above
 1378  * copyright notice and this permission notice appear in all copies.
 1379  *
 1380  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 1381  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 1382  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 1383  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 1384  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 1385  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 1386  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 1387  */
 1388 
 1389 /*
 1390  * The code below implements a variant of the 2Q buffer cache algorithm by
 1391  * Johnson and Shasha.
 1392  *
 1393  * General Outline
 1394  * We divide the buffer cache into three working sets: current, previous,
 1395  * and long term. Each list is itself LRU and buffers get promoted and moved
 1396  * around between them. A buffer starts its life in the current working set.
 1397  * As time passes and newer buffers push it out, it will turn into the previous
 1398  * working set and is subject to recycling. But if it's accessed again from
 1399  * the previous working set, that's an indication that it's actually in the
 1400  * long term working set, so we promote it there. The separation of current
 1401  * and previous working sets prevents us from promoting a buffer that's only
 1402  * temporarily hot to the long term cache.
 1403  *
 1404  * The objective is to provide scan resistance by making the long term
 1405  * working set ineligible for immediate recycling, even as the current
 1406  * working set is rapidly turned over.
 1407  *
 1408  * Implementation
 1409  * The code below identifies the current, previous, and long term sets as
 1410  * hotqueue, coldqueue, and warmqueue. The hot and warm queues are capped at
 1411  * 1/3 of the total clean pages, after which point they start pushing their
 1412  * oldest buffers into coldqueue.
 1413  * A buf always starts out with neither WARM nor COLD flags set (implying HOT).
 1414  * When released, it will be returned to the tail of the hotqueue list.
 1415  * When the hotqueue gets too large, the oldest hot buf will be moved to the
 1416  * coldqueue, with the B_COLD flag set. When a cold buf is released, we set
 1417  * the B_WARM flag and put it onto the warmqueue. Warm bufs are also
 1418  * directly returned to the end of the warmqueue. As with the hotqueue, when
 1419  * the warmqueue grows too large, B_WARM bufs are moved onto the coldqueue.
 1420  *
 1421  * Note that this design does still support large working sets, greater
 1422  * than the cap of hotqueue or warmqueue would imply. The coldqueue is still
 1423  * cached and has no maximum length. The hot and warm queues form a Y feeding
 1424  * into the coldqueue. Moving bufs between queues is constant time, so this
 1425  * design decays to one long warm->cold queue.
 1426  *
 1427  * In the 2Q paper, hotqueue and coldqueue are A1in and A1out. The warmqueue
 1428  * is Am. We always cache pages, as opposed to pointers to pages for A1.
 1429  *
 1430  * This implementation adds support for multiple 2q caches.
 1431  *
 1432  * If we have more than one 2q cache, as bufs fall off the cold queue
 1433  * for recycling, bufs that have been warm before (which retain the
 1434  * B_WARM flag in addition to B_COLD) can be put into the hot queue of
 1435  * a second level 2Q cache. Buffers which are only B_COLD are
 1436  * recycled. Bufs falling off the last cache's cold queue are always
 1437  * recycled.
 1438  *
 1439  */
 1440 
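A stripped-down, user-space rendering of the release-time rule described above may help. The ex_* names are illustrative, the 1/3 cap is applied to buffer counts rather than pages, and locking, flag subtleties and the multi-cache hand-off are all omitted.

#include <sys/queue.h>	/* TAILQ macros, as used by the real queues */

/* Illustrative stand-ins for struct buf and struct bufcache. */
struct ex_buf {
	TAILQ_ENTRY(ex_buf) ex_link;
	int ex_cold;			/* mirrors B_COLD */
	int ex_warm;			/* mirrors B_WARM */
};
TAILQ_HEAD(ex_queue, ex_buf);

struct ex_cache {
	struct ex_queue hot, warm, cold;
	int hotcount, warmcount;
	int cap;			/* ~1/3 of the clean buffers */
};

/* Demote the oldest buf of an over-full hot or warm queue (cf. chillbufs()). */
static void
ex_chill(struct ex_cache *c, struct ex_queue *q, int *count)
{
	struct ex_buf *old = TAILQ_FIRST(q);

	TAILQ_REMOVE(q, old, ex_link);
	(*count)--;
	old->ex_cold = 1;		/* a warm buf keeps its warm marker */
	TAILQ_INSERT_TAIL(&c->cold, old, ex_link);
}

/* Release-time placement, following the description above. */
void
ex_release(struct ex_cache *c, struct ex_buf *bp)
{
	if (!bp->ex_cold && !bp->ex_warm) {
		/* Neither flag set: still hot, back to the hot queue tail. */
		TAILQ_INSERT_TAIL(&c->hot, bp, ex_link);
		if (++c->hotcount > c->cap)
			ex_chill(c, &c->hot, &c->hotcount);
	} else {
		/* A cold (or warm) buf touched again joins the warm queue. */
		bp->ex_cold = 0;
		bp->ex_warm = 1;
		TAILQ_INSERT_TAIL(&c->warm, bp, ex_link);
		if (++c->warmcount > c->cap)
			ex_chill(c, &c->warm, &c->warmcount);
	}
}

The real queues additionally account pages (hotbufpages/warmbufpages), which bufcache_adjust() rebalances after the cache shrinks.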
 1441 /*
 1442  * This function is called when a hot or warm queue may have exceeded its
 1443  * size limit. It will move a buf to the coldqueue.
 1444  */
 1445 int chillbufs(struct
 1446     bufcache *cache, struct bufqueue *queue, int64_t *queuepages);
 1447 
 1448 void
 1449 bufcache_init(void)
 1450 {
 1451         int i;
 1452 
 1453         for (i = 0; i < NUM_CACHES; i++) {
 1454                 TAILQ_INIT(&cleancache[i].hotqueue);
 1455                 TAILQ_INIT(&cleancache[i].coldqueue);
 1456                 TAILQ_INIT(&cleancache[i].warmqueue);
 1457         }
 1458         TAILQ_INIT(&dirtyqueue);
 1459 }
 1460 
 1461 /*
 1462  * If the buffer caches have shrunk, we may need to rebalance our queues.
 1463  */
 1464 void
 1465 bufcache_adjust(void)
 1466 {
 1467         int i;
 1468 
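              /*
               * For each cache, keep chilling buffers until both the warm and
               * hot queues are back under their size limits.
               */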
 1469         for (i = 0; i < NUM_CACHES; i++) {
 1470                 while (chillbufs(&cleancache[i], &cleancache[i].warmqueue,
 1471                     &cleancache[i].warmbufpages) ||
 1472                     chillbufs(&cleancache[i], &cleancache[i].hotqueue,
 1473                     &cleancache[i].hotbufpages))
 1474                         continue;
 1475         }
 1476 }
 1477 
 1478 /*
 1479  * Get a clean buffer from the cache. If "discard" is set, do not promote
 1480  * previously warm buffers as we normally would, because we are tossing
 1481  * everything away, such as when hibernating.
 1482  */
 1483 struct buf *
 1484 bufcache_getcleanbuf(int cachenum, int discard)
 1485 {
 1486         struct buf *bp = NULL;
 1487         struct bufcache *cache = &cleancache[cachenum];
 1488         struct bufqueue *queue;
 1489 
 1490         splassert(IPL_BIO);
 1491 
 1492         /* try the cold queue first, then the warm and hot queues */
 1493         while ((bp = TAILQ_FIRST(&cache->coldqueue)) ||
 1494             (bp = TAILQ_FIRST(&cache->warmqueue)) ||
 1495             (bp = TAILQ_FIRST(&cache->hotqueue))) {
 1496                 int64_t pages = atop(bp->b_bufsize);
 1497                 struct bufcache *newcache;
 1498 
 1499                 if (discard || cachenum >= NUM_CACHES - 1) {
 1500                         /* Victim selected, give it up */
 1501                         return bp;
 1502                 }
 1503                 KASSERT(bp->cache == cachenum);
 1504 
 1505                 /*
 1506                  * If this buffer was warm before, move it to
 1507                  * the hot queue in the next cache
 1508                  */
 1509 
 1510                 if (fliphigh) {
 1511                         /*
 1512                          * If we are in the DMA cache, try to flip the
 1513                          * buffer up high to move it on to the other
 1514                          * caches. If we can't move the buffer to high
 1515                          * memory without sleeping, we give it up and
 1516                          * return it rather than fight for more memory
 1517                          * against non buffer cache competitors.
 1518                          */
 1519                         SET(bp->b_flags, B_BUSY);
 1520                         if (bp->cache == 0 && buf_flip_high(bp) == -1) {
 1521                                 CLR(bp->b_flags, B_BUSY);
 1522                                 return bp;
 1523                         }
 1524                         CLR(bp->b_flags, B_BUSY);
 1525                 }
 1526 
 1527                 /* Move the buffer to the hot queue in the next cache */
 1528                 if (ISSET(bp->b_flags, B_COLD)) {
 1529                         queue = &cache->coldqueue;
 1530                 } else if (ISSET(bp->b_flags, B_WARM)) {
 1531                         queue = &cache->warmqueue;
 1532                         cache->warmbufpages -= pages;
 1533                 } else {
 1534                         queue = &cache->hotqueue;
 1535                         cache->hotbufpages -= pages;
 1536                 }
 1537                 TAILQ_REMOVE(queue, bp, b_freelist);
 1538                 cache->cachepages -= pages;
 1539                 CLR(bp->b_flags, B_WARM);
 1540                 CLR(bp->b_flags, B_COLD);
 1541                 bp->cache++;
 1542                 newcache = &cleancache[bp->cache];
 1543                 newcache->cachepages += pages;
 1544                 newcache->hotbufpages += pages;
 1545                 chillbufs(newcache, &newcache->hotqueue,
 1546                     &newcache->hotbufpages);
 1547                 TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist);
 1548         }
 1549         return bp;
 1550 }
 1551 
 1552 
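      /*
       * Remove a buffer from the cache queues, disassociate it from its
       * vnode, and free it.
       */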
 1553 void
 1554 discard_buffer(struct buf *bp)
 1555 {
 1556         splassert(IPL_BIO);
 1557 
 1558         bufcache_take(bp);
 1559         if (bp->b_vp) {
 1560                 RBT_REMOVE(buf_rb_bufs,
 1561                     &bp->b_vp->v_bufs_tree, bp);
 1562                 brelvp(bp);
 1563         }
 1564         buf_put(bp);
 1565 }
 1566 
 1567 int64_t
 1568 bufcache_recover_dmapages(int discard, int64_t howmany)
 1569 {
 1570         struct buf *bp = NULL;
 1571         struct bufcache *cache = &cleancache[DMA_CACHE];
 1572         struct bufqueue *queue;
 1573         int64_t recovered = 0;
 1574 
 1575         splassert(IPL_BIO);
 1576 
 1577         while ((recovered < howmany) &&
 1578             ((bp = TAILQ_FIRST(&cache->coldqueue)) ||
 1579             (bp = TAILQ_FIRST(&cache->warmqueue)) ||
 1580             (bp = TAILQ_FIRST(&cache->hotqueue)))) {
 1581                 int64_t pages = atop(bp->b_bufsize);
 1582                 struct bufcache *newcache;
 1583 
 1584                 if (discard || DMA_CACHE >= NUM_CACHES - 1) {
 1585                         discard_buffer(bp);
 1586                         continue;
 1587                 }
 1588                 KASSERT(bp->cache == DMA_CACHE);
 1589 
 1590                 /*
 1591                  * If this buffer was warm before, move it to
 1592                  * the hot queue in the next cache
 1593                  */
 1594 
 1595                 /*
 1596                  * One way or another, the pages for this
 1597                  * buffer are leaving DMA memory
 1598                  */
 1599                 recovered += pages;
 1600 
 1601                 if (!fliphigh) {
 1602                         discard_buffer(bp);
 1603                         continue;
 1604                 }
 1605 
 1606                 /*
 1607                  * If we are in the DMA cache, try to flip the
 1608                  * buffer up high to move it on to the other
 1609                  * caches. If we can't move the buffer to high
 1610                  * memory without sleeping, we give it up
 1611                  * now rather than fight for more memory
 1612                  * against non buffer cache competitors.
 1613                  */
 1614                 SET(bp->b_flags, B_BUSY);
 1615                 if (bp->cache == 0 && buf_flip_high(bp) == -1) {
 1616                         CLR(bp->b_flags, B_BUSY);
 1617                         discard_buffer(bp);
 1618                         continue;
 1619                 }
 1620                 CLR(bp->b_flags, B_BUSY);
 1621 
 1622                 /*
 1623                  * Move the buffer to the hot queue in the next cache
 1624                  */
 1625                 if (ISSET(bp->b_flags, B_COLD)) {
 1626                         queue = &cache->coldqueue;
 1627                 } else if (ISSET(bp->b_flags, B_WARM)) {
 1628                         queue = &cache->warmqueue;
 1629                         cache->warmbufpages -= pages;
 1630                 } else {
 1631                         queue = &cache->hotqueue;
 1632                         cache->hotbufpages -= pages;
 1633                 }
 1634                 TAILQ_REMOVE(queue, bp, b_freelist);
 1635                 cache->cachepages -= pages;
 1636                 CLR(bp->b_flags, B_WARM);
 1637                 CLR(bp->b_flags, B_COLD);
 1638                 bp->cache++;
 1639                 newcache = &cleancache[bp->cache];
 1640                 newcache->cachepages += pages;
 1641                 newcache->hotbufpages += pages;
 1642                 chillbufs(newcache, &newcache->hotqueue,
 1643                     &newcache->hotbufpages);
 1644                 TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist);
 1645         }
 1646         return recovered;
 1647 }
 1648 
 1649 struct buf *
 1650 bufcache_getcleanbuf_range(int start, int end, int discard)
 1651 {
 1652         int i, j = start, q = end;
 1653         struct buf *bp = NULL;
 1654 
 1655         /*
 1656          * XXX In theory we could promote warm buffers into a previous queue,
 1657          * so in the pathological case where we go through all the caches
 1658          * without getting a buffer, we have to start at the beginning again.
 1659          */
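              /* E.g. with start == 0 and end == 1, the scan order is cache 1, 0, then 1. */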
 1660         while (j <= q)  {
 1661                 for (i = q; i >= j; i--)
 1662                         if ((bp = bufcache_getcleanbuf(i, discard)))
 1663                                 return (bp);
 1664                 j++;
 1665         }
 1666         return bp;
 1667 }
 1668 
 1669 struct buf *
 1670 bufcache_gethighcleanbuf(void)
 1671 {
 1672         if (!fliphigh)
 1673                 return NULL;
 1674         return bufcache_getcleanbuf_range(DMA_CACHE + 1, NUM_CACHES - 1, 0);
 1675 }
 1676 
 1677 
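      /*
       * With fliphigh, DMA-reachable buffers are confined to the DMA cache;
       * otherwise any cache may hold them, so search them all.
       */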
 1678 struct buf *
 1679 bufcache_getdmacleanbuf(void)
 1680 {
 1681         if (fliphigh)
 1682                 return bufcache_getcleanbuf_range(DMA_CACHE, DMA_CACHE, 0);
 1683         return bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 0);
 1684 }
 1685 
 1686 
 1687 struct buf *
 1688 bufcache_getdirtybuf(void)
 1689 {
 1690         return TAILQ_FIRST(&dirtyqueue);
 1691 }
 1692 
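      /*
       * Remove a buffer from whichever queue it is currently on and adjust
       * the corresponding page and buffer counters.
       */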
 1693 void
 1694 bufcache_take(struct buf *bp)
 1695 {
 1696         struct bufqueue *queue;
 1697         int64_t pages;
 1698 
 1699         splassert(IPL_BIO);
 1700         KASSERT(ISSET(bp->b_flags, B_BC));
 1701         KASSERT(bp->cache >= DMA_CACHE);
 1702         KASSERT((bp->cache < NUM_CACHES));
 1703 
 1704         pages = atop(bp->b_bufsize);
 1705 
 1706         TRACEPOINT(vfs, bufcache_take, bp->b_flags, bp->cache, pages);
 1707 
 1708         struct bufcache *cache = &cleancache[bp->cache];
 1709         if (!ISSET(bp->b_flags, B_DELWRI)) {
 1710                 if (ISSET(bp->b_flags, B_COLD)) {
 1711                         queue = &cache->coldqueue;
 1712                 } else if (ISSET(bp->b_flags, B_WARM)) {
 1713                         queue = &cache->warmqueue;
 1714                         cache->warmbufpages -= pages;
 1715                 } else {
 1716                         queue = &cache->hotqueue;
 1717                         cache->hotbufpages -= pages;
 1718                 }
 1719                 bcstats.numcleanpages -= pages;
 1720                 cache->cachepages -= pages;
 1721         } else {
 1722                 queue = &dirtyqueue;
 1723                 bcstats.numdirtypages -= pages;
 1724                 bcstats.delwribufs--;
 1725         }
 1726         TAILQ_REMOVE(queue, bp, b_freelist);
 1727 }
 1728 
 1729 /* Move buffers from a hot or warm queue to a cold queue in a cache */
 1730 int
 1731 chillbufs(struct bufcache *cache, struct bufqueue *queue, int64_t *queuepages)
 1732 {
 1733         struct buf *bp;
 1734         int64_t limit, pages;
 1735 
 1736         /*
 1737          * We limit the hot queue to be small (1/20 of the cache), with a
 1738          * max of 4096 pages. We limit the warm queue to half the cache size.
 1739          *
 1740          * We impose a minimum size of 96 pages to prevent too much "wobbling".
 1741          */
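              /*
               * For example (illustrative numbers): a cache holding 60000
               * clean pages gives a hot limit of min(60000 / 20, 4096) = 3000
               * pages and a warm limit of 30000 pages.
               */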
 1742         if (queue == &cache->hotqueue)
 1743                 limit = min(cache->cachepages / 20, 4096);
 1744         else if (queue == &cache->warmqueue)
 1745                 limit = (cache->cachepages / 2);
 1746         else
 1747                 panic("chillbufs: invalid queue");
 1748 
 1749         if (*queuepages > 96 && *queuepages > limit) {
 1750                 bp = TAILQ_FIRST(queue);
 1751                 if (!bp)
 1752                         panic("inconsistent bufpage counts");
 1753                 pages = atop(bp->b_bufsize);
 1754                 *queuepages -= pages;
 1755                 TAILQ_REMOVE(queue, bp, b_freelist);
 1756                 /* we do not clear B_WARM */
 1757                 SET(bp->b_flags, B_COLD);
 1758                 TAILQ_INSERT_TAIL(&cache->coldqueue, bp, b_freelist);
 1759                 return 1;
 1760         }
 1761         return 0;
 1762 }
 1763 
 1764 void
 1765 bufcache_release(struct buf *bp)
 1766 {
 1767         struct bufqueue *queue;
 1768         int64_t pages;
 1769         struct bufcache *cache = &cleancache[bp->cache];
 1770 
 1771         KASSERT(ISSET(bp->b_flags, B_BC));
 1772         pages = atop(bp->b_bufsize);
 1773 
 1774         TRACEPOINT(vfs, bufcache_rel, bp->b_flags, bp->cache, pages);
 1775 
 1776         if (fliphigh) {
 1777                 if (ISSET(bp->b_flags, B_DMA) && bp->cache > 0)
 1778                         panic("B_DMA buffer release from cache %d",
 1779                             bp->cache);
 1780                 else if ((!ISSET(bp->b_flags, B_DMA)) && bp->cache == 0)
 1781                         panic("Non B_DMA buffer release from cache %d",
 1782                             bp->cache);
 1783         }
 1784 
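              /*
               * A clean buf that has already been through the cold queue
               * (B_WARM or B_COLD set) is promoted to the warm queue, while a
               * still-hot buf returns to the hot queue; delayed-write bufs go
               * to the dirty queue.
               */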
 1785         if (!ISSET(bp->b_flags, B_DELWRI)) {
 1786                 int64_t *queuepages;
 1787                 if (ISSET(bp->b_flags, B_WARM | B_COLD)) {
 1788                         SET(bp->b_flags, B_WARM);
 1789                         CLR(bp->b_flags, B_COLD);
 1790                         queue = &cache->warmqueue;
 1791                         queuepages = &cache->warmbufpages;
 1792                 } else {
 1793                         queue = &cache->hotqueue;
 1794                         queuepages = &cache->hotbufpages;
 1795                 }
 1796                 *queuepages += pages;
 1797                 bcstats.numcleanpages += pages;
 1798                 cache->cachepages += pages;
 1799                 chillbufs(cache, queue, queuepages);
 1800         } else {
 1801                 queue = &dirtyqueue;
 1802                 bcstats.numdirtypages += pages;
 1803                 bcstats.delwribufs++;
 1804         }
 1805         TAILQ_INSERT_TAIL(queue, bp, b_freelist);
 1806 }
 1807 
 1808 #ifdef HIBERNATE
 1809 /*
 1810  * Nuke the buffer cache from orbit when hibernating. We do not want to save
 1811  * any clean cache pages to swap and read them back; the original disk files
 1812  * are just as good.
 1813  */
 1814 void
 1815 hibernate_suspend_bufcache(void)
 1816 {
 1817         struct buf *bp;
 1818         int s;
 1819 
 1820         s = splbio();
 1821         /* Chuck away all the cache pages: discard bufs, do not promote */
 1822         while ((bp = bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 1))) {
 1823                 bufcache_take(bp);
 1824                 if (bp->b_vp) {
 1825                         RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp);
 1826                         brelvp(bp);
 1827                 }
 1828                 buf_put(bp);
 1829         }
 1830         splx(s);
 1831 }
 1832 
 1833 void
 1834 hibernate_resume_bufcache(void)
 1835 {
 1836         /* XXX Nothing needed here for now */
 1837 }
 1838 #endif /* HIBERNATE */
