FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_bio.c
1 /* $NetBSD: vfs_bio.c,v 1.303 2022/03/30 14:54:29 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2007, 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran, and by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1982, 1986, 1989, 1993
34 * The Regents of the University of California. All rights reserved.
35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
66 */
67
68 /*-
69 * Copyright (c) 1994 Christopher G. Demetriou
70 *
71 * Redistribution and use in source and binary forms, with or without
72 * modification, are permitted provided that the following conditions
73 * are met:
74 * 1. Redistributions of source code must retain the above copyright
75 * notice, this list of conditions and the following disclaimer.
76 * 2. Redistributions in binary form must reproduce the above copyright
77 * notice, this list of conditions and the following disclaimer in the
78 * documentation and/or other materials provided with the distribution.
79 * 3. All advertising materials mentioning features or use of this software
80 * must display the following acknowledgement:
81 * This product includes software developed by the University of
82 * California, Berkeley and its contributors.
83 * 4. Neither the name of the University nor the names of its contributors
84 * may be used to endorse or promote products derived from this software
85 * without specific prior written permission.
86 *
87 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
88 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
89 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
90 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
91 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
92 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
93 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
94 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
95 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
96 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
97 * SUCH DAMAGE.
98 *
99 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
100 */
101
102 /*
103 * The buffer cache subsystem.
104 *
105 * Some references:
106 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
107 * Leffler, et al.: The Design and Implementation of the 4.3BSD
108  *		UNIX Operating System (Addison Wesley, 1989)
109 *
110 * Locking
111 *
112 * There are three locks:
113 * - bufcache_lock: protects global buffer cache state.
114 * - BC_BUSY: a long term per-buffer lock.
115 * - buf_t::b_objlock: lock on completion (biowait vs biodone).
116 *
117  * For buffers associated with vnodes (the most common case) b_objlock points
118  * to vnode_t::v_interlock.  Otherwise, it points to the generic buffer_lock.
119 *
120 * Lock order:
121 * bufcache_lock ->
122 * buf_t::b_objlock
123 */
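/*
 * An illustrative sketch of the lock order above: bufcache_lock is
 * always taken before a buffer's b_objlock (as e.g. bwrite() and
 * bdwrite() below do), never the other way around:
 *
 *	mutex_enter(&bufcache_lock);
 *	mutex_enter(bp->b_objlock);
 *	... inspect or update bp ...
 *	mutex_exit(bp->b_objlock);
 *	mutex_exit(&bufcache_lock);
 */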
124
125 #include <sys/cdefs.h>
126 __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.303 2022/03/30 14:54:29 riastradh Exp $");
127
128 #ifdef _KERNEL_OPT
129 #include "opt_bufcache.h"
130 #include "opt_dtrace.h"
131 #include "opt_biohist.h"
132 #endif
133
134 #include <sys/param.h>
135 #include <sys/systm.h>
136 #include <sys/kernel.h>
137 #include <sys/proc.h>
138 #include <sys/buf.h>
139 #include <sys/vnode.h>
140 #include <sys/mount.h>
141 #include <sys/resourcevar.h>
142 #include <sys/sysctl.h>
143 #include <sys/conf.h>
144 #include <sys/kauth.h>
145 #include <sys/fstrans.h>
146 #include <sys/intr.h>
147 #include <sys/cpu.h>
148 #include <sys/wapbl.h>
149 #include <sys/bitops.h>
150 #include <sys/cprng.h>
151 #include <sys/sdt.h>
152
153 #include <uvm/uvm.h> /* extern struct uvm uvm */
154
155 #include <miscfs/specfs/specdev.h>
156
157 SDT_PROVIDER_DEFINE(io);
158
159 SDT_PROBE_DEFINE4(io, kernel, , bbusy__start,
160 "struct buf *"/*bp*/,
161 "bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/);
162 SDT_PROBE_DEFINE5(io, kernel, , bbusy__done,
163 "struct buf *"/*bp*/,
164 "bool"/*intr*/,
165 "int"/*timo*/,
166 "kmutex_t *"/*interlock*/,
167 "int"/*error*/);
168 SDT_PROBE_DEFINE0(io, kernel, , getnewbuf__start);
169 SDT_PROBE_DEFINE1(io, kernel, , getnewbuf__done, "struct buf *"/*bp*/);
170 SDT_PROBE_DEFINE3(io, kernel, , getblk__start,
171 "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/);
172 SDT_PROBE_DEFINE4(io, kernel, , getblk__done,
173 "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/,
174 "struct buf *"/*bp*/);
175 SDT_PROBE_DEFINE2(io, kernel, , brelse, "struct buf *"/*bp*/, "int"/*set*/);
176 SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/);
177 SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/);
178
179 #ifndef BUFPAGES
180 # define BUFPAGES 0
181 #endif
182
183 #ifdef BUFCACHE
184 # if (BUFCACHE < 5) || (BUFCACHE > 95)
185 # error BUFCACHE is not between 5 and 95
186 # endif
187 #else
188 # define BUFCACHE 15
189 #endif
190
191 u_int nbuf; /* desired number of buffer headers */
192 u_int bufpages = BUFPAGES; /* optional hardwired count */
193 u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */
194
195 /*
196 * Definitions for the buffer free lists.
197 */
198 #define BQUEUES 3 /* number of free buffer queues */
199
200 #define BQ_LOCKED 0 /* super-blocks &c */
201 #define BQ_LRU 1 /* lru, useful buffers */
202 #define BQ_AGE 2 /* rubbish */
203
204 struct bqueue {
205 TAILQ_HEAD(, buf) bq_queue;
206 uint64_t bq_bytes;
207 buf_t *bq_marker;
208 };
209 static struct bqueue bufqueues[BQUEUES] __cacheline_aligned;
210
211 /* Function prototypes */
212 static void buf_setwm(void);
213 static int buf_trim(void);
214 static void *bufpool_page_alloc(struct pool *, int);
215 static void bufpool_page_free(struct pool *, void *);
216 static buf_t *bio_doread(struct vnode *, daddr_t, int, int);
217 static buf_t *getnewbuf(int, int, int);
218 static int buf_lotsfree(void);
219 static int buf_canrelease(void);
220 static u_long buf_mempoolidx(u_long);
221 static u_long buf_roundsize(u_long);
222 static void *buf_alloc(size_t);
223 static void buf_mrelease(void *, size_t);
224 static void binsheadfree(buf_t *, struct bqueue *);
225 static void binstailfree(buf_t *, struct bqueue *);
226 #ifdef DEBUG
227 static int checkfreelist(buf_t *, struct bqueue *, int);
228 #endif
229 static void biointr(void *);
230 static void biodone2(buf_t *);
231 static void sysctl_kern_buf_setup(void);
232 static void sysctl_vm_buf_setup(void);
233
234 /* Initialization for biohist */
235
236 #include <sys/biohist.h>
237
238 BIOHIST_DEFINE(biohist);
239
240 void
241 biohist_init(void)
242 {
243
244 BIOHIST_INIT(biohist, BIOHIST_SIZE);
245 }
246
247 /*
248 * Definitions for the buffer hash lists.
249 */
250 #define BUFHASH(dvp, lbn) \
251 (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
252 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
253 u_long bufhash;
254
255 static int bufhash_stats(struct hashstat_sysctl *, bool);
256
257 static kcondvar_t needbuffer_cv;
258
259 /*
260 * Buffer queue lock.
261 */
262 kmutex_t bufcache_lock __cacheline_aligned;
263 kmutex_t buffer_lock __cacheline_aligned;
264
265 /* Software ISR for completed transfers. */
266 static void *biodone_sih;
267
268 /* Buffer pool for I/O buffers. */
269 static pool_cache_t buf_cache;
270 static pool_cache_t bufio_cache;
271
272 #define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */
273 #define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1)
274 __CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE);
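/*
 * Example (assuming the common DEV_BSIZE = 512 and MAXBSIZE = 64kB;
 * both are machine-dependent): MEMPOOL_INDEX_OFFSET = ilog2(512) = 9
 * and NMEMPOOLS = 16 - 9 + 1 = 8, i.e. pools of 512B, 1kB, ... 64kB.
 */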
275
276 /* Buffer memory pools */
277 static struct pool bmempools[NMEMPOOLS];
278
279 static struct vm_map *buf_map;
280
281 /*
282 * Buffer memory pool allocator.
283 */
284 static void *
285 bufpool_page_alloc(struct pool *pp, int flags)
286 {
287
288 return (void *)uvm_km_alloc(buf_map,
289 MAXBSIZE, MAXBSIZE,
290 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK)
291 | UVM_KMF_WIRED);
292 }
293
294 static void
295 bufpool_page_free(struct pool *pp, void *v)
296 {
297
298 uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
299 }
300
301 static struct pool_allocator bufmempool_allocator = {
302 .pa_alloc = bufpool_page_alloc,
303 .pa_free = bufpool_page_free,
304 .pa_pagesz = MAXBSIZE,
305 };
306
307 /* Buffer memory management variables */
308 u_long bufmem_valimit;
309 u_long bufmem_hiwater;
310 u_long bufmem_lowater;
311 u_long bufmem;
312
313 /*
314 * MD code can call this to set a hard limit on the amount
315 * of virtual memory used by the buffer cache.
316 */
317 int
318 buf_setvalimit(vsize_t sz)
319 {
320
321 /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
322 if (sz < NMEMPOOLS * MAXBSIZE)
323 return EINVAL;
324
325 bufmem_valimit = sz;
326 return 0;
327 }
328
329 static void
330 buf_setwm(void)
331 {
332
333 bufmem_hiwater = buf_memcalc();
334 /* lowater is approx. 2% of memory (with bufcache = 15) */
335 #define BUFMEM_WMSHIFT 3
336 #define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT)
337 if (bufmem_hiwater < BUFMEM_HIWMMIN)
338 /* Ensure a reasonable minimum value */
339 bufmem_hiwater = BUFMEM_HIWMMIN;
340 bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
341 }
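/*
 * Worked example (hypothetical numbers): on a machine where
 * buf_memcalc() yields ~15% of 8GB, bufmem_hiwater ~= 1.2GB and
 * bufmem_lowater = bufmem_hiwater >> 3 ~= 150MB -- roughly the
 * "2% of memory" mentioned above.
 */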
342
343 #ifdef DEBUG
344 int debug_verify_freelist = 0;
345 static int
346 checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
347 {
348 buf_t *b;
349
350 if (!debug_verify_freelist)
351 return 1;
352
353 TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
354 if (b == bp)
355 return ison ? 1 : 0;
356 }
357
358 return ison ? 0 : 1;
359 }
360 #endif
361
362 /*
363  * Insq/Remq for the buffer free lists.
364 * Call with buffer queue locked.
365 */
366 static void
367 binsheadfree(buf_t *bp, struct bqueue *dp)
368 {
369
370 KASSERT(mutex_owned(&bufcache_lock));
371 KASSERT(bp->b_freelistindex == -1);
372 TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
373 dp->bq_bytes += bp->b_bufsize;
374 bp->b_freelistindex = dp - bufqueues;
375 }
376
377 static void
378 binstailfree(buf_t *bp, struct bqueue *dp)
379 {
380
381 KASSERT(mutex_owned(&bufcache_lock));
382 KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? "
383 "bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex);
384 TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
385 dp->bq_bytes += bp->b_bufsize;
386 bp->b_freelistindex = dp - bufqueues;
387 }
388
389 void
390 bremfree(buf_t *bp)
391 {
392 struct bqueue *dp;
393 int bqidx = bp->b_freelistindex;
394
395 KASSERT(mutex_owned(&bufcache_lock));
396
397 KASSERT(bqidx != -1);
398 dp = &bufqueues[bqidx];
399 KDASSERT(checkfreelist(bp, dp, 1));
400 KASSERT(dp->bq_bytes >= bp->b_bufsize);
401 TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
402 dp->bq_bytes -= bp->b_bufsize;
403
404 /* For the sysctl helper. */
405 if (bp == dp->bq_marker)
406 dp->bq_marker = NULL;
407
408 #if defined(DIAGNOSTIC)
409 bp->b_freelistindex = -1;
410 #endif /* defined(DIAGNOSTIC) */
411 }
412
413 /*
414 * note that for some ports this is used by pmap bootstrap code to
415 * determine kva size.
416 */
417 u_long
418 buf_memcalc(void)
419 {
420 u_long n;
421 vsize_t mapsz = 0;
422
423 /*
424 * Determine the upper bound of memory to use for buffers.
425 *
426  *	- If bufpages is specified, use that as the number of
427 * pages.
428 *
429 * - Otherwise, use bufcache as the percentage of
430 * physical memory.
431 */
432 if (bufpages != 0) {
433 n = bufpages;
434 } else {
435 if (bufcache < 5) {
436 printf("forcing bufcache %d -> 5", bufcache);
437 bufcache = 5;
438 }
439 if (bufcache > 95) {
440 printf("forcing bufcache %d -> 95", bufcache);
441 bufcache = 95;
442 }
443 if (buf_map != NULL)
444 mapsz = vm_map_max(buf_map) - vm_map_min(buf_map);
445 n = calc_cache_size(mapsz, bufcache,
446 (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
447 / PAGE_SIZE;
448 }
449
450 n <<= PAGE_SHIFT;
451 if (bufmem_valimit != 0 && n > bufmem_valimit)
452 n = bufmem_valimit;
453
454 return (n);
455 }
456
457 /*
458 * Initialize buffers and hash links for buffers.
459 */
460 void
461 bufinit(void)
462 {
463 struct bqueue *dp;
464 int use_std;
465 u_int i;
466
467 biodone_vfs = biodone;
468
469 mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
470 mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
471 cv_init(&needbuffer_cv, "needbuf");
472
473 if (bufmem_valimit != 0) {
474 vaddr_t minaddr = 0, maxaddr;
475 buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
476 bufmem_valimit, 0, false, 0);
477 if (buf_map == NULL)
478 panic("bufinit: cannot allocate submap");
479 } else
480 buf_map = kernel_map;
481
482 /*
483 * Initialize buffer cache memory parameters.
484 */
485 bufmem = 0;
486 buf_setwm();
487
488 /* On "small" machines use small pool page sizes where possible */
489 use_std = (physmem < atop(16*1024*1024));
490
491 /*
492 * Also use them on systems that can map the pool pages using
493 * a direct-mapped segment.
494 */
495 #ifdef PMAP_MAP_POOLPAGE
496 use_std = 1;
497 #endif
498
499 buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
500 "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
501 bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
502 "biopl", NULL, IPL_BIO, NULL, NULL, NULL);
503
504 for (i = 0; i < NMEMPOOLS; i++) {
505 struct pool_allocator *pa;
506 struct pool *pp = &bmempools[i];
507 u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
508 char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */
509 if (__predict_false(size >= 1048576))
510 (void)snprintf(name, 8, "buf%um", size / 1048576);
511 else if (__predict_true(size >= 1024))
512 (void)snprintf(name, 8, "buf%uk", size / 1024);
513 else
514 (void)snprintf(name, 8, "buf%ub", size);
515 pa = (size <= PAGE_SIZE && use_std)
516 ? &pool_allocator_nointr
517 : &bufmempool_allocator;
518 pool_init(pp, size, DEV_BSIZE, 0, 0, name, pa, IPL_NONE);
519 pool_setlowat(pp, 1);
520 pool_sethiwat(pp, 1);
521 }
522
523 /* Initialize the buffer queues */
524 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
525 TAILQ_INIT(&dp->bq_queue);
526 dp->bq_bytes = 0;
527 }
528
529 /*
530 * Estimate hash table size based on the amount of memory we
531 * intend to use for the buffer cache. The average buffer
532 * size is dependent on our clients (i.e. filesystems).
533 *
534 * For now, use an empirical 3K per buffer.
535 */
536 nbuf = (bufmem_hiwater / 1024) / 3;
537 bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);
538
539 sysctl_kern_buf_setup();
540 sysctl_vm_buf_setup();
541 hashstat_register("bufhash", bufhash_stats);
542 }
543
544 void
545 bufinit2(void)
546 {
547
548 biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
549 NULL);
550 if (biodone_sih == NULL)
551 panic("bufinit2: can't establish soft interrupt");
552 }
553
554 static int
555 buf_lotsfree(void)
556 {
557 u_long guess;
558
559 /* Always allocate if less than the low water mark. */
560 if (bufmem < bufmem_lowater)
561 return 1;
562
563 /* Never allocate if greater than the high water mark. */
564 if (bufmem > bufmem_hiwater)
565 return 0;
566
567 /* If there's anything on the AGE list, it should be eaten. */
568 if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
569 return 0;
570
571 /*
572  * The probability of getting a new allocation is inversely
573 * proportional to the current size of the cache above
574 * the low water mark. Divide the total first to avoid overflows
575 * in the product.
576 */
577 guess = cprng_fast32() % 16;
578
579 if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >=
580 (bufmem - bufmem_lowater))
581 return 1;
582
583 /* Otherwise don't allocate. */
584 return 0;
585 }
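/*
 * Worked example of the check above (hypothetical numbers): with
 * bufmem_lowater = 64MB, bufmem_hiwater = 512MB and bufmem = 288MB
 * (halfway up), the step is (512MB - 64MB) / 16 = 28MB, and we
 * allocate when 28MB * guess >= 224MB, i.e. for guess in 8..15: a
 * 50% chance, falling linearly to 0% as bufmem approaches
 * bufmem_hiwater.
 */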
586
587 /*
588 * Return estimate of bytes we think need to be
589 * released to help resolve low memory conditions.
590 *
591 * => called with bufcache_lock held.
592 */
593 static int
594 buf_canrelease(void)
595 {
596 int pagedemand, ninvalid = 0;
597
598 KASSERT(mutex_owned(&bufcache_lock));
599
600 if (bufmem < bufmem_lowater)
601 return 0;
602
603 if (bufmem > bufmem_hiwater)
604 return bufmem - bufmem_hiwater;
605
606 ninvalid += bufqueues[BQ_AGE].bq_bytes;
607
608 pagedemand = uvmexp.freetarg - uvm_availmem(false);
609 if (pagedemand < 0)
610 return ninvalid;
611 return MAX(ninvalid, MIN(2 * MAXBSIZE,
612 MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
613 }
614
615 /*
616 * Buffer memory allocation helper functions
617 */
618 static u_long
619 buf_mempoolidx(u_long size)
620 {
621 u_int n = 0;
622
623 size -= 1;
624 size >>= MEMPOOL_INDEX_OFFSET;
625 while (size) {
626 size >>= 1;
627 n += 1;
628 }
629 if (n >= NMEMPOOLS)
630 panic("buf mem pool index %d", n);
631 return n;
632 }
633
634 static u_long
635 buf_roundsize(u_long size)
636 {
637 /* Round up to nearest power of 2 */
638 return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
639 }
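/*
 * Example (with DEV_BSIZE = 512, so MEMPOOL_INDEX_OFFSET = 9): for a
 * 3000 byte request, (3000 - 1) >> 9 = 5, which takes three shifts
 * to reach zero, so buf_mempoolidx() returns 3 and buf_roundsize()
 * returns 1 << (3 + 9) = 4096.
 */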
640
641 static void *
642 buf_alloc(size_t size)
643 {
644 u_int n = buf_mempoolidx(size);
645 void *addr;
646
647 while (1) {
648 addr = pool_get(&bmempools[n], PR_NOWAIT);
649 if (addr != NULL)
650 break;
651
652 /* No memory, see if we can free some. If so, try again */
653 mutex_enter(&bufcache_lock);
654 if (buf_drain(1) > 0) {
655 mutex_exit(&bufcache_lock);
656 continue;
657 }
658
659 if (curlwp == uvm.pagedaemon_lwp) {
660 mutex_exit(&bufcache_lock);
661 return NULL;
662 }
663
664 /* Wait for buffers to arrive on the LRU queue */
665 cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
666 mutex_exit(&bufcache_lock);
667 }
668
669 return addr;
670 }
671
672 static void
673 buf_mrelease(void *addr, size_t size)
674 {
675
676 pool_put(&bmempools[buf_mempoolidx(size)], addr);
677 }
678
679 /*
680 * bread()/breadn() helper.
681 */
682 static buf_t *
683 bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
684 {
685 buf_t *bp;
686 struct mount *mp;
687
688 bp = getblk(vp, blkno, size, 0, 0);
689
690 /*
691 * getblk() may return NULL if we are the pagedaemon.
692 */
693 if (bp == NULL) {
694 KASSERT(curlwp == uvm.pagedaemon_lwp);
695 return NULL;
696 }
697
698 /*
699  * If the buffer does not have valid data, start a read.
700 * Note that if buffer is BC_INVAL, getblk() won't return it.
701 * Therefore, it's valid if its I/O has completed or been delayed.
702 */
703 if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
704 /* Start I/O for the buffer. */
705 SET(bp->b_flags, B_READ | async);
706 if (async)
707 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
708 else
709 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
710 VOP_STRATEGY(vp, bp);
711
712 /* Pay for the read. */
713 curlwp->l_ru.ru_inblock++;
714 } else if (async)
715 brelse(bp, 0);
716
717 if (vp->v_type == VBLK)
718 mp = spec_node_getmountedfs(vp);
719 else
720 mp = vp->v_mount;
721
722 /*
723 * Collect statistics on synchronous and asynchronous reads.
724 * Reads from block devices are charged to their associated
725 * filesystem (if any).
726 */
727 if (mp != NULL) {
728 if (async == 0)
729 mp->mnt_stat.f_syncreads++;
730 else
731 mp->mnt_stat.f_asyncreads++;
732 }
733
734 return (bp);
735 }
736
737 /*
738 * Read a disk block.
739  * This algorithm is described in Bach (p. 54).
740 */
741 int
742 bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp)
743 {
744 buf_t *bp;
745 int error;
746
747 BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
748
749 /* Get buffer for block. */
750 bp = *bpp = bio_doread(vp, blkno, size, 0);
751 if (bp == NULL)
752 return ENOMEM;
753
754 /* Wait for the read to complete, and return result. */
755 error = biowait(bp);
756 if (error == 0 && (flags & B_MODIFY) != 0)
757 error = fscow_run(bp, true);
758 if (error) {
759 brelse(bp, 0);
760 *bpp = NULL;
761 }
762
763 return error;
764 }
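/*
 * A minimal usage sketch for bread() (hypothetical caller: vp is a
 * locked vnode, lbn a logical block number, bsize the file system
 * block size):
 *
 *	buf_t *bp;
 *	int error = bread(vp, lbn, bsize, 0, &bp);
 *	if (error != 0)
 *		return error;	-- bp already released, *bpp == NULL
 *	... use bp->b_data ...
 *	brelse(bp, 0);
 */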
765
766 /*
767 * Read-ahead multiple disk blocks. The first is sync, the rest async.
768 * Trivial modification to the breada algorithm presented in Bach (p.55).
769 */
770 int
771 breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
772 int *rasizes, int nrablks, int flags, buf_t **bpp)
773 {
774 buf_t *bp;
775 int error, i;
776
777 BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
778
779 bp = *bpp = bio_doread(vp, blkno, size, 0);
780 if (bp == NULL)
781 return ENOMEM;
782
783 /*
784 * For each of the read-ahead blocks, start a read, if necessary.
785 */
786 mutex_enter(&bufcache_lock);
787 for (i = 0; i < nrablks; i++) {
788 /* If it's in the cache, just go on to next one. */
789 if (incore(vp, rablks[i]))
790 continue;
791
792 /* Get a buffer for the read-ahead block */
793 mutex_exit(&bufcache_lock);
794 (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
795 mutex_enter(&bufcache_lock);
796 }
797 mutex_exit(&bufcache_lock);
798
799 /* Otherwise, we had to start a read for it; wait until it's valid. */
800 error = biowait(bp);
801 if (error == 0 && (flags & B_MODIFY) != 0)
802 error = fscow_run(bp, true);
803 if (error) {
804 brelse(bp, 0);
805 *bpp = NULL;
806 }
807
808 return error;
809 }
810
811 /*
812 * Block write. Described in Bach (p.56)
813 */
814 int
815 bwrite(buf_t *bp)
816 {
817 int rv, sync, wasdelayed;
818 struct vnode *vp;
819 struct mount *mp;
820
821 BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
822 (uintptr_t)bp, 0, 0, 0);
823
824 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
825 KASSERT(!cv_has_waiters(&bp->b_done));
826
827 vp = bp->b_vp;
828
829 /*
830 * dholland 20160728 AFAICT vp==NULL must be impossible as it
831 * will crash upon reaching VOP_STRATEGY below... see further
832 * analysis on tech-kern.
833 */
834 KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode");
835
836 if (vp != NULL) {
837 KASSERT(bp->b_objlock == vp->v_interlock);
838 if (vp->v_type == VBLK)
839 mp = spec_node_getmountedfs(vp);
840 else
841 mp = vp->v_mount;
842 } else {
843 mp = NULL;
844 }
845
846 if (mp && mp->mnt_wapbl) {
847 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
848 bdwrite(bp);
849 return 0;
850 }
851 }
852
853 /*
854 * Remember buffer type, to switch on it later. If the write was
855 * synchronous, but the file system was mounted with MNT_ASYNC,
856 * convert it to a delayed write.
857 * XXX note that this relies on delayed tape writes being converted
858 * to async, not sync writes (which is safe, but ugly).
859 */
860 sync = !ISSET(bp->b_flags, B_ASYNC);
861 if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
862 bdwrite(bp);
863 return (0);
864 }
865
866 /*
867 * Collect statistics on synchronous and asynchronous writes.
868 * Writes to block devices are charged to their associated
869 * filesystem (if any).
870 */
871 if (mp != NULL) {
872 if (sync)
873 mp->mnt_stat.f_syncwrites++;
874 else
875 mp->mnt_stat.f_asyncwrites++;
876 }
877
878 /*
879 * Pay for the I/O operation and make sure the buf is on the correct
880 * vnode queue.
881 */
882 bp->b_error = 0;
883 wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
884 CLR(bp->b_flags, B_READ);
885 if (wasdelayed) {
886 mutex_enter(&bufcache_lock);
887 mutex_enter(bp->b_objlock);
888 CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
889 reassignbuf(bp, bp->b_vp);
890 /* Wake anyone trying to busy the buffer via vnode's lists. */
891 cv_broadcast(&bp->b_busy);
892 mutex_exit(&bufcache_lock);
893 } else {
894 curlwp->l_ru.ru_oublock++;
895 mutex_enter(bp->b_objlock);
896 CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
897 }
898 if (vp != NULL)
899 vp->v_numoutput++;
900 mutex_exit(bp->b_objlock);
901
902 /* Initiate disk write. */
903 if (sync)
904 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
905 else
906 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
907
908 VOP_STRATEGY(vp, bp);
909
910 if (sync) {
911 /* If I/O was synchronous, wait for it to complete. */
912 rv = biowait(bp);
913
914 /* Release the buffer. */
915 brelse(bp, 0);
916
917 return (rv);
918 } else {
919 return (0);
920 }
921 }
922
923 int
924 vn_bwrite(void *v)
925 {
926 struct vop_bwrite_args *ap = v;
927
928 return (bwrite(ap->a_bp));
929 }
930
931 /*
932 * Delayed write.
933 *
934 * The buffer is marked dirty, but is not queued for I/O.
935 * This routine should be used when the buffer is expected
936 * to be modified again soon, typically a small write that
937 * partially fills a buffer.
938 *
939 * NB: magnetic tapes cannot be delayed; they must be
940 * written in the order that the writes are requested.
941 *
942 * Described in Leffler, et al. (pp. 208-213).
943 */
944 void
945 bdwrite(buf_t *bp)
946 {
947
948 BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
949 (uintptr_t)bp, 0, 0, 0);
950
951 KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
952 bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
953 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
954 KASSERT(!cv_has_waiters(&bp->b_done));
955
956 /* If this is a tape block, write the block now. */
957 if (bdev_type(bp->b_dev) == D_TAPE) {
958 bawrite(bp);
959 return;
960 }
961
962 if (wapbl_vphaswapbl(bp->b_vp)) {
963 struct mount *mp = wapbl_vptomp(bp->b_vp);
964
965 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
966 WAPBL_ADD_BUF(mp, bp);
967 }
968 }
969
970 /*
971 * If the block hasn't been seen before:
972 * (1) Mark it as having been seen,
973 * (2) Charge for the write,
974 * (3) Make sure it's on its vnode's correct block list.
975 */
976 KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock);
977
978 if (!ISSET(bp->b_oflags, BO_DELWRI)) {
979 mutex_enter(&bufcache_lock);
980 mutex_enter(bp->b_objlock);
981 SET(bp->b_oflags, BO_DELWRI);
982 curlwp->l_ru.ru_oublock++;
983 reassignbuf(bp, bp->b_vp);
984 /* Wake anyone trying to busy the buffer via vnode's lists. */
985 cv_broadcast(&bp->b_busy);
986 mutex_exit(&bufcache_lock);
987 } else {
988 mutex_enter(bp->b_objlock);
989 }
990 /* Otherwise, the "write" is done, so mark and release the buffer. */
991 CLR(bp->b_oflags, BO_DONE);
992 mutex_exit(bp->b_objlock);
993
994 brelse(bp, 0);
995 }
996
997 /*
998 * Asynchronous block write; just an asynchronous bwrite().
999 */
1000 void
1001 bawrite(buf_t *bp)
1002 {
1003
1004 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1005 KASSERT(bp->b_vp != NULL);
1006
1007 SET(bp->b_flags, B_ASYNC);
1008 VOP_BWRITE(bp->b_vp, bp);
1009 }
1010
1011 /*
1012 * Release a buffer on to the free lists.
1013 * Described in Bach (p. 46).
1014 */
1015 void
1016 brelsel(buf_t *bp, int set)
1017 {
1018 struct bqueue *bufq;
1019 struct vnode *vp;
1020
1021 SDT_PROBE2(io, kernel, , brelse, bp, set);
1022
1023 KASSERT(bp != NULL);
1024 KASSERT(mutex_owned(&bufcache_lock));
1025 KASSERT(!cv_has_waiters(&bp->b_done));
1026
1027 SET(bp->b_cflags, set);
1028
1029 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1030 KASSERT(bp->b_iodone == NULL);
1031
1032 /* Wake up any processes waiting for any buffer to become free. */
1033 cv_signal(&needbuffer_cv);
1034
1035	/* Wake up any processes waiting for _this_ buffer to become free */
1036 if (ISSET(bp->b_cflags, BC_WANTED))
1037 CLR(bp->b_cflags, BC_WANTED|BC_AGE);
1038
1039 /* If it's clean clear the copy-on-write flag. */
1040 if (ISSET(bp->b_flags, B_COWDONE)) {
1041 mutex_enter(bp->b_objlock);
1042 if (!ISSET(bp->b_oflags, BO_DELWRI))
1043 CLR(bp->b_flags, B_COWDONE);
1044 mutex_exit(bp->b_objlock);
1045 }
1046
1047 /*
1048 * Determine which queue the buffer should be on, then put it there.
1049 */
1050
1051 /* If it's locked, don't report an error; try again later. */
1052 if (ISSET(bp->b_flags, B_LOCKED))
1053 bp->b_error = 0;
1054
1055 /* If it's not cacheable, or an error, mark it invalid. */
1056 if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0)
1057 SET(bp->b_cflags, BC_INVAL);
1058
1059 if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1060 /*
1061 * This is a delayed write buffer that was just flushed to
1062 * disk. It is still on the LRU queue. If it's become
1063 * invalid, then we need to move it to a different queue;
1064 * otherwise leave it in its current position.
1065 */
1066 CLR(bp->b_cflags, BC_VFLUSH);
1067 if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) &&
1068 !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) {
1069 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1));
1070 goto already_queued;
1071 } else {
1072 bremfree(bp);
1073 }
1074 }
1075
1076 KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0));
1077 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0));
1078 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0));
1079
1080 if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) {
1081 /*
1082 * If it's invalid or empty, dissociate it from its vnode
1083 * and put on the head of the appropriate queue.
1084 */
1085 if (ISSET(bp->b_flags, B_LOCKED)) {
1086 if (wapbl_vphaswapbl(vp = bp->b_vp)) {
1087 struct mount *mp = wapbl_vptomp(vp);
1088
1089 KASSERT(bp->b_iodone
1090 != mp->mnt_wapbl_op->wo_wapbl_biodone);
1091 WAPBL_REMOVE_BUF(mp, bp);
1092 }
1093 }
1094
1095 mutex_enter(bp->b_objlock);
1096 CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
1097 if ((vp = bp->b_vp) != NULL) {
1098 KASSERT(bp->b_objlock == vp->v_interlock);
1099 reassignbuf(bp, bp->b_vp);
1100 brelvp(bp);
1101 mutex_exit(vp->v_interlock);
1102 } else {
1103 KASSERT(bp->b_objlock == &buffer_lock);
1104 mutex_exit(bp->b_objlock);
1105 }
1106 /* We want to dispose of the buffer, so wake everybody. */
1107 cv_broadcast(&bp->b_busy);
1108 if (bp->b_bufsize <= 0)
1109 /* no data */
1110 goto already_queued;
1111 else
1112 /* invalid data */
1113 bufq = &bufqueues[BQ_AGE];
1114 binsheadfree(bp, bufq);
1115 } else {
1116 /*
1117 * It has valid data. Put it on the end of the appropriate
1118 * queue, so that it'll stick around for as long as possible.
1119 * If buf is AGE, but has dependencies, must put it on last
1120 * bufqueue to be scanned, ie LRU. This protects against the
1121 * livelock where BQ_AGE only has buffers with dependencies,
1122 * and we thus never get to the dependent buffers in BQ_LRU.
1123 */
1124 if (ISSET(bp->b_flags, B_LOCKED)) {
1125 /* locked in core */
1126 bufq = &bufqueues[BQ_LOCKED];
1127 } else if (!ISSET(bp->b_cflags, BC_AGE)) {
1128 /* valid data */
1129 bufq = &bufqueues[BQ_LRU];
1130 } else {
1131 /* stale but valid data */
1132 bufq = &bufqueues[BQ_AGE];
1133 }
1134 binstailfree(bp, bufq);
1135 }
1136 already_queued:
1137 /* Unlock the buffer. */
1138 CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE);
1139 CLR(bp->b_flags, B_ASYNC);
1140
1141 /*
1142 * Wake only the highest priority waiter on the lock, in order to
1143 * prevent a thundering herd: many LWPs simultaneously awakening and
1144 * competing for the buffer's lock. Testing in 2019 revealed this
1145 * to reduce contention on bufcache_lock tenfold during a kernel
1146 * compile. Here and elsewhere, when the buffer is changing
1147 * identity, being disposed of, or moving from one list to another,
1148 * we wake all lock requestors.
1149 */
1150 if (bp->b_bufsize <= 0) {
1151 cv_broadcast(&bp->b_busy);
1152 buf_destroy(bp);
1153 #ifdef DEBUG
1154 memset((char *)bp, 0, sizeof(*bp));
1155 #endif
1156 pool_cache_put(buf_cache, bp);
1157 } else
1158 cv_signal(&bp->b_busy);
1159 }
1160
1161 void
1162 brelse(buf_t *bp, int set)
1163 {
1164
1165 mutex_enter(&bufcache_lock);
1166 brelsel(bp, set);
1167 mutex_exit(&bufcache_lock);
1168 }
1169
1170 /*
1171 * Determine if a block is in the cache.
1172 * Just look on what would be its hash chain. If it's there, return
1173 * a pointer to it, unless it's marked invalid. If it's marked invalid,
1174 * we normally don't return the buffer, unless the caller explicitly
1175 * wants us to.
1176 */
1177 buf_t *
1178 incore(struct vnode *vp, daddr_t blkno)
1179 {
1180 buf_t *bp;
1181
1182 KASSERT(mutex_owned(&bufcache_lock));
1183
1184 /* Search hash chain */
1185 LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
1186 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
1187 !ISSET(bp->b_cflags, BC_INVAL)) {
1188 KASSERT(bp->b_objlock == vp->v_interlock);
1189 return (bp);
1190 }
1191 }
1192
1193 return (NULL);
1194 }
1195
1196 /*
1197 * Get a block of requested size that is associated with
1198 * a given vnode and block offset. If it is found in the
1199 * block cache, mark it as having been found, make it busy
1200 * and return it. Otherwise, return an empty block of the
1201  * correct size. It is up to the caller to ensure that the
1202  * cached blocks are of the correct size.
1203 */
1204 buf_t *
1205 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1206 {
1207 int err, preserve;
1208 buf_t *bp;
1209
1210 mutex_enter(&bufcache_lock);
1211 SDT_PROBE3(io, kernel, , getblk__start, vp, blkno, size);
1212 loop:
1213 bp = incore(vp, blkno);
1214 if (bp != NULL) {
1215 err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL);
1216 if (err != 0) {
1217 if (err == EPASSTHROUGH)
1218 goto loop;
1219 mutex_exit(&bufcache_lock);
1220 SDT_PROBE4(io, kernel, , getblk__done,
1221 vp, blkno, size, NULL);
1222 return (NULL);
1223 }
1224 KASSERT(!cv_has_waiters(&bp->b_done));
1225 #ifdef DIAGNOSTIC
1226 if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) &&
1227 bp->b_bcount < size && vp->v_type != VBLK)
1228 panic("getblk: block size invariant failed");
1229 #endif
1230 bremfree(bp);
1231 preserve = 1;
1232 } else {
1233 if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL)
1234 goto loop;
1235
1236 if (incore(vp, blkno) != NULL) {
1237 /* The block has come into memory in the meantime. */
1238 brelsel(bp, 0);
1239 goto loop;
1240 }
1241
1242 LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
1243 bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
1244 mutex_enter(vp->v_interlock);
1245 bgetvp(vp, bp);
1246 mutex_exit(vp->v_interlock);
1247 preserve = 0;
1248 }
1249 mutex_exit(&bufcache_lock);
1250
1251 /*
1252 * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
1253 * if we re-size buffers here.
1254 */
1255 if (ISSET(bp->b_flags, B_LOCKED)) {
1256 KASSERT(bp->b_bufsize >= size);
1257 } else {
1258 if (allocbuf(bp, size, preserve)) {
1259 mutex_enter(&bufcache_lock);
1260 LIST_REMOVE(bp, b_hash);
1261 brelsel(bp, BC_INVAL);
1262 mutex_exit(&bufcache_lock);
1263 SDT_PROBE4(io, kernel, , getblk__done,
1264 vp, blkno, size, NULL);
1265 return NULL;
1266 }
1267 }
1268 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1269 SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, bp);
1270 return (bp);
1271 }
1272
1273 /*
1274 * Get an empty, disassociated buffer of given size.
1275 */
1276 buf_t *
1277 geteblk(int size)
1278 {
1279 buf_t *bp;
1280 int error __diagused;
1281
1282 mutex_enter(&bufcache_lock);
1283 while ((bp = getnewbuf(0, 0, 0)) == NULL)
1284 ;
1285
1286 SET(bp->b_cflags, BC_INVAL);
1287 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1288 mutex_exit(&bufcache_lock);
1289 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1290 error = allocbuf(bp, size, 0);
1291 KASSERT(error == 0);
1292 return (bp);
1293 }
1294
1295 /*
1296 * Expand or contract the actual memory allocated to a buffer.
1297 *
1298 * If the buffer shrinks, data is lost, so it's up to the
1299 * caller to have written it out *first*; this routine will not
1300  * start a write.  If the buffer grows, it's the caller's
1301 * responsibility to fill out the buffer's additional contents.
1302 */
1303 int
1304 allocbuf(buf_t *bp, int size, int preserve)
1305 {
1306 void *addr;
1307 vsize_t oldsize, desired_size;
1308 int oldcount;
1309 int delta;
1310
1311 desired_size = buf_roundsize(size);
1312 if (desired_size > MAXBSIZE)
1313 printf("allocbuf: buffer larger than MAXBSIZE requested");
1314
1315 oldcount = bp->b_bcount;
1316
1317 bp->b_bcount = size;
1318
1319 oldsize = bp->b_bufsize;
1320 if (oldsize == desired_size) {
1321 /*
1322 * Do not short cut the WAPBL resize, as the buffer length
1323 * could still have changed and this would corrupt the
1324 * tracking of the transaction length.
1325 */
1326 goto out;
1327 }
1328
1329 /*
1330 * If we want a buffer of a different size, re-allocate the
1331 * buffer's memory; copy old content only if needed.
1332 */
1333 addr = buf_alloc(desired_size);
1334 if (addr == NULL)
1335 return ENOMEM;
1336 if (preserve)
1337 memcpy(addr, bp->b_data, MIN(oldsize,desired_size));
1338 if (bp->b_data != NULL)
1339 buf_mrelease(bp->b_data, oldsize);
1340 bp->b_data = addr;
1341 bp->b_bufsize = desired_size;
1342
1343 /*
1344 * Update overall buffer memory counter (protected by bufcache_lock)
1345 */
1346 delta = (long)desired_size - (long)oldsize;
1347
1348 mutex_enter(&bufcache_lock);
1349 if ((bufmem += delta) > bufmem_hiwater) {
1350 /*
1351 * Need to trim overall memory usage.
1352 */
1353 while (buf_canrelease()) {
1354 if (preempt_needed()) {
1355 mutex_exit(&bufcache_lock);
1356 preempt();
1357 mutex_enter(&bufcache_lock);
1358 }
1359 if (buf_trim() == 0)
1360 break;
1361 }
1362 }
1363 mutex_exit(&bufcache_lock);
1364
1365 out:
1366 if (wapbl_vphaswapbl(bp->b_vp))
1367 WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);
1368
1369 return 0;
1370 }
1371
1372 /*
1373 * Find a buffer which is available for use.
1374 * Select something from a free list.
1375 * Preference is to AGE list, then LRU list.
1376 *
1377 * Called with the buffer queues locked.
1378 * Return buffer locked.
1379 */
1380 static buf_t *
1381 getnewbuf(int slpflag, int slptimeo, int from_bufq)
1382 {
1383 buf_t *bp;
1384 struct vnode *vp;
1385 struct mount *transmp = NULL;
1386
1387 SDT_PROBE0(io, kernel, , getnewbuf__start);
1388
1389 start:
1390 KASSERT(mutex_owned(&bufcache_lock));
1391
1392 /*
1393 * Get a new buffer from the pool.
1394 */
1395 if (!from_bufq && buf_lotsfree()) {
1396 mutex_exit(&bufcache_lock);
1397 bp = pool_cache_get(buf_cache, PR_NOWAIT);
1398 if (bp != NULL) {
1399 memset((char *)bp, 0, sizeof(*bp));
1400 buf_init(bp);
1401 SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */
1402 mutex_enter(&bufcache_lock);
1403 #if defined(DIAGNOSTIC)
1404 bp->b_freelistindex = -1;
1405 #endif /* defined(DIAGNOSTIC) */
1406 SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
1407 return (bp);
1408 }
1409 mutex_enter(&bufcache_lock);
1410 }
1411
1412 KASSERT(mutex_owned(&bufcache_lock));
1413 if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) {
1414 KASSERT(!ISSET(bp->b_oflags, BO_DELWRI));
1415 } else {
1416 TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) {
1417 if (ISSET(bp->b_cflags, BC_VFLUSH) ||
1418 !ISSET(bp->b_oflags, BO_DELWRI))
1419 break;
1420 if (fstrans_start_nowait(bp->b_vp->v_mount) == 0) {
1421 KASSERT(transmp == NULL);
1422 transmp = bp->b_vp->v_mount;
1423 break;
1424 }
1425 }
1426 }
1427 if (bp != NULL) {
1428 KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH));
1429 bremfree(bp);
1430
1431 /* Buffer is no longer on free lists. */
1432 SET(bp->b_cflags, BC_BUSY);
1433
1434 /* Wake anyone trying to lock the old identity. */
1435 cv_broadcast(&bp->b_busy);
1436 } else {
1437 /*
1438 * XXX: !from_bufq should be removed.
1439 */
1440 if (!from_bufq || curlwp != uvm.pagedaemon_lwp) {
1441 /* wait for a free buffer of any kind */
1442 if ((slpflag & PCATCH) != 0)
1443 (void)cv_timedwait_sig(&needbuffer_cv,
1444 &bufcache_lock, slptimeo);
1445 else
1446 (void)cv_timedwait(&needbuffer_cv,
1447 &bufcache_lock, slptimeo);
1448 }
1449 SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
1450 return (NULL);
1451 }
1452
1453 #ifdef DIAGNOSTIC
1454 if (bp->b_bufsize <= 0)
1455 panic("buffer %p: on queue but empty", bp);
1456 #endif
1457
1458 if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1459 /*
1460 * This is a delayed write buffer being flushed to disk. Make
1461 * sure it gets aged out of the queue when it's finished, and
1462 * leave it off the LRU queue.
1463 */
1464 CLR(bp->b_cflags, BC_VFLUSH);
1465 SET(bp->b_cflags, BC_AGE);
1466 goto start;
1467 }
1468
1469 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1470 KASSERT(!cv_has_waiters(&bp->b_done));
1471
1472 /*
1473 * If buffer was a delayed write, start it and return NULL
1474 * (since we might sleep while starting the write).
1475 */
1476 if (ISSET(bp->b_oflags, BO_DELWRI)) {
1477 /*
1478 * This buffer has gone through the LRU, so make sure it gets
1479 * reused ASAP.
1480 */
1481 SET(bp->b_cflags, BC_AGE);
1482 mutex_exit(&bufcache_lock);
1483 bawrite(bp);
1484 KASSERT(transmp != NULL);
1485 fstrans_done(transmp);
1486 mutex_enter(&bufcache_lock);
1487 SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
1488 return (NULL);
1489 }
1490
1491 KASSERT(transmp == NULL);
1492
1493 vp = bp->b_vp;
1494
1495 /* clear out various other fields */
1496 bp->b_cflags = BC_BUSY;
1497 bp->b_oflags = 0;
1498 bp->b_flags = 0;
1499 bp->b_dev = NODEV;
1500 bp->b_blkno = 0;
1501 bp->b_lblkno = 0;
1502 bp->b_rawblkno = 0;
1503 bp->b_iodone = 0;
1504 bp->b_error = 0;
1505 bp->b_resid = 0;
1506 bp->b_bcount = 0;
1507
1508 LIST_REMOVE(bp, b_hash);
1509
1510 /* Disassociate us from our vnode, if we had one... */
1511 if (vp != NULL) {
1512 mutex_enter(vp->v_interlock);
1513 brelvp(bp);
1514 mutex_exit(vp->v_interlock);
1515 }
1516
1517 SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
1518 return (bp);
1519 }
1520
1521 /*
1522 * Invalidate the specified buffer if it exists.
1523 */
1524 void
1525 binvalbuf(struct vnode *vp, daddr_t blkno)
1526 {
1527 buf_t *bp;
1528 int err;
1529
1530 mutex_enter(&bufcache_lock);
1531
1532 loop:
1533 bp = incore(vp, blkno);
1534 if (bp != NULL) {
1535 err = bbusy(bp, 0, 0, NULL);
1536 if (err == EPASSTHROUGH)
1537 goto loop;
1538 bremfree(bp);
1539 if (ISSET(bp->b_oflags, BO_DELWRI)) {
1540 SET(bp->b_cflags, BC_NOCACHE);
1541 mutex_exit(&bufcache_lock);
1542 bwrite(bp);
1543 } else {
1544 brelsel(bp, BC_INVAL);
1545 mutex_exit(&bufcache_lock);
1546 }
1547 } else
1548 mutex_exit(&bufcache_lock);
1549 }
1550
1551 /*
1552 * Attempt to free an aged buffer off the queues.
1553 * Called with queue lock held.
1554 * Returns the amount of buffer memory freed.
1555 */
1556 static int
1557 buf_trim(void)
1558 {
1559 buf_t *bp;
1560 long size;
1561
1562 KASSERT(mutex_owned(&bufcache_lock));
1563
1564 /* Instruct getnewbuf() to get buffers off the queues */
1565 if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL)
1566 return 0;
1567
1568 KASSERT((bp->b_cflags & BC_WANTED) == 0);
1569 size = bp->b_bufsize;
1570 bufmem -= size;
1571 if (size > 0) {
1572 buf_mrelease(bp->b_data, size);
1573 bp->b_bcount = bp->b_bufsize = 0;
1574 }
1575 /* brelse() will return the buffer to the global buffer pool */
1576 brelsel(bp, 0);
1577 return size;
1578 }
1579
1580 int
1581 buf_drain(int n)
1582 {
1583 int size = 0, sz;
1584
1585 KASSERT(mutex_owned(&bufcache_lock));
1586
1587 while (size < n && bufmem > bufmem_lowater) {
1588 sz = buf_trim();
1589 if (sz <= 0)
1590 break;
1591 size += sz;
1592 }
1593
1594 return size;
1595 }
1596
1597 /*
1598 * Wait for operations on the buffer to complete.
1599 * When they do, extract and return the I/O's error value.
1600 */
1601 int
1602 biowait(buf_t *bp)
1603 {
1604
1605 BIOHIST_FUNC(__func__);
1606
1607 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1608
1609 SDT_PROBE1(io, kernel, , wait__start, bp);
1610
1611 mutex_enter(bp->b_objlock);
1612
1613 BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx",
1614 (uintptr_t)bp, bp->b_oflags,
1615 (uintptr_t)__builtin_return_address(0), 0);
1616
1617 while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) {
1618 BIOHIST_LOG(biohist, "waiting bp=%#jx", (uintptr_t)bp, 0, 0, 0);
1619 cv_wait(&bp->b_done, bp->b_objlock);
1620 }
1621 mutex_exit(bp->b_objlock);
1622
1623 SDT_PROBE1(io, kernel, , wait__done, bp);
1624
1625 BIOHIST_LOG(biohist, "return %jd", bp->b_error, 0, 0, 0);
1626
1627 return bp->b_error;
1628 }
1629
1630 /*
1631 * Mark I/O complete on a buffer.
1632 *
1633 * If a callback has been requested, e.g. the pageout
1634 * daemon, do so. Otherwise, awaken waiting processes.
1635 *
1636 * [ Leffler, et al., says on p.247:
1637 * "This routine wakes up the blocked process, frees the buffer
1638 * for an asynchronous write, or, for a request by the pagedaemon
1639 * process, invokes a procedure specified in the buffer structure" ]
1640 *
1641 * In real life, the pagedaemon (or other system processes) wants
1642 * to do async stuff too, and doesn't want the buffer brelse()'d.
1643 * (for swap pager, that puts swap buffers on the free lists (!!!),
1644 * for the vn device, that puts allocated buffers on the free lists!)
1645 */
1646 void
1647 biodone(buf_t *bp)
1648 {
1649 int s;
1650
1651 BIOHIST_FUNC(__func__);
1652
1653 KASSERT(!ISSET(bp->b_oflags, BO_DONE));
1654
1655 if (cpu_intr_p()) {
1656 /* From interrupt mode: defer to a soft interrupt. */
1657 s = splvm();
1658 TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq);
1659
1660 BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled",
1661 (uintptr_t)bp, 0, 0, 0);
1662 softint_schedule(biodone_sih);
1663 splx(s);
1664 } else {
1665 /* Process now - the buffer may be freed soon. */
1666 biodone2(bp);
1667 }
1668 }
1669
1670 SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/);
1671
1672 static void
1673 biodone2(buf_t *bp)
1674 {
1675 void (*callout)(buf_t *);
1676
1677 SDT_PROBE1(io, kernel, ,done, bp);
1678
1679 BIOHIST_FUNC(__func__);
1680 BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0);
1681
1682 mutex_enter(bp->b_objlock);
1683 /* Note that the transfer is done. */
1684 if (ISSET(bp->b_oflags, BO_DONE))
1685 panic("biodone2 already");
1686 CLR(bp->b_flags, B_COWDONE);
1687 SET(bp->b_oflags, BO_DONE);
1688 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1689
1690 /* Wake up waiting writers. */
1691 if (!ISSET(bp->b_flags, B_READ))
1692 vwakeup(bp);
1693
1694 if ((callout = bp->b_iodone) != NULL) {
1695 BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout,
1696 0, 0, 0);
1697
1698 /* Note callout done, then call out. */
1699 KASSERT(!cv_has_waiters(&bp->b_done));
1700 bp->b_iodone = NULL;
1701 mutex_exit(bp->b_objlock);
1702 (*callout)(bp);
1703 } else if (ISSET(bp->b_flags, B_ASYNC)) {
1704 /* If async, release. */
1705 BIOHIST_LOG(biohist, "async", 0, 0, 0, 0);
1706 KASSERT(!cv_has_waiters(&bp->b_done));
1707 mutex_exit(bp->b_objlock);
1708 brelse(bp, 0);
1709 } else {
1710 /* Otherwise just wake up waiters in biowait(). */
1711 BIOHIST_LOG(biohist, "wake-up", 0, 0, 0, 0);
1712 cv_broadcast(&bp->b_done);
1713 mutex_exit(bp->b_objlock);
1714 }
1715 }
1716
1717 static void
1718 biointr(void *cookie)
1719 {
1720 struct cpu_info *ci;
1721 buf_t *bp;
1722 int s;
1723
1724 BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
1725
1726 ci = curcpu();
1727
1728 s = splvm();
1729 while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) {
1730 KASSERT(curcpu() == ci);
1731
1732 bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone);
1733 TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq);
1734 splx(s);
1735
1736 BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0);
1737 biodone2(bp);
1738
1739 s = splvm();
1740 }
1741 splx(s);
1742 }
1743
1744 static void
1745 sysctl_fillbuf(const buf_t *i, struct buf_sysctl *o)
1746 {
1747 const bool allowaddr = get_expose_address(curproc);
1748
1749 memset(o, 0, sizeof(*o));
1750
1751 o->b_flags = i->b_flags | i->b_cflags | i->b_oflags;
1752 o->b_error = i->b_error;
1753 o->b_prio = i->b_prio;
1754 o->b_dev = i->b_dev;
1755 o->b_bufsize = i->b_bufsize;
1756 o->b_bcount = i->b_bcount;
1757 o->b_resid = i->b_resid;
1758 COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr);
1759 o->b_blkno = i->b_blkno;
1760 o->b_rawblkno = i->b_rawblkno;
1761 COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr);
1762 COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr);
1763 COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr);
1764 COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr);
1765 o->b_lblkno = i->b_lblkno;
1766 }
1767
1768 static int
1769 sysctl_dobuf(SYSCTLFN_ARGS)
1770 {
1771 buf_t *bp;
1772 struct buf_sysctl bs;
1773 struct bqueue *bq;
1774 char *dp;
1775 u_int i, op, arg;
1776 size_t len, needed, elem_size, out_size;
1777 int error, elem_count, retries;
1778
1779 if (namelen == 1 && name[0] == CTL_QUERY)
1780 return (sysctl_query(SYSCTLFN_CALL(rnode)));
1781
1782 if (namelen != 4)
1783 return (EINVAL);
1784
1785 retries = 100;
1786 retry:
1787 dp = oldp;
1788 len = (oldp != NULL) ? *oldlenp : 0;
1789 op = name[0];
1790 arg = name[1];
1791 elem_size = name[2];
1792 elem_count = name[3];
1793 out_size = MIN(sizeof(bs), elem_size);
1794
1795 /*
1796 * at the moment, these are just "placeholders" to make the
1797 * API for retrieving kern.buf data more extensible in the
1798 * future.
1799 *
1800 * XXX kern.buf currently has "netbsd32" issues. hopefully
1801 * these will be resolved at a later point.
1802 */
1803 if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL ||
1804 elem_size < 1 || elem_count < 0)
1805 return (EINVAL);
1806
1807 if (oldp == NULL) {
1808 /* count only, don't run through the buffer queues */
1809 needed = pool_cache_nget(buf_cache) - pool_cache_nput(buf_cache);
1810 *oldlenp = (needed + KERN_BUFSLOP) * elem_size;
1811
1812 return 0;
1813 }
1814
1815 error = 0;
1816 needed = 0;
1817 sysctl_unlock();
1818 mutex_enter(&bufcache_lock);
1819 for (i = 0; i < BQUEUES; i++) {
1820 bq = &bufqueues[i];
1821 TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
1822 bq->bq_marker = bp;
1823 if (len >= elem_size && elem_count > 0) {
1824 sysctl_fillbuf(bp, &bs);
1825 mutex_exit(&bufcache_lock);
1826 error = copyout(&bs, dp, out_size);
1827 mutex_enter(&bufcache_lock);
1828 if (error)
1829 break;
1830 if (bq->bq_marker != bp) {
1831 /*
1832 * This sysctl node is only for
1833 * statistics. Retry; if the
1834 * queue keeps changing, then
1835 * bail out.
1836 */
1837 if (retries-- == 0) {
1838 error = EAGAIN;
1839 break;
1840 }
1841 mutex_exit(&bufcache_lock);
1842 sysctl_relock();
1843 goto retry;
1844 }
1845 dp += elem_size;
1846 len -= elem_size;
1847 }
1848 needed += elem_size;
1849 if (elem_count > 0 && elem_count != INT_MAX)
1850 elem_count--;
1851 }
1852 if (error != 0)
1853 break;
1854 }
1855 mutex_exit(&bufcache_lock);
1856 sysctl_relock();
1857
1858 *oldlenp = needed;
1859
1860 return (error);
1861 }
1862
1863 static int
1864 sysctl_bufvm_update(SYSCTLFN_ARGS)
1865 {
1866 int error, rv;
1867 struct sysctlnode node;
1868 unsigned int temp_bufcache;
1869 unsigned long temp_water;
1870
1871 /* Take a copy of the supplied node and its data */
1872 node = *rnode;
1873 if (node.sysctl_data == &bufcache) {
1874 node.sysctl_data = &temp_bufcache;
1875 temp_bufcache = *(unsigned int *)rnode->sysctl_data;
1876 } else {
1877 node.sysctl_data = &temp_water;
1878 temp_water = *(unsigned long *)rnode->sysctl_data;
1879 }
1880
1881 /* Update the copy */
1882 error = sysctl_lookup(SYSCTLFN_CALL(&node));
1883 if (error || newp == NULL)
1884 return (error);
1885
1886 if (rnode->sysctl_data == &bufcache) {
1887 if (temp_bufcache > 100)
1888 return (EINVAL);
1889 bufcache = temp_bufcache;
1890 buf_setwm();
1891 } else if (rnode->sysctl_data == &bufmem_lowater) {
1892 if (bufmem_hiwater - temp_water < 16)
1893 return (EINVAL);
1894 bufmem_lowater = temp_water;
1895 } else if (rnode->sysctl_data == &bufmem_hiwater) {
1896 if (temp_water - bufmem_lowater < 16)
1897 return (EINVAL);
1898 bufmem_hiwater = temp_water;
1899 } else
1900 return (EINVAL);
1901
1902 /* Drain until below new high water mark */
1903 sysctl_unlock();
1904 mutex_enter(&bufcache_lock);
1905 while (bufmem > bufmem_hiwater) {
1906 rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024));
1907 if (rv <= 0)
1908 break;
1909 }
1910 mutex_exit(&bufcache_lock);
1911 sysctl_relock();
1912
1913 return 0;
1914 }
1915
1916 static struct sysctllog *vfsbio_sysctllog;
1917
1918 static void
1919 sysctl_kern_buf_setup(void)
1920 {
1921
1922 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1923 CTLFLAG_PERMANENT,
1924 CTLTYPE_NODE, "buf",
1925 SYSCTL_DESCR("Kernel buffer cache information"),
1926 sysctl_dobuf, 0, NULL, 0,
1927 CTL_KERN, KERN_BUF, CTL_EOL);
1928 }
1929
1930 static void
1931 sysctl_vm_buf_setup(void)
1932 {
1933
1934 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1935 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1936 CTLTYPE_INT, "bufcache",
1937 SYSCTL_DESCR("Percentage of physical memory to use for "
1938 "buffer cache"),
1939 sysctl_bufvm_update, 0, &bufcache, 0,
1940 CTL_VM, CTL_CREATE, CTL_EOL);
1941 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1942 CTLFLAG_PERMANENT|CTLFLAG_READONLY,
1943 CTLTYPE_LONG, "bufmem",
1944 SYSCTL_DESCR("Amount of kernel memory used by buffer "
1945 "cache"),
1946 NULL, 0, &bufmem, 0,
1947 CTL_VM, CTL_CREATE, CTL_EOL);
1948 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1949 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1950 CTLTYPE_LONG, "bufmem_lowater",
1951 SYSCTL_DESCR("Minimum amount of kernel memory to "
1952 "reserve for buffer cache"),
1953 sysctl_bufvm_update, 0, &bufmem_lowater, 0,
1954 CTL_VM, CTL_CREATE, CTL_EOL);
1955 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1956 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1957 CTLTYPE_LONG, "bufmem_hiwater",
1958 SYSCTL_DESCR("Maximum amount of kernel memory to use "
1959 "for buffer cache"),
1960 sysctl_bufvm_update, 0, &bufmem_hiwater, 0,
1961 CTL_VM, CTL_CREATE, CTL_EOL);
1962 }
1963
1964 static int
1965 bufhash_stats(struct hashstat_sysctl *hs, bool fill)
1966 {
1967 buf_t *bp;
1968 uint64_t chain;
1969
1970 strlcpy(hs->hash_name, "bufhash", sizeof(hs->hash_name));
1971 strlcpy(hs->hash_desc, "buffer hash", sizeof(hs->hash_desc));
1972 if (!fill)
1973 return 0;
1974
1975 hs->hash_size = bufhash + 1;
1976
1977 for (size_t i = 0; i < hs->hash_size; i++) {
1978 chain = 0;
1979
1980 mutex_enter(&bufcache_lock);
1981 LIST_FOREACH(bp, &bufhashtbl[i], b_hash) {
1982 chain++;
1983 }
1984 mutex_exit(&bufcache_lock);
1985
1986 if (chain > 0) {
1987 hs->hash_used++;
1988 hs->hash_items += chain;
1989 if (chain > hs->hash_maxchain)
1990 hs->hash_maxchain = chain;
1991 }
1992 preempt_point();
1993 }
1994
1995 return 0;
1996 }
1997
1998 #ifdef DEBUG
1999 /*
2000 * Print out statistics on the current allocation of the buffer pool.
2001 * Can be enabled to print out on every ``sync'' by setting "syncprt"
2002 * in vfs_syscalls.c using sysctl.
2003 */
2004 void
2005 vfs_bufstats(void)
2006 {
2007 int i, j, count;
2008 buf_t *bp;
2009 struct bqueue *dp;
2010 int counts[MAXBSIZE / MIN_PAGE_SIZE + 1];
2011 static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };
2012
2013 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
2014 count = 0;
2015 memset(counts, 0, sizeof(counts));
2016 TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
2017 counts[bp->b_bufsize / PAGE_SIZE]++;
2018 count++;
2019 }
2020 printf("%s: total-%d", bname[i], count);
2021 for (j = 0; j <= MAXBSIZE / PAGE_SIZE; j++)
2022 if (counts[j] != 0)
2023 printf(", %d-%d", j * PAGE_SIZE, counts[j]);
2024 printf("\n");
2025 }
2026 }
2027 #endif /* DEBUG */
2028
2029 /* ------------------------------ */
2030
2031 buf_t *
2032 getiobuf(struct vnode *vp, bool waitok)
2033 {
2034 buf_t *bp;
2035
2036 bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
2037 if (bp == NULL)
2038 return bp;
2039
2040 buf_init(bp);
2041
2042 if ((bp->b_vp = vp) != NULL) {
2043 bp->b_objlock = vp->v_interlock;
2044 } else {
2045 KASSERT(bp->b_objlock == &buffer_lock);
2046 }
2047
2048 return bp;
2049 }
2050
2051 void
2052 putiobuf(buf_t *bp)
2053 {
2054
2055 buf_destroy(bp);
2056 pool_cache_put(bufio_cache, bp);
2057 }
2058
2059 /*
2060 * nestiobuf_iodone: b_iodone callback for nested buffers.
2061 */
2062
2063 void
2064 nestiobuf_iodone(buf_t *bp)
2065 {
2066 buf_t *mbp = bp->b_private;
2067 int error;
2068 int donebytes;
2069
2070 KASSERT(bp->b_bcount <= bp->b_bufsize);
2071 KASSERT(mbp != bp);
2072
2073 error = bp->b_error;
2074 if (bp->b_error == 0 &&
2075 (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
2076 /*
2077 * Not all got transferred, raise an error. We have no way to
2078 * propagate these conditions to mbp.
2079 */
2080 error = EIO;
2081 }
2082
2083 donebytes = bp->b_bufsize;
2084
2085 putiobuf(bp);
2086 nestiobuf_done(mbp, donebytes, error);
2087 }
2088
2089 /*
2090 * nestiobuf_setup: setup a "nested" buffer.
2091 *
2092 * => 'mbp' is a "master" buffer which is being divided into sub pieces.
2093 * => 'bp' should be a buffer allocated by getiobuf.
2094 * => 'offset' is a byte offset in the master buffer.
2095 * => 'size' is a size in bytes of this nested buffer.
2096 */
2097
2098 void
2099 nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
2100 {
2101 const int b_pass = mbp->b_flags & (B_READ|B_PHYS|B_RAW|B_MEDIA_FLAGS);
2102 struct vnode *vp = mbp->b_vp;
2103
2104 KASSERT(mbp->b_bcount >= offset + size);
2105 bp->b_vp = vp;
2106 bp->b_dev = mbp->b_dev;
2107 bp->b_objlock = mbp->b_objlock;
2108 bp->b_cflags = BC_BUSY;
2109 bp->b_flags = B_ASYNC | b_pass;
2110 bp->b_iodone = nestiobuf_iodone;
2111 bp->b_data = (char *)mbp->b_data + offset;
2112 bp->b_resid = bp->b_bcount = size;
2113 bp->b_bufsize = bp->b_bcount;
2114 bp->b_private = mbp;
2115 BIO_COPYPRIO(bp, mbp);
2116 if (BUF_ISWRITE(bp) && vp != NULL) {
2117 mutex_enter(vp->v_interlock);
2118 vp->v_numoutput++;
2119 mutex_exit(vp->v_interlock);
2120 }
2121 }
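/*
 * A sketch of splitting a master buffer in two (assumes the caller
 * set mbp->b_resid = mbp->b_bcount beforehand and that b_bcount is
 * even; when each half completes, nestiobuf_iodone() credits its
 * bytes back to mbp, and biodone(mbp) fires once b_resid drains to
 * zero):
 *
 *	int half = mbp->b_bcount / 2;
 *	buf_t *b1 = getiobuf(mbp->b_vp, true);
 *	buf_t *b2 = getiobuf(mbp->b_vp, true);
 *	nestiobuf_setup(mbp, b1, 0, half);
 *	nestiobuf_setup(mbp, b2, half, half);
 *	VOP_STRATEGY(mbp->b_vp, b1);
 *	VOP_STRATEGY(mbp->b_vp, b2);
 */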
2122
2123 /*
2124 * nestiobuf_done: propagate completion to the master buffer.
2125 *
2126  * => 'donebytes' specifies how many bytes in the 'mbp' have been completed.
2127 * => 'error' is an errno(2) that 'donebytes' has been completed with.
2128 */
2129
2130 void
2131 nestiobuf_done(buf_t *mbp, int donebytes, int error)
2132 {
2133
2134 if (donebytes == 0) {
2135 return;
2136 }
2137 mutex_enter(mbp->b_objlock);
2138 KASSERT(mbp->b_resid >= donebytes);
2139 mbp->b_resid -= donebytes;
2140 if (error)
2141 mbp->b_error = error;
2142 if (mbp->b_resid == 0) {
2143 if (mbp->b_error)
2144 mbp->b_resid = mbp->b_bcount;
2145 mutex_exit(mbp->b_objlock);
2146 biodone(mbp);
2147 } else
2148 mutex_exit(mbp->b_objlock);
2149 }
2150
2151 void
2152 buf_init(buf_t *bp)
2153 {
2154
2155 cv_init(&bp->b_busy, "biolock");
2156 cv_init(&bp->b_done, "biowait");
2157 bp->b_dev = NODEV;
2158 bp->b_error = 0;
2159 bp->b_flags = 0;
2160 bp->b_cflags = 0;
2161 bp->b_oflags = 0;
2162 bp->b_objlock = &buffer_lock;
2163 bp->b_iodone = NULL;
2164 bp->b_dev = NODEV;
2165 bp->b_vnbufs.le_next = NOLIST;
2166 BIO_SETPRIO(bp, BPRIO_DEFAULT);
2167 }
2168
2169 void
2170 buf_destroy(buf_t *bp)
2171 {
2172
2173 cv_destroy(&bp->b_done);
2174 cv_destroy(&bp->b_busy);
2175 }
2176
2177 int
2178 bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock)
2179 {
2180 int error;
2181
2182 KASSERT(mutex_owned(&bufcache_lock));
2183
2184 SDT_PROBE4(io, kernel, , bbusy__start, bp, intr, timo, interlock);
2185
2186 if ((bp->b_cflags & BC_BUSY) != 0) {
2187 if (curlwp == uvm.pagedaemon_lwp) {
2188 error = EDEADLK;
2189 goto out;
2190 }
2191 bp->b_cflags |= BC_WANTED;
2192 if (interlock != NULL)
2193 mutex_exit(interlock);
2194 if (intr) {
2195 error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
2196 timo);
2197 } else {
2198 error = cv_timedwait(&bp->b_busy, &bufcache_lock,
2199 timo);
2200 }
2201 /*
2202 * At this point the buffer may be gone: don't touch it
2203 * again. The caller needs to find it again and retry.
2204 */
2205 if (interlock != NULL)
2206 mutex_enter(interlock);
2207 if (error == 0)
2208 error = EPASSTHROUGH;
2209 } else {
2210 bp->b_cflags |= BC_BUSY;
2211 error = 0;
2212 }
2213
2214 out: SDT_PROBE5(io, kernel, , bbusy__done,
2215 bp, intr, timo, interlock, error);
2216 return error;
2217 }
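/*
 * Typical retry pattern around bbusy() (cf. getblk() and binvalbuf()
 * above): EPASSTHROUGH means the buffer may have changed identity
 * while we slept, so it must be looked up afresh:
 *
 *	mutex_enter(&bufcache_lock);
 * loop:
 *	bp = incore(vp, blkno);
 *	if (bp != NULL) {
 *		error = bbusy(bp, false, 0, NULL);
 *		if (error == EPASSTHROUGH)
 *			goto loop;
 *	}
 *	...
 *	mutex_exit(&bufcache_lock);
 */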
2218
2219 /*
2220 * Nothing outside this file should really need to know about nbuf,
2221 * but a few things still want to read it, so give them a way to do that.
2222 */
2223 u_int
2224 buf_nbuf(void)
2225 {
2226
2227 return nbuf;
2228 }