FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_bio.c
1 /* $NetBSD: vfs_bio.c,v 1.210 2008/09/11 09:14:46 hannken Exp $ */
2
3 /*-
4 * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 * This code is derived from software contributed to The NetBSD Foundation
10 * by Wasabi Systems, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*-
35 * Copyright (c) 1982, 1986, 1989, 1993
36 * The Regents of the University of California. All rights reserved.
37 * (c) UNIX System Laboratories, Inc.
38 * All or some portions of this file are derived from material licensed
39 * to the University of California by American Telephone and Telegraph
40 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
41 * the permission of UNIX System Laboratories, Inc.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 3. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
68 */
69
70 /*-
71 * Copyright (c) 1994 Christopher G. Demetriou
72 *
73 * Redistribution and use in source and binary forms, with or without
74 * modification, are permitted provided that the following conditions
75 * are met:
76 * 1. Redistributions of source code must retain the above copyright
77 * notice, this list of conditions and the following disclaimer.
78 * 2. Redistributions in binary form must reproduce the above copyright
79 * notice, this list of conditions and the following disclaimer in the
80 * documentation and/or other materials provided with the distribution.
81 * 3. All advertising materials mentioning features or use of this software
82 * must display the following acknowledgement:
83 * This product includes software developed by the University of
84 * California, Berkeley and its contributors.
85 * 4. Neither the name of the University nor the names of its contributors
86 * may be used to endorse or promote products derived from this software
87 * without specific prior written permission.
88 *
89 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
90 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
91 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
92 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
93 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
94 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
95 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
96 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
97 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
98 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
99 * SUCH DAMAGE.
100 *
101 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
102 */
103
104 /*
105 * Some references:
106 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
107 * Leffler, et al.: The Design and Implementation of the 4.3BSD
 108  *	UNIX Operating System (Addison-Wesley, 1989)
109 */
110
111 #include <sys/cdefs.h>
112 __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.210 2008/09/11 09:14:46 hannken Exp $");
113
114 #include "fs_ffs.h"
115 #include "opt_bufcache.h"
116
117 #include <sys/param.h>
118 #include <sys/systm.h>
119 #include <sys/kernel.h>
120 #include <sys/proc.h>
121 #include <sys/buf.h>
122 #include <sys/vnode.h>
123 #include <sys/mount.h>
124 #include <sys/resourcevar.h>
125 #include <sys/sysctl.h>
126 #include <sys/conf.h>
127 #include <sys/kauth.h>
128 #include <sys/fstrans.h>
129 #include <sys/intr.h>
130 #include <sys/cpu.h>
131 #include <sys/wapbl.h>
132
133 #include <uvm/uvm.h>
134
135 #include <miscfs/specfs/specdev.h>
136
137 #ifndef BUFPAGES
138 # define BUFPAGES 0
139 #endif
140
141 #ifdef BUFCACHE
142 # if (BUFCACHE < 5) || (BUFCACHE > 95)
143 # error BUFCACHE is not between 5 and 95
144 # endif
145 #else
146 # define BUFCACHE 15
147 #endif
148
149 u_int nbuf; /* XXX - for softdep_lockedbufs */
150 u_int bufpages = BUFPAGES; /* optional hardwired count */
151 u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */
152
153 /* Function prototypes */
154 struct bqueue;
155
156 static void buf_setwm(void);
157 static int buf_trim(void);
158 static void *bufpool_page_alloc(struct pool *, int);
159 static void bufpool_page_free(struct pool *, void *);
160 static buf_t *bio_doread(struct vnode *, daddr_t, int,
161 kauth_cred_t, int);
162 static buf_t *getnewbuf(int, int, int);
163 static int buf_lotsfree(void);
164 static int buf_canrelease(void);
165 static u_long buf_mempoolidx(u_long);
166 static u_long buf_roundsize(u_long);
167 static void *buf_malloc(size_t);
168 static void buf_mrelease(void *, size_t);
169 static void binsheadfree(buf_t *, struct bqueue *);
170 static void binstailfree(buf_t *, struct bqueue *);
171 int count_lock_queue(void); /* XXX */
172 #ifdef DEBUG
173 static int checkfreelist(buf_t *, struct bqueue *, int);
174 #endif
175 static void biointr(void *);
176 static void biodone2(buf_t *);
177 static void bref(buf_t *);
178 static void brele(buf_t *);
179
180 /*
181 * Definitions for the buffer hash lists.
182 */
183 #define BUFHASH(dvp, lbn) \
184 (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
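/*
 * Illustrative note (an addition, not part of the original source):
 * the hash key mixes the vnode pointer (shifted right so its low,
 * mostly-constant alignment bits do not dominate) with the logical
 * block number, so a lookup is a walk of a single chain, e.g.:
 *
 *	LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash)
 *		if (bp->b_lblkno == blkno && bp->b_vp == vp)
 *			break;
 *
 * See incore() below for the real lookup, which additionally skips
 * BC_INVAL buffers.
 */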
185 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
186 u_long bufhash;
187 struct bqueue bufqueues[BQUEUES];
188 const struct bio_ops *bioopsp; /* I/O operation notification */
189
190 static kcondvar_t needbuffer_cv;
191
192 /*
193 * Buffer queue lock.
194 */
195 kmutex_t bufcache_lock;
196 kmutex_t buffer_lock;
197
198 /* Software ISR for completed transfers. */
199 static void *biodone_sih;
200
201 /* Buffer pool for I/O buffers. */
202 static pool_cache_t buf_cache;
203 static pool_cache_t bufio_cache;
204
205 /* XXX - somewhat gross.. */
206 #if MAXBSIZE == 0x2000
207 #define NMEMPOOLS 5
208 #elif MAXBSIZE == 0x4000
209 #define NMEMPOOLS 6
210 #elif MAXBSIZE == 0x8000
211 #define NMEMPOOLS 7
212 #else
213 #define NMEMPOOLS 8
214 #endif
215
216 #define MEMPOOL_INDEX_OFFSET 9 /* smallest pool is 512 bytes */
217 #if (1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) != MAXBSIZE
218 #error update vfs_bio buffer memory parameters
219 #endif
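/*
 * Illustrative note (an addition, not part of the original source):
 * pool i serves buffers of size 1 << (i + MEMPOOL_INDEX_OFFSET), so
 * with MAXBSIZE == 0x10000 (64KB) the fall-through case gives
 * NMEMPOOLS == 8 and the ladder:
 *
 *	i:     0    1   2   3   4   5    6    7
 *	size:  512  1K  2K  4K  8K  16K  32K  64K
 *
 * The #if chain above merely narrows the ladder for kernels built
 * with a smaller MAXBSIZE (e.g. MAXBSIZE == 0x2000 yields 5 pools,
 * 512 bytes through 8KB), and the #error check below keeps the two
 * parameters consistent.
 */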
220
221 /* Buffer memory pools */
222 static struct pool bmempools[NMEMPOOLS];
223
224 static struct vm_map *buf_map;
225
226 /*
227 * Buffer memory pool allocator.
228 */
229 static void *
230 bufpool_page_alloc(struct pool *pp, int flags)
231 {
232
233 return (void *)uvm_km_alloc(buf_map,
234 MAXBSIZE, MAXBSIZE,
235 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
236 | UVM_KMF_WIRED);
237 }
238
239 static void
240 bufpool_page_free(struct pool *pp, void *v)
241 {
242
243 uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
244 }
245
246 static struct pool_allocator bufmempool_allocator = {
247 .pa_alloc = bufpool_page_alloc,
248 .pa_free = bufpool_page_free,
249 .pa_pagesz = MAXBSIZE,
250 };
251
252 /* Buffer memory management variables */
253 u_long bufmem_valimit;
254 u_long bufmem_hiwater;
255 u_long bufmem_lowater;
256 u_long bufmem;
257
258 /*
259 * MD code can call this to set a hard limit on the amount
260 * of virtual memory used by the buffer cache.
261 */
262 int
263 buf_setvalimit(vsize_t sz)
264 {
265
266 /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
267 if (sz < NMEMPOOLS * MAXBSIZE)
268 return EINVAL;
269
270 bufmem_valimit = sz;
271 return 0;
272 }
273
274 static void
275 buf_setwm(void)
276 {
277
278 bufmem_hiwater = buf_memcalc();
279 /* lowater is approx. 2% of memory (with bufcache = 15) */
280 #define BUFMEM_WMSHIFT 3
281 #define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT)
282 if (bufmem_hiwater < BUFMEM_HIWMMIN)
283 /* Ensure a reasonable minimum value */
284 bufmem_hiwater = BUFMEM_HIWMMIN;
285 bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
286 }
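/*
 * Worked example (an addition, not part of the original source): if
 * buf_memcalc() returns 15% of physical memory (the BUFCACHE default),
 * bufmem_lowater becomes that value >> BUFMEM_WMSHIFT, i.e. 15% / 8,
 * or roughly 1.9% of memory, which matches the "approx. 2%" note
 * above.  On very small machines the BUFMEM_HIWMMIN clamp keeps
 * bufmem_hiwater at no less than 512KB (64KB << 3), and therefore
 * bufmem_lowater at no less than 64KB.
 */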
287
288 #ifdef DEBUG
289 int debug_verify_freelist = 0;
290 static int
291 checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
292 {
293 buf_t *b;
294
295 if (!debug_verify_freelist)
296 return 1;
297
298 TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
299 if (b == bp)
300 return ison ? 1 : 0;
301 }
302
303 return ison ? 0 : 1;
304 }
305 #endif
306
307 /*
 308  * Insq/Remq for the buffer free lists.
309 * Call with buffer queue locked.
310 */
311 static void
312 binsheadfree(buf_t *bp, struct bqueue *dp)
313 {
314
315 KASSERT(mutex_owned(&bufcache_lock));
316 KASSERT(bp->b_freelistindex == -1);
317 TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
318 dp->bq_bytes += bp->b_bufsize;
319 bp->b_freelistindex = dp - bufqueues;
320 }
321
322 static void
323 binstailfree(buf_t *bp, struct bqueue *dp)
324 {
325
326 KASSERT(mutex_owned(&bufcache_lock));
327 KASSERT(bp->b_freelistindex == -1);
328 TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
329 dp->bq_bytes += bp->b_bufsize;
330 bp->b_freelistindex = dp - bufqueues;
331 }
332
333 void
334 bremfree(buf_t *bp)
335 {
336 struct bqueue *dp;
337 int bqidx = bp->b_freelistindex;
338
339 KASSERT(mutex_owned(&bufcache_lock));
340
341 KASSERT(bqidx != -1);
342 dp = &bufqueues[bqidx];
343 KDASSERT(checkfreelist(bp, dp, 1));
344 KASSERT(dp->bq_bytes >= bp->b_bufsize);
345 TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
346 dp->bq_bytes -= bp->b_bufsize;
347
348 /* For the sysctl helper. */
349 if (bp == dp->bq_marker)
350 dp->bq_marker = NULL;
351
352 #if defined(DIAGNOSTIC)
353 bp->b_freelistindex = -1;
354 #endif /* defined(DIAGNOSTIC) */
355 }
356
357 /*
 358  * Add a reference to a buffer structure that came from buf_cache.
359 */
360 static inline void
361 bref(buf_t *bp)
362 {
363
364 KASSERT(mutex_owned(&bufcache_lock));
365 KASSERT(bp->b_refcnt > 0);
366
367 bp->b_refcnt++;
368 }
369
370 /*
371 * Free an unused buffer structure that came from buf_cache.
372 */
373 static inline void
374 brele(buf_t *bp)
375 {
376
377 KASSERT(mutex_owned(&bufcache_lock));
378 KASSERT(bp->b_refcnt > 0);
379
380 if (bp->b_refcnt-- == 1) {
381 buf_destroy(bp);
382 #ifdef DEBUG
383 memset((char *)bp, 0, sizeof(*bp));
384 #endif
385 pool_cache_put(buf_cache, bp);
386 }
387 }
388
389 /*
390 * note that for some ports this is used by pmap bootstrap code to
391 * determine kva size.
392 */
393 u_long
394 buf_memcalc(void)
395 {
396 u_long n;
397
398 /*
399 * Determine the upper bound of memory to use for buffers.
400 *
 401  *	- If bufpages is specified, use that as the number of
 402  *	  pages.
403 *
404 * - Otherwise, use bufcache as the percentage of
405 * physical memory.
406 */
407 if (bufpages != 0) {
408 n = bufpages;
409 } else {
410 if (bufcache < 5) {
411 printf("forcing bufcache %d -> 5", bufcache);
412 bufcache = 5;
413 }
414 if (bufcache > 95) {
415 printf("forcing bufcache %d -> 95", bufcache);
416 bufcache = 95;
417 }
418 n = calc_cache_size(buf_map, bufcache,
419 (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
420 / PAGE_SIZE;
421 }
422
423 n <<= PAGE_SHIFT;
424 if (bufmem_valimit != 0 && n > bufmem_valimit)
425 n = bufmem_valimit;
426
427 return (n);
428 }
429
430 /*
431 * Initialize buffers and hash links for buffers.
432 */
433 void
434 bufinit(void)
435 {
436 struct bqueue *dp;
437 int use_std;
438 u_int i;
439
440 mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
441 mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
442 cv_init(&needbuffer_cv, "needbuf");
443
444 if (bufmem_valimit != 0) {
445 vaddr_t minaddr = 0, maxaddr;
446 buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
447 bufmem_valimit, 0, false, 0);
448 if (buf_map == NULL)
449 panic("bufinit: cannot allocate submap");
450 } else
451 buf_map = kernel_map;
452
453 /*
454 * Initialize buffer cache memory parameters.
455 */
456 bufmem = 0;
457 buf_setwm();
458
459 /* On "small" machines use small pool page sizes where possible */
460 use_std = (physmem < atop(16*1024*1024));
461
462 /*
463 * Also use them on systems that can map the pool pages using
464 * a direct-mapped segment.
465 */
466 #ifdef PMAP_MAP_POOLPAGE
467 use_std = 1;
468 #endif
469
470 buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
471 "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
472 bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
473 "biopl", NULL, IPL_BIO, NULL, NULL, NULL);
474
475 bufmempool_allocator.pa_backingmap = buf_map;
476 for (i = 0; i < NMEMPOOLS; i++) {
477 struct pool_allocator *pa;
478 struct pool *pp = &bmempools[i];
479 u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
480 char *name = kmem_alloc(8, KM_SLEEP);
481 if (__predict_true(size >= 1024))
482 (void)snprintf(name, 8, "buf%dk", size / 1024);
483 else
484 (void)snprintf(name, 8, "buf%db", size);
485 pa = (size <= PAGE_SIZE && use_std)
486 ? &pool_allocator_nointr
487 : &bufmempool_allocator;
488 pool_init(pp, size, 0, 0, 0, name, pa, IPL_NONE);
489 pool_setlowat(pp, 1);
490 pool_sethiwat(pp, 1);
491 }
492
493 /* Initialize the buffer queues */
494 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
495 TAILQ_INIT(&dp->bq_queue);
496 dp->bq_bytes = 0;
497 }
498
499 /*
500 * Estimate hash table size based on the amount of memory we
501 * intend to use for the buffer cache. The average buffer
502 * size is dependent on our clients (i.e. filesystems).
503 *
504 * For now, use an empirical 3K per buffer.
505 */
506 nbuf = (bufmem_hiwater / 1024) / 3;
507 bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);
508 }
509
510 void
511 bufinit2(void)
512 {
513
514 biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
515 NULL);
516 if (biodone_sih == NULL)
517 panic("bufinit2: can't establish soft interrupt");
518 }
519
520 static int
521 buf_lotsfree(void)
522 {
523 int try, thresh;
524
525 /* Always allocate if less than the low water mark. */
526 if (bufmem < bufmem_lowater)
527 return 1;
528
529 /* Never allocate if greater than the high water mark. */
530 if (bufmem > bufmem_hiwater)
531 return 0;
532
533 /* If there's anything on the AGE list, it should be eaten. */
534 if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
535 return 0;
536
537 /*
 538  * The probability of getting a new allocation is inversely
539 * proportional to the current size of the cache, using
540 * a granularity of 16 steps.
541 */
542 try = random() & 0x0000000fL;
543
544 /* Don't use "16 * bufmem" here to avoid a 32-bit overflow. */
545 thresh = (bufmem - bufmem_lowater) /
546 ((bufmem_hiwater - bufmem_lowater) / 16);
547
548 if (try >= thresh)
549 return 1;
550
551 /* Otherwise don't allocate. */
552 return 0;
553 }
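/*
 * Worked example (an addition, not part of the original source):
 * suppose bufmem_lowater = 4MB, bufmem_hiwater = 36MB and
 * bufmem = 20MB.  One step is (36MB - 4MB) / 16 = 2MB, so
 * thresh = (20MB - 4MB) / 2MB = 8, and an allocation is granted when
 * (random() & 0xf) >= 8, i.e. with probability 8/16.  As bufmem climbs
 * toward bufmem_hiwater, thresh approaches 16 and new allocations
 * become correspondingly rare.
 */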
554
555 /*
556 * Return estimate of bytes we think need to be
557 * released to help resolve low memory conditions.
558 *
559 * => called with bufcache_lock held.
560 */
561 static int
562 buf_canrelease(void)
563 {
564 int pagedemand, ninvalid = 0;
565
566 KASSERT(mutex_owned(&bufcache_lock));
567
568 if (bufmem < bufmem_lowater)
569 return 0;
570
571 if (bufmem > bufmem_hiwater)
572 return bufmem - bufmem_hiwater;
573
574 ninvalid += bufqueues[BQ_AGE].bq_bytes;
575
576 pagedemand = uvmexp.freetarg - uvmexp.free;
577 if (pagedemand < 0)
578 return ninvalid;
579 return MAX(ninvalid, MIN(2 * MAXBSIZE,
580 MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
581 }
582
583 /*
584 * Buffer memory allocation helper functions
585 */
586 static u_long
587 buf_mempoolidx(u_long size)
588 {
589 u_int n = 0;
590
591 size -= 1;
592 size >>= MEMPOOL_INDEX_OFFSET;
593 while (size) {
594 size >>= 1;
595 n += 1;
596 }
597 if (n >= NMEMPOOLS)
598 panic("buf mem pool index %d", n);
599 return n;
600 }
601
602 static u_long
603 buf_roundsize(u_long size)
604 {
605 /* Round up to nearest power of 2 */
606 return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
607 }
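/*
 * Worked examples (an addition, not part of the original source):
 * buf_mempoolidx() is in effect ceil(log2(size)) - MEMPOOL_INDEX_OFFSET:
 *
 *	buf_mempoolidx(512)  == 0	(512 byte pool)
 *	buf_mempoolidx(513)  == 1	(1KB pool)
 *	buf_mempoolidx(3000) == 3	(4KB pool), hence
 *	buf_roundsize(3000)  == 4096
 */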
608
609 static void *
610 buf_malloc(size_t size)
611 {
612 u_int n = buf_mempoolidx(size);
613 void *addr;
614
615 while (1) {
616 addr = pool_get(&bmempools[n], PR_NOWAIT);
617 if (addr != NULL)
618 break;
619
620 /* No memory, see if we can free some. If so, try again */
621 mutex_enter(&bufcache_lock);
622 if (buf_drain(1) > 0) {
623 mutex_exit(&bufcache_lock);
624 continue;
625 }
626
627 if (curlwp == uvm.pagedaemon_lwp) {
628 mutex_exit(&bufcache_lock);
629 return NULL;
630 }
631
632 /* Wait for buffers to arrive on the LRU queue */
633 cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
634 mutex_exit(&bufcache_lock);
635 }
636
637 return addr;
638 }
639
640 static void
641 buf_mrelease(void *addr, size_t size)
642 {
643
644 pool_put(&bmempools[buf_mempoolidx(size)], addr);
645 }
646
647 /*
648 * bread()/breadn() helper.
649 */
650 static buf_t *
651 bio_doread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
652 int async)
653 {
654 buf_t *bp;
655 struct mount *mp;
656
657 bp = getblk(vp, blkno, size, 0, 0);
658
659 #ifdef DIAGNOSTIC
660 if (bp == NULL) {
661 panic("bio_doread: no such buf");
662 }
663 #endif
664
665 /*
666 * If buffer does not have data valid, start a read.
667 * Note that if buffer is BC_INVAL, getblk() won't return it.
668 * Therefore, it's valid if its I/O has completed or been delayed.
669 */
670 if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
671 /* Start I/O for the buffer. */
672 SET(bp->b_flags, B_READ | async);
673 if (async)
674 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
675 else
676 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
677 VOP_STRATEGY(vp, bp);
678
679 /* Pay for the read. */
680 curlwp->l_ru.ru_inblock++;
681 } else if (async)
682 brelse(bp, 0);
683
684 if (vp->v_type == VBLK)
685 mp = vp->v_specmountpoint;
686 else
687 mp = vp->v_mount;
688
689 /*
690 * Collect statistics on synchronous and asynchronous reads.
691 * Reads from block devices are charged to their associated
692 * filesystem (if any).
693 */
694 if (mp != NULL) {
695 if (async == 0)
696 mp->mnt_stat.f_syncreads++;
697 else
698 mp->mnt_stat.f_asyncreads++;
699 }
700
701 return (bp);
702 }
703
704 /*
705 * Read a disk block.
706 * This algorithm described in Bach (p.54).
707 */
708 int
709 bread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
710 int flags, buf_t **bpp)
711 {
712 buf_t *bp;
713 int error;
714
715 /* Get buffer for block. */
716 bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
717
718 /* Wait for the read to complete, and return result. */
719 error = biowait(bp);
720 if (error == 0 && (flags & B_MODIFY) != 0) /* XXXX before the next code block or after? */
721 error = fscow_run(bp, true);
722
723 if (!error) {
724 struct mount *mp = wapbl_vptomp(vp);
725
726 if (mp && mp->mnt_wapbl_replay &&
727 WAPBL_REPLAY_ISOPEN(mp)) {
728 error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno,
729 bp->b_bcount);
730 if (error) {
731 mutex_enter(&bufcache_lock);
732 SET(bp->b_cflags, BC_INVAL);
733 mutex_exit(&bufcache_lock);
734 }
735 }
736 }
737 return error;
738 }
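/*
 * Usage sketch (an addition, not part of the original source; the
 * caller shown is hypothetical).  A typical synchronous read of one
 * file system block looks like this.  Note that in this revision *bpp
 * is set even when bread() fails, so the caller still releases the
 * buffer on error:
 */
#if 0	/* example only */
	buf_t *bp;
	int error;

	error = bread(vp, lbn, bsize, NOCRED, 0, &bp);
	if (error) {
		brelse(bp, 0);	/* buffer is returned even on error */
		return error;
	}
	/* consume bp->b_data here */
	brelse(bp, 0);		/* put it back on the free lists */
#endif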
739
740 /*
741 * Read-ahead multiple disk blocks. The first is sync, the rest async.
742 * Trivial modification to the breada algorithm presented in Bach (p.55).
743 */
744 int
745 breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
746 int *rasizes, int nrablks, kauth_cred_t cred, int flags, buf_t **bpp)
747 {
748 buf_t *bp;
749 int error, i;
750
751 bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
752
753 /*
754 * For each of the read-ahead blocks, start a read, if necessary.
755 */
756 mutex_enter(&bufcache_lock);
757 for (i = 0; i < nrablks; i++) {
758 /* If it's in the cache, just go on to next one. */
759 if (incore(vp, rablks[i]))
760 continue;
761
762 /* Get a buffer for the read-ahead block */
763 mutex_exit(&bufcache_lock);
764 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC);
765 mutex_enter(&bufcache_lock);
766 }
767 mutex_exit(&bufcache_lock);
768
769 /* Otherwise, we had to start a read for it; wait until it's valid. */
770 error = biowait(bp);
771 if (error == 0 && (flags & B_MODIFY) != 0)
772 error = fscow_run(bp, true);
773 return error;
774 }
775
776 /*
777 * Read with single-block read-ahead. Defined in Bach (p.55), but
778 * implemented as a call to breadn().
779 * XXX for compatibility with old file systems.
780 */
781 int
782 breada(struct vnode *vp, daddr_t blkno, int size, daddr_t rablkno,
783 int rabsize, kauth_cred_t cred, int flags, buf_t **bpp)
784 {
785
786 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1,
787 cred, flags, bpp));
788 }
789
790 /*
791 * Block write. Described in Bach (p.56)
792 */
793 int
794 bwrite(buf_t *bp)
795 {
796 int rv, sync, wasdelayed;
797 struct vnode *vp;
798 struct mount *mp;
799
800 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
801 KASSERT(!cv_has_waiters(&bp->b_done));
802
803 vp = bp->b_vp;
804 if (vp != NULL) {
805 KASSERT(bp->b_objlock == &vp->v_interlock);
806 if (vp->v_type == VBLK)
807 mp = vp->v_specmountpoint;
808 else
809 mp = vp->v_mount;
810 } else {
811 mp = NULL;
812 }
813
814 if (mp && mp->mnt_wapbl) {
815 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
816 bdwrite(bp);
817 return 0;
818 }
819 }
820
821 /*
822 * Remember buffer type, to switch on it later. If the write was
823 * synchronous, but the file system was mounted with MNT_ASYNC,
824 * convert it to a delayed write.
825 * XXX note that this relies on delayed tape writes being converted
826 * to async, not sync writes (which is safe, but ugly).
827 */
828 sync = !ISSET(bp->b_flags, B_ASYNC);
829 if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
830 bdwrite(bp);
831 return (0);
832 }
833
834 /*
835 * Collect statistics on synchronous and asynchronous writes.
836 * Writes to block devices are charged to their associated
837 * filesystem (if any).
838 */
839 if (mp != NULL) {
840 if (sync)
841 mp->mnt_stat.f_syncwrites++;
842 else
843 mp->mnt_stat.f_asyncwrites++;
844 }
845
846 /*
847 * Pay for the I/O operation and make sure the buf is on the correct
848 * vnode queue.
849 */
850 bp->b_error = 0;
851 wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
852 CLR(bp->b_flags, B_READ);
853 if (wasdelayed) {
854 mutex_enter(&bufcache_lock);
855 mutex_enter(bp->b_objlock);
856 CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
857 reassignbuf(bp, bp->b_vp);
858 mutex_exit(&bufcache_lock);
859 } else {
860 curlwp->l_ru.ru_oublock++;
861 mutex_enter(bp->b_objlock);
862 CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
863 }
864 if (vp != NULL)
865 vp->v_numoutput++;
866 mutex_exit(bp->b_objlock);
867
868 /* Initiate disk write. */
869 if (sync)
870 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
871 else
872 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
873
874 VOP_STRATEGY(vp, bp);
875
876 if (sync) {
877 /* If I/O was synchronous, wait for it to complete. */
878 rv = biowait(bp);
879
880 /* Release the buffer. */
881 brelse(bp, 0);
882
883 return (rv);
884 } else {
885 return (0);
886 }
887 }
888
889 int
890 vn_bwrite(void *v)
891 {
892 struct vop_bwrite_args *ap = v;
893
894 return (bwrite(ap->a_bp));
895 }
896
897 /*
898 * Delayed write.
899 *
900 * The buffer is marked dirty, but is not queued for I/O.
901 * This routine should be used when the buffer is expected
902 * to be modified again soon, typically a small write that
903 * partially fills a buffer.
904 *
905 * NB: magnetic tapes cannot be delayed; they must be
906 * written in the order that the writes are requested.
907 *
908 * Described in Leffler, et al. (pp. 208-213).
909 */
910 void
911 bdwrite(buf_t *bp)
912 {
913
914 KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
915 bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
916 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
917 KASSERT(!cv_has_waiters(&bp->b_done));
918
919 /* If this is a tape block, write the block now. */
920 if (bdev_type(bp->b_dev) == D_TAPE) {
921 bawrite(bp);
922 return;
923 }
924
925 if (wapbl_vphaswapbl(bp->b_vp)) {
926 struct mount *mp = wapbl_vptomp(bp->b_vp);
927
928 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
929 WAPBL_ADD_BUF(mp, bp);
930 }
931 }
932
933 /*
934 * If the block hasn't been seen before:
935 * (1) Mark it as having been seen,
936 * (2) Charge for the write,
937 * (3) Make sure it's on its vnode's correct block list.
938 */
939 KASSERT(bp->b_vp == NULL || bp->b_objlock == &bp->b_vp->v_interlock);
940
941 if (!ISSET(bp->b_oflags, BO_DELWRI)) {
942 mutex_enter(&bufcache_lock);
943 mutex_enter(bp->b_objlock);
944 SET(bp->b_oflags, BO_DELWRI);
945 curlwp->l_ru.ru_oublock++;
946 reassignbuf(bp, bp->b_vp);
947 mutex_exit(&bufcache_lock);
948 } else {
949 mutex_enter(bp->b_objlock);
950 }
951 /* Otherwise, the "write" is done, so mark and release the buffer. */
952 CLR(bp->b_oflags, BO_DONE);
953 mutex_exit(bp->b_objlock);
954
955 brelse(bp, 0);
956 }
957
958 /*
959 * Asynchronous block write; just an asynchronous bwrite().
960 */
961 void
962 bawrite(buf_t *bp)
963 {
964
965 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
966
967 SET(bp->b_flags, B_ASYNC);
968 VOP_BWRITE(bp);
969 }
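/*
 * Summary (an addition, not part of the original source): the three
 * write paths above differ in when the data reaches the disk and who
 * releases the buffer:
 *
 *	bwrite(bp)	start the I/O now; if synchronous, biowait()
 *			for it and brelse() before returning.
 *	bdwrite(bp)	mark the buffer BO_DELWRI and brelse(); the
 *			actual write happens later, e.g. when the
 *			buffer is reclaimed by getnewbuf().
 *	bawrite(bp)	set B_ASYNC and bwrite(); the I/O is started
 *			but not waited for, and biodone2() releases
 *			the buffer when it completes.
 */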
970
971 /*
972 * Same as first half of bdwrite, mark buffer dirty, but do not release it.
973 * Call with the buffer interlock held.
974 *
975 * Note: called only from biodone() through ffs softdep's io_complete()
976 * Note2: smbfs also learned about bdirty().
977 */
978 void
979 bdirty(buf_t *bp)
980 {
981
982 KASSERT(mutex_owned(&bufcache_lock));
983 KASSERT(bp->b_objlock == &bp->b_vp->v_interlock);
984 KASSERT(mutex_owned(bp->b_objlock));
985 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
986
987 CLR(bp->b_cflags, BC_AGE);
988
989 if (!ISSET(bp->b_oflags, BO_DELWRI)) {
990 SET(bp->b_oflags, BO_DELWRI);
991 curlwp->l_ru.ru_oublock++;
992 reassignbuf(bp, bp->b_vp);
993 }
994 }
995
996
997 /*
998 * Release a buffer on to the free lists.
999 * Described in Bach (p. 46).
1000 */
1001 void
1002 brelsel(buf_t *bp, int set)
1003 {
1004 struct bqueue *bufq;
1005 struct vnode *vp;
1006
1007 KASSERT(mutex_owned(&bufcache_lock));
1008 KASSERT(!cv_has_waiters(&bp->b_done));
1009 KASSERT(bp->b_refcnt > 0);
1010
1011 SET(bp->b_cflags, set);
1012
1013 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1014 KASSERT(bp->b_iodone == NULL);
1015
1016 /* Wake up any processes waiting for any buffer to become free. */
1017 cv_signal(&needbuffer_cv);
1018
1019 	/* Wake up any processes waiting for _this_ buffer to become free. */
1020 if (ISSET(bp->b_cflags, BC_WANTED))
1021 CLR(bp->b_cflags, BC_WANTED|BC_AGE);
1022
1023 /*
1024 * Determine which queue the buffer should be on, then put it there.
1025 */
1026
1027 /* If it's locked, don't report an error; try again later. */
1028 if (ISSET(bp->b_flags, B_LOCKED))
1029 bp->b_error = 0;
1030
1031 /* If it's not cacheable, or an error, mark it invalid. */
1032 if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0)
1033 SET(bp->b_cflags, BC_INVAL);
1034
1035 if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1036 /*
1037 * This is a delayed write buffer that was just flushed to
1038 * disk. It is still on the LRU queue. If it's become
1039 * invalid, then we need to move it to a different queue;
1040 * otherwise leave it in its current position.
1041 */
1042 CLR(bp->b_cflags, BC_VFLUSH);
1043 if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) &&
1044 !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) {
1045 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1));
1046 goto already_queued;
1047 } else {
1048 bremfree(bp);
1049 }
1050 }
1051
1052 KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0));
1053 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0));
1054 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0));
1055
1056 if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) {
1057 /*
1058 * If it's invalid or empty, dissociate it from its vnode
1059 * and put on the head of the appropriate queue.
1060 */
1061 if (bioopsp != NULL)
1062 (*bioopsp->io_deallocate)(bp);
1063
1064 if (ISSET(bp->b_flags, B_LOCKED)) {
1065 if (wapbl_vphaswapbl(vp = bp->b_vp)) {
1066 struct mount *mp = wapbl_vptomp(vp);
1067
1068 KASSERT(bp->b_iodone
1069 != mp->mnt_wapbl_op->wo_wapbl_biodone);
1070 WAPBL_REMOVE_BUF(mp, bp);
1071 }
1072 }
1073
1074 mutex_enter(bp->b_objlock);
1075 CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
1076 if ((vp = bp->b_vp) != NULL) {
1077 KASSERT(bp->b_objlock == &vp->v_interlock);
1078 reassignbuf(bp, bp->b_vp);
1079 brelvp(bp);
1080 mutex_exit(&vp->v_interlock);
1081 } else {
1082 KASSERT(bp->b_objlock == &buffer_lock);
1083 mutex_exit(bp->b_objlock);
1084 }
1085
1086 if (bp->b_bufsize <= 0)
1087 /* no data */
1088 goto already_queued;
1089 else
1090 /* invalid data */
1091 bufq = &bufqueues[BQ_AGE];
1092 binsheadfree(bp, bufq);
1093 } else {
1094 /*
1095 * It has valid data. Put it on the end of the appropriate
1096 * queue, so that it'll stick around for as long as possible.
1097 * If buf is AGE, but has dependencies, must put it on last
1098 		 * bufqueue to be scanned, i.e. LRU. This protects against the
1099 * livelock where BQ_AGE only has buffers with dependencies,
1100 * and we thus never get to the dependent buffers in BQ_LRU.
1101 */
1102 if (ISSET(bp->b_flags, B_LOCKED)) {
1103 /* locked in core */
1104 bufq = &bufqueues[BQ_LOCKED];
1105 } else if (!ISSET(bp->b_cflags, BC_AGE)) {
1106 /* valid data */
1107 bufq = &bufqueues[BQ_LRU];
1108 } else {
1109 /* stale but valid data */
1110 int has_deps;
1111
1112 if (bioopsp != NULL)
1113 has_deps = (*bioopsp->io_countdeps)(bp, 0);
1114 else
1115 has_deps = 0;
1116 bufq = has_deps ? &bufqueues[BQ_LRU] :
1117 &bufqueues[BQ_AGE];
1118 }
1119 binstailfree(bp, bufq);
1120 }
1121 already_queued:
1122 /* Unlock the buffer. */
1123 CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE);
1124 CLR(bp->b_flags, B_ASYNC);
1125 cv_broadcast(&bp->b_busy);
1126
1127 if (bp->b_bufsize <= 0)
1128 brele(bp);
1129 }
1130
1131 void
1132 brelse(buf_t *bp, int set)
1133 {
1134
1135 mutex_enter(&bufcache_lock);
1136 brelsel(bp, set);
1137 mutex_exit(&bufcache_lock);
1138 }
1139
1140 /*
1141 * Determine if a block is in the cache.
1142 * Just look on what would be its hash chain. If it's there, return
1143  * a pointer to it, unless it's marked invalid; buffers marked
1144  * BC_INVAL are never returned.
1146 */
1147 buf_t *
1148 incore(struct vnode *vp, daddr_t blkno)
1149 {
1150 buf_t *bp;
1151
1152 KASSERT(mutex_owned(&bufcache_lock));
1153
1154 /* Search hash chain */
1155 LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
1156 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
1157 !ISSET(bp->b_cflags, BC_INVAL)) {
1158 KASSERT(bp->b_objlock == &vp->v_interlock);
1159 return (bp);
1160 }
1161 }
1162
1163 return (NULL);
1164 }
1165
1166 /*
1167 * Get a block of requested size that is associated with
1168 * a given vnode and block offset. If it is found in the
1169 * block cache, mark it as having been found, make it busy
1170 * and return it. Otherwise, return an empty block of the
1171  * correct size. It is up to the caller to ensure that the
1172  * cached blocks are of the correct size.
1173 */
1174 buf_t *
1175 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1176 {
1177 int err, preserve;
1178 buf_t *bp;
1179
1180 mutex_enter(&bufcache_lock);
1181 loop:
1182 bp = incore(vp, blkno);
1183 if (bp != NULL) {
1184 err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL);
1185 if (err != 0) {
1186 if (err == EPASSTHROUGH)
1187 goto loop;
1188 mutex_exit(&bufcache_lock);
1189 return (NULL);
1190 }
1191 KASSERT(!cv_has_waiters(&bp->b_done));
1192 #ifdef DIAGNOSTIC
1193 if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) &&
1194 bp->b_bcount < size && vp->v_type != VBLK)
1195 panic("getblk: block size invariant failed");
1196 #endif
1197 bremfree(bp);
1198 preserve = 1;
1199 } else {
1200 if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL)
1201 goto loop;
1202
1203 if (incore(vp, blkno) != NULL) {
1204 /* The block has come into memory in the meantime. */
1205 brelsel(bp, 0);
1206 goto loop;
1207 }
1208
1209 LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
1210 bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
1211 mutex_enter(&vp->v_interlock);
1212 bgetvp(vp, bp);
1213 mutex_exit(&vp->v_interlock);
1214 preserve = 0;
1215 }
1216 mutex_exit(&bufcache_lock);
1217
1218 /*
1219 * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
1220 * if we re-size buffers here.
1221 */
1222 if (ISSET(bp->b_flags, B_LOCKED)) {
1223 KASSERT(bp->b_bufsize >= size);
1224 } else {
1225 if (allocbuf(bp, size, preserve)) {
1226 mutex_enter(&bufcache_lock);
1227 LIST_REMOVE(bp, b_hash);
1228 mutex_exit(&bufcache_lock);
1229 brelse(bp, BC_INVAL);
1230 return NULL;
1231 }
1232 }
1233 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1234 return (bp);
1235 }
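/*
 * Usage sketch (an addition, not part of the original source; the
 * caller shown is hypothetical).  getblk() rather than bread() is the
 * right call when the caller will overwrite the whole block and has
 * no need to read its old contents:
 */
#if 0	/* example only */
	buf_t *bp;

	bp = getblk(vp, lbn, bsize, 0, 0);
	if (bp == NULL)
		return ENOMEM;	/* allocbuf() failed, or wait interrupted */
	memset(bp->b_data, 0, bsize);
	/* fill in the new contents, then: */
	(void)bwrite(bp);	/* or bdwrite(bp) to delay the write */
#endif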
1236
1237 /*
1238 * Get an empty, disassociated buffer of given size.
1239 */
1240 buf_t *
1241 geteblk(int size)
1242 {
1243 buf_t *bp;
1244 int error;
1245
1246 mutex_enter(&bufcache_lock);
1247 while ((bp = getnewbuf(0, 0, 0)) == NULL)
1248 ;
1249
1250 SET(bp->b_cflags, BC_INVAL);
1251 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1252 mutex_exit(&bufcache_lock);
1253 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1254 error = allocbuf(bp, size, 0);
1255 KASSERT(error == 0);
1256 return (bp);
1257 }
1258
1259 /*
1260 * Expand or contract the actual memory allocated to a buffer.
1261 *
1262 * If the buffer shrinks, data is lost, so it's up to the
1263 * caller to have written it out *first*; this routine will not
1264 * start a write. If the buffer grows, it's the callers
1265 * responsibility to fill out the buffer's additional contents.
1266 */
1267 int
1268 allocbuf(buf_t *bp, int size, int preserve)
1269 {
1270 void *addr;
1271 vsize_t oldsize, desired_size;
1272 int oldcount;
1273 int delta;
1274
1275 desired_size = buf_roundsize(size);
1276 if (desired_size > MAXBSIZE)
1277 printf("allocbuf: buffer larger than MAXBSIZE requested");
1278
1279 oldcount = bp->b_bcount;
1280
1281 bp->b_bcount = size;
1282
1283 oldsize = bp->b_bufsize;
1284 if (oldsize == desired_size)
1285 goto out;
1286
1287 /*
1288 * If we want a buffer of a different size, re-allocate the
1289 * buffer's memory; copy old content only if needed.
1290 */
1291 addr = buf_malloc(desired_size);
1292 if (addr == NULL)
1293 return ENOMEM;
1294 if (preserve)
1295 memcpy(addr, bp->b_data, MIN(oldsize,desired_size));
1296 if (bp->b_data != NULL)
1297 buf_mrelease(bp->b_data, oldsize);
1298 bp->b_data = addr;
1299 bp->b_bufsize = desired_size;
1300
1301 /*
1302 * Update overall buffer memory counter (protected by bufcache_lock)
1303 */
1304 delta = (long)desired_size - (long)oldsize;
1305
1306 mutex_enter(&bufcache_lock);
1307 if ((bufmem += delta) > bufmem_hiwater) {
1308 /*
1309 * Need to trim overall memory usage.
1310 */
1311 while (buf_canrelease()) {
1312 if (curcpu()->ci_schedstate.spc_flags &
1313 SPCF_SHOULDYIELD) {
1314 mutex_exit(&bufcache_lock);
1315 preempt();
1316 mutex_enter(&bufcache_lock);
1317 }
1318 if (buf_trim() == 0)
1319 break;
1320 }
1321 }
1322 mutex_exit(&bufcache_lock);
1323
1324 out:
1325 if (wapbl_vphaswapbl(bp->b_vp))
1326 WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);
1327
1328 return 0;
1329 }
1330
1331 /*
1332 * Find a buffer which is available for use.
1333 * Select something from a free list.
1334 * Preference is to AGE list, then LRU list.
1335 *
1336 * Called with the buffer queues locked.
1337 * Return buffer locked.
1338 */
1339 buf_t *
1340 getnewbuf(int slpflag, int slptimeo, int from_bufq)
1341 {
1342 buf_t *bp;
1343 struct vnode *vp;
1344
1345 start:
1346 KASSERT(mutex_owned(&bufcache_lock));
1347
1348 /*
1349 * Get a new buffer from the pool.
1350 */
1351 if (!from_bufq && buf_lotsfree()) {
1352 mutex_exit(&bufcache_lock);
1353 bp = pool_cache_get(buf_cache, PR_NOWAIT);
1354 if (bp != NULL) {
1355 memset((char *)bp, 0, sizeof(*bp));
1356 buf_init(bp);
1357 SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */
1358 mutex_enter(&bufcache_lock);
1359 #if defined(DIAGNOSTIC)
1360 bp->b_freelistindex = -1;
1361 #endif /* defined(DIAGNOSTIC) */
1362 return (bp);
1363 }
1364 mutex_enter(&bufcache_lock);
1365 }
1366
1367 KASSERT(mutex_owned(&bufcache_lock));
1368 if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL ||
1369 (bp = TAILQ_FIRST(&bufqueues[BQ_LRU].bq_queue)) != NULL) {
1370 KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH));
1371 bremfree(bp);
1372
1373 /* Buffer is no longer on free lists. */
1374 SET(bp->b_cflags, BC_BUSY);
1375 } else {
1376 /*
1377 * XXX: !from_bufq should be removed.
1378 */
1379 if (!from_bufq || curlwp != uvm.pagedaemon_lwp) {
1380 /* wait for a free buffer of any kind */
1381 if ((slpflag & PCATCH) != 0)
1382 (void)cv_timedwait_sig(&needbuffer_cv,
1383 &bufcache_lock, slptimeo);
1384 else
1385 (void)cv_timedwait(&needbuffer_cv,
1386 &bufcache_lock, slptimeo);
1387 }
1388 return (NULL);
1389 }
1390
1391 #ifdef DIAGNOSTIC
1392 if (bp->b_bufsize <= 0)
1393 panic("buffer %p: on queue but empty", bp);
1394 #endif
1395
1396 if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1397 /*
1398 * This is a delayed write buffer being flushed to disk. Make
1399 * sure it gets aged out of the queue when it's finished, and
1400 * leave it off the LRU queue.
1401 */
1402 CLR(bp->b_cflags, BC_VFLUSH);
1403 SET(bp->b_cflags, BC_AGE);
1404 goto start;
1405 }
1406
1407 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1408 KASSERT(bp->b_refcnt > 0);
1409 KASSERT(!cv_has_waiters(&bp->b_done));
1410
1411 /*
1412 * If buffer was a delayed write, start it and return NULL
1413 * (since we might sleep while starting the write).
1414 */
1415 if (ISSET(bp->b_oflags, BO_DELWRI)) {
1416 /*
1417 * This buffer has gone through the LRU, so make sure it gets
1418 * reused ASAP.
1419 */
1420 SET(bp->b_cflags, BC_AGE);
1421 mutex_exit(&bufcache_lock);
1422 bawrite(bp);
1423 mutex_enter(&bufcache_lock);
1424 return (NULL);
1425 }
1426
1427 vp = bp->b_vp;
1428 if (bioopsp != NULL)
1429 (*bioopsp->io_deallocate)(bp);
1430
1431 /* clear out various other fields */
1432 bp->b_cflags = BC_BUSY;
1433 bp->b_oflags = 0;
1434 bp->b_flags = 0;
1435 bp->b_dev = NODEV;
1436 bp->b_blkno = 0;
1437 bp->b_lblkno = 0;
1438 bp->b_rawblkno = 0;
1439 bp->b_iodone = 0;
1440 bp->b_error = 0;
1441 bp->b_resid = 0;
1442 bp->b_bcount = 0;
1443
1444 LIST_REMOVE(bp, b_hash);
1445
1446 /* Disassociate us from our vnode, if we had one... */
1447 if (vp != NULL) {
1448 mutex_enter(&vp->v_interlock);
1449 brelvp(bp);
1450 mutex_exit(&vp->v_interlock);
1451 }
1452
1453 return (bp);
1454 }
1455
1456 /*
1457 * Attempt to free an aged buffer off the queues.
1458 * Called with queue lock held.
1459 * Returns the amount of buffer memory freed.
1460 */
1461 static int
1462 buf_trim(void)
1463 {
1464 buf_t *bp;
1465 long size = 0;
1466
1467 KASSERT(mutex_owned(&bufcache_lock));
1468
1469 /* Instruct getnewbuf() to get buffers off the queues */
1470 if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL)
1471 return 0;
1472
1473 KASSERT((bp->b_cflags & BC_WANTED) == 0);
1474 size = bp->b_bufsize;
1475 bufmem -= size;
1476 if (size > 0) {
1477 buf_mrelease(bp->b_data, size);
1478 bp->b_bcount = bp->b_bufsize = 0;
1479 }
1480 /* brelse() will return the buffer to the global buffer pool */
1481 brelsel(bp, 0);
1482 return size;
1483 }
1484
1485 int
1486 buf_drain(int n)
1487 {
1488 int size = 0, sz;
1489
1490 KASSERT(mutex_owned(&bufcache_lock));
1491
1492 while (size < n && bufmem > bufmem_lowater) {
1493 sz = buf_trim();
1494 if (sz <= 0)
1495 break;
1496 size += sz;
1497 }
1498
1499 return size;
1500 }
1501
1502 /*
1503 * Wait for operations on the buffer to complete.
1504 * When they do, extract and return the I/O's error value.
1505 */
1506 int
1507 biowait(buf_t *bp)
1508 {
1509
1510 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1511 KASSERT(bp->b_refcnt > 0);
1512
1513 mutex_enter(bp->b_objlock);
1514 while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI))
1515 cv_wait(&bp->b_done, bp->b_objlock);
1516 mutex_exit(bp->b_objlock);
1517
1518 return bp->b_error;
1519 }
1520
1521 /*
1522 * Mark I/O complete on a buffer.
1523 *
1524 * If a callback has been requested, e.g. the pageout
1525 * daemon, do so. Otherwise, awaken waiting processes.
1526 *
1527 * [ Leffler, et al., says on p.247:
1528 * "This routine wakes up the blocked process, frees the buffer
1529 * for an asynchronous write, or, for a request by the pagedaemon
1530 * process, invokes a procedure specified in the buffer structure" ]
1531 *
1532 * In real life, the pagedaemon (or other system processes) wants
1533 * to do async stuff to, and doesn't want the buffer brelse()'d.
1534 * (for swap pager, that puts swap buffers on the free lists (!!!),
1535 * for the vn device, that puts malloc'd buffers on the free lists!)
1536 */
1537 void
1538 biodone(buf_t *bp)
1539 {
1540 int s;
1541
1542 KASSERT(!ISSET(bp->b_oflags, BO_DONE));
1543
1544 if (cpu_intr_p()) {
1545 /* From interrupt mode: defer to a soft interrupt. */
1546 s = splvm();
1547 TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq);
1548 softint_schedule(biodone_sih);
1549 splx(s);
1550 } else {
1551 /* Process now - the buffer may be freed soon. */
1552 biodone2(bp);
1553 }
1554 }
1555
1556 static void
1557 biodone2(buf_t *bp)
1558 {
1559 void (*callout)(buf_t *);
1560
1561 if (bioopsp != NULL)
1562 (*bioopsp->io_complete)(bp);
1563
1564 mutex_enter(bp->b_objlock);
1565 /* Note that the transfer is done. */
1566 if (ISSET(bp->b_oflags, BO_DONE))
1567 panic("biodone2 already");
1568 CLR(bp->b_flags, B_COWDONE);
1569 SET(bp->b_oflags, BO_DONE);
1570 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1571
1572 /* Wake up waiting writers. */
1573 if (!ISSET(bp->b_flags, B_READ))
1574 vwakeup(bp);
1575
1576 if ((callout = bp->b_iodone) != NULL) {
1577 /* Note callout done, then call out. */
1578 KASSERT(!cv_has_waiters(&bp->b_done));
1579 KERNEL_LOCK(1, NULL); /* XXXSMP */
1580 bp->b_iodone = NULL;
1581 mutex_exit(bp->b_objlock);
1582 (*callout)(bp);
1583 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1584 } else if (ISSET(bp->b_flags, B_ASYNC)) {
1585 /* If async, release. */
1586 KASSERT(!cv_has_waiters(&bp->b_done));
1587 mutex_exit(bp->b_objlock);
1588 brelse(bp, 0);
1589 } else {
1590 /* Otherwise just wake up waiters in biowait(). */
1591 cv_broadcast(&bp->b_done);
1592 mutex_exit(bp->b_objlock);
1593 }
1594 }
1595
1596 static void
1597 biointr(void *cookie)
1598 {
1599 struct cpu_info *ci;
1600 buf_t *bp;
1601 int s;
1602
1603 ci = curcpu();
1604
1605 while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) {
1606 KASSERT(curcpu() == ci);
1607
1608 s = splvm();
1609 bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone);
1610 TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq);
1611 splx(s);
1612
1613 biodone2(bp);
1614 }
1615 }
1616
1617 /*
1618 * Return a count of buffers on the "locked" queue.
1619 */
1620 int
1621 count_lock_queue(void)
1622 {
1623 buf_t *bp;
1624 int n = 0;
1625
1626 mutex_enter(&bufcache_lock);
1627 TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED].bq_queue, b_freelist)
1628 n++;
1629 mutex_exit(&bufcache_lock);
1630 return (n);
1631 }
1632
1633 /*
1634 * Wait for all buffers to complete I/O
1635 * Return the number of "stuck" buffers.
1636 */
1637 int
1638 buf_syncwait(void)
1639 {
1640 buf_t *bp;
1641 int iter, nbusy, nbusy_prev = 0, dcount, ihash;
1642
1643 dcount = 10000;
1644 for (iter = 0; iter < 20;) {
1645 mutex_enter(&bufcache_lock);
1646 nbusy = 0;
1647 for (ihash = 0; ihash < bufhash+1; ihash++) {
1648 LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
1649 if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY)
1650 nbusy += ((bp->b_flags & B_READ) == 0);
1651 /*
1652 * With soft updates, some buffers that are
1653 * written will be remarked as dirty until other
1654 * buffers are written.
1655 */
1656 if (bp->b_vp && bp->b_vp->v_mount
1657 && (bp->b_vp->v_mount->mnt_flag & MNT_SOFTDEP)
1658 && (bp->b_oflags & BO_DELWRI)) {
1659 bremfree(bp);
1660 bp->b_cflags |= BC_BUSY;
1661 nbusy++;
1662 mutex_exit(&bufcache_lock);
1663 bawrite(bp);
1664 if (dcount-- <= 0) {
1665 printf("softdep ");
1666 goto fail;
1667 }
1668 mutex_enter(&bufcache_lock);
1669 }
1670 }
1671 }
1672 mutex_exit(&bufcache_lock);
1673
1674 if (nbusy == 0)
1675 break;
1676 if (nbusy_prev == 0)
1677 nbusy_prev = nbusy;
1678 printf("%d ", nbusy);
1679 kpause("bflush", false, (iter == 0) ? 1 : hz / 25 * iter, NULL);
1680 if (nbusy >= nbusy_prev) /* we didn't flush anything */
1681 iter++;
1682 else
1683 nbusy_prev = nbusy;
1684 }
1685
1686 if (nbusy) {
1687 fail:;
1688 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
1689 printf("giving up\nPrinting vnodes for busy buffers\n");
1690 for (ihash = 0; ihash < bufhash+1; ihash++) {
1691 LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
1692 if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY &&
1693 (bp->b_flags & B_READ) == 0)
1694 vprint(NULL, bp->b_vp);
1695 }
1696 }
1697 #endif
1698 }
1699
1700 return nbusy;
1701 }
1702
1703 static void
1704 sysctl_fillbuf(buf_t *i, struct buf_sysctl *o)
1705 {
1706
1707 o->b_flags = i->b_flags | i->b_cflags | i->b_oflags;
1708 o->b_error = i->b_error;
1709 o->b_prio = i->b_prio;
1710 o->b_dev = i->b_dev;
1711 o->b_bufsize = i->b_bufsize;
1712 o->b_bcount = i->b_bcount;
1713 o->b_resid = i->b_resid;
1714 o->b_addr = PTRTOUINT64(i->b_data);
1715 o->b_blkno = i->b_blkno;
1716 o->b_rawblkno = i->b_rawblkno;
1717 o->b_iodone = PTRTOUINT64(i->b_iodone);
1718 o->b_proc = PTRTOUINT64(i->b_proc);
1719 o->b_vp = PTRTOUINT64(i->b_vp);
1720 o->b_saveaddr = PTRTOUINT64(i->b_saveaddr);
1721 o->b_lblkno = i->b_lblkno;
1722 }
1723
1724 #define KERN_BUFSLOP 20
1725 static int
1726 sysctl_dobuf(SYSCTLFN_ARGS)
1727 {
1728 buf_t *bp;
1729 struct buf_sysctl bs;
1730 struct bqueue *bq;
1731 char *dp;
1732 u_int i, op, arg;
1733 size_t len, needed, elem_size, out_size;
1734 int error, elem_count, retries;
1735
1736 if (namelen == 1 && name[0] == CTL_QUERY)
1737 return (sysctl_query(SYSCTLFN_CALL(rnode)));
1738
1739 if (namelen != 4)
1740 return (EINVAL);
1741
1742 retries = 100;
1743 retry:
1744 dp = oldp;
1745 len = (oldp != NULL) ? *oldlenp : 0;
1746 op = name[0];
1747 arg = name[1];
1748 elem_size = name[2];
1749 elem_count = name[3];
1750 out_size = MIN(sizeof(bs), elem_size);
1751
1752 /*
1753 * at the moment, these are just "placeholders" to make the
1754 * API for retrieving kern.buf data more extensible in the
1755 * future.
1756 *
1757 * XXX kern.buf currently has "netbsd32" issues. hopefully
1758 * these will be resolved at a later point.
1759 */
1760 if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL ||
1761 elem_size < 1 || elem_count < 0)
1762 return (EINVAL);
1763
1764 error = 0;
1765 needed = 0;
1766 sysctl_unlock();
1767 mutex_enter(&bufcache_lock);
1768 for (i = 0; i < BQUEUES; i++) {
1769 bq = &bufqueues[i];
1770 TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
1771 bq->bq_marker = bp;
1772 if (len >= elem_size && elem_count > 0) {
1773 sysctl_fillbuf(bp, &bs);
1774 mutex_exit(&bufcache_lock);
1775 error = copyout(&bs, dp, out_size);
1776 mutex_enter(&bufcache_lock);
1777 if (error)
1778 break;
1779 if (bq->bq_marker != bp) {
1780 /*
1781 * This sysctl node is only for
1782 * statistics. Retry; if the
1783 * queue keeps changing, then
1784 * bail out.
1785 */
1786 if (retries-- == 0) {
1787 error = EAGAIN;
1788 break;
1789 }
1790 mutex_exit(&bufcache_lock);
1791 goto retry;
1792 }
1793 dp += elem_size;
1794 len -= elem_size;
1795 }
1796 if (elem_count > 0) {
1797 needed += elem_size;
1798 if (elem_count != INT_MAX)
1799 elem_count--;
1800 }
1801 }
1802 if (error != 0)
1803 break;
1804 }
1805 mutex_exit(&bufcache_lock);
1806 sysctl_relock();
1807
1808 *oldlenp = needed;
1809 if (oldp == NULL)
1810 *oldlenp += KERN_BUFSLOP * sizeof(buf_t);
1811
1812 return (error);
1813 }
1814
1815 static int
1816 sysctl_bufvm_update(SYSCTLFN_ARGS)
1817 {
1818 int t, error, rv;
1819 struct sysctlnode node;
1820
1821 node = *rnode;
1822 node.sysctl_data = &t;
1823 t = *(int *)rnode->sysctl_data;
1824 error = sysctl_lookup(SYSCTLFN_CALL(&node));
1825 if (error || newp == NULL)
1826 return (error);
1827
1828 if (t < 0)
1829 return EINVAL;
1830 if (rnode->sysctl_data == &bufcache) {
1831 if (t > 100)
1832 return (EINVAL);
1833 bufcache = t;
1834 buf_setwm();
1835 } else if (rnode->sysctl_data == &bufmem_lowater) {
1836 if (bufmem_hiwater - t < 16)
1837 return (EINVAL);
1838 bufmem_lowater = t;
1839 } else if (rnode->sysctl_data == &bufmem_hiwater) {
1840 if (t - bufmem_lowater < 16)
1841 return (EINVAL);
1842 bufmem_hiwater = t;
1843 } else
1844 return (EINVAL);
1845
1846 /* Drain until below new high water mark */
1847 sysctl_unlock();
1848 mutex_enter(&bufcache_lock);
1849 while ((t = bufmem - bufmem_hiwater) >= 0) {
1850 rv = buf_drain(t / (2 * 1024));
1851 if (rv <= 0)
1852 break;
1853 }
1854 mutex_exit(&bufcache_lock);
1855 sysctl_relock();
1856
1857 return 0;
1858 }
1859
1860 SYSCTL_SETUP(sysctl_kern_buf_setup, "sysctl kern.buf subtree setup")
1861 {
1862
1863 sysctl_createv(clog, 0, NULL, NULL,
1864 CTLFLAG_PERMANENT,
1865 CTLTYPE_NODE, "kern", NULL,
1866 NULL, 0, NULL, 0,
1867 CTL_KERN, CTL_EOL);
1868 sysctl_createv(clog, 0, NULL, NULL,
1869 CTLFLAG_PERMANENT,
1870 CTLTYPE_NODE, "buf",
1871 SYSCTL_DESCR("Kernel buffer cache information"),
1872 sysctl_dobuf, 0, NULL, 0,
1873 CTL_KERN, KERN_BUF, CTL_EOL);
1874 }
1875
1876 SYSCTL_SETUP(sysctl_vm_buf_setup, "sysctl vm.buf* subtree setup")
1877 {
1878
1879 sysctl_createv(clog, 0, NULL, NULL,
1880 CTLFLAG_PERMANENT,
1881 CTLTYPE_NODE, "vm", NULL,
1882 NULL, 0, NULL, 0,
1883 CTL_VM, CTL_EOL);
1884
1885 sysctl_createv(clog, 0, NULL, NULL,
1886 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1887 CTLTYPE_INT, "bufcache",
1888 SYSCTL_DESCR("Percentage of physical memory to use for "
1889 "buffer cache"),
1890 sysctl_bufvm_update, 0, &bufcache, 0,
1891 CTL_VM, CTL_CREATE, CTL_EOL);
1892 sysctl_createv(clog, 0, NULL, NULL,
1893 CTLFLAG_PERMANENT|CTLFLAG_READONLY,
1894 CTLTYPE_INT, "bufmem",
1895 SYSCTL_DESCR("Amount of kernel memory used by buffer "
1896 "cache"),
1897 NULL, 0, &bufmem, 0,
1898 CTL_VM, CTL_CREATE, CTL_EOL);
1899 sysctl_createv(clog, 0, NULL, NULL,
1900 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1901 CTLTYPE_INT, "bufmem_lowater",
1902 SYSCTL_DESCR("Minimum amount of kernel memory to "
1903 "reserve for buffer cache"),
1904 sysctl_bufvm_update, 0, &bufmem_lowater, 0,
1905 CTL_VM, CTL_CREATE, CTL_EOL);
1906 sysctl_createv(clog, 0, NULL, NULL,
1907 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1908 CTLTYPE_INT, "bufmem_hiwater",
1909 SYSCTL_DESCR("Maximum amount of kernel memory to use "
1910 "for buffer cache"),
1911 sysctl_bufvm_update, 0, &bufmem_hiwater, 0,
1912 CTL_VM, CTL_CREATE, CTL_EOL);
1913 }
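/*
 * Usage note (an addition, not part of the original source): the nodes
 * created above can be inspected and tuned at run time with sysctl(8),
 * for example:
 *
 *	sysctl vm.bufmem			# current cache usage
 *	sysctl -w vm.bufcache=20		# allow up to 20% of RAM
 *	sysctl -w vm.bufmem_hiwater=33554432	# cap the cache at 32MB
 *
 * Writes funnel through sysctl_bufvm_update(), which validates the new
 * value and then drains the cache until it falls below the new high
 * water mark.
 */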
1914
1915 #ifdef DEBUG
1916 /*
1917 * Print out statistics on the current allocation of the buffer pool.
1918 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1919 * in vfs_syscalls.c using sysctl.
1920 */
1921 void
1922 vfs_bufstats(void)
1923 {
1924 int i, j, count;
1925 buf_t *bp;
1926 struct bqueue *dp;
1927 int counts[(MAXBSIZE / PAGE_SIZE) + 1];
1928 static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };
1929
1930 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1931 count = 0;
1932 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
1933 counts[j] = 0;
1934 TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
1935 counts[bp->b_bufsize/PAGE_SIZE]++;
1936 count++;
1937 }
1938 printf("%s: total-%d", bname[i], count);
1939 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
1940 if (counts[j] != 0)
1941 printf(", %d-%d", j * PAGE_SIZE, counts[j]);
1942 printf("\n");
1943 }
1944 }
1945 #endif /* DEBUG */
1946
1947 /* ------------------------------ */
1948
1949 buf_t *
1950 getiobuf(struct vnode *vp, bool waitok)
1951 {
1952 buf_t *bp;
1953
1954 bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
1955 if (bp == NULL)
1956 return bp;
1957
1958 buf_init(bp);
1959
1960 if ((bp->b_vp = vp) == NULL)
1961 bp->b_objlock = &buffer_lock;
1962 else
1963 bp->b_objlock = &vp->v_interlock;
1964
1965 return bp;
1966 }
1967
1968 void
1969 putiobuf(buf_t *bp)
1970 {
1971
1972 buf_destroy(bp);
1973 pool_cache_put(bufio_cache, bp);
1974 }
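/*
 * Usage sketch (an addition, not part of the original source; the
 * values shown are hypothetical).  getiobuf()/putiobuf() provide
 * buffers for I/O that bypasses the buffer cache, e.g. in drivers:
 */
#if 0	/* example only */
	buf_t *bp;
	int error;

	bp = getiobuf(vp, true);
	bp->b_cflags = BC_BUSY;		/* biowait() asserts this */
	bp->b_flags = B_READ;
	bp->b_blkno = blkno;
	bp->b_bcount = count;
	bp->b_data = data;		/* caller-supplied memory */
	VOP_STRATEGY(vp, bp);
	error = biowait(bp);
	putiobuf(bp);
#endif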
1975
1976 /*
1977 * nestiobuf_iodone: b_iodone callback for nested buffers.
1978 */
1979
1980 void
1981 nestiobuf_iodone(buf_t *bp)
1982 {
1983 buf_t *mbp = bp->b_private;
1984 int error;
1985 int donebytes;
1986
1987 KASSERT(bp->b_bcount <= bp->b_bufsize);
1988 KASSERT(mbp != bp);
1989
1990 error = bp->b_error;
1991 if (bp->b_error == 0 &&
1992 (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
1993 /*
1994 		 * Not all got transferred, raise an error. We have no way to
1995 * propagate these conditions to mbp.
1996 */
1997 error = EIO;
1998 }
1999
2000 donebytes = bp->b_bufsize;
2001
2002 putiobuf(bp);
2003 nestiobuf_done(mbp, donebytes, error);
2004 }
2005
2006 /*
2007 * nestiobuf_setup: setup a "nested" buffer.
2008 *
2009 * => 'mbp' is a "master" buffer which is being divided into sub pieces.
2010 * => 'bp' should be a buffer allocated by getiobuf.
2011 * => 'offset' is a byte offset in the master buffer.
2012 * => 'size' is a size in bytes of this nested buffer.
2013 */
2014
2015 void
2016 nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
2017 {
2018 const int b_read = mbp->b_flags & B_READ;
2019 struct vnode *vp = mbp->b_vp;
2020
2021 KASSERT(mbp->b_bcount >= offset + size);
2022 bp->b_vp = vp;
2023 bp->b_dev = mbp->b_dev;
2024 bp->b_objlock = mbp->b_objlock;
2025 bp->b_cflags = BC_BUSY;
2026 bp->b_flags = B_ASYNC | b_read;
2027 bp->b_iodone = nestiobuf_iodone;
2028 bp->b_data = (char *)mbp->b_data + offset;
2029 bp->b_resid = bp->b_bcount = size;
2030 bp->b_bufsize = bp->b_bcount;
2031 bp->b_private = mbp;
2032 BIO_COPYPRIO(bp, mbp);
2033 if (!b_read && vp != NULL) {
2034 mutex_enter(&vp->v_interlock);
2035 vp->v_numoutput++;
2036 mutex_exit(&vp->v_interlock);
2037 }
2038 }
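/*
 * Usage sketch (an addition, not part of the original source; the
 * striping driver shown is hypothetical).  A master buffer is split
 * into per-component pieces roughly like this:
 */
#if 0	/* example only */
	buf_t *nbp;
	int offset, chunk;

	mbp->b_resid = mbp->b_bcount;	/* nestiobuf_done() counts down */
	for (offset = 0; offset < mbp->b_bcount; offset += chunk) {
		chunk = MIN(stripesize, mbp->b_bcount - offset);
		nbp = getiobuf(NULL, true);
		nestiobuf_setup(mbp, nbp, offset, chunk);
		nbp->b_blkno = cblkno(offset);	/* hypothetical mapping */
		VOP_STRATEGY(component_vp, nbp);
	}
	/* each nestiobuf_iodone() decrements mbp->b_resid; biodone(mbp)
	   fires once it reaches zero */
#endif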
2039
2040 /*
2041 * nestiobuf_done: propagate completion to the master buffer.
2042 *
2043 * => 'donebytes' specifies how many bytes in the 'mbp' is completed.
2044 * => 'error' is an errno(2) that 'donebytes' has been completed with.
2045 */
2046
2047 void
2048 nestiobuf_done(buf_t *mbp, int donebytes, int error)
2049 {
2050
2051 if (donebytes == 0) {
2052 return;
2053 }
2054 mutex_enter(mbp->b_objlock);
2055 KASSERT(mbp->b_resid >= donebytes);
2056 mbp->b_resid -= donebytes;
2057 if (error)
2058 mbp->b_error = error;
2059 if (mbp->b_resid == 0) {
2060 mutex_exit(mbp->b_objlock);
2061 biodone(mbp);
2062 } else
2063 mutex_exit(mbp->b_objlock);
2064 }
2065
2066 void
2067 buf_init(buf_t *bp)
2068 {
2069
2070 LIST_INIT(&bp->b_dep);
2071 cv_init(&bp->b_busy, "biolock");
2072 cv_init(&bp->b_done, "biowait");
2073 bp->b_dev = NODEV;
2074 bp->b_error = 0;
2075 bp->b_flags = 0;
2076 bp->b_cflags = 0;
2077 bp->b_oflags = 0;
2078 bp->b_objlock = &buffer_lock;
2079 bp->b_iodone = NULL;
2080 bp->b_refcnt = 1;
2081 bp->b_dev = NODEV;
2082 bp->b_vnbufs.le_next = NOLIST;
2083 BIO_SETPRIO(bp, BPRIO_DEFAULT);
2084 }
2085
2086 void
2087 buf_destroy(buf_t *bp)
2088 {
2089
2090 cv_destroy(&bp->b_done);
2091 cv_destroy(&bp->b_busy);
2092 }
2093
2094 int
2095 bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock)
2096 {
2097 int error;
2098
2099 KASSERT(mutex_owned(&bufcache_lock));
2100
2101 if ((bp->b_cflags & BC_BUSY) != 0) {
2102 if (curlwp == uvm.pagedaemon_lwp)
2103 return EDEADLK;
2104 bp->b_cflags |= BC_WANTED;
2105 bref(bp);
2106 if (interlock != NULL)
2107 mutex_exit(interlock);
2108 if (intr) {
2109 error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
2110 timo);
2111 } else {
2112 error = cv_timedwait(&bp->b_busy, &bufcache_lock,
2113 timo);
2114 }
2115 brele(bp);
2116 if (interlock != NULL)
2117 mutex_enter(interlock);
2118 if (error != 0)
2119 return error;
2120 return EPASSTHROUGH;
2121 }
2122 bp->b_cflags |= BC_BUSY;
2123
2124 return 0;
2125 }