The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_bio.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 1994,1997 John S. Dyson
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice immediately at the beginning of the file, without modification,
   10  *    this list of conditions, and the following disclaimer.
   11  * 2. Absolutely no warranty of function or purpose is made by the author
   12  *              John S. Dyson.
   13  *
   14  * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $
   15  */
   16 
   17 /*
   18  * this file contains a new buffer I/O scheme implementing a coherent
   19  * VM object and buffer cache scheme.  Pains have been taken to make
   20  * sure that the performance degradation associated with schemes such
   21  * as this is not realized.
   22  *
   23  * Author:  John S. Dyson
   24  * Significant help during the development and debugging phases
   25  * had been provided by David Greenman, also of the FreeBSD core team.
   26  *
   27  * see man buf(9) for more info.
   28  */
   29 
   30 #include <sys/param.h>
   31 #include <sys/systm.h>
   32 #include <sys/buf.h>
   33 #include <sys/conf.h>
   34 #include <sys/devicestat.h>
   35 #include <sys/eventhandler.h>
   36 #include <sys/lock.h>
   37 #include <sys/malloc.h>
   38 #include <sys/mount.h>
   39 #include <sys/kernel.h>
   40 #include <sys/kthread.h>
   41 #include <sys/proc.h>
   42 #include <sys/reboot.h>
   43 #include <sys/resourcevar.h>
   44 #include <sys/sysctl.h>
   45 #include <sys/vmmeter.h>
   46 #include <sys/vnode.h>
   47 #include <sys/dsched.h>
   48 #include <vm/vm.h>
   49 #include <vm/vm_param.h>
   50 #include <vm/vm_kern.h>
   51 #include <vm/vm_pageout.h>
   52 #include <vm/vm_page.h>
   53 #include <vm/vm_object.h>
   54 #include <vm/vm_extern.h>
   55 #include <vm/vm_map.h>
   56 #include <vm/vm_pager.h>
   57 #include <vm/swap_pager.h>
   58 
   59 #include <sys/buf2.h>
   60 #include <sys/thread2.h>
   61 #include <sys/spinlock2.h>
   62 #include <sys/mplock2.h>
   63 #include <vm/vm_page2.h>
   64 
   65 #include "opt_ddb.h"
   66 #ifdef DDB
   67 #include <ddb/ddb.h>
   68 #endif
   69 
   70 /*
   71  * Buffer queues.
   72  */
   73 enum bufq_type {
   74         BQUEUE_NONE,            /* not on any queue */
   75         BQUEUE_LOCKED,          /* locked buffers */
   76         BQUEUE_CLEAN,           /* non-B_DELWRI buffers */
   77         BQUEUE_DIRTY,           /* B_DELWRI buffers */
   78         BQUEUE_DIRTY_HW,        /* B_DELWRI buffers - heavy weight */
   79         BQUEUE_EMPTYKVA,        /* empty buffer headers with KVA assignment */
   80         BQUEUE_EMPTY,           /* empty buffer headers */
   81 
   82         BUFFER_QUEUES           /* number of buffer queues */
   83 };
   84 
   85 typedef enum bufq_type bufq_type_t;
   86 
   87 #define BD_WAKE_SIZE    16384
   88 #define BD_WAKE_MASK    (BD_WAKE_SIZE - 1)
   89 
   90 TAILQ_HEAD(bqueues, buf);
   91 
   92 struct bufpcpu {
   93         struct spinlock spin;
   94         struct bqueues bufqueues[BUFFER_QUEUES];
   95 } __cachealign;
   96 
   97 struct bufpcpu bufpcpu[MAXCPU];
   98 
   99 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
  100 
  101 struct buf *buf;                /* buffer header pool */
  102 
  103 static void vfs_clean_pages(struct buf *bp);
  104 static void vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m);
  105 #if 0
  106 static void vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m);
  107 #endif
  108 static void vfs_vmio_release(struct buf *bp);
  109 static int flushbufqueues(struct buf *marker, bufq_type_t q);
  110 static vm_page_t bio_page_alloc(struct buf *bp, vm_object_t obj,
  111                                 vm_pindex_t pg, int deficit);
  112 
  113 static void bd_signal(long totalspace);
  114 static void buf_daemon(void);
  115 static void buf_daemon_hw(void);
  116 
  117 /*
  118  * bogus page -- for I/O to/from partially complete buffers
  119  * this is a temporary solution to the problem, but it is not
  120  * really that bad.  it would be better to split the buffer
  121  * for input in the case of buffers partially already in memory,
  122  * but the code is intricate enough already.
  123  */
  124 vm_page_t bogus_page;
  125 
  126 /*
  127  * These are all static, but make the ones we export globals so we do
  128  * not need to use compiler magic.
  129  */
  130 long bufspace;                  /* locked by buffer_map */
  131 long maxbufspace;
  132 static long bufmallocspace;     /* atomic ops */
  133 long maxbufmallocspace, lobufspace, hibufspace;
  134 static long bufreusecnt, bufdefragcnt, buffreekvacnt;
  135 static long lorunningspace;
  136 static long hirunningspace;
  137 static long dirtykvaspace;              /* atomic */
  138 static long dirtybufspace;              /* atomic */
  139 static long dirtybufcount;              /* atomic */
  140 static long dirtybufspacehw;            /* atomic */
  141 static long dirtybufcounthw;            /* atomic */
  142 static long runningbufspace;            /* atomic */
  143 static long runningbufcount;            /* atomic */
  144 long lodirtybufspace;
  145 long hidirtybufspace;
  146 static int getnewbufcalls;
  147 static int getnewbufrestarts;
  148 static int recoverbufcalls;
  149 static int needsbuffer;                 /* atomic */
  150 static int runningbufreq;               /* atomic */
  151 static int bd_request;                  /* atomic */
  152 static int bd_request_hw;               /* atomic */
  153 static u_int bd_wake_ary[BD_WAKE_SIZE];
  154 static u_int bd_wake_index;
  155 static u_int vm_cycle_point = 40; /* 23-36 will migrate more act->inact */
  156 static int debug_commit;
  157 static int debug_bufbio;
  158 
  159 static struct thread *bufdaemon_td;
  160 static struct thread *bufdaemonhw_td;
  161 static u_int lowmempgallocs;
  162 static u_int lowmempgfails;
  163 
  164 /*
  165  * Sysctls for operational control of the buffer cache.
  166  */
  167 SYSCTL_LONG(_vfs, OID_AUTO, lodirtybufspace, CTLFLAG_RW, &lodirtybufspace, 0,
  168         "Number of dirty buffers to flush before bufdaemon becomes inactive");
  169 SYSCTL_LONG(_vfs, OID_AUTO, hidirtybufspace, CTLFLAG_RW, &hidirtybufspace, 0,
  170         "High watermark used to trigger explicit flushing of dirty buffers");
  171 SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
  172         "Minimum amount of buffer space required for active I/O");
  173 SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
  174         "Maximum amount of buffer space to usable for active I/O");
  175 SYSCTL_UINT(_vfs, OID_AUTO, lowmempgallocs, CTLFLAG_RW, &lowmempgallocs, 0,
  176         "Page allocations done during periods of very low free memory");
  177 SYSCTL_UINT(_vfs, OID_AUTO, lowmempgfails, CTLFLAG_RW, &lowmempgfails, 0,
  178         "Page allocations which failed during periods of very low free memory");
  179 SYSCTL_UINT(_vfs, OID_AUTO, vm_cycle_point, CTLFLAG_RW, &vm_cycle_point, 0,
  180         "Recycle pages to active or inactive queue transition pt 0-64");
  181 /*
  182  * Sysctls determining current state of the buffer cache.
  183  */
  184 SYSCTL_LONG(_vfs, OID_AUTO, nbuf, CTLFLAG_RD, &nbuf, 0,
  185         "Total number of buffers in buffer cache");
  186 SYSCTL_LONG(_vfs, OID_AUTO, dirtykvaspace, CTLFLAG_RD, &dirtykvaspace, 0,
  187         "KVA reserved by dirty buffers (all)");
  188 SYSCTL_LONG(_vfs, OID_AUTO, dirtybufspace, CTLFLAG_RD, &dirtybufspace, 0,
  189         "Pending bytes of dirty buffers (all)");
  190 SYSCTL_LONG(_vfs, OID_AUTO, dirtybufspacehw, CTLFLAG_RD, &dirtybufspacehw, 0,
  191         "Pending bytes of dirty buffers (heavy weight)");
  192 SYSCTL_LONG(_vfs, OID_AUTO, dirtybufcount, CTLFLAG_RD, &dirtybufcount, 0,
  193         "Pending number of dirty buffers");
  194 SYSCTL_LONG(_vfs, OID_AUTO, dirtybufcounthw, CTLFLAG_RD, &dirtybufcounthw, 0,
  195         "Pending number of dirty buffers (heavy weight)");
  196 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
  197         "I/O bytes currently in progress due to asynchronous writes");
  198 SYSCTL_LONG(_vfs, OID_AUTO, runningbufcount, CTLFLAG_RD, &runningbufcount, 0,
  199         "I/O buffers currently in progress due to asynchronous writes");
  200 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
  201         "Hard limit on maximum amount of memory usable for buffer space");
  202 SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
  203         "Soft limit on maximum amount of memory usable for buffer space");
  204 SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
  205         "Minimum amount of memory to reserve for system buffer space");
  206 SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
  207         "Amount of memory available for buffers");
  208 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RD, &maxbufmallocspace,
  209         0, "Maximum amount of memory reserved for buffers using malloc");
  210 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
  211         "Amount of memory left for buffers using malloc-scheme");
  212 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, 0,
  213         "New buffer header acquisition requests");
  214 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts,
  215         0, "New buffer header acquisition restarts");
  216 SYSCTL_INT(_vfs, OID_AUTO, recoverbufcalls, CTLFLAG_RD, &recoverbufcalls, 0,
  217         "Recover VM space in an emergency");
  218 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RD, &bufdefragcnt, 0,
  219         "Buffer acquisition restarts due to fragmented buffer map");
  220 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RD, &buffreekvacnt, 0,
  221         "Amount of time KVA space was deallocated in an arbitrary buffer");
  222 SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RD, &bufreusecnt, 0,
  223         "Amount of time buffer re-use operations were successful");
  224 SYSCTL_INT(_vfs, OID_AUTO, debug_commit, CTLFLAG_RW, &debug_commit, 0, "");
  225 SYSCTL_INT(_vfs, OID_AUTO, debug_bufbio, CTLFLAG_RW, &debug_bufbio, 0, "");
  226 SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf),
  227         "sizeof(struct buf)");
  228 
  229 char *buf_wmesg = BUF_WMESG;
  230 
  231 #define VFS_BIO_NEED_ANY        0x01    /* any freeable buffer */
  232 #define VFS_BIO_NEED_UNUSED02   0x02
  233 #define VFS_BIO_NEED_UNUSED04   0x04
  234 #define VFS_BIO_NEED_BUFSPACE   0x08    /* wait for buf space, lo hysteresis */
  235 
  236 /*
  237  * bufspacewakeup:
  238  *
  239  *      Called when buffer space is potentially available for recovery.
  240  *      getnewbuf() will block on this flag when it is unable to free 
  241  *      sufficient buffer space.  Buffer space becomes recoverable when 
  242  *      bp's get placed back in the queues.
  243  */
  244 static __inline void
  245 bufspacewakeup(void)
  246 {
  247         /*
  248          * If someone is waiting for BUF space, wake them up.  Even
  249          * though we haven't freed the kva space yet, the waiting
  250          * process will be able to now.
  251          */
  252         for (;;) {
  253                 int flags = needsbuffer;
  254                 cpu_ccfence();
  255                 if ((flags & VFS_BIO_NEED_BUFSPACE) == 0)
  256                         break;
  257                 if (atomic_cmpset_int(&needsbuffer, flags,
  258                                       flags & ~VFS_BIO_NEED_BUFSPACE)) {
  259                         wakeup(&needsbuffer);
  260                         break;
  261                 }
  262                 /* retry */
  263         }
  264 }
  265 
  266 /*
  267  * runningbufwakeup:
  268  *
  269  *      Accounting for I/O in progress.
  270  *
  271  */
  272 static __inline void
  273 runningbufwakeup(struct buf *bp)
  274 {
  275         long totalspace;
  276         long limit;
  277         long flags;
  278 
  279         if ((totalspace = bp->b_runningbufspace) != 0) {
  280                 atomic_add_long(&runningbufspace, -totalspace);
  281                 atomic_add_long(&runningbufcount, -1);
  282                 bp->b_runningbufspace = 0;
  283 
  284                 /*
  285                  * see waitrunningbufspace() for limit test.
  286                  */
  287                 limit = hirunningspace * 3 / 6;
  288                 for (;;) {
  289                         flags = runningbufreq;
  290                         cpu_ccfence();
  291                         if (flags == 0)
  292                                 break;
  293                         if (atomic_cmpset_int(&runningbufreq, flags, 0)) {
  294                                 wakeup(&runningbufreq);
  295                                 break;
  296                         }
  297                         /* retry */
  298                 }
  299                 bd_signal(totalspace);
  300         }
  301 }
  302 
  303 /*
  304  * bufcountwakeup:
  305  *
  306  *      Called when a buffer has been added to one of the free queues to
  307  *      account for the buffer and to wakeup anyone waiting for free buffers.
  308  *      This typically occurs when large amounts of metadata are being handled
  309  *      by the buffer cache ( else buffer space runs out first, usually ).
  310  */
  311 static __inline void
  312 bufcountwakeup(void) 
  313 {
  314         long flags;
  315 
  316         for (;;) {
  317                 flags = needsbuffer;
  318                 if (flags == 0)
  319                         break;
  320                 if (atomic_cmpset_int(&needsbuffer, flags,
  321                                       (flags & ~VFS_BIO_NEED_ANY))) {
  322                         wakeup(&needsbuffer);
  323                         break;
  324                 }
  325                 /* retry */
  326         }
  327 }
  328 
  329 /*
  330  * waitrunningbufspace()
  331  *
  332  * If runningbufspace exceeds 4/6 hirunningspace we block until
  333  * runningbufspace drops to 3/6 hirunningspace.  We also block if another
  334  * thread blocked here in order to be fair, even if runningbufspace
  335  * is now lower than the limit.
  336  *
  337  * The caller may be using this function to block in a tight loop, we
  338  * must block while runningbufspace is greater than at least
  339  * hirunningspace * 3 / 6.
  340  */
  341 void
  342 waitrunningbufspace(void)
  343 {
  344         long limit = hirunningspace * 4 / 6;
  345         long flags;
  346 
  347         while (runningbufspace > limit || runningbufreq) {
  348                 tsleep_interlock(&runningbufreq, 0);
  349                 flags = atomic_fetchadd_int(&runningbufreq, 1);
  350                 if (runningbufspace > limit || flags)
  351                         tsleep(&runningbufreq, PINTERLOCKED, "wdrn1", hz);
  352         }
  353 }
  354 
  355 /*
  356  * buf_dirty_count_severe:
  357  *
  358  *      Return true if we have too many dirty buffers.
  359  */
  360 int
  361 buf_dirty_count_severe(void)
  362 {
  363         return (runningbufspace + dirtykvaspace >= hidirtybufspace ||
  364                 dirtybufcount >= nbuf / 2);
  365 }
  366 
  367 /*
  368  * Return true if the amount of running I/O is severe and BIOQ should
  369  * start bursting.
  370  */
  371 int
  372 buf_runningbufspace_severe(void)
  373 {
  374         return (runningbufspace >= hirunningspace * 4 / 6);
  375 }
  376 
  377 /*
  378  * vfs_buf_test_cache:
  379  *
  380  * Called when a buffer is extended.  This function clears the B_CACHE
  381  * bit if the newly extended portion of the buffer does not contain
  382  * valid data.
  383  *
  384  * NOTE! Dirty VM pages are not processed into dirty (B_DELWRI) buffer
  385  * cache buffers.  The VM pages remain dirty, as someone had mmap()'d
  386  * them while a clean buffer was present.
  387  */
  388 static __inline__
  389 void
  390 vfs_buf_test_cache(struct buf *bp,
  391                   vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
  392                   vm_page_t m)
  393 {
  394         if (bp->b_flags & B_CACHE) {
  395                 int base = (foff + off) & PAGE_MASK;
  396                 if (vm_page_is_valid(m, base, size) == 0)
  397                         bp->b_flags &= ~B_CACHE;
  398         }
  399 }
  400 
  401 /*
  402  * bd_speedup()
  403  *
  404  * Spank the buf_daemon[_hw] if the total dirty buffer space exceeds the
  405  * low water mark.
  406  */
  407 static __inline__
  408 void
  409 bd_speedup(void)
  410 {
  411         if (dirtykvaspace < lodirtybufspace && dirtybufcount < nbuf / 2)
  412                 return;
  413 
  414         if (bd_request == 0 &&
  415             (dirtykvaspace > lodirtybufspace / 2 ||
  416              dirtybufcount - dirtybufcounthw >= nbuf / 2)) {
  417                 if (atomic_fetchadd_int(&bd_request, 1) == 0)
  418                         wakeup(&bd_request);
  419         }
  420         if (bd_request_hw == 0 &&
  421             (dirtykvaspace > lodirtybufspace / 2 ||
  422              dirtybufcounthw >= nbuf / 2)) {
  423                 if (atomic_fetchadd_int(&bd_request_hw, 1) == 0)
  424                         wakeup(&bd_request_hw);
  425         }
  426 }
  427 
  428 /*
  429  * bd_heatup()
  430  *
  431  *      Get the buf_daemon heated up when the number of running and dirty
  432  *      buffers exceeds the mid-point.
  433  *
  434  *      Return the total number of dirty bytes past the second mid point
  435  *      as a measure of how much excess dirty data there is in the system.
  436  */
  437 long
  438 bd_heatup(void)
  439 {
  440         long mid1;
  441         long mid2;
  442         long totalspace;
  443 
  444         mid1 = lodirtybufspace + (hidirtybufspace - lodirtybufspace) / 2;
  445 
  446         totalspace = runningbufspace + dirtykvaspace;
  447         if (totalspace >= mid1 || dirtybufcount >= nbuf / 2) {
  448                 bd_speedup();
  449                 mid2 = mid1 + (hidirtybufspace - mid1) / 2;
  450                 if (totalspace >= mid2)
  451                         return(totalspace - mid2);
  452         }
  453         return(0);
  454 }
  455 
  456 /*
  457  * bd_wait()
  458  *
  459  *      Wait for the buffer cache to flush (totalspace) bytes worth of
  460  *      buffers, then return.
  461  *
  462  *      Regardless this function blocks while the number of dirty buffers
  463  *      exceeds hidirtybufspace.
  464  */
  465 void
  466 bd_wait(long totalspace)
  467 {
  468         u_int i;
  469         u_int j;
  470         u_int mi;
  471         int count;
  472 
  473         if (curthread == bufdaemonhw_td || curthread == bufdaemon_td)
  474                 return;
  475 
  476         while (totalspace > 0) {
  477                 bd_heatup();
  478 
  479                 /*
  480                  * Order is important.  Suppliers adjust bd_wake_index after
  481                  * updating runningbufspace/dirtykvaspace.  We want to fetch
  482                  * bd_wake_index before accessing.  Any error should thus
  483                  * be in our favor.
  484                  */
  485                 i = atomic_fetchadd_int(&bd_wake_index, 0);
  486                 if (totalspace > runningbufspace + dirtykvaspace)
  487                         totalspace = runningbufspace + dirtykvaspace;
  488                 count = totalspace / BKVASIZE;
  489                 if (count >= BD_WAKE_SIZE / 2)
  490                         count = BD_WAKE_SIZE / 2;
  491                 i = i + count;
  492                 mi = i & BD_WAKE_MASK;
  493 
  494                 /*
  495                  * This is not a strict interlock, so we play a bit loose
  496                  * with locking access to dirtybufspace*.  We have to re-check
  497                  * bd_wake_index to ensure that it hasn't passed us.
  498                  */
  499                 tsleep_interlock(&bd_wake_ary[mi], 0);
  500                 atomic_add_int(&bd_wake_ary[mi], 1);
  501                 j = atomic_fetchadd_int(&bd_wake_index, 0);
  502                 if ((int)(i - j) >= 0)
  503                         tsleep(&bd_wake_ary[mi], PINTERLOCKED, "flstik", hz);
  504 
  505                 totalspace = runningbufspace + dirtykvaspace - hidirtybufspace;
  506         }
  507 }
  508 
  509 /*
  510  * bd_signal()
  511  * 
  512  *      This function is called whenever runningbufspace or dirtykvaspace
  513  *      is reduced.  Track threads waiting for run+dirty buffer I/O
  514  *      complete.
  515  */
  516 static void
  517 bd_signal(long totalspace)
  518 {
  519         u_int i;
  520 
  521         if (totalspace > 0) {
  522                 if (totalspace > BKVASIZE * BD_WAKE_SIZE)
  523                         totalspace = BKVASIZE * BD_WAKE_SIZE;
  524                 while (totalspace > 0) {
  525                         i = atomic_fetchadd_int(&bd_wake_index, 1);
  526                         i &= BD_WAKE_MASK;
  527                         if (atomic_readandclear_int(&bd_wake_ary[i]))
  528                                 wakeup(&bd_wake_ary[i]);
  529                         totalspace -= BKVASIZE;
  530                 }
  531         }
  532 }
  533 
  534 /*
  535  * BIO tracking support routines.
  536  *
  537  * Release a ref on a bio_track.  Wakeup requests are atomically released
  538  * along with the last reference so bk_active will never wind up set to
  539  * only 0x80000000.
  540  */
  541 static
  542 void
  543 bio_track_rel(struct bio_track *track)
  544 {
  545         int     active;
  546         int     desired;
  547 
  548         /*
  549          * Shortcut
  550          */
  551         active = track->bk_active;
  552         if (active == 1 && atomic_cmpset_int(&track->bk_active, 1, 0))
  553                 return;
  554 
  555         /*
  556          * Full-on.  Note that the wait flag is only atomically released on
  557          * the 1->0 count transition.
  558          *
  559          * We check for a negative count transition using bit 30 since bit 31
  560          * has a different meaning.
  561          */
  562         for (;;) {
  563                 desired = (active & 0x7FFFFFFF) - 1;
  564                 if (desired)
  565                         desired |= active & 0x80000000;
  566                 if (atomic_cmpset_int(&track->bk_active, active, desired)) {
  567                         if (desired & 0x40000000)
  568                                 panic("bio_track_rel: bad count: %p", track);
  569                         if (active & 0x80000000)
  570                                 wakeup(track);
  571                         break;
  572                 }
  573                 active = track->bk_active;
  574         }
  575 }
  576 
  577 /*
  578  * Wait for the tracking count to reach 0.
  579  *
  580  * Use atomic ops such that the wait flag is only set atomically when
  581  * bk_active is non-zero.
  582  */
  583 int
  584 bio_track_wait(struct bio_track *track, int slp_flags, int slp_timo)
  585 {
  586         int     active;
  587         int     desired;
  588         int     error;
  589 
  590         /*
  591          * Shortcut
  592          */
  593         if (track->bk_active == 0)
  594                 return(0);
  595 
  596         /*
  597          * Full-on.  Note that the wait flag may only be atomically set if
  598          * the active count is non-zero.
  599          *
  600          * NOTE: We cannot optimize active == desired since a wakeup could
  601          *       clear active prior to our tsleep_interlock().
  602          */
  603         error = 0;
  604         while ((active = track->bk_active) != 0) {
  605                 cpu_ccfence();
  606                 desired = active | 0x80000000;
  607                 tsleep_interlock(track, slp_flags);
  608                 if (atomic_cmpset_int(&track->bk_active, active, desired)) {
  609                         error = tsleep(track, slp_flags | PINTERLOCKED,
  610                                        "trwait", slp_timo);
  611                         if (error)
  612                                 break;
  613                 }
  614         }
  615         return (error);
  616 }
  617 
  618 /*
  619  * bufinit:
  620  *
  621  *      Load time initialisation of the buffer cache, called from machine
  622  *      dependant initialization code. 
  623  */
  624 static
  625 void
  626 bufinit(void *dummy __unused)
  627 {
  628         struct bufpcpu *pcpu;
  629         struct buf *bp;
  630         vm_offset_t bogus_offset;
  631         int i;
  632         int j;
  633         long n;
  634 
  635         /* next, make a null set of free lists */
  636         for (i = 0; i < ncpus; ++i) {
  637                 pcpu = &bufpcpu[i];
  638                 spin_init(&pcpu->spin);
  639                 for (j = 0; j < BUFFER_QUEUES; j++)
  640                         TAILQ_INIT(&pcpu->bufqueues[j]);
  641         }
  642 
  643         /* finally, initialize each buffer header and stick on empty q */
  644         i = 0;
  645         pcpu = &bufpcpu[i];
  646 
  647         for (n = 0; n < nbuf; n++) {
  648                 bp = &buf[n];
  649                 bzero(bp, sizeof *bp);
  650                 bp->b_flags = B_INVAL;  /* we're just an empty header */
  651                 bp->b_cmd = BUF_CMD_DONE;
  652                 bp->b_qindex = BQUEUE_EMPTY;
  653                 bp->b_qcpu = i;
  654                 initbufbio(bp);
  655                 xio_init(&bp->b_xio);
  656                 buf_dep_init(bp);
  657                 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
  658                                   bp, b_freelist);
  659 
  660                 i = (i + 1) % ncpus;
  661                 pcpu = &bufpcpu[i];
  662         }
  663 
  664         /*
  665          * maxbufspace is the absolute maximum amount of buffer space we are 
  666          * allowed to reserve in KVM and in real terms.  The absolute maximum
  667          * is nominally used by buf_daemon.  hibufspace is the nominal maximum
  668          * used by most other processes.  The differential is required to 
  669          * ensure that buf_daemon is able to run when other processes might 
  670          * be blocked waiting for buffer space.
  671          *
  672          * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
  673          * this may result in KVM fragmentation which is not handled optimally
  674          * by the system.
  675          */
  676         maxbufspace = nbuf * BKVASIZE;
  677         hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
  678         lobufspace = hibufspace - MAXBSIZE;
  679 
  680         lorunningspace = 512 * 1024;
  681         /* hirunningspace -- see below */
  682 
  683         /*
  684          * Limit the amount of malloc memory since it is wired permanently
  685          * into the kernel space.  Even though this is accounted for in
  686          * the buffer allocation, we don't want the malloced region to grow
  687          * uncontrolled.  The malloc scheme improves memory utilization
  688          * significantly on average (small) directories.
  689          */
  690         maxbufmallocspace = hibufspace / 20;
  691 
  692         /*
  693          * Reduce the chance of a deadlock occuring by limiting the number
  694          * of delayed-write dirty buffers we allow to stack up.
  695          *
  696          * We don't want too much actually queued to the device at once
  697          * (XXX this needs to be per-mount!), because the buffers will
  698          * wind up locked for a very long period of time while the I/O
  699          * drains.
  700          */
  701         hidirtybufspace = hibufspace / 2;       /* dirty + running */
  702         hirunningspace = hibufspace / 16;       /* locked & queued to device */
  703         if (hirunningspace < 1024 * 1024)
  704                 hirunningspace = 1024 * 1024;
  705 
  706         dirtykvaspace = 0;
  707         dirtybufspace = 0;
  708         dirtybufspacehw = 0;
  709 
  710         lodirtybufspace = hidirtybufspace / 2;
  711 
  712         /*
  713          * Maximum number of async ops initiated per buf_daemon loop.  This is
  714          * somewhat of a hack at the moment, we really need to limit ourselves
  715          * based on the number of bytes of I/O in-transit that were initiated
  716          * from buf_daemon.
  717          */
  718 
  719         bogus_offset = kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
  720         vm_object_hold(&kernel_object);
  721         bogus_page = vm_page_alloc(&kernel_object,
  722                                    (bogus_offset >> PAGE_SHIFT),
  723                                    VM_ALLOC_NORMAL);
  724         vm_object_drop(&kernel_object);
  725         vmstats.v_wire_count++;
  726 
  727 }
  728 
  729 SYSINIT(do_bufinit, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, bufinit, NULL);
  730 
  731 /*
  732  * Initialize the embedded bio structures, typically used by
  733  * deprecated code which tries to allocate its own struct bufs.
  734  */
  735 void
  736 initbufbio(struct buf *bp)
  737 {
  738         bp->b_bio1.bio_buf = bp;
  739         bp->b_bio1.bio_prev = NULL;
  740         bp->b_bio1.bio_offset = NOOFFSET;
  741         bp->b_bio1.bio_next = &bp->b_bio2;
  742         bp->b_bio1.bio_done = NULL;
  743         bp->b_bio1.bio_flags = 0;
  744 
  745         bp->b_bio2.bio_buf = bp;
  746         bp->b_bio2.bio_prev = &bp->b_bio1;
  747         bp->b_bio2.bio_offset = NOOFFSET;
  748         bp->b_bio2.bio_next = NULL;
  749         bp->b_bio2.bio_done = NULL;
  750         bp->b_bio2.bio_flags = 0;
  751 
  752         BUF_LOCKINIT(bp);
  753 }
  754 
  755 /*
  756  * Reinitialize the embedded bio structures as well as any additional
  757  * translation cache layers.
  758  */
  759 void
  760 reinitbufbio(struct buf *bp)
  761 {
  762         struct bio *bio;
  763 
  764         for (bio = &bp->b_bio1; bio; bio = bio->bio_next) {
  765                 bio->bio_done = NULL;
  766                 bio->bio_offset = NOOFFSET;
  767         }
  768 }
  769 
  770 /*
  771  * Undo the effects of an initbufbio().
  772  */
  773 void
  774 uninitbufbio(struct buf *bp)
  775 {
  776         dsched_exit_buf(bp);
  777         BUF_LOCKFREE(bp);
  778 }
  779 
  780 /*
  781  * Push another BIO layer onto an existing BIO and return it.  The new
  782  * BIO layer may already exist, holding cached translation data.
  783  */
  784 struct bio *
  785 push_bio(struct bio *bio)
  786 {
  787         struct bio *nbio;
  788 
  789         if ((nbio = bio->bio_next) == NULL) {
  790                 int index = bio - &bio->bio_buf->b_bio_array[0];
  791                 if (index >= NBUF_BIO - 1) {
  792                         panic("push_bio: too many layers bp %p",
  793                                 bio->bio_buf);
  794                 }
  795                 nbio = &bio->bio_buf->b_bio_array[index + 1];
  796                 bio->bio_next = nbio;
  797                 nbio->bio_prev = bio;
  798                 nbio->bio_buf = bio->bio_buf;
  799                 nbio->bio_offset = NOOFFSET;
  800                 nbio->bio_done = NULL;
  801                 nbio->bio_next = NULL;
  802         }
  803         KKASSERT(nbio->bio_done == NULL);
  804         return(nbio);
  805 }
  806 
  807 /*
  808  * Pop a BIO translation layer, returning the previous layer.  The
  809  * must have been previously pushed.
  810  */
  811 struct bio *
  812 pop_bio(struct bio *bio)
  813 {
  814         return(bio->bio_prev);
  815 }
  816 
  817 void
  818 clearbiocache(struct bio *bio)
  819 {
  820         while (bio) {
  821                 bio->bio_offset = NOOFFSET;
  822                 bio = bio->bio_next;
  823         }
  824 }
  825 
  826 /*
  827  * bfreekva:
  828  *
  829  *      Free the KVA allocation for buffer 'bp'.
  830  *
  831  *      Must be called from a critical section as this is the only locking for
  832  *      buffer_map.
  833  *
  834  *      Since this call frees up buffer space, we call bufspacewakeup().
  835  */
  836 static void
  837 bfreekva(struct buf *bp)
  838 {
  839         int count;
  840 
  841         if (bp->b_kvasize) {
  842                 ++buffreekvacnt;
  843                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
  844                 vm_map_lock(&buffer_map);
  845                 bufspace -= bp->b_kvasize;
  846                 vm_map_delete(&buffer_map,
  847                     (vm_offset_t) bp->b_kvabase,
  848                     (vm_offset_t) bp->b_kvabase + bp->b_kvasize,
  849                     &count
  850                 );
  851                 vm_map_unlock(&buffer_map);
  852                 vm_map_entry_release(count);
  853                 bp->b_kvasize = 0;
  854                 bp->b_kvabase = NULL;
  855                 bufspacewakeup();
  856         }
  857 }
  858 
  859 /*
  860  * Remove the buffer from the appropriate free list.
  861  * (caller must be locked)
  862  */
  863 static __inline void
  864 _bremfree(struct buf *bp)
  865 {
  866         struct bufpcpu *pcpu = &bufpcpu[bp->b_qcpu];
  867 
  868         if (bp->b_qindex != BQUEUE_NONE) {
  869                 KASSERT(BUF_REFCNTNB(bp) == 1, 
  870                         ("bremfree: bp %p not locked",bp));
  871                 TAILQ_REMOVE(&pcpu->bufqueues[bp->b_qindex], bp, b_freelist);
  872                 bp->b_qindex = BQUEUE_NONE;
  873         } else {
  874                 if (BUF_REFCNTNB(bp) <= 1)
  875                         panic("bremfree: removing a buffer not on a queue");
  876         }
  877 }
  878 
  879 /*
  880  * bremfree() - must be called with a locked buffer
  881  */
  882 void
  883 bremfree(struct buf *bp)
  884 {
  885         struct bufpcpu *pcpu = &bufpcpu[bp->b_qcpu];
  886 
  887         spin_lock(&pcpu->spin);
  888         _bremfree(bp);
  889         spin_unlock(&pcpu->spin);
  890 }
  891 
  892 /*
  893  * bremfree_locked - must be called with pcpu->spin locked
  894  */
  895 static void
  896 bremfree_locked(struct buf *bp)
  897 {
  898         _bremfree(bp);
  899 }
  900 
  901 /*
  902  * This version of bread issues any required I/O asyncnronously and
  903  * makes a callback on completion.
  904  *
  905  * The callback must check whether BIO_DONE is set in the bio and issue
  906  * the bpdone(bp, 0) if it isn't.  The callback is responsible for clearing
  907  * BIO_DONE and disposing of the I/O (bqrelse()ing it).
  908  */
  909 void
  910 breadcb(struct vnode *vp, off_t loffset, int size,
  911         void (*func)(struct bio *), void *arg)
  912 {
  913         struct buf *bp;
  914 
  915         bp = getblk(vp, loffset, size, 0, 0);
  916 
  917         /* if not found in cache, do some I/O */
  918         if ((bp->b_flags & B_CACHE) == 0) {
  919                 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
  920                 bp->b_cmd = BUF_CMD_READ;
  921                 bp->b_bio1.bio_done = func;
  922                 bp->b_bio1.bio_caller_info1.ptr = arg;
  923                 vfs_busy_pages(vp, bp);
  924                 BUF_KERNPROC(bp);
  925                 vn_strategy(vp, &bp->b_bio1);
  926         } else if (func) {
  927                 /*
  928                  * Since we are issuing the callback synchronously it cannot
  929                  * race the BIO_DONE, so no need for atomic ops here.
  930                  */
  931                 /*bp->b_bio1.bio_done = func;*/
  932                 bp->b_bio1.bio_caller_info1.ptr = arg;
  933                 bp->b_bio1.bio_flags |= BIO_DONE;
  934                 func(&bp->b_bio1);
  935         } else {
  936                 bqrelse(bp);
  937         }
  938 }
  939 
  940 /*
  941  * breadnx() - Terminal function for bread() and breadn().
  942  *
  943  * This function will start asynchronous I/O on read-ahead blocks as well
  944  * as satisfy the primary request.
  945  *
  946  * We must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is
  947  * set, the buffer is valid and we do not have to do anything.
  948  */
  949 int
  950 breadnx(struct vnode *vp, off_t loffset, int size, off_t *raoffset,
  951         int *rabsize, int cnt, struct buf **bpp)
  952 {
  953         struct buf *bp, *rabp;
  954         int i;
  955         int rv = 0, readwait = 0;
  956 
  957         if (*bpp)
  958                 bp = *bpp;
  959         else
  960                 *bpp = bp = getblk(vp, loffset, size, 0, 0);
  961 
  962         /* if not found in cache, do some I/O */
  963         if ((bp->b_flags & B_CACHE) == 0) {
  964                 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
  965                 bp->b_cmd = BUF_CMD_READ;
  966                 bp->b_bio1.bio_done = biodone_sync;
  967                 bp->b_bio1.bio_flags |= BIO_SYNC;
  968                 vfs_busy_pages(vp, bp);
  969                 vn_strategy(vp, &bp->b_bio1);
  970                 ++readwait;
  971         }
  972 
  973         for (i = 0; i < cnt; i++, raoffset++, rabsize++) {
  974                 if (inmem(vp, *raoffset))
  975                         continue;
  976                 rabp = getblk(vp, *raoffset, *rabsize, 0, 0);
  977 
  978                 if ((rabp->b_flags & B_CACHE) == 0) {
  979                         rabp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
  980                         rabp->b_cmd = BUF_CMD_READ;
  981                         vfs_busy_pages(vp, rabp);
  982                         BUF_KERNPROC(rabp);
  983                         vn_strategy(vp, &rabp->b_bio1);
  984                 } else {
  985                         brelse(rabp);
  986                 }
  987         }
  988         if (readwait)
  989                 rv = biowait(&bp->b_bio1, "biord");
  990         return (rv);
  991 }
  992 
  993 /*
  994  * bwrite:
  995  *
  996  *      Synchronous write, waits for completion.
  997  *
  998  *      Write, release buffer on completion.  (Done by iodone
  999  *      if async).  Do not bother writing anything if the buffer
 1000  *      is invalid.
 1001  *
 1002  *      Note that we set B_CACHE here, indicating that buffer is
 1003  *      fully valid and thus cacheable.  This is true even of NFS
 1004  *      now so we set it generally.  This could be set either here 
 1005  *      or in biodone() since the I/O is synchronous.  We put it
 1006  *      here.
 1007  */
 1008 int
 1009 bwrite(struct buf *bp)
 1010 {
 1011         int error;
 1012 
 1013         if (bp->b_flags & B_INVAL) {
 1014                 brelse(bp);
 1015                 return (0);
 1016         }
 1017         if (BUF_REFCNTNB(bp) == 0)
 1018                 panic("bwrite: buffer is not busy???");
 1019 
 1020         /* Mark the buffer clean */
 1021         bundirty(bp);
 1022 
 1023         bp->b_flags &= ~(B_ERROR | B_EINTR);
 1024         bp->b_flags |= B_CACHE;
 1025         bp->b_cmd = BUF_CMD_WRITE;
 1026         bp->b_bio1.bio_done = biodone_sync;
 1027         bp->b_bio1.bio_flags |= BIO_SYNC;
 1028         vfs_busy_pages(bp->b_vp, bp);
 1029 
 1030         /*
 1031          * Normal bwrites pipeline writes.  NOTE: b_bufsize is only
 1032          * valid for vnode-backed buffers.
 1033          */
 1034         bsetrunningbufspace(bp, bp->b_bufsize);
 1035         vn_strategy(bp->b_vp, &bp->b_bio1);
 1036         error = biowait(&bp->b_bio1, "biows");
 1037         brelse(bp);
 1038 
 1039         return (error);
 1040 }
 1041 
 1042 /*
 1043  * bawrite:
 1044  *
 1045  *      Asynchronous write.  Start output on a buffer, but do not wait for
 1046  *      it to complete.  The buffer is released when the output completes.
 1047  *
 1048  *      bwrite() ( or the VOP routine anyway ) is responsible for handling
 1049  *      B_INVAL buffers.  Not us.
 1050  */
 1051 void
 1052 bawrite(struct buf *bp)
 1053 {
 1054         if (bp->b_flags & B_INVAL) {
 1055                 brelse(bp);
 1056                 return;
 1057         }
 1058         if (BUF_REFCNTNB(bp) == 0)
 1059                 panic("bwrite: buffer is not busy???");
 1060 
 1061         /* Mark the buffer clean */
 1062         bundirty(bp);
 1063 
 1064         bp->b_flags &= ~(B_ERROR | B_EINTR);
 1065         bp->b_flags |= B_CACHE;
 1066         bp->b_cmd = BUF_CMD_WRITE;
 1067         KKASSERT(bp->b_bio1.bio_done == NULL);
 1068         vfs_busy_pages(bp->b_vp, bp);
 1069 
 1070         /*
 1071          * Normal bwrites pipeline writes.  NOTE: b_bufsize is only
 1072          * valid for vnode-backed buffers.
 1073          */
 1074         bsetrunningbufspace(bp, bp->b_bufsize);
 1075         BUF_KERNPROC(bp);
 1076         vn_strategy(bp->b_vp, &bp->b_bio1);
 1077 }
 1078 
 1079 /*
 1080  * bowrite:
 1081  *
 1082  *      Ordered write.  Start output on a buffer, and flag it so that the
 1083  *      device will write it in the order it was queued.  The buffer is
 1084  *      released when the output completes.  bwrite() ( or the VOP routine
 1085  *      anyway ) is responsible for handling B_INVAL buffers.
 1086  */
 1087 int
 1088 bowrite(struct buf *bp)
 1089 {
 1090         bp->b_flags |= B_ORDERED;
 1091         bawrite(bp);
 1092         return (0);
 1093 }
 1094 
 1095 /*
 1096  * bdwrite:
 1097  *
 1098  *      Delayed write. (Buffer is marked dirty).  Do not bother writing
 1099  *      anything if the buffer is marked invalid.
 1100  *
 1101  *      Note that since the buffer must be completely valid, we can safely
 1102  *      set B_CACHE.  In fact, we have to set B_CACHE here rather then in
 1103  *      biodone() in order to prevent getblk from writing the buffer
 1104  *      out synchronously.
 1105  */
 1106 void
 1107 bdwrite(struct buf *bp)
 1108 {
 1109         if (BUF_REFCNTNB(bp) == 0)
 1110                 panic("bdwrite: buffer is not busy");
 1111 
 1112         if (bp->b_flags & B_INVAL) {
 1113                 brelse(bp);
 1114                 return;
 1115         }
 1116         bdirty(bp);
 1117 
 1118         if (dsched_is_clear_buf_priv(bp))
 1119                 dsched_new_buf(bp);
 1120 
 1121         /*
 1122          * Set B_CACHE, indicating that the buffer is fully valid.  This is
 1123          * true even of NFS now.
 1124          */
 1125         bp->b_flags |= B_CACHE;
 1126 
 1127         /*
 1128          * This bmap keeps the system from needing to do the bmap later,
 1129          * perhaps when the system is attempting to do a sync.  Since it
 1130          * is likely that the indirect block -- or whatever other datastructure
 1131          * that the filesystem needs is still in memory now, it is a good
 1132          * thing to do this.  Note also, that if the pageout daemon is
 1133          * requesting a sync -- there might not be enough memory to do
 1134          * the bmap then...  So, this is important to do.
 1135          */
 1136         if (bp->b_bio2.bio_offset == NOOFFSET) {
 1137                 VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
 1138                          NULL, NULL, BUF_CMD_WRITE);
 1139         }
 1140 
 1141         /*
 1142          * Because the underlying pages may still be mapped and
 1143          * writable trying to set the dirty buffer (b_dirtyoff/end)
 1144          * range here will be inaccurate.
 1145          *
 1146          * However, we must still clean the pages to satisfy the
 1147          * vnode_pager and pageout daemon, so theythink the pages
 1148          * have been "cleaned".  What has really occured is that
 1149          * they've been earmarked for later writing by the buffer
 1150          * cache.
 1151          *
 1152          * So we get the b_dirtyoff/end update but will not actually
 1153          * depend on it (NFS that is) until the pages are busied for
 1154          * writing later on.
 1155          */
 1156         vfs_clean_pages(bp);
 1157         bqrelse(bp);
 1158 
 1159         /*
 1160          * note: we cannot initiate I/O from a bdwrite even if we wanted to,
 1161          * due to the softdep code.
 1162          */
 1163 }
 1164 
 1165 /*
 1166  * Fake write - return pages to VM system as dirty, leave the buffer clean.
 1167  * This is used by tmpfs.
 1168  *
 1169  * It is important for any VFS using this routine to NOT use it for
 1170  * IO_SYNC or IO_ASYNC operations which occur when the system really
 1171  * wants to flush VM pages to backing store.
 1172  */
 1173 void
 1174 buwrite(struct buf *bp)
 1175 {
 1176         vm_page_t m;
 1177         int i;
 1178 
 1179         /*
 1180          * Only works for VMIO buffers.  If the buffer is already
 1181          * marked for delayed-write we can't avoid the bdwrite().
 1182          */
 1183         if ((bp->b_flags & B_VMIO) == 0 || (bp->b_flags & B_DELWRI)) {
 1184                 bdwrite(bp);
 1185                 return;
 1186         }
 1187 
 1188         /*
 1189          * Mark as needing a commit.
 1190          */
 1191         for (i = 0; i < bp->b_xio.xio_npages; i++) {
 1192                 m = bp->b_xio.xio_pages[i];
 1193                 vm_page_need_commit(m);
 1194         }
 1195         bqrelse(bp);
 1196 }
 1197 
 1198 /*
 1199  * bdirty:
 1200  *
 1201  *      Turn buffer into delayed write request by marking it B_DELWRI.
 1202  *      B_RELBUF and B_NOCACHE must be cleared.
 1203  *
 1204  *      We reassign the buffer to itself to properly update it in the
 1205  *      dirty/clean lists. 
 1206  *
 1207  *      Must be called from a critical section.
 1208  *      The buffer must be on BQUEUE_NONE.
 1209  */
 1210 void
 1211 bdirty(struct buf *bp)
 1212 {
 1213         KASSERT(bp->b_qindex == BQUEUE_NONE,
 1214                 ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
 1215         if (bp->b_flags & B_NOCACHE) {
 1216                 kprintf("bdirty: clearing B_NOCACHE on buf %p\n", bp);
 1217                 bp->b_flags &= ~B_NOCACHE;
 1218         }
 1219         if (bp->b_flags & B_INVAL) {
 1220                 kprintf("bdirty: warning, dirtying invalid buffer %p\n", bp);
 1221         }
 1222         bp->b_flags &= ~B_RELBUF;
 1223 
 1224         if ((bp->b_flags & B_DELWRI) == 0) {
 1225                 lwkt_gettoken(&bp->b_vp->v_token);
 1226                 bp->b_flags |= B_DELWRI;
 1227                 reassignbuf(bp);
 1228                 lwkt_reltoken(&bp->b_vp->v_token);
 1229 
 1230                 atomic_add_long(&dirtybufcount, 1);
 1231                 atomic_add_long(&dirtykvaspace, bp->b_kvasize);
 1232                 atomic_add_long(&dirtybufspace, bp->b_bufsize);
 1233                 if (bp->b_flags & B_HEAVY) {
 1234                         atomic_add_long(&dirtybufcounthw, 1);
 1235                         atomic_add_long(&dirtybufspacehw, bp->b_bufsize);
 1236                 }
 1237                 bd_heatup();
 1238         }
 1239 }
 1240 
 1241 /*
 1242  * Set B_HEAVY, indicating that this is a heavy-weight buffer that
 1243  * needs to be flushed with a different buf_daemon thread to avoid
 1244  * deadlocks.  B_HEAVY also imposes restrictions in getnewbuf().
 1245  */
 1246 void
 1247 bheavy(struct buf *bp)
 1248 {
 1249         if ((bp->b_flags & B_HEAVY) == 0) {
 1250                 bp->b_flags |= B_HEAVY;
 1251                 if (bp->b_flags & B_DELWRI) {
 1252                         atomic_add_long(&dirtybufcounthw, 1);
 1253                         atomic_add_long(&dirtybufspacehw, bp->b_bufsize);
 1254                 }
 1255         }
 1256 }
 1257 
 1258 /*
 1259  * bundirty:
 1260  *
 1261  *      Clear B_DELWRI for buffer.
 1262  *
 1263  *      Must be called from a critical section.
 1264  *
 1265  *      The buffer is typically on BQUEUE_NONE but there is one case in 
 1266  *      brelse() that calls this function after placing the buffer on
 1267  *      a different queue.
 1268  */
 1269 void
 1270 bundirty(struct buf *bp)
 1271 {
 1272         if (bp->b_flags & B_DELWRI) {
 1273                 lwkt_gettoken(&bp->b_vp->v_token);
 1274                 bp->b_flags &= ~B_DELWRI;
 1275                 reassignbuf(bp);
 1276                 lwkt_reltoken(&bp->b_vp->v_token);
 1277 
 1278                 atomic_add_long(&dirtybufcount, -1);
 1279                 atomic_add_long(&dirtykvaspace, -bp->b_kvasize);
 1280                 atomic_add_long(&dirtybufspace, -bp->b_bufsize);
 1281                 if (bp->b_flags & B_HEAVY) {
 1282                         atomic_add_long(&dirtybufcounthw, -1);
 1283                         atomic_add_long(&dirtybufspacehw, -bp->b_bufsize);
 1284                 }
 1285                 bd_signal(bp->b_bufsize);
 1286         }
 1287         /*
 1288          * Since it is now being written, we can clear its deferred write flag.
 1289          */
 1290         bp->b_flags &= ~B_DEFERRED;
 1291 }
 1292 
 1293 /*
 1294  * Set the b_runningbufspace field, used to track how much I/O is
 1295  * in progress at any given moment.
 1296  */
 1297 void
 1298 bsetrunningbufspace(struct buf *bp, int bytes)
 1299 {
 1300         bp->b_runningbufspace = bytes;
 1301         if (bytes) {
 1302                 atomic_add_long(&runningbufspace, bytes);
 1303                 atomic_add_long(&runningbufcount, 1);
 1304         }
 1305 }
 1306 
 1307 /*
 1308  * brelse:
 1309  *
 1310  *      Release a busy buffer and, if requested, free its resources.  The
 1311  *      buffer will be stashed in the appropriate bufqueue[] allowing it
 1312  *      to be accessed later as a cache entity or reused for other purposes.
 1313  */
 1314 void
 1315 brelse(struct buf *bp)
 1316 {
 1317         struct bufpcpu *pcpu;
 1318 #ifdef INVARIANTS
 1319         int saved_flags = bp->b_flags;
 1320 #endif
 1321 
 1322         KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 1323                 ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 1324 
 1325         /*
 1326          * If B_NOCACHE is set we are being asked to destroy the buffer and
 1327          * its backing store.  Clear B_DELWRI.
 1328          *
 1329          * B_NOCACHE is set in two cases: (1) when the caller really wants
 1330          * to destroy the buffer and backing store and (2) when the caller
 1331          * wants to destroy the buffer and backing store after a write 
 1332          * completes.
 1333          */
 1334         if ((bp->b_flags & (B_NOCACHE|B_DELWRI)) == (B_NOCACHE|B_DELWRI)) {
 1335                 bundirty(bp);
 1336         }
 1337 
 1338         if ((bp->b_flags & (B_INVAL | B_DELWRI)) == B_DELWRI) {
 1339                 /*
 1340                  * A re-dirtied buffer is only subject to destruction
 1341                  * by B_INVAL.  B_ERROR and B_NOCACHE are ignored.
 1342                  */
 1343                 /* leave buffer intact */
 1344         } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
 1345                    (bp->b_bufsize <= 0)) {
 1346                 /*
 1347                  * Either a failed read or we were asked to free or not
 1348                  * cache the buffer.  This path is reached with B_DELWRI
 1349                  * set only if B_INVAL is already set.  B_NOCACHE governs
 1350                  * backing store destruction.
 1351                  *
 1352                  * NOTE: HAMMER will set B_LOCKED in buf_deallocate if the
 1353                  * buffer cannot be immediately freed.
 1354                  */
 1355                 bp->b_flags |= B_INVAL;
 1356                 if (LIST_FIRST(&bp->b_dep) != NULL)
 1357                         buf_deallocate(bp);
 1358                 if (bp->b_flags & B_DELWRI) {
 1359                         atomic_add_long(&dirtybufcount, -1);
 1360                         atomic_add_long(&dirtykvaspace, -bp->b_kvasize);
 1361                         atomic_add_long(&dirtybufspace, -bp->b_bufsize);
 1362                         if (bp->b_flags & B_HEAVY) {
 1363                                 atomic_add_long(&dirtybufcounthw, -1);
 1364                                 atomic_add_long(&dirtybufspacehw,
 1365                                                 -bp->b_bufsize);
 1366                         }
 1367                         bd_signal(bp->b_bufsize);
 1368                 }
 1369                 bp->b_flags &= ~(B_DELWRI | B_CACHE);
 1370         }
 1371 
 1372         /*
 1373          * We must clear B_RELBUF if B_DELWRI or B_LOCKED is set,
 1374          * or if b_refs is non-zero.
 1375          *
 1376          * If vfs_vmio_release() is called with either bit set, the
 1377          * underlying pages may wind up getting freed causing a previous
 1378          * write (bdwrite()) to get 'lost' because pages associated with
 1379          * a B_DELWRI bp are marked clean.  Pages associated with a
 1380          * B_LOCKED buffer may be mapped by the filesystem.
 1381          *
 1382          * If we want to release the buffer ourselves (rather then the
 1383          * originator asking us to release it), give the originator a
 1384          * chance to countermand the release by setting B_LOCKED.
 1385          * 
 1386          * We still allow the B_INVAL case to call vfs_vmio_release(), even
 1387          * if B_DELWRI is set.
 1388          *
 1389          * If B_DELWRI is not set we may have to set B_RELBUF if we are low
 1390          * on pages to return pages to the VM page queues.
 1391          */
 1392         if ((bp->b_flags & (B_DELWRI | B_LOCKED)) || bp->b_refs) {
 1393                 bp->b_flags &= ~B_RELBUF;
 1394         } else if (vm_page_count_min(0)) {
 1395                 if (LIST_FIRST(&bp->b_dep) != NULL)
 1396                         buf_deallocate(bp);             /* can set B_LOCKED */
 1397                 if (bp->b_flags & (B_DELWRI | B_LOCKED))
 1398                         bp->b_flags &= ~B_RELBUF;
 1399                 else
 1400                         bp->b_flags |= B_RELBUF;
 1401         }
 1402 
 1403         /*
 1404          * Make sure b_cmd is clear.  It may have already been cleared by
 1405          * biodone().
 1406          *
 1407          * At this point destroying the buffer is governed by the B_INVAL 
 1408          * or B_RELBUF flags.
 1409          */
 1410         bp->b_cmd = BUF_CMD_DONE;
 1411         dsched_exit_buf(bp);
 1412 
 1413         /*
 1414          * VMIO buffer rundown.  Make sure the VM page array is restored
 1415          * after an I/O may have replaces some of the pages with bogus pages
 1416          * in order to not destroy dirty pages in a fill-in read.
 1417          *
 1418          * Note that due to the code above, if a buffer is marked B_DELWRI
 1419          * then the B_RELBUF and B_NOCACHE bits will always be clear.
 1420          * B_INVAL may still be set, however.
 1421          *
 1422          * For clean buffers, B_INVAL or B_RELBUF will destroy the buffer
 1423          * but not the backing store.   B_NOCACHE will destroy the backing
 1424          * store.
 1425          *
 1426          * Note that dirty NFS buffers contain byte-granular write ranges
 1427          * and should not be destroyed w/ B_INVAL even if the backing store
 1428          * is left intact.
 1429          */
 1430         if (bp->b_flags & B_VMIO) {
 1431                 /*
 1432                  * Rundown for VMIO buffers which are not dirty NFS buffers.
 1433                  */
 1434                 int i, j, resid;
 1435                 vm_page_t m;
 1436                 off_t foff;
 1437                 vm_pindex_t poff;
 1438                 vm_object_t obj;
 1439                 struct vnode *vp;
 1440 
 1441                 vp = bp->b_vp;
 1442 
 1443                 /*
 1444                  * Get the base offset and length of the buffer.  Note that 
 1445                  * in the VMIO case if the buffer block size is not
 1446                  * page-aligned then b_data pointer may not be page-aligned.
 1447                  * But our b_xio.xio_pages array *IS* page aligned.
 1448                  *
 1449                  * block sizes less then DEV_BSIZE (usually 512) are not 
 1450                  * supported due to the page granularity bits (m->valid,
 1451                  * m->dirty, etc...). 
 1452                  *
 1453                  * See man buf(9) for more information
 1454                  */
 1455 
 1456                 resid = bp->b_bufsize;
 1457                 foff = bp->b_loffset;
 1458 
 1459                 for (i = 0; i < bp->b_xio.xio_npages; i++) {
 1460                         m = bp->b_xio.xio_pages[i];
 1461                         vm_page_flag_clear(m, PG_ZERO);
 1462                         /*
 1463                          * If we hit a bogus page, fixup *all* of them
 1464                          * now.  Note that we left these pages wired
 1465                          * when we removed them so they had better exist,
 1466                          * and they cannot be ripped out from under us so
 1467                          * no critical section protection is necessary.
 1468                          */
 1469                         if (m == bogus_page) {
 1470                                 obj = vp->v_object;
 1471                                 poff = OFF_TO_IDX(bp->b_loffset);
 1472 
 1473                                 vm_object_hold(obj);
 1474                                 for (j = i; j < bp->b_xio.xio_npages; j++) {
 1475                                         vm_page_t mtmp;
 1476 
 1477                                         mtmp = bp->b_xio.xio_pages[j];
 1478                                         if (mtmp == bogus_page) {
 1479                                                 mtmp = vm_page_lookup(obj, poff + j);
 1480                                                 if (!mtmp) {
 1481                                                         panic("brelse: page missing");
 1482                                                 }
 1483                                                 bp->b_xio.xio_pages[j] = mtmp;
 1484                                         }
 1485                                 }
 1486                                 bp->b_flags &= ~B_HASBOGUS;
 1487                                 vm_object_drop(obj);
 1488 
 1489                                 if ((bp->b_flags & B_INVAL) == 0) {
 1490                                         pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 1491                                                 bp->b_xio.xio_pages, bp->b_xio.xio_npages);
 1492                                 }
 1493                                 m = bp->b_xio.xio_pages[i];
 1494                         }
 1495 
 1496                         /*
 1497                          * Invalidate the backing store if B_NOCACHE is set
 1498                          * (e.g. used with vinvalbuf()).  If this is NFS
 1499                          * we impose a requirement that the block size be
 1500                          * a multiple of PAGE_SIZE and create a temporary
 1501                          * hack to basically invalidate the whole page.  The
 1502                          * problem is that NFS uses really odd buffer sizes
 1503                          * especially when tracking piecemeal writes and
 1504                          * it also vinvalbuf()'s a lot, which would result
 1505                          * in only partial page validation and invalidation
 1506                          * here.  If the file page is mmap()'d, however,
 1507                          * all the valid bits get set so after we invalidate
 1508                          * here we would end up with weird m->valid values
 1509                          * like 0xfc.  nfs_getpages() can't handle this so
 1510                          * we clear all the valid bits for the NFS case
 1511                          * instead of just some of them.
 1512                          *
 1513                          * The real bug is the VM system having to set m->valid
 1514                          * to VM_PAGE_BITS_ALL for faulted-in pages, which
 1515                          * itself is an artifact of the whole 512-byte
 1516                          * granular mess that exists to support odd block 
 1517                          * sizes and UFS meta-data block sizes (e.g. 6144).
 1518                          * A complete rewrite is required.
 1519                          *
 1520                          * XXX
 1521                          */
 1522                         if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
 1523                                 int poffset = foff & PAGE_MASK;
 1524                                 int presid;
 1525 
 1526                                 presid = PAGE_SIZE - poffset;
 1527                                 if (bp->b_vp->v_tag == VT_NFS &&
 1528                                     bp->b_vp->v_type == VREG) {
 1529                                         ; /* entire page */
 1530                                 } else if (presid > resid) {
 1531                                         presid = resid;
 1532                                 }
 1533                                 KASSERT(presid >= 0, ("brelse: extra page"));
 1534                                 vm_page_set_invalid(m, poffset, presid);
 1535 
 1536                                 /*
 1537                                  * Also make sure any swap cache is removed
 1538                                  * as it is now stale (HAMMER in particular
 1539                                  * uses B_NOCACHE to deal with buffer
 1540                                  * aliasing).
 1541                                  */
 1542                                 swap_pager_unswapped(m);
 1543                         }
 1544                         resid -= PAGE_SIZE - (foff & PAGE_MASK);
 1545                         foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 1546                 }
 1547                 if (bp->b_flags & (B_INVAL | B_RELBUF))
 1548                         vfs_vmio_release(bp);
 1549         } else {
 1550                 /*
 1551                  * Rundown for non-VMIO buffers.
 1552                  */
 1553                 if (bp->b_flags & (B_INVAL | B_RELBUF)) {
 1554                         if (bp->b_bufsize)
 1555                                 allocbuf(bp, 0);
 1556                         KKASSERT (LIST_FIRST(&bp->b_dep) == NULL);
 1557                         if (bp->b_vp)
 1558                                 brelvp(bp);
 1559                 }
 1560         }
 1561                         
 1562         if (bp->b_qindex != BQUEUE_NONE)
 1563                 panic("brelse: free buffer onto another queue???");
 1564         if (BUF_REFCNTNB(bp) > 1) {
 1565                 /* Temporary panic to verify exclusive locking */
 1566                 /* This panic goes away when we allow shared refs */
 1567                 panic("brelse: multiple refs");
 1568                 /* NOT REACHED */
 1569                 return;
 1570         }
 1571 
 1572         /*
 1573          * Figure out the correct queue to place the cleaned up buffer on.
 1574          * Buffers placed in the EMPTY or EMPTYKVA had better already be
 1575          * disassociated from their vnode.
 1576          *
 1577          * Return the buffer to its original pcpu area
 1578          */
 1579         pcpu = &bufpcpu[bp->b_qcpu];
 1580         spin_lock(&pcpu->spin);
 1581 
 1582         if (bp->b_flags & B_LOCKED) {
 1583                 /*
 1584                  * Buffers that are locked are placed in the locked queue
 1585                  * immediately, regardless of their state.
 1586                  */
 1587                 bp->b_qindex = BQUEUE_LOCKED;
 1588                 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
 1589                                   bp, b_freelist);
 1590         } else if (bp->b_bufsize == 0) {
 1591                 /*
 1592                  * Buffers with no memory.  Due to conditionals near the top
 1593                  * of brelse() such buffers should probably already be
 1594                  * marked B_INVAL and disassociated from their vnode.
 1595                  */
 1596                 bp->b_flags |= B_INVAL;
 1597                 KASSERT(bp->b_vp == NULL,
 1598                         ("bp1 %p flags %08x/%08x vnode %p "
 1599                          "unexpectededly still associated!",
 1600                         bp, saved_flags, bp->b_flags, bp->b_vp));
 1601                 KKASSERT((bp->b_flags & B_HASHED) == 0);
 1602                 if (bp->b_kvasize) {
 1603                         bp->b_qindex = BQUEUE_EMPTYKVA;
 1604                 } else {
 1605                         bp->b_qindex = BQUEUE_EMPTY;
 1606                 }
 1607                 TAILQ_INSERT_HEAD(&pcpu->bufqueues[bp->b_qindex],
 1608                                   bp, b_freelist);
 1609         } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) {
 1610                 /*
 1611                  * Buffers with junk contents.   Again these buffers had better
 1612                  * already be disassociated from their vnode.
 1613                  */
 1614                 KASSERT(bp->b_vp == NULL,
 1615                         ("bp2 %p flags %08x/%08x vnode %p unexpectededly "
 1616                          "still associated!",
 1617                         bp, saved_flags, bp->b_flags, bp->b_vp));
 1618                 KKASSERT((bp->b_flags & B_HASHED) == 0);
 1619                 bp->b_flags |= B_INVAL;
 1620                 bp->b_qindex = BQUEUE_CLEAN;
 1621                 TAILQ_INSERT_HEAD(&pcpu->bufqueues[bp->b_qindex],
 1622                                   bp, b_freelist);
 1623         } else {
 1624                 /*
 1625                  * Remaining buffers.  These buffers are still associated with
 1626                  * their vnode.
 1627                  */
 1628                 switch(bp->b_flags & (B_DELWRI|B_HEAVY)) {
 1629                 case B_DELWRI:
 1630                         bp->b_qindex = BQUEUE_DIRTY;
 1631                         TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
 1632                                           bp, b_freelist);
 1633                         break;
 1634                 case B_DELWRI | B_HEAVY:
 1635                         bp->b_qindex = BQUEUE_DIRTY_HW;
 1636                         TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
 1637                                           bp, b_freelist);
 1638                         break;
 1639                 default:
 1640                         /*
 1641                          * NOTE: Buffers are always placed at the end of the
 1642                          * queue.  If B_AGE is not set the buffer will cycle
 1643                          * through the queue twice.
 1644                          */
 1645                         bp->b_qindex = BQUEUE_CLEAN;
 1646                         TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
 1647                                           bp, b_freelist);
 1648                         break;
 1649                 }
 1650         }
 1651         spin_unlock(&pcpu->spin);
 1652 
 1653         /*
 1654          * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
 1655          * on the correct queue but we have not yet unlocked it.
 1656          */
 1657         if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
 1658                 bundirty(bp);
 1659 
 1660         /*
 1661          * The bp is on an appropriate queue unless locked.  If it is not
 1662          * locked or dirty we can wakeup threads waiting for buffer space.
 1663          *
 1664          * We've already handled the B_INVAL case ( B_DELWRI will be clear
 1665          * if B_INVAL is set ).
 1666          */
 1667         if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0)
 1668                 bufcountwakeup();
 1669 
 1670         /*
 1671          * Something we can maybe free or reuse
 1672          */
 1673         if (bp->b_bufsize || bp->b_kvasize)
 1674                 bufspacewakeup();
 1675 
 1676         /*
 1677          * Clean up temporary flags and unlock the buffer.
 1678          */
 1679         bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF | B_DIRECT);
 1680         BUF_UNLOCK(bp);
 1681 }
 1682 
 1683 /*
 1684  * bqrelse:
 1685  *
 1686  *      Release a buffer back to the appropriate queue but do not try to free
 1687  *      it.  The buffer is expected to be used again soon.
 1688  *
 1689  *      bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 1690  *      biodone() to requeue an async I/O on completion.  It is also used when
 1691  *      known good buffers need to be requeued but we think we may need the data
 1692  *      again soon.
 1693  *
 1694  *      XXX we should be able to leave the B_RELBUF hint set on completion.
 1695  */
 1696 void
 1697 bqrelse(struct buf *bp)
 1698 {
 1699         struct bufpcpu *pcpu;
 1700 
 1701         KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 1702                 ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 1703 
 1704         if (bp->b_qindex != BQUEUE_NONE)
 1705                 panic("bqrelse: free buffer onto another queue???");
 1706         if (BUF_REFCNTNB(bp) > 1) {
 1707                 /* do not release to free list */
 1708                 panic("bqrelse: multiple refs");
 1709                 return;
 1710         }
 1711 
 1712         buf_act_advance(bp);
 1713 
 1714         pcpu = &bufpcpu[bp->b_qcpu];
 1715         spin_lock(&pcpu->spin);
 1716 
 1717         if (bp->b_flags & B_LOCKED) {
 1718                 /*
 1719                  * Locked buffers are released to the locked queue.  However,
 1720                  * if the buffer is dirty it will first go into the dirty
 1721                  * queue and later on after the I/O completes successfully it
 1722                  * will be released to the locked queue.
 1723                  */
 1724                 bp->b_qindex = BQUEUE_LOCKED;
 1725                 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
 1726                                   bp, b_freelist);
 1727         } else if (bp->b_flags & B_DELWRI) {
 1728                 bp->b_qindex = (bp->b_flags & B_HEAVY) ?
 1729                                BQUEUE_DIRTY_HW : BQUEUE_DIRTY;
 1730                 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
 1731                                   bp, b_freelist);
 1732         } else if (vm_page_count_min(0)) {
 1733                 /*
 1734                  * We are too low on memory, we have to try to free the
 1735                  * buffer (most importantly: the wired pages making up its
 1736                  * backing store) *now*.
 1737                  */
 1738                 spin_unlock(&pcpu->spin);
 1739                 brelse(bp);
 1740                 return;
 1741         } else {
 1742                 bp->b_qindex = BQUEUE_CLEAN;
 1743                 TAILQ_INSERT_TAIL(&pcpu->bufqueues[bp->b_qindex],
 1744                                   bp, b_freelist);
 1745         }
 1746         spin_unlock(&pcpu->spin);
 1747 
 1748         /*
 1749          * We have now placed the buffer on the proper queue, but have yet
 1750          * to unlock it.
 1751          */
 1752         if ((bp->b_flags & B_LOCKED) == 0 &&
 1753             ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0)) {
 1754                 bufcountwakeup();
 1755         }
 1756 
 1757         /*
 1758          * Something we can maybe free or reuse.
 1759          */
 1760         if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
 1761                 bufspacewakeup();
 1762 
 1763         /*
 1764          * Final cleanup and unlock.  Clear bits that are only used while a
 1765          * buffer is actively locked.
 1766          */
 1767         bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF);
 1768         dsched_exit_buf(bp);
 1769         BUF_UNLOCK(bp);
 1770 }
 1771 
 1772 /*
 1773  * Hold a buffer, preventing it from being reused.  This will prevent
 1774  * normal B_RELBUF operations on the buffer but will not prevent B_INVAL
 1775  * operations.  If a B_INVAL operation occurs the buffer will remain held
 1776  * but the underlying pages may get ripped out.
 1777  *
 1778  * These functions are typically used in VOP_READ/VOP_WRITE functions
 1779  * to hold a buffer during a copyin or copyout, preventing deadlocks
 1780  * or recursive lock panics when read()/write() is used over mmap()'d
 1781  * space.
 1782  *
 1783  * NOTE: bqhold() requires that the buffer be locked at the time of the
 1784  *       hold.  bqdrop() has no requirements other than the buffer having
 1785  *       previously been held.
 1786  */
 1787 void
 1788 bqhold(struct buf *bp)
 1789 {
 1790         atomic_add_int(&bp->b_refs, 1);
 1791 }
 1792 
 1793 void
 1794 bqdrop(struct buf *bp)
 1795 {
 1796         KKASSERT(bp->b_refs > 0);
 1797         atomic_add_int(&bp->b_refs, -1);
 1798 }
 1799 
 1800 /*
 1801  * Return backing pages held by the buffer 'bp' back to the VM system.
 1802  * This routine is called when the bp is invalidated, released, or
 1803  * reused.
 1804  *
 1805  * The KVA mapping (b_data) for the underlying pages is removed by
 1806  * this function.
 1807  *
 1808  * WARNING! This routine is integral to the low memory critical path
 1809  *          when a buffer is B_RELBUF'd.  If the system has a severe page
 1810  *          deficit we need to get the page(s) onto the PQ_FREE or PQ_CACHE
 1811  *          queues so they can be reused in the current pageout daemon
 1812  *          pass.
 1813  */
 1814 static void
 1815 vfs_vmio_release(struct buf *bp)
 1816 {
 1817         int i;
 1818         vm_page_t m;
 1819 
 1820         for (i = 0; i < bp->b_xio.xio_npages; i++) {
 1821                 m = bp->b_xio.xio_pages[i];
 1822                 bp->b_xio.xio_pages[i] = NULL;
 1823 
 1824                 /*
 1825                  * We need to own the page in order to safely unwire it.
 1826                  */
 1827                 vm_page_busy_wait(m, FALSE, "vmiopg");
 1828 
 1829                 /*
 1830                  * The VFS is telling us this is not a meta-data buffer
 1831                  * even if it is backed by a block device.
 1832                  */
 1833                 if (bp->b_flags & B_NOTMETA)
 1834                         vm_page_flag_set(m, PG_NOTMETA);
 1835 
 1836                 /*
 1837                  * This is a very important bit of code.  We try to track
 1838                  * VM page use whether the pages are wired into the buffer
 1839                  * cache or not.  While wired into the buffer cache the
 1840                  * bp tracks the act_count.
 1841                  *
 1842                  * We can choose to place unwired pages on the inactive
 1843                  * queue (0) or active queue (1).  If we place too many
 1844                  * on the active queue the queue will cycle the act_count
 1845                  * on pages we'd like to keep, just from single-use pages
 1846                  * (such as when doing a tar-up or file scan).
 1847                  */
 1848                 if (bp->b_act_count < vm_cycle_point)
 1849                         vm_page_unwire(m, 0);
 1850                 else
 1851                         vm_page_unwire(m, 1);
 1852 
 1853                 /*
 1854                  * If the wire_count has dropped to 0 we may need to take
 1855                  * further action before unbusying the page.
 1856                  *
 1857                  * WARNING: vm_page_try_*() also checks PG_NEED_COMMIT for us.
 1858                  */
 1859                 if (m->wire_count == 0) {
 1860                         vm_page_flag_clear(m, PG_ZERO);
 1861 
 1862                         if (bp->b_flags & B_DIRECT) {
 1863                                 /*
 1864                                  * Attempt to free the page if B_DIRECT is
 1865                                  * set, the caller does not desire the page
 1866                                  * to be cached.
 1867                                  */
 1868                                 vm_page_wakeup(m);
 1869                                 vm_page_try_to_free(m);
 1870                         } else if ((bp->b_flags & B_NOTMETA) ||
 1871                                    vm_page_count_min(0)) {
 1872                                 /*
 1873                                  * Attempt to move the page to PQ_CACHE
 1874                                  * if B_NOTMETA is set.  This flag is set
 1875                                  * by HAMMER to remove one of the two pages
 1876                                  * present when double buffering is enabled.
 1877                                  *
 1878                                  * Attempt to move the page to PQ_CACHE
 1879                                  * If we have a severe page deficit.  This
 1880                                  * will cause buffer cache operations related
 1881                                  * to pageouts to recycle the related pages
 1882                                  * in order to avoid a low memory deadlock.
 1883                                  */
 1884                                 m->act_count = bp->b_act_count;
 1885                                 vm_page_wakeup(m);
 1886                                 vm_page_try_to_cache(m);
 1887                         } else {
 1888                                 /*
 1889                                  * Nominal case, leave the page on the
 1890                                  * queue the original unwiring placed it on
 1891                                  * (active or inactive).
 1892                                  */
 1893                                 m->act_count = bp->b_act_count;
 1894                                 vm_page_wakeup(m);
 1895                         }
 1896                 } else {
 1897                         vm_page_wakeup(m);
 1898                 }
 1899         }
 1900 
 1901         pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
 1902                      bp->b_xio.xio_npages);
 1903         if (bp->b_bufsize) {
 1904                 bufspacewakeup();
 1905                 bp->b_bufsize = 0;
 1906         }
 1907         bp->b_xio.xio_npages = 0;
 1908         bp->b_flags &= ~B_VMIO;
 1909         KKASSERT (LIST_FIRST(&bp->b_dep) == NULL);
 1910         if (bp->b_vp)
 1911                 brelvp(bp);
 1912 }
 1913 
 1914 /*
 1915  * Find and initialize a new buffer header, freeing up existing buffers
 1916  * in the bufqueues as necessary.  The new buffer is returned locked.
 1917  *
 1918  * Important:  B_INVAL is not set.  If the caller wishes to throw the
 1919  * buffer away, the caller must set B_INVAL prior to calling brelse().
 1920  *
 1921  * We block if:
 1922  *      We have insufficient buffer headers
 1923  *      We have insufficient buffer space
 1924  *      buffer_map is too fragmented ( space reservation fails )
 1925  *      If we have to flush dirty buffers ( but we try to avoid this )
 1926  *
 1927  * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
 1928  * Instead we ask the buf daemon to do it for us.  We attempt to
 1929  * avoid piecemeal wakeups of the pageout daemon.
 1930  */
 1931 struct buf *
 1932 getnewbuf(int blkflags, int slptimeo, int size, int maxsize)
 1933 {
 1934         struct bufpcpu *pcpu;
 1935         struct buf *bp;
 1936         struct buf *nbp;
 1937         int defrag = 0;
 1938         int nqindex;
 1939         int nqcpu;
 1940         int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0;
 1941         int maxloops = 200000;
 1942         int restart_reason = 0;
 1943         struct buf *restart_bp = NULL;
 1944         static int flushingbufs;
 1945 
 1946         /*
 1947          * We can't afford to block since we might be holding a vnode lock,
 1948          * which may prevent system daemons from running.  We deal with
 1949          * low-memory situations by proactively returning memory and running
 1950          * async I/O rather then sync I/O.
 1951          */
 1952         
 1953         ++getnewbufcalls;
 1954         --getnewbufrestarts;
 1955         nqcpu = mycpu->gd_cpuid;
 1956 restart:
 1957         ++getnewbufrestarts;
 1958 
 1959         if (debug_bufbio && --maxloops == 0)
 1960                 panic("getnewbuf, excessive loops on cpu %d restart %d (%p)",
 1961                         mycpu->gd_cpuid, restart_reason, restart_bp);
 1962 
 1963         /*
 1964          * Setup for scan.  If we do not have enough free buffers,
 1965          * we setup a degenerate case that immediately fails.  Note
 1966          * that if we are specially marked process, we are allowed to
 1967          * dip into our reserves.
 1968          *
 1969          * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
 1970          *
 1971          * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
 1972          * However, there are a number of cases (defragging, reusing, ...)
 1973          * where we cannot backup.
 1974          */
 1975         pcpu = &bufpcpu[nqcpu];
 1976         nqindex = BQUEUE_EMPTYKVA;
 1977         spin_lock(&pcpu->spin);
 1978 
 1979         nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_EMPTYKVA]);
 1980 
 1981         if (nbp == NULL) {
 1982                 /*
 1983                  * If no EMPTYKVA buffers and we are either
 1984                  * defragging or reusing, locate a CLEAN buffer
 1985                  * to free or reuse.  If bufspace useage is low
 1986                  * skip this step so we can allocate a new buffer.
 1987                  */
 1988                 if (defrag || bufspace >= lobufspace) {
 1989                         nqindex = BQUEUE_CLEAN;
 1990                         nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_CLEAN]);
 1991                 }
 1992 
 1993                 /*
 1994                  * If we could not find or were not allowed to reuse a
 1995                  * CLEAN buffer, check to see if it is ok to use an EMPTY
 1996                  * buffer.  We can only use an EMPTY buffer if allocating
 1997                  * its KVA would not otherwise run us out of buffer space.
 1998                  */
 1999                 if (nbp == NULL && defrag == 0 &&
 2000                     bufspace + maxsize < hibufspace) {
 2001                         nqindex = BQUEUE_EMPTY;
 2002                         nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_EMPTY]);
 2003                 }
 2004         }
 2005 
 2006         /*
 2007          * Run scan, possibly freeing data and/or kva mappings on the fly
 2008          * depending.
 2009          *
 2010          * WARNING! spin is held!
 2011          */
 2012         while ((bp = nbp) != NULL) {
 2013                 int qindex = nqindex;
 2014 
 2015                 nbp = TAILQ_NEXT(bp, b_freelist);
 2016 
 2017                 /*
 2018                  * BQUEUE_CLEAN - B_AGE special case.  If not set the bp
 2019                  * cycles through the queue twice before being selected.
 2020                  */
 2021                 if (qindex == BQUEUE_CLEAN && 
 2022                     (bp->b_flags & B_AGE) == 0 && nbp) {
 2023                         bp->b_flags |= B_AGE;
 2024                         TAILQ_REMOVE(&pcpu->bufqueues[qindex],
 2025                                      bp, b_freelist);
 2026                         TAILQ_INSERT_TAIL(&pcpu->bufqueues[qindex],
 2027                                           bp, b_freelist);
 2028                         continue;
 2029                 }
 2030 
 2031                 /*
 2032                  * Calculate next bp ( we can only use it if we do not block
 2033                  * or do other fancy things ).
 2034                  */
 2035                 if (nbp == NULL) {
 2036                         switch(qindex) {
 2037                         case BQUEUE_EMPTY:
 2038                                 nqindex = BQUEUE_EMPTYKVA;
 2039                                 if ((nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_EMPTYKVA])))
 2040                                         break;
 2041                                 /* fall through */
 2042                         case BQUEUE_EMPTYKVA:
 2043                                 nqindex = BQUEUE_CLEAN;
 2044                                 if ((nbp = TAILQ_FIRST(&pcpu->bufqueues[BQUEUE_CLEAN])))
 2045                                         break;
 2046                                 /* fall through */
 2047                         case BQUEUE_CLEAN:
 2048                                 /*
 2049                                  * nbp is NULL. 
 2050                                  */
 2051                                 break;
 2052                         }
 2053                 }
 2054 
 2055                 /*
 2056                  * Sanity Checks
 2057                  */
 2058                 KASSERT(bp->b_qindex == qindex,
 2059                         ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 2060 
 2061                 /*
 2062                  * Note: we no longer distinguish between VMIO and non-VMIO
 2063                  * buffers.
 2064                  */
 2065                 KASSERT((bp->b_flags & B_DELWRI) == 0,
 2066                         ("delwri buffer %p found in queue %d", bp, qindex));
 2067 
 2068                 /*
 2069                  * Do not try to reuse a buffer with a non-zero b_refs.
 2070                  * This is an unsynchronized test.  A synchronized test
 2071                  * is also performed after we lock the buffer.
 2072                  */
 2073                 if (bp->b_refs)
 2074                         continue;
 2075 
 2076                 /*
 2077                  * If we are defragging then we need a buffer with 
 2078                  * b_kvasize != 0.  XXX this situation should no longer
 2079                  * occur, if defrag is non-zero the buffer's b_kvasize
 2080                  * should also be non-zero at this point.  XXX
 2081                  */
 2082                 if (defrag && bp->b_kvasize == 0) {
 2083                         kprintf("Warning: defrag empty buffer %p\n", bp);
 2084                         continue;
 2085                 }
 2086 
 2087                 /*
 2088                  * Start freeing the bp.  This is somewhat involved.  nbp
 2089                  * remains valid only for BQUEUE_EMPTY[KVA] bp's.  Buffers
 2090                  * on the clean list must be disassociated from their 
 2091                  * current vnode.  Buffers on the empty[kva] lists have
 2092                  * already been disassociated.
 2093                  *
 2094                  * b_refs is checked after locking along with queue changes.
 2095                  * We must check here to deal with zero->nonzero transitions
 2096                  * made by the owner of the buffer lock, which is used by
 2097                  * VFS's to hold the buffer while issuing an unlocked
 2098                  * uiomove()s.  We cannot invalidate the buffer's pages
 2099                  * for this case.  Once we successfully lock a buffer the
 2100                  * only 0->1 transitions of b_refs will occur via findblk().
 2101                  *
 2102                  * We must also check for queue changes after successful
 2103                  * locking as the current lock holder may dispose of the
 2104                  * buffer and change its queue.
 2105                  */
 2106                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 2107                         spin_unlock(&pcpu->spin);
 2108                         tsleep(&bd_request, 0, "gnbxxx", (hz + 99) / 100);
 2109                         restart_reason = 1;
 2110                         restart_bp = bp;
 2111                         goto restart;
 2112                 }
 2113                 if (bp->b_qindex != qindex || bp->b_refs) {
 2114                         spin_unlock(&pcpu->spin);
 2115                         BUF_UNLOCK(bp);
 2116                         restart_reason = 2;
 2117                         restart_bp = bp;
 2118                         goto restart;
 2119                 }
 2120                 bremfree_locked(bp);
 2121                 spin_unlock(&pcpu->spin);
 2122 
 2123                 /*
 2124                  * Dependancies must be handled before we disassociate the
 2125                  * vnode.
 2126                  *
 2127                  * NOTE: HAMMER will set B_LOCKED if the buffer cannot
 2128                  * be immediately disassociated.  HAMMER then becomes
 2129                  * responsible for releasing the buffer.
 2130                  *
 2131                  * NOTE: spin is UNLOCKED now.
 2132                  */
 2133                 if (LIST_FIRST(&bp->b_dep) != NULL) {
 2134                         buf_deallocate(bp);
 2135                         if (bp->b_flags & B_LOCKED) {
 2136                                 bqrelse(bp);
 2137                                 restart_reason = 3;
 2138                                 restart_bp = bp;
 2139                                 goto restart;
 2140                         }
 2141                         KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
 2142                 }
 2143 
 2144                 if (qindex == BQUEUE_CLEAN) {
 2145                         if (bp->b_flags & B_VMIO)
 2146                                 vfs_vmio_release(bp);
 2147                         if (bp->b_vp)
 2148                                 brelvp(bp);
 2149                 }
 2150 
 2151                 /*
 2152                  * NOTE:  nbp is now entirely invalid.  We can only restart
 2153                  * the scan from this point on.
 2154                  *
 2155                  * Get the rest of the buffer freed up.  b_kva* is still
 2156                  * valid after this operation.
 2157                  */
 2158                 KASSERT(bp->b_vp == NULL,
 2159                         ("bp3 %p flags %08x vnode %p qindex %d "
 2160                          "unexpectededly still associated!",
 2161                          bp, bp->b_flags, bp->b_vp, qindex));
 2162                 KKASSERT((bp->b_flags & B_HASHED) == 0);
 2163 
 2164                 /*
 2165                  * critical section protection is not required when
 2166                  * scrapping a buffer's contents because it is already 
 2167                  * wired.
 2168                  */
 2169                 if (bp->b_bufsize)
 2170                         allocbuf(bp, 0);
 2171 
 2172                 bp->b_flags = B_BNOCLIP;
 2173                 bp->b_cmd = BUF_CMD_DONE;
 2174                 bp->b_vp = NULL;
 2175                 bp->b_error = 0;
 2176                 bp->b_resid = 0;
 2177                 bp->b_bcount = 0;
 2178                 bp->b_xio.xio_npages = 0;
 2179                 bp->b_dirtyoff = bp->b_dirtyend = 0;
 2180                 bp->b_act_count = ACT_INIT;
 2181                 reinitbufbio(bp);
 2182                 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
 2183                 buf_dep_init(bp);
 2184                 if (blkflags & GETBLK_BHEAVY)
 2185                         bp->b_flags |= B_HEAVY;
 2186 
 2187                 /*
 2188                  * If we are defragging then free the buffer.
 2189                  */
 2190                 if (defrag) {
 2191                         bp->b_flags |= B_INVAL;
 2192                         bfreekva(bp);
 2193                         brelse(bp);
 2194                         defrag = 0;
 2195                         restart_reason = 4;
 2196                         restart_bp = bp;
 2197                         goto restart;
 2198                 }
 2199 
 2200                 /*
 2201                  * If we are overcomitted then recover the buffer and its
 2202                  * KVM space.  This occurs in rare situations when multiple
 2203                  * processes are blocked in getnewbuf() or allocbuf().
 2204                  *
 2205                  * On 64-bit systems BKVASIZE == MAXBSIZE and overcommit
 2206                  * should not be possible.
 2207                  */
 2208                 if (bufspace >= hibufspace)
 2209                         flushingbufs = 1;
 2210                 if (BKVASIZE != MAXBSIZE) {
 2211                         if (flushingbufs && bp->b_kvasize != 0) {
 2212                                 bp->b_flags |= B_INVAL;
 2213                                 bfreekva(bp);
 2214                                 brelse(bp);
 2215                                 restart_reason = 5;
 2216                                 restart_bp = bp;
 2217                                 goto restart;
 2218                         }
 2219                 }
 2220                 if (bufspace < lobufspace)
 2221                         flushingbufs = 0;
 2222 
 2223                 /*
 2224                  * b_refs can transition to a non-zero value while we hold
 2225                  * the buffer locked due to a findblk().  Our brelvp() above
 2226                  * interlocked any future possible transitions due to
 2227                  * findblk()s.
 2228                  *
 2229                  * If we find b_refs to be non-zero we can destroy the
 2230                  * buffer's contents but we cannot yet reuse the buffer.
 2231                  */
 2232                 if (bp->b_refs) {
 2233                         bp->b_flags |= B_INVAL;
 2234                         if (BKVASIZE != MAXBSIZE)
 2235                                 bfreekva(bp);
 2236                         brelse(bp);
 2237                         restart_reason = 6;
 2238                         restart_bp = bp;
 2239                         goto restart;
 2240                 }
 2241                 break;
 2242                 /* NOT REACHED, spin not held */
 2243         }
 2244 
 2245         /*
 2246          * If we exhausted our list, iterate other cpus.  If that fails,
 2247          * sleep as appropriate.  We may have to wakeup various daemons
 2248          * and write out some dirty buffers.
 2249          *
 2250          * Generally we are sleeping due to insufficient buffer space.
 2251          *
 2252          * NOTE: spin is held if bp is NULL, else it is not held.
 2253          */
 2254         if (bp == NULL) {
 2255                 int flags;
 2256                 char *waitmsg;
 2257 
 2258                 spin_unlock(&pcpu->spin);
 2259 
 2260                 nqcpu = (nqcpu + 1) % ncpus;
 2261                 if (nqcpu != mycpu->gd_cpuid) {
 2262                         restart_reason = 7;
 2263                         restart_bp = bp;
 2264                         goto restart;
 2265                 }
 2266 
 2267                 if (defrag) {
 2268                         flags = VFS_BIO_NEED_BUFSPACE;
 2269                         waitmsg = "nbufkv";
 2270                 } else if (bufspace >= hibufspace) {
 2271                         waitmsg = "nbufbs";
 2272                         flags = VFS_BIO_NEED_BUFSPACE;
 2273                 } else {
 2274                         waitmsg = "newbuf";
 2275                         flags = VFS_BIO_NEED_ANY;
 2276                 }
 2277 
 2278                 bd_speedup();   /* heeeelp */
 2279                 atomic_set_int(&needsbuffer, flags);
 2280                 while (needsbuffer & flags) {
 2281                         int value;
 2282 
 2283                         tsleep_interlock(&needsbuffer, 0);
 2284                         value = atomic_fetchadd_int(&needsbuffer, 0);
 2285                         if (value & flags) {
 2286                                 if (tsleep(&needsbuffer, PINTERLOCKED|slpflags,
 2287                                            waitmsg, slptimeo)) {
 2288                                         return (NULL);
 2289                                 }
 2290                         }
 2291                 }
 2292         } else {
 2293                 /*
 2294                  * We finally have a valid bp.  We aren't quite out of the
 2295                  * woods, we still have to reserve kva space.  In order
 2296                  * to keep fragmentation sane we only allocate kva in
 2297                  * BKVASIZE chunks.
 2298                  *
 2299                  * (spin is not held)
 2300                  */
 2301                 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 2302 
 2303                 if (maxsize != bp->b_kvasize) {
 2304                         vm_offset_t addr = 0;
 2305                         int count;
 2306 
 2307                         bfreekva(bp);
 2308 
 2309                         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 2310                         vm_map_lock(&buffer_map);
 2311 
 2312                         if (vm_map_findspace(&buffer_map,
 2313                                     vm_map_min(&buffer_map), maxsize,
 2314                                     maxsize, 0, &addr)) {
 2315                                 /*
 2316                                  * Uh oh.  Buffer map is too fragmented.  We
 2317                                  * must defragment the map.
 2318                                  */
 2319                                 vm_map_unlock(&buffer_map);
 2320                                 vm_map_entry_release(count);
 2321                                 ++bufdefragcnt;
 2322                                 defrag = 1;
 2323                                 bp->b_flags |= B_INVAL;
 2324                                 brelse(bp);
 2325                                 restart_reason = 8;
 2326                                 restart_bp = bp;
 2327                                 goto restart;
 2328                         }
 2329                         if (addr) {
 2330                                 vm_map_insert(&buffer_map, &count,
 2331                                         NULL, 0,
 2332                                         addr, addr + maxsize,
 2333                                         VM_MAPTYPE_NORMAL,
 2334                                         VM_PROT_ALL, VM_PROT_ALL,
 2335                                         MAP_NOFAULT);
 2336 
 2337                                 bp->b_kvabase = (caddr_t) addr;
 2338                                 bp->b_kvasize = maxsize;
 2339                                 bufspace += bp->b_kvasize;
 2340                                 ++bufreusecnt;
 2341                         }
 2342                         vm_map_unlock(&buffer_map);
 2343                         vm_map_entry_release(count);
 2344                 }
 2345                 bp->b_data = bp->b_kvabase;
 2346         }
 2347         return(bp);
 2348 }
 2349 
 2350 /*
 2351  * buf_daemon:
 2352  *
 2353  *      Buffer flushing daemon.  Buffers are normally flushed by the
 2354  *      update daemon but if it cannot keep up this process starts to
 2355  *      take the load in an attempt to prevent getnewbuf() from blocking.
 2356  *
 2357  *      Once a flush is initiated it does not stop until the number
 2358  *      of buffers falls below lodirtybuffers, but we will wake up anyone
 2359  *      waiting at the mid-point.
 2360  */
 2361 static struct kproc_desc buf_kp = {
 2362         "bufdaemon",
 2363         buf_daemon,
 2364         &bufdaemon_td
 2365 };
 2366 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST,
 2367         kproc_start, &buf_kp)
 2368 
 2369 static struct kproc_desc bufhw_kp = {
 2370         "bufdaemon_hw",
 2371         buf_daemon_hw,
 2372         &bufdaemonhw_td
 2373 };
 2374 SYSINIT(bufdaemon_hw, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST,
 2375         kproc_start, &bufhw_kp)
 2376 
 2377 static void
 2378 buf_daemon1(struct thread *td, int queue, int (*buf_limit_fn)(long), 
 2379             int *bd_req)
 2380 {
 2381         long limit;
 2382         struct buf *marker;
 2383 
 2384         marker = kmalloc(sizeof(*marker), M_BIOBUF, M_WAITOK | M_ZERO);
 2385         marker->b_flags |= B_MARKER;
 2386         marker->b_qindex = BQUEUE_NONE;
 2387         marker->b_qcpu = 0;
 2388 
 2389         /*
 2390          * This process needs to be suspended prior to shutdown sync.
 2391          */
 2392         EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
 2393                               td, SHUTDOWN_PRI_LAST);
 2394         curthread->td_flags |= TDF_SYSTHREAD;
 2395 
 2396         /*
 2397          * This process is allowed to take the buffer cache to the limit
 2398          */
 2399         for (;;) {
 2400                 kproc_suspend_loop();
 2401 
 2402                 /*
 2403                  * Do the flush as long as the number of dirty buffers
 2404                  * (including those running) exceeds lodirtybufspace.
 2405                  *
 2406                  * When flushing limit running I/O to hirunningspace
 2407                  * Do the flush.  Limit the amount of in-transit I/O we
 2408                  * allow to build up, otherwise we would completely saturate
 2409                  * the I/O system.  Wakeup any waiting processes before we
 2410                  * normally would so they can run in parallel with our drain.
 2411                  *
 2412                  * Our aggregate normal+HW lo water mark is lodirtybufspace,
 2413                  * but because we split the operation into two threads we
 2414                  * have to cut it in half for each thread.
 2415                  */
 2416                 waitrunningbufspace();
 2417                 limit = lodirtybufspace / 2;
 2418                 while (buf_limit_fn(limit)) {
 2419                         if (flushbufqueues(marker, queue) == 0)
 2420                                 break;
 2421                         if (runningbufspace < hirunningspace)
 2422                                 continue;
 2423                         waitrunningbufspace();
 2424                 }
 2425 
 2426                 /*
 2427                  * We reached our low water mark, reset the
 2428                  * request and sleep until we are needed again.
 2429                  * The sleep is just so the suspend code works.
 2430                  */
 2431                 tsleep_interlock(bd_req, 0);
 2432                 if (atomic_swap_int(bd_req, 0) == 0)
 2433                         tsleep(bd_req, PINTERLOCKED, "psleep", hz);
 2434         }
 2435         /* NOT REACHED */
 2436         /*kfree(marker, M_BIOBUF);*/
 2437 }
 2438 
 2439 static int
 2440 buf_daemon_limit(long limit)
 2441 {
 2442         return (runningbufspace + dirtykvaspace > limit ||
 2443                 dirtybufcount - dirtybufcounthw >= nbuf / 2);
 2444 }
 2445 
 2446 static int
 2447 buf_daemon_hw_limit(long limit)
 2448 {
 2449         return (runningbufspace + dirtykvaspace > limit ||
 2450                 dirtybufcounthw >= nbuf / 2);
 2451 }
 2452 
 2453 static void
 2454 buf_daemon(void)
 2455 {
 2456         buf_daemon1(bufdaemon_td, BQUEUE_DIRTY, buf_daemon_limit, 
 2457                     &bd_request);
 2458 }
 2459 
 2460 static void
 2461 buf_daemon_hw(void)
 2462 {
 2463         buf_daemon1(bufdaemonhw_td, BQUEUE_DIRTY_HW, buf_daemon_hw_limit,
 2464                     &bd_request_hw);
 2465 }
 2466 
 2467 /*
 2468  * flushbufqueues:
 2469  *
 2470  *      Try to flush a buffer in the dirty queue.  We must be careful to
 2471  *      free up B_INVAL buffers instead of write them, which NFS is 
 2472  *      particularly sensitive to.
 2473  *
 2474  *      B_RELBUF may only be set by VFSs.  We do set B_AGE to indicate
 2475  *      that we really want to try to get the buffer out and reuse it
 2476  *      due to the write load on the machine.
 2477  *
 2478  *      We must lock the buffer in order to check its validity before we
 2479  *      can mess with its contents.  spin isn't enough.
 2480  */
 2481 static int
 2482 flushbufqueues(struct buf *marker, bufq_type_t q)
 2483 {
 2484         struct bufpcpu *pcpu;
 2485         struct buf *bp;
 2486         int r = 0;
 2487         int lcpu = marker->b_qcpu;
 2488 
 2489         KKASSERT(marker->b_qindex == BQUEUE_NONE);
 2490         KKASSERT(marker->b_flags & B_MARKER);
 2491 
 2492 again:
 2493         /*
 2494          * Spinlock needed to perform operations on the queue and may be
 2495          * held through a non-blocking BUF_LOCK(), but cannot be held when
 2496          * BUF_UNLOCK()ing or through any other major operation.
 2497          */
 2498         pcpu = &bufpcpu[marker->b_qcpu];
 2499         spin_lock(&pcpu->spin);
 2500         marker->b_qindex = q;
 2501         TAILQ_INSERT_HEAD(&pcpu->bufqueues[q], marker, b_freelist);
 2502         bp = marker;
 2503 
 2504         while ((bp = TAILQ_NEXT(bp, b_freelist)) != NULL) {
 2505                 /*
 2506                  * NOTE: spinlock is always held at the top of the loop
 2507                  */
 2508                 if (bp->b_flags & B_MARKER)
 2509                         continue;
 2510                 if ((bp->b_flags & B_DELWRI) == 0) {
 2511                         kprintf("Unexpected clean buffer %p\n", bp);
 2512                         continue;
 2513                 }
 2514                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
 2515                         continue;
 2516                 KKASSERT(bp->b_qcpu == marker->b_qcpu && bp->b_qindex == q);
 2517 
 2518                 /*
 2519                  * Once the buffer is locked we will have no choice but to
 2520                  * unlock the spinlock around a later BUF_UNLOCK and re-set
 2521                  * bp = marker when looping.  Move the marker now to make
 2522                  * things easier.
 2523                  */
 2524                 TAILQ_REMOVE(&pcpu->bufqueues[q], marker, b_freelist);
 2525                 TAILQ_INSERT_AFTER(&pcpu->bufqueues[q], bp, marker, b_freelist);
 2526 
 2527                 /*
 2528                  * Must recheck B_DELWRI after successfully locking
 2529                  * the buffer.
 2530                  */
 2531                 if ((bp->b_flags & B_DELWRI) == 0) {
 2532                         spin_unlock(&pcpu->spin);
 2533                         BUF_UNLOCK(bp);
 2534                         spin_lock(&pcpu->spin);
 2535                         bp = marker;
 2536                         continue;
 2537                 }
 2538 
 2539                 /*
 2540                  * Remove the buffer from its queue.  We still own the
 2541                  * spinlock here.
 2542                  */
 2543                 _bremfree(bp);
 2544 
 2545                 /*
 2546                  * Disposing of an invalid buffer counts as a flush op
 2547                  */
 2548                 if (bp->b_flags & B_INVAL) {
 2549                         spin_unlock(&pcpu->spin);
 2550                         brelse(bp);
 2551                         spin_lock(&pcpu->spin);
 2552                         ++r;
 2553                         break;
 2554                 }
 2555 
 2556                 /*
 2557                  * Release the spinlock for the more complex ops we
 2558                  * are now going to do.
 2559                  */
 2560                 spin_unlock(&pcpu->spin);
 2561                 lwkt_yield();
 2562 
 2563                 /*
 2564                  * This is a bit messy
 2565                  */
 2566                 if (LIST_FIRST(&bp->b_dep) != NULL &&
 2567                     (bp->b_flags & B_DEFERRED) == 0 &&
 2568                     buf_countdeps(bp, 0)) {
 2569                         spin_lock(&pcpu->spin);
 2570                         TAILQ_INSERT_TAIL(&pcpu->bufqueues[q], bp, b_freelist);
 2571                         bp->b_qindex = q;
 2572                         bp->b_flags |= B_DEFERRED;
 2573                         spin_unlock(&pcpu->spin);
 2574                         BUF_UNLOCK(bp);
 2575                         spin_lock(&pcpu->spin);
 2576                         bp = marker;
 2577                         continue;
 2578                 }
 2579 
 2580                 /*
 2581                  * spinlock not held here.
 2582                  *
 2583                  * If the buffer has a dependancy, buf_checkwrite() must
 2584                  * also return 0 for us to be able to initate the write.
 2585                  *
 2586                  * If the buffer is flagged B_ERROR it may be requeued
 2587                  * over and over again, we try to avoid a live lock.
 2588                  */
 2589                 if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) {
 2590                         brelse(bp);
 2591                 } else if (bp->b_flags & B_ERROR) {
 2592                         tsleep(bp, 0, "bioer", 1);
 2593                         bp->b_flags &= ~B_AGE;
 2594                         cluster_awrite(bp);
 2595                 } else {
 2596                         bp->b_flags |= B_AGE;
 2597                         cluster_awrite(bp);
 2598                 }
 2599                 spin_lock(&pcpu->spin);
 2600                 ++r;
 2601                 break;
 2602         }
 2603 
 2604         TAILQ_REMOVE(&pcpu->bufqueues[q], marker, b_freelist);
 2605         marker->b_qindex = BQUEUE_NONE;
 2606         spin_unlock(&pcpu->spin);
 2607 
 2608         /*
 2609          * Advance the marker to be fair.
 2610          */
 2611         marker->b_qcpu = (marker->b_qcpu + 1) % ncpus;
 2612         if (bp == NULL) {
 2613                 if (marker->b_qcpu != lcpu)
 2614                         goto again;
 2615         }
 2616 
 2617         return (r);
 2618 }
 2619 
 2620 /*
 2621  * inmem:
 2622  *
 2623  *      Returns true if no I/O is needed to access the associated VM object.
 2624  *      This is like findblk except it also hunts around in the VM system for
 2625  *      the data.
 2626  *
 2627  *      Note that we ignore vm_page_free() races from interrupts against our
 2628  *      lookup, since if the caller is not protected our return value will not
 2629  *      be any more valid then otherwise once we exit the critical section.
 2630  */
 2631 int
 2632 inmem(struct vnode *vp, off_t loffset)
 2633 {
 2634         vm_object_t obj;
 2635         vm_offset_t toff, tinc, size;
 2636         vm_page_t m;
 2637         int res = 1;
 2638 
 2639         if (findblk(vp, loffset, FINDBLK_TEST))
 2640                 return 1;
 2641         if (vp->v_mount == NULL)
 2642                 return 0;
 2643         if ((obj = vp->v_object) == NULL)
 2644                 return 0;
 2645 
 2646         size = PAGE_SIZE;
 2647         if (size > vp->v_mount->mnt_stat.f_iosize)
 2648                 size = vp->v_mount->mnt_stat.f_iosize;
 2649 
 2650         vm_object_hold(obj);
 2651         for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 2652                 m = vm_page_lookup(obj, OFF_TO_IDX(loffset + toff));
 2653                 if (m == NULL) {
 2654                         res = 0;
 2655                         break;
 2656                 }
 2657                 tinc = size;
 2658                 if (tinc > PAGE_SIZE - ((toff + loffset) & PAGE_MASK))
 2659                         tinc = PAGE_SIZE - ((toff + loffset) & PAGE_MASK);
 2660                 if (vm_page_is_valid(m,
 2661                     (vm_offset_t) ((toff + loffset) & PAGE_MASK), tinc) == 0) {
 2662                         res = 0;
 2663                         break;
 2664                 }
 2665         }
 2666         vm_object_drop(obj);
 2667         return (res);
 2668 }
 2669 
 2670 /*
 2671  * findblk:
 2672  *
 2673  *      Locate and return the specified buffer.  Unless flagged otherwise,
 2674  *      a locked buffer will be returned if it exists or NULL if it does not.
 2675  *
 2676  *      findblk()'d buffers are still on the bufqueues and if you intend
 2677  *      to use your (locked NON-TEST) buffer you need to bremfree(bp)
 2678  *      and possibly do other stuff to it.
 2679  *
 2680  *      FINDBLK_TEST    - Do not lock the buffer.  The caller is responsible
 2681  *                        for locking the buffer and ensuring that it remains
 2682  *                        the desired buffer after locking.
 2683  *
 2684  *      FINDBLK_NBLOCK  - Lock the buffer non-blocking.  If we are unable
 2685  *                        to acquire the lock we return NULL, even if the
 2686  *                        buffer exists.
 2687  *
 2688  *      FINDBLK_REF     - Returns the buffer ref'd, which prevents normal
 2689  *                        reuse by getnewbuf() but does not prevent
 2690  *                        disassociation (B_INVAL).  Used to avoid deadlocks
 2691  *                        against random (vp,loffset)s due to reassignment.
 2692  *
 2693  *      (0)             - Lock the buffer blocking.
 2694  */
 2695 struct buf *
 2696 findblk(struct vnode *vp, off_t loffset, int flags)
 2697 {
 2698         struct buf *bp;
 2699         int lkflags;
 2700 
 2701         lkflags = LK_EXCLUSIVE;
 2702         if (flags & FINDBLK_NBLOCK)
 2703                 lkflags |= LK_NOWAIT;
 2704 
 2705         for (;;) {
 2706                 /*
 2707                  * Lookup.  Ref the buf while holding v_token to prevent
 2708                  * reuse (but does not prevent diassociation).
 2709                  */
 2710                 lwkt_gettoken_shared(&vp->v_token);
 2711                 bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset);
 2712                 if (bp == NULL) {
 2713                         lwkt_reltoken(&vp->v_token);
 2714                         return(NULL);
 2715                 }
 2716                 bqhold(bp);
 2717                 lwkt_reltoken(&vp->v_token);
 2718 
 2719                 /*
 2720                  * If testing only break and return bp, do not lock.
 2721                  */
 2722                 if (flags & FINDBLK_TEST)
 2723                         break;
 2724 
 2725                 /*
 2726                  * Lock the buffer, return an error if the lock fails.
 2727                  * (only FINDBLK_NBLOCK can cause the lock to fail).
 2728                  */
 2729                 if (BUF_LOCK(bp, lkflags)) {
 2730                         atomic_subtract_int(&bp->b_refs, 1);
 2731                         /* bp = NULL; not needed */
 2732                         return(NULL);
 2733                 }
 2734 
 2735                 /*
 2736                  * Revalidate the locked buf before allowing it to be
 2737                  * returned.
 2738                  */
 2739                 if (bp->b_vp == vp && bp->b_loffset == loffset)
 2740                         break;
 2741                 atomic_subtract_int(&bp->b_refs, 1);
 2742                 BUF_UNLOCK(bp);
 2743         }
 2744 
 2745         /*
 2746          * Success
 2747          */
 2748         if ((flags & FINDBLK_REF) == 0)
 2749                 atomic_subtract_int(&bp->b_refs, 1);
 2750         return(bp);
 2751 }
 2752 
 2753 /*
 2754  * getcacheblk:
 2755  *
 2756  *      Similar to getblk() except only returns the buffer if it is
 2757  *      B_CACHE and requires no other manipulation.  Otherwise NULL
 2758  *      is returned.
 2759  *
 2760  *      If B_RAM is set the buffer might be just fine, but we return
 2761  *      NULL anyway because we want the code to fall through to the
 2762  *      cluster read.  Otherwise read-ahead breaks.
 2763  *
 2764  *      If blksize is 0 the buffer cache buffer must already be fully
 2765  *      cached.
 2766  *
 2767  *      If blksize is non-zero getblk() will be used, allowing a buffer
 2768  *      to be reinstantiated from its VM backing store.  The buffer must
 2769  *      still be fully cached after reinstantiation to be returned.
 2770  */
 2771 struct buf *
 2772 getcacheblk(struct vnode *vp, off_t loffset, int blksize, int blkflags)
 2773 {
 2774         struct buf *bp;
 2775         int fndflags = (blkflags & GETBLK_NOWAIT) ? FINDBLK_NBLOCK : 0;
 2776 
 2777         if (blksize) {
 2778                 bp = getblk(vp, loffset, blksize, blkflags, 0);
 2779                 if (bp) {
 2780                         if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) ==
 2781                             B_CACHE) {
 2782                                 bp->b_flags &= ~B_AGE;
 2783                         } else {
 2784                                 brelse(bp);
 2785                                 bp = NULL;
 2786                         }
 2787                 }
 2788         } else {
 2789                 bp = findblk(vp, loffset, fndflags);
 2790                 if (bp) {
 2791                         if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) ==
 2792                             B_CACHE) {
 2793                                 bp->b_flags &= ~B_AGE;
 2794                                 bremfree(bp);
 2795                         } else {
 2796                                 BUF_UNLOCK(bp);
 2797                                 bp = NULL;
 2798                         }
 2799                 }
 2800         }
 2801         return (bp);
 2802 }
 2803 
 2804 /*
 2805  * getblk:
 2806  *
 2807  *      Get a block given a specified block and offset into a file/device.
 2808  *      B_INVAL may or may not be set on return.  The caller should clear
 2809  *      B_INVAL prior to initiating a READ.
 2810  *
 2811  *      IT IS IMPORTANT TO UNDERSTAND THAT IF YOU CALL GETBLK() AND B_CACHE
 2812  *      IS NOT SET, YOU MUST INITIALIZE THE RETURNED BUFFER, ISSUE A READ,
 2813  *      OR SET B_INVAL BEFORE RETIRING IT.  If you retire a getblk'd buffer
 2814  *      without doing any of those things the system will likely believe
 2815  *      the buffer to be valid (especially if it is not B_VMIO), and the
 2816  *      next getblk() will return the buffer with B_CACHE set.
 2817  *
 2818  *      For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
 2819  *      an existing buffer.
 2820  *
 2821  *      For a VMIO buffer, B_CACHE is modified according to the backing VM.
 2822  *      If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
 2823  *      and then cleared based on the backing VM.  If the previous buffer is
 2824  *      non-0-sized but invalid, B_CACHE will be cleared.
 2825  *
 2826  *      If getblk() must create a new buffer, the new buffer is returned with
 2827  *      both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
 2828  *      case it is returned with B_INVAL clear and B_CACHE set based on the
 2829  *      backing VM.
 2830  *
 2831  *      getblk() also forces a bwrite() for any B_DELWRI buffer whos
 2832  *      B_CACHE bit is clear.
 2833  *      
 2834  *      What this means, basically, is that the caller should use B_CACHE to
 2835  *      determine whether the buffer is fully valid or not and should clear
 2836  *      B_INVAL prior to issuing a read.  If the caller intends to validate
 2837  *      the buffer by loading its data area with something, the caller needs
 2838  *      to clear B_INVAL.  If the caller does this without issuing an I/O, 
 2839  *      the caller should set B_CACHE ( as an optimization ), else the caller
 2840  *      should issue the I/O and biodone() will set B_CACHE if the I/O was
 2841  *      a write attempt or if it was a successfull read.  If the caller 
 2842  *      intends to issue a READ, the caller must clear B_INVAL and B_ERROR
 2843  *      prior to issuing the READ.  biodone() will *not* clear B_INVAL.
 2844  *
 2845  *      getblk flags:
 2846  *
 2847  *      GETBLK_PCATCH - catch signal if blocked, can cause NULL return
 2848  *      GETBLK_BHEAVY - heavy-weight buffer cache buffer
 2849  */
 2850 struct buf *
 2851 getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo)
 2852 {
 2853         struct buf *bp;
 2854         int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0;
 2855         int error;
 2856         int lkflags;
 2857 
 2858         if (size > MAXBSIZE)
 2859                 panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE);
 2860         if (vp->v_object == NULL)
 2861                 panic("getblk: vnode %p has no object!", vp);
 2862 
 2863 loop:
 2864         if ((bp = findblk(vp, loffset, FINDBLK_REF | FINDBLK_TEST)) != NULL) {
 2865                 /*
 2866                  * The buffer was found in the cache, but we need to lock it.
 2867                  * We must acquire a ref on the bp to prevent reuse, but
 2868                  * this will not prevent disassociation (brelvp()) so we
 2869                  * must recheck (vp,loffset) after acquiring the lock.
 2870                  *
 2871                  * Without the ref the buffer could potentially be reused
 2872                  * before we acquire the lock and create a deadlock
 2873                  * situation between the thread trying to reuse the buffer
 2874                  * and us due to the fact that we would wind up blocking
 2875                  * on a random (vp,loffset).
 2876                  */
 2877                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 2878                         if (blkflags & GETBLK_NOWAIT) {
 2879                                 bqdrop(bp);
 2880                                 return(NULL);
 2881                         }
 2882                         lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL;
 2883                         if (blkflags & GETBLK_PCATCH)
 2884                                 lkflags |= LK_PCATCH;
 2885                         error = BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo);
 2886                         if (error) {
 2887                                 bqdrop(bp);
 2888                                 if (error == ENOLCK)
 2889                                         goto loop;
 2890                                 return (NULL);
 2891                         }
 2892                         /* buffer may have changed on us */
 2893                 }
 2894                 bqdrop(bp);
 2895 
 2896                 /*
 2897                  * Once the buffer has been locked, make sure we didn't race
 2898                  * a buffer recyclement.  Buffers that are no longer hashed
 2899                  * will have b_vp == NULL, so this takes care of that check
 2900                  * as well.
 2901                  */
 2902                 if (bp->b_vp != vp || bp->b_loffset != loffset) {
 2903                         kprintf("Warning buffer %p (vp %p loffset %lld) "
 2904                                 "was recycled\n",
 2905                                 bp, vp, (long long)loffset);
 2906                         BUF_UNLOCK(bp);
 2907                         goto loop;
 2908                 }
 2909 
 2910                 /*
 2911                  * If SZMATCH any pre-existing buffer must be of the requested
 2912                  * size or NULL is returned.  The caller absolutely does not
 2913                  * want getblk() to bwrite() the buffer on a size mismatch.
 2914                  */
 2915                 if ((blkflags & GETBLK_SZMATCH) && size != bp->b_bcount) {
 2916                         BUF_UNLOCK(bp);
 2917                         return(NULL);
 2918                 }
 2919 
 2920                 /*
 2921                  * All vnode-based buffers must be backed by a VM object.
 2922                  */
 2923                 KKASSERT(bp->b_flags & B_VMIO);
 2924                 KKASSERT(bp->b_cmd == BUF_CMD_DONE);
 2925                 bp->b_flags &= ~B_AGE;
 2926 
 2927                 /*
 2928                  * Make sure that B_INVAL buffers do not have a cached
 2929                  * block number translation.
 2930                  */
 2931                 if ((bp->b_flags & B_INVAL) && (bp->b_bio2.bio_offset != NOOFFSET)) {
 2932                         kprintf("Warning invalid buffer %p (vp %p loffset %lld)"
 2933                                 " did not have cleared bio_offset cache\n",
 2934                                 bp, vp, (long long)loffset);
 2935                         clearbiocache(&bp->b_bio2);
 2936                 }
 2937 
 2938                 /*
 2939                  * The buffer is locked.  B_CACHE is cleared if the buffer is 
 2940                  * invalid.
 2941                  */
 2942                 if (bp->b_flags & B_INVAL)
 2943                         bp->b_flags &= ~B_CACHE;
 2944                 bremfree(bp);
 2945 
 2946                 /*
 2947                  * Any size inconsistancy with a dirty buffer or a buffer
 2948                  * with a softupdates dependancy must be resolved.  Resizing
 2949                  * the buffer in such circumstances can lead to problems.
 2950                  *
 2951                  * Dirty or dependant buffers are written synchronously.
 2952                  * Other types of buffers are simply released and
 2953                  * reconstituted as they may be backed by valid, dirty VM
 2954                  * pages (but not marked B_DELWRI).
 2955                  *
 2956                  * NFS NOTE: NFS buffers which straddle EOF are oddly-sized
 2957                  * and may be left over from a prior truncation (and thus
 2958                  * no longer represent the actual EOF point), so we
 2959                  * definitely do not want to B_NOCACHE the backing store.
 2960                  */
 2961                 if (size != bp->b_bcount) {
 2962                         if (bp->b_flags & B_DELWRI) {
 2963                                 bp->b_flags |= B_RELBUF;
 2964                                 bwrite(bp);
 2965                         } else if (LIST_FIRST(&bp->b_dep)) {
 2966                                 bp->b_flags |= B_RELBUF;
 2967                                 bwrite(bp);
 2968                         } else {
 2969                                 bp->b_flags |= B_RELBUF;
 2970                                 brelse(bp);
 2971                         }
 2972                         goto loop;
 2973                 }
 2974                 KKASSERT(size <= bp->b_kvasize);
 2975                 KASSERT(bp->b_loffset != NOOFFSET, 
 2976                         ("getblk: no buffer offset"));
 2977 
 2978                 /*
 2979                  * A buffer with B_DELWRI set and B_CACHE clear must
 2980                  * be committed before we can return the buffer in
 2981                  * order to prevent the caller from issuing a read
 2982                  * ( due to B_CACHE not being set ) and overwriting
 2983                  * it.
 2984                  *
 2985                  * Most callers, including NFS and FFS, need this to
 2986                  * operate properly either because they assume they
 2987                  * can issue a read if B_CACHE is not set, or because
 2988                  * ( for example ) an uncached B_DELWRI might loop due 
 2989                  * to softupdates re-dirtying the buffer.  In the latter
 2990                  * case, B_CACHE is set after the first write completes,
 2991                  * preventing further loops.
 2992                  *
 2993                  * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 2994                  * above while extending the buffer, we cannot allow the
 2995                  * buffer to remain with B_CACHE set after the write
 2996                  * completes or it will represent a corrupt state.  To
 2997                  * deal with this we set B_NOCACHE to scrap the buffer
 2998                  * after the write.
 2999                  *
 3000                  * XXX Should this be B_RELBUF instead of B_NOCACHE?
 3001                  *     I'm not even sure this state is still possible
 3002                  *     now that getblk() writes out any dirty buffers
 3003                  *     on size changes.
 3004                  *
 3005                  * We might be able to do something fancy, like setting
 3006                  * B_CACHE in bwrite() except if B_DELWRI is already set,
 3007                  * so the below call doesn't set B_CACHE, but that gets real
 3008                  * confusing.  This is much easier.
 3009                  */
 3010 
 3011                 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 3012                         kprintf("getblk: Warning, bp %p loff=%jx DELWRI set "
 3013                                 "and CACHE clear, b_flags %08x\n",
 3014                                 bp, (uintmax_t)bp->b_loffset, bp->b_flags);
 3015                         bp->b_flags |= B_NOCACHE;
 3016                         bwrite(bp);
 3017                         goto loop;
 3018                 }
 3019         } else {
 3020                 /*
 3021                  * Buffer is not in-core, create new buffer.  The buffer
 3022                  * returned by getnewbuf() is locked.  Note that the returned
 3023                  * buffer is also considered valid (not marked B_INVAL).
 3024                  *
 3025                  * Calculating the offset for the I/O requires figuring out
 3026                  * the block size.  We use DEV_BSIZE for VBLK or VCHR and
 3027                  * the mount's f_iosize otherwise.  If the vnode does not
 3028                  * have an associated mount we assume that the passed size is 
 3029                  * the block size.  
 3030                  *
 3031                  * Note that vn_isdisk() cannot be used here since it may
 3032                  * return a failure for numerous reasons.   Note that the
 3033                  * buffer size may be larger then the block size (the caller
 3034                  * will use block numbers with the proper multiple).  Beware
 3035                  * of using any v_* fields which are part of unions.  In
 3036                  * particular, in DragonFly the mount point overloading 
 3037                  * mechanism uses the namecache only and the underlying
 3038                  * directory vnode is not a special case.
 3039                  */
 3040                 int bsize, maxsize;
 3041 
 3042                 if (vp->v_type == VBLK || vp->v_type == VCHR)
 3043                         bsize = DEV_BSIZE;
 3044                 else if (vp->v_mount)
 3045                         bsize = vp->v_mount->mnt_stat.f_iosize;
 3046                 else
 3047                         bsize = size;
 3048 
 3049                 maxsize = size + (loffset & PAGE_MASK);
 3050                 maxsize = imax(maxsize, bsize);
 3051 
 3052                 bp = getnewbuf(blkflags, slptimeo, size, maxsize);
 3053                 if (bp == NULL) {
 3054                         if (slpflags || slptimeo)
 3055                                 return NULL;
 3056                         goto loop;
 3057                 }
 3058 
 3059                 /*
 3060                  * Atomically insert the buffer into the hash, so that it can
 3061                  * be found by findblk().
 3062                  *
 3063                  * If bgetvp() returns non-zero a collision occured, and the
 3064                  * bp will not be associated with the vnode.
 3065                  *
 3066                  * Make sure the translation layer has been cleared.
 3067                  */
 3068                 bp->b_loffset = loffset;
 3069                 bp->b_bio2.bio_offset = NOOFFSET;
 3070                 /* bp->b_bio2.bio_next = NULL; */
 3071 
 3072                 if (bgetvp(vp, bp, size)) {
 3073                         bp->b_flags |= B_INVAL;
 3074                         brelse(bp);
 3075                         goto loop;
 3076                 }
 3077 
 3078                 /*
 3079                  * All vnode-based buffers must be backed by a VM object.
 3080                  */
 3081                 KKASSERT(vp->v_object != NULL);
 3082                 bp->b_flags |= B_VMIO;
 3083                 KKASSERT(bp->b_cmd == BUF_CMD_DONE);
 3084 
 3085                 allocbuf(bp, size);
 3086         }
 3087         KKASSERT(dsched_is_clear_buf_priv(bp));
 3088         return (bp);
 3089 }
 3090 
 3091 /*
 3092  * regetblk(bp)
 3093  *
 3094  * Reacquire a buffer that was previously released to the locked queue,
 3095  * or reacquire a buffer which is interlocked by having bioops->io_deallocate
 3096  * set B_LOCKED (which handles the acquisition race).
 3097  *
 3098  * To this end, either B_LOCKED must be set or the dependancy list must be
 3099  * non-empty.
 3100  */
 3101 void
 3102 regetblk(struct buf *bp)
 3103 {
 3104         KKASSERT((bp->b_flags & B_LOCKED) || LIST_FIRST(&bp->b_dep) != NULL);
 3105         BUF_LOCK(bp, LK_EXCLUSIVE | LK_RETRY);
 3106         bremfree(bp);
 3107 }
 3108 
 3109 /*
 3110  * geteblk:
 3111  *
 3112  *      Get an empty, disassociated buffer of given size.  The buffer is
 3113  *      initially set to B_INVAL.
 3114  *
 3115  *      critical section protection is not required for the allocbuf()
 3116  *      call because races are impossible here.
 3117  */
 3118 struct buf *
 3119 geteblk(int size)
 3120 {
 3121         struct buf *bp;
 3122         int maxsize;
 3123 
 3124         maxsize = (size + BKVAMASK) & ~BKVAMASK;
 3125 
 3126         while ((bp = getnewbuf(0, 0, size, maxsize)) == NULL)
 3127                 ;
 3128         allocbuf(bp, size);
 3129         bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
 3130         KKASSERT(dsched_is_clear_buf_priv(bp));
 3131         return (bp);
 3132 }
 3133 
 3134 
 3135 /*
 3136  * allocbuf:
 3137  *
 3138  *      This code constitutes the buffer memory from either anonymous system
 3139  *      memory (in the case of non-VMIO operations) or from an associated
 3140  *      VM object (in the case of VMIO operations).  This code is able to
 3141  *      resize a buffer up or down.
 3142  *
 3143  *      Note that this code is tricky, and has many complications to resolve
 3144  *      deadlock or inconsistant data situations.  Tread lightly!!! 
 3145  *      There are B_CACHE and B_DELWRI interactions that must be dealt with by 
 3146  *      the caller.  Calling this code willy nilly can result in the loss of
 3147  *      data.
 3148  *
 3149  *      allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
 3150  *      B_CACHE for the non-VMIO case.
 3151  *
 3152  *      This routine does not need to be called from a critical section but you
 3153  *      must own the buffer.
 3154  */
 3155 int
 3156 allocbuf(struct buf *bp, int size)
 3157 {
 3158         int newbsize, mbsize;
 3159         int i;
 3160 
 3161         if (BUF_REFCNT(bp) == 0)
 3162                 panic("allocbuf: buffer not busy");
 3163 
 3164         if (bp->b_kvasize < size)
 3165                 panic("allocbuf: buffer too small");
 3166 
 3167         if ((bp->b_flags & B_VMIO) == 0) {
 3168                 caddr_t origbuf;
 3169                 int origbufsize;
 3170                 /*
 3171                  * Just get anonymous memory from the kernel.  Don't
 3172                  * mess with B_CACHE.
 3173                  */
 3174                 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 3175                 if (bp->b_flags & B_MALLOC)
 3176                         newbsize = mbsize;
 3177                 else
 3178                         newbsize = round_page(size);
 3179 
 3180                 if (newbsize < bp->b_bufsize) {
 3181                         /*
 3182                          * Malloced buffers are not shrunk
 3183                          */
 3184                         if (bp->b_flags & B_MALLOC) {
 3185                                 if (newbsize) {
 3186                                         bp->b_bcount = size;
 3187                                 } else {
 3188                                         kfree(bp->b_data, M_BIOBUF);
 3189                                         if (bp->b_bufsize) {
 3190                                                 atomic_subtract_long(&bufmallocspace, bp->b_bufsize);
 3191                                                 bufspacewakeup();
 3192                                                 bp->b_bufsize = 0;
 3193                                         }
 3194                                         bp->b_data = bp->b_kvabase;
 3195                                         bp->b_bcount = 0;
 3196                                         bp->b_flags &= ~B_MALLOC;
 3197                                 }
 3198                                 return 1;
 3199                         }               
 3200                         vm_hold_free_pages(
 3201                             bp,
 3202                             (vm_offset_t) bp->b_data + newbsize,
 3203                             (vm_offset_t) bp->b_data + bp->b_bufsize);
 3204                 } else if (newbsize > bp->b_bufsize) {
 3205                         /*
 3206                          * We only use malloced memory on the first allocation.
 3207                          * and revert to page-allocated memory when the buffer
 3208                          * grows.
 3209                          */
 3210                         if ((bufmallocspace < maxbufmallocspace) &&
 3211                                 (bp->b_bufsize == 0) &&
 3212                                 (mbsize <= PAGE_SIZE/2)) {
 3213 
 3214                                 bp->b_data = kmalloc(mbsize, M_BIOBUF, M_WAITOK);
 3215                                 bp->b_bufsize = mbsize;
 3216                                 bp->b_bcount = size;
 3217                                 bp->b_flags |= B_MALLOC;
 3218                                 atomic_add_long(&bufmallocspace, mbsize);
 3219                                 return 1;
 3220                         }
 3221                         origbuf = NULL;
 3222                         origbufsize = 0;
 3223                         /*
 3224                          * If the buffer is growing on its other-than-first
 3225                          * allocation, then we revert to the page-allocation
 3226                          * scheme.
 3227                          */
 3228                         if (bp->b_flags & B_MALLOC) {
 3229                                 origbuf = bp->b_data;
 3230                                 origbufsize = bp->b_bufsize;
 3231                                 bp->b_data = bp->b_kvabase;
 3232                                 if (bp->b_bufsize) {
 3233                                         atomic_subtract_long(&bufmallocspace,
 3234                                                              bp->b_bufsize);
 3235                                         bufspacewakeup();
 3236                                         bp->b_bufsize = 0;
 3237                                 }
 3238                                 bp->b_flags &= ~B_MALLOC;
 3239                                 newbsize = round_page(newbsize);
 3240                         }
 3241                         vm_hold_load_pages(
 3242                             bp,
 3243                             (vm_offset_t) bp->b_data + bp->b_bufsize,
 3244                             (vm_offset_t) bp->b_data + newbsize);
 3245                         if (origbuf) {
 3246                                 bcopy(origbuf, bp->b_data, origbufsize);
 3247                                 kfree(origbuf, M_BIOBUF);
 3248                         }
 3249                 }
 3250         } else {
 3251                 vm_page_t m;
 3252                 int desiredpages;
 3253 
 3254                 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 3255                 desiredpages = ((int)(bp->b_loffset & PAGE_MASK) +
 3256                                 newbsize + PAGE_MASK) >> PAGE_SHIFT;
 3257                 KKASSERT(desiredpages <= XIO_INTERNAL_PAGES);
 3258 
 3259                 if (bp->b_flags & B_MALLOC)
 3260                         panic("allocbuf: VMIO buffer can't be malloced");
 3261                 /*
 3262                  * Set B_CACHE initially if buffer is 0 length or will become
 3263                  * 0-length.
 3264                  */
 3265                 if (size == 0 || bp->b_bufsize == 0)
 3266                         bp->b_flags |= B_CACHE;
 3267 
 3268                 if (newbsize < bp->b_bufsize) {
 3269                         /*
 3270                          * DEV_BSIZE aligned new buffer size is less then the
 3271                          * DEV_BSIZE aligned existing buffer size.  Figure out
 3272                          * if we have to remove any pages.
 3273                          */
 3274                         if (desiredpages < bp->b_xio.xio_npages) {
 3275                                 for (i = desiredpages; i < bp->b_xio.xio_npages; i++) {
 3276                                         /*
 3277                                          * the page is not freed here -- it
 3278                                          * is the responsibility of 
 3279                                          * vnode_pager_setsize
 3280                                          */
 3281                                         m = bp->b_xio.xio_pages[i];
 3282                                         KASSERT(m != bogus_page,
 3283                                             ("allocbuf: bogus page found"));
 3284                                         vm_page_busy_wait(m, TRUE, "biodep");
 3285                                         bp->b_xio.xio_pages[i] = NULL;
 3286                                         vm_page_unwire(m, 0);
 3287                                         vm_page_wakeup(m);
 3288                                 }
 3289                                 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
 3290                                     (desiredpages << PAGE_SHIFT), (bp->b_xio.xio_npages - desiredpages));
 3291                                 bp->b_xio.xio_npages = desiredpages;
 3292                         }
 3293                 } else if (size > bp->b_bcount) {
 3294                         /*
 3295                          * We are growing the buffer, possibly in a 
 3296                          * byte-granular fashion.
 3297                          */
 3298                         struct vnode *vp;
 3299                         vm_object_t obj;
 3300                         vm_offset_t toff;
 3301                         vm_offset_t tinc;
 3302 
 3303                         /*
 3304                          * Step 1, bring in the VM pages from the object, 
 3305                          * allocating them if necessary.  We must clear
 3306                          * B_CACHE if these pages are not valid for the 
 3307                          * range covered by the buffer.
 3308                          *
 3309                          * critical section protection is required to protect
 3310                          * against interrupts unbusying and freeing pages
 3311                          * between our vm_page_lookup() and our
 3312                          * busycheck/wiring call.
 3313                          */
 3314                         vp = bp->b_vp;
 3315                         obj = vp->v_object;
 3316 
 3317                         vm_object_hold(obj);
 3318                         while (bp->b_xio.xio_npages < desiredpages) {
 3319                                 vm_page_t m;
 3320                                 vm_pindex_t pi;
 3321                                 int error;
 3322 
 3323                                 pi = OFF_TO_IDX(bp->b_loffset) +
 3324                                      bp->b_xio.xio_npages;
 3325 
 3326                                 /*
 3327                                  * Blocking on m->busy might lead to a
 3328                                  * deadlock:
 3329                                  *
 3330                                  *  vm_fault->getpages->cluster_read->allocbuf
 3331                                  */
 3332                                 m = vm_page_lookup_busy_try(obj, pi, FALSE,
 3333                                                             &error);
 3334                                 if (error) {
 3335                                         vm_page_sleep_busy(m, FALSE, "pgtblk");
 3336                                         continue;
 3337                                 }
 3338                                 if (m == NULL) {
 3339                                         /*
 3340                                          * note: must allocate system pages
 3341                                          * since blocking here could intefere
 3342                                          * with paging I/O, no matter which
 3343                                          * process we are.
 3344                                          */
 3345                                         m = bio_page_alloc(bp, obj, pi, desiredpages - bp->b_xio.xio_npages);
 3346                                         if (m) {
 3347                                                 vm_page_wire(m);
 3348                                                 vm_page_flag_clear(m, PG_ZERO);
 3349                                                 vm_page_wakeup(m);
 3350                                                 bp->b_flags &= ~B_CACHE;
 3351                                                 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
 3352                                                 ++bp->b_xio.xio_npages;
 3353                                         }
 3354                                         continue;
 3355                                 }
 3356 
 3357                                 /*
 3358                                  * We found a page and were able to busy it.
 3359                                  */
 3360                                 vm_page_flag_clear(m, PG_ZERO);
 3361                                 vm_page_wire(m);
 3362                                 vm_page_wakeup(m);
 3363                                 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
 3364                                 ++bp->b_xio.xio_npages;
 3365                                 if (bp->b_act_count < m->act_count)
 3366                                         bp->b_act_count = m->act_count;
 3367                         }
 3368                         vm_object_drop(obj);
 3369 
 3370                         /*
 3371                          * Step 2.  We've loaded the pages into the buffer,
 3372                          * we have to figure out if we can still have B_CACHE
 3373                          * set.  Note that B_CACHE is set according to the
 3374                          * byte-granular range ( bcount and size ), not the
 3375                          * aligned range ( newbsize ).
 3376                          *
 3377                          * The VM test is against m->valid, which is DEV_BSIZE
 3378                          * aligned.  Needless to say, the validity of the data
 3379                          * needs to also be DEV_BSIZE aligned.  Note that this
 3380                          * fails with NFS if the server or some other client
 3381                          * extends the file's EOF.  If our buffer is resized, 
 3382                          * B_CACHE may remain set! XXX
 3383                          */
 3384 
 3385                         toff = bp->b_bcount;
 3386                         tinc = PAGE_SIZE - ((bp->b_loffset + toff) & PAGE_MASK);
 3387 
 3388                         while ((bp->b_flags & B_CACHE) && toff < size) {
 3389                                 vm_pindex_t pi;
 3390 
 3391                                 if (tinc > (size - toff))
 3392                                         tinc = size - toff;
 3393 
 3394                                 pi = ((bp->b_loffset & PAGE_MASK) + toff) >> 
 3395                                     PAGE_SHIFT;
 3396 
 3397                                 vfs_buf_test_cache(
 3398                                     bp, 
 3399                                     bp->b_loffset,
 3400                                     toff, 
 3401                                     tinc, 
 3402                                     bp->b_xio.xio_pages[pi]
 3403                                 );
 3404                                 toff += tinc;
 3405                                 tinc = PAGE_SIZE;
 3406                         }
 3407 
 3408                         /*
 3409                          * Step 3, fixup the KVM pmap.  Remember that
 3410                          * bp->b_data is relative to bp->b_loffset, but 
 3411                          * bp->b_loffset may be offset into the first page.
 3412                          */
 3413 
 3414                         bp->b_data = (caddr_t)
 3415                             trunc_page((vm_offset_t)bp->b_data);
 3416                         pmap_qenter(
 3417                             (vm_offset_t)bp->b_data,
 3418                             bp->b_xio.xio_pages, 
 3419                             bp->b_xio.xio_npages
 3420                         );
 3421                         bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 
 3422                             (vm_offset_t)(bp->b_loffset & PAGE_MASK));
 3423                 }
 3424         }
 3425 
 3426         /* adjust space use on already-dirty buffer */
 3427         if (bp->b_flags & B_DELWRI) {
 3428                 /* dirtykvaspace unchanged */
 3429                 atomic_add_long(&dirtybufspace, newbsize - bp->b_bufsize);
 3430                 if (bp->b_flags & B_HEAVY) {
 3431                         atomic_add_long(&dirtybufspacehw,
 3432                                         newbsize - bp->b_bufsize);
 3433                 }
 3434         }
 3435         if (newbsize < bp->b_bufsize)
 3436                 bufspacewakeup();
 3437         bp->b_bufsize = newbsize;       /* actual buffer allocation     */
 3438         bp->b_bcount = size;            /* requested buffer size        */
 3439         return 1;
 3440 }
 3441 
 3442 /*
 3443  * biowait:
 3444  *
 3445  *      Wait for buffer I/O completion, returning error status. B_EINTR
 3446  *      is converted into an EINTR error but not cleared (since a chain
 3447  *      of biowait() calls may occur).
 3448  *
 3449  *      On return bpdone() will have been called but the buffer will remain
 3450  *      locked and will not have been brelse()'d.
 3451  *
 3452  *      NOTE!  If a timeout is specified and ETIMEDOUT occurs the I/O is
 3453  *      likely still in progress on return.
 3454  *
 3455  *      NOTE!  This operation is on a BIO, not a BUF.
 3456  *
 3457  *      NOTE!  BIO_DONE is cleared by vn_strategy()
 3458  */
 3459 static __inline int
 3460 _biowait(struct bio *bio, const char *wmesg, int to)
 3461 {
 3462         struct buf *bp = bio->bio_buf;
 3463         u_int32_t flags;
 3464         u_int32_t nflags;
 3465         int error;
 3466 
 3467         KKASSERT(bio == &bp->b_bio1);
 3468         for (;;) {
 3469                 flags = bio->bio_flags;
 3470                 if (flags & BIO_DONE)
 3471                         break;
 3472                 nflags = flags | BIO_WANT;
 3473                 tsleep_interlock(bio, 0);
 3474                 if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) {
 3475                         if (wmesg)
 3476                                 error = tsleep(bio, PINTERLOCKED, wmesg, to);
 3477                         else if (bp->b_cmd == BUF_CMD_READ)
 3478                                 error = tsleep(bio, PINTERLOCKED, "biord", to);
 3479                         else
 3480                                 error = tsleep(bio, PINTERLOCKED, "biowr", to);
 3481                         if (error) {
 3482                                 kprintf("tsleep error biowait %d\n", error);
 3483                                 return (error);
 3484                         }
 3485                 }
 3486         }
 3487 
 3488         /*
 3489          * Finish up.
 3490          */
 3491         KKASSERT(bp->b_cmd == BUF_CMD_DONE);
 3492         bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
 3493         if (bp->b_flags & B_EINTR)
 3494                 return (EINTR);
 3495         if (bp->b_flags & B_ERROR)
 3496                 return (bp->b_error ? bp->b_error : EIO);
 3497         return (0);
 3498 }
 3499 
 3500 int
 3501 biowait(struct bio *bio, const char *wmesg)
 3502 {
 3503         return(_biowait(bio, wmesg, 0));
 3504 }
 3505 
 3506 int
 3507 biowait_timeout(struct bio *bio, const char *wmesg, int to)
 3508 {
 3509         return(_biowait(bio, wmesg, to));
 3510 }
 3511 
 3512 /*
 3513  * This associates a tracking count with an I/O.  vn_strategy() and
 3514  * dev_dstrategy() do this automatically but there are a few cases
 3515  * where a vnode or device layer is bypassed when a block translation
 3516  * is cached.  In such cases bio_start_transaction() may be called on
 3517  * the bypassed layers so the system gets an I/O in progress indication 
 3518  * for those higher layers.
 3519  */
 3520 void
 3521 bio_start_transaction(struct bio *bio, struct bio_track *track)
 3522 {
 3523         bio->bio_track = track;
 3524         if (dsched_is_clear_buf_priv(bio->bio_buf))
 3525                 dsched_new_buf(bio->bio_buf);
 3526         bio_track_ref(track);
 3527 }
 3528 
 3529 /*
 3530  * Initiate I/O on a vnode.
 3531  *
 3532  * SWAPCACHE OPERATION:
 3533  *
 3534  *      Real buffer cache buffers have a non-NULL bp->b_vp.  Unfortunately
 3535  *      devfs also uses b_vp for fake buffers so we also have to check
 3536  *      that B_PAGING is 0.  In this case the passed 'vp' is probably the
 3537  *      underlying block device.  The swap assignments are related to the
 3538  *      buffer cache buffer's b_vp, not the passed vp.
 3539  *
 3540  *      The passed vp == bp->b_vp only in the case where the strategy call
 3541  *      is made on the vp itself for its own buffers (a regular file or
 3542  *      block device vp).  The filesystem usually then re-calls vn_strategy()
 3543  *      after translating the request to an underlying device.
 3544  *
 3545  *      Cluster buffers set B_CLUSTER and the passed vp is the vp of the
 3546  *      underlying buffer cache buffers.
 3547  *
 3548  *      We can only deal with page-aligned buffers at the moment, because
 3549  *      we can't tell what the real dirty state for pages straddling a buffer
 3550  *      are.
 3551  *
 3552  *      In order to call swap_pager_strategy() we must provide the VM object
 3553  *      and base offset for the underlying buffer cache pages so it can find
 3554  *      the swap blocks.
 3555  */
 3556 void
 3557 vn_strategy(struct vnode *vp, struct bio *bio)
 3558 {
 3559         struct bio_track *track;
 3560         struct buf *bp = bio->bio_buf;
 3561 
 3562         KKASSERT(bp->b_cmd != BUF_CMD_DONE);
 3563 
 3564         /*
 3565          * Set when an I/O is issued on the bp.  Cleared by consumers
 3566          * (aka HAMMER), allowing the consumer to determine if I/O had
 3567          * actually occurred.
 3568          */
 3569         bp->b_flags |= B_IODEBUG;
 3570 
 3571         /*
 3572          * Handle the swap cache intercept.
 3573          */
 3574         if (vn_cache_strategy(vp, bio))
 3575                 return;
 3576 
 3577         /*
 3578          * Otherwise do the operation through the filesystem
 3579          */
 3580         if (bp->b_cmd == BUF_CMD_READ)
 3581                 track = &vp->v_track_read;
 3582         else
 3583                 track = &vp->v_track_write;
 3584         KKASSERT((bio->bio_flags & BIO_DONE) == 0);
 3585         bio->bio_track = track;
 3586         if (dsched_is_clear_buf_priv(bio->bio_buf))
 3587                 dsched_new_buf(bio->bio_buf);
 3588         bio_track_ref(track);
 3589         vop_strategy(*vp->v_ops, vp, bio);
 3590 }
 3591 
 3592 static void vn_cache_strategy_callback(struct bio *bio);
 3593 
 3594 int
 3595 vn_cache_strategy(struct vnode *vp, struct bio *bio)
 3596 {
 3597         struct buf *bp = bio->bio_buf;
 3598         struct bio *nbio;
 3599         vm_object_t object;
 3600         vm_page_t m;
 3601         int i;
 3602 
 3603         /*
 3604          * Is this buffer cache buffer suitable for reading from
 3605          * the swap cache?
 3606          */
 3607         if (vm_swapcache_read_enable == 0 ||
 3608             bp->b_cmd != BUF_CMD_READ ||
 3609             ((bp->b_flags & B_CLUSTER) == 0 &&
 3610              (bp->b_vp == NULL || (bp->b_flags & B_PAGING))) ||
 3611             ((int)bp->b_loffset & PAGE_MASK) != 0 ||
 3612             (bp->b_bcount & PAGE_MASK) != 0) {
 3613                 return(0);
 3614         }
 3615 
 3616         /*
 3617          * Figure out the original VM object (it will match the underlying
 3618          * VM pages).  Note that swap cached data uses page indices relative
 3619          * to that object, not relative to bio->bio_offset.
 3620          */
 3621         if (bp->b_flags & B_CLUSTER)
 3622                 object = vp->v_object;
 3623         else
 3624                 object = bp->b_vp->v_object;
 3625 
 3626         /*
 3627          * In order to be able to use the swap cache all underlying VM
 3628          * pages must be marked as such, and we can't have any bogus pages.
 3629          */
 3630         for (i = 0; i < bp->b_xio.xio_npages; ++i) {
 3631                 m = bp->b_xio.xio_pages[i];
 3632                 if ((m->flags & PG_SWAPPED) == 0)
 3633                         break;
 3634                 if (m == bogus_page)
 3635                         break;
 3636         }
 3637 
 3638         /*
 3639          * If we are good then issue the I/O using swap_pager_strategy().
 3640          *
 3641          * We can only do this if the buffer actually supports object-backed
 3642          * I/O.  If it doesn't npages will be 0.
 3643          */
 3644         if (i && i == bp->b_xio.xio_npages) {
 3645                 m = bp->b_xio.xio_pages[0];
 3646                 nbio = push_bio(bio);
 3647                 nbio->bio_done = vn_cache_strategy_callback;
 3648                 nbio->bio_offset = ptoa(m->pindex);
 3649                 KKASSERT(m->object == object);
 3650                 swap_pager_strategy(object, nbio);
 3651                 return(1);
 3652         }
 3653         return(0);
 3654 }
 3655 
 3656 /*
 3657  * This is a bit of a hack but since the vn_cache_strategy() function can
 3658  * override a VFS's strategy function we must make sure that the bio, which
 3659  * is probably bio2, doesn't leak an unexpected offset value back to the
 3660  * filesystem.  The filesystem (e.g. UFS) might otherwise assume that the
 3661  * bio went through its own file strategy function and the the bio2 offset
 3662  * is a cached disk offset when, in fact, it isn't.
 3663  */
 3664 static void
 3665 vn_cache_strategy_callback(struct bio *bio)
 3666 {
 3667         bio->bio_offset = NOOFFSET;
 3668         biodone(pop_bio(bio));
 3669 }
 3670 
 3671 /*
 3672  * bpdone:
 3673  *
 3674  *      Finish I/O on a buffer after all BIOs have been processed.
 3675  *      Called when the bio chain is exhausted or by biowait.  If called
 3676  *      by biowait, elseit is typically 0.
 3677  *
 3678  *      bpdone is also responsible for setting B_CACHE in a B_VMIO bp.
 3679  *      In a non-VMIO bp, B_CACHE will be set on the next getblk() 
 3680  *      assuming B_INVAL is clear.
 3681  *
 3682  *      For the VMIO case, we set B_CACHE if the op was a read and no
 3683  *      read error occured, or if the op was a write.  B_CACHE is never
 3684  *      set if the buffer is invalid or otherwise uncacheable.
 3685  *
 3686  *      bpdone does not mess with B_INVAL, allowing the I/O routine or the
 3687  *      initiator to leave B_INVAL set to brelse the buffer out of existance
 3688  *      in the biodone routine.
 3689  */
 3690 void
 3691 bpdone(struct buf *bp, int elseit)
 3692 {
 3693         buf_cmd_t cmd;
 3694 
 3695         KASSERT(BUF_REFCNTNB(bp) > 0, 
 3696                 ("biodone: bp %p not busy %d", bp, BUF_REFCNTNB(bp)));
 3697         KASSERT(bp->b_cmd != BUF_CMD_DONE, 
 3698                 ("biodone: bp %p already done!", bp));
 3699 
 3700         /*
 3701          * No more BIOs are left.  All completion functions have been dealt
 3702          * with, now we clean up the buffer.
 3703          */
 3704         cmd = bp->b_cmd;
 3705         bp->b_cmd = BUF_CMD_DONE;
 3706 
 3707         /*
 3708          * Only reads and writes are processed past this point.
 3709          */
 3710         if (cmd != BUF_CMD_READ && cmd != BUF_CMD_WRITE) {
 3711                 if (cmd == BUF_CMD_FREEBLKS)
 3712                         bp->b_flags |= B_NOCACHE;
 3713                 if (elseit)
 3714                         brelse(bp);
 3715                 return;
 3716         }
 3717 
 3718         /*
 3719          * Warning: softupdates may re-dirty the buffer, and HAMMER can do
 3720          * a lot worse.  XXX - move this above the clearing of b_cmd
 3721          */
 3722         if (LIST_FIRST(&bp->b_dep) != NULL)
 3723                 buf_complete(bp);
 3724 
 3725         /*
 3726          * A failed write must re-dirty the buffer unless B_INVAL
 3727          * was set.  Only applicable to normal buffers (with VPs).
 3728          * vinum buffers may not have a vp.
 3729          */
 3730         if (cmd == BUF_CMD_WRITE &&
 3731             (bp->b_flags & (B_ERROR | B_INVAL)) == B_ERROR) {
 3732                 bp->b_flags &= ~B_NOCACHE;
 3733                 if (bp->b_vp)
 3734                         bdirty(bp);
 3735         }
 3736 
 3737         if (bp->b_flags & B_VMIO) {
 3738                 int i;
 3739                 vm_ooffset_t foff;
 3740                 vm_page_t m;
 3741                 vm_object_t obj;
 3742                 int iosize;
 3743                 struct vnode *vp = bp->b_vp;
 3744 
 3745                 obj = vp->v_object;
 3746 
 3747 #if defined(VFS_BIO_DEBUG)
 3748                 if (vp->v_auxrefs == 0)
 3749                         panic("biodone: zero vnode hold count");
 3750                 if ((vp->v_flag & VOBJBUF) == 0)
 3751                         panic("biodone: vnode is not setup for merged cache");
 3752 #endif
 3753 
 3754                 foff = bp->b_loffset;
 3755                 KASSERT(foff != NOOFFSET, ("biodone: no buffer offset"));
 3756                 KASSERT(obj != NULL, ("biodone: missing VM object"));
 3757 
 3758 #if defined(VFS_BIO_DEBUG)
 3759                 if (obj->paging_in_progress < bp->b_xio.xio_npages) {
 3760                         kprintf("biodone: paging in progress(%d) < "
 3761                                 "bp->b_xio.xio_npages(%d)\n",
 3762                                 obj->paging_in_progress,
 3763                                 bp->b_xio.xio_npages);
 3764                 }
 3765 #endif
 3766 
 3767                 /*
 3768                  * Set B_CACHE if the op was a normal read and no error
 3769                  * occured.  B_CACHE is set for writes in the b*write()
 3770                  * routines.
 3771                  */
 3772                 iosize = bp->b_bcount - bp->b_resid;
 3773                 if (cmd == BUF_CMD_READ &&
 3774                     (bp->b_flags & (B_INVAL|B_NOCACHE|B_ERROR)) == 0) {
 3775                         bp->b_flags |= B_CACHE;
 3776                 }
 3777 
 3778                 vm_object_hold(obj);
 3779                 for (i = 0; i < bp->b_xio.xio_npages; i++) {
 3780                         int bogusflag = 0;
 3781                         int resid;
 3782 
 3783                         resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 3784                         if (resid > iosize)
 3785                                 resid = iosize;
 3786 
 3787                         /*
 3788                          * cleanup bogus pages, restoring the originals.  Since
 3789                          * the originals should still be wired, we don't have
 3790                          * to worry about interrupt/freeing races destroying
 3791                          * the VM object association.
 3792                          */
 3793                         m = bp->b_xio.xio_pages[i];
 3794                         if (m == bogus_page) {
 3795                                 bogusflag = 1;
 3796                                 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 3797                                 if (m == NULL)
 3798                                         panic("biodone: page disappeared");
 3799                                 bp->b_xio.xio_pages[i] = m;
 3800                                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 3801                                         bp->b_xio.xio_pages, bp->b_xio.xio_npages);
 3802                         }
 3803 #if defined(VFS_BIO_DEBUG)
 3804                         if (OFF_TO_IDX(foff) != m->pindex) {
 3805                                 kprintf("biodone: foff(%lu)/m->pindex(%ld) "
 3806                                         "mismatch\n",
 3807                                         (unsigned long)foff, (long)m->pindex);
 3808                         }
 3809 #endif
 3810 
 3811                         /*
 3812                          * In the write case, the valid and clean bits are
 3813                          * already changed correctly (see bdwrite()), so we
 3814                          * only need to do this here in the read case.
 3815                          */
 3816                         vm_page_busy_wait(m, FALSE, "bpdpgw");
 3817                         if (cmd == BUF_CMD_READ && !bogusflag && resid > 0) {
 3818                                 vfs_clean_one_page(bp, i, m);
 3819                         }
 3820                         vm_page_flag_clear(m, PG_ZERO);
 3821 
 3822                         /*
 3823                          * when debugging new filesystems or buffer I/O
 3824                          * methods, this is the most common error that pops
 3825                          * up.  if you see this, you have not set the page
 3826                          * busy flag correctly!!!
 3827                          */
 3828                         if (m->busy == 0) {
 3829                                 kprintf("biodone: page busy < 0, "
 3830                                     "pindex: %d, foff: 0x(%x,%x), "
 3831                                     "resid: %d, index: %d\n",
 3832                                     (int) m->pindex, (int)(foff >> 32),
 3833                                                 (int) foff & 0xffffffff, resid, i);
 3834                                 if (!vn_isdisk(vp, NULL))
 3835                                         kprintf(" iosize: %ld, loffset: %lld, "
 3836                                                 "flags: 0x%08x, npages: %d\n",
 3837                                             bp->b_vp->v_mount->mnt_stat.f_iosize,
 3838                                             (long long)bp->b_loffset,
 3839                                             bp->b_flags, bp->b_xio.xio_npages);
 3840                                 else
 3841                                         kprintf(" VDEV, loffset: %lld, flags: 0x%08x, npages: %d\n",
 3842                                             (long long)bp->b_loffset,
 3843                                             bp->b_flags, bp->b_xio.xio_npages);
 3844                                 kprintf(" valid: 0x%x, dirty: 0x%x, "
 3845                                         "wired: %d\n",
 3846                                         m->valid, m->dirty,
 3847                                         m->wire_count);
 3848                                 panic("biodone: page busy < 0");
 3849                         }
 3850                         vm_page_io_finish(m);
 3851                         vm_page_wakeup(m);
 3852                         vm_object_pip_wakeup(obj);
 3853                         foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 3854                         iosize -= resid;
 3855                 }
 3856                 bp->b_flags &= ~B_HASBOGUS;
 3857                 vm_object_drop(obj);
 3858         }
 3859 
 3860         /*
 3861          * Finish up by releasing the buffer.  There are no more synchronous
 3862          * or asynchronous completions, those were handled by bio_done
 3863          * callbacks.
 3864          */
 3865         if (elseit) {
 3866                 if (bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR|B_RELBUF))
 3867                         brelse(bp);
 3868                 else
 3869                         bqrelse(bp);
 3870         }
 3871 }
 3872 
 3873 /*
 3874  * Normal biodone.
 3875  */
 3876 void
 3877 biodone(struct bio *bio)
 3878 {
 3879         struct buf *bp = bio->bio_buf;
 3880 
 3881         runningbufwakeup(bp);
 3882 
 3883         /*
 3884          * Run up the chain of BIO's.   Leave b_cmd intact for the duration.
 3885          */
 3886         while (bio) {
 3887                 biodone_t *done_func;
 3888                 struct bio_track *track;
 3889 
 3890                 /*
 3891                  * BIO tracking.  Most but not all BIOs are tracked.
 3892                  */
 3893                 if ((track = bio->bio_track) != NULL) {
 3894                         bio_track_rel(track);
 3895                         bio->bio_track = NULL;
 3896                 }
 3897 
 3898                 /*
 3899                  * A bio_done function terminates the loop.  The function
 3900                  * will be responsible for any further chaining and/or
 3901                  * buffer management.
 3902                  *
 3903                  * WARNING!  The done function can deallocate the buffer!
 3904                  */
 3905                 if ((done_func = bio->bio_done) != NULL) {
 3906                         bio->bio_done = NULL;
 3907                         done_func(bio);
 3908                         return;
 3909                 }
 3910                 bio = bio->bio_prev;
 3911         }
 3912 
 3913         /*
 3914          * If we've run out of bio's do normal [a]synchronous completion.
 3915          */
 3916         bpdone(bp, 1);
 3917 }
 3918 
 3919 /*
 3920  * Synchronous biodone - this terminates a synchronous BIO.
 3921  *
 3922  * bpdone() is called with elseit=FALSE, leaving the buffer completed
 3923  * but still locked.  The caller must brelse() the buffer after waiting
 3924  * for completion.
 3925  */
 3926 void
 3927 biodone_sync(struct bio *bio)
 3928 {
 3929         struct buf *bp = bio->bio_buf;
 3930         int flags;
 3931         int nflags;
 3932 
 3933         KKASSERT(bio == &bp->b_bio1);
 3934         bpdone(bp, 0);
 3935 
 3936         for (;;) {
 3937                 flags = bio->bio_flags;
 3938                 nflags = (flags | BIO_DONE) & ~BIO_WANT;
 3939 
 3940                 if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) {
 3941                         if (flags & BIO_WANT)
 3942                                 wakeup(bio);
 3943                         break;
 3944                 }
 3945         }
 3946 }
 3947 
 3948 /*
 3949  * vfs_unbusy_pages:
 3950  *
 3951  *      This routine is called in lieu of iodone in the case of
 3952  *      incomplete I/O.  This keeps the busy status for pages
 3953  *      consistant.
 3954  */
 3955 void
 3956 vfs_unbusy_pages(struct buf *bp)
 3957 {
 3958         int i;
 3959 
 3960         runningbufwakeup(bp);
 3961 
 3962         if (bp->b_flags & B_VMIO) {
 3963                 struct vnode *vp = bp->b_vp;
 3964                 vm_object_t obj;
 3965 
 3966                 obj = vp->v_object;
 3967                 vm_object_hold(obj);
 3968 
 3969                 for (i = 0; i < bp->b_xio.xio_npages; i++) {
 3970                         vm_page_t m = bp->b_xio.xio_pages[i];
 3971 
 3972                         /*
 3973                          * When restoring bogus changes the original pages
 3974                          * should still be wired, so we are in no danger of
 3975                          * losing the object association and do not need
 3976                          * critical section protection particularly.
 3977                          */
 3978                         if (m == bogus_page) {
 3979                                 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_loffset) + i);
 3980                                 if (!m) {
 3981                                         panic("vfs_unbusy_pages: page missing");
 3982                                 }
 3983                                 bp->b_xio.xio_pages[i] = m;
 3984                                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 3985                                         bp->b_xio.xio_pages, bp->b_xio.xio_npages);
 3986                         }
 3987                         vm_page_busy_wait(m, FALSE, "bpdpgw");
 3988                         vm_page_flag_clear(m, PG_ZERO);
 3989                         vm_page_io_finish(m);
 3990                         vm_page_wakeup(m);
 3991                         vm_object_pip_wakeup(obj);
 3992                 }
 3993                 bp->b_flags &= ~B_HASBOGUS;
 3994                 vm_object_drop(obj);
 3995         }
 3996 }
 3997 
 3998 /*
 3999  * vfs_busy_pages:
 4000  *
 4001  *      This routine is called before a device strategy routine.
 4002  *      It is used to tell the VM system that paging I/O is in
 4003  *      progress, and treat the pages associated with the buffer
 4004  *      almost as being PG_BUSY.  Also the object 'paging_in_progress'
 4005  *      flag is handled to make sure that the object doesn't become
 4006  *      inconsistant.
 4007  *
 4008  *      Since I/O has not been initiated yet, certain buffer flags
 4009  *      such as B_ERROR or B_INVAL may be in an inconsistant state
 4010  *      and should be ignored.
 4011  */
 4012 void
 4013 vfs_busy_pages(struct vnode *vp, struct buf *bp)
 4014 {
 4015         int i, bogus;
 4016         struct lwp *lp = curthread->td_lwp;
 4017 
 4018         /*
 4019          * The buffer's I/O command must already be set.  If reading,
 4020          * B_CACHE must be 0 (double check against callers only doing
 4021          * I/O when B_CACHE is 0).
 4022          */
 4023         KKASSERT(bp->b_cmd != BUF_CMD_DONE);
 4024         KKASSERT(bp->b_cmd == BUF_CMD_WRITE || (bp->b_flags & B_CACHE) == 0);
 4025 
 4026         if (bp->b_flags & B_VMIO) {
 4027                 vm_object_t obj;
 4028 
 4029                 obj = vp->v_object;
 4030                 KASSERT(bp->b_loffset != NOOFFSET,
 4031                         ("vfs_busy_pages: no buffer offset"));
 4032 
 4033                 /*
 4034                  * Busy all the pages.  We have to busy them all at once
 4035                  * to avoid deadlocks.
 4036                  */
 4037 retry:
 4038                 for (i = 0; i < bp->b_xio.xio_npages; i++) {
 4039                         vm_page_t m = bp->b_xio.xio_pages[i];
 4040 
 4041                         if (vm_page_busy_try(m, FALSE)) {
 4042                                 vm_page_sleep_busy(m, FALSE, "vbpage");
 4043                                 while (--i >= 0)
 4044                                         vm_page_wakeup(bp->b_xio.xio_pages[i]);
 4045                                 goto retry;
 4046                         }
 4047                 }
 4048 
 4049                 /*
 4050                  * Setup for I/O, soft-busy the page right now because
 4051                  * the next loop may block.
 4052                  */
 4053                 for (i = 0; i < bp->b_xio.xio_npages; i++) {
 4054                         vm_page_t m = bp->b_xio.xio_pages[i];
 4055 
 4056                         vm_page_flag_clear(m, PG_ZERO);
 4057                         if ((bp->b_flags & B_CLUSTER) == 0) {
 4058                                 vm_object_pip_add(obj, 1);
 4059                                 vm_page_io_start(m);
 4060                         }
 4061                 }
 4062 
 4063                 /*
 4064                  * Adjust protections for I/O and do bogus-page mapping.
 4065                  * Assume that vm_page_protect() can block (it can block
 4066                  * if VM_PROT_NONE, don't take any chances regardless).
 4067                  *
 4068                  * In particular note that for writes we must incorporate
 4069                  * page dirtyness from the VM system into the buffer's
 4070                  * dirty range.
 4071                  *
 4072                  * For reads we theoretically must incorporate page dirtyness
 4073                  * from the VM system to determine if the page needs bogus
 4074                  * replacement, but we shortcut the test by simply checking
 4075                  * that all m->valid bits are set, indicating that the page
 4076                  * is fully valid and does not need to be re-read.  For any
 4077                  * VM system dirtyness the page will also be fully valid
 4078                  * since it was mapped at one point.
 4079                  */
 4080                 bogus = 0;
 4081                 for (i = 0; i < bp->b_xio.xio_npages; i++) {
 4082                         vm_page_t m = bp->b_xio.xio_pages[i];
 4083 
 4084                         vm_page_flag_clear(m, PG_ZERO); /* XXX */
 4085                         if (bp->b_cmd == BUF_CMD_WRITE) {
 4086                                 /*
 4087                                  * When readying a vnode-backed buffer for
 4088                                  * a write we must zero-fill any invalid
 4089                                  * portions of the backing VM pages, mark
 4090                                  * it valid and clear related dirty bits.
 4091                                  *
 4092                                  * vfs_clean_one_page() incorporates any
 4093                                  * VM dirtyness and updates the b_dirtyoff
 4094                                  * range (after we've made the page RO).
 4095                                  *
 4096                                  * It is also expected that the pmap modified
 4097                                  * bit has already been cleared by the
 4098                                  * vm_page_protect().  We may not be able
 4099                                  * to clear all dirty bits for a page if it
 4100                                  * was also memory mapped (NFS).
 4101                                  *
 4102                                  * Finally be sure to unassign any swap-cache
 4103                                  * backing store as it is now stale.
 4104                                  */
 4105                                 vm_page_protect(m, VM_PROT_READ);
 4106                                 vfs_clean_one_page(bp, i, m);
 4107                                 swap_pager_unswapped(m);
 4108                         } else if (m->valid == VM_PAGE_BITS_ALL) {
 4109                                 /*
 4110                                  * When readying a vnode-backed buffer for
 4111                                  * read we must replace any dirty pages with
 4112                                  * a bogus page so dirty data is not destroyed
 4113                                  * when filling gaps.
 4114                                  *
 4115                                  * To avoid testing whether the page is
 4116                                  * dirty we instead test that the page was
 4117                                  * at some point mapped (m->valid fully
 4118                                  * valid) with the understanding that
 4119                                  * this also covers the dirty case.
 4120                                  */
 4121                                 bp->b_xio.xio_pages[i] = bogus_page;
 4122                                 bp->b_flags |= B_HASBOGUS;
 4123                                 bogus++;
 4124                         } else if (m->valid & m->dirty) {
 4125                                 /*
 4126                                  * This case should not occur as partial
 4127                                  * dirtyment can only happen if the buffer
 4128                                  * is B_CACHE, and this code is not entered
 4129                                  * if the buffer is B_CACHE.
 4130                                  */
 4131                                 kprintf("Warning: vfs_busy_pages - page not "
 4132                                         "fully valid! loff=%jx bpf=%08x "
 4133                                         "idx=%d val=%02x dir=%02x\n",
 4134                                         (uintmax_t)bp->b_loffset, bp->b_flags,
 4135                                         i, m->valid, m->dirty);
 4136                                 vm_page_protect(m, VM_PROT_NONE);
 4137                         } else {
 4138                                 /*
 4139                                  * The page is not valid and can be made
 4140                                  * part of the read.
 4141                                  */
 4142                                 vm_page_protect(m, VM_PROT_NONE);
 4143                         }
 4144                         vm_page_wakeup(m);
 4145                 }
 4146                 if (bogus) {
 4147                         pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 4148                                 bp->b_xio.xio_pages, bp->b_xio.xio_npages);
 4149                 }
 4150         }
 4151 
 4152         /*
 4153          * This is the easiest place to put the process accounting for the I/O
 4154          * for now.
 4155          */
 4156         if (lp != NULL) {
 4157                 if (bp->b_cmd == BUF_CMD_READ)
 4158                         lp->lwp_ru.ru_inblock++;
 4159                 else
 4160                         lp->lwp_ru.ru_oublock++;
 4161         }
 4162 }
 4163 
 4164 /*
 4165  * Tell the VM system that the pages associated with this buffer
 4166  * are clean.  This is used for delayed writes where the data is
 4167  * going to go to disk eventually without additional VM intevention.
 4168  *
 4169  * NOTE: While we only really need to clean through to b_bcount, we
 4170  *       just go ahead and clean through to b_bufsize.
 4171  */
 4172 static void
 4173 vfs_clean_pages(struct buf *bp)
 4174 {
 4175         vm_page_t m;
 4176         int i;
 4177 
 4178         if ((bp->b_flags & B_VMIO) == 0)
 4179                 return;
 4180 
 4181         KASSERT(bp->b_loffset != NOOFFSET,
 4182                 ("vfs_clean_pages: no buffer offset"));
 4183 
 4184         for (i = 0; i < bp->b_xio.xio_npages; i++) {
 4185                 m = bp->b_xio.xio_pages[i];
 4186                 vfs_clean_one_page(bp, i, m);
 4187         }
 4188 }
 4189 
 4190 /*
 4191  * vfs_clean_one_page:
 4192  *
 4193  *      Set the valid bits and clear the dirty bits in a page within a
 4194  *      buffer.  The range is restricted to the buffer's size and the
 4195  *      buffer's logical offset might index into the first page.
 4196  *
 4197  *      The caller has busied or soft-busied the page and it is not mapped,
 4198  *      test and incorporate the dirty bits into b_dirtyoff/end before
 4199  *      clearing them.  Note that we need to clear the pmap modified bits
 4200  *      after determining the the page was dirty, vm_page_set_validclean()
 4201  *      does not do it for us.
 4202  *
 4203  *      This routine is typically called after a read completes (dirty should
 4204  *      be zero in that case as we are not called on bogus-replace pages),
 4205  *      or before a write is initiated.
 4206  */
 4207 static void
 4208 vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m)
 4209 {
 4210         int bcount;
 4211         int xoff;
 4212         int soff;
 4213         int eoff;
 4214 
 4215         /*
 4216          * Calculate offset range within the page but relative to buffer's
 4217          * loffset.  loffset might be offset into the first page.
 4218          */
 4219         xoff = (int)bp->b_loffset & PAGE_MASK;  /* loffset offset into pg 0 */
 4220         bcount = bp->b_bcount + xoff;           /* offset adjusted */
 4221 
 4222         if (pageno == 0) {
 4223                 soff = xoff;
 4224                 eoff = PAGE_SIZE;
 4225         } else {
 4226                 soff = (pageno << PAGE_SHIFT);
 4227                 eoff = soff + PAGE_SIZE;
 4228         }
 4229         if (eoff > bcount)
 4230                 eoff = bcount;
 4231         if (soff >= eoff)
 4232                 return;
 4233 
 4234         /*
 4235          * Test dirty bits and adjust b_dirtyoff/end.
 4236          *
 4237          * If dirty pages are incorporated into the bp any prior
 4238          * B_NEEDCOMMIT state (NFS) must be cleared because the
 4239          * caller has not taken into account the new dirty data.
 4240          *
 4241          * If the page was memory mapped the dirty bits might go beyond the
 4242          * end of the buffer, but we can't really make the assumption that
 4243          * a file EOF straddles the buffer (even though this is the case for
 4244          * NFS if B_NEEDCOMMIT is also set).  So for the purposes of clearing
 4245          * B_NEEDCOMMIT we only test the dirty bits covered by the buffer.
 4246          * This also saves some console spam.
 4247          *
 4248          * When clearing B_NEEDCOMMIT we must also clear B_CLUSTEROK,
 4249          * NFS can handle huge commits but not huge writes.
 4250          */
 4251         vm_page_test_dirty(m);
 4252         if (m->dirty) {
 4253                 if ((bp->b_flags & B_NEEDCOMMIT) &&
 4254                     (m->dirty & vm_page_bits(soff & PAGE_MASK, eoff - soff))) {
 4255                         if (debug_commit)
 4256                                 kprintf("Warning: vfs_clean_one_page: bp %p "
 4257                                     "loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT"
 4258                                     " cmd %d vd %02x/%02x x/s/e %d %d %d "
 4259                                     "doff/end %d %d\n",
 4260                                     bp, (uintmax_t)bp->b_loffset, bp->b_bcount,
 4261                                     bp->b_flags, bp->b_cmd,
 4262                                     m->valid, m->dirty, xoff, soff, eoff,
 4263                                     bp->b_dirtyoff, bp->b_dirtyend);
 4264                         bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 4265                         if (debug_commit)
 4266                                 print_backtrace(-1);
 4267                 }
 4268                 /*
 4269                  * Only clear the pmap modified bits if ALL the dirty bits
 4270                  * are set, otherwise the system might mis-clear portions
 4271                  * of a page.
 4272                  */
 4273                 if (m->dirty == VM_PAGE_BITS_ALL &&
 4274                     (bp->b_flags & B_NEEDCOMMIT) == 0) {
 4275                         pmap_clear_modify(m);
 4276                 }
 4277                 if (bp->b_dirtyoff > soff - xoff)
 4278                         bp->b_dirtyoff = soff - xoff;
 4279                 if (bp->b_dirtyend < eoff - xoff)
 4280                         bp->b_dirtyend = eoff - xoff;
 4281         }
 4282 
 4283         /*
 4284          * Set related valid bits, clear related dirty bits.
 4285          * Does not mess with the pmap modified bit.
 4286          *
 4287          * WARNING!  We cannot just clear all of m->dirty here as the
 4288          *           buffer cache buffers may use a DEV_BSIZE'd aligned
 4289          *           block size, or have an odd size (e.g. NFS at file EOF).
 4290          *           The putpages code can clear m->dirty to 0.
 4291          *
 4292          *           If a VOP_WRITE generates a buffer cache buffer which
 4293          *           covers the same space as mapped writable pages the
 4294          *           buffer flush might not be able to clear all the dirty
 4295          *           bits and still require a putpages from the VM system
 4296          *           to finish it off.
 4297          *
 4298          * WARNING!  vm_page_set_validclean() currently assumes vm_token
 4299          *           is held.  The page might not be busied (bdwrite() case).
 4300          *           XXX remove this comment once we've validated that this
 4301          *           is no longer an issue.
 4302          */
 4303         vm_page_set_validclean(m, soff & PAGE_MASK, eoff - soff);
 4304 }
 4305 
 4306 #if 0
 4307 /*
 4308  * Similar to vfs_clean_one_page() but sets the bits to valid and dirty.
 4309  * The page data is assumed to be valid (there is no zeroing here).
 4310  */
 4311 static void
 4312 vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m)
 4313 {
 4314         int bcount;
 4315         int xoff;
 4316         int soff;
 4317         int eoff;
 4318 
 4319         /*
 4320          * Calculate offset range within the page but relative to buffer's
 4321          * loffset.  loffset might be offset into the first page.
 4322          */
 4323         xoff = (int)bp->b_loffset & PAGE_MASK;  /* loffset offset into pg 0 */
 4324         bcount = bp->b_bcount + xoff;           /* offset adjusted */
 4325 
 4326         if (pageno == 0) {
 4327                 soff = xoff;
 4328                 eoff = PAGE_SIZE;
 4329         } else {
 4330                 soff = (pageno << PAGE_SHIFT);
 4331                 eoff = soff + PAGE_SIZE;
 4332         }
 4333         if (eoff > bcount)
 4334                 eoff = bcount;
 4335         if (soff >= eoff)
 4336                 return;
 4337         vm_page_set_validdirty(m, soff & PAGE_MASK, eoff - soff);
 4338 }
 4339 #endif
 4340 
 4341 /*
 4342  * vfs_bio_clrbuf:
 4343  *
 4344  *      Clear a buffer.  This routine essentially fakes an I/O, so we need
 4345  *      to clear B_ERROR and B_INVAL.
 4346  *
 4347  *      Note that while we only theoretically need to clear through b_bcount,
 4348  *      we go ahead and clear through b_bufsize.
 4349  */
 4350 
 4351 void
 4352 vfs_bio_clrbuf(struct buf *bp)
 4353 {
 4354         int i, mask = 0;
 4355         caddr_t sa, ea;
 4356         if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
 4357                 bp->b_flags &= ~(B_INVAL | B_EINTR | B_ERROR);
 4358                 if ((bp->b_xio.xio_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 4359                     (bp->b_loffset & PAGE_MASK) == 0) {
 4360                         mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
 4361                         if ((bp->b_xio.xio_pages[0]->valid & mask) == mask) {
 4362                                 bp->b_resid = 0;
 4363                                 return;
 4364                         }
 4365                         if (((bp->b_xio.xio_pages[0]->flags & PG_ZERO) == 0) &&
 4366                             ((bp->b_xio.xio_pages[0]->valid & mask) == 0)) {
 4367                                 bzero(bp->b_data, bp->b_bufsize);
 4368                                 bp->b_xio.xio_pages[0]->valid |= mask;
 4369                                 bp->b_resid = 0;
 4370                                 return;
 4371                         }
 4372                 }
 4373                 sa = bp->b_data;
 4374                 for(i=0;i<bp->b_xio.xio_npages;i++,sa=ea) {
 4375                         int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
 4376                         ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
 4377                         ea = (caddr_t)(vm_offset_t)ulmin(
 4378                             (u_long)(vm_offset_t)ea,
 4379                             (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
 4380                         mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 4381                         if ((bp->b_xio.xio_pages[i]->valid & mask) == mask)
 4382                                 continue;
 4383                         if ((bp->b_xio.xio_pages[i]->valid & mask) == 0) {
 4384                                 if ((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) {
 4385                                         bzero(sa, ea - sa);
 4386                                 }
 4387                         } else {
 4388                                 for (; sa < ea; sa += DEV_BSIZE, j++) {
 4389                                         if (((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) &&
 4390                                                 (bp->b_xio.xio_pages[i]->valid & (1<<j)) == 0)
 4391                                                 bzero(sa, DEV_BSIZE);
 4392                                 }
 4393                         }
 4394                         bp->b_xio.xio_pages[i]->valid |= mask;
 4395                         vm_page_flag_clear(bp->b_xio.xio_pages[i], PG_ZERO);
 4396                 }
 4397                 bp->b_resid = 0;
 4398         } else {
 4399                 clrbuf(bp);
 4400         }
 4401 }
 4402 
 4403 /*
 4404  * vm_hold_load_pages:
 4405  *
 4406  *      Load pages into the buffer's address space.  The pages are
 4407  *      allocated from the kernel object in order to reduce interference
 4408  *      with the any VM paging I/O activity.  The range of loaded
 4409  *      pages will be wired.
 4410  *
 4411  *      If a page cannot be allocated, the 'pagedaemon' is woken up to
 4412  *      retrieve the full range (to - from) of pages.
 4413  */
 4414 void
 4415 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 4416 {
 4417         vm_offset_t pg;
 4418         vm_page_t p;
 4419         int index;
 4420 
 4421         to = round_page(to);
 4422         from = round_page(from);
 4423         index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 4424 
 4425         pg = from;
 4426         while (pg < to) {
 4427                 /*
 4428                  * Note: must allocate system pages since blocking here
 4429                  * could intefere with paging I/O, no matter which
 4430                  * process we are.
 4431                  */
 4432                 vm_object_hold(&kernel_object);
 4433                 p = bio_page_alloc(bp, &kernel_object, pg >> PAGE_SHIFT,
 4434                                    (vm_pindex_t)((to - pg) >> PAGE_SHIFT));
 4435                 vm_object_drop(&kernel_object);
 4436                 if (p) {
 4437                         vm_page_wire(p);
 4438                         p->valid = VM_PAGE_BITS_ALL;
 4439                         vm_page_flag_clear(p, PG_ZERO);
 4440                         pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
 4441                         bp->b_xio.xio_pages[index] = p;
 4442                         vm_page_wakeup(p);
 4443 
 4444                         pg += PAGE_SIZE;
 4445                         ++index;
 4446                 }
 4447         }
 4448         bp->b_xio.xio_npages = index;
 4449 }
 4450 
 4451 /*
 4452  * Allocate a page for a buffer cache buffer.
 4453  *
 4454  * If NULL is returned the caller is expected to retry (typically check if
 4455  * the page already exists on retry before trying to allocate one).
 4456  *
 4457  * NOTE! Low-memory handling is dealt with in b[q]relse(), not here.  This
 4458  *       function will use the system reserve with the hope that the page
 4459  *       allocations can be returned to PQ_CACHE/PQ_FREE when the caller
 4460  *       is done with the buffer.
 4461  *
 4462  * NOTE! However, TMPFS is a special case because flushing a dirty buffer
 4463  *       to TMPFS doesn't clean the page.  For TMPFS, only the pagedaemon
 4464  *       is capable of retiring pages (to swap).  For TMPFS we don't dig
 4465  *       into the system reserve because doing so could stall out pretty
 4466  *       much every process running on the system.
 4467  */
 4468 static
 4469 vm_page_t
 4470 bio_page_alloc(struct buf *bp, vm_object_t obj, vm_pindex_t pg, int deficit)
 4471 {
 4472         int vmflags = VM_ALLOC_NORMAL | VM_ALLOC_NULL_OK;
 4473         vm_page_t p;
 4474 
 4475         ASSERT_LWKT_TOKEN_HELD(vm_object_token(obj));
 4476 
 4477         /*
 4478          * Try a normal allocation first.
 4479          */
 4480         p = vm_page_alloc(obj, pg, vmflags);
 4481         if (p)
 4482                 return(p);
 4483         if (vm_page_lookup(obj, pg))
 4484                 return(NULL);
 4485         vm_pageout_deficit += deficit;
 4486 
 4487         /*
 4488          * Try again, digging into the system reserve.
 4489          *
 4490          * Trying to recover pages from the buffer cache here can deadlock
 4491          * against other threads trying to busy underlying pages so we
 4492          * depend on the code in brelse() and bqrelse() to free/cache the
 4493          * underlying buffer cache pages when memory is low.
 4494          */
 4495         if (curthread->td_flags & TDF_SYSTHREAD)
 4496                 vmflags |= VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT;
 4497         else if (bp->b_vp && bp->b_vp->v_tag == VT_TMPFS)
 4498                 vmflags |= 0;
 4499         else
 4500                 vmflags |= VM_ALLOC_SYSTEM;
 4501 
 4502         /*recoverbufpages();*/
 4503         p = vm_page_alloc(obj, pg, vmflags);
 4504         if (p)
 4505                 return(p);
 4506         if (vm_page_lookup(obj, pg))
 4507                 return(NULL);
 4508 
 4509         /*
 4510          * Wait for memory to free up and try again
 4511          */
 4512         if (vm_page_count_severe())
 4513                 ++lowmempgallocs;
 4514         vm_wait(hz / 20 + 1);
 4515 
 4516         p = vm_page_alloc(obj, pg, vmflags);
 4517         if (p)
 4518                 return(p);
 4519         if (vm_page_lookup(obj, pg))
 4520                 return(NULL);
 4521 
 4522         /*
 4523          * Ok, now we are really in trouble.
 4524          */
 4525         {
 4526                 static struct krate biokrate = { .freq = 1 };
 4527                 krateprintf(&biokrate,
 4528                             "Warning: bio_page_alloc: memory exhausted "
 4529                             "during bufcache page allocation from %s\n",
 4530                             curthread->td_comm);
 4531         }
 4532         if (curthread->td_flags & TDF_SYSTHREAD)
 4533                 vm_wait(hz / 20 + 1);
 4534         else
 4535                 vm_wait(hz / 2 + 1);
 4536         return (NULL);
 4537 }
 4538 
 4539 /*
 4540  * vm_hold_free_pages:
 4541  *
 4542  *      Return pages associated with the buffer back to the VM system.
 4543  *
 4544  *      The range of pages underlying the buffer's address space will
 4545  *      be unmapped and un-wired.
 4546  */
 4547 void
 4548 vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 4549 {
 4550         vm_offset_t pg;
 4551         vm_page_t p;
 4552         int index, newnpages;
 4553 
 4554         from = round_page(from);
 4555         to = round_page(to);
 4556         index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 4557         newnpages = index;
 4558 
 4559         for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 4560                 p = bp->b_xio.xio_pages[index];
 4561                 if (p && (index < bp->b_xio.xio_npages)) {
 4562                         if (p->busy) {
 4563                                 kprintf("vm_hold_free_pages: doffset: %lld, "
 4564                                         "loffset: %lld\n",
 4565                                         (long long)bp->b_bio2.bio_offset,
 4566                                         (long long)bp->b_loffset);
 4567                         }
 4568                         bp->b_xio.xio_pages[index] = NULL;
 4569                         pmap_kremove(pg);
 4570                         vm_page_busy_wait(p, FALSE, "vmhldpg");
 4571                         vm_page_unwire(p, 0);
 4572                         vm_page_free(p);
 4573                 }
 4574         }
 4575         bp->b_xio.xio_npages = newnpages;
 4576 }
 4577 
 4578 /*
 4579  * vmapbuf:
 4580  *
 4581  *      Map a user buffer into KVM via a pbuf.  On return the buffer's
 4582  *      b_data, b_bufsize, and b_bcount will be set, and its XIO page array
 4583  *      initialized.
 4584  */
 4585 int
 4586 vmapbuf(struct buf *bp, caddr_t udata, int bytes)
 4587 {
 4588         caddr_t addr;
 4589         vm_offset_t va;
 4590         vm_page_t m;
 4591         int vmprot;
 4592         int error;
 4593         int pidx;
 4594         int i;
 4595 
 4596         /* 
 4597          * bp had better have a command and it better be a pbuf.
 4598          */
 4599         KKASSERT(bp->b_cmd != BUF_CMD_DONE);
 4600         KKASSERT(bp->b_flags & B_PAGING);
 4601         KKASSERT(bp->b_kvabase);
 4602 
 4603         if (bytes < 0)
 4604                 return (-1);
 4605 
 4606         /*
 4607          * Map the user data into KVM.  Mappings have to be page-aligned.
 4608          */
 4609         addr = (caddr_t)trunc_page((vm_offset_t)udata);
 4610         pidx = 0;
 4611 
 4612         vmprot = VM_PROT_READ;
 4613         if (bp->b_cmd == BUF_CMD_READ)
 4614                 vmprot |= VM_PROT_WRITE;
 4615 
 4616         while (addr < udata + bytes) {
 4617                 /*
 4618                  * Do the vm_fault if needed; do the copy-on-write thing
 4619                  * when reading stuff off device into memory.
 4620                  *
 4621                  * vm_fault_page*() returns a held VM page.
 4622                  */
 4623                 va = (addr >= udata) ? (vm_offset_t)addr : (vm_offset_t)udata;
 4624                 va = trunc_page(va);
 4625 
 4626                 m = vm_fault_page_quick(va, vmprot, &error);
 4627                 if (m == NULL) {
 4628                         for (i = 0; i < pidx; ++i) {
 4629                             vm_page_unhold(bp->b_xio.xio_pages[i]);
 4630                             bp->b_xio.xio_pages[i] = NULL;
 4631                         }
 4632                         return(-1);
 4633                 }
 4634                 bp->b_xio.xio_pages[pidx] = m;
 4635                 addr += PAGE_SIZE;
 4636                 ++pidx;
 4637         }
 4638 
 4639         /*
 4640          * Map the page array and set the buffer fields to point to
 4641          * the mapped data buffer.
 4642          */
 4643         if (pidx > btoc(MAXPHYS))
 4644                 panic("vmapbuf: mapped more than MAXPHYS");
 4645         pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_xio.xio_pages, pidx);
 4646 
 4647         bp->b_xio.xio_npages = pidx;
 4648         bp->b_data = bp->b_kvabase + ((int)(intptr_t)udata & PAGE_MASK);
 4649         bp->b_bcount = bytes;
 4650         bp->b_bufsize = bytes;
 4651         return(0);
 4652 }
 4653 
 4654 /*
 4655  * vunmapbuf:
 4656  *
 4657  *      Free the io map PTEs associated with this IO operation.
 4658  *      We also invalidate the TLB entries and restore the original b_addr.
 4659  */
 4660 void
 4661 vunmapbuf(struct buf *bp)
 4662 {
 4663         int pidx;
 4664         int npages;
 4665 
 4666         KKASSERT(bp->b_flags & B_PAGING);
 4667 
 4668         npages = bp->b_xio.xio_npages;
 4669         pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
 4670         for (pidx = 0; pidx < npages; ++pidx) {
 4671                 vm_page_unhold(bp->b_xio.xio_pages[pidx]);
 4672                 bp->b_xio.xio_pages[pidx] = NULL;
 4673         }
 4674         bp->b_xio.xio_npages = 0;
 4675         bp->b_data = bp->b_kvabase;
 4676 }
 4677 
 4678 /*
 4679  * Scan all buffers in the system and issue the callback.
 4680  */
 4681 int
 4682 scan_all_buffers(int (*callback)(struct buf *, void *), void *info)
 4683 {
 4684         int count = 0;
 4685         int error;
 4686         long n;
 4687 
 4688         for (n = 0; n < nbuf; ++n) {
 4689                 if ((error = callback(&buf[n], info)) < 0) {
 4690                         count = error;
 4691                         break;
 4692                 }
 4693                 count += error;
 4694         }
 4695         return (count);
 4696 }
 4697 
 4698 /*
 4699  * nestiobuf_iodone: biodone callback for nested buffers and propagate
 4700  * completion to the master buffer.
 4701  */
 4702 static void
 4703 nestiobuf_iodone(struct bio *bio)
 4704 {
 4705         struct bio *mbio;
 4706         struct buf *mbp, *bp;
 4707         struct devstat *stats;
 4708         int error;
 4709         int donebytes;
 4710 
 4711         bp = bio->bio_buf;
 4712         mbio = bio->bio_caller_info1.ptr;
 4713         stats = bio->bio_caller_info2.ptr;
 4714         mbp = mbio->bio_buf;
 4715 
 4716         KKASSERT(bp->b_bcount <= bp->b_bufsize);
 4717         KKASSERT(mbp != bp);
 4718 
 4719         error = bp->b_error;
 4720         if (bp->b_error == 0 &&
 4721             (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
 4722                 /*
 4723                  * Not all got transfered, raise an error. We have no way to
 4724                  * propagate these conditions to mbp.
 4725                  */
 4726                 error = EIO;
 4727         }
 4728 
 4729         donebytes = bp->b_bufsize;
 4730 
 4731         relpbuf(bp, NULL);
 4732 
 4733         nestiobuf_done(mbio, donebytes, error, stats);
 4734 }
 4735 
 4736 void
 4737 nestiobuf_done(struct bio *mbio, int donebytes, int error, struct devstat *stats)
 4738 {
 4739         struct buf *mbp;
 4740 
 4741         mbp = mbio->bio_buf;    
 4742 
 4743         KKASSERT((int)(intptr_t)mbio->bio_driver_info > 0);
 4744 
 4745         /*
 4746          * If an error occured, propagate it to the master buffer.
 4747          *
 4748          * Several biodone()s may wind up running concurrently so
 4749          * use an atomic op to adjust b_flags.
 4750          */
 4751         if (error) {
 4752                 mbp->b_error = error;
 4753                 atomic_set_int(&mbp->b_flags, B_ERROR);
 4754         }
 4755 
 4756         /*
 4757          * Decrement the operations in progress counter and terminate the
 4758          * I/O if this was the last bit.
 4759          */
 4760         if (atomic_fetchadd_int((int *)&mbio->bio_driver_info, -1) == 1) {
 4761                 mbp->b_resid = 0;
 4762                 if (stats)
 4763                         devstat_end_transaction_buf(stats, mbp);
 4764                 biodone(mbio);
 4765         }
 4766 }
 4767 
 4768 /*
 4769  * Initialize a nestiobuf for use.  Set an initial count of 1 to prevent
 4770  * the mbio from being biodone()'d while we are still adding sub-bios to
 4771  * it.
 4772  */
 4773 void
 4774 nestiobuf_init(struct bio *bio)
 4775 {
 4776         bio->bio_driver_info = (void *)1;
 4777 }
 4778 
 4779 /*
 4780  * The BIOs added to the nestedio have already been started, remove the
 4781  * count that placeheld our mbio and biodone() it if the count would
 4782  * transition to 0.
 4783  */
 4784 void
 4785 nestiobuf_start(struct bio *mbio)
 4786 {
 4787         struct buf *mbp = mbio->bio_buf;
 4788 
 4789         /*
 4790          * Decrement the operations in progress counter and terminate the
 4791          * I/O if this was the last bit.
 4792          */
 4793         if (atomic_fetchadd_int((int *)&mbio->bio_driver_info, -1) == 1) {
 4794                 if (mbp->b_flags & B_ERROR)
 4795                         mbp->b_resid = mbp->b_bcount;
 4796                 else
 4797                         mbp->b_resid = 0;
 4798                 biodone(mbio);
 4799         }
 4800 }
 4801 
 4802 /*
 4803  * Set an intermediate error prior to calling nestiobuf_start()
 4804  */
 4805 void
 4806 nestiobuf_error(struct bio *mbio, int error)
 4807 {
 4808         struct buf *mbp = mbio->bio_buf;
 4809 
 4810         if (error) {
 4811                 mbp->b_error = error;
 4812                 atomic_set_int(&mbp->b_flags, B_ERROR);
 4813         }
 4814 }
 4815 
 4816 /*
 4817  * nestiobuf_add: setup a "nested" buffer.
 4818  *
 4819  * => 'mbp' is a "master" buffer which is being divided into sub pieces.
 4820  * => 'bp' should be a buffer allocated by getiobuf.
 4821  * => 'offset' is a byte offset in the master buffer.
 4822  * => 'size' is a size in bytes of this nested buffer.
 4823  */
 4824 void
 4825 nestiobuf_add(struct bio *mbio, struct buf *bp, int offset, size_t size, struct devstat *stats)
 4826 {
 4827         struct buf *mbp = mbio->bio_buf;
 4828         struct vnode *vp = mbp->b_vp;
 4829 
 4830         KKASSERT(mbp->b_bcount >= offset + size);
 4831 
 4832         atomic_add_int((int *)&mbio->bio_driver_info, 1);
 4833 
 4834         /* kernel needs to own the lock for it to be released in biodone */
 4835         BUF_KERNPROC(bp);
 4836         bp->b_vp = vp;
 4837         bp->b_cmd = mbp->b_cmd;
 4838         bp->b_bio1.bio_done = nestiobuf_iodone;
 4839         bp->b_data = (char *)mbp->b_data + offset;
 4840         bp->b_resid = bp->b_bcount = size;
 4841         bp->b_bufsize = bp->b_bcount;
 4842 
 4843         bp->b_bio1.bio_track = NULL;
 4844         bp->b_bio1.bio_caller_info1.ptr = mbio;
 4845         bp->b_bio1.bio_caller_info2.ptr = stats;
 4846 }
 4847 
 4848 #ifdef DDB
 4849 
 4850 DB_SHOW_COMMAND(buffer, db_show_buffer)
 4851 {
 4852         /* get args */
 4853         struct buf *bp = (struct buf *)addr;
 4854 
 4855         if (!have_addr) {
 4856                 db_printf("usage: show buffer <addr>\n");
 4857                 return;
 4858         }
 4859 
 4860         db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
 4861         db_printf("b_cmd = %d\n", bp->b_cmd);
 4862         db_printf("b_error = %d, b_bufsize = %d, b_bcount = %d, "
 4863                   "b_resid = %d\n, b_data = %p, "
 4864                   "bio_offset(disk) = %lld, bio_offset(phys) = %lld\n",
 4865                   bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 4866                   bp->b_data,
 4867                   (long long)bp->b_bio2.bio_offset,
 4868                   (long long)(bp->b_bio2.bio_next ?
 4869                                 bp->b_bio2.bio_next->bio_offset : (off_t)-1));
 4870         if (bp->b_xio.xio_npages) {
 4871                 int i;
 4872                 db_printf("b_xio.xio_npages = %d, pages(OBJ, IDX, PA): ",
 4873                         bp->b_xio.xio_npages);
 4874                 for (i = 0; i < bp->b_xio.xio_npages; i++) {
 4875                         vm_page_t m;
 4876                         m = bp->b_xio.xio_pages[i];
 4877                         db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
 4878                             (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
 4879                         if ((i + 1) < bp->b_xio.xio_npages)
 4880                                 db_printf(",");
 4881                 }
 4882                 db_printf("\n");
 4883         }
 4884 }
 4885 #endif /* DDB */

Cache object: 26ea41e6f01de8de41b5099cde1e501f


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.