FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_bio.c

    1 /*
    2  * Copyright (c) 1994,1997 John S. Dyson
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice immediately at the beginning of the file, without modification,
   10  *    this list of conditions, and the following disclaimer.
   11  * 2. Absolutely no warranty of function or purpose is made by the author
   12  *              John S. Dyson.
   13  *
   14  * $FreeBSD$
   15  */
   16 
   17 /*
   18  * This file contains a new buffer I/O scheme implementing a coherent
   19  * VM object and buffer cache scheme.  Pains have been taken to make
   20  * sure that the performance degradation associated with schemes such
   21  * as this is not realized.
   22  *
   23  * Author:  John S. Dyson
   24  * Significant help during the development and debugging phases
   25  * was provided by David Greenman, also of the FreeBSD core team.
   26  *
   27  * see man buf(9) for more info.
   28  */
   29 
   30 #include <sys/param.h>
   31 #include <sys/systm.h>
   32 #include <sys/buf.h>
   33 #include <sys/conf.h>
   34 #include <sys/eventhandler.h>
   35 #include <sys/lock.h>
   36 #include <sys/malloc.h>
   37 #include <sys/mount.h>
   38 #include <sys/kernel.h>
   39 #include <sys/kthread.h>
   40 #include <sys/proc.h>
   41 #include <sys/reboot.h>
   42 #include <sys/resourcevar.h>
   43 #include <sys/sysctl.h>
   44 #include <sys/vmmeter.h>
   45 #include <sys/vnode.h>
   46 #include <vm/vm.h>
   47 #include <vm/vm_param.h>
   48 #include <vm/vm_kern.h>
   49 #include <vm/vm_pageout.h>
   50 #include <vm/vm_page.h>
   51 #include <vm/vm_object.h>
   52 #include <vm/vm_extern.h>
   53 #include <vm/vm_map.h>
   54 
   55 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
   56 
   57 struct  bio_ops bioops;         /* I/O operation notification */
   58 
   59 struct buf *buf;                /* buffer header pool */
   60 struct swqueue bswlist;
   61 
   62 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
   63                 vm_offset_t to);
   64 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
   65                 vm_offset_t to);
   66 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
   67                                int pageno, vm_page_t m);
   68 static void vfs_clean_pages(struct buf * bp);
   69 static void vfs_setdirty(struct buf *bp);
   70 static void vfs_vmio_release(struct buf *bp);
   71 static void vfs_backgroundwritedone(struct buf *bp);
   72 static int flushbufqueues(void);
   73 
   74 static int bd_request;
   75 
   76 static void buf_daemon __P((void));
   77 /*
   78  * bogus page -- for I/O to/from partially complete buffers
   79  * this is a temporary solution to the problem, but it is not
   80  * really that bad.  it would be better to split the buffer
   81  * for input in the case of buffers partially already in memory,
   82  * but the code is intricate enough already.
   83  */
   84 vm_page_t bogus_page;
   85 int vmiodirenable = TRUE;
   86 int runningbufspace;
   87 static vm_offset_t bogus_offset;
   88 
   89 static int bufspace, maxbufspace,
   90         bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
   91 static int bufreusecnt, bufdefragcnt, buffreekvacnt;
   92 static int needsbuffer;
   93 static int lorunningspace, hirunningspace, runningbufreq;
   94 static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
   95 static int numfreebuffers, lofreebuffers, hifreebuffers;
   96 static int getnewbufcalls;
   97 static int getnewbufrestarts;
   98 
   99 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
  100         &numdirtybuffers, 0, "");
  101 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
  102         &lodirtybuffers, 0, "");
  103 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
  104         &hidirtybuffers, 0, "");
  105 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
  106         &numfreebuffers, 0, "");
  107 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
  108         &lofreebuffers, 0, "");
  109 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
  110         &hifreebuffers, 0, "");
  111 SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
  112         &runningbufspace, 0, "");
  113 SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW,
  114         &lorunningspace, 0, "");
  115 SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW,
  116         &hirunningspace, 0, "");
  117 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD,
  118         &maxbufspace, 0, "");
  119 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
  120         &hibufspace, 0, "");
  121 SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD,
  122         &lobufspace, 0, "");
  123 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
  124         &bufspace, 0, "");
  125 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
  126         &maxbufmallocspace, 0, "");
  127 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
  128         &bufmallocspace, 0, "");
  129 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
  130         &getnewbufcalls, 0, "");
  131 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
  132         &getnewbufrestarts, 0, "");
  133 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW,
  134         &vmiodirenable, 0, "");
  135 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW,
  136         &bufdefragcnt, 0, "");
  137 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW,
  138         &buffreekvacnt, 0, "");
  139 SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW,
  140         &bufreusecnt, 0, "");
  141 
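The counters above are exported under the vfs sysctl tree (vfs.numdirtybuffers,
vfs.hidirtybuffers, and so on).  A minimal userland sketch, not part of this
file, that reads two of them with sysctlbyname(3):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int numdirty, hidirty;
	size_t len;

	len = sizeof(numdirty);
	if (sysctlbyname("vfs.numdirtybuffers", &numdirty, &len, NULL, 0) == -1) {
		perror("vfs.numdirtybuffers");
		return (1);
	}
	len = sizeof(hidirty);
	if (sysctlbyname("vfs.hidirtybuffers", &hidirty, &len, NULL, 0) == -1) {
		perror("vfs.hidirtybuffers");
		return (1);
	}
	printf("%d dirty buffers (stall threshold %d)\n", numdirty, hidirty);
	return (0);
}
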
  142 static int bufhashmask;
  143 static int bufhashshift;
  144 static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
  145 struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
  146 char *buf_wmesg = BUF_WMESG;
  147 
  148 extern int vm_swap_size;
  149 
  150 #define VFS_BIO_NEED_ANY        0x01    /* any freeable buffer */
  151 #define VFS_BIO_NEED_DIRTYFLUSH 0x02    /* waiting for dirty buffer flush */
  152 #define VFS_BIO_NEED_FREE       0x04    /* wait for free bufs, hi hysteresis */
  153 #define VFS_BIO_NEED_BUFSPACE   0x08    /* wait for buf space, lo hysteresis */
  154 
  155 /*
  156  * Buffer hash table code.  Note that the logical block scans linearly, which
  157  * gives us some L1 cache locality.
  158  */
  159 
  160 static __inline 
  161 struct bufhashhdr *
  162 bufhash(struct vnode *vnp, daddr_t bn)
  163 {
  164         u_int64_t hashkey64;
  165         int hashkey; 
  166 
  167         /*
  168          * A variation on the Fibonacci hash that Knuth credits to
  169          * R. W. Floyd, see Knuth's _Art of Computer Programming,
  170          * Volume 3 / Sorting and Searching_
  171          *
  172          * We reduce the argument to 32 bits before doing the hash to
  173          * avoid the need for a slow 64x64 multiply on 32 bit platforms.
  174          *
  175          * sizeof(struct vnode) is 168 on i386, so toss some of the lower
  176          * bits of the vnode address to reduce the key range, which
  177          * improves the distribution of keys across buckets.
  178          *
  179          * The file system cylinder group blocks are very heavily
  180  * used.  They are located at intervals of fpg, which is
  181          * on the order of 89 to 94 * 2^10, depending on other
  182          * filesystem parameters, for a 16k block size.  Smaller block
  183          * sizes will reduce fpg approximately proportionally.  This
  184          * will cause the cylinder group index to be hashed using the
  185          * lower bits of the hash multiplier, which will not distribute
  186          * the keys as uniformly in a classic Fibonacci hash where a
  187          * relatively small number of the upper bits of the result
  188          * are used.  Using 2^16 as a close-enough approximation to
  189          * fpg, split the hash multiplier in half, with the upper 16
  190          * bits being the inverse of the golden ratio, and the lower
  191          * 16 bits being a fraction between 1/3 and 3/7 (closer to
  192  * 3/7 in this case), which gives good experimental results.
  193          */
  194         hashkey64 = ((u_int64_t)(uintptr_t)vnp >> 3) + (u_int64_t)bn;
  195         hashkey = (((u_int32_t)(hashkey64 + (hashkey64 >> 32)) * 0x9E376DB1u) >>
  196             bufhashshift) & bufhashmask;
  197         return(&bufhashtbl[hashkey]);
  198 }
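
A standalone sketch of the hash above, using made-up table parameters (1024
buckets, hence shift 22 and mask 0x3ff; bufhashinit() below shows how the real
values are derived from nbuf).  The vnode address and block numbers are
invented for illustration:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_SHIFT	22
#define SKETCH_MASK	0x3ff

static unsigned
bufhash_sketch(uintptr_t vnp, uint64_t bn)
{
	/* Fold to 32 bits, multiply, and keep the high-order bucket bits. */
	uint64_t key64 = ((uint64_t)vnp >> 3) + bn;
	uint32_t key32 = (uint32_t)(key64 + (key64 >> 32));

	return (((key32 * 0x9E376DB1u) >> SKETCH_SHIFT) & SKETCH_MASK);
}

int
main(void)
{
	uintptr_t vnp = 0xc0a1b2c8;	/* fake vnode address */
	uint64_t bn;

	for (bn = 0; bn < 8; bn++)
		printf("block %llu -> bucket %u\n",
		    (unsigned long long)bn, bufhash_sketch(vnp, bn));
	return (0);
}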
  199 
  200 /*
  201  *      numdirtywakeup:
  202  *
  203  *      If someone is blocked due to there being too many dirty buffers,
  204  *      and numdirtybuffers is now reasonable, wake them up.
  205  */
  206 
  207 static __inline void
  208 numdirtywakeup(int level)
  209 {
  210         if (numdirtybuffers <= level) {
  211                 if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
  212                         needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
  213                         wakeup(&needsbuffer);
  214                 }
  215         }
  216 }
  217 
  218 /*
  219  *      bufspacewakeup:
  220  *
  221  *      Called when buffer space is potentially available for recovery.
  222  *      getnewbuf() will block on this flag when it is unable to free 
  223  *      sufficient buffer space.  Buffer space becomes recoverable when 
  224  *      bp's get placed back in the queues.
  225  */
  226 
  227 static __inline void
  228 bufspacewakeup(void)
  229 {
  230         /*
  231          * If someone is waiting for BUF space, wake them up.  Even
  232          * though we haven't freed the kva space yet, the waiting
  233          * process will be able to proceed now.
  234          */
  235         if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
  236                 needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
  237                 wakeup(&needsbuffer);
  238         }
  239 }
  240 
  241 /*
  242  * runningbufwakeup() - in-progress I/O accounting.
  243  *
  244  */
  245 static __inline void
  246 runningbufwakeup(struct buf *bp)
  247 {
  248         if (bp->b_runningbufspace) {
  249                 runningbufspace -= bp->b_runningbufspace;
  250                 bp->b_runningbufspace = 0;
  251                 if (runningbufreq && runningbufspace <= lorunningspace) {
  252                         runningbufreq = 0;
  253                         wakeup(&runningbufreq);
  254                 }
  255         }
  256 }
  257 
  258 /*
  259  *      bufcountwakeup:
  260  *
  261  *      Called when a buffer has been added to one of the free queues to
  262  *      account for the buffer and to wakeup anyone waiting for free buffers.
  263  *      This typically occurs when large amounts of metadata are being handled
  264  *      by the buffer cache ( else buffer space runs out first, usually ).
  265  */
  266 
  267 static __inline void
  268 bufcountwakeup(void) 
  269 {
  270         ++numfreebuffers;
  271         if (needsbuffer) {
  272                 needsbuffer &= ~VFS_BIO_NEED_ANY;
  273                 if (numfreebuffers >= hifreebuffers)
  274                         needsbuffer &= ~VFS_BIO_NEED_FREE;
  275                 wakeup(&needsbuffer);
  276         }
  277 }
  278 
  279 /*
  280  *      waitrunningbufspace()
  281  *
  282  *      runningbufspace is a measure of the amount of I/O currently
  283  *      running.  This routine is used in async-write situations to
  284  *      prevent creating huge backups of pending writes to a device.
  285  *      Only asynchronous writes are governed by this function.  
  286  *
  287  *      Reads will adjust runningbufspace, but will not block based on it.
  288  *      The read load has a side effect of reducing the allowed write load.
  289  *
  290  *      This does NOT turn an async write into a sync write.  It waits
  291  *      for earlier writes to complete and generally returns before the
  292  *      caller's write has reached the device.
  293  */
  294 static __inline void
  295 waitrunningbufspace(void)
  296 {
  297         while (runningbufspace > hirunningspace) {
  298                 int s;
  299 
  300                 s = splbio();   /* fix race against interrupt/biodone() */
  301                 ++runningbufreq;
  302                 tsleep(&runningbufreq, PVM, "wdrain", 0);
  303                 splx(s);
  304         }
  305 }
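
The pair above (runningbufwakeup() and waitrunningbufspace()) is a low/high
watermark throttle: writers stall once the in-flight byte count passes
hirunningspace, and completions wake them only after it has drained back to
lorunningspace.  A userland sketch of that hysteresis, with pthreads standing
in for tsleep()/wakeup() and the default watermarks that bufinit() installs;
it illustrates the pattern rather than transcribing the kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static long running;				/* bytes of write I/O in flight */
static const long lorunning = 512 * 1024;	/* wake waiters at or below this */
static const long hirunning = 1024 * 1024;	/* block new writers above this */

/* Writer side: block while too much I/O is in flight, then account for ours. */
static void
start_write(long size)
{
	pthread_mutex_lock(&lock);
	while (running > hirunning)
		pthread_cond_wait(&drained, &lock);
	running += size;
	pthread_mutex_unlock(&lock);
}

/* Completion side: retire the I/O, waking writers only at the low watermark. */
static void
finish_write(long size)
{
	pthread_mutex_lock(&lock);
	running -= size;
	if (running <= lorunning)
		pthread_cond_broadcast(&drained);
	pthread_mutex_unlock(&lock);
}

int
main(void)
{
	int i;

	for (i = 0; i < 8; i++)
		start_write(64 * 1024);		/* queue eight 64K writes */
	printf("in flight: %ld bytes\n", running);
	for (i = 0; i < 8; i++)
		finish_write(64 * 1024);	/* retire them */
	printf("in flight: %ld bytes\n", running);
	return (0);
}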
  306 
  307 /*
  308  *      vfs_buf_test_cache:
  309  *
  310  *      Called when a buffer is extended.  This function clears the B_CACHE
  311  *      bit if the newly extended portion of the buffer does not contain
  312  *      valid data.
  313  */
  314 static __inline__
  315 void
  316 vfs_buf_test_cache(struct buf *bp,
  317                   vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
  318                   vm_page_t m)
  319 {
  320         if (bp->b_flags & B_CACHE) {
  321                 int base = (foff + off) & PAGE_MASK;
  322                 if (vm_page_is_valid(m, base, size) == 0)
  323                         bp->b_flags &= ~B_CACHE;
  324         }
  325 }
  326 
  327 static __inline__
  328 void
  329 bd_wakeup(int dirtybuflevel)
  330 {
  331         if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
  332                 bd_request = 1;
  333                 wakeup(&bd_request);
  334         }
  335 }
  336 
  337 /*
  338  * bd_speedup - speedup the buffer cache flushing code
  339  */
  340 
  341 static __inline__
  342 void
  343 bd_speedup(void)
  344 {
  345         bd_wakeup(1);
  346 }
  347 
  348 /*
  349  * Initialize buffer headers and related structures. 
  350  */
  351 
  352 caddr_t
  353 bufhashinit(caddr_t vaddr)
  354 {
  355         /* first, make a null hash table */
  356         bufhashshift = 29;
  357         for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
  358                 bufhashshift--;
  359         bufhashtbl = (void *)vaddr;
  360         vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
  361         --bufhashmask;
  362         return(vaddr);
  363 }
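
A worked pass through the sizing loop above, assuming for illustration that
nbuf is 4096: the mask doubles from 8 until it reaches nbuf / 4 = 1024,
decrementing the shift from 29 to 22, so bufhash() indexes 1024 buckets with
the top ten bits of the 32-bit product (mask 0x3ff after the final decrement).

#include <stdio.h>

int
main(void)
{
	int nbuf = 4096;	/* invented for the example */
	int shift = 29, mask;

	for (mask = 8; mask < nbuf / 4; mask <<= 1)
		shift--;
	/* prints: 1024 buckets, shift 22, mask 0x3ff */
	printf("%d buckets, shift %d, mask 0x%x\n", mask, shift, mask - 1);
	return (0);
}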
  364 
  365 void
  366 bufinit(void)
  367 {
  368         struct buf *bp;
  369         int i;
  370 
  371         TAILQ_INIT(&bswlist);
  372         LIST_INIT(&invalhash);
  373         simple_lock_init(&buftimelock);
  374 
  375         for (i = 0; i <= bufhashmask; i++)
  376                 LIST_INIT(&bufhashtbl[i]);
  377 
  378         /* next, make a null set of free lists */
  379         for (i = 0; i < BUFFER_QUEUES; i++)
  380                 TAILQ_INIT(&bufqueues[i]);
  381 
  382         /* finally, initialize each buffer header and stick on empty q */
  383         for (i = 0; i < nbuf; i++) {
  384                 bp = &buf[i];
  385                 bzero(bp, sizeof *bp);
  386                 bp->b_flags = B_INVAL;  /* we're just an empty header */
  387                 bp->b_dev = NODEV;
  388                 bp->b_rcred = NOCRED;
  389                 bp->b_wcred = NOCRED;
  390                 bp->b_qindex = QUEUE_EMPTY;
  391                 bp->b_xflags = 0;
  392                 LIST_INIT(&bp->b_dep);
  393                 BUF_LOCKINIT(bp);
  394                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
  395                 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
  396         }
  397 
  398         /*
  399          * maxbufspace is the absolute maximum amount of buffer space we are 
  400          * allowed to reserve in KVM and in real terms.  The absolute maximum
  401          * is nominally used by buf_daemon.  hibufspace is the nominal maximum
  402          * used by most other processes.  The differential is required to 
  403          * ensure that buf_daemon is able to run when other processes might 
  404          * be blocked waiting for buffer space.
  405          *
  406          * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
  407          * this may result in KVM fragmentation which is not handled optimally
  408          * by the system.
  409          */
  410         maxbufspace = nbuf * BKVASIZE;
  411         hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
  412         lobufspace = hibufspace - MAXBSIZE;
  413 
  414         lorunningspace = 512 * 1024;
  415         hirunningspace = 1024 * 1024;
  416 
  417 /*
  418  * Limit the amount of malloc memory since it is wired permanently into
  419  * the kernel space.  Even though this is accounted for in the buffer
  420  * allocation, we don't want the malloced region to grow uncontrolled.
  421  * The malloc scheme improves memory utilization significantly on average
  422  * (small) directories.
  423  */
  424         maxbufmallocspace = hibufspace / 20;
  425 
  426 /*
  427  * Reduce the chance of a deadlock occurring by limiting the number
  428  * of delayed-write dirty buffers we allow to stack up.
  429  */
  430         hidirtybuffers = nbuf / 4 + 20;
  431         numdirtybuffers = 0;
  432 /*
  433  * To support extreme low-memory systems, make sure hidirtybuffers cannot
  434  * eat up all available buffer space.  This occurs when our minimum cannot
  435  * be met.  We try to size hidirtybuffers to 3/4 of our buffer space assuming
  436  * BKVASIZE'd (8K) buffers.
  437  */
  438         while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
  439                 hidirtybuffers >>= 1;
  440         }
  441         lodirtybuffers = hidirtybuffers / 2;
  442 
  443 /*
  444  * Try to keep the number of free buffers in the specified range,
  445  * and give special processes (e.g., buf_daemon) access to an
  446  * emergency reserve.
  447  */
  448         lofreebuffers = nbuf / 18 + 5;
  449         hifreebuffers = 2 * lofreebuffers;
  450         numfreebuffers = nbuf;
  451 
  452 /*
  453  * Maximum number of async ops initiated per buf_daemon loop.  This is
  454  * somewhat of a hack at the moment; we really need to limit ourselves
  455  * based on the number of bytes of I/O in-transit that were initiated
  456  * from buf_daemon.
  457  */
  458 
  459         bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
  460         bogus_page = vm_page_alloc(kernel_object,
  461                         ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
  462                         VM_ALLOC_NORMAL);
  463         cnt.v_wire_count++;
  464 
  465 }
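
For concreteness, a worked evaluation of the limits computed above under one
hypothetical configuration, nbuf = 4096 with BKVASIZE = 16K and MAXBSIZE = 64K
(the real constants depend on the kernel configuration and available memory):

#include <stdio.h>

static int
imax(int a, int b)
{
	return (a > b ? a : b);
}

int
main(void)
{
	const int nbuf = 4096, bkvasize = 16384, maxbsize = 65536;
	int maxbufspace, hibufspace, lobufspace, hidirtybuffers;

	maxbufspace = nbuf * bkvasize;
	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - maxbsize * 10);
	lobufspace = hibufspace - maxbsize;
	hidirtybuffers = nbuf / 4 + 20;
	while (hidirtybuffers * bkvasize > 3 * hibufspace / 4)
		hidirtybuffers >>= 1;

	printf("maxbufspace    %d\n", maxbufspace);	/* 67108864 (64MB) */
	printf("hibufspace     %d\n", hibufspace);	/* 66453504 */
	printf("lobufspace     %d\n", lobufspace);	/* 66387968 */
	printf("hidirtybuffers %d\n", hidirtybuffers);	/* 1044, no halving needed */
	return (0);
}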
  466 
  467 /*
  468  * bfreekva() - free the kva allocation for a buffer.
  469  *
  470  *      Must be called at splbio() or higher as this is the only locking for
  471  *      buffer_map.
  472  *
  473  *      Since this call frees up buffer space, we call bufspacewakeup().
  474  */
  475 static void
  476 bfreekva(struct buf * bp)
  477 {
  478         if (bp->b_kvasize) {
  479                 ++buffreekvacnt;
  480                 vm_map_lock(buffer_map);
  481                 bufspace -= bp->b_kvasize;
  482                 vm_map_delete(buffer_map,
  483                     (vm_offset_t) bp->b_kvabase,
  484                     (vm_offset_t) bp->b_kvabase + bp->b_kvasize
  485                 );
  486                 vm_map_unlock(buffer_map);
  487                 bp->b_kvasize = 0;
  488                 bufspacewakeup();
  489         }
  490 }
  491 
  492 /*
  493  *      bremfree:
  494  *
  495  *      Remove the buffer from the appropriate free list.
  496  */
  497 void
  498 bremfree(struct buf * bp)
  499 {
  500         int s = splbio();
  501         int old_qindex = bp->b_qindex;
  502 
  503         if (bp->b_qindex != QUEUE_NONE) {
  504                 KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
  505                 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
  506                 bp->b_qindex = QUEUE_NONE;
  507         } else {
  508                 if (BUF_REFCNT(bp) <= 1)
  509                         panic("bremfree: removing a buffer not on a queue");
  510         }
  511 
  512         /*
  513          * Fixup numfreebuffers count.  If the buffer is invalid or not
  514          * delayed-write, and it was on one of the free queues (EMPTY,
  515          * EMPTYKVA, CLEAN, or DIRTY), we must decrement numfreebuffers.
  516          */
  517         if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
  518                 switch(old_qindex) {
  519                 case QUEUE_DIRTY:
  520                 case QUEUE_CLEAN:
  521                 case QUEUE_EMPTY:
  522                 case QUEUE_EMPTYKVA:
  523                         --numfreebuffers;
  524                         break;
  525                 default:
  526                         break;
  527                 }
  528         }
  529         splx(s);
  530 }
  531 
  532 
  533 /*
  534  * Get a buffer with the specified data.  Look in the cache first.  We
  535  * must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
  536  * is set, the buffer is valid and we do not have to do anything ( see
  537  * getblk() ).
  538  */
  539 int
  540 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
  541     struct buf ** bpp)
  542 {
  543         struct buf *bp;
  544 
  545         bp = getblk(vp, blkno, size, 0, 0);
  546         *bpp = bp;
  547 
  548         /* if not found in cache, do some I/O */
  549         if ((bp->b_flags & B_CACHE) == 0) {
  550                 if (curproc != NULL)
  551                         curproc->p_stats->p_ru.ru_inblock++;
  552                 KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
  553                 bp->b_flags |= B_READ;
  554                 bp->b_flags &= ~(B_ERROR | B_INVAL);
  555                 if (bp->b_rcred == NOCRED) {
  556                         if (cred != NOCRED)
  557                                 crhold(cred);
  558                         bp->b_rcred = cred;
  559                 }
  560                 vfs_busy_pages(bp, 0);
  561                 VOP_STRATEGY(vp, bp);
  562                 return (biowait(bp));
  563         }
  564         return (0);
  565 }
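
The typical consumer of bread() is filesystem code reading a block of
metadata: request the block, check for an error, and hand the buffer back
with brelse() on failure or bqrelse() when the data is likely to be wanted
again soon.  A sketch of that pattern; read_meta_block() and its surroundings
are invented, only bread(), brelse() and bqrelse() come from this file:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/vnode.h>

static int
read_meta_block(struct vnode *vp, daddr_t lbn, int size, struct ucred *cred)
{
	struct buf *bp;
	int error;

	error = bread(vp, lbn, size, cred, &bp);
	if (error) {
		brelse(bp);		/* error: throw the buffer away */
		return (error);
	}
	/* ... examine bp->b_data here ... */
	bqrelse(bp);			/* done, but probably wanted again soon */
	return (0);
}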
  566 
  567 /*
  568  * Operates like bread, but also starts asynchronous I/O on
  569  * read-ahead blocks.  We must clear B_ERROR and B_INVAL prior
  570  * to initiating I/O.  If B_CACHE is set, the buffer is valid
  571  * and we do not have to do anything.
  572  */
  573 int
  574 breadn(struct vnode * vp, daddr_t blkno, int size,
  575     daddr_t * rablkno, int *rabsize,
  576     int cnt, struct ucred * cred, struct buf ** bpp)
  577 {
  578         struct buf *bp, *rabp;
  579         int i;
  580         int rv = 0, readwait = 0;
  581 
  582         *bpp = bp = getblk(vp, blkno, size, 0, 0);
  583 
  584         /* if not found in cache, do some I/O */
  585         if ((bp->b_flags & B_CACHE) == 0) {
  586                 if (curproc != NULL)
  587                         curproc->p_stats->p_ru.ru_inblock++;
  588                 bp->b_flags |= B_READ;
  589                 bp->b_flags &= ~(B_ERROR | B_INVAL);
  590                 if (bp->b_rcred == NOCRED) {
  591                         if (cred != NOCRED)
  592                                 crhold(cred);
  593                         bp->b_rcred = cred;
  594                 }
  595                 vfs_busy_pages(bp, 0);
  596                 VOP_STRATEGY(vp, bp);
  597                 ++readwait;
  598         }
  599 
  600         for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
  601                 if (inmem(vp, *rablkno))
  602                         continue;
  603                 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
  604 
  605                 if ((rabp->b_flags & B_CACHE) == 0) {
  606                         if (curproc != NULL)
  607                                 curproc->p_stats->p_ru.ru_inblock++;
  608                         rabp->b_flags |= B_READ | B_ASYNC;
  609                         rabp->b_flags &= ~(B_ERROR | B_INVAL);
  610                         if (rabp->b_rcred == NOCRED) {
  611                                 if (cred != NOCRED)
  612                                         crhold(cred);
  613                                 rabp->b_rcred = cred;
  614                         }
  615                         vfs_busy_pages(rabp, 0);
  616                         BUF_KERNPROC(rabp);
  617                         VOP_STRATEGY(vp, rabp);
  618                 } else {
  619                         brelse(rabp);
  620                 }
  621         }
  622 
  623         if (readwait) {
  624                 rv = biowait(bp);
  625         }
  626         return (rv);
  627 }
  628 
  629 /*
  630  * Write, release buffer on completion.  (Done by iodone
  631  * if async).  Do not bother writing anything if the buffer
  632  * is invalid.
  633  *
  634  * Note that we set B_CACHE here, indicating that buffer is
  635  * fully valid and thus cacheable.  This is true even of NFS
  636  * now so we set it generally.  This could be set either here 
  637  * or in biodone() since the I/O is synchronous.  We put it
  638  * here.
  639  */
  640 int
  641 bwrite(struct buf * bp)
  642 {
  643         int oldflags, s;
  644         struct buf *newbp;
  645 
  646         if (bp->b_flags & B_INVAL) {
  647                 brelse(bp);
  648                 return (0);
  649         }
  650 
  651         oldflags = bp->b_flags;
  652 
  653         if (BUF_REFCNT(bp) == 0)
  654                 panic("bwrite: buffer is not busy???");
  655         s = splbio();
  656         /*
  657          * If a background write is already in progress, delay
  658          * writing this block if it is asynchronous. Otherwise
  659          * wait for the background write to complete.
  660          */
  661         if (bp->b_xflags & BX_BKGRDINPROG) {
  662                 if (bp->b_flags & B_ASYNC) {
  663                         splx(s);
  664                         bdwrite(bp);
  665                         return (0);
  666                 }
  667                 bp->b_xflags |= BX_BKGRDWAIT;
  668                 tsleep(&bp->b_xflags, PRIBIO, "biord", 0);
  669                 if (bp->b_xflags & BX_BKGRDINPROG)
  670                         panic("bwrite: still writing");
  671         }
  672 
  673         /* Mark the buffer clean */
  674         bundirty(bp);
  675 
  676         /*
  677          * If this buffer is marked for background writing and we
  678          * do not have to wait for it, make a copy and write the
  679          * copy so as to leave this buffer ready for further use.
  680          *
  681          * This optimization eats a lot of memory.  If we have a page
  682          * or buffer shortfall we can't do it.
  683          */
  684         if ((bp->b_xflags & BX_BKGRDWRITE) &&
  685             (bp->b_flags & B_ASYNC) &&
  686             !vm_page_count_severe() &&
  687             !buf_dirty_count_severe()) {
  688                 if (bp->b_flags & B_CALL)
  689                         panic("bwrite: need chained iodone");
  690 
  691                 /* get a new block */
  692                 newbp = geteblk(bp->b_bufsize);
  693 
  694                 /* set it to be identical to the old block */
  695                 memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
  696                 bgetvp(bp->b_vp, newbp);
  697                 newbp->b_lblkno = bp->b_lblkno;
  698                 newbp->b_blkno = bp->b_blkno;
  699                 newbp->b_offset = bp->b_offset;
  700                 newbp->b_iodone = vfs_backgroundwritedone;
  701                 newbp->b_flags |= B_ASYNC | B_CALL;
  702                 newbp->b_flags &= ~B_INVAL;
  703 
  704                 /* move over the dependencies */
  705                 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
  706                         (*bioops.io_movedeps)(bp, newbp);
  707 
  708                 /*
  709                  * Initiate write on the copy, release the original to
  710                  * the B_LOCKED queue so that it cannot go away until
  711                  * the background write completes. If not locked it could go
  712                  * away and then be reconstituted while it was being written.
  713                  * If the reconstituted buffer were written, we could end up
  714                  * with two background copies being written at the same time.
  715                  */
  716                 bp->b_xflags |= BX_BKGRDINPROG;
  717                 bp->b_flags |= B_LOCKED;
  718                 bqrelse(bp);
  719                 bp = newbp;
  720         }
  721 
  722         bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
  723         bp->b_flags |= B_WRITEINPROG | B_CACHE;
  724 
  725         bp->b_vp->v_numoutput++;
  726         vfs_busy_pages(bp, 1);
  727 
  728         /*
  729          * Normal bwrites pipeline writes
  730          */
  731         bp->b_runningbufspace = bp->b_bufsize;
  732         runningbufspace += bp->b_runningbufspace;
  733 
  734         if (curproc != NULL)
  735                 curproc->p_stats->p_ru.ru_oublock++;
  736         splx(s);
  737         if (oldflags & B_ASYNC)
  738                 BUF_KERNPROC(bp);
  739         VOP_STRATEGY(bp->b_vp, bp);
  740 
  741         if ((oldflags & B_ASYNC) == 0) {
  742                 int rtval = biowait(bp);
  743                 brelse(bp);
  744                 return (rtval);
  745         } else if ((oldflags & B_NOWDRAIN) == 0) {
  746                 /*
  747                  * don't allow the async write to saturate the I/O
  748                  * system.  Deadlocks can occur only if a device strategy
  749                  * routine (like in VN) turns around and issues another
  750                  * high-level write, in which case B_NOWDRAIN is expected
  751                  * to be set.   Otherwise we will not deadlock here because
  752                  * we are blocking waiting for I/O that is already in-progress
  753                  * to complete.
  754                  */
  755                 waitrunningbufspace();
  756         }
  757 
  758         return (0);
  759 }
  760 
  761 /*
  762  * Complete a background write started from bwrite.
  763  */
  764 static void
  765 vfs_backgroundwritedone(bp)
  766         struct buf *bp;
  767 {
  768         struct buf *origbp;
  769 
  770         /*
  771          * Find the original buffer that we are writing.
  772          */
  773         if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
  774                 panic("backgroundwritedone: lost buffer");
  775         /*
  776          * Process dependencies then return any unfinished ones.
  777          */
  778         if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
  779                 (*bioops.io_complete)(bp);
  780         if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
  781                 (*bioops.io_movedeps)(bp, origbp);
  782         /*
  783          * Clear the BX_BKGRDINPROG flag in the original buffer
  784          * and awaken it if it is waiting for the write to complete.
  785          * If BX_BKGRDINPROG is not set in the original buffer it must
  786          * have been released and re-instantiated - which is not legal.
  787          */
  788         KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2"));
  789         origbp->b_xflags &= ~BX_BKGRDINPROG;
  790         if (origbp->b_xflags & BX_BKGRDWAIT) {
  791                 origbp->b_xflags &= ~BX_BKGRDWAIT;
  792                 wakeup(&origbp->b_xflags);
  793         }
  794         /*
  795          * Clear the B_LOCKED flag and remove it from the locked
  796          * queue if it currently resides there.
  797          */
  798         origbp->b_flags &= ~B_LOCKED;
  799         if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
  800                 bremfree(origbp);
  801                 bqrelse(origbp);
  802         }
  803         /*
  804          * This buffer is marked B_NOCACHE, so when it is released
  805          * by biodone, it will be tossed. We mark it with B_READ
  806          * to avoid biodone doing a second vwakeup.
  807          */
  808         bp->b_flags |= B_NOCACHE | B_READ;
  809         bp->b_flags &= ~(B_CACHE | B_CALL | B_DONE);
  810         bp->b_iodone = 0;
  811         biodone(bp);
  812 }
  813 
  814 /*
  815  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  816  * anything if the buffer is marked invalid.
  817  *
  818  * Note that since the buffer must be completely valid, we can safely
  819  * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
  820  * biodone() in order to prevent getblk from writing the buffer
  821  * out synchronously.
  822  */
  823 void
  824 bdwrite(struct buf * bp)
  825 {
  826         if (BUF_REFCNT(bp) == 0)
  827                 panic("bdwrite: buffer is not busy");
  828 
  829         if (bp->b_flags & B_INVAL) {
  830                 brelse(bp);
  831                 return;
  832         }
  833         bdirty(bp);
  834 
  835         /*
  836          * Set B_CACHE, indicating that the buffer is fully valid.  This is
  837          * true even of NFS now.
  838          */
  839         bp->b_flags |= B_CACHE;
  840 
  841         /*
  842          * This bmap keeps the system from needing to do the bmap later,
  843          * perhaps when the system is attempting to do a sync.  Since it
  844          * is likely that the indirect block -- or whatever other data structure
  845          * that the filesystem needs is still in memory now, it is a good
  846          * thing to do this.  Note also that if the pageout daemon is
  847          * requesting a sync -- there might not be enough memory to do
  848          * the bmap then...  So, this is important to do.
  849          */
  850         if (bp->b_lblkno == bp->b_blkno) {
  851                 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
  852         }
  853 
  854         /*
  855          * Set the *dirty* buffer range based upon the VM system dirty pages.
  856          */
  857         vfs_setdirty(bp);
  858 
  859         /*
  860          * We need to do this here to satisfy the vnode_pager and the
  861          * pageout daemon, so that they think the pages have been
  862          * "cleaned".  Note that since the pages are in a delayed write
  863          * buffer, the VFS layer "will" see that the pages get written
  864          * out on the next sync, or perhaps the cluster will be completed.
  865          */
  866         vfs_clean_pages(bp);
  867         bqrelse(bp);
  868 
  869         /*
  870          * Wakeup the buffer flushing daemon if we have a lot of dirty
  871          * buffers (midpoint between our recovery point and our stall
  872          * point).
  873          */
  874         bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
  875 
  876         /*
  877          * note: we cannot initiate I/O from a bdwrite even if we wanted to,
  878          * due to the softdep code.
  879          */
  880 }
  881 
  882 /*
  883  *      bdirty:
  884  *
  885  *      Turn buffer into delayed write request.  We must clear B_READ and
  886  *      B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to 
  887  *      itself to properly update it in the dirty/clean lists.  We mark it
  888  *      B_DONE to ensure that any asynchronization of the buffer properly
  889  *      clears B_DONE ( else a panic will occur later ).  
  890  *
  891  *      bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
  892  *      might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
  893  *      should only be called if the buffer is known-good.
  894  *
  895  *      Since the buffer is not on a queue, we do not update the numfreebuffers
  896  *      count.
  897  *
  898  *      Must be called at splbio().
  899  *      The buffer must be on QUEUE_NONE.
  900  */
  901 void
  902 bdirty(bp)
  903         struct buf *bp;
  904 {
  905         KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
  906         bp->b_flags &= ~(B_READ|B_RELBUF);
  907 
  908         if ((bp->b_flags & B_DELWRI) == 0) {
  909                 bp->b_flags |= B_DONE | B_DELWRI;
  910                 reassignbuf(bp, bp->b_vp);
  911                 ++numdirtybuffers;
  912                 bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
  913         }
  914 }
  915 
  916 /*
  917  *      bundirty:
  918  *
  919  *      Clear B_DELWRI for buffer.
  920  *
  921  *      Since the buffer is not on a queue, we do not update the numfreebuffers
  922  *      count.
  923  *      
  924  *      Must be called at splbio().
  925  *      The buffer must be on QUEUE_NONE.
  926  */
  927 
  928 void
  929 bundirty(bp)
  930         struct buf *bp;
  931 {
  932         KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
  933 
  934         if (bp->b_flags & B_DELWRI) {
  935                 bp->b_flags &= ~B_DELWRI;
  936                 reassignbuf(bp, bp->b_vp);
  937                 --numdirtybuffers;
  938                 numdirtywakeup(lodirtybuffers);
  939         }
  940         /*
  941          * Since it is now being written, we can clear its deferred write flag.
  942          */
  943         bp->b_flags &= ~B_DEFERRED;
  944 }
  945 
  946 /*
  947  *      bawrite:
  948  *
  949  *      Asynchronous write.  Start output on a buffer, but do not wait for
  950  *      it to complete.  The buffer is released when the output completes.
  951  *
  952  *      bwrite() ( or the VOP routine anyway ) is responsible for handling 
  953  *      B_INVAL buffers.  Not us.
  954  */
  955 void
  956 bawrite(struct buf * bp)
  957 {
  958         bp->b_flags |= B_ASYNC;
  959         (void) VOP_BWRITE(bp->b_vp, bp);
  960 }
  961 
  962 /*
  963  *      bowrite:
  964  *
  965  *      Ordered write.  Start output on a buffer, and flag it so that the 
  966  *      device will write it in the order it was queued.  The buffer is 
  967  *      released when the output completes.  bwrite() ( or the VOP routine
  968  *      anyway ) is responsible for handling B_INVAL buffers.
  969  */
  970 int
  971 bowrite(struct buf * bp)
  972 {
  973         bp->b_flags |= B_ORDERED | B_ASYNC;
  974         return (VOP_BWRITE(bp->b_vp, bp));
  975 }
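
Taken together, the write entry points trade durability for latency: bwrite()
waits for the I/O unless B_ASYNC is set, bdwrite() only marks the buffer dirty
and leaves the flush to buf_daemon or a later sync, bawrite() starts the I/O
immediately but returns at once, and bowrite() additionally marks the buffer
B_ORDERED.  A sketch of a hypothetical caller choosing among them; update_meta()
and its 'how' argument are invented:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/vnode.h>

#define META_SYNC	0	/* wait for the write to finish */
#define META_DELAYED	1	/* just mark dirty; flushed later */
#define META_ASYNC	2	/* start the write, do not wait */

static int
update_meta(struct buf *bp, int how)
{
	/* ... modify bp->b_data here ... */
	switch (how) {
	case META_SYNC:
		return (bwrite(bp));
	case META_DELAYED:
		bdwrite(bp);
		return (0);
	default:
		bawrite(bp);
		return (0);
	}
}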
  976 
  977 /*
  978  *      bwillwrite:
  979  *
  980  *      Called prior to the locking of any vnodes when we are expecting to
  981  *      write.  We do not want to starve the buffer cache with too many
  982  *      dirty buffers so we block here.  By blocking prior to the locking
  983  *      of any vnodes we attempt to avoid the situation where a locked vnode
  984  *      prevents the various system daemons from flushing related buffers.
  985  */
  986 
  987 void
  988 bwillwrite(void)
  989 {
  990         if (numdirtybuffers >= hidirtybuffers) {
  991                 int s;
  992 
  993                 s = splbio();
  994                 while (numdirtybuffers >= hidirtybuffers) {
  995                         bd_wakeup(1);
  996                         needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
  997                         tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
  998                 }
  999                 splx(s);
 1000         }
 1001 }
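
bwillwrite() is meant to be called on the write path before any vnode locks
are taken, so that a process about to generate dirty buffers throttles itself
while it still holds nothing the flushing daemons might need.  A sketch of
such a caller; the helper and its details are invented, only bwillwrite() and
the standard vnode interfaces of this era are real:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/vnode.h>

static int
my_write_path(struct vnode *vp, struct uio *uio, struct ucred *cred,
    struct proc *p)
{
	int error;

	bwillwrite();			/* throttle before taking any locks */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
	error = VOP_WRITE(vp, uio, 0, cred);
	VOP_UNLOCK(vp, 0, p);
	return (error);
}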
 1002 
 1003 /*
 1004  * Return true if we have too many dirty buffers.
 1005  */
 1006 int
 1007 buf_dirty_count_severe(void)
 1008 {
 1009         return(numdirtybuffers >= hidirtybuffers);
 1010 }
 1011 
 1012 /*
 1013  *      brelse:
 1014  *
 1015  *      Release a busy buffer and, if requested, free its resources.  The
 1016  *      buffer will be stashed in the appropriate bufqueue[] allowing it
 1017  *      to be accessed later as a cache entity or reused for other purposes.
 1018  */
 1019 void
 1020 brelse(struct buf * bp)
 1021 {
 1022         int s;
 1023 
 1024         KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 1025 
 1026         s = splbio();
 1027 
 1028         if (bp->b_flags & B_LOCKED)
 1029                 bp->b_flags &= ~B_ERROR;
 1030 
 1031         if ((bp->b_flags & (B_READ | B_ERROR | B_INVAL)) == B_ERROR) {
 1032                 /*
 1033                  * Failed write, redirty.  Must clear B_ERROR to prevent
 1034                  * pages from being scrapped.  If B_INVAL is set then
 1035                  * this case is not run and the next case is run to 
 1036                  * destroy the buffer.  B_INVAL can occur if the buffer
 1037                  * is outside the range supported by the underlying device.
 1038                  */
 1039                 bp->b_flags &= ~B_ERROR;
 1040                 bdirty(bp);
 1041         } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
 1042             (bp->b_bufsize <= 0)) {
 1043                 /*
 1044                  * Either a failed I/O or we were asked to free or not
 1045                  * cache the buffer.
 1046                  */
 1047                 bp->b_flags |= B_INVAL;
 1048                 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
 1049                         (*bioops.io_deallocate)(bp);
 1050                 if (bp->b_flags & B_DELWRI) {
 1051                         --numdirtybuffers;
 1052                         numdirtywakeup(lodirtybuffers);
 1053                 }
 1054                 bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
 1055                 if ((bp->b_flags & B_VMIO) == 0) {
 1056                         if (bp->b_bufsize)
 1057                                 allocbuf(bp, 0);
 1058                         if (bp->b_vp)
 1059                                 brelvp(bp);
 1060                 }
 1061         }
 1062 
 1063         /*
 1064          * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release() 
 1065          * is called with B_DELWRI set, the underlying pages may wind up
 1066          * getting freed causing a previous write (bdwrite()) to get 'lost'
 1067          * because pages associated with a B_DELWRI bp are marked clean.
 1068          * 
 1069          * We still allow the B_INVAL case to call vfs_vmio_release(), even
 1070          * if B_DELWRI is set.
 1071          *
 1072          * If B_DELWRI is not set we may have to set B_RELBUF if we are low
 1073          * on pages to return pages to the VM page queues.
 1074          */
 1075         if (bp->b_flags & B_DELWRI)
 1076                 bp->b_flags &= ~B_RELBUF;
 1077         else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
 1078                 bp->b_flags |= B_RELBUF;
 1079 
 1080         /*
 1081          * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
 1082          * constituted, not even NFS buffers now.  Two flags affect this.  If
 1083          * B_INVAL, the struct buf is invalidated but the VM object is kept
 1084          * around ( i.e. so it is trivial to reconstitute the buffer later ).
 1085          *
 1086          * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
 1087          * invalidated.  B_ERROR cannot be set for a failed write unless the
 1088          * buffer is also B_INVAL because it hits the re-dirtying code above.
 1089          *
 1090          * Normally we can do this whether a buffer is B_DELWRI or not.  If
 1091          * the buffer is an NFS buffer, it is tracking piecemeal writes or
 1092          * the commit state and we cannot afford to lose the buffer. If the
 1093          * buffer has a background write in progress, we need to keep it
 1094          * around to prevent it from being reconstituted and starting a second
 1095          * background write.
 1096          */
 1097         if ((bp->b_flags & B_VMIO)
 1098             && !(bp->b_vp->v_tag == VT_NFS &&
 1099                  !vn_isdisk(bp->b_vp, NULL) &&
 1100                  (bp->b_flags & B_DELWRI))
 1101             ) {
 1102 
 1103                 int i, j, resid;
 1104                 vm_page_t m;
 1105                 off_t foff;
 1106                 vm_pindex_t poff;
 1107                 vm_object_t obj;
 1108                 struct vnode *vp;
 1109 
 1110                 vp = bp->b_vp;
 1111 
 1112                 /*
 1113                  * Get the base offset and length of the buffer.  Note that 
 1114                  * in the VMIO case if the buffer block size is not
 1115                  * page-aligned then the b_data pointer may not be page-aligned.
 1116                  * But our b_pages[] array *IS* page aligned.
 1117                  *
 1118                  * block sizes less than DEV_BSIZE (usually 512) are not
 1119                  * supported due to the page granularity bits (m->valid,
 1120                  * m->dirty, etc...). 
 1121                  *
 1122                  * See man buf(9) for more information
 1123                  */
 1124 
 1125                 resid = bp->b_bufsize;
 1126                 foff = bp->b_offset;
 1127 
 1128                 for (i = 0; i < bp->b_npages; i++) {
 1129                         m = bp->b_pages[i];
 1130                         vm_page_flag_clear(m, PG_ZERO);
 1131                         /*
 1132                          * If we hit a bogus page, fixup *all* of them
 1133                          * now.
 1134                          */
 1135                         if (m == bogus_page) {
 1136                                 VOP_GETVOBJECT(vp, &obj);
 1137                                 poff = OFF_TO_IDX(bp->b_offset);
 1138 
 1139                                 for (j = i; j < bp->b_npages; j++) {
 1140                                         vm_page_t mtmp;
 1141 
 1142                                         mtmp = bp->b_pages[j];
 1143                                         if (mtmp == bogus_page) {
 1144                                                 mtmp = vm_page_lookup(obj, poff + j);
 1145                                                 if (!mtmp) {
 1146                                                         panic("brelse: page missing\n");
 1147                                                 }
 1148                                                 bp->b_pages[j] = mtmp;
 1149                                         }
 1150                                 }
 1151 
 1152                                 if ((bp->b_flags & B_INVAL) == 0) {
 1153                                         pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 1154                                 }
 1155                                 m = bp->b_pages[i];
 1156                         }
 1157                         if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
 1158                                 int poffset = foff & PAGE_MASK;
 1159                                 int presid = resid > (PAGE_SIZE - poffset) ?
 1160                                         (PAGE_SIZE - poffset) : resid;
 1161 
 1162                                 KASSERT(presid >= 0, ("brelse: extra page"));
 1163                                 vm_page_set_invalid(m, poffset, presid);
 1164                         }
 1165                         resid -= PAGE_SIZE - (foff & PAGE_MASK);
 1166                         foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 1167                 }
 1168 
 1169                 if (bp->b_flags & (B_INVAL | B_RELBUF))
 1170                         vfs_vmio_release(bp);
 1171 
 1172         } else if (bp->b_flags & B_VMIO) {
 1173 
 1174                 if (bp->b_flags & (B_INVAL | B_RELBUF))
 1175                         vfs_vmio_release(bp);
 1176 
 1177         }
 1178                         
 1179         if (bp->b_qindex != QUEUE_NONE)
 1180                 panic("brelse: free buffer onto another queue???");
 1181         if (BUF_REFCNT(bp) > 1) {
 1182                 /* Temporary panic to verify exclusive locking */
 1183                 /* This panic goes away when we allow shared refs */
 1184                 panic("brelse: multiple refs");
 1185                 /* do not release to free list */
 1186                 BUF_UNLOCK(bp);
 1187                 splx(s);
 1188                 return;
 1189         }
 1190 
 1191         /* enqueue */
 1192 
 1193         /* buffers with no memory */
 1194         if (bp->b_bufsize == 0) {
 1195                 bp->b_flags |= B_INVAL;
 1196                 bp->b_xflags &= ~BX_BKGRDWRITE;
 1197                 if (bp->b_xflags & BX_BKGRDINPROG)
 1198                         panic("losing buffer 1");
 1199                 if (bp->b_kvasize) {
 1200                         bp->b_qindex = QUEUE_EMPTYKVA;
 1201                 } else {
 1202                         bp->b_qindex = QUEUE_EMPTY;
 1203                 }
 1204                 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
 1205                 LIST_REMOVE(bp, b_hash);
 1206                 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1207                 bp->b_dev = NODEV;
 1208         /* buffers with junk contents */
 1209         } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
 1210                 bp->b_flags |= B_INVAL;
 1211                 bp->b_xflags &= ~BX_BKGRDWRITE;
 1212                 if (bp->b_xflags & BX_BKGRDINPROG)
 1213                         panic("losing buffer 2");
 1214                 bp->b_qindex = QUEUE_CLEAN;
 1215                 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1216                 LIST_REMOVE(bp, b_hash);
 1217                 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1218                 bp->b_dev = NODEV;
 1219 
 1220         /* buffers that are locked */
 1221         } else if (bp->b_flags & B_LOCKED) {
 1222                 bp->b_qindex = QUEUE_LOCKED;
 1223                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
 1224 
 1225         /* remaining buffers */
 1226         } else {
 1227                 switch(bp->b_flags & (B_DELWRI|B_AGE)) {
 1228                 case B_DELWRI | B_AGE:
 1229                     bp->b_qindex = QUEUE_DIRTY;
 1230                     TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1231                     break;
 1232                 case B_DELWRI:
 1233                     bp->b_qindex = QUEUE_DIRTY;
 1234                     TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1235                     break;
 1236                 case B_AGE:
 1237                     bp->b_qindex = QUEUE_CLEAN;
 1238                     TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1239                     break;
 1240                 default:
 1241                     bp->b_qindex = QUEUE_CLEAN;
 1242                     TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1243                     break;
 1244                 }
 1245         }
 1246 
 1247         /*
 1248          * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
 1249          * on the correct queue.
 1250          */
 1251         if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
 1252                 bundirty(bp);
 1253 
 1254         /*
 1255          * Fixup numfreebuffers count.  The bp is on an appropriate queue
 1256          * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
 1257          * We've already handled the B_INVAL case ( B_DELWRI will be clear
 1258          * if B_INVAL is set ).
 1259          */
 1260 
 1261         if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
 1262                 bufcountwakeup();
 1263 
 1264         /*
 1265          * Something we can maybe free or reuse
 1266          */
 1267         if (bp->b_bufsize || bp->b_kvasize)
 1268                 bufspacewakeup();
 1269 
 1270         /* unlock */
 1271         BUF_UNLOCK(bp);
 1272         bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF |
 1273                         B_DIRECT | B_NOWDRAIN);
 1274         splx(s);
 1275 }
 1276 
 1277 /*
 1278  * Release a buffer back to the appropriate queue but do not try to free
 1279  * it.  The buffer is expected to be used again soon.
 1280  *
 1281  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 1282  * biodone() to requeue an async I/O on completion.  It is also used when
 1283  * known good buffers need to be requeued but we think we may need the data
 1284  * again soon.
 1285  *
 1286  * XXX we should be able to leave the B_RELBUF hint set on completion.
 1287  */
 1288 void
 1289 bqrelse(struct buf * bp)
 1290 {
 1291         int s;
 1292 
 1293         s = splbio();
 1294 
 1295         KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 1296 
 1297         if (bp->b_qindex != QUEUE_NONE)
 1298                 panic("bqrelse: free buffer onto another queue???");
 1299         if (BUF_REFCNT(bp) > 1) {
 1300                 /* do not release to free list */
 1301                 panic("bqrelse: multiple refs");
 1302                 BUF_UNLOCK(bp);
 1303                 splx(s);
 1304                 return;
 1305         }
 1306         if (bp->b_flags & B_LOCKED) {
 1307                 bp->b_flags &= ~B_ERROR;
 1308                 bp->b_qindex = QUEUE_LOCKED;
 1309                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
 1310                 /* buffers with stale but valid contents */
 1311         } else if (bp->b_flags & B_DELWRI) {
 1312                 bp->b_qindex = QUEUE_DIRTY;
 1313                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
 1314         } else if (vm_page_count_severe()) {
 1315                 /*
 1316                  * We are too low on memory, we have to try to free the
 1317                  * buffer (most importantly: the wired pages making up its
 1318                  * backing store) *now*.
 1319                  */
 1320                 splx(s);
 1321                 brelse(bp);
 1322                 return;
 1323         } else {
 1324                 bp->b_qindex = QUEUE_CLEAN;
 1325                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 1326         }
 1327 
 1328         if ((bp->b_flags & B_LOCKED) == 0 &&
 1329             ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
 1330                 bufcountwakeup();
 1331         }
 1332 
 1333         /*
 1334          * Something we can maybe free or reuse.
 1335          */
 1336         if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
 1337                 bufspacewakeup();
 1338 
 1339         /* unlock */
 1340         BUF_UNLOCK(bp);
 1341         bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 1342         splx(s);
 1343 }
 1344 
 1345 static void
 1346 vfs_vmio_release(bp)
 1347         struct buf *bp;
 1348 {
 1349         int i, s;
 1350         vm_page_t m;
 1351 
 1352         s = splvm();
 1353         for (i = 0; i < bp->b_npages; i++) {
 1354                 m = bp->b_pages[i];
 1355                 bp->b_pages[i] = NULL;
 1356                 /*
 1357                  * In order to keep page LRU ordering consistent, put
 1358                  * everything on the inactive queue.
 1359                  */
 1360                 vm_page_unwire(m, 0);
 1361                 /*
 1362                  * We don't mess with busy pages, it is
 1363                  * the responsibility of the process that
 1364                  * busied the pages to deal with them.
 1365                  */
 1366                 if ((m->flags & PG_BUSY) || (m->busy != 0))
 1367                         continue;
 1368                         
 1369                 if (m->wire_count == 0) {
 1370                         vm_page_flag_clear(m, PG_ZERO);
 1371                         /*
 1372                          * Might as well free the page if we can and it has
 1373                          * no valid data.  We also free the page if the
 1374                          * buffer was used for direct I/O.
 1375                          */
 1376                         if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
 1377                                 vm_page_busy(m);
 1378                                 vm_page_protect(m, VM_PROT_NONE);
 1379                                 vm_page_free(m);
 1380                         } else if (bp->b_flags & B_DIRECT) {
 1381                                 vm_page_try_to_free(m);
 1382                         } else if (vm_page_count_severe()) {
 1383                                 vm_page_try_to_cache(m);
 1384                         }
 1385                 }
 1386         }
 1387         splx(s);
 1388         pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
 1389         if (bp->b_bufsize) {
 1390                 bufspacewakeup();
 1391                 bp->b_bufsize = 0;
 1392         }
 1393         bp->b_npages = 0;
 1394         bp->b_flags &= ~B_VMIO;
 1395         if (bp->b_vp)
 1396                 brelvp(bp);
 1397 }
 1398 
 1399 /*
 1400  * Check to see if a block is currently memory resident.
 1401  */
 1402 struct buf *
 1403 gbincore(struct vnode * vp, daddr_t blkno)
 1404 {
 1405         struct buf *bp;
 1406         struct bufhashhdr *bh;
 1407 
 1408         bh = bufhash(vp, blkno);
 1409 
 1410         /* Search hash chain */
 1411         LIST_FOREACH(bp, bh, b_hash) {
 1412                 /* hit */
 1413                 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
 1414                     (bp->b_flags & B_INVAL) == 0) {
 1415                         break;
 1416                 }
 1417         }
 1418         return (bp);
 1419 }
 1420 
 1421 /*
 1422  *      vfs_bio_awrite:
 1423  *
 1424  *      Implement clustered async writes for clearing out B_DELWRI buffers.
 1425  *      This is much better than the old way of writing only one buffer at
 1426  *      a time.  Note that we may not be presented with the buffers in the 
 1427  *      correct order, so we search for the cluster in both directions.
 1428  */
 1429 int
 1430 vfs_bio_awrite(struct buf * bp)
 1431 {
 1432         int i;
 1433         int j;
 1434         daddr_t lblkno = bp->b_lblkno;
 1435         struct vnode *vp = bp->b_vp;
 1436         int s;
 1437         int ncl;
 1438         struct buf *bpa;
 1439         int nwritten;
 1440         int size;
 1441         int maxcl;
 1442 
 1443         s = splbio();
 1444         /*
 1445          * right now we support clustered writing only to regular files.  If
 1446          * we find a clusterable block we could be in the middle of a cluster
 1447          * rather than at the beginning.
 1448          */
 1449         if ((vp->v_type == VREG) && 
 1450             (vp->v_mount != 0) && /* Only on nodes that have the size info */
 1451             (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 1452 
 1453                 size = vp->v_mount->mnt_stat.f_iosize;
 1454                 maxcl = MAXPHYS / size;
 1455 
 1456                 for (i = 1; i < maxcl; i++) {
 1457                         if ((bpa = gbincore(vp, lblkno + i)) &&
 1458                             BUF_REFCNT(bpa) == 0 &&
 1459                             ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 1460                             (B_DELWRI | B_CLUSTEROK)) &&
 1461                             (bpa->b_bufsize == size)) {
 1462                                 if ((bpa->b_blkno == bpa->b_lblkno) ||
 1463                                     (bpa->b_blkno !=
 1464                                      bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
 1465                                         break;
 1466                         } else {
 1467                                 break;
 1468                         }
 1469                 }
 1470                 for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
 1471                         if ((bpa = gbincore(vp, lblkno - j)) &&
 1472                             BUF_REFCNT(bpa) == 0 &&
 1473                             ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 1474                             (B_DELWRI | B_CLUSTEROK)) &&
 1475                             (bpa->b_bufsize == size)) {
 1476                                 if ((bpa->b_blkno == bpa->b_lblkno) ||
 1477                                     (bpa->b_blkno !=
 1478                                      bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
 1479                                         break;
 1480                         } else {
 1481                                 break;
 1482                         }
 1483                 }
 1484                 --j;
 1485                 ncl = i + j;
 1486                 /*
 1487                  * this is a possible cluster write
 1488                  */
 1489                 if (ncl != 1) {
 1490                         nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
 1491                         splx(s);
 1492                         return nwritten;
 1493                 }
 1494         }
 1495 
 1496         BUF_LOCK(bp, LK_EXCLUSIVE);
 1497         bremfree(bp);
 1498         bp->b_flags |= B_ASYNC;
 1499 
 1500         splx(s);
 1501         /*
 1502          * default (old) behavior, writing out only one block
 1503          *
 1504          * XXX returns b_bufsize instead of b_bcount for nwritten?
 1505          */
 1506         nwritten = bp->b_bufsize;
 1507         (void) VOP_BWRITE(bp->b_vp, bp);
 1508 
 1509         return nwritten;
 1510 }
 1511 
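/*
 * Worked example of the contiguity test above, assuming f_iosize = 8192
 * and DEV_BSHIFT = 9 (512-byte sectors): each logical block then spans
 * (8192 >> 9) = 16 sectors, so the i'th forward candidate joins the
 * cluster only if it is already mapped (b_blkno != b_lblkno) and its
 * b_blkno equals bp->b_blkno + 16 * i; the backward scan is symmetric.
 */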
 1512 /*
 1513  *      getnewbuf:
 1514  *
 1515  *      Find and initialize a new buffer header, freeing up existing buffers 
 1516  *      in the bufqueues as necessary.  The new buffer is returned locked.
 1517  *
 1518  *      Important:  B_INVAL is not set.  If the caller wishes to throw the
 1519  *      buffer away, the caller must set B_INVAL prior to calling brelse().
 1520  *
 1521  *      We block if:
 1522  *              We have insufficient buffer headers
 1523  *              We have insufficient buffer space
 1524  *              buffer_map is too fragmented ( space reservation fails )
 1525  *              If we have to flush dirty buffers ( but we try to avoid this )
 1526  *
 1527  *      To avoid VFS layer recursion we do not flush dirty buffers ourselves.
 1528  *      Instead we ask the buf daemon to do it for us.  We attempt to
 1529  *      avoid piecemeal wakeups of the pageout daemon.
 1530  */
 1531 
 1532 static struct buf *
 1533 getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
 1534 {
 1535         struct buf *bp;
 1536         struct buf *nbp;
 1537         int defrag = 0;
 1538         int nqindex;
 1539         static int flushingbufs;
 1540 
 1541         /*
 1542          * We can't afford to block since we might be holding a vnode lock,
 1543          * which may prevent system daemons from running.  We deal with
 1544          * low-memory situations by proactively returning memory and running
 1545          * async I/O rather than sync I/O.
 1546          */
 1547         
 1548         ++getnewbufcalls;
 1549         --getnewbufrestarts;
 1550 restart:
 1551         ++getnewbufrestarts;
 1552 
 1553         /*
 1554          * Set up for the scan.  If we do not have enough free buffers,
 1555          * we set up a degenerate case that immediately fails.  Note
 1556          * that if we are a specially marked process, we are allowed to
 1557          * dip into our reserves.
 1558          *
 1559          * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
 1560          *
 1561          * We start with EMPTYKVA.  If the list is empty we back up to EMPTY.
 1562          * However, there are a number of cases (defragging, reusing, ...)
 1563          * where we cannot backup.
 1564          */
 1565         nqindex = QUEUE_EMPTYKVA;
 1566         nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
 1567 
 1568         if (nbp == NULL) {
 1569                 /*
 1570                  * If no EMPTYKVA buffers and we are either
 1571                  * defragging or reusing, locate a CLEAN buffer
 1572                  * to free or reuse.  If bufspace usage is low
 1573                  * skip this step so we can allocate a new buffer.
 1574                  */
 1575                 if (defrag || bufspace >= lobufspace) {
 1576                         nqindex = QUEUE_CLEAN;
 1577                         nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 1578                 }
 1579 
 1580                 /*
 1581                  * If we could not find or were not allowed to reuse a
 1582                  * CLEAN buffer, check to see if it is ok to use an EMPTY
 1583                  * buffer.  We can only use an EMPTY buffer if allocating
 1584                  * its KVA would not otherwise run us out of buffer space.
 1585                  */
 1586                 if (nbp == NULL && defrag == 0 &&
 1587                     bufspace + maxsize < hibufspace) {
 1588                         nqindex = QUEUE_EMPTY;
 1589                         nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
 1590                 }
 1591         }
 1592 
 1593         /*
 1594          * Run scan, possibly freeing data and/or kva mappings on the fly
 1595          * depending on the queue being scanned.
 1596          */
 1597 
 1598         while ((bp = nbp) != NULL) {
 1599                 int qindex = nqindex;
 1600 
 1601                 /*
 1602                  * Calculate next bp ( we can only use it if we do not block
 1603                  * or do other fancy things ).
 1604                  */
 1605                 if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
 1606                         switch(qindex) {
 1607                         case QUEUE_EMPTY:
 1608                                 nqindex = QUEUE_EMPTYKVA;
 1609                                 if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
 1610                                         break;
 1611                                 /* fall through */
 1612                         case QUEUE_EMPTYKVA:
 1613                                 nqindex = QUEUE_CLEAN;
 1614                                 if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
 1615                                         break;
 1616                                 /* fall through */
 1617                         case QUEUE_CLEAN:
 1618                                 /*
 1619                                  * nbp is NULL. 
 1620                                  */
 1621                                 break;
 1622                         }
 1623                 }
 1624 
 1625                 /*
 1626                  * Sanity Checks
 1627                  */
 1628                 KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 1629 
 1630                 /*
 1631                  * Note: we no longer distinguish between VMIO and non-VMIO
 1632                  * buffers.
 1633                  */
 1634 
 1635                 KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
 1636 
 1637                 /*
 1638                  * If we are defragging then we need a buffer with 
 1639                  * b_kvasize != 0.  XXX this situation should no longer
 1640                  * occur, if defrag is non-zero the buffer's b_kvasize
 1641                  * should also be non-zero at this point.  XXX
 1642                  */
 1643                 if (defrag && bp->b_kvasize == 0) {
 1644                         printf("Warning: defrag empty buffer %p\n", bp);
 1645                         continue;
 1646                 }
 1647 
 1648                 /*
 1649                  * Start freeing the bp.  This is somewhat involved.  nbp
 1650                  * remains valid only for QUEUE_EMPTY[KVA] bp's.
 1651                  */
 1652 
 1653                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
 1654                         panic("getnewbuf: locked buf");
 1655                 bremfree(bp);
 1656 
 1657                 if (qindex == QUEUE_CLEAN) {
 1658                         if (bp->b_flags & B_VMIO) {
 1659                                 bp->b_flags &= ~B_ASYNC;
 1660                                 vfs_vmio_release(bp);
 1661                         }
 1662                         if (bp->b_vp)
 1663                                 brelvp(bp);
 1664                 }
 1665 
 1666                 /*
 1667                  * NOTE:  nbp is now entirely invalid.  We can only restart
 1668                  * the scan from this point on.
 1669                  *
 1670                  * Get the rest of the buffer freed up.  b_kva* is still
 1671                  * valid after this operation.
 1672                  */
 1673 
 1674                 if (bp->b_rcred != NOCRED) {
 1675                         crfree(bp->b_rcred);
 1676                         bp->b_rcred = NOCRED;
 1677                 }
 1678                 if (bp->b_wcred != NOCRED) {
 1679                         crfree(bp->b_wcred);
 1680                         bp->b_wcred = NOCRED;
 1681                 }
 1682                 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
 1683                         (*bioops.io_deallocate)(bp);
 1684                 if (bp->b_xflags & BX_BKGRDINPROG)
 1685                         panic("losing buffer 3");
 1686                 LIST_REMOVE(bp, b_hash);
 1687                 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 1688 
 1689                 if (bp->b_bufsize)
 1690                         allocbuf(bp, 0);
 1691 
 1692                 bp->b_flags = 0;
 1693                 bp->b_xflags = 0;
 1694                 bp->b_dev = NODEV;
 1695                 bp->b_vp = NULL;
 1696                 bp->b_blkno = bp->b_lblkno = 0;
 1697                 bp->b_offset = NOOFFSET;
 1698                 bp->b_iodone = 0;
 1699                 bp->b_error = 0;
 1700                 bp->b_resid = 0;
 1701                 bp->b_bcount = 0;
 1702                 bp->b_npages = 0;
 1703                 bp->b_dirtyoff = bp->b_dirtyend = 0;
 1704 
 1705                 LIST_INIT(&bp->b_dep);
 1706 
 1707                 /*
 1708                  * If we are defragging then free the buffer.
 1709                  */
 1710                 if (defrag) {
 1711                         bp->b_flags |= B_INVAL;
 1712                         bfreekva(bp);
 1713                         brelse(bp);
 1714                         defrag = 0;
 1715                         goto restart;
 1716                 }
 1717 
 1718                 /*
 1719                  * If we are overcommitted then recover the buffer and its
 1720                  * KVM space.  This occurs in rare situations when multiple
 1721                  * processes are blocked in getnewbuf() or allocbuf().
 1722                  */
 1723                 if (bufspace >= hibufspace)
 1724                         flushingbufs = 1;
 1725                 if (flushingbufs && bp->b_kvasize != 0) {
 1726                         bp->b_flags |= B_INVAL;
 1727                         bfreekva(bp);
 1728                         brelse(bp);
 1729                         goto restart;
 1730                 }
 1731                 if (bufspace < lobufspace)
 1732                         flushingbufs = 0;
 1733                 break;
 1734         }
 1735 
 1736         /*
 1737          * If we exhausted our list, sleep as appropriate.  We may have to
 1738          * wakeup various daemons and write out some dirty buffers.
 1739          *
 1740          * Generally we are sleeping due to insufficient buffer space.
 1741          */
 1742 
 1743         if (bp == NULL) {
 1744                 int flags;
 1745                 char *waitmsg;
 1746 
 1747                 if (defrag) {
 1748                         flags = VFS_BIO_NEED_BUFSPACE;
 1749                         waitmsg = "nbufkv";
 1750                 } else if (bufspace >= hibufspace) {
 1751                         waitmsg = "nbufbs";
 1752                         flags = VFS_BIO_NEED_BUFSPACE;
 1753                 } else {
 1754                         waitmsg = "newbuf";
 1755                         flags = VFS_BIO_NEED_ANY;
 1756                 }
 1757 
 1758                 bd_speedup();   /* heeeelp */
 1759 
 1760                 needsbuffer |= flags;
 1761                 while (needsbuffer & flags) {
 1762                         if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
 1763                             waitmsg, slptimeo))
 1764                                 return (NULL);
 1765                 }
 1766         } else {
 1767                 /*
 1768                  * We finally have a valid bp.  We aren't quite out of the
 1769                  * woods: we still have to reserve kva space.  In order
 1770                  * to keep fragmentation sane we only allocate kva in
 1771                  * BKVASIZE chunks.
 1772                  */
 1773                 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 1774 
 1775                 if (maxsize != bp->b_kvasize) {
 1776                         vm_offset_t addr = 0;
 1777 
 1778                         bfreekva(bp);
 1779 
 1780                         vm_map_lock(buffer_map);
 1781 
 1782                         if (vm_map_findspace(buffer_map,
 1783                                 vm_map_min(buffer_map), maxsize, &addr)) {
 1784                                 /*
 1785                  * Uh oh.  Buffer map is too fragmented.  We
 1786                                  * must defragment the map.
 1787                                  */
 1788                                 vm_map_unlock(buffer_map);
 1789                                 ++bufdefragcnt;
 1790                                 defrag = 1;
 1791                                 bp->b_flags |= B_INVAL;
 1792                                 brelse(bp);
 1793                                 goto restart;
 1794                         }
 1795                         if (addr) {
 1796                                 vm_map_insert(buffer_map, NULL, 0,
 1797                                         addr, addr + maxsize,
 1798                                         VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 1799 
 1800                                 bp->b_kvabase = (caddr_t) addr;
 1801                                 bp->b_kvasize = maxsize;
 1802                                 bufspace += bp->b_kvasize;
 1803                                 ++bufreusecnt;
 1804                         }
 1805                         vm_map_unlock(buffer_map);
 1806                 }
 1807                 bp->b_data = bp->b_kvabase;
 1808         }
 1809         return(bp);
 1810 }
 1811 
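/*
 * A minimal sketch of the discard rule above, mirroring what getblk()
 * does below when it loses a creation race: since getnewbuf() does not
 * set B_INVAL, a caller that decides not to use the buffer must mark it
 * B_INVAL itself before brelse().  The helper name "example_grab_block"
 * is hypothetical; note that getblk() performs this check at splbio(),
 * which is what makes the race test safe.
 */
#if 0
static struct buf *
example_grab_block(struct vnode *vp, daddr_t blkno, int size, int maxsize)
{
        struct buf *bp;

        if ((bp = getnewbuf(0, 0, size, maxsize)) == NULL)
                return (NULL);
        if (gbincore(vp, blkno)) {
                /* Lost the race: throw our new buffer away. */
                bp->b_flags |= B_INVAL;
                brelse(bp);
                return (NULL);
        }
        return (bp);
}
#endif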
 1812 /*
 1813  *      buf_daemon:
 1814  *
 1815  *      buffer flushing daemon.  Buffers are normally flushed by the
 1816  *      update daemon but if it cannot keep up this process starts to
 1817  *      take the load in an attempt to prevent getnewbuf() from blocking.
 1818  */
 1819 
 1820 static struct proc *bufdaemonproc;
 1821 
 1822 static struct kproc_desc buf_kp = {
 1823         "bufdaemon",
 1824         buf_daemon,
 1825         &bufdaemonproc
 1826 };
 1827 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
 1828 
 1829 static void
 1830 buf_daemon()
 1831 {
 1832         int s;
 1833 
 1834         /*
 1835          * This process needs to be suspended prior to shutdown sync.
 1836          */
 1837         EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, bufdaemonproc,
 1838             SHUTDOWN_PRI_LAST);
 1839 
 1840         /*
 1841          * This process is allowed to take the buffer cache to the limit
 1842          */
 1843         s = splbio();
 1844 
 1845         for (;;) {
 1846                 kproc_suspend_loop(bufdaemonproc);
 1847 
 1848                 /*
 1849                  * Do the flush.  Limit the amount of in-transit I/O we
 1850                  * allow to build up, otherwise we would completely saturate
 1851                  * the I/O system.  Wakeup any waiting processes before we
 1852                  * normally would so they can run in parallel with our drain.
 1853                  */
 1854                 while (numdirtybuffers > lodirtybuffers) {
 1855                         if (flushbufqueues() == 0)
 1856                                 break;
 1857                         waitrunningbufspace();
 1858                         numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
 1859                 }
 1860 
 1861                 /*
 1862                  * Only clear bd_request if we have reached our low water
 1863                  * mark.  The buf_daemon normally sleeps about a second and
 1864                  * then incrementally flushes any dirty buffers that have
 1865                  * built up, within reason.
 1866                  *
 1867                  * If we were unable to hit our low water mark and couldn't
 1868                  * find any flushable buffers, we sleep half a second. 
 1869                  * Otherwise we loop immediately.
 1870                  */
 1871                 if (numdirtybuffers <= lodirtybuffers) {
 1872                         /*
 1873                          * We reached our low water mark, reset the
 1874                          * request and sleep until we are needed again.
 1875                          * The sleep is just so the suspend code works.
 1876                          */
 1877                         bd_request = 0;
 1878                         tsleep(&bd_request, PVM, "psleep", hz);
 1879                 } else {
 1880                         /*
 1881                          * We couldn't find any flushable dirty buffers but
 1882                          * still have too many dirty buffers, so we
 1883                          * have to sleep and try again.  (rare)
 1884                          */
 1885                         tsleep(&bd_request, PVM, "qsleep", hz / 2);
 1886                 }
 1887         }
 1888 }
 1889 
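/*
 * Worked example of the watermark arithmetic above, assuming
 * lodirtybuffers = 100 and hidirtybuffers = 200: the flush loop keeps
 * running while numdirtybuffers > 100, and after each flush
 * numdirtywakeup((100 + 200) / 2) lets blocked writers resume once the
 * dirty count has fallen to 150, rather than waiting for the full drain.
 */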
 1890 /*
 1891  *      flushbufqueues:
 1892  *
 1893  *      Try to flush a buffer in the dirty queue.  We must be careful to
 1894  *      free up B_INVAL buffers instead of writing them, which NFS is
 1895  *      particularly sensitive to.
 1896  */
 1897 
 1898 static int
 1899 flushbufqueues(void)
 1900 {
 1901         struct buf *bp;
 1902         int r = 0;
 1903 
 1904         bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
 1905 
 1906         while (bp) {
 1907                 KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
 1908                 if ((bp->b_flags & B_DELWRI) != 0 &&
 1909                     (bp->b_xflags & BX_BKGRDINPROG) == 0) {
 1910                         if (bp->b_flags & B_INVAL) {
 1911                                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
 1912                                         panic("flushbufqueues: locked buf");
 1913                                 bremfree(bp);
 1914                                 brelse(bp);
 1915                                 ++r;
 1916                                 break;
 1917                         }
 1918                         if (LIST_FIRST(&bp->b_dep) != NULL &&
 1919                             bioops.io_countdeps &&
 1920                             (bp->b_flags & B_DEFERRED) == 0 &&
 1921                             (*bioops.io_countdeps)(bp, 0)) {
 1922                                 TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY],
 1923                                     bp, b_freelist);
 1924                                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY],
 1925                                     bp, b_freelist);
 1926                                 bp->b_flags |= B_DEFERRED;
 1927                                 bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
 1928                                 continue;
 1929                         }
 1930                         vfs_bio_awrite(bp);
 1931                         ++r;
 1932                         break;
 1933                 }
 1934                 bp = TAILQ_NEXT(bp, b_freelist);
 1935         }
 1936         return (r);
 1937 }
 1938 
 1939 /*
 1940  * Check to see if a block is currently memory resident.
 1941  */
 1942 struct buf *
 1943 incore(struct vnode * vp, daddr_t blkno)
 1944 {
 1945         struct buf *bp;
 1946 
 1947         int s = splbio();
 1948         bp = gbincore(vp, blkno);
 1949         splx(s);
 1950         return (bp);
 1951 }
 1952 
 1953 /*
 1954  * Returns true if no I/O is needed to access the
 1955  * associated VM object.  This is like incore except
 1956  * it also hunts around in the VM system for the data.
 1957  */
 1958 
 1959 int
 1960 inmem(struct vnode * vp, daddr_t blkno)
 1961 {
 1962         vm_object_t obj;
 1963         vm_offset_t toff, tinc, size;
 1964         vm_page_t m;
 1965         vm_ooffset_t off;
 1966 
 1967         if (incore(vp, blkno))
 1968                 return 1;
 1969         if (vp->v_mount == NULL)
 1970                 return 0;
 1971         if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0)
 1972                 return 0;
 1973 
 1974         size = PAGE_SIZE;
 1975         if (size > vp->v_mount->mnt_stat.f_iosize)
 1976                 size = vp->v_mount->mnt_stat.f_iosize;
 1977         off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 1978 
 1979         for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 1980                 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
 1981                 if (!m)
 1982                         return 0;
 1983                 tinc = size;
 1984                 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
 1985                         tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
 1986                 if (vm_page_is_valid(m,
 1987                     (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
 1988                         return 0;
 1989         }
 1990         return 1;
 1991 }
 1992 
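/*
 * Worked example of the scan above, assuming f_iosize = 8192,
 * PAGE_SIZE = 4096 and blkno = 3: off = 3 * 8192 = 24576, so the block
 * maps to VM pages at indices OFF_TO_IDX(24576) = 6 and 7, and inmem()
 * returns 1 only if both pages exist and are fully valid over the spans
 * the block touches.
 */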
 1993 /*
 1994  *      vfs_setdirty:
 1995  *
 1996  *      Sets the dirty range for a buffer based on the status of the dirty
 1997  *      bits in the pages comprising the buffer.
 1998  *
 1999  *      The range is limited to the size of the buffer.
 2000  *
 2001  *      This routine is primarily used by NFS, but is generalized for the
 2002  *      B_VMIO case.
 2003  */
 2004 static void
 2005 vfs_setdirty(struct buf *bp) 
 2006 {
 2007         int i;
 2008         vm_object_t object;
 2009 
 2010         /*
 2011          * Degenerate case - empty buffer
 2012          */
 2013 
 2014         if (bp->b_bufsize == 0)
 2015                 return;
 2016 
 2017         /*
 2018          * We qualify the scan for modified pages on whether the
 2019          * object has been flushed yet.  The OBJ_WRITEABLE flag
 2020          * is not cleared simply by protecting pages off.
 2021          */
 2022 
 2023         if ((bp->b_flags & B_VMIO) == 0)
 2024                 return;
 2025 
 2026         object = bp->b_pages[0]->object;
 2027 
 2028         if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
 2029                 printf("Warning: object %p writeable but not mightbedirty\n", object);
 2030         if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
 2031                 printf("Warning: object %p mightbedirty but not writeable\n", object);
 2032 
 2033         if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
 2034                 vm_offset_t boffset;
 2035                 vm_offset_t eoffset;
 2036 
 2037                 /*
 2038                  * test the pages to see if they have been modified directly
 2039                  * by users through the VM system.
 2040                  */
 2041                 for (i = 0; i < bp->b_npages; i++) {
 2042                         vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
 2043                         vm_page_test_dirty(bp->b_pages[i]);
 2044                 }
 2045 
 2046                 /*
 2047                  * Calculate the encompassing dirty range, boffset and eoffset,
 2048                  * (eoffset - boffset) bytes.
 2049                  */
 2050 
 2051                 for (i = 0; i < bp->b_npages; i++) {
 2052                         if (bp->b_pages[i]->dirty)
 2053                                 break;
 2054                 }
 2055                 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 2056 
 2057                 for (i = bp->b_npages - 1; i >= 0; --i) {
 2058                         if (bp->b_pages[i]->dirty) {
 2059                                 break;
 2060                         }
 2061                 }
 2062                 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 2063 
 2064                 /*
 2065                  * Fit it to the buffer.
 2066                  */
 2067 
 2068                 if (eoffset > bp->b_bcount)
 2069                         eoffset = bp->b_bcount;
 2070 
 2071                 /*
 2072                  * If we have a good dirty range, merge with the existing
 2073                  * dirty range.
 2074                  */
 2075 
 2076                 if (boffset < eoffset) {
 2077                         if (bp->b_dirtyoff > boffset)
 2078                                 bp->b_dirtyoff = boffset;
 2079                         if (bp->b_dirtyend < eoffset)
 2080                                 bp->b_dirtyend = eoffset;
 2081                 }
 2082         }
 2083 }
 2084 
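/*
 * Worked example of the range computation above, assuming
 * PAGE_SIZE = 4096 and a buffer whose b_offset lies 1024 bytes into its
 * first page: a first dirty page at index i = 1 gives
 * boffset = (1 << PAGE_SHIFT) - 1024 = 3072, a last dirty page at i = 2
 * gives eoffset = (3 << PAGE_SHIFT) - 1024 = 11264, and eoffset is then
 * clipped to b_bcount before being merged into b_dirtyoff/b_dirtyend.
 */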
 2085 /*
 2086  *      getblk:
 2087  *
 2088  *      Get a block given a specified block and offset into a file/device.
 2089  *      The buffer's B_DONE bit will be cleared on return, making it almost
 2090  *      ready for an I/O initiation.  B_INVAL may or may not be set on 
 2091  *      return.  The caller should clear B_INVAL prior to initiating a
 2092  *      READ.
 2093  *
 2094  *      For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
 2095  *      an existing buffer.
 2096  *
 2097  *      For a VMIO buffer, B_CACHE is modified according to the backing VM.
 2098  *      If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
 2099  *      and then cleared based on the backing VM.  If the previous buffer is
 2100  *      non-0-sized but invalid, B_CACHE will be cleared.
 2101  *
 2102  *      If getblk() must create a new buffer, the new buffer is returned with
 2103  *      both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
 2104  *      case it is returned with B_INVAL clear and B_CACHE set based on the
 2105  *      backing VM.
 2106  *
 2107  *      getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
 2108  *      B_CACHE bit is clear.
 2109  *      
 2110  *      What this means, basically, is that the caller should use B_CACHE to
 2111  *      determine whether the buffer is fully valid or not and should clear
 2112  *      B_INVAL prior to issuing a read.  If the caller intends to validate
 2113  *      the buffer by loading its data area with something, the caller needs
 2114  *      to clear B_INVAL.  If the caller does this without issuing an I/O, 
 2115  *      the caller should set B_CACHE ( as an optimization ), else the caller
 2116  *      should issue the I/O and biodone() will set B_CACHE if the I/O was
 2117  *      a write attempt or if it was a successful read.  If the caller
 2118  *      intends to issue a READ, the caller must clear B_INVAL and B_ERROR
 2119  *      prior to issuing the READ.  biodone() will *not* clear B_INVAL.
 2120  */
 2121 struct buf *
 2122 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
 2123 {
 2124         struct buf *bp;
 2125         int s;
 2126         struct bufhashhdr *bh;
 2127 
 2128         if (size > MAXBSIZE)
 2129                 panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
 2130 
 2131         s = splbio();
 2132 loop:
 2133         /*
 2134          * Block if we are low on buffers.   Certain processes are allowed
 2135          * to completely exhaust the buffer cache.
 2136          *
 2137          * If this check ever becomes a bottleneck it may be better to
 2138          * move it into the else, when gbincore() fails.  At the moment
 2139          * it isn't a problem.
 2140          *
 2141          * XXX remove, we cannot afford to block anywhere if holding a vnode
 2142          * lock in a low-memory situation, so take it to the max.
 2143          */
 2144         if (numfreebuffers == 0) {
 2145                 if (!curproc)
 2146                         return NULL;
 2147                 needsbuffer |= VFS_BIO_NEED_ANY;
 2148                 tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
 2149                     slptimeo);
 2150         }
 2151 
 2152         if ((bp = gbincore(vp, blkno))) {
 2153                 /*
 2154                  * Buffer is in-core.  If the buffer is not busy, it must
 2155                  * be on a queue.
 2156                  */
 2157 
 2158                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 2159                         if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
 2160                             "getblk", slpflag, slptimeo) == ENOLCK)
 2161                                 goto loop;
 2162                         splx(s);
 2163                         return (struct buf *) NULL;
 2164                 }
 2165 
 2166                 /*
 2167                  * The buffer is locked.  B_CACHE is cleared if the buffer is 
 2168                  * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 2169                  * and for a VMIO buffer B_CACHE is adjusted according to the
 2170                  * backing VM cache.
 2171                  */
 2172                 if (bp->b_flags & B_INVAL)
 2173                         bp->b_flags &= ~B_CACHE;
 2174                 else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 2175                         bp->b_flags |= B_CACHE;
 2176                 bremfree(bp);
 2177 
 2178                 /*
 2179                  * check for size inconsistencies for the non-VMIO case.
 2180                  */
 2181 
 2182                 if (bp->b_bcount != size) {
 2183                         if ((bp->b_flags & B_VMIO) == 0 ||
 2184                             (size > bp->b_kvasize)) {
 2185                                 if (bp->b_flags & B_DELWRI) {
 2186                                         bp->b_flags |= B_NOCACHE;
 2187                                         VOP_BWRITE(bp->b_vp, bp);
 2188                                 } else {
 2189                                         if ((bp->b_flags & B_VMIO) &&
 2190                                            (LIST_FIRST(&bp->b_dep) == NULL)) {
 2191                                                 bp->b_flags |= B_RELBUF;
 2192                                                 brelse(bp);
 2193                                         } else {
 2194                                                 bp->b_flags |= B_NOCACHE;
 2195                                                 VOP_BWRITE(bp->b_vp, bp);
 2196                                         }
 2197                                 }
 2198                                 goto loop;
 2199                         }
 2200                 }
 2201 
 2202                 /*
 2203                  * If the size is inconsistent in the VMIO case, we can resize
 2204                  * the buffer.  This might lead to B_CACHE getting set or
 2205                  * cleared.  If the size has not changed, B_CACHE remains
 2206                  * unchanged from its previous state.
 2207                  */
 2208 
 2209                 if (bp->b_bcount != size)
 2210                         allocbuf(bp, size);
 2211 
 2212                 KASSERT(bp->b_offset != NOOFFSET, 
 2213                     ("getblk: no buffer offset"));
 2214 
 2215                 /*
 2216                  * A buffer with B_DELWRI set and B_CACHE clear must
 2217                  * be committed before we can return the buffer in
 2218                  * order to prevent the caller from issuing a read
 2219                  * ( due to B_CACHE not being set ) and overwriting
 2220                  * it.
 2221                  *
 2222                  * Most callers, including NFS and FFS, need this to
 2223                  * operate properly either because they assume they
 2224                  * can issue a read if B_CACHE is not set, or because
 2225                  * ( for example ) an uncached B_DELWRI might loop due 
 2226                  * to softupdates re-dirtying the buffer.  In the latter
 2227                  * case, B_CACHE is set after the first write completes,
 2228                  * preventing further loops.
 2229                  *
 2230                  * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 2231                  * above while extending the buffer, we cannot allow the
 2232                  * buffer to remain with B_CACHE set after the write
 2233                  * completes or it will represent a corrupt state.  To
 2234                  * deal with this we set B_NOCACHE to scrap the buffer
 2235                  * after the write.
 2236                  *
 2237                  * We might be able to do something fancy, like setting
 2238                  * B_CACHE in bwrite() except if B_DELWRI is already set,
 2239                  * so the below call doesn't set B_CACHE, but that gets real
 2240                  * confusing.  This is much easier.
 2241                  */
 2242 
 2243                 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 2244                         bp->b_flags |= B_NOCACHE;
 2245                         VOP_BWRITE(bp->b_vp, bp);
 2246                         goto loop;
 2247                 }
 2248 
 2249                 splx(s);
 2250                 bp->b_flags &= ~B_DONE;
 2251         } else {
 2252                 /*
 2253                  * Buffer is not in-core, create new buffer.  The buffer
 2254                  * returned by getnewbuf() is locked.  Note that the returned
 2255                  * buffer is also considered valid (not marked B_INVAL).
 2256                  */
 2257                 int bsize, maxsize, vmio;
 2258                 off_t offset;
 2259 
 2260                 if (vn_isdisk(vp, NULL))
 2261                         bsize = DEV_BSIZE;
 2262                 else if (vp->v_mountedhere)
 2263                         bsize = vp->v_mountedhere->mnt_stat.f_iosize;
 2264                 else if (vp->v_mount)
 2265                         bsize = vp->v_mount->mnt_stat.f_iosize;
 2266                 else
 2267                         bsize = size;
 2268 
 2269                 offset = (off_t)blkno * bsize;
 2270                 vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF);
 2271                 maxsize = vmio ? size + (offset & PAGE_MASK) : size;
 2272                 maxsize = imax(maxsize, bsize);
 2273 
 2274                 if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
 2275                         if (slpflag || slptimeo) {
 2276                                 splx(s);
 2277                                 return NULL;
 2278                         }
 2279                         goto loop;
 2280                 }
 2281 
 2282                 /*
 2283                  * This code is used to make sure that a buffer is not
 2284                  * created while the getnewbuf routine is blocked.
 2285                  * This can be a problem whether the vnode is locked or not.
 2286                  * If the buffer is created out from under us, we have to
 2287                  * throw away the one we just created.  There is no window
 2288                  * for a race because we are safely running at splbio() from the
 2289                  * point of the duplicate buffer creation through to here,
 2290                  * and we've locked the buffer.
 2291                  */
 2292                 if (gbincore(vp, blkno)) {
 2293                         bp->b_flags |= B_INVAL;
 2294                         brelse(bp);
 2295                         goto loop;
 2296                 }
 2297 
 2298                 /*
 2299                  * Insert the buffer into the hash, so that it can
 2300                  * be found by incore.
 2301                  */
 2302                 bp->b_blkno = bp->b_lblkno = blkno;
 2303                 bp->b_offset = offset;
 2304 
 2305                 bgetvp(vp, bp);
 2306                 LIST_REMOVE(bp, b_hash);
 2307                 bh = bufhash(vp, blkno);
 2308                 LIST_INSERT_HEAD(bh, bp, b_hash);
 2309 
 2310                 /*
 2311                  * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 2312                  * buffer size starts out as 0, B_CACHE will be set by
 2313                  * allocbuf() for the VMIO case prior to it testing the
 2314                  * backing store for validity.
 2315                  */
 2316 
 2317                 if (vmio) {
 2318                         bp->b_flags |= B_VMIO;
 2319 #if defined(VFS_BIO_DEBUG)
 2320                         if (vp->v_type != VREG && vp->v_type != VBLK)
 2321                                 printf("getblk: vmioing file type %d???\n", vp->v_type);
 2322 #endif
 2323                 } else {
 2324                         bp->b_flags &= ~B_VMIO;
 2325                 }
 2326 
 2327                 allocbuf(bp, size);
 2328 
 2329                 splx(s);
 2330                 bp->b_flags &= ~B_DONE;
 2331         }
 2332         return (bp);
 2333 }
 2334 
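/*
 * A simplified sketch of the B_CACHE protocol documented above, in the
 * spirit of a bread()-style caller: if getblk() returns the buffer with
 * B_CACHE set the contents are already valid; otherwise the caller clears
 * B_INVAL/B_ERROR, issues the read, and waits.  The helper name
 * "example_read_block" is hypothetical, and a real reader does additional
 * work (credentials, busying the backing pages) not shown here.
 */
#if 0
static int
example_read_block(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
        struct buf *bp;

        *bpp = bp = getblk(vp, blkno, size, 0, 0);
        if ((bp->b_flags & B_CACHE) == 0) {
                bp->b_flags |= B_READ;                  /* this is a read ... */
                bp->b_flags &= ~(B_INVAL | B_ERROR);    /* ... of a valid buffer */
                VOP_STRATEGY(vp, bp);                   /* start the I/O */
                return (biowait(bp));                   /* wait, returning errno */
        }
        return (0);
}
#endif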
 2335 /*
 2336  * Get an empty, disassociated buffer of given size.  The buffer is initially
 2337  * set to B_INVAL.
 2338  */
 2339 struct buf *
 2340 geteblk(int size)
 2341 {
 2342         struct buf *bp;
 2343         int s;
 2344         int maxsize;
 2345 
 2346         maxsize = (size + BKVAMASK) & ~BKVAMASK;
 2347 
 2348         s = splbio();
 2349         while ((bp = getnewbuf(0, 0, size, maxsize)) == 0);
 2350         splx(s);
 2351         allocbuf(bp, size);
 2352         bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
 2353         return (bp);
 2354 }
 2355 
 2356 
 2357 /*
 2358  * This code builds the buffer memory from either anonymous system
 2359  * memory (in the case of non-VMIO operations) or from an associated
 2360  * VM object (in the case of VMIO operations).  This code is able to
 2361  * resize a buffer up or down.
 2362  *
 2363  * Note that this code is tricky, and has many complications to resolve
 2364  * deadlock or inconsistent data situations.  Tread lightly!!!
 2365  * There are B_CACHE and B_DELWRI interactions that must be dealt with by 
 2366  * the caller.  Calling this code willy nilly can result in the loss of data.
 2367  *
 2368  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
 2369  * B_CACHE for the non-VMIO case.
 2370  */
 2371 
 2372 int
 2373 allocbuf(struct buf *bp, int size)
 2374 {
 2375         int newbsize, mbsize;
 2376         int i;
 2377 
 2378         if (BUF_REFCNT(bp) == 0)
 2379                 panic("allocbuf: buffer not busy");
 2380 
 2381         if (bp->b_kvasize < size)
 2382                 panic("allocbuf: buffer too small");
 2383 
 2384         if ((bp->b_flags & B_VMIO) == 0) {
 2385                 caddr_t origbuf;
 2386                 int origbufsize;
 2387                 /*
 2388                  * Just get anonymous memory from the kernel.  Don't
 2389                  * mess with B_CACHE.
 2390                  */
 2391                 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 2392 #if !defined(NO_B_MALLOC)
 2393                 if (bp->b_flags & B_MALLOC)
 2394                         newbsize = mbsize;
 2395                 else
 2396 #endif
 2397                         newbsize = round_page(size);
 2398 
 2399                 if (newbsize < bp->b_bufsize) {
 2400 #if !defined(NO_B_MALLOC)
 2401                         /*
 2402                          * malloced buffers are not shrunk
 2403                          */
 2404                         if (bp->b_flags & B_MALLOC) {
 2405                                 if (newbsize) {
 2406                                         bp->b_bcount = size;
 2407                                 } else {
 2408                                         free(bp->b_data, M_BIOBUF);
 2409                                         if (bp->b_bufsize) {
 2410                                                 bufmallocspace -= bp->b_bufsize;
 2411                                                 bufspacewakeup();
 2412                                                 bp->b_bufsize = 0;
 2413                                         }
 2414                                         bp->b_data = bp->b_kvabase;
 2415                                         bp->b_bcount = 0;
 2416                                         bp->b_flags &= ~B_MALLOC;
 2417                                 }
 2418                                 return 1;
 2419                         }               
 2420 #endif
 2421                         vm_hold_free_pages(
 2422                             bp,
 2423                             (vm_offset_t) bp->b_data + newbsize,
 2424                             (vm_offset_t) bp->b_data + bp->b_bufsize);
 2425                 } else if (newbsize > bp->b_bufsize) {
 2426 #if !defined(NO_B_MALLOC)
 2427                         /*
 2428                          * We only use malloced memory on the first allocation,
 2429                          * and revert to page-allocated memory when the buffer
 2430                          * grows.
 2431                          */
 2432                         if ( (bufmallocspace < maxbufmallocspace) &&
 2433                                 (bp->b_bufsize == 0) &&
 2434                                 (mbsize <= PAGE_SIZE/2)) {
 2435 
 2436                                 bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
 2437                                 bp->b_bufsize = mbsize;
 2438                                 bp->b_bcount = size;
 2439                                 bp->b_flags |= B_MALLOC;
 2440                                 bufmallocspace += mbsize;
 2441                                 return 1;
 2442                         }
 2443 #endif
 2444                         origbuf = NULL;
 2445                         origbufsize = 0;
 2446 #if !defined(NO_B_MALLOC)
 2447                         /*
 2448                          * If the buffer is growing on its other-than-first allocation,
 2449                          * then we revert to the page-allocation scheme.
 2450                          */
 2451                         if (bp->b_flags & B_MALLOC) {
 2452                                 origbuf = bp->b_data;
 2453                                 origbufsize = bp->b_bufsize;
 2454                                 bp->b_data = bp->b_kvabase;
 2455                                 if (bp->b_bufsize) {
 2456                                         bufmallocspace -= bp->b_bufsize;
 2457                                         bufspacewakeup();
 2458                                         bp->b_bufsize = 0;
 2459                                 }
 2460                                 bp->b_flags &= ~B_MALLOC;
 2461                                 newbsize = round_page(newbsize);
 2462                         }
 2463 #endif
 2464                         vm_hold_load_pages(
 2465                             bp,
 2466                             (vm_offset_t) bp->b_data + bp->b_bufsize,
 2467                             (vm_offset_t) bp->b_data + newbsize);
 2468 #if !defined(NO_B_MALLOC)
 2469                         if (origbuf) {
 2470                                 bcopy(origbuf, bp->b_data, origbufsize);
 2471                                 free(origbuf, M_BIOBUF);
 2472                         }
 2473 #endif
 2474                 }
 2475         } else {
 2476                 vm_page_t m;
 2477                 int desiredpages;
 2478 
 2479                 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 2480                 desiredpages = (size == 0) ? 0 :
 2481                         num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 2482 
 2483 #if !defined(NO_B_MALLOC)
 2484                 if (bp->b_flags & B_MALLOC)
 2485                         panic("allocbuf: VMIO buffer can't be malloced");
 2486 #endif
 2487                 /*
 2488                  * Set B_CACHE initially if buffer is 0 length or will become
 2489                  * 0-length.
 2490                  */
 2491                 if (size == 0 || bp->b_bufsize == 0)
 2492                         bp->b_flags |= B_CACHE;
 2493 
 2494                 if (newbsize < bp->b_bufsize) {
 2495                         /*
 2496                          * DEV_BSIZE aligned new buffer size is less than the
 2497                          * DEV_BSIZE aligned existing buffer size.  Figure out
 2498                          * if we have to remove any pages.
 2499                          */
 2500                         if (desiredpages < bp->b_npages) {
 2501                                 for (i = desiredpages; i < bp->b_npages; i++) {
 2502                                         /*
 2503                                          * the page is not freed here -- it
 2504                                          * is the responsibility of 
 2505                                          * vnode_pager_setsize
 2506                                          */
 2507                                         m = bp->b_pages[i];
 2508                                         KASSERT(m != bogus_page,
 2509                                             ("allocbuf: bogus page found"));
 2510                                         while (vm_page_sleep_busy(m, TRUE, "biodep"))
 2511                                                 ;
 2512 
 2513                                         bp->b_pages[i] = NULL;
 2514                                         vm_page_unwire(m, 0);
 2515                                 }
 2516                                 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
 2517                                     (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
 2518                                 bp->b_npages = desiredpages;
 2519                         }
 2520                 } else if (size > bp->b_bcount) {
 2521                         /*
 2522                          * We are growing the buffer, possibly in a 
 2523                          * byte-granular fashion.
 2524                          */
 2525                         struct vnode *vp;
 2526                         vm_object_t obj;
 2527                         vm_offset_t toff;
 2528                         vm_offset_t tinc;
 2529 
 2530                         /*
 2531                          * Step 1, bring in the VM pages from the object, 
 2532                          * allocating them if necessary.  We must clear
 2533                          * B_CACHE if these pages are not valid for the 
 2534                          * range covered by the buffer.
 2535                          */
 2536 
 2537                         vp = bp->b_vp;
 2538                         VOP_GETVOBJECT(vp, &obj);
 2539 
 2540                         while (bp->b_npages < desiredpages) {
 2541                                 vm_page_t m;
 2542                                 vm_pindex_t pi;
 2543 
 2544                                 pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
 2545                                 if ((m = vm_page_lookup(obj, pi)) == NULL) {
 2546                                         /*
 2547                                          * note: must allocate system pages
 2548                                          * since blocking here could interfere
 2549                                          * with paging I/O, no matter which
 2550                                          * process we are.
 2551                                          */
 2552                                         m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM);
 2553                                         if (m == NULL) {
 2554                                                 VM_WAIT;
 2555                                                 vm_pageout_deficit += desiredpages - bp->b_npages;
 2556                                         } else {
 2557                                                 vm_page_wire(m);
 2558                                                 vm_page_wakeup(m);
 2559                                                 bp->b_flags &= ~B_CACHE;
 2560                                                 bp->b_pages[bp->b_npages] = m;
 2561                                                 ++bp->b_npages;
 2562                                         }
 2563                                         continue;
 2564                                 }
 2565 
 2566                                 /*
 2567                                  * We found a page.  If we have to sleep on it,
 2568                                  * retry because it might have gotten freed out
 2569                                  * from under us.
 2570                                  *
 2571                                  * We can only test PG_BUSY here.  Blocking on
 2572                                  * m->busy might lead to a deadlock:
 2573                                  *
 2574                                  *  vm_fault->getpages->cluster_read->allocbuf
 2575                                  *
 2576                                  */
 2577 
 2578                                 if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
 2579                                         continue;
 2580 
 2581                                 /*
 2582                                  * We have a good page.  Should we wakeup the
 2583                                  * page daemon?
 2584                                  */
 2585                                 if ((curproc != pageproc) &&
 2586                                     ((m->queue - m->pc) == PQ_CACHE) &&
 2587                                     ((cnt.v_free_count + cnt.v_cache_count) <
 2588                                         (cnt.v_free_min + cnt.v_cache_min))) {
 2589                                         pagedaemon_wakeup();
 2590                                 }
 2591                                 vm_page_flag_clear(m, PG_ZERO);
 2592                                 vm_page_wire(m);
 2593                                 bp->b_pages[bp->b_npages] = m;
 2594                                 ++bp->b_npages;
 2595                         }
 2596 
 2597                         /*
 2598                          * Step 2.  We've loaded the pages into the buffer,
 2599                          * we have to figure out if we can still have B_CACHE
 2600                          * set.  Note that B_CACHE is set according to the
 2601                          * byte-granular range ( bcount and size ), not the
 2602                          * aligned range ( newbsize ).
 2603                          *
 2604                          * The VM test is against m->valid, which is DEV_BSIZE
 2605                          * aligned.  Needless to say, the validity of the data
 2606                          * needs to also be DEV_BSIZE aligned.  Note that this
 2607                          * fails with NFS if the server or some other client
 2608                          * extends the file's EOF.  If our buffer is resized, 
 2609                          * B_CACHE may remain set! XXX
 2610                          */
 2611 
 2612                         toff = bp->b_bcount;
 2613                         tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 2614 
 2615                         while ((bp->b_flags & B_CACHE) && toff < size) {
 2616                                 vm_pindex_t pi;
 2617 
 2618                                 if (tinc > (size - toff))
 2619                                         tinc = size - toff;
 2620 
 2621                                 pi = ((bp->b_offset & PAGE_MASK) + toff) >> 
 2622                                     PAGE_SHIFT;
 2623 
 2624                                 vfs_buf_test_cache(
 2625                                     bp, 
 2626                                     bp->b_offset,
 2627                                     toff, 
 2628                                     tinc, 
 2629                                     bp->b_pages[pi]
 2630                                 );
 2631                                 toff += tinc;
 2632                                 tinc = PAGE_SIZE;
 2633                         }
 2634 
 2635                         /*
 2636                          * Step 3, fixup the KVM pmap.  Remember that
 2637                          * bp->b_data is relative to bp->b_offset, but 
 2638                          * bp->b_offset may be offset into the first page.
 2639                          */
 2640 
 2641                         bp->b_data = (caddr_t)
 2642                             trunc_page((vm_offset_t)bp->b_data);
 2643                         pmap_qenter(
 2644                             (vm_offset_t)bp->b_data,
 2645                             bp->b_pages, 
 2646                             bp->b_npages
 2647                         );
 2648                         bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 
 2649                             (vm_offset_t)(bp->b_offset & PAGE_MASK));
 2650                 }
 2651         }
 2652         if (newbsize < bp->b_bufsize)
 2653                 bufspacewakeup();
 2654         bp->b_bufsize = newbsize;       /* actual buffer allocation     */
 2655         bp->b_bcount = size;            /* requested buffer size        */
 2656         return 1;
 2657 }
 2658 
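/*
 * Usage sketch: allocbuf() is normally reached through getblk(), but a
 * filesystem may also call it directly to resize a buffer it already holds
 * locked.  The helper below is hypothetical and only illustrates the call
 * pattern, assuming the getblk()/allocbuf()/bdwrite() interfaces of this
 * era and the headers already included by this file; error handling is
 * omitted.
 */
static void
example_resize_buffer(struct vnode *vp, daddr_t blkno, int osize, int nsize)
{
        struct buf *bp;

        bp = getblk(vp, blkno, osize, 0, 0);    /* returned locked */
        allocbuf(bp, nsize);                    /* adjust backing store and KVA */
        bdwrite(bp);                            /* delayed write releases bp */
}
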
 2659 /*
 2660  *      biowait:
 2661  *
 2662  *      Wait for buffer I/O completion, returning error status.  The buffer
 2663  *      is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
 2664  *      error and cleared.
 2665  */
 2666 int
 2667 biowait(register struct buf * bp)
 2668 {
 2669         int s;
 2670 
 2671         s = splbio();
 2672         while ((bp->b_flags & B_DONE) == 0) {
 2673 #if defined(NO_SCHEDULE_MODS)
 2674                 tsleep(bp, PRIBIO, "biowait", 0);
 2675 #else
 2676                 if (bp->b_flags & B_READ)
 2677                         tsleep(bp, PRIBIO, "biord", 0);
 2678                 else
 2679                         tsleep(bp, PRIBIO, "biowr", 0);
 2680 #endif
 2681         }
 2682         splx(s);
 2683         if (bp->b_flags & B_EINTR) {
 2684                 bp->b_flags &= ~B_EINTR;
 2685                 return (EINTR);
 2686         }
 2687         if (bp->b_flags & B_ERROR) {
 2688                 return (bp->b_error ? bp->b_error : EIO);
 2689         } else {
 2690                 return (0);
 2691         }
 2692 }
 2693 
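/*
 * Usage sketch (hypothetical helper): the usual synchronous-read pattern,
 * much as bread() does it.  The caller issues the strategy call and then
 * sleeps in biowait() until biodone() sets B_DONE, collecting any error
 * recorded by the driver.
 */
static int
example_read_block(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
        struct buf *bp;
        int error;

        bp = getblk(vp, blkno, size, 0, 0);
        if ((bp->b_flags & B_CACHE) == 0) {
                bp->b_flags |= B_READ;
                bp->b_flags &= ~(B_ERROR | B_INVAL);
                vfs_busy_pages(bp, 0);          /* read: clear_modify == 0 */
                VOP_STRATEGY(vp, bp);
                error = biowait(bp);
                if (error) {
                        brelse(bp);
                        return (error);
                }
        }
        *bpp = bp;
        return (0);
}
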
 2694 /*
 2695  *      biodone:
 2696  *
 2697  *      Finish I/O on a buffer, optionally calling a completion function.
 2698  *      This is usually called from an interrupt so process blocking is
 2699  *      not allowed.
 2700  *
 2701  *      biodone is also responsible for setting B_CACHE in a B_VMIO bp.
 2702  *      In a non-VMIO bp, B_CACHE will be set on the next getblk() 
 2703  *      assuming B_INVAL is clear.
 2704  *
 2705  *      For the VMIO case, we set B_CACHE if the op was a read and no
 2706  *      read error occurred, or if the op was a write.  B_CACHE is never
 2707  *      set if the buffer is invalid or otherwise uncacheable.
 2708  *
 2709  *      biodone does not mess with B_INVAL, allowing the I/O routine or the
 2710  *      initiator to leave B_INVAL set to brelse the buffer out of existence
 2711  *      in the biodone routine.
 2712  */
 2713 void
 2714 biodone(register struct buf * bp)
 2715 {
 2716         int s, error;
 2717 
 2718         s = splbio();
 2719 
 2720         KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
 2721         KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 2722 
 2723         bp->b_flags |= B_DONE;
 2724         runningbufwakeup(bp);
 2725 
 2726         if (bp->b_flags & B_FREEBUF) {
 2727                 brelse(bp);
 2728                 splx(s);
 2729                 return;
 2730         }
 2731 
 2732         if ((bp->b_flags & B_READ) == 0) {
 2733                 vwakeup(bp);
 2734         }
 2735 
 2736         /* call optional completion function if requested */
 2737         if (bp->b_flags & B_CALL) {
 2738                 bp->b_flags &= ~B_CALL;
 2739                 (*bp->b_iodone) (bp);
 2740                 splx(s);
 2741                 return;
 2742         }
 2743         if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
 2744                 (*bioops.io_complete)(bp);
 2745 
 2746         if (bp->b_flags & B_VMIO) {
 2747                 int i;
 2748                 vm_ooffset_t foff;
 2749                 vm_page_t m;
 2750                 vm_object_t obj;
 2751                 int iosize;
 2752                 struct vnode *vp = bp->b_vp;
 2753 
 2754                 error = VOP_GETVOBJECT(vp, &obj);
 2755 
 2756 #if defined(VFS_BIO_DEBUG)
 2757                 if (vp->v_usecount == 0) {
 2758                         panic("biodone: zero vnode ref count");
 2759                 }
 2760 
 2761                 if (error) {
 2762                         panic("biodone: missing VM object");
 2763                 }
 2764 
 2765                 if ((vp->v_flag & VOBJBUF) == 0) {
 2766                         panic("biodone: vnode is not setup for merged cache");
 2767                 }
 2768 #endif
 2769 
 2770                 foff = bp->b_offset;
 2771                 KASSERT(bp->b_offset != NOOFFSET,
 2772                     ("biodone: no buffer offset"));
 2773 
 2774                 if (error) {
 2775                         panic("biodone: no object");
 2776                 }
 2777 #if defined(VFS_BIO_DEBUG)
 2778                 if (obj->paging_in_progress < bp->b_npages) {
 2779                         printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
 2780                             obj->paging_in_progress, bp->b_npages);
 2781                 }
 2782 #endif
 2783 
 2784                 /*
 2785                  * Set B_CACHE if the op was a normal read and no error
 2786                  * occurred.  B_CACHE is set for writes in the b*write()
 2787                  * routines.
 2788                  */
 2789                 iosize = bp->b_bcount - bp->b_resid;
 2790                 if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
 2791                         bp->b_flags |= B_CACHE;
 2792                 }
 2793 
 2794                 for (i = 0; i < bp->b_npages; i++) {
 2795                         int bogusflag = 0;
 2796                         int resid;
 2797 
 2798                         resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 2799                         if (resid > iosize)
 2800                                 resid = iosize;
 2801 
 2802                         /*
 2803                          * cleanup bogus pages, restoring the originals
 2804                          */
 2805                         m = bp->b_pages[i];
 2806                         if (m == bogus_page) {
 2807                                 bogusflag = 1;
 2808                                 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 2809                                 if (m == NULL)
 2810                                         panic("biodone: page disappeared");
 2811                                 bp->b_pages[i] = m;
 2812                                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 2813                         }
 2814 #if defined(VFS_BIO_DEBUG)
 2815                         if (OFF_TO_IDX(foff) != m->pindex) {
 2816                                 printf(
 2817 "biodone: foff(%lu)/m->pindex(%d) mismatch\n",
 2818                                     (unsigned long)foff, m->pindex);
 2819                         }
 2820 #endif
 2821 
 2822                         /*
 2823                          * In the write case, the valid and clean bits are
 2824                          * already changed correctly ( see bdwrite() ), so we 
 2825                          * only need to do this here in the read case.
 2826                          */
 2827                         if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
 2828                                 vfs_page_set_valid(bp, foff, i, m);
 2829                         }
 2830                         vm_page_flag_clear(m, PG_ZERO);
 2831 
 2832                         /*
 2833                          * when debugging new filesystems or buffer I/O methods, this
 2834                          * is the most common error that pops up.  if you see this, you
 2835                          * have not set the page busy flag correctly!!!
 2836                          */
 2837                         if (m->busy == 0) {
 2838                                 printf("biodone: page busy < 0, "
 2839                                     "pindex: %d, foff: 0x(%x,%x), "
 2840                                     "resid: %d, index: %d\n",
 2841                                     (int) m->pindex, (int)(foff >> 32),
 2842                                                 (int) foff & 0xffffffff, resid, i);
 2843                                 if (!vn_isdisk(vp, NULL))
 2844                                         printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
 2845                                             bp->b_vp->v_mount->mnt_stat.f_iosize,
 2846                                             (int) bp->b_lblkno,
 2847                                             bp->b_flags, bp->b_npages);
 2848                                 else
 2849                                         printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
 2850                                             (int) bp->b_lblkno,
 2851                                             bp->b_flags, bp->b_npages);
 2852                                 printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
 2853                                     m->valid, m->dirty, m->wire_count);
 2854                                 panic("biodone: page busy < 0\n");
 2855                         }
 2856                         vm_page_io_finish(m);
 2857                         vm_object_pip_subtract(obj, 1);
 2858                         foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 2859                         iosize -= resid;
 2860                 }
 2861                 if (obj)
 2862                         vm_object_pip_wakeupn(obj, 0);
 2863         }
 2864 
 2865         /*
 2866          * For asynchronous completions, release the buffer now. The brelse
 2867          * will do a wakeup there if necessary - so no need to do a wakeup
 2868          * here in the async case. The sync case always needs to do a wakeup.
 2869          */
 2870 
 2871         if (bp->b_flags & B_ASYNC) {
 2872                 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
 2873                         brelse(bp);
 2874                 else
 2875                         bqrelse(bp);
 2876         } else {
 2877                 wakeup(bp);
 2878         }
 2879         splx(s);
 2880 }
 2881 
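/*
 * Sketch of the two halves of asynchronous completion (hypothetical names):
 * the initiator asks for a callback with B_CALL/b_iodone, and the driver's
 * completion path records any error before calling biodone(), which then
 * invokes the callback instead of waking a sleeping biowait().
 */
static void
example_iodone(struct buf *bp)
{
        /* Runs from biodone(), typically at splbio(); must not block. */
        if (bp->b_flags & B_ERROR)
                printf("example_iodone: error %d\n", bp->b_error);
        brelse(bp);
}

static void
example_start_async(struct vnode *vp, struct buf *bp)
{
        bp->b_flags |= B_CALL | B_ASYNC;
        bp->b_iodone = example_iodone;
        VOP_STRATEGY(vp, bp);
}

static void
example_driver_complete(struct buf *bp, int error)
{
        if (error) {
                bp->b_flags |= B_ERROR;
                bp->b_error = error;
        }
        biodone(bp);
}
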
 2882 /*
 2883  * This routine is called in lieu of iodone in the case of
 2884  * incomplete I/O.  This keeps the busy status for pages
 2885  * consistent.
 2886  */
 2887 void
 2888 vfs_unbusy_pages(struct buf * bp)
 2889 {
 2890         int i;
 2891 
 2892         runningbufwakeup(bp);
 2893         if (bp->b_flags & B_VMIO) {
 2894                 struct vnode *vp = bp->b_vp;
 2895                 vm_object_t obj;
 2896 
 2897                 VOP_GETVOBJECT(vp, &obj);
 2898 
 2899                 for (i = 0; i < bp->b_npages; i++) {
 2900                         vm_page_t m = bp->b_pages[i];
 2901 
 2902                         if (m == bogus_page) {
 2903                                 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 2904                                 if (!m) {
 2905                                         panic("vfs_unbusy_pages: page missing\n");
 2906                                 }
 2907                                 bp->b_pages[i] = m;
 2908                                 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 2909                         }
 2910                         vm_object_pip_subtract(obj, 1);
 2911                         vm_page_flag_clear(m, PG_ZERO);
 2912                         vm_page_io_finish(m);
 2913                 }
 2914                 vm_object_pip_wakeupn(obj, 0);
 2915         }
 2916 }
 2917 
 2918 /*
 2919  * vfs_page_set_valid:
 2920  *
 2921  *      Set the valid bits in a page based on the supplied offset.   The
 2922  *      range is restricted to the buffer's size.
 2923  *
 2924  *      This routine is typically called after a read completes.
 2925  */
 2926 static void
 2927 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
 2928 {
 2929         vm_ooffset_t soff, eoff;
 2930 
 2931         /*
 2932          * Start and end offsets in buffer.  eoff - soff may not cross a
 2933  * page boundary or cross the end of the buffer.  The end of the
 2934          * buffer, in this case, is our file EOF, not the allocation size
 2935          * of the buffer.
 2936          */
 2937         soff = off;
 2938         eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 2939         if (eoff > bp->b_offset + bp->b_bcount)
 2940                 eoff = bp->b_offset + bp->b_bcount;
 2941 
 2942         /*
 2943          * Set valid range.  This is typically the entire buffer and thus the
 2944          * entire page.
 2945          */
 2946         if (eoff > soff) {
 2947                 vm_page_set_validclean(
 2948                     m,
 2949                    (vm_offset_t) (soff & PAGE_MASK),
 2950                    (vm_offset_t) (eoff - soff)
 2951                 );
 2952         }
 2953 }
 2954 
 2955 /*
 2956  * This routine is called before a device strategy routine.
 2957  * It is used to tell the VM system that paging I/O is in
 2958  * progress, and treat the pages associated with the buffer
 2959  * almost as being PG_BUSY.  Also the object paging_in_progress
 2960  * flag is handled to make sure that the object doesn't become
 2961  * inconsistent.
 2962  *
 2963  * Since I/O has not been initiated yet, certain buffer flags
 2964  * such as B_ERROR or B_INVAL may be in an inconsistent state
 2965  * and should be ignored.
 2966  */
 2967 void
 2968 vfs_busy_pages(struct buf * bp, int clear_modify)
 2969 {
 2970         int i, bogus;
 2971 
 2972         if (bp->b_flags & B_VMIO) {
 2973                 struct vnode *vp = bp->b_vp;
 2974                 vm_object_t obj;
 2975                 vm_ooffset_t foff;
 2976 
 2977                 VOP_GETVOBJECT(vp, &obj);
 2978                 foff = bp->b_offset;
 2979                 KASSERT(bp->b_offset != NOOFFSET,
 2980                     ("vfs_busy_pages: no buffer offset"));
 2981                 vfs_setdirty(bp);
 2982 
 2983 retry:
 2984                 for (i = 0; i < bp->b_npages; i++) {
 2985                         vm_page_t m = bp->b_pages[i];
 2986                         if (vm_page_sleep_busy(m, FALSE, "vbpage"))
 2987                                 goto retry;
 2988                 }
 2989 
 2990                 bogus = 0;
 2991                 for (i = 0; i < bp->b_npages; i++) {
 2992                         vm_page_t m = bp->b_pages[i];
 2993 
 2994                         vm_page_flag_clear(m, PG_ZERO);
 2995                         if ((bp->b_flags & B_CLUSTER) == 0) {
 2996                                 vm_object_pip_add(obj, 1);
 2997                                 vm_page_io_start(m);
 2998                         }
 2999 
 3000                         /*
 3001                          * When readying a buffer for a read ( i.e.
 3002                          * clear_modify == 0 ), it is important to do
 3003                          * bogus_page replacement for valid pages in 
 3004                          * partially instantiated buffers.  Partially 
 3005                          * instantiated buffers can, in turn, occur when
 3006                          * reconstituting a buffer from its VM backing store
 3007                          * base.  We only have to do this if B_CACHE is
 3008                          * clear ( which causes the I/O to occur in the
 3009                          * first place ).  The replacement prevents the read
 3010                          * I/O from overwriting potentially dirty VM-backed
 3011                          * pages.  XXX bogus page replacement is, uh, bogus.
 3012                          * It may not work properly with small-block devices.
 3013                          * We need to find a better way.
 3014                          */
 3015 
 3016                         vm_page_protect(m, VM_PROT_NONE);
 3017                         if (clear_modify)
 3018                                 vfs_page_set_valid(bp, foff, i, m);
 3019                         else if (m->valid == VM_PAGE_BITS_ALL &&
 3020                                 (bp->b_flags & B_CACHE) == 0) {
 3021                                 bp->b_pages[i] = bogus_page;
 3022                                 bogus++;
 3023                         }
 3024                         foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 3025                 }
 3026                 if (bogus)
 3027                         pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
 3028         }
 3029 }
 3030 
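/*
 * Sketch of the write-side pairing (hypothetical helper): before a VMIO
 * buffer is handed to the driver, its pages are busied with
 * clear_modify != 0, as the b*write() paths do; the matching
 * vm_page_io_finish() happens later in biodone().
 */
static void
example_start_write(struct vnode *vp, struct buf *bp)
{
        int s;

        s = splbio();
        bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
        vfs_busy_pages(bp, 1);          /* write: clear_modify != 0 */
        splx(s);
        VOP_STRATEGY(vp, bp);
}
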
 3031 /*
 3032  * Tell the VM system that the pages associated with this buffer
 3033  * are clean.  This is used for delayed writes where the data is
 3034  * going to go to disk eventually without additional VM intervention.
 3035  *
 3036  * Note that while we only really need to clean through to b_bcount, we
 3037  * just go ahead and clean through to b_bufsize.
 3038  */
 3039 static void
 3040 vfs_clean_pages(struct buf * bp)
 3041 {
 3042         int i;
 3043 
 3044         if (bp->b_flags & B_VMIO) {
 3045                 vm_ooffset_t foff;
 3046 
 3047                 foff = bp->b_offset;
 3048                 KASSERT(bp->b_offset != NOOFFSET,
 3049                     ("vfs_clean_pages: no buffer offset"));
 3050                 for (i = 0; i < bp->b_npages; i++) {
 3051                         vm_page_t m = bp->b_pages[i];
 3052                         vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 3053                         vm_ooffset_t eoff = noff;
 3054 
 3055                         if (eoff > bp->b_offset + bp->b_bufsize)
 3056                                 eoff = bp->b_offset + bp->b_bufsize;
 3057                         vfs_page_set_valid(bp, foff, i, m);
 3058                         /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
 3059                         foff = noff;
 3060                 }
 3061         }
 3062 }
 3063 
 3064 /*
 3065  *      vfs_bio_set_validclean:
 3066  *
 3067  *      Set the range within the buffer to valid and clean.  The range is 
 3068  *      relative to the beginning of the buffer, b_offset.  Note that b_offset
 3069  *      itself may be offset from the beginning of the first page.
 3070  */
 3071 
 3072 void   
 3073 vfs_bio_set_validclean(struct buf *bp, int base, int size)
 3074 {
 3075         if (bp->b_flags & B_VMIO) {
 3076                 int i;
 3077                 int n;
 3078 
 3079                 /*
 3080                  * Fixup base to be relative to beginning of first page.
 3081                  * Set initial n to be the maximum number of bytes in the
 3082                  * first page that can be validated.
 3083                  */
 3084 
 3085                 base += (bp->b_offset & PAGE_MASK);
 3086                 n = PAGE_SIZE - (base & PAGE_MASK);
 3087 
 3088                 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
 3089                         vm_page_t m = bp->b_pages[i];
 3090 
 3091                         if (n > size)
 3092                                 n = size;
 3093 
 3094                         vm_page_set_validclean(m, base & PAGE_MASK, n);
 3095                         base += n;
 3096                         size -= n;
 3097                         n = PAGE_SIZE;
 3098                 }
 3099         }
 3100 }
 3101 
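/*
 * Usage sketch (hypothetical helper): after zero-filling a sub-range of a
 * buffer that is known to contain no data (for example, space past a file's
 * old EOF), the range can be marked valid and clean so it is neither
 * re-read from disk nor pushed back out.  base is relative to b_offset,
 * i.e. to bp->b_data.
 */
static void
example_zero_range(struct buf *bp, int base, int size)
{
        bzero(bp->b_data + base, size);
        vfs_bio_set_validclean(bp, base, size);
}
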
 3102 /*
 3103  *      vfs_bio_clrbuf:
 3104  *
 3105  *      clear a buffer.  This routine essentially fakes an I/O, so we need
 3106  *      to clear B_ERROR and B_INVAL.
 3107  *
 3108  *      Note that while we only theoretically need to clear through b_bcount,
 3109  *      we go ahead and clear through b_bufsize.
 3110  */
 3111 
 3112 void
 3113 vfs_bio_clrbuf(struct buf *bp)
 3114 {
 3115         int i, mask = 0;
 3116         caddr_t sa, ea;
 3117         if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
 3118                 bp->b_flags &= ~(B_INVAL|B_ERROR);
 3119                 if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 3120                     (bp->b_offset & PAGE_MASK) == 0) {
 3121                         mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
 3122                         if ((bp->b_pages[0]->valid & mask) == mask) {
 3123                                 bp->b_resid = 0;
 3124                                 return;
 3125                         }
 3126                         if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
 3127                             ((bp->b_pages[0]->valid & mask) == 0)) {
 3128                                 bzero(bp->b_data, bp->b_bufsize);
 3129                                 bp->b_pages[0]->valid |= mask;
 3130                                 bp->b_resid = 0;
 3131                                 return;
 3132                         }
 3133                 }
 3134                 ea = sa = bp->b_data;
 3135                 for(i=0;i<bp->b_npages;i++,sa=ea) {
 3136                         int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
 3137                         ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
 3138                         ea = (caddr_t)(vm_offset_t)ulmin(
 3139                             (u_long)(vm_offset_t)ea,
 3140                             (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
 3141                         mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 3142                         if ((bp->b_pages[i]->valid & mask) == mask)
 3143                                 continue;
 3144                         if ((bp->b_pages[i]->valid & mask) == 0) {
 3145                                 if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
 3146                                         bzero(sa, ea - sa);
 3147                                 }
 3148                         } else {
 3149                                 for (; sa < ea; sa += DEV_BSIZE, j++) {
 3150                                         if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
 3151                                                 (bp->b_pages[i]->valid & (1<<j)) == 0)
 3152                                                 bzero(sa, DEV_BSIZE);
 3153                                 }
 3154                         }
 3155                         bp->b_pages[i]->valid |= mask;
 3156                         vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
 3157                 }
 3158                 bp->b_resid = 0;
 3159         } else {
 3160                 clrbuf(bp);
 3161         }
 3162 }
 3163 
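/*
 * Worked sketch of the valid-bit mask arithmetic used above: each DEV_BSIZE
 * chunk of a page gets one bit in m->valid, so a buffer of b_bufsize bytes
 * that starts on a page boundary covers the low (b_bufsize / DEV_BSIZE)
 * bits.  With DEV_BSIZE == 512 and a 2048-byte buffer the mask is
 * (1 << 4) - 1 == 0x0f.  The hypothetical helper below just restates that
 * computation.
 */
static int
example_valid_mask(int bufsize)
{

        return ((1 << (bufsize / DEV_BSIZE)) - 1);
}
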
 3164 /*
 3165  * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
 3166  * a buffer's address space.  The pages are anonymous and are
 3167  * not associated with a file object.
 3168  */
 3169 void
 3170 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
 3171 {
 3172         vm_offset_t pg;
 3173         vm_page_t p;
 3174         int index;
 3175 
 3176         to = round_page(to);
 3177         from = round_page(from);
 3178         index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 3179 
 3180         for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 3181 
 3182 tryagain:
 3183 
 3184                 /*
 3185                  * note: must allocate system pages since blocking here
 3186                  * could interfere with paging I/O, no matter which
 3187                  * process we are.
 3188                  */
 3189                 p = vm_page_alloc(kernel_object,
 3190                         ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
 3191                     VM_ALLOC_SYSTEM);
 3192                 if (!p) {
 3193                         vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
 3194                         VM_WAIT;
 3195                         goto tryagain;
 3196                 }
 3197                 vm_page_wire(p);
 3198                 p->valid = VM_PAGE_BITS_ALL;
 3199                 vm_page_flag_clear(p, PG_ZERO);
 3200                 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
 3201                 bp->b_pages[index] = p;
 3202                 vm_page_wakeup(p);
 3203         }
 3204         bp->b_npages = index;
 3205 }
 3206 
 3207 void
 3208 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
 3209 {
 3210         vm_offset_t pg;
 3211         vm_page_t p;
 3212         int index, newnpages;
 3213 
 3214         from = round_page(from);
 3215         to = round_page(to);
 3216         newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 3217 
 3218         for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 3219                 p = bp->b_pages[index];
 3220                 if (p && (index < bp->b_npages)) {
 3221                         if (p->busy) {
 3222                                 printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
 3223                                         bp->b_blkno, bp->b_lblkno);
 3224                         }
 3225                         bp->b_pages[index] = NULL;
 3226                         pmap_kremove(pg);
 3227                         vm_page_busy(p);
 3228                         vm_page_unwire(p, 0);
 3229                         vm_page_free(p);
 3230                 }
 3231         }
 3232         bp->b_npages = newnpages;
 3233 }
 3234 
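/*
 * Usage sketch (hypothetical helper): for a non-VMIO buffer the backing
 * pages are anonymous, so resizing means loading or freeing pages behind
 * b_data with the two routines above, in the style of allocbuf()'s
 * page-backed (non-VMIO) case.
 */
static void
example_resize_anonymous(struct buf *bp, int newbsize)
{
        vm_offset_t base = (vm_offset_t)bp->b_data;

        if (newbsize > bp->b_bufsize)
                vm_hold_load_pages(bp, base + bp->b_bufsize, base + newbsize);
        else if (newbsize < bp->b_bufsize)
                vm_hold_free_pages(bp, base + newbsize, base + bp->b_bufsize);
        bp->b_bufsize = newbsize;
}
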
 3235 /*
 3236  * Map an IO request into kernel virtual address space.
 3237  *
 3238  * All requests are (re)mapped into kernel VA space.
 3239  * Notice that we use b_bufsize for the size of the buffer
 3240  * to be mapped.  b_bcount might be modified by the driver.
 3241  */
 3242 int
 3243 vmapbuf(struct buf *bp)
 3244 {
 3245         caddr_t addr, v, kva;
 3246         vm_paddr_t pa;
 3247         int pidx;
 3248         int i;
 3249         struct vm_page *m;
 3250 
 3251         if ((bp->b_flags & B_PHYS) == 0)
 3252                 panic("vmapbuf");
 3253         if (bp->b_bufsize < 0)
 3254                 return (-1);
 3255         for (v = bp->b_saveaddr,
 3256                      addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data),
 3257                      pidx = 0;
 3258              addr < bp->b_data + bp->b_bufsize;
 3259              addr += PAGE_SIZE, v += PAGE_SIZE, pidx++) {
 3260                 /*
 3261                  * Do the vm_fault if needed; do the copy-on-write thing
 3262                  * when reading stuff off device into memory.
 3263                  */
 3264 retry:
 3265                 i = vm_fault_quick((addr >= bp->b_data) ? addr : bp->b_data,
 3266                         (bp->b_flags&B_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ);
 3267                 if (i < 0) {
 3268                         for (i = 0; i < pidx; ++i) {
 3269                             vm_page_unhold(bp->b_pages[i]);
 3270                             bp->b_pages[i] = NULL;
 3271                         }
 3272                         return(-1);
 3273                 }
 3274 
 3275                 /*
 3276                  * WARNING!  If sparc support is MFCd in the future this will
 3277                  * have to be changed from pmap_kextract() to pmap_extract()
 3278                  * ala -current.
 3279                  */
 3280 #ifdef __sparc64__
 3281 #error "If MFCing sparc support use pmap_extract"
 3282 #endif
 3283                 pa = pmap_kextract((vm_offset_t)addr);
 3284                 if (pa == 0) {
 3285                         printf("vmapbuf: warning, race against user address during I/O");
 3286                         goto retry;
 3287                 }
 3288                 m = PHYS_TO_VM_PAGE(pa);
 3289                 vm_page_hold(m);
 3290                 bp->b_pages[pidx] = m;
 3291         }
 3292         if (pidx > btoc(MAXPHYS))
 3293                 panic("vmapbuf: mapped more than MAXPHYS");
 3294         pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
 3295         
 3296         kva = bp->b_saveaddr;
 3297         bp->b_npages = pidx;
 3298         bp->b_saveaddr = bp->b_data;
 3299         bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK);
 3300         return(0);
 3301 }
 3302 
 3303 /*
 3304  * Free the io map PTEs associated with this IO operation.
 3305  * We also invalidate the TLB entries and restore the original b_addr.
 3306  */
 3307 void
 3308 vunmapbuf(bp)
 3309         register struct buf *bp;
 3310 {
 3311         int pidx;
 3312         int npages;
 3313         vm_page_t *m;
 3314 
 3315         if ((bp->b_flags & B_PHYS) == 0)
 3316                 panic("vunmapbuf");
 3317 
 3318         npages = bp->b_npages;
 3319         pmap_qremove(trunc_page((vm_offset_t)bp->b_data),
 3320                      npages);
 3321         m = bp->b_pages;
 3322         for (pidx = 0; pidx < npages; pidx++)
 3323                 vm_page_unhold(*m++);
 3324 
 3325         bp->b_data = bp->b_saveaddr;
 3326 }
 3327 
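/*
 * Sketch of the physio-style pairing (hypothetical helper): a buffer aimed
 * at user memory is marked B_PHYS, its kernel VA is preserved in
 * b_saveaddr, vmapbuf() wires and maps the user pages, and vunmapbuf()
 * tears the mapping down after biowait().  bp is assumed to be a pbuf with
 * its own KVA in b_data, as physio() uses; block number setup and cleanup
 * of the pbuf are omitted.
 */
static int
example_raw_read(struct vnode *devvp, struct buf *bp, caddr_t udata, int len)
{
        int error;

        bp->b_flags |= B_PHYS | B_READ;
        bp->b_saveaddr = bp->b_data;    /* keep the pbuf's kernel VA */
        bp->b_data = udata;             /* user address to be mapped */
        bp->b_bufsize = bp->b_bcount = len;
        if (vmapbuf(bp) < 0)
                return (EFAULT);
        VOP_STRATEGY(devvp, bp);
        error = biowait(bp);
        vunmapbuf(bp);
        bp->b_flags &= ~B_PHYS;
        return (error);
}
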
 3328 #include "opt_ddb.h"
 3329 #ifdef DDB
 3330 #include <ddb/ddb.h>
 3331 
 3332 DB_SHOW_COMMAND(buffer, db_show_buffer)
 3333 {
 3334         /* get args */
 3335         struct buf *bp = (struct buf *)addr;
 3336 
 3337         if (!have_addr) {
 3338                 db_printf("usage: show buffer <addr>\n");
 3339                 return;
 3340         }
 3341 
 3342         db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
 3343         db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
 3344                   "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, "
 3345                   "b_blkno = %d, b_pblkno = %d\n",
 3346                   bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 3347                   major(bp->b_dev), minor(bp->b_dev),
 3348                   bp->b_data, bp->b_blkno, bp->b_pblkno);
 3349         if (bp->b_npages) {
 3350                 int i;
 3351                 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 3352                 for (i = 0; i < bp->b_npages; i++) {
 3353                         vm_page_t m;
 3354                         m = bp->b_pages[i];
 3355                         db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
 3356                             (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
 3357                         if ((i + 1) < bp->b_npages)
 3358                                 db_printf(",");
 3359                 }
 3360                 db_printf("\n");
 3361         }
 3362 }
 3363 #endif /* DDB */
