FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_bio.c


    1 /*
    2  * Copyright (c) 1994 John S. Dyson
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice immediately at the beginning of the file, without modification,
   10  *    this list of conditions, and the following disclaimer.
   11  * 2. Redistributions in binary form must reproduce the above copyright
   12  *    notice, this list of conditions and the following disclaimer in the
   13  *    documentation and/or other materials provided with the distribution.
   14  * 3. Absolutely no warranty of function or purpose is made by the author
   15  *    John S. Dyson.
   16  * 4. This work was done expressly for inclusion into FreeBSD.  Other use
   17  *    is allowed if this notation is included.
   18  * 5. Modifications may be freely made to this file if the above conditions
   19  *    are met.
   20  *
   21  * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.104.2.15 1999/09/05 08:15:37 peter Exp $
   22  */
   23 
   24 /*
   25  * this file contains a new buffer I/O scheme implementing a coherent
   26  * VM object and buffer cache scheme.  Pains have been taken to make
   27  * sure that the performance degradation associated with schemes such
   28  * as this is not realized.
   29  *
   30  * Author:  John S. Dyson
   31  * Significant help during the development and debugging phases
   32  * had been provided by David Greenman, also of the FreeBSD core team.
   33  */
   34 
   35 #include "opt_bounce.h"
   36 
   37 #define VMIO
   38 #include <sys/param.h>
   39 #include <sys/systm.h>
   40 #include <sys/sysproto.h>
   41 #include <sys/kernel.h>
   42 #include <sys/sysctl.h>
   43 #include <sys/proc.h>
   44 #include <sys/vnode.h>
   45 #include <sys/vmmeter.h>
   46 #include <vm/vm.h>
   47 #include <vm/vm_param.h>
   48 #include <vm/vm_prot.h>
   49 #include <vm/vm_kern.h>
   50 #include <vm/vm_pageout.h>
   51 #include <vm/vm_page.h>
   52 #include <vm/vm_object.h>
   53 #include <vm/vm_extern.h>
   54 #include <vm/lock.h>
   55 #include <vm/vm_map.h>
   56 #include <sys/buf.h>
   57 #include <sys/mount.h>
   58 #include <sys/malloc.h>
   59 #include <sys/resourcevar.h>
   60 #include <sys/proc.h>
   61 
   62 #include <miscfs/specfs/specdev.h>
   63 
   64 static void vfs_update __P((void));
   65 static struct   proc *updateproc;
   66 static struct kproc_desc up_kp = {
   67         "update",
   68         vfs_update,
   69         &updateproc
   70 };
   71 SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
   72 
   73 struct buf *buf;                /* buffer header pool */
   74 struct swqueue bswlist;
   75 
   76 int count_lock_queue __P((void));
   77 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
   78                 vm_offset_t to);
   79 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
   80                 vm_offset_t to);
   81 static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
   82                               vm_offset_t off, vm_offset_t size,
   83                               vm_page_t m);
   84 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
   85                                int pageno, vm_page_t m);
   86 static void vfs_clean_pages(struct buf * bp);
   87 static void vfs_setdirty(struct buf *bp);
   88 static void vfs_vmio_release(struct buf *bp);
   89 
   90 int needsbuffer;
   91 
   92 /*
   93  * Internal update daemon, process 3
   94  *      The variable vfs_update_wakeup allows for internal syncs.
   95  */
   96 int vfs_update_wakeup;
   97 
   98 
   99 /*
  100  * buffers base kva
  101  */
  102 
  103 /*
  104  * bogus page -- for I/O to/from partially complete buffers
  105  * this is a temporary solution to the problem, but it is not
  106  * really that bad.  it would be better to split the buffer
  107  * for input in the case of buffers partially already in memory,
  108  * but the code is intricate enough already.
  109  */
  110 vm_page_t bogus_page;
  111 static vm_offset_t bogus_offset;
  112 
  113 static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
  114         bufmallocspace, maxbufmallocspace;
  115 
  116 static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
  117 static struct bqueues bufqueues[BUFFER_QUEUES];
  118 
  119 extern int vm_swap_size;
  120 
  121 #define BUF_MAXUSE 16
  122 
  123 /*
  124  * Initialize buffer headers and related structures.
  125  */
  126 void
  127 bufinit()
  128 {
  129         struct buf *bp;
  130         int i;
  131 
  132         TAILQ_INIT(&bswlist);
  133         LIST_INIT(&invalhash);
  134 
  135         /* first, make a null hash table */
  136         for (i = 0; i < BUFHSZ; i++)
  137                 LIST_INIT(&bufhashtbl[i]);
  138 
  139         /* next, make a null set of free lists */
  140         for (i = 0; i < BUFFER_QUEUES; i++)
  141                 TAILQ_INIT(&bufqueues[i]);
  142 
  143         /* finally, initialize each buffer header and stick on empty q */
  144         for (i = 0; i < nbuf; i++) {
  145                 bp = &buf[i];
  146                 bzero(bp, sizeof *bp);
  147                 bp->b_flags = B_INVAL;  /* we're just an empty header */
  148                 bp->b_dev = NODEV;
  149                 bp->b_rcred = NOCRED;
  150                 bp->b_wcred = NOCRED;
  151                 bp->b_qindex = QUEUE_EMPTY;
  152                 bp->b_vnbufs.le_next = NOLIST;
  153                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
  154                 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
  155         }
  156 /*
  157  * maxbufspace is currently calculated assuming all filesystem blocks
  158  * are 8K.  If you happen to use a 16K filesystem, the size of the buffer
  159  * cache is still the same as it would be for 8K filesystems.  This
  160  * keeps the size of the buffer cache "in check" for big block filesystems.
  161  */
  162         maxbufspace = (nbuf + 8) * DFLTBSIZE;
  163 /*
  164  * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
  165  */
  166         maxvmiobufspace = 2 * maxbufspace / 3;
  167 /*
  168  * Limit the amount of malloc memory since it is wired permanently into
  169  * the kernel space.  Even though this is accounted for in the buffer
  170  * allocation, we don't want the malloced region to grow uncontrolled.
  171  * The malloc scheme improves memory utilization significantly on average
  172  * (small) directories.
  173  */
  174         maxbufmallocspace = maxbufspace / 20;
  175 
  176         bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
  177         bogus_page = vm_page_alloc(kernel_object,
  178                         ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
  179                         VM_ALLOC_NORMAL);
  180 
  181 }
  182 
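/*
 * Illustrative sketch (hypothetical, not from vfs_bio.c): a worked example
 * of the sizing arithmetic in bufinit() above.  The values of nbuf and
 * DFLTBSIZE are assumed here purely to make the numbers concrete.
 */
#if 0
        /* Assume nbuf = 1024 buffer headers and DFLTBSIZE = 8192 (8K). */
        maxbufspace = (1024 + 8) * 8192;        /* 8,454,144 bytes, ~8.1 MB */
        maxvmiobufspace = 2 * maxbufspace / 3;  /* ~5.4 MB usable for VMIO  */
        maxbufmallocspace = maxbufspace / 20;   /* ~413 KB of malloc space  */
#endif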
  183 /*
  184  * Free the kva allocation for a buffer
  185  * Must be called only at splbio or higher,
  186  *  as this is the only locking for buffer_map.
  187  */
  188 static void
  189 bfreekva(struct buf * bp)
  190 {
  191         if (bp->b_kvasize == 0)
  192                 return;
  193                 
  194         vm_map_delete(buffer_map,
  195                 (vm_offset_t) bp->b_kvabase,
  196                 (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
  197 
  198         bp->b_kvasize = 0;
  199 
  200 }
  201 
  202 /*
  203  * remove the buffer from the appropriate free list
  204  */
  205 void
  206 bremfree(struct buf * bp)
  207 {
  208         int s = splbio();
  209 
  210         if (bp->b_qindex != QUEUE_NONE) {
  211                 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
  212                 bp->b_qindex = QUEUE_NONE;
  213         } else {
  214                 panic("bremfree: removing a buffer when not on a queue");
  215         }
  216         splx(s);
  217 }
  218 
  219 /*
  220  * Get a buffer with the specified data.  Look in the cache first.
  221  */
  222 int
  223 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
  224     struct buf ** bpp)
  225 {
  226         struct buf *bp;
  227 
  228         bp = getblk(vp, blkno, size, 0, 0);
  229         *bpp = bp;
  230 
  231         /* if not found in cache, do some I/O */
  232         if ((bp->b_flags & B_CACHE) == 0) {
  233                 if (curproc != NULL)
  234                         curproc->p_stats->p_ru.ru_inblock++;
  235                 bp->b_flags |= B_READ;
  236                 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
  237                 if (bp->b_rcred == NOCRED) {
  238                         if (cred != NOCRED)
  239                                 crhold(cred);
  240                         bp->b_rcred = cred;
  241                 }
  242                 vfs_busy_pages(bp, 0);
  243                 VOP_STRATEGY(bp);
  244                 return (biowait(bp));
  245         }
  246         return (0);
  247 }
  248 
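/*
 * Illustrative sketch (hypothetical, not from vfs_bio.c): the typical
 * caller-side use of bread() above.  The helper name and its arguments
 * (vp, lbn, bsize) are assumptions; the bread()/brelse() pattern follows
 * the filesystem code that calls into this file.
 */
#if 0
static int
example_read_block(struct vnode *vp, daddr_t lbn, int bsize)
{
        struct buf *bp;
        int error;

        error = bread(vp, lbn, bsize, NOCRED, &bp);
        if (error) {
                brelse(bp);             /* buffer is returned even on error */
                return (error);
        }
        /* ... inspect or copy bp->b_data here ... */
        brelse(bp);                     /* or bqrelse()/bdwrite() as needed */
        return (0);
}
#endif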
  249 /*
  250  * Operates like bread, but also starts asynchronous I/O on
  251  * read-ahead blocks.
  252  */
  253 int
  254 breadn(struct vnode * vp, daddr_t blkno, int size,
  255     daddr_t * rablkno, int *rabsize,
  256     int cnt, struct ucred * cred, struct buf ** bpp)
  257 {
  258         struct buf *bp, *rabp;
  259         int i;
  260         int rv = 0, readwait = 0;
  261 
  262         *bpp = bp = getblk(vp, blkno, size, 0, 0);
  263 
  264         /* if not found in cache, do some I/O */
  265         if ((bp->b_flags & B_CACHE) == 0) {
  266                 if (curproc != NULL)
  267                         curproc->p_stats->p_ru.ru_inblock++;
  268                 bp->b_flags |= B_READ;
  269                 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
  270                 if (bp->b_rcred == NOCRED) {
  271                         if (cred != NOCRED)
  272                                 crhold(cred);
  273                         bp->b_rcred = cred;
  274                 }
  275                 vfs_busy_pages(bp, 0);
  276                 VOP_STRATEGY(bp);
  277                 ++readwait;
  278         }
  279         for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
  280                 if (inmem(vp, *rablkno))
  281                         continue;
  282                 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
  283 
  284                 if ((rabp->b_flags & B_CACHE) == 0) {
  285                         if (curproc != NULL)
  286                                 curproc->p_stats->p_ru.ru_inblock++;
  287                         rabp->b_flags |= B_READ | B_ASYNC;
  288                         rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
  289                         if (rabp->b_rcred == NOCRED) {
  290                                 if (cred != NOCRED)
  291                                         crhold(cred);
  292                                 rabp->b_rcred = cred;
  293                         }
  294                         vfs_busy_pages(rabp, 0);
  295                         VOP_STRATEGY(rabp);
  296                 } else {
  297                         brelse(rabp);
  298                 }
  299         }
  300 
  301         if (readwait) {
  302                 rv = biowait(bp);
  303         }
  304         return (rv);
  305 }
  306 
  307 /*
  308  * Write, release buffer on completion.  (Done by iodone
  309  * if async.)
  310  */
  311 int
  312 bwrite(struct buf * bp)
  313 {
  314         int oldflags = bp->b_flags;
  315 
  316         if (bp->b_flags & B_INVAL) {
  317                 brelse(bp);
  318                 return (0);
  319         }
  320         if (!(bp->b_flags & B_BUSY))
  321                 panic("bwrite: buffer is not busy???");
  322 
  323         bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
  324         bp->b_flags |= B_WRITEINPROG;
  325 
  326         if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
  327                 reassignbuf(bp, bp->b_vp);
  328         }
  329 
  330         bp->b_vp->v_numoutput++;
  331         vfs_busy_pages(bp, 1);
  332         if (curproc != NULL)
  333                 curproc->p_stats->p_ru.ru_oublock++;
  334         VOP_STRATEGY(bp);
  335 
  336         if ((oldflags & B_ASYNC) == 0) {
  337                 int rtval = biowait(bp);
  338 
  339                 if (oldflags & B_DELWRI) {
  340                         reassignbuf(bp, bp->b_vp);
  341                 }
  342                 brelse(bp);
  343                 return (rtval);
  344         }
  345         return (0);
  346 }
  347 
  348 int
  349 vn_bwrite(ap)
  350         struct vop_bwrite_args *ap;
  351 {
  352         return (bwrite(ap->a_bp));
  353 }
  354 
  355 /*
  356  * Delayed write. (Buffer is marked dirty).
  357  */
  358 void
  359 bdwrite(struct buf * bp)
  360 {
  361 
  362         if ((bp->b_flags & B_BUSY) == 0) {
  363                 panic("bdwrite: buffer is not busy");
  364         }
  365         if (bp->b_flags & B_INVAL) {
  366                 brelse(bp);
  367                 return;
  368         }
  369         if (bp->b_flags & B_TAPE) {
  370                 bawrite(bp);
  371                 return;
  372         }
  373         bp->b_flags &= ~(B_READ|B_RELBUF);
  374         if ((bp->b_flags & B_DELWRI) == 0) {
  375                 bp->b_flags |= B_DONE | B_DELWRI;
  376                 reassignbuf(bp, bp->b_vp);
  377         }
  378 
  379         /*
  380          * This bmap keeps the system from needing to do the bmap later,
  381          * perhaps when the system is attempting to do a sync.  Since it
  382          * is likely that the indirect block -- or whatever other data structure
  383          * that the filesystem needs is still in memory now, it is a good
  384          * thing to do this.  Note also, that if the pageout daemon is
  385          * requesting a sync -- there might not be enough memory to do
  386          * the bmap then...  So, this is important to do.
  387          */
  388         if( bp->b_lblkno == bp->b_blkno) {
  389                 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
  390         }
  391 
  392         /*
  393          * Set the *dirty* buffer range based upon the VM system dirty pages.
  394          */
  395         vfs_setdirty(bp);
  396 
  397         /*
  398          * We need to do this here to satisfy the vnode_pager and the
  399          * pageout daemon, so that it thinks that the pages have been
  400          * "cleaned".  Note that since the pages are in a delayed write
  401          * buffer -- the VFS layer "will" see that the pages get written
  402          * out on the next sync, or perhaps the cluster will be completed.
  403          */
  404         vfs_clean_pages(bp);
  405         bqrelse(bp);
  406         return;
  407 }
  408 
  409 /*
  410  * Asynchronous write.
  411  * Start output on a buffer, but do not wait for it to complete.
  412  * The buffer is released when the output completes.
  413  */
  414 void
  415 bawrite(struct buf * bp)
  416 {
  417         bp->b_flags |= B_ASYNC;
  418         (void) VOP_BWRITE(bp);
  419 }
  420 
  421 /*
  422  * Ordered write.
  423  * Start output on a buffer, but only wait for it to complete if the
  424  * output device cannot guarantee ordering in some other way.  Devices
  425  * that can perform asynchronous ordered writes will set the B_ASYNC
  426  * flag in their strategy routine.
  427  * The buffer is released when the output completes.
  428  */
  429 int
  430 bowrite(struct buf * bp)
  431 {
  432         bp->b_flags |= B_ORDERED;
  433         return (VOP_BWRITE(bp));
  434 }
  435 
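/*
 * Illustrative sketch (hypothetical, not from vfs_bio.c): how a caller
 * chooses among the write variants defined above.  The helper name and its
 * flag arguments are assumptions; exactly one variant is used per buffer.
 */
#if 0
static void
example_release_dirty(struct buf *bp, int delay, int async)
{
        if (delay)
                bdwrite(bp);            /* mark B_DELWRI; I/O happens later  */
        else if (async)
                bawrite(bp);            /* B_ASYNC; released when I/O done   */
        else
                (void) bwrite(bp);      /* synchronous; waits via biowait()  */
        /* bowrite() is like bwrite() but sets B_ORDERED for the driver. */
}
#endif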
  436 /*
  437  * Release a buffer.
  438  */
  439 void
  440 brelse(struct buf * bp)
  441 {
  442         int s;
  443 
  444         if (bp->b_flags & B_CLUSTER) {
  445                 relpbuf(bp);
  446                 return;
  447         }
  448         /* anyone need a "free" block? */
  449         s = splbio();
  450 
  451         /* anyone need this block? */
  452         if (bp->b_flags & B_WANTED) {
  453                 bp->b_flags &= ~(B_WANTED | B_AGE);
  454                 wakeup(bp);
  455         } 
  456 
  457         if (bp->b_flags & B_LOCKED)
  458                 bp->b_flags &= ~B_ERROR;
  459 
  460         if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
  461             (bp->b_bufsize <= 0)) {
  462                 bp->b_flags |= B_INVAL;
  463                 bp->b_flags &= ~(B_DELWRI | B_CACHE);
  464                 if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
  465                         if (bp->b_bufsize)
  466                                 allocbuf(bp, 0);
  467                         brelvp(bp);
  468                 }
  469         }
  470 
  471         /*
  472          * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
  473          * is called with B_DELWRI set, the underlying pages may wind up
  474          * getting freed causing a previous write (bdwrite()) to get 'lost'
  475          * because pages associated with a B_DELWRI bp are marked clean.
  476          *
  477          * We still allow the B_INVAL case to call vfs_vmio_release(), even
  478          * if B_DELWRI is set.
  479          */
  480  
  481         if (bp->b_flags & B_DELWRI)
  482                 bp->b_flags &= ~B_RELBUF; 
  483 
  484         /*
  485          * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
  486          * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
  487          * but the VM object is kept around.  The B_NOCACHE flag is used to
  488          * invalidate the pages in the VM object.
  489          *
  490          * If the buffer is a partially filled NFS buffer, keep it
  491          * since invalidating it now will lose information.  The valid
  492          * flags in the vm_pages have only DEV_BSIZE resolution but
  493          * the b_validoff, b_validend fields have byte resolution.
  494          * This can avoid unnecessary re-reads of the buffer.
  495          */
  496         if ((bp->b_flags & B_VMIO)
  497             && (bp->b_vp->v_tag != VT_NFS
  498                 || bp->b_vp->v_type == VBLK
  499                 || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
  500                 || bp->b_validend == 0
  501                 || (bp->b_validoff == 0
  502                     && bp->b_validend == bp->b_bufsize))) {
  503                 vm_ooffset_t foff;
  504                 vm_object_t obj;
  505                 int i, resid;
  506                 vm_page_t m;
  507                 struct vnode *vp;
  508                 int iototal = bp->b_bufsize;
  509 
  510                 vp = bp->b_vp;
  511                 if (!vp) 
  512                         panic("brelse: missing vp");
  513 
  514                 if (bp->b_npages) {
  515                         vm_pindex_t poff;
  516                         obj = (vm_object_t) vp->v_object;
  517                         if (vp->v_type == VBLK)
  518                                 foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
  519                         else
  520                                 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
  521                         poff = OFF_TO_IDX(foff);
  522                         for (i = 0; i < bp->b_npages; i++) {
  523                                 m = bp->b_pages[i];
  524                                 if (m == bogus_page) {
  525                                         m = vm_page_lookup(obj, poff + i);
  526                                         if (!m) {
  527                                                 panic("brelse: page missing\n");
  528                                         }
  529                                         bp->b_pages[i] = m;
  530                                         pmap_qenter(trunc_page(bp->b_data),
  531                                                 bp->b_pages, bp->b_npages);
  532                                 }
  533                                 resid = IDX_TO_OFF(m->pindex+1) - foff;
  534                                 if (resid > iototal)
  535                                         resid = iototal;
  536                                 if (resid > 0) {
  537                                         /*
  538                                          * Don't invalidate the page if the local machine has already
  539                                          * modified it.  This is the lesser of two evils, and should
  540                                          * be fixed.
  541                                          */
  542                                         if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
  543                                                 vm_page_test_dirty(m);
  544                                                 if (m->dirty == 0) {
  545                                                         vm_page_set_invalid(m, (vm_offset_t) foff, resid);
  546                                                         if (m->valid == 0)
  547                                                                 vm_page_protect(m, VM_PROT_NONE);
  548                                                 }
  549                                         }
  550                                         if (resid >= PAGE_SIZE) {
  551                                                 if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
  552                                                         bp->b_flags |= B_INVAL;
  553                                                 }
  554                                         } else {
  555                                                 if (!vm_page_is_valid(m,
  556                                                         (((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
  557                                                         bp->b_flags |= B_INVAL;
  558                                                 }
  559                                         }
  560                                 }
  561                                 foff += resid;
  562                                 iototal -= resid;
  563                         }
  564                 }
  565                 if (bp->b_flags & (B_INVAL | B_RELBUF))
  566                         vfs_vmio_release(bp);
  567         }
  568         if (bp->b_qindex != QUEUE_NONE)
  569                 panic("brelse: free buffer onto another queue???");
  570 
  571         /* enqueue */
  572         /* buffers with no memory */
  573         if (bp->b_bufsize == 0) {
  574                 bp->b_qindex = QUEUE_EMPTY;
  575                 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
  576                 LIST_REMOVE(bp, b_hash);
  577                 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
  578                 bp->b_dev = NODEV;
  579                 /*
  580                  * Get rid of the kva allocation *now*
  581                  */
  582                 bfreekva(bp);
  583                 if (needsbuffer) {
  584                         wakeup(&needsbuffer);
  585                         needsbuffer=0;
  586                 }
  587                 /* buffers with junk contents */
  588         } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
  589                 bp->b_qindex = QUEUE_AGE;
  590                 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
  591                 LIST_REMOVE(bp, b_hash);
  592                 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
  593                 bp->b_dev = NODEV;
  594                 if (needsbuffer) {
  595                         wakeup(&needsbuffer);
  596                         needsbuffer=0;
  597                 }
  598                 /* buffers that are locked */
  599         } else if (bp->b_flags & B_LOCKED) {
  600                 bp->b_qindex = QUEUE_LOCKED;
  601                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
  602                 /* buffers with stale but valid contents */
  603         } else if (bp->b_flags & B_AGE) {
  604                 bp->b_qindex = QUEUE_AGE;
  605                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
  606                 if (needsbuffer) {
  607                         wakeup(&needsbuffer);
  608                         needsbuffer=0;
  609                 }
  610                 /* buffers with valid and quite potentially reusable contents */
  611         } else {
  612                 bp->b_qindex = QUEUE_LRU;
  613                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
  614                 if (needsbuffer) {
  615                         wakeup(&needsbuffer);
  616                         needsbuffer=0;
  617                 }
  618         }
  619 
  620         /* unlock */
  621         bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
  622                                 B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
  623         splx(s);
  624 }
  625 
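/*
 * Summary note (not from vfs_bio.c): the queue selection performed by
 * brelse() above, restated for reference.
 *
 *      b_bufsize == 0                           -> QUEUE_EMPTY  (head)
 *      B_ERROR|B_INVAL|B_NOCACHE|B_RELBUF set   -> QUEUE_AGE    (head)
 *      B_LOCKED set                             -> QUEUE_LOCKED (tail)
 *      B_AGE set                                -> QUEUE_AGE    (tail)
 *      otherwise                                -> QUEUE_LRU    (tail)
 */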
  626 /*
  627  * Release a buffer.
  628  */
  629 void
  630 bqrelse(struct buf * bp)
  631 {
  632         int s;
  633 
  634         s = splbio();
  635 
  636 
  637         /* anyone need this block? */
  638         if (bp->b_flags & B_WANTED) {
  639                 bp->b_flags &= ~(B_WANTED | B_AGE);
  640                 wakeup(bp);
  641         } 
  642 
  643         if (bp->b_qindex != QUEUE_NONE)
  644                 panic("bqrelse: free buffer onto another queue???");
  645 
  646         if (bp->b_flags & B_LOCKED) {
  647                 bp->b_flags &= ~B_ERROR;
  648                 bp->b_qindex = QUEUE_LOCKED;
  649                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
  650                 /* buffers with stale but valid contents */
  651         } else {
  652                 bp->b_qindex = QUEUE_LRU;
  653                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
  654                 if (needsbuffer) {
  655                         wakeup(&needsbuffer);
  656                         needsbuffer=0;
  657                 }
  658         }
  659 
  660         /* unlock */
  661         bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
  662                 B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
  663         splx(s);
  664 }
  665 
  666 static void
  667 vfs_vmio_release(bp)
  668         struct buf *bp;
  669 {
  670         int i;
  671         vm_page_t m;
  672 
  673         for (i = 0; i < bp->b_npages; i++) {
  674                 m = bp->b_pages[i];
  675                 bp->b_pages[i] = NULL;
  676                 vm_page_unwire(m);
  677                 /*
  678                  * We don't mess with busy pages, it is
  679                  * the responsibility of the process that
  680                  * busied the pages to deal with them.
  681                  */
  682                 if ((m->flags & PG_BUSY) || (m->busy != 0))
  683                         continue;
  684                         
  685                 if (m->wire_count == 0) {
  686 
  687                         if (m->flags & PG_WANTED) {
  688                                 m->flags &= ~PG_WANTED;
  689                                 wakeup(m);
  690                         }
  691 
  692                         /*
  693                          * If this is an async free -- we cannot place
  694                          * pages onto the cache queue, so our policy for
  695                          * such buffers is to avoid the cache queue, and
  696                          * only modify the active queue or free queue.
  697                          */
  698                         if ((bp->b_flags & B_ASYNC) == 0) {
  699 
  700                         /*
  701                          * In the case of sync buffer frees, we can do pretty much
  702                          * anything to any of the memory queues.  Specifically,
  703                          * the cache queue is free to be modified.
  704                          */
  705                                 if (m->valid) {
  706                                         if(m->dirty == 0)
  707                                                 vm_page_test_dirty(m);
  708                                         /*
  709                                          * this keeps pressure off of the process memory
  710                                          */
  711                                         if ((vm_swap_size == 0) ||
  712                                                 (cnt.v_free_count < cnt.v_free_min)) {
  713                                                 if ((m->dirty == 0) &&
  714                                                         (m->hold_count == 0)) 
  715                                                         vm_page_cache(m);
  716                                                 else
  717                                                         vm_page_deactivate(m);
  718                                         }
  719                                 } else if (m->hold_count == 0) {
  720                                         vm_page_protect(m, VM_PROT_NONE);
  721                                         vm_page_free(m);
  722                                 }
  723                         } else {
  724                                 /*
  725                                  * If async, then at least we clear the
  726                                  * act_count.
  727                                  */
  728                                 m->act_count = 0;
  729                         }
  730                 }
  731         }
  732         bufspace -= bp->b_bufsize;
  733         vmiospace -= bp->b_bufsize;
  734         pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
  735         bp->b_npages = 0;
  736         bp->b_bufsize = 0;
  737         bp->b_flags &= ~B_VMIO;
  738         if (bp->b_vp)
  739                 brelvp(bp);
  740 }
  741 
  742 /*
  743  * Check to see if a block is currently memory resident.
  744  */
  745 struct buf *
  746 gbincore(struct vnode * vp, daddr_t blkno)
  747 {
  748         struct buf *bp;
  749         struct bufhashhdr *bh;
  750 
  751         bh = BUFHASH(vp, blkno);
  752         bp = bh->lh_first;
  753 
  754         /* Search hash chain */
  755         while (bp != NULL) {
  756                 /* hit */
  757                 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
  758                     (bp->b_flags & B_INVAL) == 0) {
  759                         break;
  760                 }
  761                 bp = bp->b_hash.le_next;
  762         }
  763         return (bp);
  764 }
  765 
  766 /*
  767  * this routine implements clustered async writes for
  768  * clearing out B_DELWRI buffers...  This is much better
  769  * than the old way of writing only one buffer at a time.
  770  */
  771 int
  772 vfs_bio_awrite(struct buf * bp)
  773 {
  774         int i;
  775         daddr_t lblkno = bp->b_lblkno;
  776         struct vnode *vp = bp->b_vp;
  777         int s;
  778         int ncl;
  779         struct buf *bpa;
  780         int nwritten;
  781 
  782         s = splbio();
  783         /*
  784          * right now we support clustered writing only to regular files
  785          */
  786         if ((vp->v_type == VREG) && 
  787             (vp->v_mount != 0) && /* Only on nodes that have the size info */
  788             (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
  789                 int size;
  790                 int maxcl;
  791 
  792                 size = vp->v_mount->mnt_stat.f_iosize;
  793                 maxcl = MAXPHYS / size;
  794 
  795                 for (i = 1; i < maxcl; i++) {
  796                         if ((bpa = gbincore(vp, lblkno + i)) &&
  797                             ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
  798                             (B_DELWRI | B_CLUSTEROK)) &&
  799                             (bpa->b_bufsize == size)) {
  800                                 if ((bpa->b_blkno == bpa->b_lblkno) ||
  801                                     (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
  802                                         break;
  803                         } else {
  804                                 break;
  805                         }
  806                 }
  807                 ncl = i;
  808                 /*
  809                  * this is a possible cluster write
  810                  */
  811                 if (ncl != 1) {
  812                         nwritten = cluster_wbuild(vp, size, lblkno, ncl);
  813                         splx(s);
  814                         return nwritten;
  815                 }
  816         }
  817         bremfree(bp);
  818         splx(s);
  819         /*
  820          * default (old) behavior, writing out only one block
  821          */
  822         bp->b_flags |= B_BUSY | B_ASYNC;
  823         nwritten = bp->b_bufsize;
  824         (void) VOP_BWRITE(bp);
  825         return nwritten;
  826 }
  827 
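/*
 * Illustrative sketch (hypothetical, not from vfs_bio.c): a worked example
 * of the cluster sizing in vfs_bio_awrite() above.  An 8K filesystem block
 * size and a 64K MAXPHYS are assumed values.
 *
 *      maxcl = MAXPHYS / size = 65536 / 8192 = 8 blocks per cluster.
 *      If blocks lblkno+1 .. lblkno+4 are also B_DELWRI|B_CLUSTEROK, the
 *      same size, and contiguous on disk, the scan stops at i = 5, so
 *      ncl = 5 and cluster_wbuild() issues one 40K write in place of five
 *      separate 8K writes.
 */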
  828 
  829 /*
  830  * Find a buffer header which is available for use.
  831  */
  832 static struct buf *
  833 getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
  834 {
  835         struct buf *bp;
  836         int nbyteswritten = 0;
  837         vm_offset_t addr;
  838 
  839 start:
  840         if (bufspace >= maxbufspace)
  841                 goto trytofreespace;
  842 
  843         /* can we constitute a new buffer? */
  844         if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
  845                 if (bp->b_qindex != QUEUE_EMPTY)
  846                         panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
  847                             bp->b_qindex);
  848                 bp->b_flags |= B_BUSY;
  849                 bremfree(bp);
  850                 goto fillbuf;
  851         }
  852 trytofreespace:
  853         /*
  854          * We keep the file I/O from hogging metadata I/O
  855          * This is desirable because file data is cached in the
  856          * VM/Buffer cache even if a buffer is freed.
  857          */
  858         if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
  859                 if (bp->b_qindex != QUEUE_AGE)
  860                         panic("getnewbuf: inconsistent AGE queue, qindex=%d",
  861                             bp->b_qindex);
  862         } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
  863                 if (bp->b_qindex != QUEUE_LRU)
  864                         panic("getnewbuf: inconsistent LRU queue, qindex=%d",
  865                             bp->b_qindex);
  866         }
  867         if (!bp) {
  868                 /* wait for a free buffer of any kind */
  869                 needsbuffer = 1;
  870                 do
  871                         tsleep(&needsbuffer, (PRIBIO + 1) | slpflag, "newbuf",
  872                             slptimeo);
  873                 while (needsbuffer);
  874                 return (0);
  875         }
  876 
  877 #if defined(DIAGNOSTIC)
  878         if (bp->b_flags & B_BUSY) {
  879                 panic("getnewbuf: busy buffer on free list\n");
  880         }
  881 #endif
  882 
  883         /*
  884          * We are fairly aggressive about freeing VMIO buffers, but since
  885          * the buffering is intact without buffer headers, there is not
  886          * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
  887          */
  888         if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
  889                 if ((bp->b_flags & B_VMIO) == 0 ||
  890                         (vmiospace < maxvmiobufspace)) {
  891                         --bp->b_usecount;
  892                         TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
  893                         if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
  894                                 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
  895                                 goto start;
  896                         }
  897                         TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
  898                 }
  899         }
  900 
  901         /* if we are a delayed write, convert to an async write */
  902         if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
  903                 nbyteswritten += vfs_bio_awrite(bp);
  904                 if (!slpflag && !slptimeo) {
  905                         return (0);
  906                 }
  907                 goto start;
  908         }
  909 
  910         if (bp->b_flags & B_WANTED) {
  911                 bp->b_flags &= ~B_WANTED;
  912                 wakeup(bp);
  913         }
  914         bremfree(bp);
  915         bp->b_flags |= B_BUSY;
  916 
  917         if (bp->b_flags & B_VMIO) {
  918                 bp->b_flags &= ~B_ASYNC;
  919                 vfs_vmio_release(bp);
  920         }
  921 
  922         if (bp->b_vp)
  923                 brelvp(bp);
  924 
  925 fillbuf:
  926         /* we are not free, nor do we contain interesting data */
  927         if (bp->b_rcred != NOCRED) {
  928                 crfree(bp->b_rcred);
  929                 bp->b_rcred = NOCRED;
  930         }
  931         if (bp->b_wcred != NOCRED) {
  932                 crfree(bp->b_wcred);
  933                 bp->b_wcred = NOCRED;
  934         }
  935 
  936         LIST_REMOVE(bp, b_hash);
  937         LIST_INSERT_HEAD(&invalhash, bp, b_hash);
  938         if (bp->b_bufsize) {
  939                 allocbuf(bp, 0);
  940         }
  941         bp->b_flags = B_BUSY;
  942         bp->b_dev = NODEV;
  943         bp->b_vp = NULL;
  944         bp->b_blkno = bp->b_lblkno = 0;
  945         bp->b_iodone = 0;
  946         bp->b_error = 0;
  947         bp->b_resid = 0;
  948         bp->b_bcount = 0;
  949         bp->b_npages = 0;
  950         bp->b_dirtyoff = bp->b_dirtyend = 0;
  951         bp->b_validoff = bp->b_validend = 0;
  952         bp->b_usecount = 4;
  953 
  954         maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
  955 
  956         /*
  957          * we assume that buffer_map is not at address 0
  958          */
  959         addr = 0;
  960         if (maxsize != bp->b_kvasize) {
  961                 bfreekva(bp);
  962                 
  963                 /*
  964                  * See if we have buffer kva space
  965                  */
  966                 if (vm_map_findspace(buffer_map,
  967                         vm_map_min(buffer_map), maxsize, &addr)) {
  968                         bp->b_flags |= B_INVAL;
  969                         brelse(bp);
  970                         goto trytofreespace;
  971                 }
  972         }
  973 
  974         /*
  975          * See if we have exceeded our buffer space limit
  976          */
  977         if (bufspace >= (maxbufspace + nbyteswritten)) {
  978                 bp->b_flags |= B_INVAL;
  979                 brelse(bp);
  980                 goto trytofreespace;
  981         }
  982 
  983         /*
  984          * create a map entry for the buffer -- in essence
  985          * reserving the kva space.
  986          */
  987         if (addr) {
  988                 vm_map_insert(buffer_map, NULL, 0,
  989                         addr, addr + maxsize,
  990                         VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
  991 
  992                 bp->b_kvabase = (caddr_t) addr;
  993                 bp->b_kvasize = maxsize;
  994         }
  995         bp->b_data = bp->b_kvabase;
  996         
  997         return (bp);
  998 }
  999 
 1000 /*
 1001  * Check to see if a block is currently memory resident.
 1002  */
 1003 struct buf *
 1004 incore(struct vnode * vp, daddr_t blkno)
 1005 {
 1006         struct buf *bp;
 1007 
 1008         int s = splbio();
 1009         bp = gbincore(vp, blkno);
 1010         splx(s);
 1011         return (bp);
 1012 }
 1013 
 1014 /*
 1015  * Returns true if no I/O is needed to access the
 1016  * associated VM object.  This is like incore except
 1017  * it also hunts around in the VM system for the data.
 1018  */
 1019 
 1020 int
 1021 inmem(struct vnode * vp, daddr_t blkno)
 1022 {
 1023         vm_object_t obj;
 1024         vm_offset_t toff, tinc;
 1025         vm_page_t m;
 1026         vm_ooffset_t off;
 1027 
 1028         if (incore(vp, blkno))
 1029                 return 1;
 1030         if (vp->v_mount == NULL)
 1031                 return 0;
 1032         if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
 1033                 return 0;
 1034 
 1035         obj = vp->v_object;
 1036         tinc = PAGE_SIZE;
 1037         if (tinc > vp->v_mount->mnt_stat.f_iosize)
 1038                 tinc = vp->v_mount->mnt_stat.f_iosize;
 1039         off = blkno * vp->v_mount->mnt_stat.f_iosize;
 1040 
 1041         for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 1042 
 1043                 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
 1044                 if (!m)
 1045                         return 0;
 1046                 if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
 1047                         return 0;
 1048         }
 1049         return 1;
 1050 }
 1051 
 1052 /*
 1053  * now we set the dirty range for the buffer --
 1054  * for NFS -- if the file is mapped and pages have
 1055  * been written to, let it know.  We want the
 1056  * entire range of the buffer to be marked dirty if
 1057  * any of the pages have been written to for consistency
 1058  * with the b_validoff, b_validend set in the nfs write
 1059  * code, and used by the nfs read code.
 1060  */
 1061 static void
 1062 vfs_setdirty(struct buf *bp) {
 1063         int i;
 1064         vm_object_t object;
 1065         vm_offset_t boffset, offset;
 1066         /*
 1067          * We qualify the scan for modified pages on whether the
 1068          * object has been flushed yet.  The OBJ_WRITEABLE flag
 1069          * is not cleared simply by protecting pages off.
 1070          */
 1071         if ((bp->b_flags & B_VMIO) &&
 1072                 ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
 1073                 /*
 1074                  * test the pages to see if they have been modified directly
 1075                  * by users through the VM system.
 1076                  */
 1077                 for (i = 0; i < bp->b_npages; i++)
 1078                         vm_page_test_dirty(bp->b_pages[i]);
 1079 
 1080                 /*
 1081                  * scan forwards for the first page modified
 1082                  */
 1083                 for (i = 0; i < bp->b_npages; i++) {
 1084                         if (bp->b_pages[i]->dirty) {
 1085                                 break;
 1086                         }
 1087                 }
 1088                 boffset = (i << PAGE_SHIFT);
 1089                 if (boffset < bp->b_dirtyoff) {
 1090                         bp->b_dirtyoff = boffset;
 1091                 }
 1092 
 1093                 /*
 1094                  * scan backwards for the last page modified
 1095                  */
 1096                 for (i = bp->b_npages - 1; i >= 0; --i) {
 1097                         if (bp->b_pages[i]->dirty) {
 1098                                 break;
 1099                         }
 1100                 }
 1101                 boffset = (i + 1);
 1102                 offset = boffset + bp->b_pages[0]->pindex;
 1103                 if (offset >= object->size)
 1104                         boffset = object->size - bp->b_pages[0]->pindex;
 1105                 if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
 1106                         bp->b_dirtyend = (boffset << PAGE_SHIFT);
 1107         }
 1108 }
 1109 
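/*
 * Illustrative sketch (hypothetical, not from vfs_bio.c): a worked example
 * of the dirty-range computation in vfs_setdirty() above, assuming a 4K
 * page size (PAGE_SHIFT = 12).
 *
 *      A VMIO buffer with b_npages = 2 where only page index 1 has been
 *      modified through the VM system:
 *        first dirty page = 1 -> b_dirtyoff lowered to 1 << PAGE_SHIFT = 4096
 *                                if it was larger
 *        last dirty page  = 1 -> b_dirtyend raised to (1+1) << PAGE_SHIFT = 8192
 *                                if it was smaller
 *      so the dirty range is widened to cover at least [4096, 8192).
 */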
 1110 /*
 1111  * Get a block given a specified block and offset into a file/device.
 1112  */
 1113 struct buf *
 1114 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
 1115 {
 1116         struct buf *bp;
 1117         int s;
 1118         struct bufhashhdr *bh;
 1119         int maxsize;
 1120 
 1121         if (vp->v_mount) {
 1122                 maxsize = vp->v_mount->mnt_stat.f_iosize;
 1123                 /*
 1124                  * This happens on mount points.
 1125                  */
 1126                 if (maxsize < size)
 1127                         maxsize = size;
 1128         } else {
 1129                 maxsize = size;
 1130         }
 1131 
 1132         if (size > MAXBSIZE)
 1133                 panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
 1134 
 1135         s = splbio();
 1136 loop:
 1137         if ((bp = gbincore(vp, blkno))) {
 1138                 if (bp->b_flags & B_BUSY) {
 1139                         bp->b_flags |= B_WANTED;
 1140                         if (bp->b_usecount < BUF_MAXUSE)
 1141                                 ++bp->b_usecount;
 1142                         if (!tsleep(bp,
 1143                                 (PRIBIO + 1) | slpflag, "getblk", slptimeo))
 1144                                 goto loop;
 1145 
 1146                         splx(s);
 1147                         return (struct buf *) NULL;
 1148                 }
 1149                 bp->b_flags |= B_BUSY | B_CACHE;
 1150                 bremfree(bp);
 1151                                 
 1152                 /*
 1153                  * check for size inconsistencies (note that they shouldn't happen
 1154                  * but do when filesystems don't handle the size changes correctly.)
 1155                  * We are conservative on metadata and don't just extend the buffer
 1156                  * but write and re-constitute it.
 1157                  */
 1158 
 1159                 if (bp->b_bcount != size) {
 1160                         if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
 1161                                 allocbuf(bp, size);
 1162                         } else {
 1163                                 bp->b_flags |= B_NOCACHE;
 1164                                 VOP_BWRITE(bp);
 1165                                 goto loop;
 1166                         }
 1167                 }
 1168 
 1169                 if (bp->b_usecount < BUF_MAXUSE)
 1170                         ++bp->b_usecount;
 1171                 splx(s);
 1172                 return (bp);
 1173         } else {
 1174                 vm_object_t obj;
 1175 
 1176                 if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == 0) {
 1177                         if (slpflag || slptimeo) {
 1178                                 splx(s);
 1179                                 return NULL;
 1180                         }
 1181                         goto loop;
 1182                 }
 1183 
 1184                 /*
 1185                  * This code is used to make sure that a buffer is not
 1186                  * created while the getnewbuf routine is blocked.
 1187                  * This can be a problem whether the vnode is locked or not.
 1188                  */
 1189                 if (gbincore(vp, blkno)) {
 1190                         bp->b_flags |= B_INVAL;
 1191                         brelse(bp);
 1192                         goto loop;
 1193                 }
 1194 
 1195                 /*
 1196                  * Insert the buffer into the hash, so that it can
 1197                  * be found by incore.
 1198                  */
 1199                 bp->b_blkno = bp->b_lblkno = blkno;
 1200                 bgetvp(vp, bp);
 1201                 LIST_REMOVE(bp, b_hash);
 1202                 bh = BUFHASH(vp, blkno);
 1203                 LIST_INSERT_HEAD(bh, bp, b_hash);
 1204 
 1205                 if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
 1206                         bp->b_flags |= (B_VMIO | B_CACHE);
 1207 #if defined(VFS_BIO_DEBUG)
 1208                         if (vp->v_type != VREG && vp->v_type != VBLK)
 1209                                 printf("getblk: vmioing file type %d???\n", vp->v_type);
 1210 #endif
 1211                 } else {
 1212                         bp->b_flags &= ~B_VMIO;
 1213                 }
 1214                 splx(s);
 1215 
 1216                 allocbuf(bp, size);
 1217 #ifdef  PC98
 1218                 /*
 1219                  * 1024byte/sector support
 1220                  */
 1221 #define B_XXX2 0x8000000
 1222                 if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
 1223 #endif
 1224                 return (bp);
 1225         }
 1226 }
 1227 
 1228 /*
 1229  * Get an empty, disassociated buffer of given size.
 1230  */
 1231 struct buf *
 1232 geteblk(int size)
 1233 {
 1234         struct buf *bp;
 1235         int s;
 1236 
 1237         s = splbio();
 1238         while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
 1239         splx(s);
 1240         allocbuf(bp, size);
 1241         bp->b_flags |= B_INVAL;
 1242         return (bp);
 1243 }
 1244 
 1245 
 1246 /*
 1247  * This code constitutes the buffer memory from either anonymous system
 1248  * memory (in the case of non-VMIO operations) or from an associated
 1249  * VM object (in the case of VMIO operations).
 1250  *
 1251  * Note that this code is tricky, and has many complications to resolve
 1252  * deadlock or inconsistent data situations.  Tread lightly!!!
 1253  *
 1254  * Modify the length of a buffer's underlying buffer storage without
 1255  * destroying information (unless, of course the buffer is shrinking).
 1256  */
 1257 int
 1258 allocbuf(struct buf * bp, int size)
 1259 {
 1260 
 1261         int s;
 1262         int newbsize, mbsize;
 1263         int i;
 1264 
 1265         if (!(bp->b_flags & B_BUSY))
 1266                 panic("allocbuf: buffer not busy");
 1267 
 1268         if (bp->b_kvasize < size)
 1269                 panic("allocbuf: buffer too small");
 1270 
 1271         if ((bp->b_flags & B_VMIO) == 0) {
 1272                 caddr_t origbuf;
 1273                 int origbufsize;
 1274                 /*
 1275                  * Just get anonymous memory from the kernel
 1276                  */
 1277                 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 1278 #if !defined(NO_B_MALLOC)
 1279                 if (bp->b_flags & B_MALLOC)
 1280                         newbsize = mbsize;
 1281                 else
 1282 #endif
 1283                         newbsize = round_page(size);
 1284 
 1285                 if (newbsize < bp->b_bufsize) {
 1286 #if !defined(NO_B_MALLOC)
 1287                         /*
 1288                          * malloced buffers are not shrunk
 1289                          */
 1290                         if (bp->b_flags & B_MALLOC) {
 1291                                 if (newbsize) {
 1292                                         bp->b_bcount = size;
 1293                                 } else {
 1294                                         free(bp->b_data, M_BIOBUF);
 1295                                         bufspace -= bp->b_bufsize;
 1296                                         bufmallocspace -= bp->b_bufsize;
 1297                                         bp->b_data = bp->b_kvabase;
 1298                                         bp->b_bufsize = 0;
 1299                                         bp->b_bcount = 0;
 1300                                         bp->b_flags &= ~B_MALLOC;
 1301                                 }
 1302                                 return 1;
 1303                         }               
 1304 #endif
 1305                         vm_hold_free_pages(
 1306                             bp,
 1307                             (vm_offset_t) bp->b_data + newbsize,
 1308                             (vm_offset_t) bp->b_data + bp->b_bufsize);
 1309                 } else if (newbsize > bp->b_bufsize) {
 1310 #if !defined(NO_B_MALLOC)
 1311                         /*
 1312                          * We only use malloced memory on the first allocation,
 1313                          * and revert to page-allocated memory when the buffer grows.
 1314                          */
 1315                         if ( (bufmallocspace < maxbufmallocspace) &&
 1316                                 (bp->b_bufsize == 0) &&
 1317                                 (mbsize <= PAGE_SIZE/2)) {
 1318 
 1319                                 bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
 1320                                 bp->b_bufsize = mbsize;
 1321                                 bp->b_bcount = size;
 1322                                 bp->b_flags |= B_MALLOC;
 1323                                 bufspace += mbsize;
 1324                                 bufmallocspace += mbsize;
 1325                                 return 1;
 1326                         }
 1327 #endif
 1328                         origbuf = NULL;
 1329                         origbufsize = 0;
 1330 #if !defined(NO_B_MALLOC)
 1331                         /*
 1332                          * If the buffer is growing on its other-than-first allocation,
 1333                          * then we revert to the page-allocation scheme.
 1334                          */
 1335                         if (bp->b_flags & B_MALLOC) {
 1336                                 origbuf = bp->b_data;
 1337                                 origbufsize = bp->b_bufsize;
 1338                                 bp->b_data = bp->b_kvabase;
 1339                                 bufspace -= bp->b_bufsize;
 1340                                 bufmallocspace -= bp->b_bufsize;
 1341                                 bp->b_bufsize = 0;
 1342                                 bp->b_flags &= ~B_MALLOC;
 1343                                 newbsize = round_page(newbsize);
 1344                         }
 1345 #endif
 1346                         vm_hold_load_pages(
 1347                             bp,
 1348                             (vm_offset_t) bp->b_data + bp->b_bufsize,
 1349                             (vm_offset_t) bp->b_data + newbsize);
 1350 #if !defined(NO_B_MALLOC)
 1351                         if (origbuf) {
 1352                                 bcopy(origbuf, bp->b_data, origbufsize);
 1353                                 free(origbuf, M_BIOBUF);
 1354                         }
 1355 #endif
 1356                 }
 1357         } else {
 1358                 vm_page_t m;
 1359                 int desiredpages;
 1360 
 1361                 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 1362                 desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
 1363 
 1364 #if !defined(NO_B_MALLOC)
 1365                 if (bp->b_flags & B_MALLOC)
 1366                         panic("allocbuf: VMIO buffer can't be malloced");
 1367 #endif
 1368 
 1369                 if (newbsize < bp->b_bufsize) {
 1370                         if (desiredpages < bp->b_npages) {
 1371                                 for (i = desiredpages; i < bp->b_npages; i++) {
 1372                                         /*
 1373                                          * the page is not freed here -- it
 1374                                          * is the responsibility of vnode_pager_setsize
 1375                                          */
 1376                                         m = bp->b_pages[i];
 1377 #if defined(DIAGNOSTIC)
 1378                                         if (m == bogus_page)
 1379                                                 panic("allocbuf: bogus page found");
 1380 #endif
 1381                                         s = splvm();
 1382                                         while ((m->flags & PG_BUSY) || (m->busy != 0)) {
 1383                                                 m->flags |= PG_WANTED;
 1384                                                 tsleep(m, PVM, "biodep", 0);
 1385                                         }
 1386                                         splx(s);
 1387 
 1388                                         bp->b_pages[i] = NULL;
 1389                                         vm_page_unwire(m);
 1390                                 }
 1391                                 pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
 1392                                     (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
 1393                                 bp->b_npages = desiredpages;
 1394                         }
 1395                 } else if (newbsize > bp->b_bufsize) {
 1396                         vm_object_t obj;
 1397                         vm_offset_t tinc, toff;
 1398                         vm_ooffset_t off;
 1399                         vm_pindex_t objoff;
 1400                         int pageindex, curbpnpages;
 1401                         struct vnode *vp;
 1402                         int bsize;
 1403 
 1404                         vp = bp->b_vp;
 1405 
 1406                         if (vp->v_type == VBLK)
 1407                                 bsize = DEV_BSIZE;
 1408                         else
 1409                                 bsize = vp->v_mount->mnt_stat.f_iosize;
 1410 
 1411                         if (bp->b_npages < desiredpages) {
 1412                                 obj = vp->v_object;
 1413                                 tinc = PAGE_SIZE;
 1414                                 if (tinc > bsize)
 1415                                         tinc = bsize;
 1416                                 off = (vm_ooffset_t) bp->b_lblkno * bsize;
 1417                                 curbpnpages = bp->b_npages;
 1418                 doretry:
 1419                                 bp->b_flags |= B_CACHE;
 1420                                 bp->b_validoff = bp->b_validend = 0;
 1421                                 for (toff = 0; toff < newbsize; toff += tinc) {
 1422                                         int bytesinpage;
 1423 
 1424                                         pageindex = toff >> PAGE_SHIFT;
 1425                                         objoff = OFF_TO_IDX(off + toff);
 1426                                         if (pageindex < curbpnpages) {
 1427 
 1428                                                 m = bp->b_pages[pageindex];
 1429 #ifdef VFS_BIO_DIAG
 1430                                                 if (m->pindex != objoff)
 1431                                                         panic("allocbuf: page changed offset??!!!?");
 1432 #endif
 1433                                                 bytesinpage = tinc;
 1434                                                 if (tinc > (newbsize - toff))
 1435                                                         bytesinpage = newbsize - toff;
 1436                                                 if (bp->b_flags & B_CACHE)
 1437                                                         vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
 1438                                                 continue;
 1439                                         }
 1440                                         m = vm_page_lookup(obj, objoff);
 1441                                         if (!m) {
 1442                                                 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
 1443                                                 if (!m) {
 1444                                                         VM_WAIT;
 1445                                                         goto doretry;
 1446                                                 }
 1447                                                 /*
 1448                                                  * Normally it is unwise to clear PG_BUSY without
 1449                                                  * PAGE_WAKEUP -- but it is okay here, as there is
 1450                                                  * no chance for blocking between here and vm_page_alloc
 1451                                                  */
 1452                                                 m->flags &= ~PG_BUSY;
 1453                                                 vm_page_wire(m);
 1454                                                 bp->b_flags &= ~B_CACHE;
 1455                                         } else if (m->flags & PG_BUSY) {
 1456                                                 s = splvm();
 1457                                                 if (m->flags & PG_BUSY) {
 1458                                                         m->flags |= PG_WANTED;
 1459                                                         tsleep(m, PVM, "pgtblk", 0);
 1460                                                 }
 1461                                                 splx(s);
 1462                                                 goto doretry;
 1463                                         } else {
 1464                                                 if ((curproc != pageproc) &&
 1465                                                         ((m->queue - m->pc) == PQ_CACHE) &&
 1466                                                     ((cnt.v_free_count + cnt.v_cache_count) <
 1467                                                                 (cnt.v_free_min + cnt.v_cache_min))) {
 1468                                                         pagedaemon_wakeup();
 1469                                                 }
 1470                                                 bytesinpage = tinc;
 1471                                                 if (tinc > (newbsize - toff))
 1472                                                         bytesinpage = newbsize - toff;
 1473                                                 if (bp->b_flags & B_CACHE)
 1474                                                         vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
 1475                                                 vm_page_wire(m);
 1476                                         }
 1477                                         bp->b_pages[pageindex] = m;
 1478                                         curbpnpages = pageindex + 1;
 1479                                 }
 1480                                 if (vp->v_tag == VT_NFS &&
 1481                                     vp->v_type != VBLK && 
 1482                                     bp->b_validend == 0)
 1483                                         bp->b_flags &= ~B_CACHE;
 1484                                 bp->b_data = (caddr_t) trunc_page(bp->b_data);
 1485                                 bp->b_npages = curbpnpages;
 1486                                 pmap_qenter((vm_offset_t) bp->b_data,
 1487                                         bp->b_pages, bp->b_npages);
 1488                                 ((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
 1489                         }
 1490                 }
 1491         }
 1492         if (bp->b_flags & B_VMIO)
 1493                 vmiospace += bp->b_bufsize;
 1494         bufspace += (newbsize - bp->b_bufsize);
 1495         bp->b_bufsize = newbsize;
 1496         bp->b_bcount = size;
 1497         return 1;
 1498 }
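/*
 * Illustrative sketch, not part of the original source: the B_MALLOC path in
 * allocbuf() above uses malloc(9)-backed storage only for a buffer's very
 * first, small allocation (at most half a page) and only while the global
 * bufmallocspace budget has room; any growth reverts the buffer to
 * page-backed KVA.  A simplified model of that decision, with hypothetical
 * names:
 */
static int
use_malloc_backing(size_t cur_bufsize, size_t want,
    size_t malloc_in_use, size_t malloc_limit)
{
        /* first allocation only, small request, and under the global cap */
        return (cur_bufsize == 0 &&
            want <= PAGE_SIZE / 2 &&
            malloc_in_use < malloc_limit);
}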
 1499 
 1500 /*
 1501  * Wait for buffer I/O completion, returning error status.
 1502  */
 1503 int
 1504 biowait(register struct buf * bp)
 1505 {
 1506         int s;
 1507 
 1508         s = splbio();
 1509         while ((bp->b_flags & B_DONE) == 0)
 1510                 tsleep(bp, PRIBIO, "biowait", 0);
 1511         splx(s);
 1512         if (bp->b_flags & B_EINTR) {
 1513                 bp->b_flags &= ~B_EINTR;
 1514                 return (EINTR);
 1515         }
 1516         if (bp->b_flags & B_ERROR) {
 1517                 return (bp->b_error ? bp->b_error : EIO);
 1518         } else {
 1519                 return (0);
 1520         }
 1521 }
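/*
 * Illustrative sketch, not part of the original source: biowait() is the
 * synchronous half of the strategy/biodone protocol.  A caller that issues
 * its own I/O marks the buffer busy, hands it to the driver's strategy
 * routine, and then sleeps here until biodone() sets B_DONE.  A minimal
 * sketch, with the strategy routine passed in as a hypothetical callback:
 */
static int
read_block_sketch(struct buf *bp, void (*dev_strategy)(struct buf *))
{
        bp->b_flags |= B_READ | B_BUSY;         /* this is a read, buffer is ours */
        (*dev_strategy)(bp);                    /* start the I/O */
        return (biowait(bp));                   /* sleep until biodone(), return status */
}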
 1522 
 1523 /*
 1524  * Finish I/O on a buffer, calling an optional function.
 1525  * This is usually called from interrupt level, so process blocking
 1526  * is not *a good idea*.
 1527  */
 1528 void
 1529 biodone(register struct buf * bp)
 1530 {
 1531         int s;
 1532 
 1533         s = splbio();
 1534         if (!(bp->b_flags & B_BUSY))
 1535                 panic("biodone: buffer not busy");
 1536 
 1537         if (bp->b_flags & B_DONE) {
 1538                 splx(s);
 1539                 printf("biodone: buffer already done\n");
 1540                 return;
 1541         }
 1542         bp->b_flags |= B_DONE;
 1543 
 1544         if ((bp->b_flags & B_READ) == 0) {
 1545                 vwakeup(bp);
 1546         }
 1547 #ifdef BOUNCE_BUFFERS
 1548         if (bp->b_flags & B_BOUNCE)
 1549                 vm_bounce_free(bp);
 1550 #endif
 1551 
 1552         /* call optional completion function if requested */
 1553         if (bp->b_flags & B_CALL) {
 1554                 bp->b_flags &= ~B_CALL;
 1555                 (*bp->b_iodone) (bp);
 1556                 splx(s);
 1557                 return;
 1558         }
 1559         if (bp->b_flags & B_VMIO) {
 1560                 int i, resid;
 1561                 vm_ooffset_t foff;
 1562                 vm_page_t m;
 1563                 vm_object_t obj;
 1564                 int iosize;
 1565                 struct vnode *vp = bp->b_vp;
 1566 
 1567                 if (vp->v_type == VBLK)
 1568                         foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
 1569                 else
 1570                         foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
 1571                 obj = vp->v_object;
 1572                 if (!obj) {
 1573                         panic("biodone: no object");
 1574                 }
 1575 #if defined(VFS_BIO_DEBUG)
 1576                 if (obj->paging_in_progress < bp->b_npages) {
 1577                         printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
 1578                             obj->paging_in_progress, bp->b_npages);
 1579                 }
 1580 #endif
 1581                 iosize = bp->b_bufsize;
 1582                 for (i = 0; i < bp->b_npages; i++) {
 1583                         int bogusflag = 0;
 1584                         m = bp->b_pages[i];
 1585                         if (m == bogus_page) {
 1586                                 bogusflag = 1;
 1587                                 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 1588                                 if (!m) {
 1589 #if defined(VFS_BIO_DEBUG)
 1590                                         printf("biodone: page disappeared\n");
 1591 #endif
 1592                                         --obj->paging_in_progress;
 1593                                         continue;
 1594                                 }
 1595                                 bp->b_pages[i] = m;
 1596                                 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
 1597                         }
 1598 #if defined(VFS_BIO_DEBUG)
 1599                         if (OFF_TO_IDX(foff) != m->pindex) {
 1600                                 printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
 1601                         }
 1602 #endif
 1603                         resid = IDX_TO_OFF(m->pindex + 1) - foff;
 1604                         if (resid > iosize)
 1605                                 resid = iosize;
 1606                         /*
 1607                          * In the write case, the valid and clean bits are
 1608                          * already changed correctly, so we only need to do this
 1609                          * here in the read case.
 1610                          */
 1611                         if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
 1612                                 vfs_page_set_valid(bp, foff, i, m);
 1613                         }
 1614 
 1615                         /*
 1616                          * when debugging new filesystems or buffer I/O methods, this
 1617                          * is the most common error that pops up.  if you see this, you
 1618                          * have not set the page busy flag correctly!!!
 1619                          */
 1620                         if (m->busy == 0) {
 1621                                 printf("biodone: page busy < 0, "
 1622                                     "pindex: %d, foff: 0x(%x,%x), "
 1623                                     "resid: %d, index: %d\n",
 1624                                     (int) m->pindex, (int)(foff >> 32),
 1625                                                 (int) foff & 0xffffffff, resid, i);
 1626                                 if (vp->v_type != VBLK)
 1627                                         printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
 1628                                             bp->b_vp->v_mount->mnt_stat.f_iosize,
 1629                                             (int) bp->b_lblkno,
 1630                                             bp->b_flags, bp->b_npages);
 1631                                 else
 1632                                         printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
 1633                                             (int) bp->b_lblkno,
 1634                                             bp->b_flags, bp->b_npages);
 1635                                 printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
 1636                                     m->valid, m->dirty, m->wire_count);
 1637                                 panic("biodone: page busy < 0\n");
 1638                         }
 1639                         --m->busy;
 1640                         if ((m->busy == 0) && (m->flags & PG_WANTED)) {
 1641                                 m->flags &= ~PG_WANTED;
 1642                                 wakeup(m);
 1643                         }
 1644                         --obj->paging_in_progress;
 1645                         foff += resid;
 1646                         iosize -= resid;
 1647                 }
 1648                 if (obj && obj->paging_in_progress == 0 &&
 1649                     (obj->flags & OBJ_PIPWNT)) {
 1650                         obj->flags &= ~OBJ_PIPWNT;
 1651                         wakeup(obj);
 1652                 }
 1653         }
 1654         /*
 1655          * For asynchronous completions, release the buffer now. The brelse
 1656          * checks for B_WANTED and will do the wakeup there if necessary - so
 1657          * no need to do a wakeup here in the async case.
 1658          */
 1659 
 1660         if (bp->b_flags & B_ASYNC) {
 1661                 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
 1662                         brelse(bp);
 1663                 else
 1664                         bqrelse(bp);
 1665         } else {
 1666                 bp->b_flags &= ~B_WANTED;
 1667                 wakeup(bp);
 1668         }
 1669         splx(s);
 1670 }
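/*
 * Illustrative sketch, not part of the original source: asynchronous
 * consumers can ask biodone() to call them back instead of waking a sleeper
 * by setting B_CALL and b_iodone before starting the I/O.  biodone() clears
 * B_CALL and invokes the function at splbio(), typically from interrupt
 * context, so the callback must not sleep and is responsible for releasing
 * the buffer itself.  A minimal sketch with a hypothetical handler:
 */
static void
my_iodone(struct buf *bp)
{
        if (bp->b_flags & B_ERROR)
                printf("async I/O failed: %d\n", bp->b_error);
        brelse(bp);                     /* the B_CALL path skips the release in biodone() */
}
/*
 * Before issuing the I/O:
 *      bp->b_iodone = my_iodone;
 *      bp->b_flags |= B_CALL;
 */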
 1671 
 1672 int
 1673 count_lock_queue()
 1674 {
 1675         int count;
 1676         struct buf *bp;
 1677 
 1678         count = 0;
 1679         for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
 1680             bp != NULL;
 1681             bp = TAILQ_NEXT(bp, b_freelist))
 1682                 count++;
 1683         return (count);
 1684 }
 1685 
 1686 int vfs_update_interval = 30;
 1687 
 1688 static void
 1689 vfs_update()
 1690 {
 1691         while (1) {
 1692                 tsleep(&vfs_update_wakeup, PUSER, "update",
 1693                     hz * vfs_update_interval);
 1694                 vfs_update_wakeup = 0;
 1695                 sync(curproc, NULL, NULL);
 1696         }
 1697 }
 1698 
 1699 static int
 1700 sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
 1701 {
 1702         int error = sysctl_handle_int(oidp,
 1703                 oidp->oid_arg1, oidp->oid_arg2, req);
 1704         if (!error)
 1705                 wakeup(&vfs_update_wakeup);
 1706         return error;
 1707 }
 1708 
 1709 SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
 1710         &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
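/*
 * Illustrative sketch, not part of the original source: the handler above
 * stores the new interval and then wakes the syncer so the change takes
 * effect immediately rather than after the old timeout expires.  From
 * userland the knob can be read or set with sysctl(3); this sketch assumes
 * KERN_UPDATEINTERVAL is exported through <sys/sysctl.h>:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int mib[2] = { CTL_KERN, KERN_UPDATEINTERVAL };
        int newval = 10;                        /* sync every 10 seconds */
        int oldval;
        size_t oldlen = sizeof(oldval);

        if (sysctl(mib, 2, &oldval, &oldlen, &newval, sizeof(newval)) == -1) {
                perror("sysctl");
                return (1);
        }
        printf("update interval: %d -> %d seconds\n", oldval, newval);
        return (0);
}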
 1711 
 1712 
 1713 /*
 1714  * This routine is called in lieu of iodone in the case of
 1715  * incomplete I/O.  This keeps the busy status for pages
 1716  * consistent.
 1717  */
 1718 void
 1719 vfs_unbusy_pages(struct buf * bp)
 1720 {
 1721         int i;
 1722 
 1723         if (bp->b_flags & B_VMIO) {
 1724                 struct vnode *vp = bp->b_vp;
 1725                 vm_object_t obj = vp->v_object;
 1726                 vm_ooffset_t foff;
 1727 
 1728                 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
 1729 
 1730                 for (i = 0; i < bp->b_npages; i++) {
 1731                         vm_page_t m = bp->b_pages[i];
 1732 
 1733                         if (m == bogus_page) {
 1734                                 m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
 1735                                 if (!m) {
 1736                                         panic("vfs_unbusy_pages: page missing\n");
 1737                                 }
 1738                                 bp->b_pages[i] = m;
 1739                                 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
 1740                         }
 1741                         --obj->paging_in_progress;
 1742                         --m->busy;
 1743                         if ((m->busy == 0) && (m->flags & PG_WANTED)) {
 1744                                 m->flags &= ~PG_WANTED;
 1745                                 wakeup(m);
 1746                         }
 1747                 }
 1748                 if (obj->paging_in_progress == 0 &&
 1749                     (obj->flags & OBJ_PIPWNT)) {
 1750                         obj->flags &= ~OBJ_PIPWNT;
 1751                         wakeup(obj);
 1752                 }
 1753         }
 1754 }
 1755 
 1756 /*
 1757  * Set NFS' b_validoff and b_validend fields from the valid bits
 1758  * of a page.  If the consumer is not NFS, and the page is not
 1759  * valid for the entire range, clear the B_CACHE flag to force
 1760  * the consumer to re-read the page.
 1761  */
 1762 static void
 1763 vfs_buf_set_valid(struct buf *bp,
 1764                   vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
 1765                   vm_page_t m)
 1766 {
 1767         if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
 1768                 vm_offset_t svalid, evalid;
 1769                 int validbits = m->valid;
 1770 
 1771                 /*
 1772                  * This only bothers with the first valid range in the
 1773                  * page.
 1774                  */
 1775                 svalid = off;
 1776                 while (validbits && !(validbits & 1)) {
 1777                         svalid += DEV_BSIZE;
 1778                         validbits >>= 1;
 1779                 }
 1780                 evalid = svalid;
 1781                 while (validbits & 1) {
 1782                         evalid += DEV_BSIZE;
 1783                         validbits >>= 1;
 1784                 }
 1785                 /*
 1786                  * Make sure this range is contiguous with the range
 1787                  * built up from previous pages.  If not, then we will
 1788                  * just use the range from the previous pages.
 1789                  */
 1790                 if (svalid == bp->b_validend) {
 1791                         bp->b_validoff = min(bp->b_validoff, svalid);
 1792                         bp->b_validend = max(bp->b_validend, evalid);
 1793                 }
 1794         } else if (!vm_page_is_valid(m,
 1795                                      (vm_offset_t) ((foff + off) & PAGE_MASK),
 1796                                      size)) {
 1797                 bp->b_flags &= ~B_CACHE;
 1798         }
 1799 }
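/*
 * Illustrative sketch, not part of the original source: a page's valid mask
 * carries one bit per DEV_BSIZE chunk, so the scan above converts the first
 * run of set bits into byte offsets.  For example, with DEV_BSIZE == 512 and
 * m->valid == 0x3c (binary 111100), chunks 2..5 are valid, so svalid ends up
 * at off + 2*512 and evalid at off + 6*512.  A standalone version of the
 * scan, with hypothetical names:
 */
static void
first_valid_run(int validbits, vm_offset_t off, vm_offset_t blksize,
    vm_offset_t *svalid, vm_offset_t *evalid)
{
        vm_offset_t s = off;

        while (validbits && !(validbits & 1)) { /* skip leading invalid chunks */
                s += blksize;
                validbits >>= 1;
        }
        *svalid = s;
        while (validbits & 1) {                 /* extend across the valid run */
                s += blksize;
                validbits >>= 1;
        }
        *evalid = s;
}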
 1800 
 1801 /*
 1802  * Set the valid bits in a page, taking care of the b_validoff,
 1803  * b_validend fields which NFS uses to optimise small reads.  Off is
 1804  * the offset within the file and pageno is the page index within the buf.
 1805  */
 1806 static void
 1807 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
 1808 {
 1809         struct vnode *vp = bp->b_vp;
 1810         vm_ooffset_t soff, eoff;
 1811 
 1812         soff = off;
 1813         eoff = off + min(PAGE_SIZE, bp->b_bufsize);
 1814         vm_page_set_invalid(m,
 1815                             (vm_offset_t) (soff & PAGE_MASK),
 1816                             (vm_offset_t) (eoff - soff));
 1817         if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
 1818                 vm_ooffset_t sv, ev;
 1819                 off = off - pageno * PAGE_SIZE;
 1820                 sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
 1821                 ev = off + ((bp->b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
 1822                 soff = max(sv, soff);
 1823                 eoff = min(ev, eoff);
 1824         }
 1825         if (eoff > soff)
 1826                 vm_page_set_validclean(m,
 1827                                        (vm_offset_t) (soff & PAGE_MASK),
 1828                                        (vm_offset_t) (eoff - soff));
 1829 }
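/*
 * Illustrative sketch, not part of the original source: the sv/ev
 * computation above rounds b_validoff and b_validend up to the next
 * DEV_BSIZE boundary before clipping against the page, because the page
 * valid bits only have DEV_BSIZE granularity.  The rounding idiom, which
 * requires a power-of-two block size:
 */
static vm_offset_t
roundup_block(vm_offset_t x, vm_offset_t bsize)
{
        return ((x + bsize - 1) & ~(bsize - 1));
}
/* roundup_block(100, 512) == 512, roundup_block(1024, 512) == 1024 */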
 1830 
 1831 /*
 1832  * This routine is called before a device strategy routine.
 1833  * It is used to tell the VM system that paging I/O is in
 1834  * progress, and treat the pages associated with the buffer
 1835  * almost as being PG_BUSY.  Also the object's paging_in_progress
 1836  * count is maintained to make sure that the object doesn't become
 1837  * inconsistent.
 1838  */
 1839 void
 1840 vfs_busy_pages(struct buf * bp, int clear_modify)
 1841 {
 1842         int i;
 1843 
 1844         if (bp->b_flags & B_VMIO) {
 1845                 struct vnode *vp = bp->b_vp;
 1846                 vm_object_t obj = vp->v_object;
 1847                 vm_ooffset_t foff;
 1848 
 1849                 if (vp->v_type == VBLK)
 1850                         foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
 1851                 else
 1852                         foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
 1853                 vfs_setdirty(bp);
 1854                 for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
 1855                         vm_page_t m = bp->b_pages[i];
 1856 
 1857                         if ((bp->b_flags & B_CLUSTER) == 0) {
 1858                                 obj->paging_in_progress++;
 1859                                 m->busy++;
 1860                         }
 1861                         vm_page_protect(m, VM_PROT_NONE);
 1862                         if (clear_modify)
 1863                                 vfs_page_set_valid(bp, foff, i, m);
 1864                         else if (bp->b_bcount >= PAGE_SIZE) {
 1865                                 if (m->valid && (bp->b_flags & B_CACHE) == 0) {
 1866                                         bp->b_pages[i] = bogus_page;
 1867                                         pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
 1868                                 }
 1869                         }
 1870                 }
 1871         }
 1872 }
 1873 
 1874 /*
 1875  * Tell the VM system that the pages associated with this buffer
 1876  * are clean.  This is used for delayed writes where the data is
 1877  * going to go to disk eventually without additional VM intervention.
 1878  */
 1879 void
 1880 vfs_clean_pages(struct buf * bp)
 1881 {
 1882         int i;
 1883 
 1884         if (bp->b_flags & B_VMIO) {
 1885                 struct vnode *vp = bp->b_vp;
 1886                 vm_object_t obj = vp->v_object;
 1887                 vm_ooffset_t foff;
 1888 
 1889                 if (vp->v_type == VBLK)
 1890                         foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
 1891                 else
 1892                         foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
 1893                 for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
 1894                         vm_page_t m = bp->b_pages[i];
 1895 
 1896                         vfs_page_set_valid(bp, foff, i, m);
 1897                 }
 1898         }
 1899 }
 1900 
 1901 void
 1902 vfs_bio_clrbuf(struct buf *bp) {
 1903         int i;
 1904         if( bp->b_flags & B_VMIO) {
 1905                 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
 1906                         int mask;
 1907                         mask = 0;
 1908                         for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
 1909                                 mask |= (1 << (i/DEV_BSIZE));
 1910                         if( bp->b_pages[0]->valid != mask) {
 1911                                 bzero(bp->b_data, bp->b_bufsize);
 1912                         }
 1913                         bp->b_pages[0]->valid = mask;
 1914                         bp->b_resid = 0;
 1915                         return;
 1916                 }
 1917                 for(i=0;i<bp->b_npages;i++) {
 1918                         if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
 1919                                 continue;
 1920                         if( bp->b_pages[i]->valid == 0) {
 1921                                 if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
 1922                                         bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
 1923                                 }
 1924                         } else {
 1925                                 int j;
 1926                                 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
 1927                                         if( (bp->b_pages[i]->valid & (1<<j)) == 0)
 1928                                                 bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
 1929                                 }
 1930                         }
 1931                         /* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
 1932                 }
 1933                 bp->b_resid = 0;
 1934         } else {
 1935                 clrbuf(bp);
 1936         }
 1937 }
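/*
 * Illustrative sketch, not part of the original source: for a sub-page
 * buffer, the single-page case above builds a mask with one bit per
 * DEV_BSIZE chunk that the buffer covers and compares it with the page's
 * valid bits before zeroing.  A standalone version of the mask computation:
 */
static int
bufsize_valid_mask(int bufsize, int blksize)
{
        int i, mask;

        mask = 0;
        for (i = 0; i < bufsize; i += blksize)
                mask |= 1 << (i / blksize);
        return (mask);
}
/* bufsize_valid_mask(2048, 512) == 0x0f -- chunks 0..3 valid */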
 1938 
 1939 /*
 1940  * vm_hold_load_pages and vm_hold_free_pages get pages into
 1941  * a buffer's address space.  The pages are anonymous and are
 1942  * not associated with a file object.
 1943  */
 1944 void
 1945 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
 1946 {
 1947         vm_offset_t pg;
 1948         vm_page_t p;
 1949         int index;
 1950 
 1951         to = round_page(to);
 1952         from = round_page(from);
 1953         index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
 1954 
 1955         for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 1956 
 1957 tryagain:
 1958 
 1959                 p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
 1960                     VM_ALLOC_NORMAL);
 1961                 if (!p) {
 1962                         VM_WAIT;
 1963                         goto tryagain;
 1964                 }
 1965                 vm_page_wire(p);
 1966                 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
 1967                 bp->b_pages[index] = p;
 1968                 PAGE_WAKEUP(p);
 1969         }
 1970         bp->b_npages = to >> PAGE_SHIFT;
 1971 }
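/*
 * Illustrative sketch, not part of the original source: the starting page
 * index above is the distance from the page-aligned base of b_data to the
 * requested range, in pages.  With PAGE_SIZE == 4096, b_data == 0xc0001200
 * and from == 0xc0002000, trunc_page(b_data) is 0xc0001000, so the first
 * page loaded is index 1.  The same arithmetic in isolation:
 */
static int
hold_page_index(vm_offset_t data, vm_offset_t from)
{
        return ((from - trunc_page(data)) >> PAGE_SHIFT);
}
/* hold_page_index(0xc0001200, 0xc0002000) == 1 */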
 1972 
 1973 void
 1974 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
 1975 {
 1976         vm_offset_t pg;
 1977         vm_page_t p;
 1978         int index;
 1979 
 1980         from = round_page(from);
 1981         to = round_page(to);
 1982         index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
 1983 
 1984         for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
 1985                 p = bp->b_pages[index];
 1986                 if (p && (index < bp->b_npages)) {
 1987                         if (p->busy) {
 1988                                 printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
 1989                                         bp->b_blkno, bp->b_lblkno);
 1990                         }
 1991                         bp->b_pages[index] = NULL;
 1992                         pmap_kremove(pg);
 1993                         vm_page_unwire(p);
 1994                         vm_page_free(p);
 1995                 }
 1996         }
 1997         bp->b_npages = from >> PAGE_SHIFT;
 1998 }
