FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_cluster.c

/*-
 * Copyright (c) 1993
 *      The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 *      Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_cluster.c       8.7 (Berkeley) 2/13/94
 * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.38.2.3 1999/09/05 08:15:40 peter Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <miscfs/specfs/specdev.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#ifdef notyet_block_reallocation_enabled
#ifdef DEBUG
#include <sys/sysctl.h>
#include <sys/kernel.h>

static int      doreallocblks = 0;
SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");
#else
#define doreallocblks 0
#endif
#endif /* notyet_block_reallocation_enabled */

#ifdef notyet_block_reallocation_enabled
static struct cluster_save *
        cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
#endif
static struct buf *
        cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
                            daddr_t blkno, long size, int run));

static int      totreads;
static int      totreadblocks;
extern vm_page_t        bogus_page;

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should trigger read-ahead.
 * Set to 0 to treat a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where read-ahead
 * blocks will not be used and, in fact, would push out other potentially
 * useful blocks from the cache.  The former seems intuitive, but some quick
 * tests showed that the latter performed better from a system-wide point of
 * view.
 */
        int doclusterraz = 0;

#define ISSEQREAD(vp, blk) \
        (((blk) != 0 || doclusterraz) && \
         ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
        (/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif
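
/*
 * Example (derived from the macro above): after a read of logical
 * block 4 (v_lastr == 4), a read of block 4 or block 5 is classified
 * as sequential by ISSEQREAD(); reading any other block is treated as
 * non-sequential and backs off the read-ahead state.
 */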

/*
 * Allow for up to three entire read-aheads; the system will adjust
 * downward rapidly if needed.
 */
#define RA_MULTIPLE_FAST        2
#define RA_MULTIPLE_SLOW        3
#define RA_SHIFTDOWN    1       /* approx lg2(RA_MULTIPLE) */
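/*
 * In cluster_read() below, v_ralen (in logical blocks) grows by one
 * block per sequential hit, bounded by RA_MULTIPLE_{FAST,SLOW} times
 * the number of blocks in a MAXPHYS transfer, and is halved
 * (>> RA_SHIFTDOWN) on a non-sequential read or when a previously
 * read-ahead block has already been evicted from the cache.
 */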
/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *      bp is the block requested.
 *      rbp is the read-ahead block.
 *      If either is NULL, then you don't have to do the I/O.
 */
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
        struct vnode *vp;
        u_quad_t filesize;
        daddr_t lblkno;
        long size;
        struct ucred *cred;
        struct buf **bpp;
{
        struct buf *bp, *rbp;
        daddr_t blkno, rablkno, origlblkno;
        int error, num_ra, alreadyincore;
        int i;
        int seq;

        error = 0;
        /*
         * get the requested block
         */
        origlblkno = lblkno;
        *bpp = bp = getblk(vp, lblkno, size, 0, 0);

        seq = ISSEQREAD(vp, lblkno);
        /*
         * if it is in the cache, then check to see if the reads have been
         * sequential.  If they have, then try some read-ahead, otherwise
         * back-off on prospective read-aheads.
         */
        if (bp->b_flags & B_CACHE) {
                if (!seq) {
                        vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
                        vp->v_ralen >>= RA_SHIFTDOWN;
                        return 0;
                } else if (vp->v_maxra > lblkno) {
                        if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST * (MAXPHYS / size))
                                ++vp->v_ralen;
                        if (vp->v_maxra > lblkno + vp->v_ralen) {
                                return 0;
                        }
                        lblkno = vp->v_maxra;
                } else {
                        lblkno += 1;
                }
                bp = NULL;
        } else {
                /*
                 * if it isn't in the cache, then get a chunk from disk if
                 * sequential, otherwise just get the block.
                 */
                bp->b_flags |= B_READ;
                lblkno += 1;
                curproc->p_stats->p_ru.ru_inblock++;    /* XXX */
                vp->v_ralen = 0;
        }
        /*
         * assume no read-ahead
         */
        alreadyincore = 1;
        rablkno = lblkno;

        /*
         * if we have been doing sequential I/O, then do some read-ahead
         */
        if (seq) {
                alreadyincore = 0;

                /*
                 * bump ralen a bit...
                 */
                if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW * (MAXPHYS / size))
                        ++vp->v_ralen;
                /*
                 * this code makes sure that the stuff that we have read-ahead
                 * is still in the cache.  If it isn't, we have been reading
                 * ahead too much, and we need to back-off, otherwise we might
                 * try to read more.
                 */
                for (i = 0; i < vp->v_maxra - lblkno; i++) {
                        rablkno = lblkno + i;
                        alreadyincore = (int) incore(vp, rablkno);
                        if (!alreadyincore) {
                                vp->v_maxra = rablkno;
                                vp->v_ralen >>= RA_SHIFTDOWN;
                                alreadyincore = 1;
                        }
                }
        }
        /*
         * we now build the read-ahead buffer if it is desirable.
         */
        rbp = NULL;
        if (!alreadyincore &&
            ((u_quad_t)(rablkno + 1) * size) <= filesize &&
            !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
            blkno != -1) {
                if (num_ra > vp->v_ralen)
                        num_ra = vp->v_ralen;

                if (num_ra) {
                        rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
                                num_ra + 1);
                } else {
                        rbp = getblk(vp, rablkno, size, 0, 0);
                        rbp->b_flags |= B_READ | B_ASYNC;
                        rbp->b_blkno = blkno;
                }
        }

        /*
         * handle the synchronous read
         */
        if (bp) {
                if (bp->b_flags & (B_DONE | B_DELWRI))
                        panic("cluster_read: DONE bp");
                else {
                        vfs_busy_pages(bp, 0);
                        error = VOP_STRATEGY(bp);
                        vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
                        totreads++;
                        totreadblocks += bp->b_bcount / size;
                        curproc->p_stats->p_ru.ru_inblock++;
                }
        }
        /*
         * and if we have read-aheads, do them too
         */
        if (rbp) {
                vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
                if (error) {
                        rbp->b_flags &= ~(B_ASYNC | B_READ);
                        brelse(rbp);
                } else if (rbp->b_flags & B_CACHE) {
                        rbp->b_flags &= ~(B_ASYNC | B_READ);
                        bqrelse(rbp);
                } else {
                        if ((rbp->b_flags & B_CLUSTER) == 0)
                                vfs_busy_pages(rbp, 0);
                        (void) VOP_STRATEGY(rbp);
                        totreads++;
                        totreadblocks += rbp->b_bcount / size;
                        curproc->p_stats->p_ru.ru_inblock++;
                }
        }
        if (bp && ((bp->b_flags & B_ASYNC) == 0))
                return (biowait(bp));
        return (error);
}
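
/*
 * Illustrative caller sketch (not part of this file): a file system's
 * read path typically picks between bread() and cluster_read() per
 * logical block.  The ffs-style names (ip, doclusterread) below are
 * assumptions for the example, not definitions from this file:
 *
 *      if (doclusterread)
 *              error = cluster_read(vp, ip->i_size, lbn, size,
 *                  NOCRED, &bp);
 *      else
 *              error = bread(vp, lbn, size, NOCRED, &bp);
 */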

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run)
        struct vnode *vp;
        u_quad_t filesize;
        daddr_t lbn;
        daddr_t blkno;
        long size;
        int run;
{
        struct buf *bp, *tbp;
        daddr_t bn;
        int i, inc, j;

#ifdef DIAGNOSTIC
        if (size != vp->v_mount->mnt_stat.f_iosize)
                panic("cluster_rbuild: size %d != f_iosize %d\n",
                    size, vp->v_mount->mnt_stat.f_iosize);
#endif
        /*
         * avoid a division
         */
        while ((u_quad_t) size * (lbn + run) > filesize) {
                --run;
        }

        tbp = getblk(vp, lbn, size, 0, 0);
        if (tbp->b_flags & B_CACHE)
                return tbp;

        tbp->b_blkno = blkno;
        tbp->b_flags |= B_ASYNC | B_READ;
        if ((tbp->b_flags & B_MALLOC) ||
                ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
                return tbp;

        bp = trypbuf();
        if (bp == NULL)
                return tbp;

        (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
        bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
        bp->b_iodone = cluster_callback;
        bp->b_blkno = blkno;
        bp->b_lblkno = lbn;
        pbgetvp(vp, bp);

        TAILQ_INIT(&bp->b_cluster.cluster_head);

        bp->b_bcount = 0;
        bp->b_bufsize = 0;
        bp->b_npages = 0;

        inc = btodb(size);
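        /*
         * "inc" is one logical block expressed in DEV_BSIZE device
         * blocks, so "bn" below advances by one logical block's worth
         * of device blocks per iteration.
         */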
        for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
                if (i != 0) {
                        if ((bp->b_npages * PAGE_SIZE) +
                                round_page(size) > MAXPHYS)
                                break;

                        if (incore(vp, lbn + i))
                                break;

                        tbp = getblk(vp, lbn + i, size, 0, 0);

                        if ((tbp->b_flags & B_CACHE) ||
                                (tbp->b_flags & B_VMIO) == 0) {
                                bqrelse(tbp);
                                break;
                        }

                        for (j = 0; j < tbp->b_npages; j++) {
                                if (tbp->b_pages[j]->valid) {
                                        break;
                                }
                        }

                        if (j != tbp->b_npages) {
                                /*
                                 * force buffer to be re-constituted later
                                 */
                                tbp->b_flags |= B_RELBUF;
                                brelse(tbp);
                                break;
                        }

                        tbp->b_flags |= B_READ | B_ASYNC;
                        if (tbp->b_blkno == tbp->b_lblkno) {
                                tbp->b_blkno = bn;
                        } else if (tbp->b_blkno != bn) {
                                brelse(tbp);
                                break;
                        }
                }
                TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
                        tbp, b_cluster.cluster_entry);
                for (j = 0; j < tbp->b_npages; j += 1) {
                        vm_page_t m;
                        m = tbp->b_pages[j];
                        ++m->busy;
                        ++m->object->paging_in_progress;
                        if ((bp->b_npages == 0) ||
                                (bp->b_pages[bp->b_npages - 1] != m)) {
                                bp->b_pages[bp->b_npages] = m;
                                bp->b_npages++;
                        }
                        if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
                                tbp->b_pages[j] = bogus_page;
                }
                bp->b_bcount += tbp->b_bcount;
                bp->b_bufsize += tbp->b_bufsize;
        }

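        /*
         * Fully valid pages must not be overwritten with stale data
         * from disk, so point those slots of the transfer at the shared
         * throwaway page ("bogus_page"); the device scribbles there
         * instead of on cached data.
         */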
        for (j = 0; j < bp->b_npages; j++) {
                if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
                        VM_PAGE_BITS_ALL)
                        bp->b_pages[j] = bogus_page;
        }
        if (bp->b_bufsize > bp->b_kvasize)
                panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)\n",
                        bp->b_bufsize, bp->b_kvasize);
        bp->b_kvasize = bp->b_bufsize;

        pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
                (vm_page_t *)bp->b_pages, bp->b_npages);
        return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
        struct buf *bp;
{
        struct buf *nbp, *tbp;
        int error = 0;

        /*
         * Must propagate errors to all the components.
         */
        if (bp->b_flags & B_ERROR)
                error = bp->b_error;

        pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
        /*
         * Move memory from the large cluster buffer into the component
         * buffers and mark I/O as done on these.
         */
        for (tbp = bp->b_cluster.cluster_head.tqh_first;
                tbp; tbp = nbp) {
                nbp = tbp->b_cluster.cluster_entry.tqe_next;
                if (error) {
                        tbp->b_flags |= B_ERROR;
                        tbp->b_error = error;
                } else
                        tbp->b_dirtyoff = tbp->b_dirtyend = 0;
                biodone(tbp);
        }
        relpbuf(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *      1. Write is not sequential (write asynchronously)
 *      Write is sequential:
 *      2.      beginning of cluster - begin cluster
 *      3.      middle of a cluster - add to cluster
 *      4.      end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
        struct buf *bp;
        u_quad_t filesize;
{
        struct vnode *vp;
        daddr_t lbn;
        int maxclen, cursize;
        int lblocksize;
        int async;

        vp = bp->b_vp;
        async = (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC));
        lblocksize = vp->v_mount->mnt_stat.f_iosize;
        lbn = bp->b_lblkno;

        /* Initialize vnode to beginning of file. */
        if (lbn == 0)
                vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

        if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
            (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
                maxclen = MAXPHYS / lblocksize - 1;
                if (vp->v_clen != 0) {
                        /*
                         * Next block is not sequential.
                         *
                         * If we are not writing at end of file, the process
                         * has seeked to another point in the file since its
                         * last write, or we have reached our maximum cluster
                         * size, then push the previous cluster.  Otherwise try
                         * reallocating to make it sequential.
                         */
                        cursize = vp->v_lastw - vp->v_cstart + 1;
#ifndef notyet_block_reallocation_enabled
                        if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
                                lbn != vp->v_lastw + 1 ||
                                vp->v_clen <= cursize) {
                                if (!async)
                                        cluster_wbuild(vp, lblocksize,
                                                vp->v_cstart, cursize);
                        }
#else
                        if (!doreallocblks ||
                            (lbn + 1) * lblocksize != filesize ||
                            lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
                                if (!async)
                                        cluster_wbuild(vp, lblocksize,
                                                vp->v_cstart, cursize);
                        } else {
                                struct buf **bpp, **endbp;
                                struct cluster_save *buflist;

                                buflist = cluster_collectbufs(vp, bp);
                                endbp = &buflist->bs_children
                                    [buflist->bs_nchildren - 1];
                                if (VOP_REALLOCBLKS(vp, buflist)) {
                                        /*
                                         * Failed, push the previous cluster.
                                         */
                                        for (bpp = buflist->bs_children;
                                             bpp < endbp; bpp++)
                                                brelse(*bpp);
                                        free(buflist, M_SEGMENT);
                                        cluster_wbuild(vp, lblocksize,
                                            vp->v_cstart, cursize);
                                } else {
                                        /*
                                         * Succeeded, keep building cluster.
                                         */
                                        for (bpp = buflist->bs_children;
                                             bpp <= endbp; bpp++)
                                                bdwrite(*bpp);
                                        free(buflist, M_SEGMENT);
                                        vp->v_lastw = lbn;
                                        vp->v_lasta = bp->b_blkno;
                                        return;
                                }
                        }
#endif /* notyet_block_reallocation_enabled */
                }
                /*
                 * Consider beginning a cluster.  If at end of file, make
                 * cluster as large as possible, otherwise find size of
                 * existing cluster.
                 */
                if (((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
                    (bp->b_blkno == bp->b_lblkno) &&
                    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
                     bp->b_blkno == -1)) {
                        bawrite(bp);
                        vp->v_clen = 0;
                        vp->v_lasta = bp->b_blkno;
                        vp->v_cstart = lbn + 1;
                        vp->v_lastw = lbn;
                        return;
                }
                vp->v_clen = maxclen;
                if (!async && maxclen == 0) {   /* I/O not contiguous */
                        vp->v_cstart = lbn + 1;
                        bawrite(bp);
                } else {        /* Wait for rest of cluster */
                        vp->v_cstart = lbn;
                        bdwrite(bp);
                }
        } else if (lbn == vp->v_cstart + vp->v_clen) {
                /*
                 * At end of cluster, write it out.
                 */
                bdwrite(bp);
                cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
                vp->v_clen = 0;
                vp->v_cstart = lbn + 1;
        } else
                /*
                 * In the middle of a cluster, so just delay the I/O for now.
                 */
                bdwrite(bp);
        vp->v_lastw = lbn;
        vp->v_lasta = bp->b_blkno;
}
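
/*
 * Worked example (illustrative, assuming 8K logical blocks, 64K
 * MAXPHYS, and VOP_BMAP reporting a run of 7 more contiguous blocks
 * after block 0): sequential writes of lbn 0..6 are merely delayed
 * with bdwrite(); the write of lbn 7 (== v_cstart + v_clen) then
 * calls cluster_wbuild() to push blocks 0-7 as a single clustered I/O.
 */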

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * Write out "len" delayed-write buffers starting at logical block
 * start_lbn, coalescing runs of contiguous, clusterable buffers into
 * single large writes where possible.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
        struct vnode *vp;
        long size;
        daddr_t start_lbn;
        int len;
{
        struct buf *bp, *tbp;
        int i, j, s;
        int totalwritten = 0;
        int dbsize = btodb(size);

        while (len > 0) {
                s = splbio();
                if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
                        ((tbp->b_flags & (B_INVAL | B_BUSY | B_DELWRI)) != B_DELWRI)) {
                        ++start_lbn;
                        --len;
                        splx(s);
                        continue;
                }
                bremfree(tbp);
                tbp->b_flags |= B_BUSY;
                tbp->b_flags &= ~B_DONE;
                splx(s);

                /*
                 * Extra memory in the buffer, punt on this buffer.  XXX we
                 * could handle this in most cases, but we would have to push
                 * the extra memory down to after our max possible cluster
                 * size and then potentially pull it back up if the cluster
                 * was terminated prematurely--too much hassle.
                 */
                if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC)) != B_CLUSTEROK) ||
                        (tbp->b_bcount != tbp->b_bufsize) ||
                        (tbp->b_bcount != size) ||
                        len == 1) {
                        totalwritten += tbp->b_bufsize;
                        bawrite(tbp);
                        ++start_lbn;
                        --len;
                        continue;
                }

                bp = trypbuf();
                if (bp == NULL) {
                        totalwritten += tbp->b_bufsize;
                        bawrite(tbp);
                        ++start_lbn;
                        --len;
                        continue;
                }

                TAILQ_INIT(&bp->b_cluster.cluster_head);
                bp->b_bcount = 0;
                bp->b_bufsize = 0;
                bp->b_npages = 0;
                if (tbp->b_wcred != NOCRED) {
                        bp->b_wcred = tbp->b_wcred;
                        crhold(bp->b_wcred);
                }

                bp->b_blkno = tbp->b_blkno;
                bp->b_lblkno = tbp->b_lblkno;
                (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
                bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER |
                    (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
                bp->b_iodone = cluster_callback;
                pbgetvp(vp, bp);

                for (i = 0; i < len; ++i, ++start_lbn) {
                        if (i != 0) {
                                s = splbio();
                                if ((tbp = gbincore(vp, start_lbn)) == NULL) {
                                        splx(s);
                                        break;
                                }

                                if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
                                    B_INVAL | B_BUSY | B_DELWRI | B_NEEDCOMMIT)) !=
                                    (B_DELWRI | B_CLUSTEROK |
                                     (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) {
                                        splx(s);
                                        break;
                                }

                                if (tbp->b_wcred != bp->b_wcred) {
                                        splx(s);
                                        break;
                                }

                                if ((tbp->b_bcount != size) ||
                                        ((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
                                        ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) {
                                        splx(s);
                                        break;
                                }
                                bremfree(tbp);
                                tbp->b_flags |= B_BUSY;
                                tbp->b_flags &= ~B_DONE;
                                splx(s);
                        }
                        if (tbp->b_flags & B_VMIO) {
                                for (j = 0; j < tbp->b_npages; j += 1) {
                                        vm_page_t m;
                                        m = tbp->b_pages[j];
                                        ++m->busy;
                                        ++m->object->paging_in_progress;
                                        if ((bp->b_npages == 0) ||
                                                (bp->b_pages[bp->b_npages - 1] != m)) {
                                                bp->b_pages[bp->b_npages] = m;
                                                bp->b_npages++;
                                        }
                                }
                        }
                        bp->b_bcount += size;
                        bp->b_bufsize += size;

                        tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
                        tbp->b_flags |= B_ASYNC;
                        s = splbio();
                        reassignbuf(tbp, tbp->b_vp);    /* put on clean list */
                        ++tbp->b_vp->v_numoutput;
                        splx(s);
                        TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
                                tbp, b_cluster.cluster_entry);
                }
                pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
                        (vm_page_t *) bp->b_pages, bp->b_npages);
                if (bp->b_bufsize > bp->b_kvasize)
                        panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
                                bp->b_bufsize, bp->b_kvasize);
                bp->b_kvasize = bp->b_bufsize;
                totalwritten += bp->b_bufsize;
                bp->b_dirtyoff = 0;
                bp->b_dirtyend = bp->b_bufsize;
                bawrite(bp);

                len -= i;
        }
        return totalwritten;
}
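
/*
 * Note: cluster_write() above is the in-file consumer; it pushes a
 * completed run with cluster_wbuild(vp, lblocksize, vp->v_cstart,
 * vp->v_clen + 1), coalescing up to MAXPHYS bytes of contiguous
 * delayed writes into a single bawrite().
 */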

#ifdef notyet_block_reallocation_enabled
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
        struct vnode *vp;
        struct buf *last_bp;
{
        struct cluster_save *buflist;
        daddr_t lbn;
        int i, len;

        len = vp->v_lastw - vp->v_cstart + 1;
        buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
            M_SEGMENT, M_WAITOK);
        buflist->bs_nchildren = 0;
        buflist->bs_children = (struct buf **) (buflist + 1);
        for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
                (void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
                    &buflist->bs_children[i]);
        buflist->bs_children[i] = last_bp;
        buflist->bs_nchildren = i + 1;
        return (buflist);
}
#endif /* notyet_block_reallocation_enabled */