The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vm.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
    3  *
    4  * This code is derived from software contributed to The DragonFly Project
    5  * by Matthew Dillon <dillon@backplane.com>
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  *
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in
   15  *    the documentation and/or other materials provided with the
   16  *    distribution.
   17  * 3. Neither the name of The DragonFly Project nor the names of its
   18  *    contributors may be used to endorse or promote products derived
   19  *    from this software without specific, prior written permission.
   20  *
   21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
   25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  */
   34 
   35 /*
   36  * Implements new VFS/VM coherency functions.  For conforming VFSs
   37  * we treat the backing VM object slightly differently.  Instead of
   38  * maintaining a number of pages to exactly fit the size of the file
   39  * we instead maintain pages to fit the entire contents of the last
   40  * buffer cache buffer used by the file.
   41  *
   42  * For VFSs like NFS and HAMMER which use (generally speaking) fixed
   43  * sized buffers this greatly reduces the complexity of VFS/VM interactions.
   44  *
   45  * Truncations no longer invalidate pages covered by the buffer cache
   46  * beyond the file EOF which still fit within the file's last buffer.
   47  * We simply unmap them and do not allow userland to fault them in.
   48  *
   49  * The VFS is no longer responsible for zero-filling buffers during a
   50  * truncation, the last buffer will be automatically zero-filled by
   51  * nvtruncbuf().
   52  *
   53  * This code is intended to (eventually) replace vtruncbuf() and
   54  * vnode_pager_setsize().
   55  */
   56 
   57 #include <sys/param.h>
   58 #include <sys/systm.h>
   59 #include <sys/buf.h>
   60 #include <sys/conf.h>
   61 #include <sys/fcntl.h>
   62 #include <sys/file.h>
   63 #include <sys/kernel.h>
   64 #include <sys/malloc.h>
   65 #include <sys/mount.h>
   66 #include <sys/proc.h>
   67 #include <sys/socket.h>
   68 #include <sys/stat.h>
   69 #include <sys/sysctl.h>
   70 #include <sys/unistd.h>
   71 #include <sys/vmmeter.h>
   72 #include <sys/vnode.h>
   73 
   74 #include <machine/limits.h>
   75 
   76 #include <vm/vm.h>
   77 #include <vm/vm_object.h>
   78 #include <vm/vm_extern.h>
   79 #include <vm/vm_kern.h>
   80 #include <vm/pmap.h>
   81 #include <vm/vm_map.h>
   82 #include <vm/vm_page.h>
   83 #include <vm/vm_pager.h>
   84 #include <vm/vnode_pager.h>
   85 #include <vm/vm_zone.h>
   86 
   87 #include <sys/buf2.h>
   88 #include <sys/thread2.h>
   89 #include <sys/sysref2.h>
   90 #include <vm/vm_page2.h>
   91 
   92 static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
   93 static int nvtruncbuf_bp_trunc(struct buf *bp, void *data);
   94 static int nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
   95 static int nvtruncbuf_bp_metasync(struct buf *bp, void *data);
   96 
   97 /*
   98  * Truncate a file's buffer and pages to a specified length. The
   99  * byte-granular length of the file is specified along with the block
  100  * size of the buffer containing that offset.
  101  *
  102  * If the last buffer straddles the length its contents will be zero-filled
  103  * as appropriate.  All buffers and pages after the last buffer will be
  104  * destroyed.  The last buffer itself will be destroyed only if the length
  105  * is exactly aligned with it.
  106  *
  107  * UFS typically passes the old block size prior to the actual truncation,
  108  * then later resizes the block based on the new file size.  NFS uses a
  109  * fixed block size and doesn't care.  HAMMER uses a block size based on
  110  * the offset which is fixed for any particular offset.
  111  *
  112  * When zero-filling we must bdwrite() to avoid a window of opportunity
  113  * where the kernel might throw away a clean buffer and the filesystem
  114  * then attempts to bread() it again before completing (or as part of)
  115  * the extension.  The filesystem is still responsible for zero-filling
  116  * any remainder when writing to the media in the strategy function when
  117  * it is able to do so without the page being mapped.  The page may still
  118  * be mapped by userland here.
  119  *
  120  * When modifying a buffer we must clear any cached raw disk offset.
  121  * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
  122  * never overwrite existing data blocks.
  123  */
  124 
/*
 * Scan state shared with the RB_SCAN callbacks below.  One instance lives
 * on the caller's stack for the duration of nvtruncbuf().
 */
struct truncbuf_info {
        struct vnode *vp;       /* vnode whose buffers are being scanned */
        off_t truncloffset;     /* truncation point (buffer-aligned upward) */
        int clean;              /* clean tree, else dirty tree */
};
  130 
int
nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff, int trivial)
{
        struct truncbuf_info info;
        off_t truncboffset;
        const char *filename;
        struct buf *bp;
        int count;
        int error;

        /*
         * Round up to the *next* block, then destroy the buffers in question.
         * Since we are only removing some of the buffers we must rely on the
         * scan count to determine whether a loop is necessary.
         *
         * Destroy any pages beyond the last buffer.
         */
        if (boff < 0)
                boff = (int)(length % blksize);
        if (boff)
                info.truncloffset = length + (blksize - boff);
        else
                info.truncloffset = length;
        info.vp = vp;

        /*
         * The vnode token serializes access to both buffer RB trees for
         * the remainder of the function.
         */
        lwkt_gettoken(&vp->v_token);
        do {
                /*
                 * The trunc callback can block (lock wait / yield), so new
                 * buffers may be instantiated behind the scan; loop until a
                 * complete pass over both trees reports no hits.
                 */
                info.clean = 1;
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &info);
                info.clean = 0;
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &info);
        } while(count);

        /* Resize the VM object and toss/unmap pages beyond the new EOF */
        nvnode_pager_setsize(vp, length, blksize, boff);

        /*
         * Zero-fill the area beyond the file EOF that still fits within
         * the last buffer.  We must mark the buffer as dirty even though
         * the modified area is beyond EOF to avoid races where the kernel
         * might flush the buffer before the filesystem is able to reallocate
         * the block.
         *
         * The VFS is responsible for dealing with the actual truncation.
         *
         * Only do this if trivial is zero, otherwise it is up to the
         * VFS to handle the block straddling the EOF.
         */
        if (boff && trivial == 0) {
                truncboffset = length - boff;
                error = bread(vp, truncboffset, blksize, &bp);
                if (error == 0) {
                        bzero(bp->b_data + boff, blksize - boff);
                        /*
                         * Clamp any recorded dirty range so it does not
                         * extend past the new EOF within the buffer.
                         */
                        if (bp->b_flags & B_DELWRI) {
                                if (bp->b_dirtyoff > boff)
                                        bp->b_dirtyoff = boff;
                                if (bp->b_dirtyend > boff)
                                        bp->b_dirtyend = boff;
                        }
                        /*
                         * Clear the cached raw device offset; bdwrite()
                         * will re-BMAP (filesystems like HAMMER never
                         * overwrite existing data blocks).
                         */
                        bp->b_bio2.bio_offset = NOOFFSET;
                        bdwrite(bp);
                }
        } else {
                error = 0;
        }

        /*
         * For safety, fsync any remaining metadata if the file is not being
         * truncated to 0.  Since the metadata does not represent the entire
         * dirty list we have to rely on the hit count to ensure that we get
         * all of it.
         *
         * This is typically applicable only to UFS.  NFS and HAMMER do
         * not store indirect blocks in the per-vnode buffer cache.
         */
        if (length > 0) {
                do {
                        count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                        nvtruncbuf_bp_metasync_cmp,
                                        nvtruncbuf_bp_metasync, &info);
                } while (count);
        }

        /*
         * It is possible to have in-progress I/O from buffers that were
         * not part of the truncation.  This should not happen if we
         * are truncating to 0-length.
         */
        bio_track_wait(&vp->v_track_write, 0, 0);

        /*
         * Debugging only: snapshot a name for the warning below.  The
         * namecache entry is only valid while v_spin is held, so copy
         * the pointer under the lock.
         */
        spin_lock(&vp->v_spin);
        filename = TAILQ_FIRST(&vp->v_namecache) ?
                   TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
        spin_unlock(&vp->v_spin);

        /*
         * Make sure no buffers were instantiated while we were trying
         * to clean out the remaining VM pages.  This could occur due
         * to busy dirty VM pages being flushed out to disk.
         */
        do {
                info.clean = 1;
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &info);
                info.clean = 0;
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                nvtruncbuf_bp_trunc_cmp,
                                nvtruncbuf_bp_trunc, &info);
                if (count) {
                        kprintf("Warning: vtruncbuf():  Had to re-clean %d "
                               "left over buffers in %s\n", count, filename);
                }
        } while(count);

        lwkt_reltoken(&vp->v_token);

        return (error);
}
  255 
  256 /*
  257  * The callback buffer is beyond the new file EOF and must be destroyed.
  258  * Note that the compare function must conform to the RB_SCAN's requirements.
  259  */
  260 static
  261 int
  262 nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
  263 {
  264         struct truncbuf_info *info = data;
  265 
  266         if (bp->b_loffset >= info->truncloffset)
  267                 return(0);
  268         return(-1);
  269 }
  270 
static
int
nvtruncbuf_bp_trunc(struct buf *bp, void *data)
{
        struct truncbuf_info *info = data;

        /*
         * Do not try to use a buffer we cannot immediately lock,
         * but sleep anyway to prevent a livelock.  The code will
         * loop until all buffers can be acted upon.
         */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                /*
                 * Lock unavailable.  Take a ref so the buffer cannot be
                 * reused out from under us while we block in the sleeping
                 * lock attempt, release the lock immediately if acquired,
                 * and let the caller's rescan loop revisit it.
                 */
                atomic_add_int(&bp->b_refs, 1);
                if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
                        BUF_UNLOCK(bp);
                atomic_subtract_int(&bp->b_refs, 1);
        } else if ((info->clean && (bp->b_flags & B_DELWRI)) ||
                   (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0) ||
                   bp->b_vp != info->vp ||
                   nvtruncbuf_bp_trunc_cmp(bp, data)) {
                /*
                 * The buffer migrated to the other tree, changed vnodes,
                 * or fell below the truncation point while we acquired
                 * the lock; leave it alone.
                 */
                BUF_UNLOCK(bp);
        } else {
                /*
                 * Buffer still qualifies: destroy it along with its
                 * backing store.
                 */
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
                brelse(bp);
        }
        lwkt_yield();
        /* Report a hit so the caller performs another full pass */
        return(1);
}
  300 
  301 /*
  302  * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
  303  * blocks (with a negative loffset) are scanned.
  304  * Note that the compare function must conform to the RB_SCAN's requirements.
  305  */
static int
nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data __unused)
{
        /*
         * Metadata buffers carry negative logical offsets: match (0)
         * those, steer past (positive) everything else.
         *
         * NOTE(review): the lwkt_yield() below runs inside an RB_SCAN
         * comparison callback, which is unusual -- presumably a cheap
         * way to avoid hogging the cpu while walking a large tree of
         * non-metadata buffers; confirm this is permitted by RB_SCAN's
         * contract (the caller holds v_token across the scan).
         */
        if (bp->b_loffset < 0)
                return(0);
        lwkt_yield();
        return(1);
}
  314 
static int
nvtruncbuf_bp_metasync(struct buf *bp, void *data)
{
        struct truncbuf_info *info = data;

        /*
         * Do not try to use a buffer we cannot immediately lock,
         * but sleep anyway to prevent a livelock.  The code will
         * loop until all buffers can be acted upon.
         */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                /*
                 * Lock unavailable.  Ref the buffer so it cannot be
                 * reused while we block in the sleeping lock attempt;
                 * the caller's rescan loop will revisit it.
                 */
                atomic_add_int(&bp->b_refs, 1);
                if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
                        BUF_UNLOCK(bp);
                atomic_subtract_int(&bp->b_refs, 1);
        } else if ((bp->b_flags & B_DELWRI) == 0 ||
                   bp->b_vp != info->vp ||
                   nvtruncbuf_bp_metasync_cmp(bp, data)) {
                /*
                 * The buffer was flushed, changed vnodes, or is no longer
                 * a metadata buffer; nothing to sync.
                 */
                BUF_UNLOCK(bp);
        } else {
                /*
                 * Dirty metadata buffer: start an asynchronous write.
                 */
                bremfree(bp);
                bawrite(bp);
        }
        lwkt_yield();
        /* Report a hit so the caller performs another full pass */
        return(1);
}
  341 
  342 /*
  343  * Extend a file's buffer and pages to a new, larger size.  The block size
  344  * at both the old and new length must be passed, but buffer cache operations
  345  * will only be performed on the old block.  The new nlength/nblksize will
  346  * be used to properly set the VM object size.
  347  *
  348  * To make this explicit we require the old length to passed even though
  349  * we can acquire it from vp->v_filesize, which also avoids potential
  350  * corruption if the filesystem and vp get desynchronized somehow.
  351  *
  352  * If the caller intends to immediately write into the newly extended
  353  * space pass trivial == 1.  If trivial is 0 the original buffer will be
  354  * zero-filled as necessary to clean out any junk in the extended space.
  355  * If non-zero the original buffer (straddling EOF) is not touched.
  356  *
  357  * When zero-filling we must bdwrite() to avoid a window of opportunity
  358  * where the kernel might throw away a clean buffer and the filesystem
  359  * then attempts to bread() it again before completing (or as part of)
  360  * the extension.  The filesystem is still responsible for zero-filling
  361  * any remainder when writing to the media in the strategy function when
  362  * it is able to do so without the page being mapped.  The page may still
  363  * be mapped by userland here.
  364  *
  365  * When modifying a buffer we must clear any cached raw disk offset.
  366  * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
  367  * never overwrite existing data blocks.
  368  */
  369 int
  370 nvextendbuf(struct vnode *vp, off_t olength, off_t nlength,
  371             int oblksize, int nblksize, int oboff, int nboff, int trivial)
  372 {
  373         off_t truncboffset;
  374         struct buf *bp;
  375         int error;
  376 
  377         error = 0;
  378         nvnode_pager_setsize(vp, nlength, nblksize, nboff);
  379         if (trivial == 0) {
  380                 if (oboff < 0)
  381                         oboff = (int)(olength % oblksize);
  382                 truncboffset = olength - oboff;
  383 
  384                 if (oboff) {
  385                         error = bread(vp, truncboffset, oblksize, &bp);
  386                         if (error == 0) {
  387                                 bzero(bp->b_data + oboff, oblksize - oboff);
  388                                 bp->b_bio2.bio_offset = NOOFFSET;
  389                                 bdwrite(bp);
  390                         }
  391                 }
  392         }
  393         return (error);
  394 }
  395 
  396 /*
  397  * Set vp->v_filesize and vp->v_object->size, destroy pages beyond
  398  * the last buffer when truncating.
  399  *
  400  * This function does not do any zeroing or invalidating of partially
  401  * overlapping pages.  Zeroing is the responsibility of nvtruncbuf().
  402  * However, it does unmap VM pages from the user address space on a
  403  * page-granular (verses buffer cache granular) basis.
  404  *
  405  * If boff is passed as -1 the base offset of the buffer cache buffer is
  406  * calculated from length and blksize.  Filesystems such as UFS which deal
  407  * with fragments have to specify a boff >= 0 since the base offset cannot
  408  * be calculated from length and blksize.
  409  *
  410  * For UFS blksize is the 'new' blocksize, used only to determine how large
  411  * the VM object must become.
  412  */
void
nvnode_pager_setsize(struct vnode *vp, off_t length, int blksize, int boff)
{
        vm_pindex_t nobjsize;   /* new object size in pages */
        vm_pindex_t oobjsize;   /* old object size in pages */
        vm_pindex_t pi;
        vm_object_t object;
        vm_page_t m;
        off_t truncboffset;

        /*
         * Degenerate conditions: no backing VM object, or no size change.
         */
        if ((object = vp->v_object) == NULL)
                return;
        vm_object_hold(object);
        if (length == vp->v_filesize) {
                vm_object_drop(object);
                return;
        }

        /*
         * Calculate the size of the VM object, coverage includes
         * the buffer straddling EOF.  If EOF is buffer-aligned
         * we don't bother.
         *
         * Buffers do not have to be page-aligned.  Make sure
         * nobjsize is beyond the last page of the buffer.
         */
        if (boff < 0)
                boff = (int)(length % blksize);
        truncboffset = length - boff;
        oobjsize = object->size;
        if (boff)
                nobjsize = OFF_TO_IDX(truncboffset + blksize + PAGE_MASK);
        else
                nobjsize = OFF_TO_IDX(truncboffset + PAGE_MASK);
        object->size = nobjsize;

        if (length < vp->v_filesize) {
                /*
                 * File has shrunk, toss any cached pages beyond
                 * the end of the buffer (blksize aligned) for the
                 * new EOF.
                 */
                vp->v_filesize = length;
                if (nobjsize < oobjsize) {
                        vm_object_page_remove(object, nobjsize, oobjsize,
                                              FALSE);
                }

                /*
                 * Unmap any pages (page aligned) beyond the new EOF.
                 * The pages remain part of the (last) buffer and are not
                 * invalidated.  Userland can no longer fault them in;
                 * zeroing of the straddling buffer is left to
                 * nvtruncbuf().
                 */
                pi = OFF_TO_IDX(length + PAGE_MASK);
                while (pi < nobjsize) {
                        m = vm_page_lookup_busy_wait(object, pi, FALSE, "vmpg");
                        if (m) {
                                /* Strip all mappings; keep the page */
                                vm_page_protect(m, VM_PROT_NONE);
                                vm_page_wakeup(m);
                        }
                        ++pi;
                        lwkt_yield();
                }
        } else {
                /*
                 * File has expanded.
                 */
                vp->v_filesize = length;
        }
        vm_object_drop(object);
}

Cache object: 2ca303128d61ab96822cf78fc4736a80


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.