The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/ufs/ffs/ffs_snapshot.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
    3  *
    4  * Further information about snapshots can be obtained from:
    5  *
    6  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
    7  *      1614 Oxford Street              mckusick@mckusick.com
    8  *      Berkeley, CA 94709-1608         +1-510-843-9542
    9  *      USA
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  *
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  *
   21  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
   22  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   24  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
   25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   31  * SUCH DAMAGE.
   32  *
   33  *      @(#)ffs_snapshot.c      8.11 (McKusick) 7/23/00
   34  */
   35 
   36 #include <sys/cdefs.h>
   37 __FBSDID("$FreeBSD: releng/8.0/sys/ufs/ffs/ffs_snapshot.c 190888 2009-04-10 10:52:19Z rwatson $");
   38 
   39 #include "opt_quota.h"
   40 
   41 #include <sys/param.h>
   42 #include <sys/kernel.h>
   43 #include <sys/systm.h>
   44 #include <sys/conf.h>
   45 #include <sys/bio.h>
   46 #include <sys/buf.h>
   47 #include <sys/fcntl.h>
   48 #include <sys/proc.h>
   49 #include <sys/namei.h>
   50 #include <sys/sched.h>
   51 #include <sys/stat.h>
   52 #include <sys/malloc.h>
   53 #include <sys/mount.h>
   54 #include <sys/resource.h>
   55 #include <sys/resourcevar.h>
   56 #include <sys/vnode.h>
   57 
   58 #include <geom/geom.h>
   59 
   60 #include <ufs/ufs/extattr.h>
   61 #include <ufs/ufs/quota.h>
   62 #include <ufs/ufs/ufsmount.h>
   63 #include <ufs/ufs/inode.h>
   64 #include <ufs/ufs/ufs_extern.h>
   65 
   66 #include <ufs/ffs/fs.h>
   67 #include <ufs/ffs/ffs_extern.h>
   68 
   69 #define KERNCRED thread0.td_ucred
   70 #define DEBUG 1
   71 
   72 #include "opt_ffs.h"
   73 
   74 #ifdef NO_FFS_SNAPSHOT
   75 int
   76 ffs_snapshot(mp, snapfile)
   77         struct mount *mp;
   78         char *snapfile;
   79 {
   80         return (EINVAL);
   81 }
   82 
   83 int
   84 ffs_snapblkfree(fs, devvp, bno, size, inum)
   85         struct fs *fs;
   86         struct vnode *devvp;
   87         ufs2_daddr_t bno;
   88         long size;
   89         ino_t inum;
   90 {
   91         return (EINVAL);
   92 }
   93 
   94 void
   95 ffs_snapremove(vp)
   96         struct vnode *vp;
   97 {
   98 }
   99 
  100 void
  101 ffs_snapshot_mount(mp)
  102         struct mount *mp;
  103 {
  104 }
  105 
  106 void
  107 ffs_snapshot_unmount(mp)
  108         struct mount *mp;
  109 {
  110 }
  111 
  112 void
  113 ffs_snapgone(ip)
  114         struct inode *ip;
  115 {
  116 }
  117 
  118 int
  119 ffs_copyonwrite(devvp, bp)
  120         struct vnode *devvp;
  121         struct buf *bp;
  122 {
  123         return (EINVAL);
  124 }
  125 
  126 #else
  127 
/* Head type for the list of snapshot inodes sharing one device. */
TAILQ_HEAD(snaphead, inode);

/*
 * Per-device snapshot bookkeeping, shared by all snapshots on the
 * same filesystem device.
 */
struct snapdata {
        LIST_ENTRY(snapdata) sn_link;   /* link on the snapfree list */
        struct snaphead sn_head;        /* snapshot inodes; newest at tail */
        daddr_t sn_listsize;            /* number of entries in sn_blklist */
        daddr_t *sn_blklist;            /* preallocated snapshot block list */
        struct lock sn_lock;            /* lock shared as v_vnlock by all
                                           snapshot vnodes on this device */
};
  137 
/*
 * List of snapdata structures not currently attached to a device,
 * protected by snapfree_lock.  NOTE(review): population/reuse happens in
 * try_free_snapdata()/ffs_snapdata_acquire(), not visible in this chunk.
 */
LIST_HEAD(, snapdata) snapfree;
static struct mtx snapfree_lock;
MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);
  141 
/*
 * Forward declarations for the snapshot implementation.  The expunge,
 * indiracct, fullacct, snapacct and mapacct helpers come in parallel
 * UFS1/UFS2 variants whose signatures differ only in the block-pointer
 * type they take (ufs1_daddr_t vs. ufs2_daddr_t).
 */
static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
static void try_free_snapdata(struct vnode *devvp);
static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp);
static int ffs_bp_snapblk(struct vnode *, struct buf *);
  173 
  174 /*
  175  * To ensure the consistency of snapshots across crashes, we must
  176  * synchronously write out copied blocks before allowing the
  177  * originals to be modified. Because of the rather severe speed
  178  * penalty that this imposes, the following flag allows this
  179  * crash persistence to be disabled.
  180  */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
/* When nonzero, print diagnostics about busy vnodes during snapshot work. */
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
/* When nonzero, time the filesystem suspension and report it via printf. */
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
        0, "");
#endif /* DEBUG */
  192 
  193 /*
  194  * Create a snapshot file and initialize it for the filesystem.
  195  */
  196 int
  197 ffs_snapshot(mp, snapfile)
  198         struct mount *mp;
  199         char *snapfile;
  200 {
  201         ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
  202         int error, cg, snaploc;
  203         int i, size, len, loc;
  204         int flag;
  205         struct timespec starttime = {0, 0}, endtime;
  206         char saved_nice = 0;
  207         long redo = 0, snaplistsize = 0;
  208         int32_t *lp;
  209         void *space;
  210         struct fs *copy_fs = NULL, *fs;
  211         struct thread *td = curthread;
  212         struct inode *ip, *xp;
  213         struct buf *bp, *nbp, *ibp, *sbp = NULL;
  214         struct nameidata nd;
  215         struct mount *wrtmp;
  216         struct vattr vat;
  217         struct vnode *vp, *xvp, *mvp, *devvp;
  218         struct uio auio;
  219         struct iovec aiov;
  220         struct snapdata *sn;
  221         struct ufsmount *ump;
  222 
  223         ump = VFSTOUFS(mp);
  224         fs = ump->um_fs;
  225         sn = NULL;
  226         MNT_ILOCK(mp);
  227         flag = mp->mnt_flag;
  228         MNT_IUNLOCK(mp);
  229 
  230         /*
  231          * Need to serialize access to snapshot code per filesystem.
  232          */
  233         /*
  234          * Assign a snapshot slot in the superblock.
  235          */
  236         UFS_LOCK(ump);
  237         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
  238                 if (fs->fs_snapinum[snaploc] == 0)
  239                         break;
  240         UFS_UNLOCK(ump);
  241         if (snaploc == FSMAXSNAP)
  242                 return (ENOSPC);
  243         /*
  244          * Create the snapshot file.
  245          */
  246 restart:
  247         NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td);
  248         if ((error = namei(&nd)) != 0)
  249                 return (error);
  250         if (nd.ni_vp != NULL) {
  251                 vput(nd.ni_vp);
  252                 error = EEXIST;
  253         }
  254         if (nd.ni_dvp->v_mount != mp)
  255                 error = EXDEV;
  256         if (error) {
  257                 NDFREE(&nd, NDF_ONLY_PNBUF);
  258                 if (nd.ni_dvp == nd.ni_vp)
  259                         vrele(nd.ni_dvp);
  260                 else
  261                         vput(nd.ni_dvp);
  262                 return (error);
  263         }
  264         VATTR_NULL(&vat);
  265         vat.va_type = VREG;
  266         vat.va_mode = S_IRUSR;
  267         vat.va_vaflags |= VA_EXCLUSIVE;
  268         if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
  269                 wrtmp = NULL;
  270         if (wrtmp != mp)
  271                 panic("ffs_snapshot: mount mismatch");
  272         vfs_rel(wrtmp);
  273         if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
  274                 NDFREE(&nd, NDF_ONLY_PNBUF);
  275                 vput(nd.ni_dvp);
  276                 if ((error = vn_start_write(NULL, &wrtmp,
  277                     V_XSLEEP | PCATCH)) != 0)
  278                         return (error);
  279                 goto restart;
  280         }
  281         error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
  282         VOP_UNLOCK(nd.ni_dvp, 0);
  283         if (error) {
  284                 NDFREE(&nd, NDF_ONLY_PNBUF);
  285                 vn_finished_write(wrtmp);
  286                 vrele(nd.ni_dvp);
  287                 return (error);
  288         }
  289         vp = nd.ni_vp;
  290         vp->v_vflag |= VV_SYSTEM;
  291         ip = VTOI(vp);
  292         devvp = ip->i_devvp;
  293         /*
  294          * Allocate and copy the last block contents so as to be able
  295          * to set size to that of the filesystem.
  296          */
  297         numblks = howmany(fs->fs_size, fs->fs_frag);
  298         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
  299             fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
  300         if (error)
  301                 goto out;
  302         ip->i_size = lblktosize(fs, (off_t)numblks);
  303         DIP_SET(ip, i_size, ip->i_size);
  304         ip->i_flag |= IN_CHANGE | IN_UPDATE;
  305         error = readblock(vp, bp, numblks - 1);
  306         bawrite(bp);
  307         if (error != 0)
  308                 goto out;
  309         /*
  310          * Preallocate critical data structures so that we can copy
  311          * them in without further allocation after we suspend all
  312          * operations on the filesystem. We would like to just release
  313          * the allocated buffers without writing them since they will
  314          * be filled in below once we are ready to go, but this upsets
  315          * the soft update code, so we go ahead and write the new buffers.
  316          *
  317          * Allocate all indirect blocks and mark all of them as not
  318          * needing to be copied.
  319          */
  320         for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
  321                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
  322                     fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
  323                 if (error)
  324                         goto out;
  325                 bawrite(ibp);
  326         }
  327         /*
  328          * Allocate copies for the superblock and its summary information.
  329          */
  330         error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
  331             0, &nbp);
  332         if (error)
  333                 goto out;
  334         bawrite(nbp);
  335         blkno = fragstoblks(fs, fs->fs_csaddr);
  336         len = howmany(fs->fs_cssize, fs->fs_bsize);
  337         for (loc = 0; loc < len; loc++) {
  338                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
  339                     fs->fs_bsize, KERNCRED, 0, &nbp);
  340                 if (error)
  341                         goto out;
  342                 bawrite(nbp);
  343         }
  344         /*
  345          * Allocate all cylinder group blocks.
  346          */
  347         for (cg = 0; cg < fs->fs_ncg; cg++) {
  348                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
  349                     fs->fs_bsize, KERNCRED, 0, &nbp);
  350                 if (error)
  351                         goto out;
  352                 bawrite(nbp);
  353                 if (cg % 10 == 0)
  354                         ffs_syncvnode(vp, MNT_WAIT);
  355         }
  356         /*
  357          * Copy all the cylinder group maps. Although the
  358          * filesystem is still active, we hope that only a few
  359          * cylinder groups will change between now and when we
  360          * suspend operations. Thus, we will be able to quickly
  361          * touch up the few cylinder groups that changed during
  362          * the suspension period.
  363          */
  364         len = howmany(fs->fs_ncg, NBBY);
  365         space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
  366         UFS_LOCK(ump);
  367         fs->fs_active = space;
  368         UFS_UNLOCK(ump);
  369         for (cg = 0; cg < fs->fs_ncg; cg++) {
  370                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
  371                     fs->fs_bsize, KERNCRED, 0, &nbp);
  372                 if (error)
  373                         goto out;
  374                 error = cgaccount(cg, vp, nbp, 1);
  375                 bawrite(nbp);
  376                 if (cg % 10 == 0)
  377                         ffs_syncvnode(vp, MNT_WAIT);
  378                 if (error)
  379                         goto out;
  380         }
  381         /*
  382          * Change inode to snapshot type file.
  383          */
  384         ip->i_flags |= SF_SNAPSHOT;
  385         DIP_SET(ip, i_flags, ip->i_flags);
  386         ip->i_flag |= IN_CHANGE | IN_UPDATE;
  387         /*
  388          * Ensure that the snapshot is completely on disk.
  389          * Since we have marked it as a snapshot it is safe to
  390          * unlock it as no process will be allowed to write to it.
  391          */
  392         if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
  393                 goto out;
  394         VOP_UNLOCK(vp, 0);
  395         /*
  396          * All allocations are done, so we can now snapshot the system.
  397          *
  398          * Recind nice scheduling while running with the filesystem suspended.
  399          */
  400         if (td->td_proc->p_nice > 0) {
  401                 struct proc *p;
  402 
  403                 p = td->td_proc;
  404                 PROC_LOCK(p);
  405                 saved_nice = p->p_nice;
  406                 sched_nice(p, 0);
  407                 PROC_UNLOCK(p);
  408         }
  409         /*
  410          * Suspend operation on filesystem.
  411          */
  412         for (;;) {
  413                 vn_finished_write(wrtmp);
  414                 if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
  415                         vn_start_write(NULL, &wrtmp, V_WAIT);
  416                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  417                         goto out;
  418                 }
  419                 if (mp->mnt_kern_flag & MNTK_SUSPENDED)
  420                         break;
  421                 vn_start_write(NULL, &wrtmp, V_WAIT);
  422         }
  423         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  424         if (ip->i_effnlink == 0) {
  425                 error = ENOENT;         /* Snapshot file unlinked */
  426                 goto out1;
  427         }
  428         if (collectsnapstats)
  429                 nanotime(&starttime);
  430 
  431         /* The last block might have changed.  Copy it again to be sure. */
  432         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
  433             fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
  434         if (error != 0)
  435                 goto out1;
  436         error = readblock(vp, bp, numblks - 1);
  437         bp->b_flags |= B_VALIDSUSPWRT;
  438         bawrite(bp);
  439         if (error != 0)
  440                 goto out1;
  441         /*
  442          * First, copy all the cylinder group maps that have changed.
  443          */
  444         for (cg = 0; cg < fs->fs_ncg; cg++) {
  445                 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
  446                         continue;
  447                 redo++;
  448                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
  449                     fs->fs_bsize, KERNCRED, 0, &nbp);
  450                 if (error)
  451                         goto out1;
  452                 error = cgaccount(cg, vp, nbp, 2);
  453                 bawrite(nbp);
  454                 if (error)
  455                         goto out1;
  456         }
  457         /*
  458          * Grab a copy of the superblock and its summary information.
  459          * We delay writing it until the suspension is released below.
  460          */
  461         error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
  462             KERNCRED, &sbp);
  463         if (error) {
  464                 brelse(sbp);
  465                 sbp = NULL;
  466                 goto out1;
  467         }
  468         loc = blkoff(fs, fs->fs_sblockloc);
  469         copy_fs = (struct fs *)(sbp->b_data + loc);
  470         bcopy(fs, copy_fs, fs->fs_sbsize);
  471         if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
  472                 copy_fs->fs_clean = 1;
  473         size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
  474         if (fs->fs_sbsize < size)
  475                 bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
  476         size = blkroundup(fs, fs->fs_cssize);
  477         if (fs->fs_contigsumsize > 0)
  478                 size += fs->fs_ncg * sizeof(int32_t);
  479         space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
  480         copy_fs->fs_csp = space;
  481         bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
  482         space = (char *)space + fs->fs_cssize;
  483         loc = howmany(fs->fs_cssize, fs->fs_fsize);
  484         i = fs->fs_frag - loc % fs->fs_frag;
  485         len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
  486         if (len > 0) {
  487                 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
  488                     len, KERNCRED, &bp)) != 0) {
  489                         brelse(bp);
  490                         free(copy_fs->fs_csp, M_UFSMNT);
  491                         bawrite(sbp);
  492                         sbp = NULL;
  493                         goto out1;
  494                 }
  495                 bcopy(bp->b_data, space, (u_int)len);
  496                 space = (char *)space + len;
  497                 bp->b_flags |= B_INVAL | B_NOCACHE;
  498                 brelse(bp);
  499         }
  500         if (fs->fs_contigsumsize > 0) {
  501                 copy_fs->fs_maxcluster = lp = space;
  502                 for (i = 0; i < fs->fs_ncg; i++)
  503                         *lp++ = fs->fs_contigsumsize;
  504         }
  505         /*
  506          * We must check for active files that have been unlinked
  507          * (e.g., with a zero link count). We have to expunge all
  508          * trace of these files from the snapshot so that they are
  509          * not reclaimed prematurely by fsck or unnecessarily dumped.
  510          * We turn off the MNTK_SUSPENDED flag to avoid a panic from
  511          * spec_strategy about writing on a suspended filesystem.
  512          * Note that we skip unlinked snapshot files as they will
  513          * be handled separately below.
  514          *
  515          * We also calculate the needed size for the snapshot list.
  516          */
  517         snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
  518             FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
  519         MNT_ILOCK(mp);
  520         mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
  521 loop:
  522         MNT_VNODE_FOREACH(xvp, mp, mvp) {
  523                 VI_LOCK(xvp);
  524                 MNT_IUNLOCK(mp);
  525                 if ((xvp->v_iflag & VI_DOOMED) ||
  526                     (xvp->v_usecount == 0 &&
  527                      (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
  528                     xvp->v_type == VNON ||
  529                     (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
  530                         VI_UNLOCK(xvp);
  531                         MNT_ILOCK(mp);
  532                         continue;
  533                 }
  534                 /*
  535                  * We can skip parent directory vnode because it must have
  536                  * this snapshot file in it.
  537                  */
  538                 if (xvp == nd.ni_dvp) {
  539                         VI_UNLOCK(xvp);
  540                         MNT_ILOCK(mp);
  541                         continue;
  542                 }
  543                 vholdl(xvp);
  544                 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
  545                         MNT_ILOCK(mp);
  546                         MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
  547                         vdrop(xvp);
  548                         goto loop;
  549                 }
  550                 VI_LOCK(xvp);
  551                 if (xvp->v_usecount == 0 &&
  552                     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
  553                         VI_UNLOCK(xvp);
  554                         VOP_UNLOCK(xvp, 0);
  555                         vdrop(xvp);
  556                         MNT_ILOCK(mp);
  557                         continue;
  558                 }
  559                 VI_UNLOCK(xvp);
  560                 if (snapdebug)
  561                         vprint("ffs_snapshot: busy vnode", xvp);
  562                 if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 &&
  563                     vat.va_nlink > 0) {
  564                         VOP_UNLOCK(xvp, 0);
  565                         vdrop(xvp);
  566                         MNT_ILOCK(mp);
  567                         continue;
  568                 }
  569                 xp = VTOI(xvp);
  570                 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
  571                         VOP_UNLOCK(xvp, 0);
  572                         vdrop(xvp);
  573                         MNT_ILOCK(mp);
  574                         continue;
  575                 }
  576                 /*
  577                  * If there is a fragment, clear it here.
  578                  */
  579                 blkno = 0;
  580                 loc = howmany(xp->i_size, fs->fs_bsize) - 1;
  581                 if (loc < NDADDR) {
  582                         len = fragroundup(fs, blkoff(fs, xp->i_size));
  583                         if (len != 0 && len < fs->fs_bsize) {
  584                                 ffs_blkfree(ump, copy_fs, vp,
  585                                     DIP(xp, i_db[loc]), len, xp->i_number);
  586                                 blkno = DIP(xp, i_db[loc]);
  587                                 DIP_SET(xp, i_db[loc], 0);
  588                         }
  589                 }
  590                 snaplistsize += 1;
  591                 if (xp->i_ump->um_fstype == UFS1)
  592                         error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
  593                             BLK_NOCOPY);
  594                 else
  595                         error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
  596                             BLK_NOCOPY);
  597                 if (blkno)
  598                         DIP_SET(xp, i_db[loc], blkno);
  599                 if (!error)
  600                         error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
  601                             xp->i_mode);
  602                 VOP_UNLOCK(xvp, 0);
  603                 vdrop(xvp);
  604                 if (error) {
  605                         free(copy_fs->fs_csp, M_UFSMNT);
  606                         bawrite(sbp);
  607                         sbp = NULL;
  608                         MNT_VNODE_FOREACH_ABORT(mp, mvp);
  609                         goto out1;
  610                 }
  611                 MNT_ILOCK(mp);
  612         }
  613         MNT_IUNLOCK(mp);
  614         /*
  615          * Acquire a lock on the snapdata structure, creating it if necessary.
  616          */
  617         sn = ffs_snapdata_acquire(devvp);
  618         /* 
  619          * Change vnode to use shared snapshot lock instead of the original
  620          * private lock.
  621          */
  622         vp->v_vnlock = &sn->sn_lock;
  623         lockmgr(&vp->v_lock, LK_RELEASE, NULL);
  624         xp = TAILQ_FIRST(&sn->sn_head);
  625         /*
  626          * If this is the first snapshot on this filesystem, then we need
  627          * to allocate the space for the list of preallocated snapshot blocks.
  628          * This list will be refined below, but this preliminary one will
  629          * keep us out of deadlock until the full one is ready.
  630          */
  631         if (xp == NULL) {
  632                 snapblklist = malloc(snaplistsize * sizeof(daddr_t),
  633                     M_UFSMNT, M_WAITOK);
  634                 blkp = &snapblklist[1];
  635                 *blkp++ = lblkno(fs, fs->fs_sblockloc);
  636                 blkno = fragstoblks(fs, fs->fs_csaddr);
  637                 for (cg = 0; cg < fs->fs_ncg; cg++) {
  638                         if (fragstoblks(fs, cgtod(fs, cg) > blkno))
  639                                 break;
  640                         *blkp++ = fragstoblks(fs, cgtod(fs, cg));
  641                 }
  642                 len = howmany(fs->fs_cssize, fs->fs_bsize);
  643                 for (loc = 0; loc < len; loc++)
  644                         *blkp++ = blkno + loc;
  645                 for (; cg < fs->fs_ncg; cg++)
  646                         *blkp++ = fragstoblks(fs, cgtod(fs, cg));
  647                 snapblklist[0] = blkp - snapblklist;
  648                 VI_LOCK(devvp);
  649                 if (sn->sn_blklist != NULL)
  650                         panic("ffs_snapshot: non-empty list");
  651                 sn->sn_blklist = snapblklist;
  652                 sn->sn_listsize = blkp - snapblklist;
  653                 VI_UNLOCK(devvp);
  654         }
  655         /*
  656          * Record snapshot inode. Since this is the newest snapshot,
  657          * it must be placed at the end of the list.
  658          */
  659         VI_LOCK(devvp);
  660         fs->fs_snapinum[snaploc] = ip->i_number;
  661         if (ip->i_nextsnap.tqe_prev != 0)
  662                 panic("ffs_snapshot: %d already on list", ip->i_number);
  663         TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
  664         devvp->v_vflag |= VV_COPYONWRITE;
  665         VI_UNLOCK(devvp);
  666         ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
  667 out1:
  668         KASSERT((sn != NULL && sbp != NULL && error == 0) ||
  669                 (sn == NULL && sbp == NULL && error != 0),
  670                 ("email phk@ and mckusick@"));
  671         /*
  672          * Resume operation on filesystem.
  673          */
  674         vfs_write_resume(vp->v_mount);
  675         vn_start_write(NULL, &wrtmp, V_WAIT);
  676         if (collectsnapstats && starttime.tv_sec > 0) {
  677                 nanotime(&endtime);
  678                 timespecsub(&endtime, &starttime);
  679                 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
  680                     vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
  681                     endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
  682         }
  683         if (sbp == NULL)
  684                 goto out;
  685         /*
  686          * Copy allocation information from all the snapshots in
  687          * this snapshot and then expunge them from its view.
  688          */
  689         TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
  690                 if (xp == ip)
  691                         break;
  692                 if (xp->i_ump->um_fstype == UFS1)
  693                         error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
  694                             BLK_SNAP);
  695                 else
  696                         error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
  697                             BLK_SNAP);
  698                 if (error == 0 && xp->i_effnlink == 0) {
  699                         error = ffs_freefile(ump,
  700                                              copy_fs,
  701                                              vp,
  702                                              xp->i_number,
  703                                              xp->i_mode);
  704                 }
  705                 if (error) {
  706                         fs->fs_snapinum[snaploc] = 0;
  707                         goto done;
  708                 }
  709         }
  710         /*
  711          * Allocate space for the full list of preallocated snapshot blocks.
  712          */
  713         snapblklist = malloc(snaplistsize * sizeof(daddr_t),
  714             M_UFSMNT, M_WAITOK);
  715         ip->i_snapblklist = &snapblklist[1];
  716         /*
  717          * Expunge the blocks used by the snapshots from the set of
  718          * blocks marked as used in the snapshot bitmaps. Also, collect
  719          * the list of allocated blocks in i_snapblklist.
  720          */
  721         if (ip->i_ump->um_fstype == UFS1)
  722                 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
  723         else
  724                 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
  725         if (error) {
  726                 fs->fs_snapinum[snaploc] = 0;
  727                 free(snapblklist, M_UFSMNT);
  728                 goto done;
  729         }
  730         if (snaplistsize < ip->i_snapblklist - snapblklist)
  731                 panic("ffs_snapshot: list too small");
  732         snaplistsize = ip->i_snapblklist - snapblklist;
  733         snapblklist[0] = snaplistsize;
  734         ip->i_snapblklist = 0;
  735         /*
  736          * Write out the list of allocated blocks to the end of the snapshot.
  737          */
  738         auio.uio_iov = &aiov;
  739         auio.uio_iovcnt = 1;
  740         aiov.iov_base = (void *)snapblklist;
  741         aiov.iov_len = snaplistsize * sizeof(daddr_t);
  742         auio.uio_resid = aiov.iov_len;;
  743         auio.uio_offset = ip->i_size;
  744         auio.uio_segflg = UIO_SYSSPACE;
  745         auio.uio_rw = UIO_WRITE;
  746         auio.uio_td = td;
  747         if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
  748                 fs->fs_snapinum[snaploc] = 0;
  749                 free(snapblklist, M_UFSMNT);
  750                 goto done;
  751         }
  752         /*
  753          * Write the superblock and its summary information
  754          * to the snapshot.
  755          */
  756         blkno = fragstoblks(fs, fs->fs_csaddr);
  757         len = howmany(fs->fs_cssize, fs->fs_bsize);
  758         space = copy_fs->fs_csp;
  759         for (loc = 0; loc < len; loc++) {
  760                 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
  761                 if (error) {
  762                         brelse(nbp);
  763                         fs->fs_snapinum[snaploc] = 0;
  764                         free(snapblklist, M_UFSMNT);
  765                         goto done;
  766                 }
  767                 bcopy(space, nbp->b_data, fs->fs_bsize);
  768                 space = (char *)space + fs->fs_bsize;
  769                 bawrite(nbp);
  770         }
  771         /*
  772          * As this is the newest list, it is the most inclusive, so
  773          * should replace the previous list.
  774          */
  775         VI_LOCK(devvp);
  776         space = sn->sn_blklist;
  777         sn->sn_blklist = snapblklist;
  778         sn->sn_listsize = snaplistsize;
  779         VI_UNLOCK(devvp);
  780         if (space != NULL)
  781                 free(space, M_UFSMNT);
  782         /*
  783          * If another process is currently writing the buffer containing
  784          * the inode for this snapshot then a deadlock can occur. Drop
  785          * the snapshot lock until the buffer has been written.
  786          */
  787         VREF(vp);       /* Protect against ffs_snapgone() */
  788         VOP_UNLOCK(vp, 0);
  789         (void) bread(ip->i_devvp,
  790                      fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
  791                      (int) fs->fs_bsize, NOCRED, &nbp);
  792         brelse(nbp);
  793         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  794         if (ip->i_effnlink == 0)
  795                 error = ENOENT;         /* Snapshot file unlinked */
  796         else
  797                 vrele(vp);              /* Drop extra reference */
  798 done:
  799         free(copy_fs->fs_csp, M_UFSMNT);
  800         bawrite(sbp);
  801 out:
  802         NDFREE(&nd, NDF_ONLY_PNBUF);
  803         if (saved_nice > 0) {
  804                 struct proc *p;
  805 
  806                 p = td->td_proc;
  807                 PROC_LOCK(p);
  808                 sched_nice(td->td_proc, saved_nice);
  809                 PROC_UNLOCK(td->td_proc);
  810         }
  811         UFS_LOCK(ump);
  812         if (fs->fs_active != 0) {
  813                 free(fs->fs_active, M_DEVBUF);
  814                 fs->fs_active = 0;
  815         }
  816         UFS_UNLOCK(ump);
  817         MNT_ILOCK(mp);
  818         mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
  819         MNT_IUNLOCK(mp);
  820         if (error)
  821                 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
  822         (void) ffs_syncvnode(vp, MNT_WAIT);
  823         if (error)
  824                 vput(vp);
  825         else
  826                 VOP_UNLOCK(vp, 0);
  827         vrele(nd.ni_dvp);
  828         vn_finished_write(wrtmp);
  829         process_deferred_inactive(mp);
  830         return (error);
  831 }
  832 
  833 /*
  834  * Copy a cylinder group map. All the unallocated blocks are marked
  835  * BLK_NOCOPY so that the snapshot knows that it need not copy them
  836  * if they are later written. If passno is one, then this is a first
  837  * pass, so only setting needs to be done. If passno is 2, then this
  838  * is a revision to a previous pass which must be undone as the
  839  * replacement pass is done.
  840  */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;			/* cylinder group number to copy */
	struct vnode *vp;	/* snapshot vnode */
	struct buf *nbp;	/* buffer receiving the cg copy */
	int passno;		/* 1 = first pass, 2 = revision pass */
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/*
	 * Read the cylinder group map from the device and verify its magic.
	 */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	UFS_LOCK(ip->i_ump);
	/* Record that this cylinder group has been processed. */
	ACTIVESET(fs, cg);
	/*
	 * Recomputation of summary information might not have been performed
	 * at mount time.  Sync up summary information for current cylinder
	 * group while data is in memory to ensure that result of background
	 * fsck is slightly more consistent.
	 */
	fs->fs_cs(fs, cg) = cgp->cg_cs;
	UFS_UNLOCK(ip->i_ump);
	/*
	 * Copy the map into the snapshot's buffer, zeroing any tail of
	 * the block beyond the cylinder group size.
	 */
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	cgp = (struct cg *)nbp->b_data;
	bqrelse(bp);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	/*
	 * Compute the range of snapshot logical blocks covered by this
	 * cylinder group: [base, base + len).
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cgbase(fs, cg) / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		/*
		 * Blocks mapped by the snapshot inode's direct pointers:
		 * free blocks become BLK_NOCOPY; pass 2 undoes markings
		 * for blocks that are no longer free.
		 */
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				DIP_SET(ip, i_db[loc], 0);
			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	/*
	 * Remaining blocks are mapped through indirect blocks; walk them
	 * one indirect block at a time.
	 */
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			/* Flush current indirect block, fetch the next. */
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			/* UFS1: 32-bit block pointers. */
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		/* UFS2: 64-bit block pointers. */
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}
  947 
  948 /*
  949  * Before expunging a snapshot inode, note all the
  950  * blocks that it claims with BLK_SNAP so that fsck will
  951  * be able to account for those blocks properly and so
  952  * that this snapshot knows that it need not copy them
  953  * if the other snapshot holding them is freed. This code
  954  * is reproduced once each for UFS1 and UFS2.
  955  */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot being updated */
	struct inode *cancelip;	/* inode being expunged from the snapshot */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		/* Inode block is mapped by a direct pointer. */
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		/*
		 * Look up the pointer in the snapshot's indirect block.
		 * TDP_COWINPROGRESS keeps the allocation from recursing
		 * into snapshot copy-on-write.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		/* Inode block already copied: just read the copy. */
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		/* Allocate the copy and fill it from the device. */
		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		/*
		 * NOTE(review): bp does not appear to be released on
		 * readblock() failure (contrast with the brelse() done
		 * on the error path in indiracct_ufs1()) -- confirm.
		 */
		if ((error = readblock(snapvp, bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * or unlinked snapshots to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct block pointers first (lblkno starts at 0). */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	/* Then the indirect block pointers themselves (lblkno -1). */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/*
	 * Descend each level of indirection: lbn tracks the (negative)
	 * logical block number of the indirect block, rlbn the first
	 * data block it maps, len the data blocks still unaccounted.
	 */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
 1043 
 1044 /*
 1045  * Descend an indirect block chain for vnode cancelvp accounting for all
 1046  * its indirect blocks in snapvp.
 1047  */ 
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot being updated */
	struct vnode *cancelvp;	/* vnode whose blocks are being accounted */
	int level;		/* levels of indirection below this block */
	ufs1_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* (negative) logical block of the indir */
	ufs_lbn_t rlbn;		/* first data block mapped by this indir */
	ufs_lbn_t remblks;	/* data blocks remaining to account */
	ufs_lbn_t blksperindir;	/* data blocks mapped per pointer */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		/*
		 * A missing indirect block is tolerable only when the
		 * blocks it would map need not be copied anyway.
		 */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	/* Cross-check lbn against the indirect chain computed for rlbn. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.  Work
	 * on a private copy so the buffer can be released before the
	 * (possibly recursive) accounting calls below.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	/* Level 0 maps real data blocks, so pass rlbn; otherwise -1. */
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}
 1121 
 1122 /*
 1123  * Do both snap accounting and map accounting.
 1124  */
 1125 static int
 1126 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 1127         struct vnode *vp;
 1128         ufs1_daddr_t *oldblkp, *lastblkp;
 1129         struct fs *fs;
 1130         ufs_lbn_t lblkno;
 1131         int exptype;    /* BLK_SNAP or BLK_NOCOPY */
 1132 {
 1133         int error;
 1134 
 1135         if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
 1136                 return (error);
 1137         return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
 1138 }
 1139 
 1140 /*
 1141  * Identify a set of blocks allocated in a snapshot inode.
 1142  */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;	/* snapshot vnode doing the accounting */
	ufs1_daddr_t *oldblkp, *lastblkp;	/* pointer range to scan */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* unused here; kept for acctfunc signature */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and blocks already classified. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/*
		 * Locate the snapshot's own pointer for this block so it
		 * can be marked with expungetype.
		 */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}
 1192 
 1193 /*
 1194  * Account for a set of blocks allocated in a snapshot inode.
 1195  */
 1196 static int
 1197 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 1198         struct vnode *vp;
 1199         ufs1_daddr_t *oldblkp, *lastblkp;
 1200         struct fs *fs;
 1201         ufs_lbn_t lblkno;
 1202         int expungetype;
 1203 {
 1204         ufs1_daddr_t blkno;
 1205         struct inode *ip;
 1206         ino_t inum;
 1207         int acctit;
 1208 
 1209         ip = VTOI(vp);
 1210         inum = ip->i_number;
 1211         if (lblkno == -1)
 1212                 acctit = 0;
 1213         else
 1214                 acctit = 1;
 1215         for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
 1216                 blkno = *oldblkp;
 1217                 if (blkno == 0 || blkno == BLK_NOCOPY)
 1218                         continue;
 1219                 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
 1220                         *ip->i_snapblklist++ = lblkno;
 1221                 if (blkno == BLK_SNAP)
 1222                         blkno = blkstofrags(fs, lblkno);
 1223                 ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
 1224         }
 1225         return (0);
 1226 }
 1227 
 1228 /*
 1229  * Before expunging a snapshot inode, note all the
 1230  * blocks that it claims with BLK_SNAP so that fsck will
 1231  * be able to account for those blocks properly and so
 1232  * that this snapshot knows that it need not copy them
 1233  * if the other snapshot holding them is freed. This code
 1234  * is reproduced once each for UFS1 and UFS2.
 1235  */
 1236 static int
 1237 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
 1238         struct vnode *snapvp;
 1239         struct inode *cancelip;
 1240         struct fs *fs;
 1241         int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 1242             struct fs *, ufs_lbn_t, int);
 1243         int expungetype;
 1244 {
 1245         int i, error, indiroff;
 1246         ufs_lbn_t lbn, rlbn;
 1247         ufs2_daddr_t len, blkno, numblks, blksperindir;
 1248         struct ufs2_dinode *dip;
 1249         struct thread *td = curthread;
 1250         struct buf *bp;
 1251 
 1252         /*
 1253          * Prepare to expunge the inode. If its inode block has not
 1254          * yet been copied, then allocate and fill the copy.
 1255          */
 1256         lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 1257         blkno = 0;
 1258         if (lbn < NDADDR) {
 1259                 blkno = VTOI(snapvp)->i_din2->di_db[lbn];
 1260         } else {
 1261                 td->td_pflags |= TDP_COWINPROGRESS;
 1262                 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 1263                    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 1264                 td->td_pflags &= ~TDP_COWINPROGRESS;
 1265                 if (error)
 1266                         return (error);
 1267                 indiroff = (lbn - NDADDR) % NINDIR(fs);
 1268                 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
 1269                 bqrelse(bp);
 1270         }
 1271         if (blkno != 0) {
 1272                 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 1273                         return (error);
 1274         } else {
 1275                 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 1276                     fs->fs_bsize, KERNCRED, 0, &bp);
 1277                 if (error)
 1278                         return (error);
 1279                 if ((error = readblock(snapvp, bp, lbn)) != 0)
 1280                         return (error);
 1281         }
 1282         /*
 1283          * Set a snapshot inode to be a zero length file, regular files
 1284          * to be completely unallocated.
 1285          */
 1286         dip = (struct ufs2_dinode *)bp->b_data +
 1287             ino_to_fsbo(fs, cancelip->i_number);
 1288         if (expungetype == BLK_NOCOPY)
 1289                 dip->di_mode = 0;
 1290         dip->di_size = 0;
 1291         dip->di_blocks = 0;
 1292         dip->di_flags &= ~SF_SNAPSHOT;
 1293         bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
 1294         bdwrite(bp);
 1295         /*
 1296          * Now go through and expunge all the blocks in the file
 1297          * using the function requested.
 1298          */
 1299         numblks = howmany(cancelip->i_size, fs->fs_bsize);
 1300         if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
 1301             &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
 1302                 return (error);
 1303         if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
 1304             &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
 1305                 return (error);
 1306         blksperindir = 1;
 1307         lbn = -NDADDR;
 1308         len = numblks - NDADDR;
 1309         rlbn = NDADDR;
 1310         for (i = 0; len > 0 && i < NIADDR; i++) {
 1311                 error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
 1312                     cancelip->i_din2->di_ib[i], lbn, rlbn, len,
 1313                     blksperindir, fs, acctfunc, expungetype);
 1314                 if (error)
 1315                         return (error);
 1316                 blksperindir *= NINDIR(fs);
 1317                 lbn -= blksperindir + 1;
 1318                 len -= blksperindir;
 1319                 rlbn += blksperindir;
 1320         }
 1321         return (0);
 1322 }
 1323 
 1324 /*
 1325  * Descend an indirect block chain for vnode cancelvp accounting for all
 1326  * its indirect blocks in snapvp.
 1327  */ 
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot being updated */
	struct vnode *cancelvp;	/* vnode whose blocks are being accounted */
	int level;		/* levels of indirection below this block */
	ufs2_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* (negative) logical block of the indir */
	ufs_lbn_t rlbn;		/* first data block mapped by this indir */
	ufs_lbn_t remblks;	/* data blocks remaining to account */
	ufs_lbn_t blksperindir;	/* data blocks mapped per pointer */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		/*
		 * A missing indirect block is tolerable only when the
		 * blocks it would map need not be copied anyway.
		 */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	/* Cross-check lbn against the indirect chain computed for rlbn. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.  Work
	 * on a private copy so the buffer can be released before the
	 * (possibly recursive) accounting calls below.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	/* Level 0 maps real data blocks, so pass rlbn; otherwise -1. */
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}
 1401 
 1402 /*
 1403  * Do both snap accounting and map accounting.
 1404  */
 1405 static int
 1406 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 1407         struct vnode *vp;
 1408         ufs2_daddr_t *oldblkp, *lastblkp;
 1409         struct fs *fs;
 1410         ufs_lbn_t lblkno;
 1411         int exptype;    /* BLK_SNAP or BLK_NOCOPY */
 1412 {
 1413         int error;
 1414 
 1415         if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
 1416                 return (error);
 1417         return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
 1418 }
 1419 
 1420 /*
 1421  * Identify a set of blocks allocated in a snapshot inode.
 1422  */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;	/* snapshot vnode doing the accounting */
	ufs2_daddr_t *oldblkp, *lastblkp;	/* pointer range to scan */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* unused here; kept for acctfunc signature */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and blocks already classified. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/*
		 * Locate the snapshot's own pointer for this block so it
		 * can be marked with expungetype.
		 */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}
 1472 
/*
 * Account for a set of blocks allocated in a snapshot inode.
 *
 * Free every real block in [oldblkp, lastblkp) back to the filesystem.
 * Slots holding BLK_SNAP denote blocks that this snapshot had claimed,
 * so their physical address is reconstructed from the logical block
 * number before freeing.  When accounting is enabled (lblkno != -1)
 * and we are expunging on behalf of a snapshot, each copied block's
 * logical number is appended to ip->i_snapblklist.  Always returns 0.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
        struct vnode *vp;
        ufs2_daddr_t *oldblkp, *lastblkp;
        struct fs *fs;
        ufs_lbn_t lblkno;
        int expungetype;
{
        ufs2_daddr_t blkno;
        struct inode *ip;
        ino_t inum;
        int acctit;

        ip = VTOI(vp);
        inum = ip->i_number;
        /* lblkno == -1 means the caller wants no hint-list accounting. */
        if (lblkno == -1)
                acctit = 0;
        else
                acctit = 1;
        for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
                blkno = *oldblkp;
                if (blkno == 0 || blkno == BLK_NOCOPY)
                        continue;
                /* Record copied blocks in the preallocated hint list. */
                if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
                        *ip->i_snapblklist++ = lblkno;
                /* A claimed block's address equals its logical block. */
                if (blkno == BLK_SNAP)
                        blkno = blkstofrags(fs, lblkno);
                ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
        }
        return (0);
}
 1507 
/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
        struct inode *ip;
{
        struct inode *xp;
        struct fs *fs;
        int snaploc;
        struct snapdata *sn;
        struct ufsmount *ump;

        /*
         * Find snapshot in incore list.
         */
        xp = NULL;
        sn = ip->i_devvp->v_rdev->si_snapdata;
        if (sn != NULL)
                TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
                        if (xp == ip)
                                break;
        /* Drop the extra vnode hold only if the snapshot is still active. */
        if (xp != NULL)
                vrele(ITOV(ip));
        else if (snapdebug)
                printf("ffs_snapgone: lost snapshot vnode %d\n",
                    ip->i_number);
        /*
         * Delete snapshot inode from superblock. Keep list dense.
         */
        fs = ip->i_fs;
        ump = ip->i_ump;
        UFS_LOCK(ump);
        /* Locate this inode's slot in the superblock snapshot array. */
        for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
                if (fs->fs_snapinum[snaploc] == ip->i_number)
                        break;
        if (snaploc < FSMAXSNAP) {
                /* Shift later entries down so the array stays dense. */
                for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
                        if (fs->fs_snapinum[snaploc] == 0)
                                break;
                        fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
                }
                fs->fs_snapinum[snaploc - 1] = 0;
        }
        UFS_UNLOCK(ump);
}
 1555 
/*
 * Prepare a snapshot file for being removed.
 *
 * Detaches the snapshot from the device's active list (restoring the
 * vnode's private lock in place of the shared snapshot lock), releases
 * or hands off every block the snapshot holds, and finally clears the
 * SF_SNAPSHOT flag so the inode can be reclaimed like a regular file.
 */
void
ffs_snapremove(vp)
        struct vnode *vp;
{
        struct inode *ip;
        struct vnode *devvp;
        struct buf *ibp;
        struct fs *fs;
        ufs2_daddr_t numblks, blkno, dblk;
        int error, loc, last;
        struct snapdata *sn;

        ip = VTOI(vp);
        fs = ip->i_fs;
        devvp = ip->i_devvp;
        /*
         * If active, delete from incore list (this snapshot may
         * already have been in the process of being deleted, so
         * would not have been active).
         *
         * Clear copy-on-write flag if last snapshot.
         */
        VI_LOCK(devvp);
        if (ip->i_nextsnap.tqe_prev != 0) {
                sn = devvp->v_rdev->si_snapdata;
                TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
                ip->i_nextsnap.tqe_prev = 0;
                VI_UNLOCK(devvp);
                /* Switch the vnode back to its private lock. */
                lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
                KASSERT(vp->v_vnlock == &sn->sn_lock,
                        ("ffs_snapremove: lost lock mutation")); 
                vp->v_vnlock = &vp->v_lock;
                VI_LOCK(devvp);
                lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
                /* Frees snapdata (and drops VV_COPYONWRITE) if unused. */
                try_free_snapdata(devvp);
        } else
                VI_UNLOCK(devvp);
        /*
         * Clear all BLK_NOCOPY fields. Pass any block claims to other
         * snapshots that want them (see ffs_snapblkfree below).
         */
        for (blkno = 1; blkno < NDADDR; blkno++) {
                dblk = DIP(ip, i_db[blkno]);
                if (dblk == 0)
                        continue;
                if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
                        DIP_SET(ip, i_db[blkno], 0);
                /*
                 * dblk == blkstofrags(fs, blkno) identifies a block this
                 * snapshot claimed (address equals logical block number);
                 * offer it to the remaining snapshots via ffs_snapblkfree.
                 */
                else if ((dblk == blkstofrags(fs, blkno) &&
                     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
                     ip->i_number))) {
                        DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
                            btodb(fs->fs_bsize));
                        DIP_SET(ip, i_db[blkno], 0);
                }
        }
        numblks = howmany(ip->i_size, fs->fs_bsize);
        for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
                error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
                    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
                if (error)
                        continue;
                /*
                 * NOTE(review): fs_size is in fragments while blkno counts
                 * blocks, so "last" is nearly always NINDIR(fs) — this
                 * matches upstream; confirm intent before changing.
                 */
                if (fs->fs_size - blkno > NINDIR(fs))
                        last = NINDIR(fs);
                else
                        last = fs->fs_size - blkno;
                for (loc = 0; loc < last; loc++) {
                        /* UFS1 and UFS2 differ only in pointer width. */
                        if (ip->i_ump->um_fstype == UFS1) {
                                dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
                                if (dblk == 0)
                                        continue;
                                if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
                                        ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
                                else if ((dblk == blkstofrags(fs, blkno) &&
                                     ffs_snapblkfree(fs, ip->i_devvp, dblk,
                                     fs->fs_bsize, ip->i_number))) {
                                        ip->i_din1->di_blocks -=
                                            btodb(fs->fs_bsize);
                                        ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
                                }
                                continue;
                        }
                        dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
                        if (dblk == 0)
                                continue;
                        if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
                                ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
                        else if ((dblk == blkstofrags(fs, blkno) &&
                             ffs_snapblkfree(fs, ip->i_devvp, dblk,
                             fs->fs_bsize, ip->i_number))) {
                                ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
                                ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
                        }
                }
                bawrite(ibp);
        }
        /*
         * Clear snapshot flag and drop reference.
         */
        ip->i_flags &= ~SF_SNAPSHOT;
        DIP_SET(ip, i_flags, ip->i_flags);
        ip->i_flag |= IN_CHANGE | IN_UPDATE;
#ifdef QUOTA
        /*
         * Reenable disk quotas for ex-snapshot file.
         */
        if (!getinoquota(ip))
                (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
#endif
}
 1668 
/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
        struct fs *fs;
        struct vnode *devvp;
        ufs2_daddr_t bno;
        long size;
        ino_t inum;
{
        struct buf *ibp, *cbp, *savedcbp = 0;
        struct thread *td = curthread;
        struct inode *ip;
        struct vnode *vp = NULL;
        ufs_lbn_t lbn;
        ufs2_daddr_t blkno;
        int indiroff = 0, error = 0, claimedblk = 0;
        struct snapdata *sn;

        lbn = fragstoblks(fs, bno);
retry:
        VI_LOCK(devvp);
        sn = devvp->v_rdev->si_snapdata;
        if (sn == NULL) {
                VI_UNLOCK(devvp);
                return (0);
        }
        /*
         * LK_SLEEPFAIL: if we had to sleep, the snapdata may have been
         * replaced under us, so start over from the interlock.
         */
        if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
            VI_MTX(devvp)) != 0)
                goto retry;
        TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
                vp = ITOV(ip);
                /*
                 * Lookup block being written.
                 */
                if (lbn < NDADDR) {
                        blkno = DIP(ip, i_db[lbn]);
                } else {
                        /* TDP_COWINPROGRESS guards against recursive COW. */
                        td->td_pflags |= TDP_COWINPROGRESS;
                        error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
                            fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
                        td->td_pflags &= ~TDP_COWINPROGRESS;
                        if (error)
                                break;
                        indiroff = (lbn - NDADDR) % NINDIR(fs);
                        if (ip->i_ump->um_fstype == UFS1)
                                blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
                        else
                                blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
                }
                /*
                 * Check to see if block needs to be copied.
                 */
                if (blkno == 0) {
                        /*
                         * A block that we map is being freed. If it has not
                         * been claimed yet, we will claim or copy it (below).
                         */
                        claimedblk = 1;
                } else if (blkno == BLK_SNAP) {
                        /*
                         * No previous snapshot claimed the block,
                         * so it will be freed and become a BLK_NOCOPY
                         * (don't care) for us.
                         */
                        if (claimedblk)
                                panic("snapblkfree: inconsistent block type");
                        if (lbn < NDADDR) {
                                DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
                                ip->i_flag |= IN_CHANGE | IN_UPDATE;
                        } else if (ip->i_ump->um_fstype == UFS1) {
                                ((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
                                    BLK_NOCOPY;
                                bdwrite(ibp);
                        } else {
                                ((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
                                    BLK_NOCOPY;
                                bdwrite(ibp);
                        }
                        continue;
                } else /* BLK_NOCOPY or default */ {
                        /*
                         * If the snapshot has already copied the block
                         * (default), or does not care about the block,
                         * it is not needed.
                         */
                        if (lbn >= NDADDR)
                                bqrelse(ibp);
                        continue;
                }
                /*
                 * If this is a full size block, we will just grab it
                 * and assign it to the snapshot inode. Otherwise we
                 * will proceed to copy it. See explanation for this
                 * routine as to why only a single snapshot needs to
                 * claim this block.
                 */
                if (size == fs->fs_bsize) {
#ifdef DEBUG
                        if (snapdebug)
                                printf("%s %d lbn %jd from inum %d\n",
                                    "Grabonremove: snapino", ip->i_number,
                                    (intmax_t)lbn, inum);
#endif
                        if (lbn < NDADDR) {
                                DIP_SET(ip, i_db[lbn], bno);
                        } else if (ip->i_ump->um_fstype == UFS1) {
                                ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
                                bdwrite(ibp);
                        } else {
                                ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
                                bdwrite(ibp);
                        }
                        DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
                        ip->i_flag |= IN_CHANGE | IN_UPDATE;
                        lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
                        /* Non-zero: the caller must not free the block. */
                        return (1);
                }
                if (lbn >= NDADDR)
                        bqrelse(ibp);
                /*
                 * Allocate the block into which to do the copy. Note that this
                 * allocation will never require any additional allocations for
                 * the snapshot inode.
                 */
                td->td_pflags |= TDP_COWINPROGRESS;
                error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
                    fs->fs_bsize, KERNCRED, 0, &cbp);
                td->td_pflags &= ~TDP_COWINPROGRESS;
                if (error)
                        break;
#ifdef DEBUG
                if (snapdebug)
                        printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
                            "Copyonremove: snapino ", ip->i_number,
                            (intmax_t)lbn, "for inum", inum, size,
                            (intmax_t)cbp->b_blkno);
#endif
                /*
                 * If we have already read the old block contents, then
                 * simply copy them to the new block. Note that we need
                 * to synchronously write snapshots that have not been
                 * unlinked, and hence will be visible after a crash,
                 * to ensure their integrity.
                 */
                if (savedcbp != 0) {
                        bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
                        bawrite(cbp);
                        if (dopersistence && ip->i_effnlink > 0)
                                (void) ffs_syncvnode(vp, MNT_WAIT);
                        continue;
                }
                /*
                 * Otherwise, read the old block contents into the buffer.
                 */
                if ((error = readblock(vp, cbp, lbn)) != 0) {
                        bzero(cbp->b_data, fs->fs_bsize);
                        bawrite(cbp);
                        if (dopersistence && ip->i_effnlink > 0)
                                (void) ffs_syncvnode(vp, MNT_WAIT);
                        break;
                }
                /* First copy is kept as the source for later snapshots. */
                savedcbp = cbp;
        }
        /*
         * Note that we need to synchronously write snapshots that
         * have not been unlinked, and hence will be visible after
         * a crash, to ensure their integrity.
         */
        if (savedcbp) {
                vp = savedcbp->b_vp;
                bawrite(savedcbp);
                if (dopersistence && VTOI(vp)->i_effnlink > 0)
                        (void) ffs_syncvnode(vp, MNT_WAIT);
        }
        /*
         * If we have been unable to allocate a block in which to do
         * the copy, then return non-zero so that the fragment will
         * not be freed. Although space will be lost, the snapshot
         * will stay consistent.
         */
        lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
        return (error);
}
 1869 
/*
 * Associate snapshot files when mounting.
 *
 * For each inode recorded in fs_snapinum: load its vnode, validate it,
 * switch the vnode onto the shared snapshot lock, and link it onto the
 * device's active snapshot list.  Invalid entries (non-snapshots or
 * old-format snapshots, which are truncated) are dropped and the array
 * is kept dense.  Afterwards, the preallocated-block hint list is read
 * from the end of the newest usable snapshot file.
 */
void
ffs_snapshot_mount(mp)
        struct mount *mp;
{
        struct ufsmount *ump = VFSTOUFS(mp);
        struct vnode *devvp = ump->um_devvp;
        struct fs *fs = ump->um_fs;
        struct thread *td = curthread;
        struct snapdata *sn;
        struct vnode *vp;
        struct vnode *lastvp;
        struct inode *ip;
        struct uio auio;
        struct iovec aiov;
        void *snapblklist;
        char *reason;
        daddr_t snaplistsize;
        int error, snaploc, loc;

        /*
         * XXX The following needs to be set before ffs_truncate or
         * VOP_READ can be called.
         */
        mp->mnt_stat.f_iosize = fs->fs_bsize;
        /*
         * Process each snapshot listed in the superblock.
         */
        vp = NULL;
        lastvp = NULL;
        sn = NULL;
        for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
                if (fs->fs_snapinum[snaploc] == 0)
                        break;
                if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
                    LK_EXCLUSIVE, &vp)) != 0){
                        printf("ffs_snapshot_mount: vget failed %d\n", error);
                        continue;
                }
                ip = VTOI(vp);
                /*
                 * Reject entries that are not snapshots, and old-format
                 * snapshots whose size lacks the trailing hint list.
                 */
                if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
                    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
                        if ((ip->i_flags & SF_SNAPSHOT) == 0) {
                                reason = "non-snapshot";
                        } else {
                                reason = "old format snapshot";
                                (void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
                                (void)ffs_syncvnode(vp, MNT_WAIT);
                        }
                        printf("ffs_snapshot_mount: %s inode %d\n",
                            reason, fs->fs_snapinum[snaploc]);
                        vput(vp);
                        vp = NULL;
                        /* Compact the array over the rejected entry. */
                        for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
                                if (fs->fs_snapinum[loc] == 0)
                                        break;
                                fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
                        }
                        fs->fs_snapinum[loc - 1] = 0;
                        /* Re-examine the slot now holding the next entry. */
                        snaploc--;
                        continue;
                }
                /*
                 * Acquire a lock on the snapdata structure, creating it if
                 * necessary.
                 */
                sn = ffs_snapdata_acquire(devvp);
                /* 
                 * Change vnode to use shared snapshot lock instead of the
                 * original private lock.
                 */
                vp->v_vnlock = &sn->sn_lock;
                lockmgr(&vp->v_lock, LK_RELEASE, NULL);
                /*
                 * Link it onto the active snapshot list.
                 */
                VI_LOCK(devvp);
                if (ip->i_nextsnap.tqe_prev != 0)
                        panic("ffs_snapshot_mount: %d already on list",
                            ip->i_number);
                else
                        TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
                vp->v_vflag |= VV_SYSTEM;
                VI_UNLOCK(devvp);
                VOP_UNLOCK(vp, 0);
                lastvp = vp;
        }
        vp = lastvp;
        /*
         * No usable snapshots found.
         */
        if (sn == NULL || vp == NULL)
                return;
        /*
         * Allocate the space for the block hints list. We always want to
         * use the list from the newest snapshot.
         */
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        aiov.iov_base = (void *)&snaplistsize;
        aiov.iov_len = sizeof(snaplistsize);
        auio.uio_resid = aiov.iov_len;
        /* The list length is stored just past the filesystem image. */
        auio.uio_offset =
            lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_rw = UIO_READ;
        auio.uio_td = td;
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
                printf("ffs_snapshot_mount: read_1 failed %d\n", error);
                VOP_UNLOCK(vp, 0);
                return;
        }
        snapblklist = malloc(snaplistsize * sizeof(daddr_t),
            M_UFSMNT, M_WAITOK);
        auio.uio_iovcnt = 1;
        aiov.iov_base = snapblklist;
        aiov.iov_len = snaplistsize * sizeof (daddr_t);
        auio.uio_resid = aiov.iov_len;
        /* Re-read from the length word onward, list length included. */
        auio.uio_offset -= sizeof(snaplistsize);
        if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
                printf("ffs_snapshot_mount: read_2 failed %d\n", error);
                VOP_UNLOCK(vp, 0);
                free(snapblklist, M_UFSMNT);
                return;
        }
        VOP_UNLOCK(vp, 0);
        VI_LOCK(devvp);
        ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
        sn->sn_listsize = snaplistsize;
        sn->sn_blklist = (daddr_t *)snapblklist;
        devvp->v_vflag |= VV_COPYONWRITE;
        VI_UNLOCK(devvp);
}
 2006 
/*
 * Disassociate snapshot files when unmounting.
 *
 * Drain the device's active snapshot list: each vnode is moved back
 * from the shared snapshot lock to its private lock and, if it still
 * has names (i_effnlink > 0), the extra hold from mounting is dropped.
 * Finally the snapdata itself is freed if no users remain.
 */
void
ffs_snapshot_unmount(mp)
        struct mount *mp;
{
        struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
        struct snapdata *sn;
        struct inode *xp;
        struct vnode *vp;

        VI_LOCK(devvp);
        sn = devvp->v_rdev->si_snapdata;
        while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
                vp = ITOV(xp);
                TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
                xp->i_nextsnap.tqe_prev = 0;
                lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE,
                    VI_MTX(devvp));
                /* Restore the vnode's private lock. */
                lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
                KASSERT(vp->v_vnlock == &sn->sn_lock,
                ("ffs_snapshot_unmount: lost lock mutation")); 
                vp->v_vnlock = &vp->v_lock;
                lockmgr(&vp->v_lock, LK_RELEASE, NULL);
                lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
                if (xp->i_effnlink > 0)
                        vrele(vp);
                /* Re-take the interlock; snapdata may have changed. */
                VI_LOCK(devvp);
                sn = devvp->v_rdev->si_snapdata;
        }
        try_free_snapdata(devvp);
        ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
}
 2041 
 2042 /*
 2043  * Check the buffer block to be belong to device buffer that shall be
 2044  * locked after snaplk. devvp shall be locked on entry, and will be
 2045  * leaved locked upon exit.
 2046  */
 2047 static int
 2048 ffs_bp_snapblk(devvp, bp)
 2049         struct vnode *devvp;
 2050         struct buf *bp;
 2051 {
 2052         struct snapdata *sn;
 2053         struct fs *fs;
 2054         ufs2_daddr_t lbn, *snapblklist;
 2055         int lower, upper, mid;
 2056 
 2057         ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
 2058         KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
 2059         sn = devvp->v_rdev->si_snapdata;
 2060         if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
 2061                 return (0);
 2062         fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
 2063         lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 2064         snapblklist = sn->sn_blklist;
 2065         upper = sn->sn_listsize - 1;
 2066         lower = 1;
 2067         while (lower <= upper) {
 2068                 mid = (lower + upper) / 2;
 2069                 if (snapblklist[mid] == lbn)
 2070                         break;
 2071                 if (snapblklist[mid] < lbn)
 2072                         lower = mid + 1;
 2073                 else
 2074                         upper = mid - 1;
 2075         }
 2076         if (lower <= upper)
 2077                 return (1);
 2078         return (0);
 2079 }
 2080 
/*
 * FFS-specific bdflush: when the buffer object's dirty count exceeds
 * dirtybufthresh, flush dirty buffers — but avoid synchronously writing
 * buffers that belong to snapshot-preallocated blocks (bp_bdskip), as
 * those are counted via bdwriteskip and handled more carefully.
 */
void
ffs_bdflush(bo, bp)
        struct bufobj *bo;
        struct buf *bp;
{
        struct thread *td;
        struct vnode *vp, *devvp;
        struct buf *nbp;
        int bp_bdskip;

        /* Below the threshold there is nothing to do. */
        if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
                return;

        td = curthread;
        vp = bp->b_vp;
        devvp = bo->__bo_vnode;
        KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));

        VI_LOCK(devvp);
        /* Is bp itself one of the snapshot-tracked blocks? */
        bp_bdskip = ffs_bp_snapblk(devvp, bp);
        if (bp_bdskip)
                bdwriteskip++;
        VI_UNLOCK(devvp);
        if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
                /* Way over threshold: sync the whole vnode in one go. */
                (void) VOP_FSYNC(vp, MNT_NOWAIT, td);
                altbufferflushes++;
        } else {
                BO_LOCK(bo);
                /*
                 * Try to find a buffer to flush.
                 */
                TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
                        if ((nbp->b_vflags & BV_BKGRDINPROG) ||
                            BUF_LOCK(nbp,
                                     LK_EXCLUSIVE | LK_NOWAIT, NULL))
                                continue;
                        if (bp == nbp)
                                panic("bdwrite: found ourselves");
                        BO_UNLOCK(bo);
                        /*
                         * Don't countdeps with the bo lock
                         * held.
                         */
                        if (buf_countdeps(nbp, 0)) {
                                BO_LOCK(bo);
                                BUF_UNLOCK(nbp);
                                continue;
                        }
                        /*
                         * If bp is snapshot-tracked, only flush peers that
                         * are also snapshot-tracked (keeps lock ordering
                         * with snaplk consistent).
                         */
                        if (bp_bdskip) {
                                VI_LOCK(devvp);
                                if (!ffs_bp_snapblk(vp, nbp)) {
                                        if (BO_MTX(bo) != VI_MTX(vp)) {
                                                VI_UNLOCK(devvp);
                                                BO_LOCK(bo);
                                        }
                                        BUF_UNLOCK(nbp);
                                        continue;
                                }
                                VI_UNLOCK(devvp);
                        }
                        if (nbp->b_flags & B_CLUSTEROK) {
                                vfs_bio_awrite(nbp);
                        } else {
                                bremfree(nbp);
                                bawrite(nbp);
                        }
                        dirtybufferflushes++;
                        break;
                }
                if (nbp == NULL)
                        BO_UNLOCK(bo);
        }
}
 2154 
/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 *
 * Called from the device write path with the buffer bp destined for
 * device devvp.  For each active snapshot on the filesystem, the old
 * contents of the block are copied into the snapshot before the write
 * is allowed to proceed.  Returns 0 when no copy was needed (or on
 * success) and an errno value if a copy attempt failed.
 */
int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snapdata *sn;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, error = 0;
	int launched_async_io, prev_norunningbuf;
	long saved_runningbufspace;

	if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
		return (0);		/* Update on a snapshot file */
	if (td->td_pflags & TDP_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	if (sn == NULL ||
	    TAILQ_EMPTY(&sn->sn_head)) {
		VI_UNLOCK(devvp);
		return (0);		/* No snapshot */
	}
	ip = TAILQ_FIRST(&sn->sn_head);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	/*
	 * Binary search the sorted preallocated-block list for lbn.
	 * The search starts at index 1; entry 0 appears to be reserved
	 * (presumably for the list length -- TODO confirm against the
	 * code that builds sn_blklist).
	 */
	snapblklist = sn->sn_blklist;
	upper = sn->sn_listsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		/* Found in the list: no copy needed for this block. */
		VI_UNLOCK(devvp);
		return (0);
	}
	launched_async_io = 0;
	prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
	/*
	 * Since I/O on bp isn't yet in progress and it may be blocked
	 * for a long time waiting on snaplk, back it out of
	 * runningbufspace, possibly waking other threads waiting for space.
	 */
	saved_runningbufspace = bp->b_runningbufspace;
	if (saved_runningbufspace != 0)
		runningbufwakeup(bp);
	/*
	 * Not in the precomputed list, so check the snapshots.
	 *
	 * LK_SLEEPFAIL makes lockmgr() return nonzero if we had to
	 * sleep for the lock; the snapdata may have changed while we
	 * slept, so re-read it under the interlock and retry, bailing
	 * out if the last snapshot disappeared in the meantime.
	 */
	while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
	    VI_MTX(devvp)) != 0) {
		VI_LOCK(devvp);
		sn = devvp->v_rdev->si_snapdata;
		if (sn == NULL ||
		    TAILQ_EMPTY(&sn->sn_head)) {
			VI_UNLOCK(devvp);
			/* Put bp back into runningbufspace accounting. */
			if (saved_runningbufspace != 0) {
				bp->b_runningbufspace = saved_runningbufspace;
				atomic_add_long(&runningbufspace,
					       bp->b_runningbufspace);
			}
			return (0);		/* Snapshot gone */
		}
	}
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			/*
			 * Indirectly-mapped block: fetch the snapshot's
			 * indirect block (BA_METAONLY) to read the slot.
			 * TDP_COWINPROGRESS guards against re-entering
			 * this routine from the balloc path (see the
			 * panic at the top).
			 */
			td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef INVARIANTS
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;	/* Already mapped: no copy needed. */
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT);
			else
				launched_async_io = 1;
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT);
			else
				launched_async_io = 1;
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) ffs_syncvnode(vp, MNT_WAIT);
		else
			launched_async_io = 1;
	}
	/*
	 * NOTE(review): releasing vp's v_vnlock here presumably drops
	 * the shared snapshot lock (sn_lock) acquired above, since all
	 * snapshot vnodes appear to share it -- confirm against
	 * ffs_snapshot().  vp is non-NULL because sn_head was verified
	 * non-empty while taking sn_lock.
	 */
	lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
	td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
		prev_norunningbuf;
	if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
		waitrunningbufspace();
	/*
	 * I/O on bp will now be started, so count it in runningbufspace.
	 */
	if (saved_runningbufspace != 0) {
		bp->b_runningbufspace = saved_runningbufspace;
		atomic_add_long(&runningbufspace, bp->b_runningbufspace);
	}
	return (error);
}
 2360 
 2361 /*
 2362  * Read the specified block into the given buffer.
 2363  * Much of this boiler-plate comes from bwrite().
 2364  */
 2365 static int
 2366 readblock(vp, bp, lbn)
 2367         struct vnode *vp;
 2368         struct buf *bp;
 2369         ufs2_daddr_t lbn;
 2370 {
 2371         struct inode *ip = VTOI(vp);
 2372         struct bio *bip;
 2373 
 2374         bip = g_alloc_bio();
 2375         bip->bio_cmd = BIO_READ;
 2376         bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
 2377         bip->bio_data = bp->b_data;
 2378         bip->bio_length = bp->b_bcount;
 2379         bip->bio_done = NULL;
 2380 
 2381         g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
 2382         bp->b_error = biowait(bip, "snaprdb");
 2383         g_destroy_bio(bip);
 2384         return (bp->b_error);
 2385 }
 2386 
 2387 #endif
 2388 
/*
 * Process file deletes that were deferred by ufs_inactive() due to
 * the file system being suspended. Transfer IN_LAZYACCESS into
 * IN_MODIFIED for vnodes that were accessed during suspension.
 */
void
process_deferred_inactive(struct mount *mp)
{
	struct vnode *vp, *mvp;
	struct inode *ip;
	struct thread *td;
	int error;

	td = curthread;
	/* Block filesystem suspension for the duration of the scan. */
	(void) vn_start_secondary_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
 loop:
	MNT_VNODE_FOREACH(vp, mp, mvp) {
		VI_LOCK(vp);
		/*
		 * IN_LAZYACCESS is checked here without holding any
		 * vnode lock, but this flag is set only while holding
		 * vnode interlock.
		 */
		if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 ||
		    ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
			((vp->v_iflag & VI_OWEINACT) == 0 ||
			vp->v_usecount > 0))) {
			/* Nothing was deferred on this vnode; skip it. */
			VI_UNLOCK(vp);
			continue;
		}
		MNT_IUNLOCK(mp);
		/* Hold vp so it cannot be recycled while unlocked below. */
		vholdl(vp);
		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
		if (error != 0) {
			vdrop(vp);
			MNT_ILOCK(mp);
			if (error == ENOENT)
				continue;	/* vnode recycled */
			/* Iteration position lost; restart the scan. */
			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
			goto loop;
		}
		ip = VTOI(vp);
		/* Promote the lazy access timestamp into a real update. */
		if ((ip->i_flag & IN_LAZYACCESS) != 0) {
			ip->i_flag &= ~IN_LAZYACCESS;
			ip->i_flag |= IN_MODIFIED;
		}
		VI_LOCK(vp);
		if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
			/* No inactivation owed (or vnode is busy again). */
			VI_UNLOCK(vp);
			VOP_UNLOCK(vp, 0);
			vdrop(vp);
			MNT_ILOCK(mp);
			continue;
		}
		
		/*
		 * Run the deferred VOP_INACTIVE() ourselves, following
		 * the VI_DOINGINACT protocol: set the flag around the
		 * call and assert it is neither recursed on nor lost.
		 */
		VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
			 ("process_deferred_inactive: "
			  "recursed on VI_DOINGINACT"));
		vp->v_iflag |= VI_DOINGINACT;
		vp->v_iflag &= ~VI_OWEINACT;
		VI_UNLOCK(vp);
		(void) VOP_INACTIVE(vp, td);
		VI_LOCK(vp);
		VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
			 ("process_deferred_inactive: lost VI_DOINGINACT"));
		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
			 ("process_deferred_inactive: got VI_OWEINACT"));
		vp->v_iflag &= ~VI_DOINGINACT;
		VI_UNLOCK(vp);
		VOP_UNLOCK(vp, 0);
		vdrop(vp);
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vn_finished_secondary_write(mp);
}
 2466 
 2467 #ifndef NO_FFS_SNAPSHOT
 2468 
 2469 static struct snapdata *
 2470 ffs_snapdata_alloc(void)
 2471 {
 2472         struct snapdata *sn;
 2473 
 2474         /*
 2475          * Fetch a snapdata from the free list if there is one available.
 2476          */
 2477         mtx_lock(&snapfree_lock);
 2478         sn = LIST_FIRST(&snapfree);
 2479         if (sn != NULL)
 2480                 LIST_REMOVE(sn, sn_link);
 2481         mtx_unlock(&snapfree_lock);
 2482         if (sn != NULL)
 2483                 return (sn);
 2484         /*
 2485          * If there were no free snapdatas allocate one.
 2486          */
 2487         sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
 2488         TAILQ_INIT(&sn->sn_head);
 2489         lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
 2490             LK_CANRECURSE | LK_NOSHARE);
 2491         return (sn);
 2492 }
 2493 
/*
 * The snapdata is never freed because we can not be certain that
 * there are no threads sleeping on the snap lock.  Persisting
 * them permanently avoids costly synchronization in ffs_lock().
 */
static void
ffs_snapdata_free(struct snapdata *sn)
{
	/* Retire sn onto the global free list for later reuse. */
	mtx_lock(&snapfree_lock);
	LIST_INSERT_HEAD(&snapfree, sn, sn_link);
	mtx_unlock(&snapfree_lock);
}
 2506 
/*
 * Try to free snapdata associated with devvp.
 *
 * Must be entered with the devvp interlock held; the interlock is
 * always released before returning, either directly or handed off to
 * lockmgr() via LK_INTERLOCK.  The snapdata is released only when it
 * has no remaining snapshots and copy-on-write is no longer armed.
 */
static void
try_free_snapdata(struct vnode *devvp)
{
	struct snapdata *sn;
	ufs2_daddr_t *snapblklist;

	ASSERT_VI_LOCKED(devvp, "try_free_snapdata");
	sn = devvp->v_rdev->si_snapdata;

	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
	    (devvp->v_vflag & VV_COPYONWRITE) == 0) {
		/* Still in use, or nothing to free. */
		VI_UNLOCK(devvp);
		return;
	}

	/*
	 * Detach the snapdata from the device first so no new user can
	 * find it, then LK_DRAIN the snap lock to wait out any current
	 * holders before dismantling the block list.
	 */
	devvp->v_rdev->si_snapdata = NULL;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
	snapblklist = sn->sn_blklist;
	sn->sn_blklist = NULL;
	sn->sn_listsize = 0;
	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
	if (snapblklist != NULL)
		free(snapblklist, M_UFSMNT);
	ffs_snapdata_free(sn);	/* Returns sn to the global free list. */
}
 2534 
/*
 * Look up (installing if necessary) the snapdata for devvp and return
 * it with its snapshot lock held exclusively.
 */
static struct snapdata *
ffs_snapdata_acquire(struct vnode *devvp)
{
	struct snapdata *nsn;
	struct snapdata *sn;

	/*
	 * Allocate a free snapdata.  This is done before acquiring the
	 * devvp lock to avoid allocation while the devvp interlock is
	 * held.
	 */
	nsn = ffs_snapdata_alloc();
	/*
	 * If there snapshots already exist on this filesystem grab a
	 * reference to the shared lock.  Otherwise this is the first
	 * snapshot on this filesystem and we need to use our
	 * pre-allocated snapdata.
	 */
	VI_LOCK(devvp);
	if (devvp->v_rdev->si_snapdata == NULL) {
		devvp->v_rdev->si_snapdata = nsn;
		nsn = NULL;	/* Consumed; do not free it below. */
	}
	sn = devvp->v_rdev->si_snapdata;
	/*
	 * Acquire the snapshot lock.  LK_INTERLOCK drops the devvp
	 * interlock atomically with the lock request.
	 */
	lockmgr(&sn->sn_lock,
	    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(devvp));
	/*
	 * Free any unused snapdata.
	 */
	if (nsn != NULL)
		ffs_snapdata_free(nsn);

	return (sn);
}
 2572 
 2573 #endif

Cache object: 3e718b875a0027f15a7a62ab20bd0910


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.