FreeBSD/Linux Kernel Cross Reference
sys/ufs/ffs/ffs_snapshot.c

    1 /*-
    2  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
    3  *
    4  * Further information about snapshots can be obtained from:
    5  *
    6  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
    7  *      1614 Oxford Street              mckusick@mckusick.com
    8  *      Berkeley, CA 94709-1608         +1-510-843-9542
    9  *      USA
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  *
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  *
   21  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
   22  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   24  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
   25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   31  * SUCH DAMAGE.
   32  *
   33  *      @(#)ffs_snapshot.c      8.11 (McKusick) 7/23/00
   34  */
   35 
   36 #include <sys/cdefs.h>
   37 __FBSDID("$FreeBSD$");
   38 
   39 #include "opt_quota.h"
   40 
   41 #include <sys/param.h>
   42 #include <sys/kernel.h>
   43 #include <sys/systm.h>
   44 #include <sys/conf.h>
   45 #include <sys/bio.h>
   46 #include <sys/buf.h>
   47 #include <sys/proc.h>
   48 #include <sys/namei.h>
   49 #include <sys/sched.h>
   50 #include <sys/stat.h>
   51 #include <sys/malloc.h>
   52 #include <sys/mount.h>
   53 #include <sys/resource.h>
   54 #include <sys/resourcevar.h>
   55 #include <sys/vnode.h>
   56 
   57 #include <geom/geom.h>
   58 
   59 #include <ufs/ufs/extattr.h>
   60 #include <ufs/ufs/quota.h>
   61 #include <ufs/ufs/ufsmount.h>
   62 #include <ufs/ufs/inode.h>
   63 #include <ufs/ufs/ufs_extern.h>
   64 
   65 #include <ufs/ffs/fs.h>
   66 #include <ufs/ffs/ffs_extern.h>
   67 
   68 #define KERNCRED thread0.td_ucred
   69 #define DEBUG 1
   70 
   71 #include "opt_ffs.h"
   72 
   73 #ifdef NO_FFS_SNAPSHOT
   74 int
   75 ffs_snapshot(mp, snapfile)
   76         struct mount *mp;
   77         char *snapfile;
   78 {
   79         return (EINVAL);
   80 }
   81 
   82 int
   83 ffs_snapblkfree(fs, devvp, bno, size, inum)
   84         struct fs *fs;
   85         struct vnode *devvp;
   86         ufs2_daddr_t bno;
   87         long size;
   88         ino_t inum;
   89 {
   90         return (EINVAL);
   91 }
   92 
   93 void
   94 ffs_snapremove(vp)
   95         struct vnode *vp;
   96 {
   97 }
   98 
   99 void
  100 ffs_snapshot_mount(mp)
  101         struct mount *mp;
  102 {
  103 }
  104 
  105 void
  106 ffs_snapshot_unmount(mp)
  107         struct mount *mp;
  108 {
  109 }
  110 
  111 void
  112 ffs_snapgone(ip)
  113         struct inode *ip;
  114 {
  115 }
  116 
  117 int
  118 ffs_copyonwrite(devvp, bp)
  119         struct vnode *devvp;
  120         struct buf *bp;
  121 {
  122         return (EINVAL);
  123 }
  124 
  125 #else
  126 
  127 TAILQ_HEAD(snaphead, inode);
  128 
  129 struct snapdata {
  130         struct snaphead sn_head;
  131         daddr_t sn_listsize;
  132         daddr_t *sn_blklist;
  133         struct lock sn_lock;
  134 };
  135 
  136 static int cgaccount(int, struct vnode *, struct buf *, int);
  137 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
  138     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
  139     ufs_lbn_t, int), int);
  140 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
  141     ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
  142     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
  143     ufs_lbn_t, int), int);
  144 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
  145     struct fs *, ufs_lbn_t, int);
  146 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
  147     struct fs *, ufs_lbn_t, int);
  148 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
  149     struct fs *, ufs_lbn_t, int);
  150 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
  151     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
  152     ufs_lbn_t, int), int);
  153 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
  154     ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
  155     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
  156     ufs_lbn_t, int), int);
  157 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
  158     struct fs *, ufs_lbn_t, int);
  159 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
  160     struct fs *, ufs_lbn_t, int);
  161 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
  162     struct fs *, ufs_lbn_t, int);
  163 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
  164 static void process_deferred_inactive(struct mount *);
  165 static void try_free_snapdata(struct vnode *devvp, struct thread *td);
  166 static int ffs_bp_snapblk(struct vnode *, struct buf *);
  167 
  168 /*
  169  * To ensure the consistency of snapshots across crashes, we must
  170  * synchronously write out copied blocks before allowing the
  171  * originals to be modified. Because of the rather severe speed
  172  * penalty that this imposes, the following flag allows this
  173  * crash persistence to be disabled.
  174  */
  175 int dopersistence = 0;
  176 
  177 #ifdef DEBUG
  178 #include <sys/sysctl.h>
  179 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
  180 static int snapdebug = 0;
  181 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
  182 int collectsnapstats = 0;
  183 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
  184         0, "");
  185 #endif /* DEBUG */
  186 
  187 /*
  188  * Create a snapshot file and initialize it for the filesystem.
  189  */
  190 int
  191 ffs_snapshot(mp, snapfile)
  192         struct mount *mp;
  193         char *snapfile;
  194 {
  195         ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
  196         int error, cg, snaploc;
  197         int i, size, len, loc;
  198         int flag;
  199         struct timespec starttime = {0, 0}, endtime;
  200         char saved_nice = 0;
  201         long redo = 0, snaplistsize = 0;
  202         int32_t *lp;
  203         void *space;
  204         struct fs *copy_fs = NULL, *fs;
  205         struct thread *td = curthread;
  206         struct inode *ip, *xp;
  207         struct buf *bp, *nbp, *ibp, *sbp = NULL;
  208         struct nameidata nd;
  209         struct mount *wrtmp;
  210         struct vattr vat;
  211         struct vnode *vp, *xvp, *mvp, *devvp;
  212         struct uio auio;
  213         struct iovec aiov;
  214         struct snapdata *sn;
  215         struct ufsmount *ump;
  216 
  217         ump = VFSTOUFS(mp);
  218         fs = ump->um_fs;
  219         sn = NULL;
  220         MNT_ILOCK(mp);
  221         flag = mp->mnt_flag;
  222         MNT_IUNLOCK(mp);
  223 
  224         /*
  225          * Need to serialize access to snapshot code per filesystem.
  226          */
  227         /*
  228          * Assign a snapshot slot in the superblock.
  229          */
  230         UFS_LOCK(ump);
  231         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
  232                 if (fs->fs_snapinum[snaploc] == 0)
  233                         break;
  234         UFS_UNLOCK(ump);
  235         if (snaploc == FSMAXSNAP)
  236                 return (ENOSPC);
  237         /*
  238          * Create the snapshot file.
  239          */
  240 restart:
  241         NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td);
  242         if ((error = namei(&nd)) != 0)
  243                 return (error);
  244         if (nd.ni_vp != NULL) {
  245                 vput(nd.ni_vp);
  246                 error = EEXIST;
  247         }
  248         if (nd.ni_dvp->v_mount != mp)
  249                 error = EXDEV;
  250         if (error) {
  251                 NDFREE(&nd, NDF_ONLY_PNBUF);
  252                 if (nd.ni_dvp == nd.ni_vp)
  253                         vrele(nd.ni_dvp);
  254                 else
  255                         vput(nd.ni_dvp);
  256                 return (error);
  257         }
  258         VATTR_NULL(&vat);
  259         vat.va_type = VREG;
  260         vat.va_mode = S_IRUSR;
  261         vat.va_vaflags |= VA_EXCLUSIVE;
  262         if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
  263                 wrtmp = NULL;
  264         if (wrtmp != mp)
  265                 panic("ffs_snapshot: mount mismatch");
  266         vfs_rel(wrtmp);
  267         if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
  268                 NDFREE(&nd, NDF_ONLY_PNBUF);
  269                 vput(nd.ni_dvp);
  270                 if ((error = vn_start_write(NULL, &wrtmp,
  271                     V_XSLEEP | PCATCH)) != 0)
  272                         return (error);
  273                 goto restart;
  274         }
  275         VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
  276         error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
  277         VOP_UNLOCK(nd.ni_dvp, 0, td);
  278         if (error) {
  279                 NDFREE(&nd, NDF_ONLY_PNBUF);
  280                 vn_finished_write(wrtmp);
  281                 vrele(nd.ni_dvp);
  282                 return (error);
  283         }
  284         vp = nd.ni_vp;
  285         vp->v_vflag |= VV_SYSTEM;
  286         ip = VTOI(vp);
  287         devvp = ip->i_devvp;
  288         /*
  289          * Allocate and copy the last block contents so as to be able
  290          * to set size to that of the filesystem.
  291          */
  292         numblks = howmany(fs->fs_size, fs->fs_frag);
  293         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
  294             fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
  295         if (error)
  296                 goto out;
  297         ip->i_size = lblktosize(fs, (off_t)numblks);
  298         DIP_SET(ip, i_size, ip->i_size);
  299         ip->i_flag |= IN_CHANGE | IN_UPDATE;
  300         error = readblock(vp, bp, numblks - 1);
  301         bawrite(bp);
  302         if (error != 0)
  303                 goto out;
  304         /*
  305          * Preallocate critical data structures so that we can copy
  306          * them in without further allocation after we suspend all
  307          * operations on the filesystem. We would like to just release
  308          * the allocated buffers without writing them since they will
  309          * be filled in below once we are ready to go, but this upsets
  310          * the soft update code, so we go ahead and write the new buffers.
  311          *
  312          * Allocate all indirect blocks and mark all of them as not
  313          * needing to be copied.
  314          */
  315         for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
  316                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
  317                     fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
  318                 if (error)
  319                         goto out;
  320                 bawrite(ibp);
  321         }
  322         /*
  323          * Allocate copies for the superblock and its summary information.
  324          */
  325         error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
  326             0, &nbp);
  327         if (error)
  328                 goto out;
  329         bawrite(nbp);
  330         blkno = fragstoblks(fs, fs->fs_csaddr);
  331         len = howmany(fs->fs_cssize, fs->fs_bsize);
  332         for (loc = 0; loc < len; loc++) {
  333                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
  334                     fs->fs_bsize, KERNCRED, 0, &nbp);
  335                 if (error)
  336                         goto out;
  337                 bawrite(nbp);
  338         }
  339         /*
  340          * Allocate all cylinder group blocks.
  341          */
  342         for (cg = 0; cg < fs->fs_ncg; cg++) {
  343                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
  344                     fs->fs_bsize, KERNCRED, 0, &nbp);
  345                 if (error)
  346                         goto out;
  347                 bawrite(nbp);
  348                 if (cg % 10 == 0)
  349                         ffs_syncvnode(vp, MNT_WAIT);
  350         }
  351         /*
  352          * Copy all the cylinder group maps. Although the
  353          * filesystem is still active, we hope that only a few
  354          * cylinder groups will change between now and when we
  355          * suspend operations. Thus, we will be able to quickly
  356          * touch up the few cylinder groups that changed during
  357          * the suspension period.
  358          */
  359         len = howmany(fs->fs_ncg, NBBY);
  360         MALLOC(space, void *, len, M_DEVBUF, M_WAITOK|M_ZERO);
  361         UFS_LOCK(ump);
  362         fs->fs_active = space;
  363         UFS_UNLOCK(ump);
  364         for (cg = 0; cg < fs->fs_ncg; cg++) {
  365                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
  366                     fs->fs_bsize, KERNCRED, 0, &nbp);
  367                 if (error)
  368                         goto out;
  369                 error = cgaccount(cg, vp, nbp, 1);
  370                 bawrite(nbp);
  371                 if (cg % 10 == 0)
  372                         ffs_syncvnode(vp, MNT_WAIT);
  373                 if (error)
  374                         goto out;
  375         }
  376         /*
  377          * Change inode to snapshot type file.
  378          */
  379         ip->i_flags |= SF_SNAPSHOT;
  380         DIP_SET(ip, i_flags, ip->i_flags);
  381         ip->i_flag |= IN_CHANGE | IN_UPDATE;
  382         /*
  383          * Ensure that the snapshot is completely on disk.
  384          * Since we have marked it as a snapshot it is safe to
  385          * unlock it as no process will be allowed to write to it.
  386          */
  387         if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
  388                 goto out;
  389         VOP_UNLOCK(vp, 0, td);
  390         /*
  391          * All allocations are done, so we can now snapshot the system.
  392          *
   393          * Rescind nice scheduling while running with the filesystem suspended.
  394          */
  395         if (td->td_proc->p_nice > 0) {
  396                 struct proc *p;
  397 
  398                 p = td->td_proc;
  399                 PROC_LOCK(p);
  400                 PROC_SLOCK(p);
  401                 saved_nice = p->p_nice;
  402                 sched_nice(p, 0);
  403                 PROC_SUNLOCK(p);
  404                 PROC_UNLOCK(p);
  405         }
  406         /*
  407          * Suspend operation on filesystem.
  408          */
  409         for (;;) {
  410                 vn_finished_write(wrtmp);
  411                 if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
  412                         vn_start_write(NULL, &wrtmp, V_WAIT);
  413                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  414                         goto out;
  415                 }
  416                 if (mp->mnt_kern_flag & MNTK_SUSPENDED)
  417                         break;
  418                 vn_start_write(NULL, &wrtmp, V_WAIT);
  419         }
  420         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  421         if (ip->i_effnlink == 0) {
  422                 error = ENOENT;         /* Snapshot file unlinked */
  423                 goto out1;
  424         }
  425         if (collectsnapstats)
  426                 nanotime(&starttime);
  427 
  428         /* The last block might have changed.  Copy it again to be sure. */
  429         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
  430             fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
  431         if (error != 0)
  432                 goto out1;
  433         error = readblock(vp, bp, numblks - 1);
  434         bp->b_flags |= B_VALIDSUSPWRT;
  435         bawrite(bp);
  436         if (error != 0)
  437                 goto out1;
  438         /*
  439          * First, copy all the cylinder group maps that have changed.
  440          */
  441         for (cg = 0; cg < fs->fs_ncg; cg++) {
  442                 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
  443                         continue;
  444                 redo++;
  445                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
  446                     fs->fs_bsize, KERNCRED, 0, &nbp);
  447                 if (error)
  448                         goto out1;
  449                 error = cgaccount(cg, vp, nbp, 2);
  450                 bawrite(nbp);
  451                 if (error)
  452                         goto out1;
  453         }
  454         /*
  455          * Grab a copy of the superblock and its summary information.
  456          * We delay writing it until the suspension is released below.
  457          */
  458         error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
  459             KERNCRED, &sbp);
  460         if (error) {
  461                 brelse(sbp);
  462                 sbp = NULL;
  463                 goto out1;
  464         }
  465         loc = blkoff(fs, fs->fs_sblockloc);
  466         copy_fs = (struct fs *)(sbp->b_data + loc);
  467         bcopy(fs, copy_fs, fs->fs_sbsize);
  468         if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
  469                 copy_fs->fs_clean = 1;
  470         size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
  471         if (fs->fs_sbsize < size)
  472                 bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
  473         size = blkroundup(fs, fs->fs_cssize);
  474         if (fs->fs_contigsumsize > 0)
  475                 size += fs->fs_ncg * sizeof(int32_t);
  476         space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
  477         copy_fs->fs_csp = space;
  478         bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
  479         space = (char *)space + fs->fs_cssize;
  480         loc = howmany(fs->fs_cssize, fs->fs_fsize);
  481         i = fs->fs_frag - loc % fs->fs_frag;
  482         len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
  483         if (len > 0) {
  484                 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
  485                     len, KERNCRED, &bp)) != 0) {
  486                         brelse(bp);
  487                         free(copy_fs->fs_csp, M_UFSMNT);
  488                         bawrite(sbp);
  489                         sbp = NULL;
  490                         goto out1;
  491                 }
  492                 bcopy(bp->b_data, space, (u_int)len);
  493                 space = (char *)space + len;
  494                 bp->b_flags |= B_INVAL | B_NOCACHE;
  495                 brelse(bp);
  496         }
  497         if (fs->fs_contigsumsize > 0) {
  498                 copy_fs->fs_maxcluster = lp = space;
  499                 for (i = 0; i < fs->fs_ncg; i++)
  500                         *lp++ = fs->fs_contigsumsize;
  501         }
  502         /*
  503          * We must check for active files that have been unlinked
  504          * (e.g., with a zero link count). We have to expunge all
  505          * trace of these files from the snapshot so that they are
  506          * not reclaimed prematurely by fsck or unnecessarily dumped.
  507          * We turn off the MNTK_SUSPENDED flag to avoid a panic from
  508          * spec_strategy about writing on a suspended filesystem.
  509          * Note that we skip unlinked snapshot files as they will
  510          * be handled separately below.
  511          *
  512          * We also calculate the needed size for the snapshot list.
  513          */
  514         snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
  515             FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
  516         MNT_ILOCK(mp);
  517         mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
  518 loop:
  519         MNT_VNODE_FOREACH(xvp, mp, mvp) {
  520                 VI_LOCK(xvp);
  521                 MNT_IUNLOCK(mp);
  522                 if ((xvp->v_iflag & VI_DOOMED) ||
  523                     (xvp->v_usecount == 0 &&
  524                      (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
  525                     xvp->v_type == VNON ||
  526                     (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
  527                         VI_UNLOCK(xvp);
  528                         MNT_ILOCK(mp);
  529                         continue;
  530                 }
  531                 /*
  532                  * We can skip parent directory vnode because it must have
  533                  * this snapshot file in it.
  534                  */
  535                 if (xvp == nd.ni_dvp) {
  536                         VI_UNLOCK(xvp);
  537                         MNT_ILOCK(mp);
  538                         continue;
  539                 }
  540                 vholdl(xvp);
  541                 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) {
  542                         MNT_ILOCK(mp);
  543                         MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
  544                         vdrop(xvp);
  545                         goto loop;
  546                 }
  547                 VI_LOCK(xvp);
  548                 if (xvp->v_usecount == 0 &&
  549                     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
  550                         VI_UNLOCK(xvp);
  551                         VOP_UNLOCK(xvp, 0, td);
  552                         vdrop(xvp);
  553                         MNT_ILOCK(mp);
  554                         continue;
  555                 }
  556                 VI_UNLOCK(xvp);
  557                 if (snapdebug)
  558                         vprint("ffs_snapshot: busy vnode", xvp);
  559                 if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 &&
  560                     vat.va_nlink > 0) {
  561                         VOP_UNLOCK(xvp, 0, td);
  562                         vdrop(xvp);
  563                         MNT_ILOCK(mp);
  564                         continue;
  565                 }
  566                 xp = VTOI(xvp);
  567                 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
  568                         VOP_UNLOCK(xvp, 0, td);
  569                         vdrop(xvp);
  570                         MNT_ILOCK(mp);
  571                         continue;
  572                 }
  573                 /*
  574                  * If there is a fragment, clear it here.
  575                  */
  576                 blkno = 0;
  577                 loc = howmany(xp->i_size, fs->fs_bsize) - 1;
  578                 if (loc < NDADDR) {
  579                         len = fragroundup(fs, blkoff(fs, xp->i_size));
  580                         if (len != 0 && len < fs->fs_bsize) {
  581                                 ffs_blkfree(ump, copy_fs, vp,
  582                                     DIP(xp, i_db[loc]), len, xp->i_number);
  583                                 blkno = DIP(xp, i_db[loc]);
  584                                 DIP_SET(xp, i_db[loc], 0);
  585                         }
  586                 }
  587                 snaplistsize += 1;
  588                 if (xp->i_ump->um_fstype == UFS1)
  589                         error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
  590                             BLK_NOCOPY);
  591                 else
  592                         error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
  593                             BLK_NOCOPY);
  594                 if (blkno)
  595                         DIP_SET(xp, i_db[loc], blkno);
  596                 if (!error)
  597                         error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
  598                             xp->i_mode);
  599                 VOP_UNLOCK(xvp, 0, td);
  600                 vdrop(xvp);
  601                 if (error) {
  602                         free(copy_fs->fs_csp, M_UFSMNT);
  603                         bawrite(sbp);
  604                         sbp = NULL;
  605                         MNT_VNODE_FOREACH_ABORT(mp, mvp);
  606                         goto out1;
  607                 }
  608                 MNT_ILOCK(mp);
  609         }
  610         MNT_IUNLOCK(mp);
  611         /*
  612          * If there already exist snapshots on this filesystem, grab a
  613          * reference to their shared lock. If this is the first snapshot
  614          * on this filesystem, we need to allocate a lock for the snapshots
  615          * to share. In either case, acquire the snapshot lock and give
  616          * up our original private lock.
  617          */
  618         VI_LOCK(devvp);
  619         sn = devvp->v_rdev->si_snapdata;
  620         if (sn != NULL) {
  621                 xp = TAILQ_FIRST(&sn->sn_head);
  622                 VI_UNLOCK(devvp);
  623                 VI_LOCK(vp);
  624                 vp->v_vnlock = &sn->sn_lock;
  625         } else {
  626                 VI_UNLOCK(devvp);
  627                 sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
  628                 TAILQ_INIT(&sn->sn_head);
  629                 lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
  630                     LK_CANRECURSE | LK_NOSHARE);
  631                 VI_LOCK(vp);
  632                 vp->v_vnlock = &sn->sn_lock;
   633                 mp_fixme("si_snapdata setting is racy.");
  634                 devvp->v_rdev->si_snapdata = sn;
  635                 xp = NULL;
  636         }
  637         lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
  638             VI_MTX(vp), td);
  639         lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
  640         /*
  641          * If this is the first snapshot on this filesystem, then we need
  642          * to allocate the space for the list of preallocated snapshot blocks.
  643          * This list will be refined below, but this preliminary one will
  644          * keep us out of deadlock until the full one is ready.
  645          */
  646         if (xp == NULL) {
  647                 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
  648                     M_UFSMNT, M_WAITOK);
  649                 blkp = &snapblklist[1];
  650                 *blkp++ = lblkno(fs, fs->fs_sblockloc);
  651                 blkno = fragstoblks(fs, fs->fs_csaddr);
  652                 for (cg = 0; cg < fs->fs_ncg; cg++) {
   653                         if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
  654                                 break;
  655                         *blkp++ = fragstoblks(fs, cgtod(fs, cg));
  656                 }
  657                 len = howmany(fs->fs_cssize, fs->fs_bsize);
  658                 for (loc = 0; loc < len; loc++)
  659                         *blkp++ = blkno + loc;
  660                 for (; cg < fs->fs_ncg; cg++)
  661                         *blkp++ = fragstoblks(fs, cgtod(fs, cg));
  662                 snapblklist[0] = blkp - snapblklist;
  663                 VI_LOCK(devvp);
  664                 if (sn->sn_blklist != NULL)
  665                         panic("ffs_snapshot: non-empty list");
  666                 sn->sn_blklist = snapblklist;
  667                 sn->sn_listsize = blkp - snapblklist;
  668                 VI_UNLOCK(devvp);
  669         }
  670         /*
  671          * Record snapshot inode. Since this is the newest snapshot,
  672          * it must be placed at the end of the list.
  673          */
  674         VI_LOCK(devvp);
  675         fs->fs_snapinum[snaploc] = ip->i_number;
  676         if (ip->i_nextsnap.tqe_prev != 0)
  677                 panic("ffs_snapshot: %d already on list", ip->i_number);
  678         TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
  679         devvp->v_vflag |= VV_COPYONWRITE;
  680         VI_UNLOCK(devvp);
  681         ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
  682 out1:
  683         KASSERT((sn != NULL && sbp != NULL && error == 0) ||
  684                 (sn == NULL && sbp == NULL && error != 0),
  685                 ("email phk@ and mckusick@"));
  686         /*
  687          * Resume operation on filesystem.
  688          */
  689         vfs_write_resume(vp->v_mount);
  690         vn_start_write(NULL, &wrtmp, V_WAIT);
  691         if (collectsnapstats && starttime.tv_sec > 0) {
  692                 nanotime(&endtime);
  693                 timespecsub(&endtime, &starttime);
  694                 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
  695                     vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
  696                     endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
  697         }
  698         if (sbp == NULL)
  699                 goto out;
  700         /*
  701          * Copy allocation information from all the snapshots in
  702          * this snapshot and then expunge them from its view.
  703          */
  704         TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
  705                 if (xp == ip)
  706                         break;
  707                 if (xp->i_ump->um_fstype == UFS1)
  708                         error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
  709                             BLK_SNAP);
  710                 else
  711                         error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
  712                             BLK_SNAP);
  713                 if (error == 0 && xp->i_effnlink == 0) {
  714                         error = ffs_freefile(ump,
  715                                              copy_fs,
  716                                              vp,
  717                                              xp->i_number,
  718                                              xp->i_mode);
  719                 }
  720                 if (error) {
  721                         fs->fs_snapinum[snaploc] = 0;
  722                         goto done;
  723                 }
  724         }
  725         /*
  726          * Allocate space for the full list of preallocated snapshot blocks.
  727          */
  728         MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
  729             M_UFSMNT, M_WAITOK);
  730         ip->i_snapblklist = &snapblklist[1];
  731         /*
  732          * Expunge the blocks used by the snapshots from the set of
  733          * blocks marked as used in the snapshot bitmaps. Also, collect
  734          * the list of allocated blocks in i_snapblklist.
  735          */
  736         if (ip->i_ump->um_fstype == UFS1)
  737                 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
  738         else
  739                 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
  740         if (error) {
  741                 fs->fs_snapinum[snaploc] = 0;
  742                 FREE(snapblklist, M_UFSMNT);
  743                 goto done;
  744         }
  745         if (snaplistsize < ip->i_snapblklist - snapblklist)
  746                 panic("ffs_snapshot: list too small");
  747         snaplistsize = ip->i_snapblklist - snapblklist;
  748         snapblklist[0] = snaplistsize;
  749         ip->i_snapblklist = 0;
  750         /*
  751          * Write out the list of allocated blocks to the end of the snapshot.
  752          */
  753         auio.uio_iov = &aiov;
  754         auio.uio_iovcnt = 1;
  755         aiov.iov_base = (void *)snapblklist;
  756         aiov.iov_len = snaplistsize * sizeof(daddr_t);
   757         auio.uio_resid = aiov.iov_len;
  758         auio.uio_offset = ip->i_size;
  759         auio.uio_segflg = UIO_SYSSPACE;
  760         auio.uio_rw = UIO_WRITE;
  761         auio.uio_td = td;
  762         if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
  763                 fs->fs_snapinum[snaploc] = 0;
  764                 FREE(snapblklist, M_UFSMNT);
  765                 goto done;
  766         }
  767         /*
  768          * Write the superblock and its summary information
  769          * to the snapshot.
  770          */
  771         blkno = fragstoblks(fs, fs->fs_csaddr);
  772         len = howmany(fs->fs_cssize, fs->fs_bsize);
  773         space = copy_fs->fs_csp;
  774         for (loc = 0; loc < len; loc++) {
  775                 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
  776                 if (error) {
  777                         brelse(nbp);
  778                         fs->fs_snapinum[snaploc] = 0;
  779                         FREE(snapblklist, M_UFSMNT);
  780                         goto done;
  781                 }
  782                 bcopy(space, nbp->b_data, fs->fs_bsize);
  783                 space = (char *)space + fs->fs_bsize;
  784                 bawrite(nbp);
  785         }
  786         /*
  787          * As this is the newest list, it is the most inclusive, so
  788          * should replace the previous list.
  789          */
  790         VI_LOCK(devvp);
  791         space = sn->sn_blklist;
  792         sn->sn_blklist = snapblklist;
  793         sn->sn_listsize = snaplistsize;
  794         VI_UNLOCK(devvp);
  795         if (space != NULL)
  796                 FREE(space, M_UFSMNT);
  797         /*
  798          * If another process is currently writing the buffer containing
  799          * the inode for this snapshot then a deadlock can occur. Drop
  800          * the snapshot lock until the buffer has been written.
  801          */
  802         VREF(vp);       /* Protect against ffs_snapgone() */
  803         VOP_UNLOCK(vp, 0, td);
  804         (void) bread(ip->i_devvp,
  805                      fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
  806                      (int) fs->fs_bsize, NOCRED, &nbp);
  807         brelse(nbp);
  808         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  809         if (ip->i_effnlink == 0)
  810                 error = ENOENT;         /* Snapshot file unlinked */
  811         else
  812                 vrele(vp);              /* Drop extra reference */
  813 done:
  814         FREE(copy_fs->fs_csp, M_UFSMNT);
  815         bawrite(sbp);
  816 out:
  817         NDFREE(&nd, NDF_ONLY_PNBUF);
  818         if (saved_nice > 0) {
  819                 struct proc *p;
  820 
  821                 p = td->td_proc;
  822                 PROC_LOCK(p);
  823                 PROC_SLOCK(p);
  824                 sched_nice(td->td_proc, saved_nice);
  825                 PROC_SUNLOCK(p);
  826                 PROC_UNLOCK(td->td_proc);
  827         }
  828         UFS_LOCK(ump);
  829         if (fs->fs_active != 0) {
  830                 FREE(fs->fs_active, M_DEVBUF);
  831                 fs->fs_active = 0;
  832         }
  833         UFS_UNLOCK(ump);
  834         MNT_ILOCK(mp);
  835         mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
  836         MNT_IUNLOCK(mp);
  837         if (error)
  838                 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
  839         (void) ffs_syncvnode(vp, MNT_WAIT);
  840         if (error)
  841                 vput(vp);
  842         else
  843                 VOP_UNLOCK(vp, 0, td);
  844         vrele(nd.ni_dvp);
  845         vn_finished_write(wrtmp);
  846         process_deferred_inactive(mp);
  847         return (error);
  848 }
  849 
  850 /*
  851  * Copy a cylinder group map. All the unallocated blocks are marked
  852  * BLK_NOCOPY so that the snapshot knows that it need not copy them
  853  * if they are later written. If passno is one, then this is a first
  854  * pass, so only setting needs to be done. If passno is 2, then this
  855  * is a revision to a previous pass which must be undone as the
  856  * replacement pass is done.
  857  */
  858 static int
  859 cgaccount(cg, vp, nbp, passno)
  860         int cg;
  861         struct vnode *vp;
  862         struct buf *nbp;
  863         int passno;
  864 {
  865         struct buf *bp, *ibp;
  866         struct inode *ip;
  867         struct cg *cgp;
  868         struct fs *fs;
  869         ufs2_daddr_t base, numblks;
  870         int error, len, loc, indiroff;
  871 
  872         ip = VTOI(vp);
  873         fs = ip->i_fs;
  874         error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
  875                 (int)fs->fs_cgsize, KERNCRED, &bp);
  876         if (error) {
  877                 brelse(bp);
  878                 return (error);
  879         }
  880         cgp = (struct cg *)bp->b_data;
  881         if (!cg_chkmagic(cgp)) {
  882                 brelse(bp);
  883                 return (EIO);
  884         }
  885         UFS_LOCK(ip->i_ump);
  886         ACTIVESET(fs, cg);
  887         /*
  888          * Recomputation of summary information might not have been performed
  889          * at mount time.  Sync up summary information for current cylinder
  890          * group while data is in memory to ensure that result of background
  891          * fsck is slightly more consistent.
  892          */
  893         fs->fs_cs(fs, cg) = cgp->cg_cs;
  894         UFS_UNLOCK(ip->i_ump);
  895         bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
  896         if (fs->fs_cgsize < fs->fs_bsize)
  897                 bzero(&nbp->b_data[fs->fs_cgsize],
  898                     fs->fs_bsize - fs->fs_cgsize);
  899         cgp = (struct cg *)nbp->b_data;
  900         bqrelse(bp);
  901         if (passno == 2)
  902                 nbp->b_flags |= B_VALIDSUSPWRT;
  903         numblks = howmany(fs->fs_size, fs->fs_frag);
  904         len = howmany(fs->fs_fpg, fs->fs_frag);
  905         base = cgbase(fs, cg) / fs->fs_frag;
  906         if (base + len >= numblks)
  907                 len = numblks - base - 1;
  908         loc = 0;
  909         if (base < NDADDR) {
  910                 for ( ; loc < NDADDR; loc++) {
  911                         if (ffs_isblock(fs, cg_blksfree(cgp), loc))
  912                                 DIP_SET(ip, i_db[loc], BLK_NOCOPY);
  913                         else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
  914                                 DIP_SET(ip, i_db[loc], 0);
  915                         else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
  916                                 panic("ffs_snapshot: lost direct block");
  917                 }
  918         }
  919         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
  920             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
  921         if (error) {
  922                 return (error);
  923         }
  924         indiroff = (base + loc - NDADDR) % NINDIR(fs);
  925         for ( ; loc < len; loc++, indiroff++) {
  926                 if (indiroff >= NINDIR(fs)) {
  927                         if (passno == 2)
  928                                 ibp->b_flags |= B_VALIDSUSPWRT;
  929                         bawrite(ibp);
  930                         error = UFS_BALLOC(vp,
  931                             lblktosize(fs, (off_t)(base + loc)),
  932                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
  933                         if (error) {
  934                                 return (error);
  935                         }
  936                         indiroff = 0;
  937                 }
  938                 if (ip->i_ump->um_fstype == UFS1) {
  939                         if (ffs_isblock(fs, cg_blksfree(cgp), loc))
  940                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
  941                                     BLK_NOCOPY;
  942                         else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
  943                             [indiroff] == BLK_NOCOPY)
  944                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
  945                         else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
  946                             [indiroff] == BLK_NOCOPY)
  947                                 panic("ffs_snapshot: lost indirect block");
  948                         continue;
  949                 }
  950                 if (ffs_isblock(fs, cg_blksfree(cgp), loc))
  951                         ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
  952                 else if (passno == 2 &&
   953                     ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
  954                         ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
  955                 else if (passno == 1 &&
   956                     ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
  957                         panic("ffs_snapshot: lost indirect block");
  958         }
  959         if (passno == 2)
  960                 ibp->b_flags |= B_VALIDSUSPWRT;
  961         bdwrite(ibp);
  962         return (0);
  963 }
  964 
  965 /*
  966  * Before expunging a snapshot inode, note all the
  967  * blocks that it claims with BLK_SNAP so that fsck will
  968  * be able to account for those blocks properly and so
  969  * that this snapshot knows that it need not copy them
  970  * if the other snapshot holding them is freed. This code
  971  * is reproduced once each for UFS1 and UFS2.
  972  */
  973 static int
  974 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
  975         struct vnode *snapvp;
  976         struct inode *cancelip;
  977         struct fs *fs;
  978         int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
  979             struct fs *, ufs_lbn_t, int);
  980         int expungetype;
  981 {
  982         int i, error, indiroff;
  983         ufs_lbn_t lbn, rlbn;
  984         ufs2_daddr_t len, blkno, numblks, blksperindir;
  985         struct ufs1_dinode *dip;
  986         struct thread *td = curthread;
  987         struct buf *bp;
  988 
  989         /*
  990          * Prepare to expunge the inode. If its inode block has not
  991          * yet been copied, then allocate and fill the copy.
  992          */
  993         lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
  994         blkno = 0;
  995         if (lbn < NDADDR) {
  996                 blkno = VTOI(snapvp)->i_din1->di_db[lbn];
  997         } else {
  998                 td->td_pflags |= TDP_COWINPROGRESS;
  999                 error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
 1000                    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 1001                 td->td_pflags &= ~TDP_COWINPROGRESS;
 1002                 if (error)
 1003                         return (error);
 1004                 indiroff = (lbn - NDADDR) % NINDIR(fs);
 1005                 blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
 1006                 bqrelse(bp);
 1007         }
 1008         if (blkno != 0) {
 1009                 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 1010                         return (error);
 1011         } else {
 1012                 error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
 1013                     fs->fs_bsize, KERNCRED, 0, &bp);
 1014                 if (error)
 1015                         return (error);
 1016                 if ((error = readblock(snapvp, bp, lbn)) != 0)
 1017                         return (error);
 1018         }
 1019         /*
 1020          * Set a snapshot inode to be a zero length file, regular files
 1021          * or unlinked snapshots to be completely unallocated.
 1022          */
 1023         dip = (struct ufs1_dinode *)bp->b_data +
 1024             ino_to_fsbo(fs, cancelip->i_number);
 1025         if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0)
 1026                 dip->di_mode = 0;
 1027         dip->di_size = 0;
 1028         dip->di_blocks = 0;
 1029         dip->di_flags &= ~SF_SNAPSHOT;
 1030         bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
 1031         bdwrite(bp);
 1032         /*
 1033          * Now go through and expunge all the blocks in the file
 1034          * using the function requested.
 1035          */
 1036         numblks = howmany(cancelip->i_size, fs->fs_bsize);
 1037         if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
 1038             &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
 1039                 return (error);
 1040         if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
 1041             &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
 1042                 return (error);
 1043         blksperindir = 1;
 1044         lbn = -NDADDR;
 1045         len = numblks - NDADDR;
 1046         rlbn = NDADDR;
 1047         for (i = 0; len > 0 && i < NIADDR; i++) {
 1048                 error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
 1049                     cancelip->i_din1->di_ib[i], lbn, rlbn, len,
 1050                     blksperindir, fs, acctfunc, expungetype);
 1051                 if (error)
 1052                         return (error);
 1053                 blksperindir *= NINDIR(fs);
 1054                 lbn -= blksperindir + 1;
 1055                 len -= blksperindir;
 1056                 rlbn += blksperindir;
 1057         }
 1058         return (0);
 1059 }
 1060 
 1061 /*
 1062  * Descend an indirect block chain for vnode cancelvp accounting for all
 1063  * its indirect blocks in snapvp.
 1064  */ 
 1065 static int
 1066 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 1067             blksperindir, fs, acctfunc, expungetype)
 1068         struct vnode *snapvp;
 1069         struct vnode *cancelvp;
 1070         int level;
 1071         ufs1_daddr_t blkno;
 1072         ufs_lbn_t lbn;
 1073         ufs_lbn_t rlbn;
 1074         ufs_lbn_t remblks;
 1075         ufs_lbn_t blksperindir;
 1076         struct fs *fs;
 1077         int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 1078             struct fs *, ufs_lbn_t, int);
 1079         int expungetype;
 1080 {
 1081         int error, num, i;
 1082         ufs_lbn_t subblksperindir;
 1083         struct indir indirs[NIADDR + 2];
 1084         ufs1_daddr_t last, *bap;
 1085         struct buf *bp;
 1086 
 1087         if (blkno == 0) {
 1088                 if (expungetype == BLK_NOCOPY)
 1089                         return (0);
 1090                 panic("indiracct_ufs1: missing indir");
 1091         }
 1092         if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 1093                 return (error);
 1094         if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
 1095                 panic("indiracct_ufs1: botched params");
 1096         /*
 1097          * We have to expand bread here since it will deadlock looking
 1098          * up the block number for any blocks that are not in the cache.
 1099          */
 1100         bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 1101         bp->b_blkno = fsbtodb(fs, blkno);
 1102         if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 1103             (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
 1104                 brelse(bp);
 1105                 return (error);
 1106         }
 1107         /*
 1108          * Account for the block pointers in this indirect block.
 1109          */
 1110         last = howmany(remblks, blksperindir);
 1111         if (last > NINDIR(fs))
 1112                 last = NINDIR(fs);
 1113         MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 1114         bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 1115         bqrelse(bp);
 1116         error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 1117             level == 0 ? rlbn : -1, expungetype);
 1118         if (error || level == 0)
 1119                 goto out;
 1120         /*
 1121          * Account for the block pointers in each of the indirect blocks
 1122          * in the levels below us.
 1123          */
 1124         subblksperindir = blksperindir / NINDIR(fs);
 1125         for (lbn++, level--, i = 0; i < last; i++) {
 1126                 error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
 1127                     rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 1128                 if (error)
 1129                         goto out;
 1130                 rlbn += blksperindir;
 1131                 lbn -= blksperindir;
 1132                 remblks -= blksperindir;
 1133         }
 1134 out:
 1135         FREE(bap, M_DEVBUF);
 1136         return (error);
 1137 }
 1138 
 1139 /*
 1140  * Do both snap accounting and map accounting.
 1141  */
 1142 static int
 1143 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 1144         struct vnode *vp;
 1145         ufs1_daddr_t *oldblkp, *lastblkp;
 1146         struct fs *fs;
 1147         ufs_lbn_t lblkno;
 1148         int exptype;    /* BLK_SNAP or BLK_NOCOPY */
 1149 {
 1150         int error;
 1151 
 1152         if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
 1153                 return (error);
 1154         return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
 1155 }
 1156 
 1157 /*
 1158  * Identify a set of blocks allocated in a snapshot inode.
 1159  */
 1160 static int
 1161 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 1162         struct vnode *vp;
 1163         ufs1_daddr_t *oldblkp, *lastblkp;
 1164         struct fs *fs;
 1165         ufs_lbn_t lblkno;
 1166         int expungetype;        /* BLK_SNAP or BLK_NOCOPY */
 1167 {
 1168         struct inode *ip = VTOI(vp);
 1169         ufs1_daddr_t blkno, *blkp;
 1170         ufs_lbn_t lbn;
 1171         struct buf *ibp;
 1172         int error;
 1173 
 1174         for ( ; oldblkp < lastblkp; oldblkp++) {
 1175                 blkno = *oldblkp;
 1176                 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 1177                         continue;
 1178                 lbn = fragstoblks(fs, blkno);
 1179                 if (lbn < NDADDR) {
 1180                         blkp = &ip->i_din1->di_db[lbn];
 1181                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1182                 } else {
 1183                         error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
 1184                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 1185                         if (error)
 1186                                 return (error);
 1187                         blkp = &((ufs1_daddr_t *)(ibp->b_data))
 1188                             [(lbn - NDADDR) % NINDIR(fs)];
 1189                 }
 1190                 /*
 1191                  * If we are expunging a snapshot vnode and we
 1192                  * find a block marked BLK_NOCOPY, then it is
 1193                  * one that has been allocated to this snapshot after
 1194                  * we took our current snapshot and can be ignored.
 1195                  */
 1196                 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 1197                         if (lbn >= NDADDR)
 1198                                 brelse(ibp);
 1199                 } else {
 1200                         if (*blkp != 0)
 1201                                 panic("snapacct_ufs1: bad block");
 1202                         *blkp = expungetype;
 1203                         if (lbn >= NDADDR)
 1204                                 bdwrite(ibp);
 1205                 }
 1206         }
 1207         return (0);
 1208 }
 1209 
 1210 /*
 1211  * Account for a set of blocks allocated in a snapshot inode.
 1212  */
 1213 static int
 1214 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 1215         struct vnode *vp;
 1216         ufs1_daddr_t *oldblkp, *lastblkp;
 1217         struct fs *fs;
 1218         ufs_lbn_t lblkno;
 1219         int expungetype;
 1220 {
 1221         ufs1_daddr_t blkno;
 1222         struct inode *ip;
 1223         ino_t inum;
 1224         int acctit;
 1225 
 1226         ip = VTOI(vp);
 1227         inum = ip->i_number;
 1228         if (lblkno == -1)
 1229                 acctit = 0;
 1230         else
 1231                 acctit = 1;
 1232         for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
 1233                 blkno = *oldblkp;
 1234                 if (blkno == 0 || blkno == BLK_NOCOPY)
 1235                         continue;
 1236                 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
 1237                         *ip->i_snapblklist++ = lblkno;
 1238                 if (blkno == BLK_SNAP)
 1239                         blkno = blkstofrags(fs, lblkno);
 1240                 ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
 1241         }
 1242         return (0);
 1243 }
 1244 
 1245 /*
 1246  * Before expunging a snapshot inode, note all the
 1247  * blocks that it claims with BLK_SNAP so that fsck will
 1248  * be able to account for those blocks properly and so
 1249  * that this snapshot knows that it need not copy them
 1250  * if the other snapshot holding them is freed. This code
 1251  * is reproduced once each for UFS1 and UFS2.
 1252  */
 1253 static int
 1254 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
 1255         struct vnode *snapvp;
 1256         struct inode *cancelip;
 1257         struct fs *fs;
 1258         int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 1259             struct fs *, ufs_lbn_t, int);
 1260         int expungetype;
 1261 {
 1262         int i, error, indiroff;
 1263         ufs_lbn_t lbn, rlbn;
 1264         ufs2_daddr_t len, blkno, numblks, blksperindir;
 1265         struct ufs2_dinode *dip;
 1266         struct thread *td = curthread;
 1267         struct buf *bp;
 1268 
 1269         /*
 1270          * Prepare to expunge the inode. If its inode block has not
 1271          * yet been copied, then allocate and fill the copy.
 1272          */
 1273         lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 1274         blkno = 0;
 1275         if (lbn < NDADDR) {
 1276                 blkno = VTOI(snapvp)->i_din2->di_db[lbn];
 1277         } else {
 1278                 td->td_pflags |= TDP_COWINPROGRESS;
 1279                 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 1280                    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 1281                 td->td_pflags &= ~TDP_COWINPROGRESS;
 1282                 if (error)
 1283                         return (error);
 1284                 indiroff = (lbn - NDADDR) % NINDIR(fs);
 1285                 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
 1286                 bqrelse(bp);
 1287         }
 1288         if (blkno != 0) {
 1289                 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 1290                         return (error);
 1291         } else {
 1292                 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 1293                     fs->fs_bsize, KERNCRED, 0, &bp);
 1294                 if (error)
 1295                         return (error);
 1296                 if ((error = readblock(snapvp, bp, lbn)) != 0)
 1297                         return (error);
 1298         }
 1299         /*
 1300          * Set a snapshot inode to be a zero length file, regular files
 1301          * to be completely unallocated.
 1302          */
 1303         dip = (struct ufs2_dinode *)bp->b_data +
 1304             ino_to_fsbo(fs, cancelip->i_number);
 1305         if (expungetype == BLK_NOCOPY)
 1306                 dip->di_mode = 0;
 1307         dip->di_size = 0;
 1308         dip->di_blocks = 0;
 1309         dip->di_flags &= ~SF_SNAPSHOT;
 1310         bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
 1311         bdwrite(bp);
 1312         /*
 1313          * Now go through and expunge all the blocks in the file
 1314          * using the function requested.
 1315          */
 1316         numblks = howmany(cancelip->i_size, fs->fs_bsize);
 1317         if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
 1318             &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
 1319                 return (error);
 1320         if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
 1321             &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
 1322                 return (error);
 1323         blksperindir = 1;
 1324         lbn = -NDADDR;
 1325         len = numblks - NDADDR;
 1326         rlbn = NDADDR;
 1327         for (i = 0; len > 0 && i < NIADDR; i++) {
 1328                 error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
 1329                     cancelip->i_din2->di_ib[i], lbn, rlbn, len,
 1330                     blksperindir, fs, acctfunc, expungetype);
 1331                 if (error)
 1332                         return (error);
 1333                 blksperindir *= NINDIR(fs);
 1334                 lbn -= blksperindir + 1;
 1335                 len -= blksperindir;
 1336                 rlbn += blksperindir;
 1337         }
 1338         return (0);
 1339 }
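
/*
 * A minimal sketch of the indirect-level arithmetic used by the expunge
 * loop above: level i is rooted at di_ib[i], each pointer in that block
 * spans "blksperindir" data blocks, so the whole level spans
 * blksperindir * NINDIR(fs) logical blocks starting at "rlbn".  The
 * helper below is an illustration only; its name is an assumption and
 * nothing in this file calls it.
 */
static void
expunge_level_coverage_sketch(struct fs *fs)
{
        ufs2_daddr_t blksperindir;
        ufs_lbn_t rlbn;
        int level;

        blksperindir = 1;
        rlbn = NDADDR;
        for (level = 0; level < NIADDR; level++) {
                printf("level %d: first lbn %jd, spans %jd data blocks\n",
                    level, (intmax_t)rlbn,
                    (intmax_t)(blksperindir * NINDIR(fs)));
                blksperindir *= NINDIR(fs);
                rlbn += blksperindir;
        }
}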
 1340 
 1341 /*
 1342  * Descend an indirect block chain for vnode cancelvp accounting for all
 1343  * its indirect blocks in snapvp.
 1344  */ 
 1345 static int
 1346 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 1347             blksperindir, fs, acctfunc, expungetype)
 1348         struct vnode *snapvp;
 1349         struct vnode *cancelvp;
 1350         int level;
 1351         ufs2_daddr_t blkno;
 1352         ufs_lbn_t lbn;
 1353         ufs_lbn_t rlbn;
 1354         ufs_lbn_t remblks;
 1355         ufs_lbn_t blksperindir;
 1356         struct fs *fs;
 1357         int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 1358             struct fs *, ufs_lbn_t, int);
 1359         int expungetype;
 1360 {
 1361         int error, num, i;
 1362         ufs_lbn_t subblksperindir;
 1363         struct indir indirs[NIADDR + 2];
 1364         ufs2_daddr_t last, *bap;
 1365         struct buf *bp;
 1366 
 1367         if (blkno == 0) {
 1368                 if (expungetype == BLK_NOCOPY)
 1369                         return (0);
 1370                 panic("indiracct_ufs2: missing indir");
 1371         }
 1372         if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 1373                 return (error);
 1374         if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
 1375                 panic("indiracct_ufs2: botched params");
 1376         /*
 1377          * We have to expand bread here since it will deadlock looking
 1378          * up the block number for any blocks that are not in the cache.
 1379          */
 1380         bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 1381         bp->b_blkno = fsbtodb(fs, blkno);
 1382         if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 1383             (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
 1384                 brelse(bp);
 1385                 return (error);
 1386         }
 1387         /*
 1388          * Account for the block pointers in this indirect block.
 1389          */
 1390         last = howmany(remblks, blksperindir);
 1391         if (last > NINDIR(fs))
 1392                 last = NINDIR(fs);
 1393         MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 1394         bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 1395         bqrelse(bp);
 1396         error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 1397             level == 0 ? rlbn : -1, expungetype);
 1398         if (error || level == 0)
 1399                 goto out;
 1400         /*
 1401          * Account for the block pointers in each of the indirect blocks
 1402          * in the levels below us.
 1403          */
 1404         subblksperindir = blksperindir / NINDIR(fs);
 1405         for (lbn++, level--, i = 0; i < last; i++) {
 1406                 error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
 1407                     rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 1408                 if (error)
 1409                         goto out;
 1410                 rlbn += blksperindir;
 1411                 lbn -= blksperindir;
 1412                 remblks -= blksperindir;
 1413         }
 1414 out:
 1415         FREE(bap, M_DEVBUF);
 1416         return (error);
 1417 }
 1418 
 1419 /*
 1420  * Do both snap accounting and map accounting.
 1421  */
 1422 static int
 1423 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 1424         struct vnode *vp;
 1425         ufs2_daddr_t *oldblkp, *lastblkp;
 1426         struct fs *fs;
 1427         ufs_lbn_t lblkno;
 1428         int exptype;    /* BLK_SNAP or BLK_NOCOPY */
 1429 {
 1430         int error;
 1431 
 1432         if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
 1433                 return (error);
 1434         return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
 1435 }
 1436 
 1437 /*
 1438  * Identify a set of blocks allocated in a snapshot inode.
 1439  */
 1440 static int
 1441 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 1442         struct vnode *vp;
 1443         ufs2_daddr_t *oldblkp, *lastblkp;
 1444         struct fs *fs;
 1445         ufs_lbn_t lblkno;
 1446         int expungetype;        /* BLK_SNAP or BLK_NOCOPY */
 1447 {
 1448         struct inode *ip = VTOI(vp);
 1449         ufs2_daddr_t blkno, *blkp;
 1450         ufs_lbn_t lbn;
 1451         struct buf *ibp;
 1452         int error;
 1453 
 1454         for ( ; oldblkp < lastblkp; oldblkp++) {
 1455                 blkno = *oldblkp;
 1456                 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 1457                         continue;
 1458                 lbn = fragstoblks(fs, blkno);
 1459                 if (lbn < NDADDR) {
 1460                         blkp = &ip->i_din2->di_db[lbn];
 1461                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1462                 } else {
 1463                         error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
 1464                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 1465                         if (error)
 1466                                 return (error);
 1467                         blkp = &((ufs2_daddr_t *)(ibp->b_data))
 1468                             [(lbn - NDADDR) % NINDIR(fs)];
 1469                 }
 1470                 /*
 1471                  * If we are expunging a snapshot vnode and we
 1472                  * find a block marked BLK_NOCOPY, then it is
 1473                  * one that has been allocated to this snapshot after
 1474                  * we took our current snapshot and can be ignored.
 1475                  */
 1476                 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 1477                         if (lbn >= NDADDR)
 1478                                 brelse(ibp);
 1479                 } else {
 1480                         if (*blkp != 0)
 1481                                 panic("snapacct_ufs2: bad block");
 1482                         *blkp = expungetype;
 1483                         if (lbn >= NDADDR)
 1484                                 bdwrite(ibp);
 1485                 }
 1486         }
 1487         return (0);
 1488 }
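
/*
 * A minimal sketch of the pointer-location rule used in snapacct_ufs2()
 * above: a logical block below NDADDR is addressed directly from
 * di_db[], while any other block's pointer is found in the indirect
 * block obtained with BA_METAONLY, at slot (lbn - NDADDR) % NINDIR(fs).
 * The helper and its parameter names are assumptions for illustration.
 */
static void
snapacct_slot_sketch(struct fs *fs, ufs_lbn_t lbn, int *direct, int *slot)
{
        if (lbn < NDADDR) {
                *direct = 1;
                *slot = (int)lbn;       /* index into di_db[] */
        } else {
                *direct = 0;
                /* index into the BA_METAONLY indirect buffer */
                *slot = (int)((lbn - NDADDR) % NINDIR(fs));
        }
}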
 1489 
 1490 /*
 1491  * Account for a set of blocks allocated in a snapshot inode.
 1492  */
 1493 static int
 1494 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 1495         struct vnode *vp;
 1496         ufs2_daddr_t *oldblkp, *lastblkp;
 1497         struct fs *fs;
 1498         ufs_lbn_t lblkno;
 1499         int expungetype;
 1500 {
 1501         ufs2_daddr_t blkno;
 1502         struct inode *ip;
 1503         ino_t inum;
 1504         int acctit;
 1505 
 1506         ip = VTOI(vp);
 1507         inum = ip->i_number;
 1508         if (lblkno == -1)
 1509                 acctit = 0;
 1510         else
 1511                 acctit = 1;
 1512         for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
 1513                 blkno = *oldblkp;
 1514                 if (blkno == 0 || blkno == BLK_NOCOPY)
 1515                         continue;
 1516                 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
 1517                         *ip->i_snapblklist++ = lblkno;
 1518                 if (blkno == BLK_SNAP)
 1519                         blkno = blkstofrags(fs, lblkno);
 1520                 ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
 1521         }
 1522         return (0);
 1523 }
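
/*
 * A minimal sketch of the list building done in mapacct_ufs1() and
 * mapacct_ufs2() above when expunging with BLK_SNAP and accounting a
 * data run (acctit): every logical block the cancelled snapshot really
 * maps (not a hole, not BLK_NOCOPY, and not one of its self-addressed
 * BLK_SNAP entries) is appended to i_snapblklist, which feeds the
 * sorted copy-on-write hint list built by the snapshot-creation code
 * earlier in this file.  The helper and its names are assumptions used
 * only for illustration.
 */
static int
hintlist_append_sketch(ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
    ufs_lbn_t lblkno, daddr_t *list, int n)
{
        for (; oldblkp < lastblkp; oldblkp++, lblkno++) {
                if (*oldblkp == 0 || *oldblkp == BLK_NOCOPY ||
                    *oldblkp == BLK_SNAP)
                        continue;
                list[n++] = lblkno;     /* stands in for *ip->i_snapblklist++ */
        }
        return (n);
}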
 1524 
 1525 /*
 1526  * Decrement extra reference on snapshot when last name is removed.
 1527  * It will not be freed until the last open reference goes away.
 1528  */
 1529 void
 1530 ffs_snapgone(ip)
 1531         struct inode *ip;
 1532 {
 1533         struct inode *xp;
 1534         struct fs *fs;
 1535         int snaploc;
 1536         struct snapdata *sn;
 1537         struct ufsmount *ump;
 1538 
 1539         /*
 1540          * Find snapshot in incore list.
 1541          */
 1542         xp = NULL;
 1543         sn = ip->i_devvp->v_rdev->si_snapdata;
 1544         if (sn != NULL)
 1545                 TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
 1546                         if (xp == ip)
 1547                                 break;
 1548         if (xp != NULL)
 1549                 vrele(ITOV(ip));
 1550         else if (snapdebug)
 1551                 printf("ffs_snapgone: lost snapshot vnode %d\n",
 1552                     ip->i_number);
 1553         /*
 1554          * Delete snapshot inode from superblock. Keep list dense.
 1555          */
 1556         fs = ip->i_fs;
 1557         ump = ip->i_ump;
 1558         UFS_LOCK(ump);
 1559         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 1560                 if (fs->fs_snapinum[snaploc] == ip->i_number)
 1561                         break;
 1562         if (snaploc < FSMAXSNAP) {
 1563                 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
 1564                         if (fs->fs_snapinum[snaploc] == 0)
 1565                                 break;
 1566                         fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
 1567                 }
 1568                 fs->fs_snapinum[snaploc - 1] = 0;
 1569         }
 1570         UFS_UNLOCK(ump);
 1571 }
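
/*
 * A minimal sketch of the "keep list dense" step above: the superblock
 * keeps at most FSMAXSNAP active snapshot inode numbers in a
 * zero-terminated array, so removing one entry slides the remaining
 * entries down and zeroes the vacated slot.  The helper name is an
 * assumption; it is illustrative only and is not called anywhere.
 */
static void
snapinum_remove_sketch(ino_t *list, ino_t inum)
{
        int loc;

        for (loc = 0; loc < FSMAXSNAP; loc++)
                if (list[loc] == inum)
                        break;
        if (loc == FSMAXSNAP)
                return;                 /* not listed */
        for (loc++; loc < FSMAXSNAP; loc++) {
                if (list[loc] == 0)
                        break;
                list[loc - 1] = list[loc];
        }
        list[loc - 1] = 0;
}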
 1572 
 1573 /*
 1574  * Prepare a snapshot file for being removed.
 1575  */
 1576 void
 1577 ffs_snapremove(vp)
 1578         struct vnode *vp;
 1579 {
 1580         struct inode *ip;
 1581         struct vnode *devvp;
 1582         struct buf *ibp;
 1583         struct fs *fs;
 1584         struct thread *td = curthread;
 1585         ufs2_daddr_t numblks, blkno, dblk;
 1586         int error, loc, last;
 1587         struct snapdata *sn;
 1588 
 1589         ip = VTOI(vp);
 1590         fs = ip->i_fs;
 1591         devvp = ip->i_devvp;
 1592         /*
 1593          * If active, delete from incore list (this snapshot may
 1594          * already have been in the process of being deleted, so
 1595          * would not have been active).
 1596          *
 1597          * Clear copy-on-write flag if last snapshot.
 1598          */
 1599         VI_LOCK(devvp);
 1600         if (ip->i_nextsnap.tqe_prev != 0) {
 1601                 sn = devvp->v_rdev->si_snapdata;
 1602                 TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
 1603                 ip->i_nextsnap.tqe_prev = 0;
 1604                 VI_UNLOCK(devvp);
 1605                 lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td);
 1606                 VI_LOCK(vp);
 1607                 KASSERT(vp->v_vnlock == &sn->sn_lock,
 1608                         ("ffs_snapremove: lost lock mutation")); 
 1609                 vp->v_vnlock = &vp->v_lock;
 1610                 VI_UNLOCK(vp);
 1611                 VI_LOCK(devvp);
 1612                 lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 1613                 try_free_snapdata(devvp, td);
 1614         } else
 1615                 VI_UNLOCK(devvp);
 1616         /*
 1617          * Clear all BLK_NOCOPY fields. Pass any block claims to other
 1618          * snapshots that want them (see ffs_snapblkfree below).
 1619          */
 1620         for (blkno = 1; blkno < NDADDR; blkno++) {
 1621                 dblk = DIP(ip, i_db[blkno]);
 1622                 if (dblk == 0)
 1623                         continue;
 1624                 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 1625                         DIP_SET(ip, i_db[blkno], 0);
 1626                 else if ((dblk == blkstofrags(fs, blkno) &&
 1627                      ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
 1628                      ip->i_number))) {
 1629                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
 1630                             btodb(fs->fs_bsize));
 1631                         DIP_SET(ip, i_db[blkno], 0);
 1632                 }
 1633         }
 1634         numblks = howmany(ip->i_size, fs->fs_bsize);
 1635         for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 1636                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 1637                     fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 1638                 if (error)
 1639                         continue;
 1640                 if (fs->fs_size - blkno > NINDIR(fs))
 1641                         last = NINDIR(fs);
 1642                 else
 1643                         last = fs->fs_size - blkno;
 1644                 for (loc = 0; loc < last; loc++) {
 1645                         if (ip->i_ump->um_fstype == UFS1) {
 1646                                 dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
 1647                                 if (dblk == 0)
 1648                                         continue;
 1649                                 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 1650                                         ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 1651                                 else if ((dblk == blkstofrags(fs, blkno) &&
 1652                                      ffs_snapblkfree(fs, ip->i_devvp, dblk,
 1653                                      fs->fs_bsize, ip->i_number))) {
 1654                                         ip->i_din1->di_blocks -=
 1655                                             btodb(fs->fs_bsize);
 1656                                         ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 1657                                 }
 1658                                 continue;
 1659                         }
 1660                         dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
 1661                         if (dblk == 0)
 1662                                 continue;
 1663                         if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 1664                                 ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 1665                         else if ((dblk == blkstofrags(fs, blkno) &&
 1666                              ffs_snapblkfree(fs, ip->i_devvp, dblk,
 1667                              fs->fs_bsize, ip->i_number))) {
 1668                                 ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
 1669                                 ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 1670                         }
 1671                 }
 1672                 bawrite(ibp);
 1673         }
 1674         /*
 1675          * Clear snapshot flag and drop reference.
 1676          */
 1677         ip->i_flags &= ~SF_SNAPSHOT;
 1678         DIP_SET(ip, i_flags, ip->i_flags);
 1679         ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1680 #ifdef QUOTA
 1681         /*
 1682          * Reenable disk quotas for ex-snapshot file.
 1683          */
 1684         if (!getinoquota(ip))
 1685                 (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
 1686 #endif
 1687 }
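
/*
 * A minimal sketch of the claim test used in ffs_snapremove() above and
 * described in the comment before ffs_snapblkfree() below: a block that
 * a snapshot has merely claimed (rather than copied) is stored at the
 * physical address corresponding to its own logical block number, so
 * recognizing it is a single comparison against blkstofrags(fs, lbn).
 * The helper name is an assumption used only for illustration.
 */
static int
snapblk_claimed_sketch(struct fs *fs, ufs2_daddr_t dblk, ufs_lbn_t lbn)
{
        if (dblk == 0 || dblk == BLK_NOCOPY || dblk == BLK_SNAP)
                return (0);     /* hole or placeholder: nothing is claimed */
        return (dblk == blkstofrags(fs, lbn)); /* claimed iff self-addressed */
}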
 1688 
 1689 /*
 1690  * Notification that a block is being freed. Return zero if the free
 1691  * should be allowed to proceed. Return non-zero if the snapshot file
 1692  * wants to claim the block. The block will be claimed if it is an
 1693  * uncopied part of one of the snapshots. It will be freed if it is
 1694  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 1695  * If a fragment is being freed, then all snapshots that care about
 1696  * it must make a copy since a snapshot file can only claim full sized
 1697  * blocks. Note that if more than one snapshot file maps the block,
 1698  * we can pick one at random to claim it. Since none of the snapshots
 1699  * can change, we are assured that they will all see the same unmodified
 1700  * image. When deleting a snapshot file (see ffs_snapremove above), we
 1701  * must push any of these claimed blocks to one of the other snapshots
 1702  * that maps it. These claimed blocks are easily identified as they will
 1703  * have a block number equal to their logical block number within the
 1704  * snapshot. A copied block can never have this property because it
 1705  * must always have been allocated from a BLK_NOCOPY location.
 1706  */
 1707 int
 1708 ffs_snapblkfree(fs, devvp, bno, size, inum)
 1709         struct fs *fs;
 1710         struct vnode *devvp;
 1711         ufs2_daddr_t bno;
 1712         long size;
 1713         ino_t inum;
 1714 {
 1715         struct buf *ibp, *cbp, *savedcbp = 0;
 1716         struct thread *td = curthread;
 1717         struct inode *ip;
 1718         struct vnode *vp = NULL;
 1719         ufs_lbn_t lbn;
 1720         ufs2_daddr_t blkno;
 1721         int indiroff = 0, error = 0, claimedblk = 0;
 1722         struct snapdata *sn;
 1723 
 1724         lbn = fragstoblks(fs, bno);
 1725 retry:
 1726         VI_LOCK(devvp);
 1727         sn = devvp->v_rdev->si_snapdata;
 1728         if (sn == NULL) {
 1729                 VI_UNLOCK(devvp);
 1730                 return (0);
 1731         }
 1732         if (lockmgr(&sn->sn_lock,
 1733                     LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 1734                     VI_MTX(devvp), td) != 0)
 1735                 goto retry;
 1736         TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 1737                 vp = ITOV(ip);
 1738                 /*
 1739                  * Lookup block being written.
 1740                  */
 1741                 if (lbn < NDADDR) {
 1742                         blkno = DIP(ip, i_db[lbn]);
 1743                 } else {
 1744                         td->td_pflags |= TDP_COWINPROGRESS;
 1745                         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 1746                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 1747                         td->td_pflags &= ~TDP_COWINPROGRESS;
 1748                         if (error)
 1749                                 break;
 1750                         indiroff = (lbn - NDADDR) % NINDIR(fs);
 1751                         if (ip->i_ump->um_fstype == UFS1)
 1752                                 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 1753                         else
 1754                                 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 1755                 }
 1756                 /*
 1757                  * Check to see if block needs to be copied.
 1758                  */
 1759                 if (blkno == 0) {
 1760                         /*
 1761                          * A block that we map is being freed. If it has not
 1762                          * been claimed yet, we will claim or copy it (below).
 1763                          */
 1764                         claimedblk = 1;
 1765                 } else if (blkno == BLK_SNAP) {
 1766                         /*
 1767                          * No previous snapshot claimed the block,
 1768                          * so it will be freed and become a BLK_NOCOPY
 1769                          * (don't care) for us.
 1770                          */
 1771                         if (claimedblk)
 1772                                 panic("snapblkfree: inconsistent block type");
 1773                         if (lbn < NDADDR) {
 1774                                 DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
 1775                                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1776                         } else if (ip->i_ump->um_fstype == UFS1) {
 1777                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 1778                                     BLK_NOCOPY;
 1779                                 bdwrite(ibp);
 1780                         } else {
 1781                                 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
 1782                                     BLK_NOCOPY;
 1783                                 bdwrite(ibp);
 1784                         }
 1785                         continue;
 1786                 } else /* BLK_NOCOPY or default */ {
 1787                         /*
 1788                          * If the snapshot has already copied the block
 1789                          * (default), or does not care about the block,
 1790                          * it is not needed.
 1791                          */
 1792                         if (lbn >= NDADDR)
 1793                                 bqrelse(ibp);
 1794                         continue;
 1795                 }
 1796                 /*
 1797                  * If this is a full size block, we will just grab it
 1798                  * and assign it to the snapshot inode. Otherwise we
 1799                  * will proceed to copy it. See explanation for this
 1800                  * routine as to why only a single snapshot needs to
 1801                  * claim this block.
 1802                  */
 1803                 if (size == fs->fs_bsize) {
 1804 #ifdef DEBUG
 1805                         if (snapdebug)
 1806                                 printf("%s %d lbn %jd from inum %d\n",
 1807                                     "Grabonremove: snapino", ip->i_number,
 1808                                     (intmax_t)lbn, inum);
 1809 #endif
 1810                         if (lbn < NDADDR) {
 1811                                 DIP_SET(ip, i_db[lbn], bno);
 1812                         } else if (ip->i_ump->um_fstype == UFS1) {
 1813                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
 1814                                 bdwrite(ibp);
 1815                         } else {
 1816                                 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
 1817                                 bdwrite(ibp);
 1818                         }
 1819                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
 1820                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1821                         lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td);
 1822                         return (1);
 1823                 }
 1824                 if (lbn >= NDADDR)
 1825                         bqrelse(ibp);
 1826                 /*
 1827                  * Allocate the block into which to do the copy. Note that this
 1828                  * allocation will never require any additional allocations for
 1829                  * the snapshot inode.
 1830                  */
 1831                 td->td_pflags |= TDP_COWINPROGRESS;
 1832                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 1833                     fs->fs_bsize, KERNCRED, 0, &cbp);
 1834                 td->td_pflags &= ~TDP_COWINPROGRESS;
 1835                 if (error)
 1836                         break;
 1837 #ifdef DEBUG
 1838                 if (snapdebug)
 1839                         printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
 1840                             "Copyonremove: snapino ", ip->i_number,
 1841                             (intmax_t)lbn, "for inum", inum, size,
 1842                             (intmax_t)cbp->b_blkno);
 1843 #endif
 1844                 /*
 1845                  * If we have already read the old block contents, then
 1846                  * simply copy them to the new block. Note that we need
 1847                  * to synchronously write snapshots that have not been
 1848                  * unlinked, and hence will be visible after a crash,
 1849                  * to ensure their integrity.
 1850                  */
 1851                 if (savedcbp != 0) {
 1852                         bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 1853                         bawrite(cbp);
 1854                         if (dopersistence && ip->i_effnlink > 0)
 1855                                 (void) ffs_syncvnode(vp, MNT_WAIT);
 1856                         continue;
 1857                 }
 1858                 /*
 1859                  * Otherwise, read the old block contents into the buffer.
 1860                  */
 1861                 if ((error = readblock(vp, cbp, lbn)) != 0) {
 1862                         bzero(cbp->b_data, fs->fs_bsize);
 1863                         bawrite(cbp);
 1864                         if (dopersistence && ip->i_effnlink > 0)
 1865                                 (void) ffs_syncvnode(vp, MNT_WAIT);
 1866                         break;
 1867                 }
 1868                 savedcbp = cbp;
 1869         }
 1870         /*
 1871          * Note that we need to synchronously write snapshots that
 1872          * have not been unlinked, and hence will be visible after
 1873          * a crash, to ensure their integrity.
 1874          */
 1875         if (savedcbp) {
 1876                 vp = savedcbp->b_vp;
 1877                 bawrite(savedcbp);
 1878                 if (dopersistence && VTOI(vp)->i_effnlink > 0)
 1879                         (void) ffs_syncvnode(vp, MNT_WAIT);
 1880         }
 1881         /*
 1882          * If we have been unable to allocate a block in which to do
 1883          * the copy, then return non-zero so that the fragment will
 1884          * not be freed. Although space will be lost, the snapshot
 1885          * will stay consistent.
 1886          */
 1887         lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td);
 1888         return (error);
 1889 }
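
/*
 * A minimal sketch of the per-snapshot decision ffs_snapblkfree() makes
 * above when the filesystem frees a block: an uncopied mapping (0) means
 * the snapshot must claim the block (full-sized blocks only) or copy it,
 * a BLK_SNAP mapping simply becomes BLK_NOCOPY since no previous
 * snapshot claimed the block, and anything else (BLK_NOCOPY or an
 * already-copied block) needs no work.  The enum and helper names are
 * assumptions used only for illustration.
 */
enum snapfree_action_sketch {
        SNAPFREE_CLAIM_OR_COPY, /* block not yet claimed or copied */
        SNAPFREE_MARK_NOCOPY,   /* convert BLK_SNAP to don't-care */
        SNAPFREE_IGNORE         /* BLK_NOCOPY or already copied */
};

static enum snapfree_action_sketch
snapblkfree_action_sketch(ufs2_daddr_t blkno)
{
        if (blkno == 0)
                return (SNAPFREE_CLAIM_OR_COPY);
        if (blkno == BLK_SNAP)
                return (SNAPFREE_MARK_NOCOPY);
        return (SNAPFREE_IGNORE);
}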
 1890 
 1891 /*
 1892  * Associate snapshot files when mounting.
 1893  */
 1894 void
 1895 ffs_snapshot_mount(mp)
 1896         struct mount *mp;
 1897 {
 1898         struct ufsmount *ump = VFSTOUFS(mp);
 1899         struct vnode *devvp = ump->um_devvp;
 1900         struct fs *fs = ump->um_fs;
 1901         struct thread *td = curthread;
 1902         struct snapdata *sn;
 1903         struct vnode *vp;
 1904         struct vnode *lastvp;
 1905         struct inode *ip;
 1906         struct uio auio;
 1907         struct iovec aiov;
 1908         void *snapblklist;
 1909         char *reason;
 1910         daddr_t snaplistsize;
 1911         int error, snaploc, loc;
 1912 
 1913         /*
 1914          * XXX The following needs to be set before ffs_truncate or
 1915          * VOP_READ can be called.
 1916          */
 1917         mp->mnt_stat.f_iosize = fs->fs_bsize;
 1918         /*
 1919          * Process each snapshot listed in the superblock.
 1920          */
 1921         vp = NULL;
 1922         lastvp = NULL;
 1923         sn = devvp->v_rdev->si_snapdata;
 1924         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
 1925                 if (fs->fs_snapinum[snaploc] == 0)
 1926                         break;
 1927                 if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
 1928                     LK_EXCLUSIVE, &vp)) != 0){
 1929                         printf("ffs_snapshot_mount: vget failed %d\n", error);
 1930                         continue;
 1931                 }
 1932                 ip = VTOI(vp);
 1933                 if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
 1934                     lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
 1935                         if ((ip->i_flags & SF_SNAPSHOT) == 0) {
 1936                                 reason = "non-snapshot";
 1937                         } else {
 1938                                 reason = "old format snapshot";
 1939                                 (void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
 1940                                 (void)ffs_syncvnode(vp, MNT_WAIT);
 1941                         }
 1942                         printf("ffs_snapshot_mount: %s inode %d\n",
 1943                             reason, fs->fs_snapinum[snaploc]);
 1944                         vput(vp);
 1945                         vp = NULL;
 1946                         for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
 1947                                 if (fs->fs_snapinum[loc] == 0)
 1948                                         break;
 1949                                 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
 1950                         }
 1951                         fs->fs_snapinum[loc - 1] = 0;
 1952                         snaploc--;
 1953                         continue;
 1954                 }
 1955                 /*
 1956                  * If there already exist snapshots on this filesystem, grab a
 1957                  * reference to their shared lock. If this is the first snapshot
 1958                  * on this filesystem, we need to allocate a lock for the
 1959                  * snapshots to share. In either case, acquire the snapshot
 1960                  * lock and give up our original private lock.
 1961                  */
 1962                 VI_LOCK(devvp);
 1963                 if (sn != NULL) {
 1964 
 1965                         VI_UNLOCK(devvp);
 1966                         VI_LOCK(vp);
 1967                         vp->v_vnlock = &sn->sn_lock;
 1968                 } else {
 1969                         VI_UNLOCK(devvp);
 1970                         sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
 1971                         TAILQ_INIT(&sn->sn_head);
 1972                         lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
 1973                             LK_CANRECURSE | LK_NOSHARE);
 1974                         VI_LOCK(vp);
 1975                         vp->v_vnlock = &sn->sn_lock;
 1976                         devvp->v_rdev->si_snapdata = sn;
 1977                 }
 1978                 lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
 1979                     VI_MTX(vp), td);
 1980                 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 1981                 /*
 1982                  * Link it onto the active snapshot list.
 1983                  */
 1984                 VI_LOCK(devvp);
 1985                 if (ip->i_nextsnap.tqe_prev != 0)
 1986                         panic("ffs_snapshot_mount: %d already on list",
 1987                             ip->i_number);
 1988                 else
 1989                         TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
 1990                 vp->v_vflag |= VV_SYSTEM;
 1991                 VI_UNLOCK(devvp);
 1992                 VOP_UNLOCK(vp, 0, td);
 1993                 lastvp = vp;
 1994         }
 1995         vp = lastvp;
 1996         /*
 1997          * No usable snapshots found.
 1998          */
 1999         if (vp == NULL)
 2000                 return;
 2001         /*
 2002          * Allocate the space for the block hints list. We always want to
 2003          * use the list from the newest snapshot.
 2004          */
 2005         auio.uio_iov = &aiov;
 2006         auio.uio_iovcnt = 1;
 2007         aiov.iov_base = (void *)&snaplistsize;
 2008         aiov.iov_len = sizeof(snaplistsize);
 2009         auio.uio_resid = aiov.iov_len;
 2010         auio.uio_offset =
 2011             lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
 2012         auio.uio_segflg = UIO_SYSSPACE;
 2013         auio.uio_rw = UIO_READ;
 2014         auio.uio_td = td;
 2015         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 2016         if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 2017                 printf("ffs_snapshot_mount: read_1 failed %d\n", error);
 2018                 VOP_UNLOCK(vp, 0, td);
 2019                 return;
 2020         }
 2021         MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
 2022             M_UFSMNT, M_WAITOK);
 2023         auio.uio_iovcnt = 1;
 2024         aiov.iov_base = snapblklist;
 2025         aiov.iov_len = snaplistsize * sizeof (daddr_t);
 2026         auio.uio_resid = aiov.iov_len;
 2027         auio.uio_offset -= sizeof(snaplistsize);
 2028         if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 2029                 printf("ffs_snapshot_mount: read_2 failed %d\n", error);
 2030                 VOP_UNLOCK(vp, 0, td);
 2031                 FREE(snapblklist, M_UFSMNT);
 2032                 return;
 2033         }
 2034         VOP_UNLOCK(vp, 0, td);
 2035         VI_LOCK(devvp);
 2036         ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
 2037         sn->sn_listsize = snaplistsize;
 2038         sn->sn_blklist = (daddr_t *)snapblklist;
 2039         devvp->v_vflag |= VV_COPYONWRITE;
 2040         VI_UNLOCK(devvp);
 2041 }
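
/*
 * A minimal sketch of where the list read above is stored: the hint list
 * lives in the snapshot file just past the image of the filesystem, at
 * the first logical block beyond fs_size (a snapshot whose size is
 * exactly this offset is treated above as an old-format snapshot with no
 * list).  Its first daddr_t is the list length, which is why the binary
 * searches over sn_blklist start at index 1.  The helper is an
 * illustration only.
 */
static off_t
snapblklist_offset_sketch(struct fs *fs)
{
        /* first byte past the last filesystem block inside the snapshot */
        return (lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)));
}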
 2042 
 2043 /*
 2044  * Disassociate snapshot files when unmounting.
 2045  */
 2046 void
 2047 ffs_snapshot_unmount(mp)
 2048         struct mount *mp;
 2049 {
 2050         struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
 2051         struct snapdata *sn;
 2052         struct inode *xp;
 2053         struct vnode *vp;
 2054         struct thread *td = curthread;
 2055 
 2056         VI_LOCK(devvp);
 2057         sn = devvp->v_rdev->si_snapdata;
 2058         while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
 2059                 vp = ITOV(xp);
 2060                 TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
 2061                 xp->i_nextsnap.tqe_prev = 0;
 2062                 lockmgr(&sn->sn_lock, 
 2063                         LK_INTERLOCK | LK_EXCLUSIVE,
 2064                         VI_MTX(devvp),
 2065                         td);
 2066                 VI_LOCK(vp);
 2067                 lockmgr(&vp->v_lock,
 2068                         LK_INTERLOCK | LK_EXCLUSIVE,
 2069                         VI_MTX(vp), td);
 2070                 VI_LOCK(vp);
 2071                 KASSERT(vp->v_vnlock == &sn->sn_lock,
 2072                 ("ffs_snapshot_unmount: lost lock mutation")); 
 2073                 vp->v_vnlock = &vp->v_lock;
 2074                 VI_UNLOCK(vp);
 2075                 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 2076                 lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 2077                 if (xp->i_effnlink > 0)
 2078                         vrele(vp);
 2079                 VI_LOCK(devvp);
 2080                 sn = devvp->v_rdev->si_snapdata;
 2081         }
 2082         try_free_snapdata(devvp, td);
 2083         ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
 2084 }
 2085 
 2086 /*
 2087  * Check whether the buffer's block belongs to a device buffer that must be
 2088  * locked after snaplk. devvp shall be locked on entry, and will be
 2089  * left locked upon exit.
 2090  */
 2091 static int
 2092 ffs_bp_snapblk(devvp, bp)
 2093         struct vnode *devvp;
 2094         struct buf *bp;
 2095 {
 2096         struct snapdata *sn;
 2097         struct fs *fs;
 2098         ufs2_daddr_t lbn, *snapblklist;
 2099         int lower, upper, mid;
 2100 
 2101         ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
 2102         KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
 2103         sn = devvp->v_rdev->si_snapdata;
 2104         if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
 2105                 return (0);
 2106         fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
 2107         lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 2108         snapblklist = sn->sn_blklist;
 2109         upper = sn->sn_listsize - 1;
 2110         lower = 1;
 2111         while (lower <= upper) {
 2112                 mid = (lower + upper) / 2;
 2113                 if (snapblklist[mid] == lbn)
 2114                         break;
 2115                 if (snapblklist[mid] < lbn)
 2116                         lower = mid + 1;
 2117                 else
 2118                         upper = mid - 1;
 2119         }
 2120         if (lower <= upper)
 2121                 return (1);
 2122         return (0);
 2123 }
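
/*
 * A minimal sketch of the lookup performed in ffs_bp_snapblk() above and
 * repeated in ffs_copyonwrite() below: sn_blklist is a sorted array of
 * logical block numbers whose element 0 holds the list length, so the
 * binary search runs over indices 1 .. listsize - 1.  The helper name is
 * an assumption used only for illustration.
 */
static int
snapblklist_lookup_sketch(ufs2_daddr_t *snapblklist, int listsize,
    ufs2_daddr_t lbn)
{
        int lower, upper, mid;

        lower = 1;              /* element 0 is the length word */
        upper = listsize - 1;
        while (lower <= upper) {
                mid = (lower + upper) / 2;
                if (snapblklist[mid] == lbn)
                        return (1);     /* lbn is on the hint list */
                if (snapblklist[mid] < lbn)
                        lower = mid + 1;
                else
                        upper = mid - 1;
        }
        return (0);
}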
 2124 
 2125 void
 2126 ffs_bdflush(bo, bp)
 2127         struct bufobj *bo;
 2128         struct buf *bp;
 2129 {
 2130         struct thread *td;
 2131         struct vnode *vp, *devvp;
 2132         struct buf *nbp;
 2133         int bp_bdskip;
 2134 
 2135         if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
 2136                 return;
 2137 
 2138         td = curthread;
 2139         vp = bp->b_vp;
 2140         devvp = bo->__bo_vnode;
 2141         KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
 2142 
 2143         VI_LOCK(devvp);
 2144         bp_bdskip = ffs_bp_snapblk(devvp, bp);
 2145         if (bp_bdskip)
 2146                 bdwriteskip++;
 2147         VI_UNLOCK(devvp);
 2148         if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
 2149                 (void) VOP_FSYNC(vp, MNT_NOWAIT, td);
 2150                 altbufferflushes++;
 2151         } else {
 2152                 BO_LOCK(bo);
 2153                 /*
 2154                  * Try to find a buffer to flush.
 2155                  */
 2156                 TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 2157                         if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 2158                             BUF_LOCK(nbp,
 2159                                      LK_EXCLUSIVE | LK_NOWAIT, NULL))
 2160                                 continue;
 2161                         if (bp == nbp)
 2162                                 panic("bdwrite: found ourselves");
 2163                         BO_UNLOCK(bo);
 2164                         /*
 2165                          * Don't call buf_countdeps() with
 2166                          * the bo lock held.
 2167                          */
 2168                         if (buf_countdeps(nbp, 0)) {
 2169                                 BO_LOCK(bo);
 2170                                 BUF_UNLOCK(nbp);
 2171                                 continue;
 2172                         }
 2173                         if (bp_bdskip) {
 2174                                 VI_LOCK(devvp);
 2175                                 if (!ffs_bp_snapblk(vp, nbp)) {
 2176                                         if (BO_MTX(bo) != VI_MTX(vp)) {
 2177                                                 VI_UNLOCK(devvp);
 2178                                                 BO_LOCK(bo);
 2179                                         }
 2180                                         BUF_UNLOCK(nbp);
 2181                                         continue;
 2182                                 }
 2183                                 VI_UNLOCK(devvp);
 2184                         }
 2185                         if (nbp->b_flags & B_CLUSTEROK) {
 2186                                 vfs_bio_awrite(nbp);
 2187                         } else {
 2188                                 bremfree(nbp);
 2189                                 bawrite(nbp);
 2190                         }
 2191                         dirtybufferflushes++;
 2192                         break;
 2193                 }
 2194                 if (nbp == NULL)
 2195                         BO_UNLOCK(bo);
 2196         }
 2197 }
 2198 
 2199 /*
 2200  * Check for need to copy block that is about to be written,
 2201  * copying the block if necessary.
 2202  */
 2203 int
 2204 ffs_copyonwrite(devvp, bp)
 2205         struct vnode *devvp;
 2206         struct buf *bp;
 2207 {
 2208         struct snapdata *sn;
 2209         struct buf *ibp, *cbp, *savedcbp = 0;
 2210         struct thread *td = curthread;
 2211         struct fs *fs;
 2212         struct inode *ip;
 2213         struct vnode *vp = 0;
 2214         ufs2_daddr_t lbn, blkno, *snapblklist;
 2215         int lower, upper, mid, indiroff, error = 0;
 2216         int launched_async_io, prev_norunningbuf;
 2217         long saved_runningbufspace;
 2218 
 2219         if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
 2220                 return (0);             /* Update on a snapshot file */
 2221         if (td->td_pflags & TDP_COWINPROGRESS)
 2222                 panic("ffs_copyonwrite: recursive call");
 2223         /*
 2224          * First check to see if it is in the preallocated list.
 2225          * By doing this check we avoid several potential deadlocks.
 2226          */
 2227         VI_LOCK(devvp);
 2228         sn = devvp->v_rdev->si_snapdata;
 2229         if (sn == NULL ||
 2230             TAILQ_EMPTY(&sn->sn_head)) {
 2231                 VI_UNLOCK(devvp);
 2232                 return (0);             /* No snapshot */
 2233         }
 2234         ip = TAILQ_FIRST(&sn->sn_head);
 2235         fs = ip->i_fs;
 2236         lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 2237         snapblklist = sn->sn_blklist;
 2238         upper = sn->sn_listsize - 1;
 2239         lower = 1;
 2240         while (lower <= upper) {
 2241                 mid = (lower + upper) / 2;
 2242                 if (snapblklist[mid] == lbn)
 2243                         break;
 2244                 if (snapblklist[mid] < lbn)
 2245                         lower = mid + 1;
 2246                 else
 2247                         upper = mid - 1;
 2248         }
 2249         if (lower <= upper) {
 2250                 VI_UNLOCK(devvp);
 2251                 return (0);
 2252         }
 2253         launched_async_io = 0;
 2254         prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
 2255         /*
 2256          * Since I/O on bp isn't yet in progress and it may be blocked
 2257          * for a long time waiting on snaplk, back it out of
 2258          * runningbufspace, possibly waking other threads waiting for space.
 2259          */
 2260         saved_runningbufspace = bp->b_runningbufspace;
 2261         if (saved_runningbufspace != 0)
 2262                 runningbufwakeup(bp);
 2263         /*
 2264          * Not in the precomputed list, so check the snapshots.
 2265          */
 2266         while (lockmgr(&sn->sn_lock,
 2267                        LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 2268                        VI_MTX(devvp), td) != 0) {
 2269                 VI_LOCK(devvp);
 2270                 sn = devvp->v_rdev->si_snapdata;
 2271                 if (sn == NULL ||
 2272                     TAILQ_EMPTY(&sn->sn_head)) {
 2273                         VI_UNLOCK(devvp);
 2274                         if (saved_runningbufspace != 0) {
 2275                                 bp->b_runningbufspace = saved_runningbufspace;
 2276                                 atomic_add_int(&runningbufspace,
 2277                                                bp->b_runningbufspace);
 2278                         }
 2279                         return (0);             /* Snapshot gone */
 2280                 }
 2281         }
 2282         TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 2283                 vp = ITOV(ip);
 2284                 /*
 2285                  * We ensure that everything of our own that needs to be
 2286                  * copied will be done at the time that ffs_snapshot is
 2287                  * called. Thus we can skip the check here which can
 2288                  * deadlock in doing the lookup in UFS_BALLOC.
 2289                  */
 2290                 if (bp->b_vp == vp)
 2291                         continue;
 2292                 /*
 2293                  * Check to see if block needs to be copied. We do not have
 2294                  * to hold the snapshot lock while doing this lookup as it
 2295                  * will never require any additional allocations for the
 2296                  * snapshot inode.
 2297                  */
 2298                 if (lbn < NDADDR) {
 2299                         blkno = DIP(ip, i_db[lbn]);
 2300                 } else {
 2301                         td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
 2302                         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 2303                            fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 2304                         td->td_pflags &= ~TDP_COWINPROGRESS;
 2305                         if (error)
 2306                                 break;
 2307                         indiroff = (lbn - NDADDR) % NINDIR(fs);
 2308                         if (ip->i_ump->um_fstype == UFS1)
 2309                                 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 2310                         else
 2311                                 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 2312                         bqrelse(ibp);
 2313                 }
 2314 #ifdef INVARIANTS
 2315                 if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
 2316                         panic("ffs_copyonwrite: bad copy block");
 2317 #endif
 2318                 if (blkno != 0)
 2319                         continue;
 2320                 /*
 2321                  * Allocate the block into which to do the copy. Since
 2322                  * multiple processes may all try to copy the same block,
 2323                  * we have to recheck our need to do a copy if we sleep
 2324                  * waiting for the lock.
 2325                  *
 2326                  * Because all snapshots on a filesystem share a single
 2327                  * lock, we ensure that we will never be in competition
 2328                  * with another process to allocate a block.
 2329                  */
 2330                 td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
 2331                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 2332                     fs->fs_bsize, KERNCRED, 0, &cbp);
 2333                 td->td_pflags &= ~TDP_COWINPROGRESS;
 2334                 if (error)
 2335                         break;
 2336 #ifdef DEBUG
 2337                 if (snapdebug) {
 2338                         printf("Copyonwrite: snapino %d lbn %jd for ",
 2339                             ip->i_number, (intmax_t)lbn);
 2340                         if (bp->b_vp == devvp)
 2341                                 printf("fs metadata");
 2342                         else
 2343                                 printf("inum %d", VTOI(bp->b_vp)->i_number);
 2344                         printf(" lblkno %jd to blkno %jd\n",
 2345                             (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
 2346                 }
 2347 #endif
 2348                 /*
 2349                  * If we have already read the old block contents, then
 2350                  * simply copy them to the new block. Note that we need
 2351                  * to synchronously write snapshots that have not been
 2352                  * unlinked, and hence will be visible after a crash,
 2353                  * to ensure their integrity.
 2354                  */
 2355                 if (savedcbp != 0) {
 2356                         bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 2357                         bawrite(cbp);
 2358                         if (dopersistence && ip->i_effnlink > 0)
 2359                                 (void) ffs_syncvnode(vp, MNT_WAIT);
 2360                         else
 2361                                 launched_async_io = 1;
 2362                         continue;
 2363                 }
 2364                 /*
 2365                  * Otherwise, read the old block contents into the buffer.
 2366                  */
 2367                 if ((error = readblock(vp, cbp, lbn)) != 0) {
 2368                         bzero(cbp->b_data, fs->fs_bsize);
 2369                         bawrite(cbp);
 2370                         if (dopersistence && ip->i_effnlink > 0)
 2371                                 (void) ffs_syncvnode(vp, MNT_WAIT);
 2372                         else
 2373                                 launched_async_io = 1;
 2374                         break;
 2375                 }
 2376                 savedcbp = cbp;
 2377         }
 2378         /*
 2379          * Note that we need to synchronously write snapshots that
 2380          * have not been unlinked, and hence will be visible after
 2381          * a crash, to ensure their integrity.
 2382          */
 2383         if (savedcbp) {
 2384                 vp = savedcbp->b_vp;
 2385                 bawrite(savedcbp);
 2386                 if (dopersistence && VTOI(vp)->i_effnlink > 0)
 2387                         (void) ffs_syncvnode(vp, MNT_WAIT);
 2388                 else
 2389                         launched_async_io = 1;
 2390         }
 2391         lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td);
 2392         td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
 2393                 prev_norunningbuf;
 2394         if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
 2395                 waitrunningbufspace();
 2396         /*
 2397          * I/O on bp will now be started, so count it in runningbufspace.
 2398          */
 2399         if (saved_runningbufspace != 0) {
 2400                 bp->b_runningbufspace = saved_runningbufspace;
 2401                 atomic_add_int(&runningbufspace, bp->b_runningbufspace);
 2402         }
 2403         return (error);
 2404 }
 2405 
 2406 /*
 2407  * Read the specified block into the given buffer.
 2408  * Much of this boiler-plate comes from bwrite().
 2409  */
 2410 static int
 2411 readblock(vp, bp, lbn)
 2412         struct vnode *vp;
 2413         struct buf *bp;
 2414         ufs2_daddr_t lbn;
 2415 {
 2416         struct inode *ip = VTOI(vp);
 2417         struct bio *bip;
 2418 
 2419         bip = g_alloc_bio();
 2420         bip->bio_cmd = BIO_READ;
 2421         bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
 2422         bip->bio_data = bp->b_data;
 2423         bip->bio_length = bp->b_bcount;
 2424         bip->bio_done = NULL;
 2425 
 2426         g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
 2427         bp->b_error = biowait(bip, "snaprdb");
 2428         g_destroy_bio(bip);
 2429         return (bp->b_error);
 2430 }
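
/*
 * A minimal sketch of the unit conversion used in readblock() above: a
 * snapshot's logical block lbn names the same-numbered filesystem block
 * on the underlying device, so its byte offset is obtained by going
 * logical block -> fragment (blkstofrags) -> device block (fsbtodb) ->
 * bytes (dbtob).  The helper is an illustration only.
 */
static off_t
snapblk_byteoff_sketch(struct fs *fs, ufs2_daddr_t lbn)
{
        return (dbtob(fsbtodb(fs, blkstofrags(fs, lbn))));
}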
 2431 
 2432 /*
 2433  * Process file deletes that were deferred by ufs_inactive() due to
 2434  * the file system being suspended. Transfer IN_LAZYACCESS into
 2435  * IN_MODIFIED for vnodes that were accessed during suspension.
 2436  */
 2437 static void
 2438 process_deferred_inactive(struct mount *mp)
 2439 {
 2440         struct vnode *vp, *mvp;
 2441         struct inode *ip;
 2442         struct thread *td;
 2443         int error;
 2444 
 2445         td = curthread;
 2446         (void) vn_start_secondary_write(NULL, &mp, V_WAIT);
 2447         MNT_ILOCK(mp);
 2448  loop:
 2449         MNT_VNODE_FOREACH(vp, mp, mvp) {
 2450                 VI_LOCK(vp);
 2451                 /*
 2452                  * IN_LAZYACCESS is checked here without holding any
 2453                  * vnode lock, but this flag is set only while holding
 2454                  * vnode interlock.
 2455                  */
 2456                 if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 ||
 2457                     ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
 2458                         ((vp->v_iflag & VI_OWEINACT) == 0 ||
 2459                         vp->v_usecount > 0))) {
 2460                         VI_UNLOCK(vp);
 2461                         continue;
 2462                 }
 2463                 MNT_IUNLOCK(mp);
 2464                 vholdl(vp);
 2465                 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td);
 2466                 if (error != 0) {
 2467                         vdrop(vp);
 2468                         MNT_ILOCK(mp);
 2469                         if (error == ENOENT)
 2470                                 continue;       /* vnode recycled */
 2471                         MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 2472                         goto loop;
 2473                 }
 2474                 ip = VTOI(vp);
 2475                 if ((ip->i_flag & IN_LAZYACCESS) != 0) {
 2476                         ip->i_flag &= ~IN_LAZYACCESS;
 2477                         ip->i_flag |= IN_MODIFIED;
 2478                 }
 2479                 VI_LOCK(vp);
 2480                 if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
 2481                         VI_UNLOCK(vp);
 2482                         VOP_UNLOCK(vp, 0, td);
 2483                         vdrop(vp);
 2484                         MNT_ILOCK(mp);
 2485                         continue;
 2486                 }
 2487                 
 2488                 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 2489                          ("process_deferred_inactive: "
 2490                           "recursed on VI_DOINGINACT"));
 2491                 vp->v_iflag |= VI_DOINGINACT;
 2492                 vp->v_iflag &= ~VI_OWEINACT;
 2493                 VI_UNLOCK(vp);
 2494                 (void) VOP_INACTIVE(vp, td);
 2495                 VI_LOCK(vp);
 2496                 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 2497                          ("process_deferred_inactive: lost VI_DOINGINACT"));
 2498                 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 2499                          ("process_deferred_inactive: got VI_OWEINACT"));
 2500                 vp->v_iflag &= ~VI_DOINGINACT;
 2501                 VI_UNLOCK(vp);
 2502                 VOP_UNLOCK(vp, 0, td);
 2503                 vdrop(vp);
 2504                 MNT_ILOCK(mp);
 2505         }
 2506         MNT_IUNLOCK(mp);
 2507         vn_finished_secondary_write(mp);
 2508 }
 2509 
 2510 /* Try to free snapdata associated with devvp */
 2511 static void
 2512 try_free_snapdata(struct vnode *devvp,
 2513                   struct thread *td)
 2514 {
 2515         struct snapdata *sn;
 2516         ufs2_daddr_t *snapblklist;
 2517 
 2518         sn = devvp->v_rdev->si_snapdata;
 2519 
 2520         if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
 2521             (devvp->v_vflag & VV_COPYONWRITE) == 0) {
 2522                 VI_UNLOCK(devvp);
 2523                 return;
 2524         }
 2525 
 2526         devvp->v_rdev->si_snapdata = NULL;
 2527         devvp->v_vflag &= ~VV_COPYONWRITE;
 2528         snapblklist = sn->sn_blklist;
 2529         sn->sn_blklist = NULL;
 2530         sn->sn_listsize = 0;
 2531         lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
 2532         lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 2533         lockdestroy(&sn->sn_lock);
 2534         free(sn, M_UFSMNT);
 2535         if (snapblklist != NULL)
 2536                 FREE(snapblklist, M_UFSMNT);
 2537 }
 2538 #endif
