The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/ufs/ffs/ffs_snapshot.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
    3  *
    4  * Further information about snapshots can be obtained from:
    5  *
    6  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
    7  *      1614 Oxford Street              mckusick@mckusick.com
    8  *      Berkeley, CA 94709-1608         +1-510-843-9542
    9  *      USA
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  *
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  *
   21  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
   22  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   24  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
   25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   31  * SUCH DAMAGE.
   32  *
   33  *      @(#)ffs_snapshot.c      8.11 (McKusick) 7/23/00
   34  */
   35 
   36 #include <sys/cdefs.h>
   37 __FBSDID("$FreeBSD$");
   38 
   39 #include "opt_quota.h"
   40 
   41 #include <sys/param.h>
   42 #include <sys/kernel.h>
   43 #include <sys/systm.h>
   44 #include <sys/conf.h>
   45 #include <sys/bio.h>
   46 #include <sys/buf.h>
   47 #include <sys/proc.h>
   48 #include <sys/namei.h>
   49 #include <sys/sched.h>
   50 #include <sys/stat.h>
   51 #include <sys/malloc.h>
   52 #include <sys/mount.h>
   53 #include <sys/resource.h>
   54 #include <sys/resourcevar.h>
   55 #include <sys/vnode.h>
   56 
   57 #include <geom/geom.h>
   58 
   59 #include <ufs/ufs/extattr.h>
   60 #include <ufs/ufs/quota.h>
   61 #include <ufs/ufs/ufsmount.h>
   62 #include <ufs/ufs/inode.h>
   63 #include <ufs/ufs/ufs_extern.h>
   64 
   65 #include <ufs/ffs/fs.h>
   66 #include <ufs/ffs/ffs_extern.h>
   67 
   68 #define KERNCRED thread0.td_ucred /* thread0's credential; ffs_snapshot() passes it to UFS_BALLOC() and bread() */
   69 #define DEBUG 1 /* defined unconditionally, so the #ifdef DEBUG sysctl block in this file is compiled in */
   70 
   71 #include "opt_ffs.h"
   72 
   73 #ifdef NO_FFS_SNAPSHOT
   74 int
   75 ffs_snapshot(mp, snapfile)
   76         struct mount *mp;
   77         char *snapfile;
   78 {
   79         return (EINVAL);
   80 }
   81 
   82 int
   83 ffs_snapblkfree(fs, devvp, bno, size, inum)
   84         struct fs *fs;
   85         struct vnode *devvp;
   86         ufs2_daddr_t bno;
   87         long size;
   88         ino_t inum;
   89 {
   90         return (EINVAL);
   91 }
   92 
   93 void
   94 ffs_snapremove(vp)
   95         struct vnode *vp;
   96 {
   97 }
   98 
   99 void
  100 ffs_snapshot_mount(mp)
  101         struct mount *mp;
  102 {
  103 }
  104 
  105 void
  106 ffs_snapshot_unmount(mp)
  107         struct mount *mp;
  108 {
  109 }
  110 
  111 void
  112 ffs_snapgone(ip)
  113         struct inode *ip;
  114 {
  115 }
  116 
  117 int
  118 ffs_copyonwrite(devvp, bp)
  119         struct vnode *devvp;
  120         struct buf *bp;
  121 {
  122         return (EINVAL);
  123 }
  124 
  125 #else
  126 
  127 TAILQ_HEAD(snaphead, inode);      /* struct snaphead: tail-queue head of struct inode; used as snapdata.sn_head */
  128 
  129 struct snapdata {                 /* ffs_snapshot() reaches this via devvp->v_rdev->si_snapdata */
  130         struct snaphead sn_head;  /* snapshot inodes, linked through i_nextsnap; newest at the tail */
  131         daddr_t sn_listsize;      /* entry count of sn_blklist, including its leading length slot */
  132         daddr_t *sn_blklist;      /* list of preallocated snapshot blocks; slot 0 holds the length */
  133         struct lock sn_lock;      /* "snaplk": installed as v_vnlock of the snapshot vnode */
  134 };
  135 
  136 static int cgaccount(int, struct vnode *, struct buf *, int);     /* ffs_snapshot() calls this per cylinder group; last arg is 1 before suspension, 2 during */
  137 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
  138     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
  139     ufs_lbn_t, int), int);      /* UFS1; ffs_snapshot() supplies an accounting callback plus BLK_NOCOPY or BLK_SNAP */
  140 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
  141     ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
  142     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
  143     ufs_lbn_t, int), int);
  144 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
  145     struct fs *, ufs_lbn_t, int);       /* callback handed to expunge_ufs1() for unlinked active files */
  146 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
  147     struct fs *, ufs_lbn_t, int);       /* callback handed to expunge_ufs1() for older snapshots */
  148 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
  149     struct fs *, ufs_lbn_t, int);       /* callback handed to expunge_ufs1() for the new snapshot itself */
  150 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
  151     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
  152     ufs_lbn_t, int), int);      /* UFS2 counterpart of expunge_ufs1() */
  153 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
  154     ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
  155     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
  156     ufs_lbn_t, int), int);
  157 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
  158     struct fs *, ufs_lbn_t, int);       /* UFS2 counterpart of fullacct_ufs1() */
  159 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
  160     struct fs *, ufs_lbn_t, int);       /* UFS2 counterpart of snapacct_ufs1() */
  161 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
  162     struct fs *, ufs_lbn_t, int);       /* UFS2 counterpart of mapacct_ufs1() */
  163 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);       /* ffs_snapshot() uses this to fill the buffer for the last block */
  164 static void process_deferred_inactive(struct mount *);
  165 static void try_free_snapdata(struct vnode *devvp, struct thread *td);
  166 static int ffs_bp_snapblk(struct vnode *, struct buf *);
  167 
  168 /*
  169  * To ensure the consistency of snapshots across crashes, we must
  170  * synchronously write out copied blocks before allowing the
  171  * originals to be modified. Because of the rather severe speed
  172  * penalty that this imposes, the following flag allows this
  173  * crash persistence to be disabled.
  174  */
  175 int dopersistence = 0;    /* defaults to 0, i.e. the crash persistence described above is off */
  176 
  177 #ifdef DEBUG
  178 #include <sys/sysctl.h>
  179 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");   /* each knob here is exported read-write under _debug */
  180 static int snapdebug = 0;         /* nonzero: ffs_snapshot() vprint()s each busy vnode it examines */
  181 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
  182 int collectsnapstats = 0;         /* nonzero: ffs_snapshot() times the suspension and printf()s the result */
  183 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
  184         0, "");
  185 #endif /* DEBUG */
  186 
  187 /*
  188  * Create a snapshot file and initialize it for the filesystem.
  189  */
  190 int
  191 ffs_snapshot(mp, snapfile)
  192         struct mount *mp;
  193         char *snapfile;
  194 {
  195         ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
  196         int error, cg, snaploc;
  197         int i, size, len, loc;
  198         int flag;
  199         struct timespec starttime = {0, 0}, endtime;
  200         char saved_nice = 0;
  201         long redo = 0, snaplistsize = 0;
  202         int32_t *lp;
  203         void *space;
  204         struct fs *copy_fs = NULL, *fs;
  205         struct thread *td = curthread;
  206         struct inode *ip, *xp;
  207         struct buf *bp, *nbp, *ibp, *sbp = NULL;
  208         struct nameidata nd;
  209         struct mount *wrtmp;
  210         struct vattr vat;
  211         struct vnode *vp, *xvp, *mvp, *devvp;
  212         struct uio auio;
  213         struct iovec aiov;
  214         struct snapdata *sn;
  215         struct ufsmount *ump;
  216 
  217         ump = VFSTOUFS(mp);
  218         fs = ump->um_fs;
  219         sn = NULL;
  220         MNT_ILOCK(mp);
  221         flag = mp->mnt_flag;
  222         MNT_IUNLOCK(mp);
  223 
  224         /*
  225          * Need to serialize access to snapshot code per filesystem.
  226          */
  227         /*
  228          * Assign a snapshot slot in the superblock.
  229          */
  230         UFS_LOCK(ump);
  231         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
  232                 if (fs->fs_snapinum[snaploc] == 0)
  233                         break;
  234         UFS_UNLOCK(ump);
  235         if (snaploc == FSMAXSNAP)
  236                 return (ENOSPC);
  237         /*
  238          * Create the snapshot file.
  239          */
  240 restart:
  241         NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td);
  242         if ((error = namei(&nd)) != 0)
  243                 return (error);
  244         if (nd.ni_vp != NULL) {
  245                 vput(nd.ni_vp);
  246                 error = EEXIST;
  247         }
  248         if (nd.ni_dvp->v_mount != mp)
  249                 error = EXDEV;
  250         if (error) {
  251                 NDFREE(&nd, NDF_ONLY_PNBUF);
  252                 if (nd.ni_dvp == nd.ni_vp)
  253                         vrele(nd.ni_dvp);
  254                 else
  255                         vput(nd.ni_dvp);
  256                 return (error);
  257         }
  258         VATTR_NULL(&vat);
  259         vat.va_type = VREG;
  260         vat.va_mode = S_IRUSR;
  261         vat.va_vaflags |= VA_EXCLUSIVE;
  262         if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
  263                 wrtmp = NULL;
  264         if (wrtmp != mp)
  265                 panic("ffs_snapshot: mount mismatch");
  266         vfs_rel(wrtmp);
  267         if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
  268                 NDFREE(&nd, NDF_ONLY_PNBUF);
  269                 vput(nd.ni_dvp);
  270                 if ((error = vn_start_write(NULL, &wrtmp,
  271                     V_XSLEEP | PCATCH)) != 0)
  272                         return (error);
  273                 goto restart;
  274         }
  275         VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
  276         error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
  277         VOP_UNLOCK(nd.ni_dvp, 0, td);
  278         if (error) {
  279                 NDFREE(&nd, NDF_ONLY_PNBUF);
  280                 vn_finished_write(wrtmp);
  281                 vrele(nd.ni_dvp);
  282                 return (error);
  283         }
  284         vp = nd.ni_vp;
  285         vp->v_vflag |= VV_SYSTEM;
  286         ip = VTOI(vp);
  287         devvp = ip->i_devvp;
  288         /*
  289          * Allocate and copy the last block contents so as to be able
  290          * to set size to that of the filesystem.
  291          */
  292         numblks = howmany(fs->fs_size, fs->fs_frag);
  293         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
  294             fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
  295         if (error)
  296                 goto out;
  297         ip->i_size = lblktosize(fs, (off_t)numblks);
  298         DIP_SET(ip, i_size, ip->i_size);
  299         ip->i_flag |= IN_CHANGE | IN_UPDATE;
  300         error = readblock(vp, bp, numblks - 1);
  301         bawrite(bp);
  302         if (error != 0)
  303                 goto out;
  304         /*
  305          * Preallocate critical data structures so that we can copy
  306          * them in without further allocation after we suspend all
  307          * operations on the filesystem. We would like to just release
  308          * the allocated buffers without writing them since they will
  309          * be filled in below once we are ready to go, but this upsets
  310          * the soft update code, so we go ahead and write the new buffers.
  311          *
  312          * Allocate all indirect blocks and mark all of them as not
  313          * needing to be copied.
  314          */
  315         for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
  316                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
  317                     fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
  318                 if (error)
  319                         goto out;
  320                 bawrite(ibp);
  321         }
  322         /*
  323          * Allocate copies for the superblock and its summary information.
  324          */
  325         error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
  326             0, &nbp);
  327         if (error)
  328                 goto out;
  329         bawrite(nbp);
  330         blkno = fragstoblks(fs, fs->fs_csaddr);
  331         len = howmany(fs->fs_cssize, fs->fs_bsize);
  332         for (loc = 0; loc < len; loc++) {
  333                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
  334                     fs->fs_bsize, KERNCRED, 0, &nbp);
  335                 if (error)
  336                         goto out;
  337                 bawrite(nbp);
  338         }
  339         /*
  340          * Allocate all cylinder group blocks.
  341          */
  342         for (cg = 0; cg < fs->fs_ncg; cg++) {
  343                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
  344                     fs->fs_bsize, KERNCRED, 0, &nbp);
  345                 if (error)
  346                         goto out;
  347                 bawrite(nbp);
  348         }
  349         /*
  350          * Copy all the cylinder group maps. Although the
  351          * filesystem is still active, we hope that only a few
  352          * cylinder groups will change between now and when we
  353          * suspend operations. Thus, we will be able to quickly
  354          * touch up the few cylinder groups that changed during
  355          * the suspension period.
  356          */
  357         len = howmany(fs->fs_ncg, NBBY);
  358         MALLOC(space, void *, len, M_DEVBUF, M_WAITOK|M_ZERO);
  359         UFS_LOCK(ump);
  360         fs->fs_active = space;
  361         UFS_UNLOCK(ump);
  362         for (cg = 0; cg < fs->fs_ncg; cg++) {
  363                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
  364                     fs->fs_bsize, KERNCRED, 0, &nbp);
  365                 if (error)
  366                         goto out;
  367                 error = cgaccount(cg, vp, nbp, 1);
  368                 bawrite(nbp);
  369                 if (error)
  370                         goto out;
  371         }
  372         /*
  373          * Change inode to snapshot type file.
  374          */
  375         ip->i_flags |= SF_SNAPSHOT;
  376         DIP_SET(ip, i_flags, ip->i_flags);
  377         ip->i_flag |= IN_CHANGE | IN_UPDATE;
  378         /*
  379          * Ensure that the snapshot is completely on disk.
  380          * Since we have marked it as a snapshot it is safe to
  381          * unlock it as no process will be allowed to write to it.
  382          */
  383         if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
  384                 goto out;
  385         VOP_UNLOCK(vp, 0, td);
  386         /*
  387          * All allocations are done, so we can now snapshot the system.
  388          *
  389          * Rescind nice scheduling while running with the filesystem suspended.
  390          */
  391         if (td->td_proc->p_nice > 0) {
  392                 PROC_LOCK(td->td_proc);
  393                 mtx_lock_spin(&sched_lock);
  394                 saved_nice = td->td_proc->p_nice;
  395                 sched_nice(td->td_proc, 0);
  396                 mtx_unlock_spin(&sched_lock);
  397                 PROC_UNLOCK(td->td_proc);
  398         }
  399         /*
  400          * Suspend operation on filesystem.
  401          */
  402         for (;;) {
  403                 vn_finished_write(wrtmp);
  404                 if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
  405                         vn_start_write(NULL, &wrtmp, V_WAIT);
  406                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  407                         goto out;
  408                 }
  409                 if (mp->mnt_kern_flag & MNTK_SUSPENDED)
  410                         break;
  411                 vn_start_write(NULL, &wrtmp, V_WAIT);
  412         }
  413         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  414         if (ip->i_effnlink == 0) {
  415                 error = ENOENT;         /* Snapshot file unlinked */
  416                 goto out1;
  417         }
  418         if (collectsnapstats)
  419                 nanotime(&starttime);
  420 
  421         /* The last block might have changed.  Copy it again to be sure. */
  422         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
  423             fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
  424         if (error != 0)
  425                 goto out1;
  426         error = readblock(vp, bp, numblks - 1);
  427         bp->b_flags |= B_VALIDSUSPWRT;
  428         bawrite(bp);
  429         if (error != 0)
  430                 goto out1;
  431         /*
  432          * First, copy all the cylinder group maps that have changed.
  433          */
  434         for (cg = 0; cg < fs->fs_ncg; cg++) {
  435                 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
  436                         continue;
  437                 redo++;
  438                 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
  439                     fs->fs_bsize, KERNCRED, 0, &nbp);
  440                 if (error)
  441                         goto out1;
  442                 error = cgaccount(cg, vp, nbp, 2);
  443                 bawrite(nbp);
  444                 if (error)
  445                         goto out1;
  446         }
  447         /*
  448          * Grab a copy of the superblock and its summary information.
  449          * We delay writing it until the suspension is released below.
  450          */
  451         error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
  452             KERNCRED, &sbp);
  453         if (error) {
  454                 brelse(sbp);
  455                 sbp = NULL;
  456                 goto out1;
  457         }
  458         loc = blkoff(fs, fs->fs_sblockloc);
  459         copy_fs = (struct fs *)(sbp->b_data + loc);
  460         bcopy(fs, copy_fs, fs->fs_sbsize);
  461         if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
  462                 copy_fs->fs_clean = 1;
  463         size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
  464         if (fs->fs_sbsize < size)
  465                 bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
  466         size = blkroundup(fs, fs->fs_cssize);
  467         if (fs->fs_contigsumsize > 0)
  468                 size += fs->fs_ncg * sizeof(int32_t);
  469         space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
  470         copy_fs->fs_csp = space;
  471         bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
  472         space = (char *)space + fs->fs_cssize;
  473         loc = howmany(fs->fs_cssize, fs->fs_fsize);
  474         i = fs->fs_frag - loc % fs->fs_frag;
  475         len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
  476         if (len > 0) {
  477                 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
  478                     len, KERNCRED, &bp)) != 0) {
  479                         brelse(bp);
  480                         free(copy_fs->fs_csp, M_UFSMNT);
  481                         bawrite(sbp);
  482                         sbp = NULL;
  483                         goto out1;
  484                 }
  485                 bcopy(bp->b_data, space, (u_int)len);
  486                 space = (char *)space + len;
  487                 bp->b_flags |= B_INVAL | B_NOCACHE;
  488                 brelse(bp);
  489         }
  490         if (fs->fs_contigsumsize > 0) {
  491                 copy_fs->fs_maxcluster = lp = space;
  492                 for (i = 0; i < fs->fs_ncg; i++)
  493                         *lp++ = fs->fs_contigsumsize;
  494         }
  495         /*
  496          * We must check for active files that have been unlinked
  497          * (e.g., with a zero link count). We have to expunge all
  498          * trace of these files from the snapshot so that they are
  499          * not reclaimed prematurely by fsck or unnecessarily dumped.
  500          * We turn off the MNTK_SUSPENDED flag to avoid a panic from
  501          * spec_strategy about writing on a suspended filesystem.
  502          * Note that we skip unlinked snapshot files as they will
  503          * be handled separately below.
  504          *
  505          * We also calculate the needed size for the snapshot list.
  506          */
  507         snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
  508             FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
  509         MNT_ILOCK(mp);
  510         mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
  511 loop:
  512         MNT_VNODE_FOREACH(xvp, mp, mvp) {
  513                 VI_LOCK(xvp);
  514                 MNT_IUNLOCK(mp);
  515                 if ((xvp->v_iflag & VI_DOOMED) ||
  516                     (xvp->v_usecount == 0 &&
  517                      (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
  518                     xvp->v_type == VNON ||
  519                     (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
  520                         VI_UNLOCK(xvp);
  521                         MNT_ILOCK(mp);
  522                         continue;
  523                 }
  524                 /*
  525                  * We can skip parent directory vnode because it must have
  526                  * this snapshot file in it.
  527                  */
  528                 if (xvp == nd.ni_dvp) {
  529                         VI_UNLOCK(xvp);
  530                         MNT_ILOCK(mp);
  531                         continue;
  532                 }
  533                 vholdl(xvp);
  534                 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) {
  535                         MNT_ILOCK(mp);
  536                         MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
  537                         vdrop(xvp);
  538                         goto loop;
  539                 }
  540                 VI_LOCK(xvp);
  541                 if (xvp->v_usecount == 0 &&
  542                     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
  543                         VI_UNLOCK(xvp);
  544                         VOP_UNLOCK(xvp, 0, td);
  545                         vdrop(xvp);
  546                         MNT_ILOCK(mp);
  547                         continue;
  548                 }
  549                 VI_UNLOCK(xvp);
  550                 if (snapdebug)
  551                         vprint("ffs_snapshot: busy vnode", xvp);
  552                 if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 &&
  553                     vat.va_nlink > 0) {
  554                         VOP_UNLOCK(xvp, 0, td);
  555                         vdrop(xvp);
  556                         MNT_ILOCK(mp);
  557                         continue;
  558                 }
  559                 xp = VTOI(xvp);
  560                 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
  561                         VOP_UNLOCK(xvp, 0, td);
  562                         vdrop(xvp);
  563                         MNT_ILOCK(mp);
  564                         continue;
  565                 }
  566                 /*
  567                  * If there is a fragment, clear it here.
  568                  */
  569                 blkno = 0;
  570                 loc = howmany(xp->i_size, fs->fs_bsize) - 1;
  571                 if (loc < NDADDR) {
  572                         len = fragroundup(fs, blkoff(fs, xp->i_size));
  573                         if (len != 0 && len < fs->fs_bsize) {
  574                                 ffs_blkfree(ump, copy_fs, vp,
  575                                     DIP(xp, i_db[loc]), len, xp->i_number);
  576                                 blkno = DIP(xp, i_db[loc]);
  577                                 DIP_SET(xp, i_db[loc], 0);
  578                         }
  579                 }
  580                 snaplistsize += 1;
  581                 if (xp->i_ump->um_fstype == UFS1)
  582                         error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
  583                             BLK_NOCOPY);
  584                 else
  585                         error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
  586                             BLK_NOCOPY);
  587                 if (blkno)
  588                         DIP_SET(xp, i_db[loc], blkno);
  589                 if (!error)
  590                         error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
  591                             xp->i_mode);
  592                 VOP_UNLOCK(xvp, 0, td);
  593                 vdrop(xvp);
  594                 if (error) {
  595                         free(copy_fs->fs_csp, M_UFSMNT);
  596                         bawrite(sbp);
  597                         sbp = NULL;
  598                         MNT_VNODE_FOREACH_ABORT(mp, mvp);
  599                         goto out1;
  600                 }
  601                 MNT_ILOCK(mp);
  602         }
  603         MNT_IUNLOCK(mp);
  604         /*
  605          * If there already exist snapshots on this filesystem, grab a
  606          * reference to their shared lock. If this is the first snapshot
  607          * on this filesystem, we need to allocate a lock for the snapshots
  608          * to share. In either case, acquire the snapshot lock and give
  609          * up our original private lock.
  610          */
  611         VI_LOCK(devvp);
  612         sn = devvp->v_rdev->si_snapdata;
  613         if (sn != NULL) {
  614                 xp = TAILQ_FIRST(&sn->sn_head);
  615                 VI_UNLOCK(devvp);
  616                 VI_LOCK(vp);
  617                 vp->v_vnlock = &sn->sn_lock;
  618         } else {
  619                 VI_UNLOCK(devvp);
  620                 sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
  621                 TAILQ_INIT(&sn->sn_head);
  622                 lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
  623                     LK_CANRECURSE | LK_NOSHARE);
  624                 VI_LOCK(vp);
  625                 vp->v_vnlock = &sn->sn_lock;
  626                 mp_fixme("si_snapdata setting is racey.");
  627                 devvp->v_rdev->si_snapdata = sn;
  628                 xp = NULL;
  629         }
  630         lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
  631             VI_MTX(vp), td);
  632         lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
  633         /*
  634          * If this is the first snapshot on this filesystem, then we need
  635          * to allocate the space for the list of preallocated snapshot blocks.
  636          * This list will be refined below, but this preliminary one will
  637          * keep us out of deadlock until the full one is ready.
  638          */
  639         if (xp == NULL) {
  640                 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
  641                     M_UFSMNT, M_WAITOK);
  642                 blkp = &snapblklist[1];
  643                 *blkp++ = lblkno(fs, fs->fs_sblockloc);
  644                 blkno = fragstoblks(fs, fs->fs_csaddr);
  645                 for (cg = 0; cg < fs->fs_ncg; cg++) {
  646                         if (fragstoblks(fs, cgtod(fs, cg) > blkno))
  647                                 break;
  648                         *blkp++ = fragstoblks(fs, cgtod(fs, cg));
  649                 }
  650                 len = howmany(fs->fs_cssize, fs->fs_bsize);
  651                 for (loc = 0; loc < len; loc++)
  652                         *blkp++ = blkno + loc;
  653                 for (; cg < fs->fs_ncg; cg++)
  654                         *blkp++ = fragstoblks(fs, cgtod(fs, cg));
  655                 snapblklist[0] = blkp - snapblklist;
  656                 VI_LOCK(devvp);
  657                 if (sn->sn_blklist != NULL)
  658                         panic("ffs_snapshot: non-empty list");
  659                 sn->sn_blklist = snapblklist;
  660                 sn->sn_listsize = blkp - snapblklist;
  661                 VI_UNLOCK(devvp);
  662         }
  663         /*
  664          * Record snapshot inode. Since this is the newest snapshot,
  665          * it must be placed at the end of the list.
  666          */
  667         VI_LOCK(devvp);
  668         fs->fs_snapinum[snaploc] = ip->i_number;
  669         if (ip->i_nextsnap.tqe_prev != 0)
  670                 panic("ffs_snapshot: %d already on list", ip->i_number);
  671         TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
  672         devvp->v_vflag |= VV_COPYONWRITE;
  673         VI_UNLOCK(devvp);
  674         ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
  675 out1:
  676         KASSERT((sn != NULL && sbp != NULL && error == 0) ||
  677                 (sn == NULL && sbp == NULL && error != 0),
  678                 ("email phk@ and mckusick@"));
  679         /*
  680          * Resume operation on filesystem.
  681          */
  682         vfs_write_resume(vp->v_mount);
  683         vn_start_write(NULL, &wrtmp, V_WAIT);
  684         if (collectsnapstats && starttime.tv_sec > 0) {
  685                 nanotime(&endtime);
  686                 timespecsub(&endtime, &starttime);
  687                 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
  688                     vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
  689                     endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
  690         }
  691         if (sbp == NULL)
  692                 goto out;
  693         /*
  694          * Copy allocation information from all the snapshots in
  695          * this snapshot and then expunge them from its view.
  696          */
  697         TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
  698                 if (xp == ip)
  699                         break;
  700                 if (xp->i_ump->um_fstype == UFS1)
  701                         error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
  702                             BLK_SNAP);
  703                 else
  704                         error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
  705                             BLK_SNAP);
  706                 if (error == 0 && xp->i_effnlink == 0) {
  707                         error = ffs_freefile(ump,
  708                                              copy_fs,
  709                                              vp,
  710                                              xp->i_number,
  711                                              xp->i_mode);
  712                 }
  713                 if (error) {
  714                         fs->fs_snapinum[snaploc] = 0;
  715                         goto done;
  716                 }
  717         }
  718         /*
  719          * Allocate space for the full list of preallocated snapshot blocks.
  720          */
  721         MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
  722             M_UFSMNT, M_WAITOK);
  723         ip->i_snapblklist = &snapblklist[1];
  724         /*
  725          * Expunge the blocks used by the snapshots from the set of
  726          * blocks marked as used in the snapshot bitmaps. Also, collect
  727          * the list of allocated blocks in i_snapblklist.
  728          */
  729         if (ip->i_ump->um_fstype == UFS1)
  730                 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
  731         else
  732                 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
  733         if (error) {
  734                 fs->fs_snapinum[snaploc] = 0;
  735                 FREE(snapblklist, M_UFSMNT);
  736                 goto done;
  737         }
  738         if (snaplistsize < ip->i_snapblklist - snapblklist)
  739                 panic("ffs_snapshot: list too small");
  740         snaplistsize = ip->i_snapblklist - snapblklist;
  741         snapblklist[0] = snaplistsize;
  742         ip->i_snapblklist = 0;
  743         /*
  744          * Write out the list of allocated blocks to the end of the snapshot.
  745          */
  746         auio.uio_iov = &aiov;
  747         auio.uio_iovcnt = 1;
  748         aiov.iov_base = (void *)snapblklist;
  749         aiov.iov_len = snaplistsize * sizeof(daddr_t);
  750         auio.uio_resid = aiov.iov_len;;
  751         auio.uio_offset = ip->i_size;
  752         auio.uio_segflg = UIO_SYSSPACE;
  753         auio.uio_rw = UIO_WRITE;
  754         auio.uio_td = td;
  755         if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
  756                 fs->fs_snapinum[snaploc] = 0;
  757                 FREE(snapblklist, M_UFSMNT);
  758                 goto done;
  759         }
  760         /*
  761          * Write the superblock and its summary information
  762          * to the snapshot.
  763          */
  764         blkno = fragstoblks(fs, fs->fs_csaddr);
  765         len = howmany(fs->fs_cssize, fs->fs_bsize);
  766         space = copy_fs->fs_csp;
  767         for (loc = 0; loc < len; loc++) {
  768                 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
  769                 if (error) {
  770                         brelse(nbp);
  771                         fs->fs_snapinum[snaploc] = 0;
  772                         FREE(snapblklist, M_UFSMNT);
  773                         goto done;
  774                 }
  775                 bcopy(space, nbp->b_data, fs->fs_bsize);
  776                 space = (char *)space + fs->fs_bsize;
  777                 bawrite(nbp);
  778         }
  779         /*
  780          * As this is the newest list, it is the most inclusive, so
  781          * should replace the previous list.
  782          */
  783         VI_LOCK(devvp);
  784         space = sn->sn_blklist;
  785         sn->sn_blklist = snapblklist;
  786         sn->sn_listsize = snaplistsize;
  787         VI_UNLOCK(devvp);
  788         if (space != NULL)
  789                 FREE(space, M_UFSMNT);
  790         /*
  791          * If another process is currently writing the buffer containing
  792          * the inode for this snapshot then a deadlock can occur. Drop
  793          * the snapshot lock until the buffer has been written.
  794          */
  795         VREF(vp);       /* Protect against ffs_snapgone() */
  796         VOP_UNLOCK(vp, 0, td);
  797         (void) bread(ip->i_devvp,
  798                      fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
  799                      (int) fs->fs_bsize, NOCRED, &nbp);
  800         brelse(nbp);
  801         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  802         if (ip->i_effnlink == 0)
  803                 error = ENOENT;         /* Snapshot file unlinked */
  804         else
  805                 vrele(vp);              /* Drop extra reference */
  806 done:
  807         FREE(copy_fs->fs_csp, M_UFSMNT);
  808         bawrite(sbp);
  809 out:
  810         NDFREE(&nd, NDF_ONLY_PNBUF);
  811         if (saved_nice > 0) {
  812                 PROC_LOCK(td->td_proc);
  813                 mtx_lock_spin(&sched_lock);
  814                 sched_nice(td->td_proc, saved_nice);
  815                 mtx_unlock_spin(&sched_lock);
  816                 PROC_UNLOCK(td->td_proc);
  817         }
  818         UFS_LOCK(ump);
  819         if (fs->fs_active != 0) {
  820                 FREE(fs->fs_active, M_DEVBUF);
  821                 fs->fs_active = 0;
  822         }
  823         UFS_UNLOCK(ump);
  824         MNT_ILOCK(mp);
  825         mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
  826         MNT_IUNLOCK(mp);
  827         if (error)
  828                 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
  829         (void) ffs_syncvnode(vp, MNT_WAIT);
  830         if (error)
  831                 vput(vp);
  832         else
  833                 VOP_UNLOCK(vp, 0, td);
  834         vrele(nd.ni_dvp);
  835         vn_finished_write(wrtmp);
  836         process_deferred_inactive(mp);
  837         return (error);
  838 }
  839 
  840 /*
  841  * Copy a cylinder group map. All the unallocated blocks are marked
  842  * BLK_NOCOPY so that the snapshot knows that it need not copy them
  843  * if they are later written. If passno is one, then this is a first
  844  * pass, so only setting needs to be done. If passno is 2, then this
  845  * is a revision to a previous pass which must be undone as the
  846  * replacement pass is done.
  847  */
  848 static int
  849 cgaccount(cg, vp, nbp, passno)
  850         int cg;
  851         struct vnode *vp;
  852         struct buf *nbp;
  853         int passno;
  854 {
  855         struct buf *bp, *ibp;
  856         struct inode *ip;
  857         struct cg *cgp;
  858         struct fs *fs;
  859         ufs2_daddr_t base, numblks;
  860         int error, len, loc, indiroff;
  861 
  862         ip = VTOI(vp);
  863         fs = ip->i_fs;
  864         error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
  865                 (int)fs->fs_cgsize, KERNCRED, &bp);
  866         if (error) {
  867                 brelse(bp);
  868                 return (error);
  869         }
  870         cgp = (struct cg *)bp->b_data;
  871         if (!cg_chkmagic(cgp)) {
  872                 brelse(bp);
  873                 return (EIO);
  874         }
  875         UFS_LOCK(ip->i_ump);
  876         ACTIVESET(fs, cg);
  877         UFS_UNLOCK(ip->i_ump);
  878         bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
  879         if (fs->fs_cgsize < fs->fs_bsize)
  880                 bzero(&nbp->b_data[fs->fs_cgsize],
  881                     fs->fs_bsize - fs->fs_cgsize);
  882         cgp = (struct cg *)nbp->b_data;
  883         bqrelse(bp);
  884         if (passno == 2)
  885                 nbp->b_flags |= B_VALIDSUSPWRT;
  886         numblks = howmany(fs->fs_size, fs->fs_frag);
  887         len = howmany(fs->fs_fpg, fs->fs_frag);
  888         base = cgbase(fs, cg) / fs->fs_frag;
  889         if (base + len >= numblks)
  890                 len = numblks - base - 1;
  891         loc = 0;
  892         if (base < NDADDR) {
  893                 for ( ; loc < NDADDR; loc++) {
  894                         if (ffs_isblock(fs, cg_blksfree(cgp), loc))
  895                                 DIP_SET(ip, i_db[loc], BLK_NOCOPY);
  896                         else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
  897                                 DIP_SET(ip, i_db[loc], 0);
  898                         else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
  899                                 panic("ffs_snapshot: lost direct block");
  900                 }
  901         }
  902         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
  903             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
  904         if (error) {
  905                 return (error);
  906         }
  907         indiroff = (base + loc - NDADDR) % NINDIR(fs);
  908         for ( ; loc < len; loc++, indiroff++) {
  909                 if (indiroff >= NINDIR(fs)) {
  910                         if (passno == 2)
  911                                 ibp->b_flags |= B_VALIDSUSPWRT;
  912                         bawrite(ibp);
  913                         error = UFS_BALLOC(vp,
  914                             lblktosize(fs, (off_t)(base + loc)),
  915                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
  916                         if (error) {
  917                                 return (error);
  918                         }
  919                         indiroff = 0;
  920                 }
  921                 if (ip->i_ump->um_fstype == UFS1) {
  922                         if (ffs_isblock(fs, cg_blksfree(cgp), loc))
  923                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
  924                                     BLK_NOCOPY;
  925                         else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
  926                             [indiroff] == BLK_NOCOPY)
  927                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
  928                         else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
  929                             [indiroff] == BLK_NOCOPY)
  930                                 panic("ffs_snapshot: lost indirect block");
  931                         continue;
  932                 }
  933                 if (ffs_isblock(fs, cg_blksfree(cgp), loc))
  934                         ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
  935                 else if (passno == 2 &&
  936                     ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
  937                         ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
  938                 else if (passno == 1 &&
  939                     ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
  940                         panic("ffs_snapshot: lost indirect block");
  941         }
  942         if (passno == 2)
  943                 ibp->b_flags |= B_VALIDSUSPWRT;
  944         bdwrite(ibp);
  945         return (0);
  946 }
  947 
  948 /*
  949  * Before expunging a snapshot inode, note all the
  950  * blocks that it claims with BLK_SNAP so that fsck will
  951  * be able to account for those blocks properly and so
  952  * that this snapshot knows that it need not copy them
  953  * if the other snapshot holding them is freed. This code
  954  * is reproduced once each for UFS1 and UFS2.
  955  */
  956 static int
  957 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
  958         struct vnode *snapvp;
  959         struct inode *cancelip;
  960         struct fs *fs;
  961         int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
  962             struct fs *, ufs_lbn_t, int);
  963         int expungetype;
  964 {
  965         int i, error, indiroff;
  966         ufs_lbn_t lbn, rlbn;
  967         ufs2_daddr_t len, blkno, numblks, blksperindir;
  968         struct ufs1_dinode *dip;
  969         struct thread *td = curthread;
  970         struct buf *bp;
  971 
  972         /*
  973          * Prepare to expunge the inode. If its inode block has not
  974          * yet been copied, then allocate and fill the copy.
  975          */
  976         lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
  977         blkno = 0;
  978         if (lbn < NDADDR) {
  979                 blkno = VTOI(snapvp)->i_din1->di_db[lbn];
  980         } else {
  981                 td->td_pflags |= TDP_COWINPROGRESS;
  982                 error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
  983                    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
  984                 td->td_pflags &= ~TDP_COWINPROGRESS;
  985                 if (error)
  986                         return (error);
  987                 indiroff = (lbn - NDADDR) % NINDIR(fs);
  988                 blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
  989                 bqrelse(bp);
  990         }
  991         if (blkno != 0) {
  992                 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
  993                         return (error);
  994         } else {
  995                 error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
  996                     fs->fs_bsize, KERNCRED, 0, &bp);
  997                 if (error)
  998                         return (error);
  999                 if ((error = readblock(snapvp, bp, lbn)) != 0)
 1000                         return (error);
 1001         }
 1002         /*
 1003          * Set a snapshot inode to be a zero length file, regular files
 1004          * or unlinked snapshots to be completely unallocated.
 1005          */
 1006         dip = (struct ufs1_dinode *)bp->b_data +
 1007             ino_to_fsbo(fs, cancelip->i_number);
 1008         if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0)
 1009                 dip->di_mode = 0;
 1010         dip->di_size = 0;
 1011         dip->di_blocks = 0;
 1012         dip->di_flags &= ~SF_SNAPSHOT;
 1013         bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
 1014         bdwrite(bp);
 1015         /*
 1016          * Now go through and expunge all the blocks in the file
 1017          * using the function requested.
 1018          */
 1019         numblks = howmany(cancelip->i_size, fs->fs_bsize);
 1020         if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
 1021             &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
 1022                 return (error);
 1023         if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
 1024             &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
 1025                 return (error);
 1026         blksperindir = 1;
 1027         lbn = -NDADDR;
 1028         len = numblks - NDADDR;
 1029         rlbn = NDADDR;
 1030         for (i = 0; len > 0 && i < NIADDR; i++) {
 1031                 error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
 1032                     cancelip->i_din1->di_ib[i], lbn, rlbn, len,
 1033                     blksperindir, fs, acctfunc, expungetype);
 1034                 if (error)
 1035                         return (error);
 1036                 blksperindir *= NINDIR(fs);
 1037                 lbn -= blksperindir + 1;
 1038                 len -= blksperindir;
 1039                 rlbn += blksperindir;
 1040         }
 1041         return (0);
 1042 }
 1043 
 1044 /*
 1045  * Descend an indirect block chain for vnode cancelvp accounting for all
 1046  * its indirect blocks in snapvp.
 1047  */ 
 1048 static int
 1049 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 1050             blksperindir, fs, acctfunc, expungetype)
 1051         struct vnode *snapvp;
 1052         struct vnode *cancelvp;
 1053         int level;
 1054         ufs1_daddr_t blkno;
 1055         ufs_lbn_t lbn;
 1056         ufs_lbn_t rlbn;
 1057         ufs_lbn_t remblks;
 1058         ufs_lbn_t blksperindir;
 1059         struct fs *fs;
 1060         int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 1061             struct fs *, ufs_lbn_t, int);
 1062         int expungetype;
 1063 {
 1064         int error, num, i;
 1065         ufs_lbn_t subblksperindir;
 1066         struct indir indirs[NIADDR + 2];
 1067         ufs1_daddr_t last, *bap;
 1068         struct buf *bp;
 1069 
 1070         if (blkno == 0) {
 1071                 if (expungetype == BLK_NOCOPY)
 1072                         return (0);
 1073                 panic("indiracct_ufs1: missing indir");
 1074         }
 1075         if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 1076                 return (error);
 1077         if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
 1078                 panic("indiracct_ufs1: botched params");
 1079         /*
 1080          * We have to expand bread here since it will deadlock looking
 1081          * up the block number for any blocks that are not in the cache.
 1082          */
 1083         bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 1084         bp->b_blkno = fsbtodb(fs, blkno);
 1085         if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 1086             (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
 1087                 brelse(bp);
 1088                 return (error);
 1089         }
 1090         /*
 1091          * Account for the block pointers in this indirect block.
 1092          */
 1093         last = howmany(remblks, blksperindir);
 1094         if (last > NINDIR(fs))
 1095                 last = NINDIR(fs);
 1096         MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 1097         bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 1098         bqrelse(bp);
 1099         error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 1100             level == 0 ? rlbn : -1, expungetype);
 1101         if (error || level == 0)
 1102                 goto out;
 1103         /*
 1104          * Account for the block pointers in each of the indirect blocks
 1105          * in the levels below us.
 1106          */
 1107         subblksperindir = blksperindir / NINDIR(fs);
 1108         for (lbn++, level--, i = 0; i < last; i++) {
 1109                 error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
 1110                     rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 1111                 if (error)
 1112                         goto out;
 1113                 rlbn += blksperindir;
 1114                 lbn -= blksperindir;
 1115                 remblks -= blksperindir;
 1116         }
 1117 out:
 1118         FREE(bap, M_DEVBUF);
 1119         return (error);
 1120 }
 1121 
 1122 /*
 1123  * Do both snap accounting and map accounting.
 1124  */
 1125 static int
 1126 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 1127         struct vnode *vp;
 1128         ufs1_daddr_t *oldblkp, *lastblkp;
 1129         struct fs *fs;
 1130         ufs_lbn_t lblkno;
 1131         int exptype;    /* BLK_SNAP or BLK_NOCOPY */
 1132 {
 1133         int error;
 1134 
 1135         if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
 1136                 return (error);
 1137         return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
 1138 }
 1139 
 1140 /*
 1141  * Identify a set of blocks allocated in a snapshot inode.
 1142  */
 1143 static int
 1144 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 1145         struct vnode *vp;
 1146         ufs1_daddr_t *oldblkp, *lastblkp;
 1147         struct fs *fs;
 1148         ufs_lbn_t lblkno;
 1149         int expungetype;        /* BLK_SNAP or BLK_NOCOPY */
 1150 {
 1151         struct inode *ip = VTOI(vp);
 1152         ufs1_daddr_t blkno, *blkp;
 1153         ufs_lbn_t lbn;
 1154         struct buf *ibp;
 1155         int error;
 1156 
 1157         for ( ; oldblkp < lastblkp; oldblkp++) {
 1158                 blkno = *oldblkp;
 1159                 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 1160                         continue;
 1161                 lbn = fragstoblks(fs, blkno);
 1162                 if (lbn < NDADDR) {
 1163                         blkp = &ip->i_din1->di_db[lbn];
 1164                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1165                 } else {
 1166                         error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
 1167                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 1168                         if (error)
 1169                                 return (error);
 1170                         blkp = &((ufs1_daddr_t *)(ibp->b_data))
 1171                             [(lbn - NDADDR) % NINDIR(fs)];
 1172                 }
 1173                 /*
 1174                  * If we are expunging a snapshot vnode and we
 1175                  * find a block marked BLK_NOCOPY, then it is
 1176                  * one that has been allocated to this snapshot after
 1177                  * we took our current snapshot and can be ignored.
 1178                  */
 1179                 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 1180                         if (lbn >= NDADDR)
 1181                                 brelse(ibp);
 1182                 } else {
 1183                         if (*blkp != 0)
 1184                                 panic("snapacct_ufs1: bad block");
 1185                         *blkp = expungetype;
 1186                         if (lbn >= NDADDR)
 1187                                 bdwrite(ibp);
 1188                 }
 1189         }
 1190         return (0);
 1191 }
 1192 
 1193 /*
 1194  * Account for a set of blocks allocated in a snapshot inode.
 1195  */
 1196 static int
 1197 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 1198         struct vnode *vp;
 1199         ufs1_daddr_t *oldblkp, *lastblkp;
 1200         struct fs *fs;
 1201         ufs_lbn_t lblkno;
 1202         int expungetype;
 1203 {
 1204         ufs1_daddr_t blkno;
 1205         struct inode *ip;
 1206         ino_t inum;
 1207         int acctit;
 1208 
 1209         ip = VTOI(vp);
 1210         inum = ip->i_number;
 1211         if (lblkno == -1)
 1212                 acctit = 0;
 1213         else
 1214                 acctit = 1;
 1215         for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
 1216                 blkno = *oldblkp;
 1217                 if (blkno == 0 || blkno == BLK_NOCOPY)
 1218                         continue;
 1219                 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
 1220                         *ip->i_snapblklist++ = lblkno;
 1221                 if (blkno == BLK_SNAP)
 1222                         blkno = blkstofrags(fs, lblkno);
 1223                 ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
 1224         }
 1225         return (0);
 1226 }
 1227 
 1228 /*
 1229  * Before expunging a snapshot inode, note all the
 1230  * blocks that it claims with BLK_SNAP so that fsck will
 1231  * be able to account for those blocks properly and so
 1232  * that this snapshot knows that it need not copy them
 1233  * if the other snapshot holding them is freed. This code
 1234  * is reproduced once each for UFS1 and UFS2.
 1235  */
 1236 static int
 1237 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
 1238         struct vnode *snapvp;
 1239         struct inode *cancelip;
 1240         struct fs *fs;
 1241         int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 1242             struct fs *, ufs_lbn_t, int);
 1243         int expungetype;
 1244 {
 1245         int i, error, indiroff;
 1246         ufs_lbn_t lbn, rlbn;
 1247         ufs2_daddr_t len, blkno, numblks, blksperindir;
 1248         struct ufs2_dinode *dip;
 1249         struct thread *td = curthread;
 1250         struct buf *bp;
 1251 
 1252         /*
 1253          * Prepare to expunge the inode. If its inode block has not
 1254          * yet been copied, then allocate and fill the copy.
 1255          */
 1256         lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 1257         blkno = 0;
 1258         if (lbn < NDADDR) {
 1259                 blkno = VTOI(snapvp)->i_din2->di_db[lbn];
 1260         } else {
 1261                 td->td_pflags |= TDP_COWINPROGRESS;
 1262                 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 1263                    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 1264                 td->td_pflags &= ~TDP_COWINPROGRESS;
 1265                 if (error)
 1266                         return (error);
 1267                 indiroff = (lbn - NDADDR) % NINDIR(fs);
 1268                 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
 1269                 bqrelse(bp);
 1270         }
 1271         if (blkno != 0) {
 1272                 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 1273                         return (error);
 1274         } else {
 1275                 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 1276                     fs->fs_bsize, KERNCRED, 0, &bp);
 1277                 if (error)
 1278                         return (error);
 1279                 if ((error = readblock(snapvp, bp, lbn)) != 0)
 1280                         return (error);
 1281         }
 1282         /*
 1283          * Set a snapshot inode to be a zero length file, regular files
 1284          * to be completely unallocated.
 1285          */
 1286         dip = (struct ufs2_dinode *)bp->b_data +
 1287             ino_to_fsbo(fs, cancelip->i_number);
 1288         if (expungetype == BLK_NOCOPY)
 1289                 dip->di_mode = 0;
 1290         dip->di_size = 0;
 1291         dip->di_blocks = 0;
 1292         dip->di_flags &= ~SF_SNAPSHOT;
 1293         bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
 1294         bdwrite(bp);
 1295         /*
 1296          * Now go through and expunge all the blocks in the file
 1297          * using the function requested.
 1298          */
 1299         numblks = howmany(cancelip->i_size, fs->fs_bsize);
 1300         if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
 1301             &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
 1302                 return (error);
 1303         if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
 1304             &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
 1305                 return (error);
 1306         blksperindir = 1;
 1307         lbn = -NDADDR;
 1308         len = numblks - NDADDR;
 1309         rlbn = NDADDR;
 1310         for (i = 0; len > 0 && i < NIADDR; i++) {
 1311                 error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
 1312                     cancelip->i_din2->di_ib[i], lbn, rlbn, len,
 1313                     blksperindir, fs, acctfunc, expungetype);
 1314                 if (error)
 1315                         return (error);
 1316                 blksperindir *= NINDIR(fs);
 1317                 lbn -= blksperindir + 1;
 1318                 len -= blksperindir;
 1319                 rlbn += blksperindir;
 1320         }
 1321         return (0);
 1322 }
 1323 
 1324 /*
 1325  * Descend an indirect block chain for vnode cancelvp accounting for all
 1326  * its indirect blocks in snapvp.
 1327  */ 
 1328 static int
 1329 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 1330             blksperindir, fs, acctfunc, expungetype)
 1331         struct vnode *snapvp;
 1332         struct vnode *cancelvp;
 1333         int level;
 1334         ufs2_daddr_t blkno;
 1335         ufs_lbn_t lbn;
 1336         ufs_lbn_t rlbn;
 1337         ufs_lbn_t remblks;
 1338         ufs_lbn_t blksperindir;
 1339         struct fs *fs;
 1340         int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 1341             struct fs *, ufs_lbn_t, int);
 1342         int expungetype;
 1343 {
 1344         int error, num, i;
 1345         ufs_lbn_t subblksperindir;
 1346         struct indir indirs[NIADDR + 2];
 1347         ufs2_daddr_t last, *bap;
 1348         struct buf *bp;
 1349 
 1350         if (blkno == 0) {
 1351                 if (expungetype == BLK_NOCOPY)
 1352                         return (0);
 1353                 panic("indiracct_ufs2: missing indir");
 1354         }
 1355         if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 1356                 return (error);
 1357         if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
 1358                 panic("indiracct_ufs2: botched params");
 1359         /*
 1360          * We have to expand bread here since it will deadlock looking
 1361          * up the block number for any blocks that are not in the cache.
 1362          */
 1363         bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 1364         bp->b_blkno = fsbtodb(fs, blkno);
 1365         if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 1366             (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
 1367                 brelse(bp);
 1368                 return (error);
 1369         }
 1370         /*
 1371          * Account for the block pointers in this indirect block.
 1372          */
 1373         last = howmany(remblks, blksperindir);
 1374         if (last > NINDIR(fs))
 1375                 last = NINDIR(fs);
 1376         MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 1377         bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 1378         bqrelse(bp);
 1379         error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 1380             level == 0 ? rlbn : -1, expungetype);
 1381         if (error || level == 0)
 1382                 goto out;
 1383         /*
 1384          * Account for the block pointers in each of the indirect blocks
 1385          * in the levels below us.
 1386          */
 1387         subblksperindir = blksperindir / NINDIR(fs);
 1388         for (lbn++, level--, i = 0; i < last; i++) {
 1389                 error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
 1390                     rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 1391                 if (error)
 1392                         goto out;
 1393                 rlbn += blksperindir;
 1394                 lbn -= blksperindir;
 1395                 remblks -= blksperindir;
 1396         }
 1397 out:
 1398         FREE(bap, M_DEVBUF);
 1399         return (error);
 1400 }
 1401 
 1402 /*
 1403  * Do both snap accounting and map accounting.
 1404  */
 1405 static int
 1406 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 1407         struct vnode *vp;
 1408         ufs2_daddr_t *oldblkp, *lastblkp;
 1409         struct fs *fs;
 1410         ufs_lbn_t lblkno;
 1411         int exptype;    /* BLK_SNAP or BLK_NOCOPY */
 1412 {
 1413         int error;
 1414 
 1415         if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
 1416                 return (error);
 1417         return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
 1418 }
 1419 
 1420 /*
 1421  * Identify a set of blocks allocated in a snapshot inode.
 1422  */
 1423 static int
 1424 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 1425         struct vnode *vp;
 1426         ufs2_daddr_t *oldblkp, *lastblkp;
 1427         struct fs *fs;
 1428         ufs_lbn_t lblkno;
 1429         int expungetype;        /* BLK_SNAP or BLK_NOCOPY */
 1430 {
 1431         struct inode *ip = VTOI(vp);
 1432         ufs2_daddr_t blkno, *blkp;
 1433         ufs_lbn_t lbn;
 1434         struct buf *ibp;
 1435         int error;
 1436 
 1437         for ( ; oldblkp < lastblkp; oldblkp++) {
 1438                 blkno = *oldblkp;
 1439                 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 1440                         continue;
 1441                 lbn = fragstoblks(fs, blkno);
 1442                 if (lbn < NDADDR) {
 1443                         blkp = &ip->i_din2->di_db[lbn];
 1444                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1445                 } else {
 1446                         error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
 1447                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 1448                         if (error)
 1449                                 return (error);
 1450                         blkp = &((ufs2_daddr_t *)(ibp->b_data))
 1451                             [(lbn - NDADDR) % NINDIR(fs)];
 1452                 }
 1453                 /*
 1454                  * If we are expunging a snapshot vnode and we
 1455                  * find a block marked BLK_NOCOPY, then it is
 1456                  * one that has been allocated to this snapshot after
 1457                  * we took our current snapshot and can be ignored.
 1458                  */
 1459                 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 1460                         if (lbn >= NDADDR)
 1461                                 brelse(ibp);
 1462                 } else {
 1463                         if (*blkp != 0)
 1464                                 panic("snapacct_ufs2: bad block");
 1465                         *blkp = expungetype;
 1466                         if (lbn >= NDADDR)
 1467                                 bdwrite(ibp);
 1468                 }
 1469         }
 1470         return (0);
 1471 }
 1472 
 1473 /*
 1474  * Account for a set of blocks allocated in a snapshot inode.
 1475  */
 1476 static int
 1477 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 1478         struct vnode *vp;
 1479         ufs2_daddr_t *oldblkp, *lastblkp;
 1480         struct fs *fs;
 1481         ufs_lbn_t lblkno;
 1482         int expungetype;
 1483 {
 1484         ufs2_daddr_t blkno;
 1485         struct inode *ip;
 1486         ino_t inum;
 1487         int acctit;
 1488 
 1489         ip = VTOI(vp);
 1490         inum = ip->i_number;
 1491         if (lblkno == -1)
 1492                 acctit = 0;
 1493         else
 1494                 acctit = 1;
 1495         for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
 1496                 blkno = *oldblkp;
 1497                 if (blkno == 0 || blkno == BLK_NOCOPY)
 1498                         continue;
 1499                 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
 1500                         *ip->i_snapblklist++ = lblkno;
 1501                 if (blkno == BLK_SNAP)
 1502                         blkno = blkstofrags(fs, lblkno);
 1503                 ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
 1504         }
 1505         return (0);
 1506 }
 1507 
 1508 /*
 1509  * Decrement extra reference on snapshot when last name is removed.
 1510  * It will not be freed until the last open reference goes away.
 1511  */
 1512 void
 1513 ffs_snapgone(ip)
 1514         struct inode *ip;
 1515 {
 1516         struct inode *xp;
 1517         struct fs *fs;
 1518         int snaploc;
 1519         struct snapdata *sn;
 1520         struct ufsmount *ump;
 1521 
 1522         /*
 1523          * Find snapshot in incore list.
 1524          */
 1525         xp = NULL;
 1526         sn = ip->i_devvp->v_rdev->si_snapdata;
 1527         if (sn != NULL)
 1528                 TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
 1529                         if (xp == ip)
 1530                                 break;
 1531         if (xp != NULL)
 1532                 vrele(ITOV(ip));
 1533         else if (snapdebug)
 1534                 printf("ffs_snapgone: lost snapshot vnode %d\n",
 1535                     ip->i_number);
 1536         /*
 1537          * Delete snapshot inode from superblock. Keep list dense.
 1538          */
 1539         fs = ip->i_fs;
 1540         ump = ip->i_ump;
 1541         UFS_LOCK(ump);
 1542         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 1543                 if (fs->fs_snapinum[snaploc] == ip->i_number)
 1544                         break;
 1545         if (snaploc < FSMAXSNAP) {
 1546                 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
 1547                         if (fs->fs_snapinum[snaploc] == 0)
 1548                                 break;
 1549                         fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
 1550                 }
 1551                 fs->fs_snapinum[snaploc - 1] = 0;
 1552         }
 1553         UFS_UNLOCK(ump);
 1554 }
 1555 
 1556 /*
 1557  * Prepare a snapshot file for being removed.
 1558  */
 1559 void
 1560 ffs_snapremove(vp)
 1561         struct vnode *vp;
 1562 {
 1563         struct inode *ip;
 1564         struct vnode *devvp;
 1565         struct buf *ibp;
 1566         struct fs *fs;
 1567         struct thread *td = curthread;
 1568         ufs2_daddr_t numblks, blkno, dblk;
 1569         int error, loc, last;
 1570         struct snapdata *sn;
 1571 
 1572         ip = VTOI(vp);
 1573         fs = ip->i_fs;
 1574         devvp = ip->i_devvp;
 1575         /*
 1576          * If active, delete from incore list (this snapshot may
 1577          * already have been in the process of being deleted, so
 1578          * would not have been active).
 1579          *
 1580          * Clear copy-on-write flag if last snapshot.
 1581          */
 1582         VI_LOCK(devvp);
 1583         if (ip->i_nextsnap.tqe_prev != 0) {
 1584                 sn = devvp->v_rdev->si_snapdata;
 1585                 TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
 1586                 ip->i_nextsnap.tqe_prev = 0;
 1587                 VI_UNLOCK(devvp);
 1588                 lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td);
 1589                 VI_LOCK(vp);
 1590                 KASSERT(vp->v_vnlock == &sn->sn_lock,
 1591                         ("ffs_snapremove: lost lock mutation")); 
 1592                 vp->v_vnlock = &vp->v_lock;
 1593                 VI_UNLOCK(vp);
 1594                 VI_LOCK(devvp);
 1595                 lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 1596                 try_free_snapdata(devvp, td);
 1597         } else
 1598                 VI_UNLOCK(devvp);
 1599         /*
 1600          * Clear all BLK_NOCOPY fields. Pass any block claims to other
 1601          * snapshots that want them (see ffs_snapblkfree below).
 1602          */
 1603         for (blkno = 1; blkno < NDADDR; blkno++) {
 1604                 dblk = DIP(ip, i_db[blkno]);
 1605                 if (dblk == 0)
 1606                         continue;
 1607                 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 1608                         DIP_SET(ip, i_db[blkno], 0);
 1609                 else if ((dblk == blkstofrags(fs, blkno) &&
 1610                      ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
 1611                      ip->i_number))) {
 1612                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
 1613                             btodb(fs->fs_bsize));
 1614                         DIP_SET(ip, i_db[blkno], 0);
 1615                 }
 1616         }
 1617         numblks = howmany(ip->i_size, fs->fs_bsize);
 1618         for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 1619                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 1620                     fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 1621                 if (error)
 1622                         continue;
 1623                 if (fs->fs_size - blkno > NINDIR(fs))
 1624                         last = NINDIR(fs);
 1625                 else
 1626                         last = fs->fs_size - blkno;
 1627                 for (loc = 0; loc < last; loc++) {
 1628                         if (ip->i_ump->um_fstype == UFS1) {
 1629                                 dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
 1630                                 if (dblk == 0)
 1631                                         continue;
 1632                                 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 1633                                         ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 1634                                 else if ((dblk == blkstofrags(fs, blkno) &&
 1635                                      ffs_snapblkfree(fs, ip->i_devvp, dblk,
 1636                                      fs->fs_bsize, ip->i_number))) {
 1637                                         ip->i_din1->di_blocks -=
 1638                                             btodb(fs->fs_bsize);
 1639                                         ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 1640                                 }
 1641                                 continue;
 1642                         }
 1643                         dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
 1644                         if (dblk == 0)
 1645                                 continue;
 1646                         if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 1647                                 ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 1648                         else if ((dblk == blkstofrags(fs, blkno) &&
 1649                              ffs_snapblkfree(fs, ip->i_devvp, dblk,
 1650                              fs->fs_bsize, ip->i_number))) {
 1651                                 ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
 1652                                 ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 1653                         }
 1654                 }
 1655                 bawrite(ibp);
 1656         }
 1657         /*
 1658          * Clear snapshot flag and drop reference.
 1659          */
 1660         ip->i_flags &= ~SF_SNAPSHOT;
 1661         DIP_SET(ip, i_flags, ip->i_flags);
 1662         ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1663 #ifdef QUOTA
 1664         /*
 1665          * Reenable disk quotas for ex-snapshot file.
 1666          */
 1667         if (!getinoquota(ip))
 1668                 (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
 1669 #endif
 1670 }
 1671 
 1672 /*
 1673  * Notification that a block is being freed. Return zero if the free
 1674  * should be allowed to proceed. Return non-zero if the snapshot file
 1675  * wants to claim the block. The block will be claimed if it is an
 1676  * uncopied part of one of the snapshots. It will be freed if it is
 1677  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 1678  * If a fragment is being freed, then all snapshots that care about
 1679  * it must make a copy since a snapshot file can only claim full sized
 1680  * blocks. Note that if more than one snapshot file maps the block,
 1681  * we can pick one at random to claim it. Since none of the snapshots
 1682  * can change, we are assurred that they will all see the same unmodified
 1683  * image. When deleting a snapshot file (see ffs_snapremove above), we
 1684  * must push any of these claimed blocks to one of the other snapshots
 1685  * that maps it. These claimed blocks are easily identified as they will
 1686  * have a block number equal to their logical block number within the
 1687  * snapshot. A copied block can never have this property because they
 1688  * must always have been allocated from a BLK_NOCOPY location.
 1689  */
 1690 int
 1691 ffs_snapblkfree(fs, devvp, bno, size, inum)
 1692         struct fs *fs;
 1693         struct vnode *devvp;
 1694         ufs2_daddr_t bno;
 1695         long size;
 1696         ino_t inum;
 1697 {
 1698         struct buf *ibp, *cbp, *savedcbp = 0;
 1699         struct thread *td = curthread;
 1700         struct inode *ip;
 1701         struct vnode *vp = NULL;
 1702         ufs_lbn_t lbn;
 1703         ufs2_daddr_t blkno;
 1704         int indiroff = 0, error = 0, claimedblk = 0;
 1705         struct snapdata *sn;
 1706 
 1707         lbn = fragstoblks(fs, bno);
 1708 retry:
 1709         VI_LOCK(devvp);
 1710         sn = devvp->v_rdev->si_snapdata;
 1711         if (sn == NULL) {
 1712                 VI_UNLOCK(devvp);
 1713                 return (0);
 1714         }
 1715         if (lockmgr(&sn->sn_lock,
 1716                     LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 1717                     VI_MTX(devvp), td) != 0)
 1718                 goto retry;
 1719         TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 1720                 vp = ITOV(ip);
 1721                 /*
 1722                  * Lookup block being written.
 1723                  */
 1724                 if (lbn < NDADDR) {
 1725                         blkno = DIP(ip, i_db[lbn]);
 1726                 } else {
 1727                         td->td_pflags |= TDP_COWINPROGRESS;
 1728                         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 1729                             fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 1730                         td->td_pflags &= ~TDP_COWINPROGRESS;
 1731                         if (error)
 1732                                 break;
 1733                         indiroff = (lbn - NDADDR) % NINDIR(fs);
 1734                         if (ip->i_ump->um_fstype == UFS1)
 1735                                 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 1736                         else
 1737                                 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 1738                 }
 1739                 /*
 1740                  * Check to see if block needs to be copied.
 1741                  */
 1742                 if (blkno == 0) {
 1743                         /*
 1744                          * A block that we map is being freed. If it has not
 1745                          * been claimed yet, we will claim or copy it (below).
 1746                          */
 1747                         claimedblk = 1;
 1748                 } else if (blkno == BLK_SNAP) {
 1749                         /*
 1750                          * No previous snapshot claimed the block,
 1751                          * so it will be freed and become a BLK_NOCOPY
 1752                          * (don't care) for us.
 1753                          */
 1754                         if (claimedblk)
 1755                                 panic("snapblkfree: inconsistent block type");
 1756                         if (lbn < NDADDR) {
 1757                                 DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
 1758                                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1759                         } else if (ip->i_ump->um_fstype == UFS1) {
 1760                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 1761                                     BLK_NOCOPY;
 1762                                 bdwrite(ibp);
 1763                         } else {
 1764                                 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
 1765                                     BLK_NOCOPY;
 1766                                 bdwrite(ibp);
 1767                         }
 1768                         continue;
 1769                 } else /* BLK_NOCOPY or default */ {
 1770                         /*
 1771                          * If the snapshot has already copied the block
 1772                          * (default), or does not care about the block,
 1773                          * it is not needed.
 1774                          */
 1775                         if (lbn >= NDADDR)
 1776                                 bqrelse(ibp);
 1777                         continue;
 1778                 }
 1779                 /*
 1780                  * If this is a full size block, we will just grab it
 1781                  * and assign it to the snapshot inode. Otherwise we
 1782                  * will proceed to copy it. See explanation for this
 1783                  * routine as to why only a single snapshot needs to
 1784                  * claim this block.
 1785                  */
 1786                 if (size == fs->fs_bsize) {
 1787 #ifdef DEBUG
 1788                         if (snapdebug)
 1789                                 printf("%s %d lbn %jd from inum %d\n",
 1790                                     "Grabonremove: snapino", ip->i_number,
 1791                                     (intmax_t)lbn, inum);
 1792 #endif
 1793                         if (lbn < NDADDR) {
 1794                                 DIP_SET(ip, i_db[lbn], bno);
 1795                         } else if (ip->i_ump->um_fstype == UFS1) {
 1796                                 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
 1797                                 bdwrite(ibp);
 1798                         } else {
 1799                                 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
 1800                                 bdwrite(ibp);
 1801                         }
 1802                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
 1803                         ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1804                         lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td);
 1805                         return (1);
 1806                 }
 1807                 if (lbn >= NDADDR)
 1808                         bqrelse(ibp);
 1809                 /*
 1810                  * Allocate the block into which to do the copy. Note that this
 1811                  * allocation will never require any additional allocations for
 1812                  * the snapshot inode.
 1813                  */
 1814                 td->td_pflags |= TDP_COWINPROGRESS;
 1815                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 1816                     fs->fs_bsize, KERNCRED, 0, &cbp);
 1817                 td->td_pflags &= ~TDP_COWINPROGRESS;
 1818                 if (error)
 1819                         break;
 1820 #ifdef DEBUG
 1821                 if (snapdebug)
 1822                         printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
 1823                             "Copyonremove: snapino ", ip->i_number,
 1824                             (intmax_t)lbn, "for inum", inum, size,
 1825                             (intmax_t)cbp->b_blkno);
 1826 #endif
 1827                 /*
 1828                  * If we have already read the old block contents, then
 1829                  * simply copy them to the new block. Note that we need
 1830                  * to synchronously write snapshots that have not been
 1831                  * unlinked, and hence will be visible after a crash,
 1832                  * to ensure their integrity.
 1833                  */
 1834                 if (savedcbp != 0) {
 1835                         bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 1836                         bawrite(cbp);
 1837                         if (dopersistence && ip->i_effnlink > 0)
 1838                                 (void) ffs_syncvnode(vp, MNT_WAIT);
 1839                         continue;
 1840                 }
 1841                 /*
 1842                  * Otherwise, read the old block contents into the buffer.
 1843                  */
 1844                 if ((error = readblock(vp, cbp, lbn)) != 0) {
 1845                         bzero(cbp->b_data, fs->fs_bsize);
 1846                         bawrite(cbp);
 1847                         if (dopersistence && ip->i_effnlink > 0)
 1848                                 (void) ffs_syncvnode(vp, MNT_WAIT);
 1849                         break;
 1850                 }
 1851                 savedcbp = cbp;
 1852         }
 1853         /*
 1854          * Note that we need to synchronously write snapshots that
 1855          * have not been unlinked, and hence will be visible after
 1856          * a crash, to ensure their integrity.
 1857          */
 1858         if (savedcbp) {
 1859                 vp = savedcbp->b_vp;
 1860                 bawrite(savedcbp);
 1861                 if (dopersistence && VTOI(vp)->i_effnlink > 0)
 1862                         (void) ffs_syncvnode(vp, MNT_WAIT);
 1863         }
 1864         /*
 1865          * If we have been unable to allocate a block in which to do
 1866          * the copy, then return non-zero so that the fragment will
 1867          * not be freed. Although space will be lost, the snapshot
 1868          * will stay consistent.
 1869          */
 1870         lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td);
 1871         return (error);
 1872 }
 1873 
 1874 /*
 1875  * Associate snapshot files when mounting.
 1876  */
 1877 void
 1878 ffs_snapshot_mount(mp)
 1879         struct mount *mp;
 1880 {
 1881         struct ufsmount *ump = VFSTOUFS(mp);
 1882         struct vnode *devvp = ump->um_devvp;
 1883         struct fs *fs = ump->um_fs;
 1884         struct thread *td = curthread;
 1885         struct snapdata *sn;
 1886         struct vnode *vp;
 1887         struct vnode *lastvp;
 1888         struct inode *ip;
 1889         struct uio auio;
 1890         struct iovec aiov;
 1891         void *snapblklist;
 1892         char *reason;
 1893         daddr_t snaplistsize;
 1894         int error, snaploc, loc;
 1895 
 1896         /*
 1897          * XXX The following needs to be set before ffs_truncate or
 1898          * VOP_READ can be called.
 1899          */
 1900         mp->mnt_stat.f_iosize = fs->fs_bsize;
 1901         /*
 1902          * Process each snapshot listed in the superblock.
 1903          */
 1904         vp = NULL;
 1905         lastvp = NULL;
 1906         sn = devvp->v_rdev->si_snapdata;
 1907         for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
 1908                 if (fs->fs_snapinum[snaploc] == 0)
 1909                         break;
 1910                 if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
 1911                     LK_EXCLUSIVE, &vp)) != 0){
 1912                         printf("ffs_snapshot_mount: vget failed %d\n", error);
 1913                         continue;
 1914                 }
 1915                 ip = VTOI(vp);
 1916                 if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
 1917                     lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
 1918                         if ((ip->i_flags & SF_SNAPSHOT) == 0) {
 1919                                 reason = "non-snapshot";
 1920                         } else {
 1921                                 reason = "old format snapshot";
 1922                                 (void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
 1923                                 (void)ffs_syncvnode(vp, MNT_WAIT);
 1924                         }
 1925                         printf("ffs_snapshot_mount: %s inode %d\n",
 1926                             reason, fs->fs_snapinum[snaploc]);
 1927                         vput(vp);
 1928                         vp = NULL;
 1929                         for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
 1930                                 if (fs->fs_snapinum[loc] == 0)
 1931                                         break;
 1932                                 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
 1933                         }
 1934                         fs->fs_snapinum[loc - 1] = 0;
 1935                         snaploc--;
 1936                         continue;
 1937                 }
 1938                 /*
 1939                  * If there already exist snapshots on this filesystem, grab a
 1940                  * reference to their shared lock. If this is the first snapshot
 1941                  * on this filesystem, we need to allocate a lock for the
 1942                  * snapshots to share. In either case, acquire the snapshot
 1943                  * lock and give up our original private lock.
 1944                  */
 1945                 VI_LOCK(devvp);
 1946                 if (sn != NULL) {
 1947 
 1948                         VI_UNLOCK(devvp);
 1949                         VI_LOCK(vp);
 1950                         vp->v_vnlock = &sn->sn_lock;
 1951                 } else {
 1952                         VI_UNLOCK(devvp);
 1953                         sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
 1954                         TAILQ_INIT(&sn->sn_head);
 1955                         lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
 1956                             LK_CANRECURSE | LK_NOSHARE);
 1957                         VI_LOCK(vp);
 1958                         vp->v_vnlock = &sn->sn_lock;
 1959                         devvp->v_rdev->si_snapdata = sn;
 1960                 }
 1961                 lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
 1962                     VI_MTX(vp), td);
 1963                 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 1964                 /*
 1965                  * Link it onto the active snapshot list.
 1966                  */
 1967                 VI_LOCK(devvp);
 1968                 if (ip->i_nextsnap.tqe_prev != 0)
 1969                         panic("ffs_snapshot_mount: %d already on list",
 1970                             ip->i_number);
 1971                 else
 1972                         TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
 1973                 vp->v_vflag |= VV_SYSTEM;
 1974                 VI_UNLOCK(devvp);
 1975                 VOP_UNLOCK(vp, 0, td);
 1976                 lastvp = vp;
 1977         }
 1978         vp = lastvp;
 1979         /*
 1980          * No usable snapshots found.
 1981          */
 1982         if (vp == NULL)
 1983                 return;
 1984         /*
 1985          * Allocate the space for the block hints list. We always want to
 1986          * use the list from the newest snapshot.
 1987          */
 1988         auio.uio_iov = &aiov;
 1989         auio.uio_iovcnt = 1;
 1990         aiov.iov_base = (void *)&snaplistsize;
 1991         aiov.iov_len = sizeof(snaplistsize);
 1992         auio.uio_resid = aiov.iov_len;
 1993         auio.uio_offset =
 1994             lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
 1995         auio.uio_segflg = UIO_SYSSPACE;
 1996         auio.uio_rw = UIO_READ;
 1997         auio.uio_td = td;
 1998         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 1999         if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 2000                 printf("ffs_snapshot_mount: read_1 failed %d\n", error);
 2001                 VOP_UNLOCK(vp, 0, td);
 2002                 return;
 2003         }
 2004         MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
 2005             M_UFSMNT, M_WAITOK);
 2006         auio.uio_iovcnt = 1;
 2007         aiov.iov_base = snapblklist;
 2008         aiov.iov_len = snaplistsize * sizeof (daddr_t);
 2009         auio.uio_resid = aiov.iov_len;
 2010         auio.uio_offset -= sizeof(snaplistsize);
 2011         if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 2012                 printf("ffs_snapshot_mount: read_2 failed %d\n", error);
 2013                 VOP_UNLOCK(vp, 0, td);
 2014                 FREE(snapblklist, M_UFSMNT);
 2015                 return;
 2016         }
 2017         VOP_UNLOCK(vp, 0, td);
 2018         VI_LOCK(devvp);
 2019         ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
 2020         sn->sn_listsize = snaplistsize;
 2021         sn->sn_blklist = (daddr_t *)snapblklist;
 2022         devvp->v_vflag |= VV_COPYONWRITE;
 2023         VI_UNLOCK(devvp);
 2024 }
 2025 
 2026 /*
 2027  * Disassociate snapshot files when unmounting.
 2028  */
 2029 void
 2030 ffs_snapshot_unmount(mp)
 2031         struct mount *mp;
 2032 {
 2033         struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
 2034         struct snapdata *sn;
 2035         struct inode *xp;
 2036         struct vnode *vp;
 2037         struct thread *td = curthread;
 2038 
 2039         VI_LOCK(devvp);
 2040         sn = devvp->v_rdev->si_snapdata;
 2041         while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
 2042                 vp = ITOV(xp);
 2043                 TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
 2044                 xp->i_nextsnap.tqe_prev = 0;
 2045                 lockmgr(&sn->sn_lock, 
 2046                         LK_INTERLOCK | LK_EXCLUSIVE,
 2047                         VI_MTX(devvp),
 2048                         td);
 2049                 VI_LOCK(vp);
 2050                 lockmgr(&vp->v_lock,
 2051                         LK_INTERLOCK | LK_EXCLUSIVE,
 2052                         VI_MTX(vp), td);
 2053                 VI_LOCK(vp);
 2054                 KASSERT(vp->v_vnlock == &sn->sn_lock,
 2055                 ("ffs_snapshot_unmount: lost lock mutation")); 
 2056                 vp->v_vnlock = &vp->v_lock;
 2057                 VI_UNLOCK(vp);
 2058                 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 2059                 lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 2060                 if (xp->i_effnlink > 0)
 2061                         vrele(vp);
 2062                 VI_LOCK(devvp);
 2063                 sn = devvp->v_rdev->si_snapdata;
 2064         }
 2065         try_free_snapdata(devvp, td);
 2066         ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
 2067 }
 2068 
 2069 /*
 2070  * Check the buffer block to be belong to device buffer that shall be
 2071  * locked after snaplk. devvp shall be locked on entry, and will be
 2072  * leaved locked upon exit.
 2073  */
 2074 static int
 2075 ffs_bp_snapblk(devvp, bp)
 2076         struct vnode *devvp;
 2077         struct buf *bp;
 2078 {
 2079         struct snapdata *sn;
 2080         struct fs *fs;
 2081         ufs2_daddr_t lbn, *snapblklist;
 2082         int lower, upper, mid;
 2083 
 2084         ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
 2085         KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
 2086         sn = devvp->v_rdev->si_snapdata;
 2087         if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
 2088                 return (0);
 2089         fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
 2090         lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 2091         snapblklist = sn->sn_blklist;
 2092         upper = sn->sn_listsize - 1;
 2093         lower = 1;
 2094         while (lower <= upper) {
 2095                 mid = (lower + upper) / 2;
 2096                 if (snapblklist[mid] == lbn)
 2097                         break;
 2098                 if (snapblklist[mid] < lbn)
 2099                         lower = mid + 1;
 2100                 else
 2101                         upper = mid - 1;
 2102         }
 2103         if (lower <= upper)
 2104                 return (1);
 2105         return (0);
 2106 }
 2107 
 2108 void
 2109 ffs_bdflush(bo, bp)
 2110         struct bufobj *bo;
 2111         struct buf *bp;
 2112 {
 2113         struct thread *td;
 2114         struct vnode *vp, *devvp;
 2115         struct buf *nbp;
 2116         int bp_bdskip;
 2117 
 2118         if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
 2119                 return;
 2120 
 2121         td = curthread;
 2122         vp = bp->b_vp;
 2123         devvp = bo->__bo_vnode;
 2124         KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
 2125 
 2126         VI_LOCK(devvp);
 2127         bp_bdskip = ffs_bp_snapblk(devvp, bp);
 2128         if (bp_bdskip)
 2129                 bdwriteskip++;
 2130         VI_UNLOCK(devvp);
 2131         if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
 2132                 (void) VOP_FSYNC(vp, MNT_NOWAIT, td);
 2133                 altbufferflushes++;
 2134         } else {
 2135                 BO_LOCK(bo);
 2136                 /*
 2137                  * Try to find a buffer to flush.
 2138                  */
 2139                 TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 2140                         if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 2141                             BUF_LOCK(nbp,
 2142                                      LK_EXCLUSIVE | LK_NOWAIT, NULL))
 2143                                 continue;
 2144                         if (bp == nbp)
 2145                                 panic("bdwrite: found ourselves");
 2146                         BO_UNLOCK(bo);
 2147                         /*
 2148                          * Don't countdeps with the bo lock
 2149                          * held.
 2150                          */
 2151                         if (buf_countdeps(nbp, 0)) {
 2152                                 BO_LOCK(bo);
 2153                                 BUF_UNLOCK(nbp);
 2154                                 continue;
 2155                         }
 2156                         if (bp_bdskip) {
 2157                                 VI_LOCK(devvp);
 2158                                 if (!ffs_bp_snapblk(vp, nbp)) {
 2159                                         if (BO_MTX(bo) != VI_MTX(vp)) {
 2160                                                 VI_UNLOCK(devvp);
 2161                                                 BO_LOCK(bo);
 2162                                         }
 2163                                         BUF_UNLOCK(nbp);
 2164                                         continue;
 2165                                 }
 2166                                 VI_UNLOCK(devvp);
 2167                         }
 2168                         if (nbp->b_flags & B_CLUSTEROK) {
 2169                                 vfs_bio_awrite(nbp);
 2170                         } else {
 2171                                 bremfree(nbp);
 2172                                 bawrite(nbp);
 2173                         }
 2174                         dirtybufferflushes++;
 2175                         break;
 2176                 }
 2177                 if (nbp == NULL)
 2178                         BO_UNLOCK(bo);
 2179         }
 2180 }
 2181 
 2182 /*
 2183  * Check for need to copy block that is about to be written,
 2184  * copying the block if necessary.
       *
       * devvp is the device vnode; its v_rdev->si_snapdata supplies the
       * list of snapshot inodes (sn_head), the block list that is
       * binary-searched below (sn_blklist) and the lock taken before the
       * snapshots are examined (sn_lock).  bp is the buffer about to be
       * written.
       *
       * Returns 0 when no copy is needed (bp belongs to a snapshot file,
       * there is no snapshot, the block is found in sn_blklist, or the
       * snapshots disappear while waiting for sn_lock) and when all
       * copies succeed; otherwise returns the error from UFS_BALLOC()
       * or readblock().
       *
       * Panics if the calling thread already has TDP_COWINPROGRESS set.
 2185  */
 2186 int
 2187 ffs_copyonwrite(devvp, bp)
 2188         struct vnode *devvp;
 2189         struct buf *bp;
 2190 {
 2191         struct snapdata *sn;
 2192         struct buf *ibp, *cbp, *savedcbp = 0;
 2193         struct thread *td = curthread;
 2194         struct fs *fs;
 2195         struct inode *ip;
 2196         struct vnode *vp = 0;
 2197         ufs2_daddr_t lbn, blkno, *snapblklist;
 2198         int lower, upper, mid, indiroff, error = 0;
 2199         int launched_async_io, prev_norunningbuf;
 2200         long saved_runningbufspace;
 2201 
 2202         if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
 2203                 return (0);             /* Update on a snapshot file */
 2204         if (td->td_pflags & TDP_COWINPROGRESS)
 2205                 panic("ffs_copyonwrite: recursive call");
 2206         /*
 2207          * First check to see if it is in the preallocated list.
 2208          * By doing this check we avoid several potential deadlocks.
 2209          */
 2210         VI_LOCK(devvp);
 2211         sn = devvp->v_rdev->si_snapdata;
 2212         if (sn == NULL ||
 2213             TAILQ_EMPTY(&sn->sn_head)) {
 2214                 VI_UNLOCK(devvp);
 2215                 return (0);             /* No snapshot */
 2216         }
              /*
               * Binary search of sn_blklist for the block number of bp.
               * fs is taken from the first snapshot inode on the list.
               * The search covers indices 1 .. sn_listsize - 1; index 0
               * is never examined (presumably it holds something other
               * than a block number -- confirm against the code that
               * builds the list).
               */
 2217         ip = TAILQ_FIRST(&sn->sn_head);
 2218         fs = ip->i_fs;
 2219         lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 2220         snapblklist = sn->sn_blklist;
 2221         upper = sn->sn_listsize - 1;
 2222         lower = 1;
 2223         while (lower <= upper) {
 2224                 mid = (lower + upper) / 2;
 2225                 if (snapblklist[mid] == lbn)
 2226                         break;
 2227                 if (snapblklist[mid] < lbn)
 2228                         lower = mid + 1;
 2229                 else
 2230                         upper = mid - 1;
 2231         }
              /*
               * lower <= upper still holds only when the loop above was
               * left through the break, i.e. lbn was found in the list.
               */
 2232         if (lower <= upper) {
 2233                 VI_UNLOCK(devvp);
 2234                 return (0);
 2235         }
              /*
               * Remember whether TDP_NORUNNINGBUF was already set so that
               * it can be put back to that state before returning.
               */
 2236         launched_async_io = 0;
 2237         prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
 2238         /*
 2239          * Since I/O on bp isn't yet in progress and it may be blocked
 2240          * for a long time waiting on snaplk, back it out of
 2241          * runningbufspace, possibly waking other threads waiting for space.
 2242          */
 2243         saved_runningbufspace = bp->b_runningbufspace;
 2244         if (saved_runningbufspace != 0)
 2245                 runningbufwakeup(bp);
 2246         /*
 2247          * Not in the precomputed list, so check the snapshots.
               *
               * The devvp interlock, still held from above, is handed to
               * lockmgr() via LK_INTERLOCK.  Each time lockmgr() returns
               * nonzero the loop body re-takes the interlock and re-reads
               * si_snapdata: if the snapshots are gone it undoes the
               * runningbufspace back-out and returns 0, otherwise the
               * lock attempt is repeated.
 2248          */
 2249         while (lockmgr(&sn->sn_lock,
 2250                        LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 2251                        VI_MTX(devvp), td) != 0) {
 2252                 VI_LOCK(devvp);
 2253                 sn = devvp->v_rdev->si_snapdata;
 2254                 if (sn == NULL ||
 2255                     TAILQ_EMPTY(&sn->sn_head)) {
 2256                         VI_UNLOCK(devvp);
 2257                         if (saved_runningbufspace != 0) {
 2258                                 bp->b_runningbufspace = saved_runningbufspace;
 2259                                 atomic_add_int(&runningbufspace,
 2260                                                bp->b_runningbufspace);
 2261                         }
 2262                         return (0);             /* Snapshot gone */
 2263                 }
 2264         }
              /*
               * Visit every snapshot on the list.  The first snapshot that
               * needs a copy reads the old contents from disk into its copy
               * block, which is kept in savedcbp; later snapshots copy from
               * savedcbp instead of reading again.
               */
 2265         TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 2266                 vp = ITOV(ip);
 2267                 /*
 2268                  * We ensure that everything of our own that needs to be
 2269                  * copied will be done at the time that ffs_snapshot is
 2270                  * called. Thus we can skip the check here which can
 2271                  * deadlock in doing the lookup in UFS_BALLOC.
 2272                  */
 2273                 if (bp->b_vp == vp)
 2274                         continue;
 2275                 /*
 2276                  * Check to see if block needs to be copied. We do not have
 2277                  * to hold the snapshot lock while doing this lookup as it
 2278                  * will never require any additional allocations for the
 2279                  * snapshot inode.
 2280                  */
 2281                 if (lbn < NDADDR) {
                              /* Direct block: the entry is in the inode. */
 2282                         blkno = DIP(ip, i_db[lbn]);
 2283                 } else {
                              /*
                               * Indirect block: fetch it with BA_METAONLY
                               * and pick out entry indiroff, using the
                               * UFS1 or UFS2 pointer width as appropriate.
                               * TDP_COWINPROGRESS is set only around the
                               * UFS_BALLOC() call; TDP_NORUNNINGBUF stays
                               * set until it is restored at the end of
                               * the function.
                               */
 2284                         td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
 2285                         error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 2286                            fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 2287                         td->td_pflags &= ~TDP_COWINPROGRESS;
 2288                         if (error)
 2289                                 break;
 2290                         indiroff = (lbn - NDADDR) % NINDIR(fs);
 2291                         if (ip->i_ump->um_fstype == UFS1)
 2292                                 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 2293                         else
 2294                                 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 2295                         bqrelse(ibp);
 2296                 }
 2297 #ifdef INVARIANTS
 2298                 if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
 2299                         panic("ffs_copyonwrite: bad copy block");
 2300 #endif
                      /* A nonzero entry: nothing to copy for this snapshot. */
 2301                 if (blkno != 0)
 2302                         continue;
 2303                 /*
 2304                  * Allocate the block into which to do the copy. Since
 2305                  * multiple processes may all try to copy the same block,
 2306                  * we have to recheck our need to do a copy if we sleep
 2307                  * waiting for the lock.
 2308                  *
 2309                  * Because all snapshots on a filesystem share a single
 2310                  * lock, we ensure that we will never be in competition
 2311                  * with another process to allocate a block.
 2312                  */
 2313                 td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
 2314                 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 2315                     fs->fs_bsize, KERNCRED, 0, &cbp);
 2316                 td->td_pflags &= ~TDP_COWINPROGRESS;
 2317                 if (error)
 2318                         break;
 2319 #ifdef DEBUG
 2320                 if (snapdebug) {
 2321                         printf("Copyonwrite: snapino %d lbn %jd for ",
 2322                             ip->i_number, (intmax_t)lbn);
 2323                         if (bp->b_vp == devvp)
 2324                                 printf("fs metadata");
 2325                         else
 2326                                 printf("inum %d", VTOI(bp->b_vp)->i_number);
 2327                         printf(" lblkno %jd to blkno %jd\n",
 2328                             (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
 2329                 }
 2330 #endif
 2331                 /*
 2332                  * If we have already read the old block contents, then
 2333                  * simply copy them to the new block. Note that we need
 2334                  * to synchronously write snapshots that have not been
 2335                  * unlinked, and hence will be visible after a crash,
 2336                  * to ensure their integrity.
 2337                  */
 2338                 if (savedcbp != 0) {
 2339                         bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 2340                         bawrite(cbp);
 2341                         if (dopersistence && ip->i_effnlink > 0)
 2342                                 (void) ffs_syncvnode(vp, MNT_WAIT);
 2343                         else
 2344                                 launched_async_io = 1;
 2345                         continue;
 2346                 }
 2347                 /*
 2348                  * Otherwise, read the old block contents into the buffer.
                       *
                       * If the read fails, a zero-filled copy block is
                       * written instead, the remaining snapshots are not
                       * processed, and the read error is returned.
 2349                  */
 2350                 if ((error = readblock(vp, cbp, lbn)) != 0) {
 2351                         bzero(cbp->b_data, fs->fs_bsize);
 2352                         bawrite(cbp);
 2353                         if (dopersistence && ip->i_effnlink > 0)
 2354                                 (void) ffs_syncvnode(vp, MNT_WAIT);
 2355                         else
 2356                                 launched_async_io = 1;
 2357                         break;
 2358                 }
                      /*
                       * Hold on to the freshly read copy block; it is
                       * written only after the loop so that later
                       * snapshots can copy from it.
                       */
 2359                 savedcbp = cbp;
 2360         }
 2361         /*
 2362          * Note that we need to synchronously write snapshots that
 2363          * have not been unlinked, and hence will be visible after
 2364          * a crash, to ensure their integrity.
 2365          */
 2366         if (savedcbp) {
 2367                 vp = savedcbp->b_vp;
 2368                 bawrite(savedcbp);
 2369                 if (dopersistence && VTOI(vp)->i_effnlink > 0)
 2370                         (void) ffs_syncvnode(vp, MNT_WAIT);
 2371                 else
 2372                         launched_async_io = 1;
 2373         }
              /*
               * NOTE(review): the lock taken above was &sn->sn_lock, but
               * the release goes through vp->v_vnlock, where vp is either
               * savedcbp->b_vp or the last snapshot vnode visited by the
               * loop.  This presumably relies on every snapshot vnode's
               * v_vnlock pointing at sn_lock, and on the loop having run
               * at least once (vp starts out as 0) -- confirm.
               */
 2374         lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td);
              /* Put TDP_NORUNNINGBUF back to the state it had on entry. */
 2375         td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
 2376                 prev_norunningbuf;
 2377         if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
 2378                 waitrunningbufspace();
 2379         /*
 2380          * I/O on bp will now be started, so count it in runningbufspace.
 2381          */
 2382         if (saved_runningbufspace != 0) {
 2383                 bp->b_runningbufspace = saved_runningbufspace;
 2384                 atomic_add_int(&runningbufspace, bp->b_runningbufspace);
 2385         }
 2386         return (error);
 2387 }
 2388 
 2389 /*
 2390  * Read the specified block into the given buffer.
 2391  * Much of this boiler-plate comes from bwrite().
       *
       * vp supplies the inode whose i_fs is used to convert lbn to a byte
       * offset and whose i_devvp is the target of the request; bp supplies
       * the destination memory (b_data) and the transfer length (b_bcount);
       * lbn is the filesystem block number to read.
       *
       * Returns the value of biowait(), which is also stored in
       * bp->b_error.
 2392  */
 2393 static int
 2394 readblock(vp, bp, lbn)
 2395         struct vnode *vp;
 2396         struct buf *bp;
 2397         ufs2_daddr_t lbn;
 2398 {
 2399         struct inode *ip = VTOI(vp);
 2400         struct bio *bip;
 2401 
              /*
               * Build a BIO_READ request aimed directly at bp's data area.
               * The offset is lbn converted from blocks to fragments, to
               * device blocks, and finally to bytes.
               */
 2402         bip = g_alloc_bio();
 2403         bip->bio_cmd = BIO_READ;
 2404         bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
 2405         bip->bio_data = bp->b_data;
 2406         bip->bio_length = bp->b_bcount;
              /* No completion callback; the result is collected by biowait(). */
 2407         bip->bio_done = NULL;
 2408 
              /*
               * Issue the request on the device vnode's bo_private
               * (presumably the GEOM consumer for the device -- confirm),
               * wait for it to finish, then free the bio.
               */
 2409         g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
 2410         bp->b_error = biowait(bip, "snaprdb");
 2411         g_destroy_bio(bip);
 2412         return (bp->b_error);
 2413 }
 2414 
 2415 /*
 2416  * Process file deletes that were deferred by ufs_inactive() due to
 2417  * the file system being suspended. Transfer IN_LAZYACCESS into
 2418  * IN_MODIFIED for vnodes that were accessed during suspension.
       *
       * Walks every vnode of mp.  For each candidate the vnode is held and
       * exclusively locked, IN_LAZYACCESS is converted to IN_MODIFIED, and
       * VOP_INACTIVE() is called if VI_OWEINACT is still set and the use
       * count is zero.  The whole walk is bracketed by
       * vn_start_secondary_write() / vn_finished_secondary_write(); the
       * return value of the former is ignored.
 2419  */
 2420 static void
 2421 process_deferred_inactive(struct mount *mp)
 2422 {
 2423         struct vnode *vp, *mvp;
 2424         struct inode *ip;
 2425         struct thread *td;
 2426         int error;
 2427 
 2428         td = curthread;
 2429         (void) vn_start_secondary_write(NULL, &mp, V_WAIT);
 2430         MNT_ILOCK(mp);
 2431  loop:
 2432         MNT_VNODE_FOREACH(vp, mp, mvp) {
 2433                 VI_LOCK(vp);
 2434                 /*
 2435                  * IN_LAZYACCESS is checked here without holding any
 2436                  * vnode lock, but this flag is set only while holding
 2437                  * vnode interlock.
                       *
                       * Skip vnodes of type VNON, doomed vnodes, and vnodes
                       * that have neither IN_LAZYACCESS set nor an owed
                       * inactive call (VI_OWEINACT with a zero use count).
 2438                  */
 2439                 if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 ||
 2440                     ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
 2441                         ((vp->v_iflag & VI_OWEINACT) == 0 ||
 2442                         vp->v_usecount > 0))) {
 2443                         VI_UNLOCK(vp);
 2444                         continue;
 2445                 }
                      /*
                       * Drop the mount interlock, take a hold on the vnode,
                       * and lock it exclusively, passing the vnode interlock
                       * to vn_lock() via LK_INTERLOCK.
                       */
 2446                 MNT_IUNLOCK(mp);
 2447                 vholdl(vp);
 2448                 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td);
 2449                 if (error != 0) {
                              /*
                               * Lock failed: drop the hold.  ENOENT moves on
                               * to the next vnode; any other error aborts
                               * the iteration and restarts it from the top.
                               */
 2450                         vdrop(vp);
 2451                         MNT_ILOCK(mp);
 2452                         if (error == ENOENT)
 2453                                 continue;       /* vnode recycled */
 2454                         MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 2455                         goto loop;
 2456                 }
 2457                 ip = VTOI(vp);
 2458                 if ((ip->i_flag & IN_LAZYACCESS) != 0) {
 2459                         ip->i_flag &= ~IN_LAZYACCESS;
 2460                         ip->i_flag |= IN_MODIFIED;
 2461                 }
                      /*
                       * Re-check under the vnode interlock whether an
                       * inactive call is still owed; if not, undo the lock
                       * and hold and continue with the next vnode.
                       */
 2462                 VI_LOCK(vp);
 2463                 if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
 2464                         VI_UNLOCK(vp);
 2465                         VOP_UNLOCK(vp, 0, td);
 2466                         vdrop(vp);
 2467                         MNT_ILOCK(mp);
 2468                         continue;
 2469                 }
 2470                 
                      /*
                       * Mark the vnode VI_DOINGINACT and clear VI_OWEINACT
                       * for the duration of VOP_INACTIVE(), whose return
                       * value is ignored.  The assertions afterwards check
                       * that VI_DOINGINACT survived the call and that
                       * VI_OWEINACT was not set again.
                       */
 2471                 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 2472                          ("process_deferred_inactive: "
 2473                           "recursed on VI_DOINGINACT"));
 2474                 vp->v_iflag |= VI_DOINGINACT;
 2475                 vp->v_iflag &= ~VI_OWEINACT;
 2476                 VI_UNLOCK(vp);
 2477                 (void) VOP_INACTIVE(vp, td);
 2478                 VI_LOCK(vp);
 2479                 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 2480                          ("process_deferred_inactive: lost VI_DOINGINACT"));
 2481                 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 2482                          ("process_deferred_inactive: got VI_OWEINACT"));
 2483                 vp->v_iflag &= ~VI_DOINGINACT;
 2484                 VI_UNLOCK(vp);
 2485                 VOP_UNLOCK(vp, 0, td);
 2486                 vdrop(vp);
                      /* The mount interlock is re-taken before iterating on. */
 2487                 MNT_ILOCK(mp);
 2488         }
 2489         MNT_IUNLOCK(mp);
 2490         vn_finished_secondary_write(mp);
 2491 }
 2492 
 2493 /* Try to free snapdata associated with devvp */
      /*
       * Nothing is freed when devvp has no snapdata, when the snapshot list
       * is not empty, or when VV_COPYONWRITE is not set on devvp; in those
       * cases only the devvp interlock is released.  Otherwise the snapdata
       * is detached from the device, VV_COPYONWRITE is cleared, and the
       * snapshot lock is drained, released and destroyed before the
       * snapdata and its block list are freed.
       *
       * This function never takes VI_MTX(devvp) itself but gives it up on
       * both paths (VI_UNLOCK(), or handing it to lockmgr() with
       * LK_INTERLOCK), so callers apparently must enter with the devvp
       * interlock held -- confirm against the callers.
       */
 2494 static void
 2495 try_free_snapdata(struct vnode *devvp,
 2496                   struct thread *td)
 2497 {
 2498         struct snapdata *sn;
 2499         ufs2_daddr_t *snapblklist;
 2500 
 2501         sn = devvp->v_rdev->si_snapdata;
 2502 
 2503         if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
 2504             (devvp->v_vflag & VV_COPYONWRITE) == 0) {
 2505                 VI_UNLOCK(devvp);
 2506                 return;
 2507         }
 2508 
              /*
               * Detach the snapdata and its block list from the device
               * before the interlock is handed to lockmgr(); the block
               * list pointer is saved so it can be freed at the end.
               */
 2509         devvp->v_rdev->si_snapdata = NULL;
 2510         devvp->v_vflag &= ~VV_COPYONWRITE;
 2511         snapblklist = sn->sn_blklist;
 2512         sn->sn_blklist = NULL;
 2513         sn->sn_listsize = 0;
              /* Drain the lock, release it, then tear it down. */
 2514         lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
 2515         lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 2516         lockdestroy(&sn->sn_lock);
              /*
               * NOTE(review): sn is released with free() while the block
               * list uses the FREE() macro; presumably the two are
               * equivalent -- confirm and consider using one form.
               */
 2517         free(sn, M_UFSMNT);
 2518         if (snapblklist != NULL)
 2519                 FREE(snapblklist, M_UFSMNT);
 2520 }
 2521 #endif

Cache object: 2c5b2efbbae64735475377b60b605686


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.