The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/ufs/ffs/ffs_softdep.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright 1998, 2000 Marshall Kirk McKusick.
    3  * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
    4  * All rights reserved.
    5  *
    6  * The soft updates code is derived from the appendix of a University
    7  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
    8  * "Soft Updates: A Solution to the Metadata Update Problem in File
    9  * Systems", CSE-TR-254-95, August 1995).
   10  *
   11  * Further information about soft updates can be obtained from:
   12  *
   13  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
   14  *      1614 Oxford Street              mckusick@mckusick.com
   15  *      Berkeley, CA 94709-1608         +1-510-843-9542
   16  *      USA
   17  *
   18  * Redistribution and use in source and binary forms, with or without
   19  * modification, are permitted provided that the following conditions
   20  * are met:
   21  *
   22  * 1. Redistributions of source code must retain the above copyright
   23  *    notice, this list of conditions and the following disclaimer.
   24  * 2. Redistributions in binary form must reproduce the above copyright
   25  *    notice, this list of conditions and the following disclaimer in the
   26  *    documentation and/or other materials provided with the distribution.
   27  *
   28  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
   29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   31  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
   34  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   35  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
   36  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
   37  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   38  *
   39  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
   40  */
   41 
   42 #include <sys/cdefs.h>
   43 __FBSDID("$FreeBSD$");
   44 
   45 #include "opt_ffs.h"
   46 #include "opt_quota.h"
   47 #include "opt_ddb.h"
   48 
   49 /*
   50  * For now we want the safety net that the DEBUG flag provides.
   51  */
   52 #ifndef DEBUG
   53 #define DEBUG
   54 #endif
   55 
   56 #include <sys/param.h>
   57 #include <sys/kernel.h>
   58 #include <sys/systm.h>
   59 #include <sys/bio.h>
   60 #include <sys/buf.h>
   61 #include <sys/kdb.h>
   62 #include <sys/kthread.h>
   63 #include <sys/ktr.h>
   64 #include <sys/limits.h>
   65 #include <sys/lock.h>
   66 #include <sys/malloc.h>
   67 #include <sys/mount.h>
   68 #include <sys/mutex.h>
   69 #include <sys/namei.h>
   70 #include <sys/priv.h>
   71 #include <sys/proc.h>
   72 #include <sys/racct.h>
   73 #include <sys/rwlock.h>
   74 #include <sys/stat.h>
   75 #include <sys/sysctl.h>
   76 #include <sys/syslog.h>
   77 #include <sys/vnode.h>
   78 #include <sys/conf.h>
   79 
   80 #include <ufs/ufs/dir.h>
   81 #include <ufs/ufs/extattr.h>
   82 #include <ufs/ufs/quota.h>
   83 #include <ufs/ufs/inode.h>
   84 #include <ufs/ufs/ufsmount.h>
   85 #include <ufs/ffs/fs.h>
   86 #include <ufs/ffs/softdep.h>
   87 #include <ufs/ffs/ffs_extern.h>
   88 #include <ufs/ufs/ufs_extern.h>
   89 
   90 #include <vm/vm.h>
   91 #include <vm/vm_extern.h>
   92 #include <vm/vm_object.h>
   93 
   94 #include <geom/geom.h>
   95 
   96 #include <ddb/ddb.h>
   97 
   98 #define KTR_SUJ 0       /* Define to KTR_SPARE. */
   99 
  100 #ifndef SOFTUPDATES
  101 
  102 int
  103 softdep_flushfiles(oldmnt, flags, td)
  104         struct mount *oldmnt;
  105         int flags;
  106         struct thread *td;
  107 {
  108 
  109         panic("softdep_flushfiles called");
  110 }
  111 
  112 int
  113 softdep_mount(devvp, mp, fs, cred)
  114         struct vnode *devvp;
  115         struct mount *mp;
  116         struct fs *fs;
  117         struct ucred *cred;
  118 {
  119 
  120         return (0);
  121 }
  122 
  123 void
  124 softdep_initialize()
  125 {
  126 
  127         return;
  128 }
  129 
  130 void
  131 softdep_uninitialize()
  132 {
  133 
  134         return;
  135 }
  136 
  137 void
  138 softdep_unmount(mp)
  139         struct mount *mp;
  140 {
  141 
  142         panic("softdep_unmount called");
  143 }
  144 
  145 void
  146 softdep_setup_sbupdate(ump, fs, bp)
  147         struct ufsmount *ump;
  148         struct fs *fs;
  149         struct buf *bp;
  150 {
  151 
  152         panic("softdep_setup_sbupdate called");
  153 }
  154 
  155 void
  156 softdep_setup_inomapdep(bp, ip, newinum, mode)
  157         struct buf *bp;
  158         struct inode *ip;
  159         ino_t newinum;
  160         int mode;
  161 {
  162 
  163         panic("softdep_setup_inomapdep called");
  164 }
  165 
  166 void
  167 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
  168         struct buf *bp;
  169         struct mount *mp;
  170         ufs2_daddr_t newblkno;
  171         int frags;
  172         int oldfrags;
  173 {
  174 
  175         panic("softdep_setup_blkmapdep called");
  176 }
  177 
  178 void
  179 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
  180         struct inode *ip;
  181         ufs_lbn_t lbn;
  182         ufs2_daddr_t newblkno;
  183         ufs2_daddr_t oldblkno;
  184         long newsize;
  185         long oldsize;
  186         struct buf *bp;
  187 {
  188         
  189         panic("softdep_setup_allocdirect called");
  190 }
  191 
  192 void
  193 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
  194         struct inode *ip;
  195         ufs_lbn_t lbn;
  196         ufs2_daddr_t newblkno;
  197         ufs2_daddr_t oldblkno;
  198         long newsize;
  199         long oldsize;
  200         struct buf *bp;
  201 {
  202         
  203         panic("softdep_setup_allocext called");
  204 }
  205 
  206 void
  207 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
  208         struct inode *ip;
  209         ufs_lbn_t lbn;
  210         struct buf *bp;
  211         int ptrno;
  212         ufs2_daddr_t newblkno;
  213         ufs2_daddr_t oldblkno;
  214         struct buf *nbp;
  215 {
  216 
  217         panic("softdep_setup_allocindir_page called");
  218 }
  219 
  220 void
  221 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
  222         struct buf *nbp;
  223         struct inode *ip;
  224         struct buf *bp;
  225         int ptrno;
  226         ufs2_daddr_t newblkno;
  227 {
  228 
  229         panic("softdep_setup_allocindir_meta called");
  230 }
  231 
  232 void
  233 softdep_journal_freeblocks(ip, cred, length, flags)
  234         struct inode *ip;
  235         struct ucred *cred;
  236         off_t length;
  237         int flags;
  238 {
  239         
  240         panic("softdep_journal_freeblocks called");
  241 }
  242 
  243 void
  244 softdep_journal_fsync(ip)
  245         struct inode *ip;
  246 {
  247 
  248         panic("softdep_journal_fsync called");
  249 }
  250 
  251 void
  252 softdep_setup_freeblocks(ip, length, flags)
  253         struct inode *ip;
  254         off_t length;
  255         int flags;
  256 {
  257         
  258         panic("softdep_setup_freeblocks called");
  259 }
  260 
  261 void
  262 softdep_freefile(pvp, ino, mode)
  263                 struct vnode *pvp;
  264                 ino_t ino;
  265                 int mode;
  266 {
  267 
  268         panic("softdep_freefile called");
  269 }
  270 
  271 int
  272 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
  273         struct buf *bp;
  274         struct inode *dp;
  275         off_t diroffset;
  276         ino_t newinum;
  277         struct buf *newdirbp;
  278         int isnewblk;
  279 {
  280 
  281         panic("softdep_setup_directory_add called");
  282 }
  283 
  284 void
  285 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
  286         struct buf *bp;
  287         struct inode *dp;
  288         caddr_t base;
  289         caddr_t oldloc;
  290         caddr_t newloc;
  291         int entrysize;
  292 {
  293 
  294         panic("softdep_change_directoryentry_offset called");
  295 }
  296 
  297 void
  298 softdep_setup_remove(bp, dp, ip, isrmdir)
  299         struct buf *bp;
  300         struct inode *dp;
  301         struct inode *ip;
  302         int isrmdir;
  303 {
  304         
  305         panic("softdep_setup_remove called");
  306 }
  307 
  308 void
  309 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
  310         struct buf *bp;
  311         struct inode *dp;
  312         struct inode *ip;
  313         ino_t newinum;
  314         int isrmdir;
  315 {
  316 
  317         panic("softdep_setup_directory_change called");
  318 }
  319 
  320 void
  321 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
  322         struct mount *mp;
  323         struct buf *bp;
  324         ufs2_daddr_t blkno;
  325         int frags;
  326         struct workhead *wkhd;
  327 {
  328 
  329         panic("%s called", __FUNCTION__);
  330 }
  331 
  332 void
  333 softdep_setup_inofree(mp, bp, ino, wkhd)
  334         struct mount *mp;
  335         struct buf *bp;
  336         ino_t ino;
  337         struct workhead *wkhd;
  338 {
  339 
  340         panic("%s called", __FUNCTION__);
  341 }
  342 
  343 void
  344 softdep_setup_unlink(dp, ip)
  345         struct inode *dp;
  346         struct inode *ip;
  347 {
  348 
  349         panic("%s called", __FUNCTION__);
  350 }
  351 
  352 void
  353 softdep_setup_link(dp, ip)
  354         struct inode *dp;
  355         struct inode *ip;
  356 {
  357 
  358         panic("%s called", __FUNCTION__);
  359 }
  360 
  361 void
  362 softdep_revert_link(dp, ip)
  363         struct inode *dp;
  364         struct inode *ip;
  365 {
  366 
  367         panic("%s called", __FUNCTION__);
  368 }
  369 
  370 void
  371 softdep_setup_rmdir(dp, ip)
  372         struct inode *dp;
  373         struct inode *ip;
  374 {
  375 
  376         panic("%s called", __FUNCTION__);
  377 }
  378 
  379 void
  380 softdep_revert_rmdir(dp, ip)
  381         struct inode *dp;
  382         struct inode *ip;
  383 {
  384 
  385         panic("%s called", __FUNCTION__);
  386 }
  387 
  388 void
  389 softdep_setup_create(dp, ip)
  390         struct inode *dp;
  391         struct inode *ip;
  392 {
  393 
  394         panic("%s called", __FUNCTION__);
  395 }
  396 
  397 void
  398 softdep_revert_create(dp, ip)
  399         struct inode *dp;
  400         struct inode *ip;
  401 {
  402 
  403         panic("%s called", __FUNCTION__);
  404 }
  405 
  406 void
  407 softdep_setup_mkdir(dp, ip)
  408         struct inode *dp;
  409         struct inode *ip;
  410 {
  411 
  412         panic("%s called", __FUNCTION__);
  413 }
  414 
  415 void
  416 softdep_revert_mkdir(dp, ip)
  417         struct inode *dp;
  418         struct inode *ip;
  419 {
  420 
  421         panic("%s called", __FUNCTION__);
  422 }
  423 
  424 void
  425 softdep_setup_dotdot_link(dp, ip)
  426         struct inode *dp;
  427         struct inode *ip;
  428 {
  429 
  430         panic("%s called", __FUNCTION__);
  431 }
  432 
  433 int
  434 softdep_prealloc(vp, waitok)
  435         struct vnode *vp;
  436         int waitok;
  437 {
  438 
  439         panic("%s called", __FUNCTION__);
  440 }
  441 
  442 int
  443 softdep_journal_lookup(mp, vpp)
  444         struct mount *mp;
  445         struct vnode **vpp;
  446 {
  447 
  448         return (ENOENT);
  449 }
  450 
  451 void
  452 softdep_change_linkcnt(ip)
  453         struct inode *ip;
  454 {
  455 
  456         panic("softdep_change_linkcnt called");
  457 }
  458 
  459 void 
  460 softdep_load_inodeblock(ip)
  461         struct inode *ip;
  462 {
  463 
  464         panic("softdep_load_inodeblock called");
  465 }
  466 
  467 void
  468 softdep_update_inodeblock(ip, bp, waitfor)
  469         struct inode *ip;
  470         struct buf *bp;
  471         int waitfor;
  472 {
  473 
  474         panic("softdep_update_inodeblock called");
  475 }
  476 
  477 int
  478 softdep_fsync(vp)
  479         struct vnode *vp;       /* the "in_core" copy of the inode */
  480 {
  481 
  482         return (0);
  483 }
  484 
  485 void
  486 softdep_fsync_mountdev(vp)
  487         struct vnode *vp;
  488 {
  489 
  490         return;
  491 }
  492 
  493 int
  494 softdep_flushworklist(oldmnt, countp, td)
  495         struct mount *oldmnt;
  496         int *countp;
  497         struct thread *td;
  498 {
  499 
  500         *countp = 0;
  501         return (0);
  502 }
  503 
  504 int
  505 softdep_sync_metadata(struct vnode *vp)
  506 {
  507 
  508         panic("softdep_sync_metadata called");
  509 }
  510 
  511 int
  512 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
  513 {
  514 
  515         panic("softdep_sync_buf called");
  516 }
  517 
  518 int
  519 softdep_slowdown(vp)
  520         struct vnode *vp;
  521 {
  522 
  523         panic("softdep_slowdown called");
  524 }
  525 
  526 int
  527 softdep_request_cleanup(fs, vp, cred, resource)
  528         struct fs *fs;
  529         struct vnode *vp;
  530         struct ucred *cred;
  531         int resource;
  532 {
  533 
  534         return (0);
  535 }
  536 
  537 int
  538 softdep_check_suspend(struct mount *mp,
  539                       struct vnode *devvp,
  540                       int softdep_depcnt,
  541                       int softdep_accdepcnt,
  542                       int secondary_writes,
  543                       int secondary_accwrites)
  544 {
  545         struct bufobj *bo;
  546         int error;
  547         
  548         (void) softdep_depcnt,
  549         (void) softdep_accdepcnt;
  550 
  551         bo = &devvp->v_bufobj;
  552         ASSERT_BO_WLOCKED(bo);
  553 
  554         MNT_ILOCK(mp);
  555         while (mp->mnt_secondary_writes != 0) {
  556                 BO_UNLOCK(bo);
  557                 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
  558                     (PUSER - 1) | PDROP, "secwr", 0);
  559                 BO_LOCK(bo);
  560                 MNT_ILOCK(mp);
  561         }
  562 
  563         /*
  564          * Reasons for needing more work before suspend:
  565          * - Dirty buffers on devvp.
  566          * - Secondary writes occurred after start of vnode sync loop
  567          */
  568         error = 0;
  569         if (bo->bo_numoutput > 0 ||
  570             bo->bo_dirty.bv_cnt > 0 ||
  571             secondary_writes != 0 ||
  572             mp->mnt_secondary_writes != 0 ||
  573             secondary_accwrites != mp->mnt_secondary_accwrites)
  574                 error = EAGAIN;
  575         BO_UNLOCK(bo);
  576         return (error);
  577 }
  578 
  579 void
  580 softdep_get_depcounts(struct mount *mp,
  581                       int *softdepactivep,
  582                       int *softdepactiveaccp)
  583 {
  584         (void) mp;
  585         *softdepactivep = 0;
  586         *softdepactiveaccp = 0;
  587 }
  588 
  589 void
  590 softdep_buf_append(bp, wkhd)
  591         struct buf *bp;
  592         struct workhead *wkhd;
  593 {
  594 
  595         panic("softdep_buf_appendwork called");
  596 }
  597 
  598 void
  599 softdep_inode_append(ip, cred, wkhd)
  600         struct inode *ip;
  601         struct ucred *cred;
  602         struct workhead *wkhd;
  603 {
  604 
  605         panic("softdep_inode_appendwork called");
  606 }
  607 
  608 void
  609 softdep_freework(wkhd)
  610         struct workhead *wkhd;
  611 {
  612 
  613         panic("softdep_freework called");
  614 }
  615 
  616 #else
  617 
  618 FEATURE(softupdates, "FFS soft-updates support");
  619 
  620 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
  621     "soft updates stats");
  622 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
  623     "total dependencies allocated");
  624 static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
  625     "high use dependencies allocated");
  626 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
  627     "current dependencies allocated");
  628 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
  629     "current dependencies written");
  630 
  631 unsigned long dep_current[D_LAST + 1];
  632 unsigned long dep_highuse[D_LAST + 1];
  633 unsigned long dep_total[D_LAST + 1];
  634 unsigned long dep_write[D_LAST + 1];
  635 
  636 #define SOFTDEP_TYPE(type, str, long)                                   \
  637     static MALLOC_DEFINE(M_ ## type, #str, long);                       \
  638     SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,       \
  639         &dep_total[D_ ## type], 0, "");                                 \
  640     SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD,     \
  641         &dep_current[D_ ## type], 0, "");                               \
  642     SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD,     \
  643         &dep_highuse[D_ ## type], 0, "");                               \
  644     SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD,       \
  645         &dep_write[D_ ## type], 0, "");
  646 
  647 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 
  648 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
  649 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
  650     "Block or frag allocated from cyl group map");
  651 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
  652 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
  653 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
  654 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
  655 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
  656 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
  657 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
  658 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
  659 SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
  660 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
  661 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
  662 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
  663 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
  664 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
  665 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
  666 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
  667 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
  668 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
  669 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
  670 SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
  671 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
  672 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
  673 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
  674 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
  675 
  676 static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
  677 
  678 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
  679 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
  680 static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
  681 
  682 #define M_SOFTDEP_FLAGS (M_WAITOK)
  683 
  684 /* 
  685  * translate from workitem type to memory type
  686  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
  687  */
  688 static struct malloc_type *memtype[] = {
  689         M_PAGEDEP,
  690         M_INODEDEP,
  691         M_BMSAFEMAP,
  692         M_NEWBLK,
  693         M_ALLOCDIRECT,
  694         M_INDIRDEP,
  695         M_ALLOCINDIR,
  696         M_FREEFRAG,
  697         M_FREEBLKS,
  698         M_FREEFILE,
  699         M_DIRADD,
  700         M_MKDIR,
  701         M_DIRREM,
  702         M_NEWDIRBLK,
  703         M_FREEWORK,
  704         M_FREEDEP,
  705         M_JADDREF,
  706         M_JREMREF,
  707         M_JMVREF,
  708         M_JNEWBLK,
  709         M_JFREEBLK,
  710         M_JFREEFRAG,
  711         M_JSEG,
  712         M_JSEGDEP,
  713         M_SBDEP,
  714         M_JTRUNC,
  715         M_JFSYNC,
  716         M_SENTINEL
  717 };
  718 
  719 #define DtoM(type) (memtype[type])
  720 
  721 /*
  722  * Names of malloc types.
  723  */
  724 #define TYPENAME(type)  \
  725         ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
  726 /*
  727  * End system adaptation definitions.
  728  */
  729 
  730 #define DOTDOT_OFFSET   offsetof(struct dirtemplate, dotdot_ino)
  731 #define DOT_OFFSET      offsetof(struct dirtemplate, dot_ino)
  732 
  733 /*
  734  * Internal function prototypes.
  735  */
  736 static  void check_clear_deps(struct mount *);
  737 static  void softdep_error(char *, int);
  738 static  int softdep_process_worklist(struct mount *, int);
  739 static  int softdep_waitidle(struct mount *, int);
  740 static  void drain_output(struct vnode *);
  741 static  struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
  742 static  int check_inodedep_free(struct inodedep *);
  743 static  void clear_remove(struct mount *);
  744 static  void clear_inodedeps(struct mount *);
  745 static  void unlinked_inodedep(struct mount *, struct inodedep *);
  746 static  void clear_unlinked_inodedep(struct inodedep *);
  747 static  struct inodedep *first_unlinked_inodedep(struct ufsmount *);
  748 static  int flush_pagedep_deps(struct vnode *, struct mount *,
  749             struct diraddhd *);
  750 static  int free_pagedep(struct pagedep *);
  751 static  int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
  752 static  int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
  753 static  int flush_deplist(struct allocdirectlst *, int, int *);
  754 static  int sync_cgs(struct mount *, int);
  755 static  int handle_written_filepage(struct pagedep *, struct buf *, int);
  756 static  int handle_written_sbdep(struct sbdep *, struct buf *);
  757 static  void initiate_write_sbdep(struct sbdep *);
  758 static  void diradd_inode_written(struct diradd *, struct inodedep *);
  759 static  int handle_written_indirdep(struct indirdep *, struct buf *,
  760             struct buf**, int);
  761 static  int handle_written_inodeblock(struct inodedep *, struct buf *, int);
  762 static  int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
  763             uint8_t *);
  764 static  int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
  765 static  void handle_written_jaddref(struct jaddref *);
  766 static  void handle_written_jremref(struct jremref *);
  767 static  void handle_written_jseg(struct jseg *, struct buf *);
  768 static  void handle_written_jnewblk(struct jnewblk *);
  769 static  void handle_written_jblkdep(struct jblkdep *);
  770 static  void handle_written_jfreefrag(struct jfreefrag *);
  771 static  void complete_jseg(struct jseg *);
  772 static  void complete_jsegs(struct jseg *);
  773 static  void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
  774 static  void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
  775 static  void jremref_write(struct jremref *, struct jseg *, uint8_t *);
  776 static  void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
  777 static  void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
  778 static  void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
  779 static  void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
  780 static  void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
  781 static  void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
  782 static  inline void inoref_write(struct inoref *, struct jseg *,
  783             struct jrefrec *);
  784 static  void handle_allocdirect_partdone(struct allocdirect *,
  785             struct workhead *);
  786 static  struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
  787             struct workhead *);
  788 static  void indirdep_complete(struct indirdep *);
  789 static  int indirblk_lookup(struct mount *, ufs2_daddr_t);
  790 static  void indirblk_insert(struct freework *);
  791 static  void indirblk_remove(struct freework *);
  792 static  void handle_allocindir_partdone(struct allocindir *);
  793 static  void initiate_write_filepage(struct pagedep *, struct buf *);
  794 static  void initiate_write_indirdep(struct indirdep*, struct buf *);
  795 static  void handle_written_mkdir(struct mkdir *, int);
  796 static  int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
  797             uint8_t *);
  798 static  void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
  799 static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
  800 static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
  801 static  void handle_workitem_freefile(struct freefile *);
  802 static  int handle_workitem_remove(struct dirrem *, int);
  803 static  struct dirrem *newdirrem(struct buf *, struct inode *,
  804             struct inode *, int, struct dirrem **);
  805 static  struct indirdep *indirdep_lookup(struct mount *, struct inode *,
  806             struct buf *);
  807 static  void cancel_indirdep(struct indirdep *, struct buf *,
  808             struct freeblks *);
  809 static  void free_indirdep(struct indirdep *);
  810 static  void free_diradd(struct diradd *, struct workhead *);
  811 static  void merge_diradd(struct inodedep *, struct diradd *);
  812 static  void complete_diradd(struct diradd *);
  813 static  struct diradd *diradd_lookup(struct pagedep *, int);
  814 static  struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
  815             struct jremref *);
  816 static  struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
  817             struct jremref *);
  818 static  void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
  819             struct jremref *, struct jremref *);
  820 static  void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
  821             struct jremref *);
  822 static  void cancel_allocindir(struct allocindir *, struct buf *bp,
  823             struct freeblks *, int);
  824 static  int setup_trunc_indir(struct freeblks *, struct inode *,
  825             ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
  826 static  void complete_trunc_indir(struct freework *);
  827 static  void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
  828             int);
  829 static  void complete_mkdir(struct mkdir *);
  830 static  void free_newdirblk(struct newdirblk *);
  831 static  void free_jremref(struct jremref *);
  832 static  void free_jaddref(struct jaddref *);
  833 static  void free_jsegdep(struct jsegdep *);
  834 static  void free_jsegs(struct jblocks *);
  835 static  void rele_jseg(struct jseg *);
  836 static  void free_jseg(struct jseg *, struct jblocks *);
  837 static  void free_jnewblk(struct jnewblk *);
  838 static  void free_jblkdep(struct jblkdep *);
  839 static  void free_jfreefrag(struct jfreefrag *);
  840 static  void free_freedep(struct freedep *);
  841 static  void journal_jremref(struct dirrem *, struct jremref *,
  842             struct inodedep *);
  843 static  void cancel_jnewblk(struct jnewblk *, struct workhead *);
  844 static  int cancel_jaddref(struct jaddref *, struct inodedep *,
  845             struct workhead *);
  846 static  void cancel_jfreefrag(struct jfreefrag *);
  847 static  inline void setup_freedirect(struct freeblks *, struct inode *,
  848             int, int);
  849 static  inline void setup_freeext(struct freeblks *, struct inode *, int, int);
  850 static  inline void setup_freeindir(struct freeblks *, struct inode *, int,
  851             ufs_lbn_t, int);
  852 static  inline struct freeblks *newfreeblks(struct mount *, struct inode *);
  853 static  void freeblks_free(struct ufsmount *, struct freeblks *, int);
  854 static  void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
  855 static  ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
  856 static  int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
  857 static  void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
  858             int, int);
  859 static  void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
  860 static  int cancel_pagedep(struct pagedep *, struct freeblks *, int);
  861 static  int deallocate_dependencies(struct buf *, struct freeblks *, int);
  862 static  void newblk_freefrag(struct newblk*);
  863 static  void free_newblk(struct newblk *);
  864 static  void cancel_allocdirect(struct allocdirectlst *,
  865             struct allocdirect *, struct freeblks *);
  866 static  int check_inode_unwritten(struct inodedep *);
  867 static  int free_inodedep(struct inodedep *);
  868 static  void freework_freeblock(struct freework *);
  869 static  void freework_enqueue(struct freework *);
  870 static  int handle_workitem_freeblocks(struct freeblks *, int);
  871 static  int handle_complete_freeblocks(struct freeblks *, int);
  872 static  void handle_workitem_indirblk(struct freework *);
  873 static  void handle_written_freework(struct freework *);
  874 static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
  875 static  struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
  876             struct workhead *);
  877 static  struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
  878             struct inodedep *, struct allocindir *, ufs_lbn_t);
  879 static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
  880             ufs2_daddr_t, ufs_lbn_t);
  881 static  void handle_workitem_freefrag(struct freefrag *);
  882 static  struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
  883             ufs_lbn_t);
  884 static  void allocdirect_merge(struct allocdirectlst *,
  885             struct allocdirect *, struct allocdirect *);
  886 static  struct freefrag *allocindir_merge(struct allocindir *,
  887             struct allocindir *);
  888 static  int bmsafemap_find(struct bmsafemap_hashhead *, int,
  889             struct bmsafemap **);
  890 static  struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
  891             int cg, struct bmsafemap *);
  892 static  int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
  893             struct newblk **);
  894 static  int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
  895 static  int inodedep_find(struct inodedep_hashhead *, ino_t,
  896             struct inodedep **);
  897 static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
  898 static  int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
  899             int, struct pagedep **);
  900 static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
  901             struct pagedep **);
  902 static  void pause_timer(void *);
  903 static  int request_cleanup(struct mount *, int);
  904 static  int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
  905 static  void schedule_cleanup(struct mount *);
  906 static void softdep_ast_cleanup_proc(struct thread *);
  907 static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
  908 static  int process_worklist_item(struct mount *, int, int);
  909 static  void process_removes(struct vnode *);
  910 static  void process_truncates(struct vnode *);
  911 static  void jwork_move(struct workhead *, struct workhead *);
  912 static  void jwork_insert(struct workhead *, struct jsegdep *);
  913 static  void add_to_worklist(struct worklist *, int);
  914 static  void wake_worklist(struct worklist *);
  915 static  void wait_worklist(struct worklist *, char *);
  916 static  void remove_from_worklist(struct worklist *);
  917 static  void softdep_flush(void *);
  918 static  void softdep_flushjournal(struct mount *);
  919 static  int softdep_speedup(struct ufsmount *);
  920 static  void worklist_speedup(struct mount *);
  921 static  int journal_mount(struct mount *, struct fs *, struct ucred *);
  922 static  void journal_unmount(struct ufsmount *);
  923 static  int journal_space(struct ufsmount *, int);
  924 static  void journal_suspend(struct ufsmount *);
  925 static  int journal_unsuspend(struct ufsmount *ump);
  926 static  void softdep_prelink(struct vnode *, struct vnode *);
  927 static  void add_to_journal(struct worklist *);
  928 static  void remove_from_journal(struct worklist *);
  929 static  bool softdep_excess_items(struct ufsmount *, int);
  930 static  void softdep_process_journal(struct mount *, struct worklist *, int);
  931 static  struct jremref *newjremref(struct dirrem *, struct inode *,
  932             struct inode *ip, off_t, nlink_t);
  933 static  struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
  934             uint16_t);
  935 static  inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
  936             uint16_t);
  937 static  inline struct jsegdep *inoref_jseg(struct inoref *);
  938 static  struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
  939 static  struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
  940             ufs2_daddr_t, int);
  941 static  void adjust_newfreework(struct freeblks *, int);
  942 static  struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
  943 static  void move_newblock_dep(struct jaddref *, struct inodedep *);
  944 static  void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
  945 static  struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
  946             ufs2_daddr_t, long, ufs_lbn_t);
  947 static  struct freework *newfreework(struct ufsmount *, struct freeblks *,
  948             struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
  949 static  int jwait(struct worklist *, int);
  950 static  struct inodedep *inodedep_lookup_ip(struct inode *);
  951 static  int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
  952 static  struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
  953 static  void handle_jwork(struct workhead *);
  954 static  struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
  955             struct mkdir **);
  956 static  struct jblocks *jblocks_create(void);
  957 static  ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
  958 static  void jblocks_free(struct jblocks *, struct mount *, int);
  959 static  void jblocks_destroy(struct jblocks *);
  960 static  void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
  961 
  962 /*
  963  * Exported softdep operations.
  964  */
  965 static  void softdep_disk_io_initiation(struct buf *);
  966 static  void softdep_disk_write_complete(struct buf *);
  967 static  void softdep_deallocate_dependencies(struct buf *);
  968 static  int softdep_count_dependencies(struct buf *bp, int);
  969 
  970 /*
  971  * Global lock over all of soft updates.
  972  */
  973 static struct mtx lk;
  974 MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
  975 
  976 #define ACQUIRE_GBLLOCK(lk)     mtx_lock(lk)
  977 #define FREE_GBLLOCK(lk)        mtx_unlock(lk)
  978 #define GBLLOCK_OWNED(lk)       mtx_assert((lk), MA_OWNED)
  979 
  980 /*
  981  * Per-filesystem soft-updates locking.
  982  */
  983 #define LOCK_PTR(ump)           (&(ump)->um_softdep->sd_fslock)
  984 #define TRY_ACQUIRE_LOCK(ump)   rw_try_wlock(&(ump)->um_softdep->sd_fslock)
  985 #define ACQUIRE_LOCK(ump)       rw_wlock(&(ump)->um_softdep->sd_fslock)
  986 #define FREE_LOCK(ump)          rw_wunlock(&(ump)->um_softdep->sd_fslock)
  987 #define LOCK_OWNED(ump)         rw_assert(&(ump)->um_softdep->sd_fslock, \
  988                                     RA_WLOCKED)
  989 
  990 #define BUF_AREC(bp)            lockallowrecurse(&(bp)->b_lock)
  991 #define BUF_NOREC(bp)           lockdisablerecurse(&(bp)->b_lock)
  992 
  993 /*
  994  * Worklist queue management.
  995  * These routines require that the lock be held.
  996  */
  997 #ifndef /* NOT */ DEBUG
  998 #define WORKLIST_INSERT(head, item) do {        \
  999         (item)->wk_state |= ONWORKLIST;         \
 1000         LIST_INSERT_HEAD(head, item, wk_list);  \
 1001 } while (0)
 1002 #define WORKLIST_REMOVE(item) do {              \
 1003         (item)->wk_state &= ~ONWORKLIST;        \
 1004         LIST_REMOVE(item, wk_list);             \
 1005 } while (0)
 1006 #define WORKLIST_INSERT_UNLOCKED        WORKLIST_INSERT
 1007 #define WORKLIST_REMOVE_UNLOCKED        WORKLIST_REMOVE
 1008 
 1009 #else /* DEBUG */
 1010 static  void worklist_insert(struct workhead *, struct worklist *, int);
 1011 static  void worklist_remove(struct worklist *, int);
 1012 
 1013 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
 1014 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
 1015 #define WORKLIST_REMOVE(item) worklist_remove(item, 1)
 1016 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
 1017 
 1018 static void
 1019 worklist_insert(head, item, locked)
 1020         struct workhead *head;
 1021         struct worklist *item;
 1022         int locked;
 1023 {
 1024 
 1025         if (locked)
 1026                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
 1027         if (item->wk_state & ONWORKLIST)
 1028                 panic("worklist_insert: %p %s(0x%X) already on list",
 1029                     item, TYPENAME(item->wk_type), item->wk_state);
 1030         item->wk_state |= ONWORKLIST;
 1031         LIST_INSERT_HEAD(head, item, wk_list);
 1032 }
 1033 
 1034 static void
 1035 worklist_remove(item, locked)
 1036         struct worklist *item;
 1037         int locked;
 1038 {
 1039 
 1040         if (locked)
 1041                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
 1042         if ((item->wk_state & ONWORKLIST) == 0)
 1043                 panic("worklist_remove: %p %s(0x%X) not on list",
 1044                     item, TYPENAME(item->wk_type), item->wk_state);
 1045         item->wk_state &= ~ONWORKLIST;
 1046         LIST_REMOVE(item, wk_list);
 1047 }
 1048 #endif /* DEBUG */
 1049 
 1050 /*
 1051  * Merge two jsegdeps keeping only the oldest one as newer references
 1052  * can't be discarded until after older references.
 1053  */
 1054 static inline struct jsegdep *
 1055 jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
 1056 {
 1057         struct jsegdep *swp;
 1058 
 1059         if (two == NULL)
 1060                 return (one);
 1061 
 1062         if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
 1063                 swp = one;
 1064                 one = two;
 1065                 two = swp;
 1066         }
 1067         WORKLIST_REMOVE(&two->jd_list);
 1068         free_jsegdep(two);
 1069 
 1070         return (one);
 1071 }
 1072 
 1073 /*
 1074  * If two freedeps are compatible free one to reduce list size.
 1075  */
 1076 static inline struct freedep *
 1077 freedep_merge(struct freedep *one, struct freedep *two)
 1078 {
 1079         if (two == NULL)
 1080                 return (one);
 1081 
 1082         if (one->fd_freework == two->fd_freework) {
 1083                 WORKLIST_REMOVE(&two->fd_list);
 1084                 free_freedep(two);
 1085         }
 1086         return (one);
 1087 }
 1088 
 1089 /*
 1090  * Move journal work from one list to another.  Duplicate freedeps and
 1091  * jsegdeps are coalesced to keep the lists as small as possible.
 1092  */
 1093 static void
 1094 jwork_move(dst, src)
 1095         struct workhead *dst;
 1096         struct workhead *src;
 1097 {
 1098         struct freedep *freedep;
 1099         struct jsegdep *jsegdep;
 1100         struct worklist *wkn;
 1101         struct worklist *wk;
 1102 
 1103         KASSERT(dst != src,
 1104             ("jwork_move: dst == src"));
 1105         freedep = NULL;
 1106         jsegdep = NULL;
 1107         LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
 1108                 if (wk->wk_type == D_JSEGDEP)
 1109                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
 1110                 else if (wk->wk_type == D_FREEDEP)
 1111                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
 1112         }
 1113 
 1114         while ((wk = LIST_FIRST(src)) != NULL) {
 1115                 WORKLIST_REMOVE(wk);
 1116                 WORKLIST_INSERT(dst, wk);
 1117                 if (wk->wk_type == D_JSEGDEP) {
 1118                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
 1119                         continue;
 1120                 }
 1121                 if (wk->wk_type == D_FREEDEP)
 1122                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
 1123         }
 1124 }
 1125 
 1126 static void
 1127 jwork_insert(dst, jsegdep)
 1128         struct workhead *dst;
 1129         struct jsegdep *jsegdep;
 1130 {
 1131         struct jsegdep *jsegdepn;
 1132         struct worklist *wk;
 1133 
 1134         LIST_FOREACH(wk, dst, wk_list)
 1135                 if (wk->wk_type == D_JSEGDEP)
 1136                         break;
 1137         if (wk == NULL) {
 1138                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
 1139                 return;
 1140         }
 1141         jsegdepn = WK_JSEGDEP(wk);
 1142         if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
 1143                 WORKLIST_REMOVE(wk);
 1144                 free_jsegdep(jsegdepn);
 1145                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
 1146         } else
 1147                 free_jsegdep(jsegdep);
 1148 }
 1149 
 1150 /*
 1151  * Routines for tracking and managing workitems.
 1152  */
 1153 static  void workitem_free(struct worklist *, int);
 1154 static  void workitem_alloc(struct worklist *, int, struct mount *);
 1155 static  void workitem_reassign(struct worklist *, int);
 1156 
 1157 #define WORKITEM_FREE(item, type) \
 1158         workitem_free((struct worklist *)(item), (type))
 1159 #define WORKITEM_REASSIGN(item, type) \
 1160         workitem_reassign((struct worklist *)(item), (type))
 1161 
 1162 static void
 1163 workitem_free(item, type)
 1164         struct worklist *item;
 1165         int type;
 1166 {
 1167         struct ufsmount *ump;
 1168 
 1169 #ifdef DEBUG
 1170         if (item->wk_state & ONWORKLIST)
 1171                 panic("workitem_free: %s(0x%X) still on list",
 1172                     TYPENAME(item->wk_type), item->wk_state);
 1173         if (item->wk_type != type && type != D_NEWBLK)
 1174                 panic("workitem_free: type mismatch %s != %s",
 1175                     TYPENAME(item->wk_type), TYPENAME(type));
 1176 #endif
 1177         if (item->wk_state & IOWAITING)
 1178                 wakeup(item);
 1179         ump = VFSTOUFS(item->wk_mp);
 1180         LOCK_OWNED(ump);
 1181         KASSERT(ump->softdep_deps > 0,
 1182             ("workitem_free: %s: softdep_deps going negative",
 1183             ump->um_fs->fs_fsmnt));
 1184         if (--ump->softdep_deps == 0 && ump->softdep_req)
 1185                 wakeup(&ump->softdep_deps);
 1186         KASSERT(dep_current[item->wk_type] > 0,
 1187             ("workitem_free: %s: dep_current[%s] going negative",
 1188             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 1189         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
 1190             ("workitem_free: %s: softdep_curdeps[%s] going negative",
 1191             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 1192         atomic_subtract_long(&dep_current[item->wk_type], 1);
 1193         ump->softdep_curdeps[item->wk_type] -= 1;
 1194         free(item, DtoM(type));
 1195 }
 1196 
 1197 static void
 1198 workitem_alloc(item, type, mp)
 1199         struct worklist *item;
 1200         int type;
 1201         struct mount *mp;
 1202 {
 1203         struct ufsmount *ump;
 1204 
 1205         item->wk_type = type;
 1206         item->wk_mp = mp;
 1207         item->wk_state = 0;
 1208 
 1209         ump = VFSTOUFS(mp);
 1210         ACQUIRE_GBLLOCK(&lk);
 1211         dep_current[type]++;
 1212         if (dep_current[type] > dep_highuse[type])
 1213                 dep_highuse[type] = dep_current[type];
 1214         dep_total[type]++;
 1215         FREE_GBLLOCK(&lk);
 1216         ACQUIRE_LOCK(ump);
 1217         ump->softdep_curdeps[type] += 1;
 1218         ump->softdep_deps++;
 1219         ump->softdep_accdeps++;
 1220         FREE_LOCK(ump);
 1221 }
 1222 
 1223 static void
 1224 workitem_reassign(item, newtype)
 1225         struct worklist *item;
 1226         int newtype;
 1227 {
 1228         struct ufsmount *ump;
 1229 
 1230         ump = VFSTOUFS(item->wk_mp);
 1231         LOCK_OWNED(ump);
 1232         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
 1233             ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
 1234             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 1235         ump->softdep_curdeps[item->wk_type] -= 1;
 1236         ump->softdep_curdeps[newtype] += 1;
 1237         KASSERT(dep_current[item->wk_type] > 0,
 1238             ("workitem_reassign: %s: dep_current[%s] going negative",
 1239             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 1240         ACQUIRE_GBLLOCK(&lk);
 1241         dep_current[newtype]++;
 1242         dep_current[item->wk_type]--;
 1243         if (dep_current[newtype] > dep_highuse[newtype])
 1244                 dep_highuse[newtype] = dep_current[newtype];
 1245         dep_total[newtype]++;
 1246         FREE_GBLLOCK(&lk);
 1247         item->wk_type = newtype;
 1248 }
 1249 
 1250 /*
 1251  * Workitem queue management
 1252  */
 1253 static int max_softdeps;        /* maximum number of structs before slowdown */
 1254 static int tickdelay = 2;       /* number of ticks to pause during slowdown */
 1255 static int proc_waiting;        /* tracks whether we have a timeout posted */
 1256 static int *stat_countp;        /* statistic to count in proc_waiting timeout */
 1257 static struct callout softdep_callout;
 1258 static int req_clear_inodedeps; /* syncer process flush some inodedeps */
 1259 static int req_clear_remove;    /* syncer process flush some freeblks */
 1260 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
 1261 
 1262 /*
 1263  * runtime statistics
 1264  */
 1265 static int stat_flush_threads;  /* number of softdep flushing threads */
 1266 static int stat_worklist_push;  /* number of worklist cleanups */
 1267 static int stat_blk_limit_push; /* number of times block limit neared */
 1268 static int stat_ino_limit_push; /* number of times inode limit neared */
 1269 static int stat_blk_limit_hit;  /* number of times block slowdown imposed */
 1270 static int stat_ino_limit_hit;  /* number of times inode slowdown imposed */
 1271 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
 1272 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
 1273 static int stat_inode_bitmap;   /* bufs redirtied as inode bitmap not written */
 1274 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
 1275 static int stat_dir_entry;      /* bufs redirtied as dir entry cannot write */
 1276 static int stat_jaddref;        /* bufs redirtied as ino bitmap can not write */
 1277 static int stat_jnewblk;        /* bufs redirtied as blk bitmap can not write */
 1278 static int stat_journal_min;    /* Times hit journal min threshold */
 1279 static int stat_journal_low;    /* Times hit journal low threshold */
 1280 static int stat_journal_wait;   /* Times blocked in jwait(). */
 1281 static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */
 1282 static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */
 1283 static int stat_jwait_inode;    /* Times blocked in jwait() for inodes. */
 1284 static int stat_jwait_newblk;   /* Times blocked in jwait() for newblks. */
 1285 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
 1286 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
 1287 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
 1288 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
 1289 static int stat_cleanup_failures; /* Number of cleanup requests that failed */
 1290 static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
 1291 
 1292 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
 1293     &max_softdeps, 0, "");
 1294 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
 1295     &tickdelay, 0, "");
 1296 SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
 1297     &stat_flush_threads, 0, "");
 1298 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
 1299     &stat_worklist_push, 0,"");
 1300 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
 1301     &stat_blk_limit_push, 0,"");
 1302 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
 1303     &stat_ino_limit_push, 0,"");
 1304 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
 1305     &stat_blk_limit_hit, 0, "");
 1306 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
 1307     &stat_ino_limit_hit, 0, "");
 1308 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
 1309     &stat_sync_limit_hit, 0, "");
 1310 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
 1311     &stat_indir_blk_ptrs, 0, "");
 1312 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
 1313     &stat_inode_bitmap, 0, "");
 1314 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
 1315     &stat_direct_blk_ptrs, 0, "");
 1316 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
 1317     &stat_dir_entry, 0, "");
 1318 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
 1319     &stat_jaddref, 0, "");
 1320 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
 1321     &stat_jnewblk, 0, "");
 1322 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
 1323     &stat_journal_low, 0, "");
 1324 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
 1325     &stat_journal_min, 0, "");
 1326 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
 1327     &stat_journal_wait, 0, "");
 1328 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
 1329     &stat_jwait_filepage, 0, "");
 1330 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
 1331     &stat_jwait_freeblks, 0, "");
 1332 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
 1333     &stat_jwait_inode, 0, "");
 1334 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
 1335     &stat_jwait_newblk, 0, "");
 1336 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
 1337     &stat_cleanup_blkrequests, 0, "");
 1338 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
 1339     &stat_cleanup_inorequests, 0, "");
 1340 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
 1341     &stat_cleanup_high_delay, 0, "");
 1342 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
 1343     &stat_cleanup_retries, 0, "");
 1344 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
 1345     &stat_cleanup_failures, 0, "");
 1346 SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
 1347     &softdep_flushcache, 0, "");
 1348 SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
 1349     &stat_emptyjblocks, 0, "");
 1350 
 1351 SYSCTL_DECL(_vfs_ffs);
 1352 
 1353 /* Whether to recompute the summary at mount time */
 1354 static int compute_summary_at_mount = 0;
 1355 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
 1356            &compute_summary_at_mount, 0, "Recompute summary at mount");
 1357 static int print_threads = 0;
 1358 SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
 1359     &print_threads, 0, "Notify flusher thread start/stop");
 1360 
 1361 /* List of all filesystems mounted with soft updates */
 1362 static TAILQ_HEAD(, mount_softdeps) softdepmounts;
 1363 
 1364 /*
 1365  * This function cleans the worklist for a filesystem.
 1366  * Each filesystem running with soft dependencies gets its own
 1367  * thread to run in this function. The thread is started up in
 1368  * softdep_mount and shutdown in softdep_unmount. They show up
 1369  * as part of the kernel "bufdaemon" process whose process
 1370  * entry is available in bufdaemonproc.
 1371  */
 1372 static int searchfailed;
 1373 extern struct proc *bufdaemonproc;
 1374 static void
 1375 softdep_flush(addr)
 1376         void *addr;
 1377 {
 1378         struct mount *mp;
 1379         struct thread *td;
 1380         struct ufsmount *ump;
 1381 
 1382         td = curthread;
 1383         td->td_pflags |= TDP_NORUNNINGBUF;
 1384         mp = (struct mount *)addr;
 1385         ump = VFSTOUFS(mp);
 1386         atomic_add_int(&stat_flush_threads, 1);
 1387         ACQUIRE_LOCK(ump);
 1388         ump->softdep_flags &= ~FLUSH_STARTING;
 1389         wakeup(&ump->softdep_flushtd);
 1390         FREE_LOCK(ump);
 1391         if (print_threads) {
 1392                 if (stat_flush_threads == 1)
 1393                         printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
 1394                             bufdaemonproc->p_pid);
 1395                 printf("Start thread %s\n", td->td_name);
 1396         }
 1397         for (;;) {      
 1398                 while (softdep_process_worklist(mp, 0) > 0 ||
 1399                     (MOUNTEDSUJ(mp) &&
 1400                     VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
 1401                         kthread_suspend_check();
 1402                 ACQUIRE_LOCK(ump);
 1403                 if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
 1404                         msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
 1405                             "sdflush", hz / 2);
 1406                 ump->softdep_flags &= ~FLUSH_CLEANUP;
 1407                 /*
 1408                  * Check to see if we are done and need to exit.
 1409                  */
 1410                 if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
 1411                         FREE_LOCK(ump);
 1412                         continue;
 1413                 }
 1414                 ump->softdep_flags &= ~FLUSH_EXIT;
 1415                 FREE_LOCK(ump);
 1416                 wakeup(&ump->softdep_flags);
 1417                 if (print_threads)
 1418                         printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
 1419                 atomic_subtract_int(&stat_flush_threads, 1);
 1420                 kthread_exit();
 1421                 panic("kthread_exit failed\n");
 1422         }
 1423 }
 1424 
 1425 static void
 1426 worklist_speedup(mp)
 1427         struct mount *mp;
 1428 {
 1429         struct ufsmount *ump;
 1430 
 1431         ump = VFSTOUFS(mp);
 1432         LOCK_OWNED(ump);
 1433         if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
 1434                 ump->softdep_flags |= FLUSH_CLEANUP;
 1435         wakeup(&ump->softdep_flushtd);
 1436 }
 1437 
 1438 static int
 1439 softdep_speedup(ump)
 1440         struct ufsmount *ump;
 1441 {
 1442         struct ufsmount *altump;
 1443         struct mount_softdeps *sdp;
 1444 
 1445         LOCK_OWNED(ump);
 1446         worklist_speedup(ump->um_mountp);
 1447         bd_speedup();
 1448         /*
 1449          * If we have global shortages, then we need other
 1450          * filesystems to help with the cleanup. Here we wakeup a
 1451          * flusher thread for a filesystem that is over its fair
 1452          * share of resources.
 1453          */
 1454         if (req_clear_inodedeps || req_clear_remove) {
 1455                 ACQUIRE_GBLLOCK(&lk);
 1456                 TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
 1457                         if ((altump = sdp->sd_ump) == ump)
 1458                                 continue;
 1459                         if (((req_clear_inodedeps &&
 1460                             altump->softdep_curdeps[D_INODEDEP] >
 1461                             max_softdeps / stat_flush_threads) ||
 1462                             (req_clear_remove &&
 1463                             altump->softdep_curdeps[D_DIRREM] >
 1464                             (max_softdeps / 2) / stat_flush_threads)) &&
 1465                             TRY_ACQUIRE_LOCK(altump))
 1466                                 break;
 1467                 }
 1468                 if (sdp == NULL) {
 1469                         searchfailed++;
 1470                         FREE_GBLLOCK(&lk);
 1471                 } else {
 1472                         /*
 1473                          * Move to the end of the list so we pick a
 1474                          * different one on out next try.
 1475                          */
 1476                         TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
 1477                         TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
 1478                         FREE_GBLLOCK(&lk);
 1479                         if ((altump->softdep_flags &
 1480                             (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
 1481                                 altump->softdep_flags |= FLUSH_CLEANUP;
 1482                         altump->um_softdep->sd_cleanups++;
 1483                         wakeup(&altump->softdep_flushtd);
 1484                         FREE_LOCK(altump);
 1485                 }
 1486         }
 1487         return (speedup_syncer());
 1488 }
 1489 
 1490 /*
 1491  * Add an item to the end of the work queue.
 1492  * This routine requires that the lock be held.
 1493  * This is the only routine that adds items to the list.
 1494  * The following routine is the only one that removes items
 1495  * and does so in order from first to last.
 1496  */
 1497 
 1498 #define WK_HEAD         0x0001  /* Add to HEAD. */
 1499 #define WK_NODELAY      0x0002  /* Process immediately. */
 1500 
 1501 static void
 1502 add_to_worklist(wk, flags)
 1503         struct worklist *wk;
 1504         int flags;
 1505 {
 1506         struct ufsmount *ump;
 1507 
 1508         ump = VFSTOUFS(wk->wk_mp);
 1509         LOCK_OWNED(ump);
 1510         if (wk->wk_state & ONWORKLIST)
 1511                 panic("add_to_worklist: %s(0x%X) already on list",
 1512                     TYPENAME(wk->wk_type), wk->wk_state);
 1513         wk->wk_state |= ONWORKLIST;
 1514         if (ump->softdep_on_worklist == 0) {
 1515                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 1516                 ump->softdep_worklist_tail = wk;
 1517         } else if (flags & WK_HEAD) {
 1518                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 1519         } else {
 1520                 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
 1521                 ump->softdep_worklist_tail = wk;
 1522         }
 1523         ump->softdep_on_worklist += 1;
 1524         if (flags & WK_NODELAY)
 1525                 worklist_speedup(wk->wk_mp);
 1526 }
 1527 
 1528 /*
 1529  * Remove the item to be processed. If we are removing the last
 1530  * item on the list, we need to recalculate the tail pointer.
 1531  */
 1532 static void
 1533 remove_from_worklist(wk)
 1534         struct worklist *wk;
 1535 {
 1536         struct ufsmount *ump;
 1537 
 1538         ump = VFSTOUFS(wk->wk_mp);
 1539         if (ump->softdep_worklist_tail == wk)
 1540                 ump->softdep_worklist_tail =
 1541                     (struct worklist *)wk->wk_list.le_prev;
 1542         WORKLIST_REMOVE(wk);
 1543         ump->softdep_on_worklist -= 1;
 1544 }
 1545 
 1546 static void
 1547 wake_worklist(wk)
 1548         struct worklist *wk;
 1549 {
 1550         if (wk->wk_state & IOWAITING) {
 1551                 wk->wk_state &= ~IOWAITING;
 1552                 wakeup(wk);
 1553         }
 1554 }
 1555 
 1556 static void
 1557 wait_worklist(wk, wmesg)
 1558         struct worklist *wk;
 1559         char *wmesg;
 1560 {
 1561         struct ufsmount *ump;
 1562 
 1563         ump = VFSTOUFS(wk->wk_mp);
 1564         wk->wk_state |= IOWAITING;
 1565         msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
 1566 }
 1567 
 1568 /*
 1569  * Process that runs once per second to handle items in the background queue.
 1570  *
 1571  * Note that we ensure that everything is done in the order in which they
 1572  * appear in the queue. The code below depends on this property to ensure
 1573  * that blocks of a file are freed before the inode itself is freed. This
 1574  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 1575  * until all the old ones have been purged from the dependency lists.
 1576  */
 1577 static int 
 1578 softdep_process_worklist(mp, full)
 1579         struct mount *mp;
 1580         int full;
 1581 {
 1582         int cnt, matchcnt;
 1583         struct ufsmount *ump;
 1584         long starttime;
 1585 
 1586         KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
 1587         if (MOUNTEDSOFTDEP(mp) == 0)
 1588                 return (0);
 1589         matchcnt = 0;
 1590         ump = VFSTOUFS(mp);
 1591         ACQUIRE_LOCK(ump);
 1592         starttime = time_second;
 1593         softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
 1594         check_clear_deps(mp);
 1595         while (ump->softdep_on_worklist > 0) {
 1596                 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
 1597                         break;
 1598                 else
 1599                         matchcnt += cnt;
 1600                 check_clear_deps(mp);
 1601                 /*
 1602                  * We do not generally want to stop for buffer space, but if
 1603                  * we are really being a buffer hog, we will stop and wait.
 1604                  */
 1605                 if (should_yield()) {
 1606                         FREE_LOCK(ump);
 1607                         kern_yield(PRI_USER);
 1608                         bwillwrite();
 1609                         ACQUIRE_LOCK(ump);
 1610                 }
 1611                 /*
 1612                  * Never allow processing to run for more than one
 1613                  * second. This gives the syncer thread the opportunity
 1614                  * to pause if appropriate.
 1615                  */
 1616                 if (!full && starttime != time_second)
 1617                         break;
 1618         }
 1619         if (full == 0)
 1620                 journal_unsuspend(ump);
 1621         FREE_LOCK(ump);
 1622         return (matchcnt);
 1623 }
 1624 
 1625 /*
 1626  * Process all removes associated with a vnode if we are running out of
 1627  * journal space.  Any other process which attempts to flush these will
 1628  * be unable as we have the vnodes locked.
 1629  */
 1630 static void
 1631 process_removes(vp)
 1632         struct vnode *vp;
 1633 {
 1634         struct inodedep *inodedep;
 1635         struct dirrem *dirrem;
 1636         struct ufsmount *ump;
 1637         struct mount *mp;
 1638         ino_t inum;
 1639 
 1640         mp = vp->v_mount;
 1641         ump = VFSTOUFS(mp);
 1642         LOCK_OWNED(ump);
 1643         inum = VTOI(vp)->i_number;
 1644         for (;;) {
 1645 top:
 1646                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
 1647                         return;
 1648                 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
 1649                         /*
 1650                          * If another thread is trying to lock this vnode
 1651                          * it will fail but we must wait for it to do so
 1652                          * before we can proceed.
 1653                          */
 1654                         if (dirrem->dm_state & INPROGRESS) {
 1655                                 wait_worklist(&dirrem->dm_list, "pwrwait");
 1656                                 goto top;
 1657                         }
 1658                         if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == 
 1659                             (COMPLETE | ONWORKLIST))
 1660                                 break;
 1661                 }
 1662                 if (dirrem == NULL)
 1663                         return;
 1664                 remove_from_worklist(&dirrem->dm_list);
 1665                 FREE_LOCK(ump);
 1666                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
 1667                         panic("process_removes: suspended filesystem");
 1668                 handle_workitem_remove(dirrem, 0);
 1669                 vn_finished_secondary_write(mp);
 1670                 ACQUIRE_LOCK(ump);
 1671         }
 1672 }
 1673 
 1674 /*
 1675  * Process all truncations associated with a vnode if we are running out
 1676  * of journal space.  This is called when the vnode lock is already held
 1677  * and no other process can clear the truncation.  This function returns
 1678  * a value greater than zero if it did any work.
 1679  */
 1680 static void
 1681 process_truncates(vp)
 1682         struct vnode *vp;
 1683 {
 1684         struct inodedep *inodedep;
 1685         struct freeblks *freeblks;
 1686         struct ufsmount *ump;
 1687         struct mount *mp;
 1688         ino_t inum;
 1689         int cgwait;
 1690 
 1691         mp = vp->v_mount;
 1692         ump = VFSTOUFS(mp);
 1693         LOCK_OWNED(ump);
 1694         inum = VTOI(vp)->i_number;
 1695         for (;;) {
 1696                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
 1697                         return;
 1698                 cgwait = 0;
 1699                 TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
 1700                         /* Journal entries not yet written.  */
 1701                         if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
 1702                                 jwait(&LIST_FIRST(
 1703                                     &freeblks->fb_jblkdephd)->jb_list,
 1704                                     MNT_WAIT);
 1705                                 break;
 1706                         }
 1707                         /* Another thread is executing this item. */
 1708                         if (freeblks->fb_state & INPROGRESS) {
 1709                                 wait_worklist(&freeblks->fb_list, "ptrwait");
 1710                                 break;
 1711                         }
 1712                         /* Freeblks is waiting on a inode write. */
 1713                         if ((freeblks->fb_state & COMPLETE) == 0) {
 1714                                 FREE_LOCK(ump);
 1715                                 ffs_update(vp, 1);
 1716                                 ACQUIRE_LOCK(ump);
 1717                                 break;
 1718                         }
 1719                         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
 1720                             (ALLCOMPLETE | ONWORKLIST)) {
 1721                                 remove_from_worklist(&freeblks->fb_list);
 1722                                 freeblks->fb_state |= INPROGRESS;
 1723                                 FREE_LOCK(ump);
 1724                                 if (vn_start_secondary_write(NULL, &mp,
 1725                                     V_NOWAIT))
 1726                                         panic("process_truncates: "
 1727                                             "suspended filesystem");
 1728                                 handle_workitem_freeblocks(freeblks, 0);
 1729                                 vn_finished_secondary_write(mp);
 1730                                 ACQUIRE_LOCK(ump);
 1731                                 break;
 1732                         }
 1733                         if (freeblks->fb_cgwait)
 1734                                 cgwait++;
 1735                 }
 1736                 if (cgwait) {
 1737                         FREE_LOCK(ump);
 1738                         sync_cgs(mp, MNT_WAIT);
 1739                         ffs_sync_snap(mp, MNT_WAIT);
 1740                         ACQUIRE_LOCK(ump);
 1741                         continue;
 1742                 }
 1743                 if (freeblks == NULL)
 1744                         break;
 1745         }
 1746         return;
 1747 }
 1748 
 1749 /*
 1750  * Process one item on the worklist.
 1751  */
 1752 static int
 1753 process_worklist_item(mp, target, flags)
 1754         struct mount *mp;
 1755         int target;
 1756         int flags;
 1757 {
 1758         struct worklist sentinel;
 1759         struct worklist *wk;
 1760         struct ufsmount *ump;
 1761         int matchcnt;
 1762         int error;
 1763 
 1764         KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
 1765         /*
 1766          * If we are being called because of a process doing a
 1767          * copy-on-write, then it is not safe to write as we may
 1768          * recurse into the copy-on-write routine.
 1769          */
 1770         if (curthread->td_pflags & TDP_COWINPROGRESS)
 1771                 return (-1);
 1772         PHOLD(curproc); /* Don't let the stack go away. */
 1773         ump = VFSTOUFS(mp);
 1774         LOCK_OWNED(ump);
 1775         matchcnt = 0;
 1776         sentinel.wk_mp = NULL;
 1777         sentinel.wk_type = D_SENTINEL;
 1778         LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
 1779         for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
 1780             wk = LIST_NEXT(&sentinel, wk_list)) {
 1781                 if (wk->wk_type == D_SENTINEL) {
 1782                         LIST_REMOVE(&sentinel, wk_list);
 1783                         LIST_INSERT_AFTER(wk, &sentinel, wk_list);
 1784                         continue;
 1785                 }
 1786                 if (wk->wk_state & INPROGRESS)
 1787                         panic("process_worklist_item: %p already in progress.",
 1788                             wk);
 1789                 wk->wk_state |= INPROGRESS;
 1790                 remove_from_worklist(wk);
 1791                 FREE_LOCK(ump);
 1792                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
 1793                         panic("process_worklist_item: suspended filesystem");
 1794                 switch (wk->wk_type) {
 1795                 case D_DIRREM:
 1796                         /* removal of a directory entry */
 1797                         error = handle_workitem_remove(WK_DIRREM(wk), flags);
 1798                         break;
 1799 
 1800                 case D_FREEBLKS:
 1801                         /* releasing blocks and/or fragments from a file */
 1802                         error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
 1803                             flags);
 1804                         break;
 1805 
 1806                 case D_FREEFRAG:
 1807                         /* releasing a fragment when replaced as a file grows */
 1808                         handle_workitem_freefrag(WK_FREEFRAG(wk));
 1809                         error = 0;
 1810                         break;
 1811 
 1812                 case D_FREEFILE:
 1813                         /* releasing an inode when its link count drops to 0 */
 1814                         handle_workitem_freefile(WK_FREEFILE(wk));
 1815                         error = 0;
 1816                         break;
 1817 
 1818                 default:
 1819                         panic("%s_process_worklist: Unknown type %s",
 1820                             "softdep", TYPENAME(wk->wk_type));
 1821                         /* NOTREACHED */
 1822                 }
 1823                 vn_finished_secondary_write(mp);
 1824                 ACQUIRE_LOCK(ump);
 1825                 if (error == 0) {
 1826                         if (++matchcnt == target)
 1827                                 break;
 1828                         continue;
 1829                 }
 1830                 /*
 1831                  * We have to retry the worklist item later.  Wake up any
 1832                  * waiters who may be able to complete it immediately and
 1833                  * add the item back to the head so we don't try to execute
 1834                  * it again.
 1835                  */
 1836                 wk->wk_state &= ~INPROGRESS;
 1837                 wake_worklist(wk);
 1838                 add_to_worklist(wk, WK_HEAD);
 1839         }
 1840         /* Sentinal could've become the tail from remove_from_worklist. */
 1841         if (ump->softdep_worklist_tail == &sentinel)
 1842                 ump->softdep_worklist_tail =
 1843                     (struct worklist *)sentinel.wk_list.le_prev;
 1844         LIST_REMOVE(&sentinel, wk_list);
 1845         PRELE(curproc);
 1846         return (matchcnt);
 1847 }
 1848 
 1849 /*
 1850  * Move dependencies from one buffer to another.
 1851  */
 1852 int
 1853 softdep_move_dependencies(oldbp, newbp)
 1854         struct buf *oldbp;
 1855         struct buf *newbp;
 1856 {
 1857         struct worklist *wk, *wktail;
 1858         struct ufsmount *ump;
 1859         int dirty;
 1860 
 1861         if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
 1862                 return (0);
 1863         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
 1864             ("softdep_move_dependencies called on non-softdep filesystem"));
 1865         dirty = 0;
 1866         wktail = NULL;
 1867         ump = VFSTOUFS(wk->wk_mp);
 1868         ACQUIRE_LOCK(ump);
 1869         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
 1870                 LIST_REMOVE(wk, wk_list);
 1871                 if (wk->wk_type == D_BMSAFEMAP &&
 1872                     bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
 1873                         dirty = 1;
 1874                 if (wktail == NULL)
 1875                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
 1876                 else
 1877                         LIST_INSERT_AFTER(wktail, wk, wk_list);
 1878                 wktail = wk;
 1879         }
 1880         FREE_LOCK(ump);
 1881 
 1882         return (dirty);
 1883 }
 1884 
 1885 /*
 1886  * Purge the work list of all items associated with a particular mount point.
 1887  */
 1888 int
 1889 softdep_flushworklist(oldmnt, countp, td)
 1890         struct mount *oldmnt;
 1891         int *countp;
 1892         struct thread *td;
 1893 {
 1894         struct vnode *devvp;
 1895         struct ufsmount *ump;
 1896         int count, error;
 1897 
 1898         /*
 1899          * Alternately flush the block device associated with the mount
 1900          * point and process any dependencies that the flushing
 1901          * creates. We continue until no more worklist dependencies
 1902          * are found.
 1903          */
 1904         *countp = 0;
 1905         error = 0;
 1906         ump = VFSTOUFS(oldmnt);
 1907         devvp = ump->um_devvp;
 1908         while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
 1909                 *countp += count;
 1910                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 1911                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
 1912                 VOP_UNLOCK(devvp, 0);
 1913                 if (error != 0)
 1914                         break;
 1915         }
 1916         return (error);
 1917 }
 1918 
 1919 #define SU_WAITIDLE_RETRIES     20
 1920 static int
 1921 softdep_waitidle(struct mount *mp, int flags __unused)
 1922 {
 1923         struct ufsmount *ump;
 1924         struct vnode *devvp;
 1925         struct thread *td;
 1926         int error, i;
 1927 
 1928         ump = VFSTOUFS(mp);
 1929         devvp = ump->um_devvp;
 1930         td = curthread;
 1931         error = 0;
 1932         ACQUIRE_LOCK(ump);
 1933         for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
 1934                 ump->softdep_req = 1;
 1935                 KASSERT((flags & FORCECLOSE) == 0 ||
 1936                     ump->softdep_on_worklist == 0,
 1937                     ("softdep_waitidle: work added after flush"));
 1938                 msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
 1939                     "softdeps", 10 * hz);
 1940                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 1941                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
 1942                 VOP_UNLOCK(devvp, 0);
 1943                 ACQUIRE_LOCK(ump);
 1944                 if (error != 0)
 1945                         break;
 1946         }
 1947         ump->softdep_req = 0;
 1948         if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
 1949                 error = EBUSY;
 1950                 printf("softdep_waitidle: Failed to flush worklist for %p\n",
 1951                     mp);
 1952         }
 1953         FREE_LOCK(ump);
 1954         return (error);
 1955 }
 1956 
 1957 /*
 1958  * Flush all vnodes and worklist items associated with a specified mount point.
 1959  */
 1960 int
 1961 softdep_flushfiles(oldmnt, flags, td)
 1962         struct mount *oldmnt;
 1963         int flags;
 1964         struct thread *td;
 1965 {
 1966 #ifdef QUOTA
 1967         struct ufsmount *ump;
 1968         int i;
 1969 #endif
 1970         int error, early, depcount, loopcnt, retry_flush_count, retry;
 1971         int morework;
 1972 
 1973         KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
 1974             ("softdep_flushfiles called on non-softdep filesystem"));
 1975         loopcnt = 10;
 1976         retry_flush_count = 3;
 1977 retry_flush:
 1978         error = 0;
 1979 
 1980         /*
 1981          * Alternately flush the vnodes associated with the mount
 1982          * point and process any dependencies that the flushing
 1983          * creates. In theory, this loop can happen at most twice,
 1984          * but we give it a few extra just to be sure.
 1985          */
 1986         for (; loopcnt > 0; loopcnt--) {
 1987                 /*
 1988                  * Do another flush in case any vnodes were brought in
 1989                  * as part of the cleanup operations.
 1990                  */
 1991                 early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
 1992                     MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
 1993                 if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
 1994                         break;
 1995                 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
 1996                     depcount == 0)
 1997                         break;
 1998         }
 1999         /*
 2000          * If we are unmounting then it is an error to fail. If we
 2001          * are simply trying to downgrade to read-only, then filesystem
 2002          * activity can keep us busy forever, so we just fail with EBUSY.
 2003          */
 2004         if (loopcnt == 0) {
 2005                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
 2006                         panic("softdep_flushfiles: looping");
 2007                 error = EBUSY;
 2008         }
 2009         if (!error)
 2010                 error = softdep_waitidle(oldmnt, flags);
 2011         if (!error) {
 2012                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
 2013                         retry = 0;
 2014                         MNT_ILOCK(oldmnt);
 2015                         KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
 2016                             ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
 2017                         morework = oldmnt->mnt_nvnodelistsize > 0;
 2018 #ifdef QUOTA
 2019                         ump = VFSTOUFS(oldmnt);
 2020                         UFS_LOCK(ump);
 2021                         for (i = 0; i < MAXQUOTAS; i++) {
 2022                                 if (ump->um_quotas[i] != NULLVP)
 2023                                         morework = 1;
 2024                         }
 2025                         UFS_UNLOCK(ump);
 2026 #endif
 2027                         if (morework) {
 2028                                 if (--retry_flush_count > 0) {
 2029                                         retry = 1;
 2030                                         loopcnt = 3;
 2031                                 } else
 2032                                         error = EBUSY;
 2033                         }
 2034                         MNT_IUNLOCK(oldmnt);
 2035                         if (retry)
 2036                                 goto retry_flush;
 2037                 }
 2038         }
 2039         return (error);
 2040 }
 2041 
 2042 /*
 2043  * Structure hashing.
 2044  * 
 2045  * There are four types of structures that can be looked up:
 2046  *      1) pagedep structures identified by mount point, inode number,
 2047  *         and logical block.
 2048  *      2) inodedep structures identified by mount point and inode number.
 2049  *      3) newblk structures identified by mount point and
 2050  *         physical block number.
 2051  *      4) bmsafemap structures identified by mount point and
 2052  *         cylinder group number.
 2053  *
 2054  * The "pagedep" and "inodedep" dependency structures are hashed
 2055  * separately from the file blocks and inodes to which they correspond.
 2056  * This separation helps when the in-memory copy of an inode or
 2057  * file block must be replaced. It also obviates the need to access
 2058  * an inode or file page when simply updating (or de-allocating)
 2059  * dependency structures. Lookup of newblk structures is needed to
 2060  * find newly allocated blocks when trying to associate them with
 2061  * their allocdirect or allocindir structure.
 2062  *
 2063  * The lookup routines optionally create and hash a new instance when
 2064  * an existing entry is not found. The bmsafemap lookup routine always
 2065  * allocates a new structure if an existing one is not found.
 2066  */
 2067 #define DEPALLOC        0x0001  /* allocate structure if lookup fails */
 2068 
 2069 /*
 2070  * Structures and routines associated with pagedep caching.
 2071  */
 2072 #define PAGEDEP_HASH(ump, inum, lbn) \
 2073         (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
 2074 
 2075 static int
 2076 pagedep_find(pagedephd, ino, lbn, pagedeppp)
 2077         struct pagedep_hashhead *pagedephd;
 2078         ino_t ino;
 2079         ufs_lbn_t lbn;
 2080         struct pagedep **pagedeppp;
 2081 {
 2082         struct pagedep *pagedep;
 2083 
 2084         LIST_FOREACH(pagedep, pagedephd, pd_hash) {
 2085                 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
 2086                         *pagedeppp = pagedep;
 2087                         return (1);
 2088                 }
 2089         }
 2090         *pagedeppp = NULL;
 2091         return (0);
 2092 }
 2093 /*
 2094  * Look up a pagedep. Return 1 if found, 0 otherwise.
 2095  * If not found, allocate if DEPALLOC flag is passed.
 2096  * Found or allocated entry is returned in pagedeppp.
 2097  * This routine must be called with splbio interrupts blocked.
 2098  */
 2099 static int
 2100 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
 2101         struct mount *mp;
 2102         struct buf *bp;
 2103         ino_t ino;
 2104         ufs_lbn_t lbn;
 2105         int flags;
 2106         struct pagedep **pagedeppp;
 2107 {
 2108         struct pagedep *pagedep;
 2109         struct pagedep_hashhead *pagedephd;
 2110         struct worklist *wk;
 2111         struct ufsmount *ump;
 2112         int ret;
 2113         int i;
 2114 
 2115         ump = VFSTOUFS(mp);
 2116         LOCK_OWNED(ump);
 2117         if (bp) {
 2118                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 2119                         if (wk->wk_type == D_PAGEDEP) {
 2120                                 *pagedeppp = WK_PAGEDEP(wk);
 2121                                 return (1);
 2122                         }
 2123                 }
 2124         }
 2125         pagedephd = PAGEDEP_HASH(ump, ino, lbn);
 2126         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
 2127         if (ret) {
 2128                 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
 2129                         WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
 2130                 return (1);
 2131         }
 2132         if ((flags & DEPALLOC) == 0)
 2133                 return (0);
 2134         FREE_LOCK(ump);
 2135         pagedep = malloc(sizeof(struct pagedep),
 2136             M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
 2137         workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
 2138         ACQUIRE_LOCK(ump);
 2139         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
 2140         if (*pagedeppp) {
 2141                 /*
 2142                  * This should never happen since we only create pagedeps
 2143                  * with the vnode lock held.  Could be an assert.
 2144                  */
 2145                 WORKITEM_FREE(pagedep, D_PAGEDEP);
 2146                 return (ret);
 2147         }
 2148         pagedep->pd_ino = ino;
 2149         pagedep->pd_lbn = lbn;
 2150         LIST_INIT(&pagedep->pd_dirremhd);
 2151         LIST_INIT(&pagedep->pd_pendinghd);
 2152         for (i = 0; i < DAHASHSZ; i++)
 2153                 LIST_INIT(&pagedep->pd_diraddhd[i]);
 2154         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
 2155         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 2156         *pagedeppp = pagedep;
 2157         return (0);
 2158 }
 2159 
 2160 /*
 2161  * Structures and routines associated with inodedep caching.
 2162  */
 2163 #define INODEDEP_HASH(ump, inum) \
 2164       (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
 2165 
 2166 static int
 2167 inodedep_find(inodedephd, inum, inodedeppp)
 2168         struct inodedep_hashhead *inodedephd;
 2169         ino_t inum;
 2170         struct inodedep **inodedeppp;
 2171 {
 2172         struct inodedep *inodedep;
 2173 
 2174         LIST_FOREACH(inodedep, inodedephd, id_hash)
 2175                 if (inum == inodedep->id_ino)
 2176                         break;
 2177         if (inodedep) {
 2178                 *inodedeppp = inodedep;
 2179                 return (1);
 2180         }
 2181         *inodedeppp = NULL;
 2182 
 2183         return (0);
 2184 }
 2185 /*
 2186  * Look up an inodedep. Return 1 if found, 0 if not found.
 2187  * If not found, allocate if DEPALLOC flag is passed.
 2188  * Found or allocated entry is returned in inodedeppp.
 2189  * This routine must be called with splbio interrupts blocked.
 2190  */
 2191 static int
 2192 inodedep_lookup(mp, inum, flags, inodedeppp)
 2193         struct mount *mp;
 2194         ino_t inum;
 2195         int flags;
 2196         struct inodedep **inodedeppp;
 2197 {
 2198         struct inodedep *inodedep;
 2199         struct inodedep_hashhead *inodedephd;
 2200         struct ufsmount *ump;
 2201         struct fs *fs;
 2202 
 2203         ump = VFSTOUFS(mp);
 2204         LOCK_OWNED(ump);
 2205         fs = ump->um_fs;
 2206         inodedephd = INODEDEP_HASH(ump, inum);
 2207 
 2208         if (inodedep_find(inodedephd, inum, inodedeppp))
 2209                 return (1);
 2210         if ((flags & DEPALLOC) == 0)
 2211                 return (0);
 2212         /*
 2213          * If the system is over its limit and our filesystem is
 2214          * responsible for more than our share of that usage and
 2215          * we are not in a rush, request some inodedep cleanup.
 2216          */
 2217         if (softdep_excess_items(ump, D_INODEDEP))
 2218                 schedule_cleanup(mp);
 2219         else
 2220                 FREE_LOCK(ump);
 2221         inodedep = malloc(sizeof(struct inodedep),
 2222                 M_INODEDEP, M_SOFTDEP_FLAGS);
 2223         workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
 2224         ACQUIRE_LOCK(ump);
 2225         if (inodedep_find(inodedephd, inum, inodedeppp)) {
 2226                 WORKITEM_FREE(inodedep, D_INODEDEP);
 2227                 return (1);
 2228         }
 2229         inodedep->id_fs = fs;
 2230         inodedep->id_ino = inum;
 2231         inodedep->id_state = ALLCOMPLETE;
 2232         inodedep->id_nlinkdelta = 0;
 2233         inodedep->id_savedino1 = NULL;
 2234         inodedep->id_savedsize = -1;
 2235         inodedep->id_savedextsize = -1;
 2236         inodedep->id_savednlink = -1;
 2237         inodedep->id_bmsafemap = NULL;
 2238         inodedep->id_mkdiradd = NULL;
 2239         LIST_INIT(&inodedep->id_dirremhd);
 2240         LIST_INIT(&inodedep->id_pendinghd);
 2241         LIST_INIT(&inodedep->id_inowait);
 2242         LIST_INIT(&inodedep->id_bufwait);
 2243         TAILQ_INIT(&inodedep->id_inoreflst);
 2244         TAILQ_INIT(&inodedep->id_inoupdt);
 2245         TAILQ_INIT(&inodedep->id_newinoupdt);
 2246         TAILQ_INIT(&inodedep->id_extupdt);
 2247         TAILQ_INIT(&inodedep->id_newextupdt);
 2248         TAILQ_INIT(&inodedep->id_freeblklst);
 2249         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
 2250         *inodedeppp = inodedep;
 2251         return (0);
 2252 }
 2253 
 2254 /*
 2255  * Structures and routines associated with newblk caching.
 2256  */
 2257 #define NEWBLK_HASH(ump, inum) \
 2258         (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
 2259 
 2260 static int
 2261 newblk_find(newblkhd, newblkno, flags, newblkpp)
 2262         struct newblk_hashhead *newblkhd;
 2263         ufs2_daddr_t newblkno;
 2264         int flags;
 2265         struct newblk **newblkpp;
 2266 {
 2267         struct newblk *newblk;
 2268 
 2269         LIST_FOREACH(newblk, newblkhd, nb_hash) {
 2270                 if (newblkno != newblk->nb_newblkno)
 2271                         continue;
 2272                 /*
 2273                  * If we're creating a new dependency don't match those that
 2274                  * have already been converted to allocdirects.  This is for
 2275                  * a frag extend.
 2276                  */
 2277                 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
 2278                         continue;
 2279                 break;
 2280         }
 2281         if (newblk) {
 2282                 *newblkpp = newblk;
 2283                 return (1);
 2284         }
 2285         *newblkpp = NULL;
 2286         return (0);
 2287 }
 2288 
 2289 /*
 2290  * Look up a newblk. Return 1 if found, 0 if not found.
 2291  * If not found, allocate if DEPALLOC flag is passed.
 2292  * Found or allocated entry is returned in newblkpp.
 2293  */
 2294 static int
 2295 newblk_lookup(mp, newblkno, flags, newblkpp)
 2296         struct mount *mp;
 2297         ufs2_daddr_t newblkno;
 2298         int flags;
 2299         struct newblk **newblkpp;
 2300 {
 2301         struct newblk *newblk;
 2302         struct newblk_hashhead *newblkhd;
 2303         struct ufsmount *ump;
 2304 
 2305         ump = VFSTOUFS(mp);
 2306         LOCK_OWNED(ump);
 2307         newblkhd = NEWBLK_HASH(ump, newblkno);
 2308         if (newblk_find(newblkhd, newblkno, flags, newblkpp))
 2309                 return (1);
 2310         if ((flags & DEPALLOC) == 0)
 2311                 return (0);
 2312         if (softdep_excess_items(ump, D_NEWBLK) ||
 2313             softdep_excess_items(ump, D_ALLOCDIRECT) ||
 2314             softdep_excess_items(ump, D_ALLOCINDIR))
 2315                 schedule_cleanup(mp);
 2316         else
 2317                 FREE_LOCK(ump);
 2318         newblk = malloc(sizeof(union allblk), M_NEWBLK,
 2319             M_SOFTDEP_FLAGS | M_ZERO);
 2320         workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
 2321         ACQUIRE_LOCK(ump);
 2322         if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
 2323                 WORKITEM_FREE(newblk, D_NEWBLK);
 2324                 return (1);
 2325         }
 2326         newblk->nb_freefrag = NULL;
 2327         LIST_INIT(&newblk->nb_indirdeps);
 2328         LIST_INIT(&newblk->nb_newdirblk);
 2329         LIST_INIT(&newblk->nb_jwork);
 2330         newblk->nb_state = ATTACHED;
 2331         newblk->nb_newblkno = newblkno;
 2332         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
 2333         *newblkpp = newblk;
 2334         return (0);
 2335 }
 2336 
 2337 /*
 2338  * Structures and routines associated with freed indirect block caching.
 2339  */
 2340 #define INDIR_HASH(ump, blkno) \
 2341         (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
 2342 
 2343 /*
 2344  * Lookup an indirect block in the indir hash table.  The freework is
 2345  * removed and potentially freed.  The caller must do a blocking journal
 2346  * write before writing to the blkno.
 2347  */
 2348 static int
 2349 indirblk_lookup(mp, blkno)
 2350         struct mount *mp;
 2351         ufs2_daddr_t blkno;
 2352 {
 2353         struct freework *freework;
 2354         struct indir_hashhead *wkhd;
 2355         struct ufsmount *ump;
 2356 
 2357         ump = VFSTOUFS(mp);
 2358         wkhd = INDIR_HASH(ump, blkno);
 2359         TAILQ_FOREACH(freework, wkhd, fw_next) {
 2360                 if (freework->fw_blkno != blkno)
 2361                         continue;
 2362                 indirblk_remove(freework);
 2363                 return (1);
 2364         }
 2365         return (0);
 2366 }
 2367 
 2368 /*
 2369  * Insert an indirect block represented by freework into the indirblk
 2370  * hash table so that it may prevent the block from being re-used prior
 2371  * to the journal being written.
 2372  */
 2373 static void
 2374 indirblk_insert(freework)
 2375         struct freework *freework;
 2376 {
 2377         struct jblocks *jblocks;
 2378         struct jseg *jseg;
 2379         struct ufsmount *ump;
 2380 
 2381         ump = VFSTOUFS(freework->fw_list.wk_mp);
 2382         jblocks = ump->softdep_jblocks;
 2383         jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
 2384         if (jseg == NULL)
 2385                 return;
 2386         
 2387         LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
 2388         TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
 2389             fw_next);
 2390         freework->fw_state &= ~DEPCOMPLETE;
 2391 }
 2392 
 2393 static void
 2394 indirblk_remove(freework)
 2395         struct freework *freework;
 2396 {
 2397         struct ufsmount *ump;
 2398 
 2399         ump = VFSTOUFS(freework->fw_list.wk_mp);
 2400         LIST_REMOVE(freework, fw_segs);
 2401         TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
 2402         freework->fw_state |= DEPCOMPLETE;
 2403         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
 2404                 WORKITEM_FREE(freework, D_FREEWORK);
 2405 }
 2406 
 2407 /*
 2408  * Executed during filesystem system initialization before
 2409  * mounting any filesystems.
 2410  */
 2411 void 
 2412 softdep_initialize()
 2413 {
 2414 
 2415         TAILQ_INIT(&softdepmounts);
 2416 #ifdef __LP64__
 2417         max_softdeps = desiredvnodes * 4;
 2418 #else
 2419         max_softdeps = desiredvnodes * 2;
 2420 #endif
 2421 
 2422         /* initialise bioops hack */
 2423         bioops.io_start = softdep_disk_io_initiation;
 2424         bioops.io_complete = softdep_disk_write_complete;
 2425         bioops.io_deallocate = softdep_deallocate_dependencies;
 2426         bioops.io_countdeps = softdep_count_dependencies;
 2427         softdep_ast_cleanup = softdep_ast_cleanup_proc;
 2428 
 2429         /* Initialize the callout with an mtx. */
 2430         callout_init_mtx(&softdep_callout, &lk, 0);
 2431 }
 2432 
 2433 /*
 2434  * Executed after all filesystems have been unmounted during
 2435  * filesystem module unload.
 2436  */
 2437 void
 2438 softdep_uninitialize()
 2439 {
 2440 
 2441         /* clear bioops hack */
 2442         bioops.io_start = NULL;
 2443         bioops.io_complete = NULL;
 2444         bioops.io_deallocate = NULL;
 2445         bioops.io_countdeps = NULL;
 2446         softdep_ast_cleanup = NULL;
 2447 
 2448         callout_drain(&softdep_callout);
 2449 }
 2450 
 2451 /*
 2452  * Called at mount time to notify the dependency code that a
 2453  * filesystem wishes to use it.
 2454  */
 2455 int
 2456 softdep_mount(devvp, mp, fs, cred)
 2457         struct vnode *devvp;
 2458         struct mount *mp;
 2459         struct fs *fs;
 2460         struct ucred *cred;
 2461 {
 2462         struct csum_total cstotal;
 2463         struct mount_softdeps *sdp;
 2464         struct ufsmount *ump;
 2465         struct cg *cgp;
 2466         struct buf *bp;
 2467         int i, error, cyl;
 2468 
 2469         sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
 2470             M_WAITOK | M_ZERO);
 2471         MNT_ILOCK(mp);
 2472         mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
 2473         if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
 2474                 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 
 2475                         MNTK_SOFTDEP | MNTK_NOASYNC;
 2476         }
 2477         ump = VFSTOUFS(mp);
 2478         ump->um_softdep = sdp;
 2479         MNT_IUNLOCK(mp);
 2480         rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
 2481         sdp->sd_ump = ump;
 2482         LIST_INIT(&ump->softdep_workitem_pending);
 2483         LIST_INIT(&ump->softdep_journal_pending);
 2484         TAILQ_INIT(&ump->softdep_unlinked);
 2485         LIST_INIT(&ump->softdep_dirtycg);
 2486         ump->softdep_worklist_tail = NULL;
 2487         ump->softdep_on_worklist = 0;
 2488         ump->softdep_deps = 0;
 2489         LIST_INIT(&ump->softdep_mkdirlisthd);
 2490         ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
 2491             &ump->pagedep_hash_size);
 2492         ump->pagedep_nextclean = 0;
 2493         ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
 2494             &ump->inodedep_hash_size);
 2495         ump->inodedep_nextclean = 0;
 2496         ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
 2497             &ump->newblk_hash_size);
 2498         ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
 2499             &ump->bmsafemap_hash_size);
 2500         i = 1 << (ffs(desiredvnodes / 10) - 1);
 2501         ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
 2502             M_FREEWORK, M_WAITOK);
 2503         ump->indir_hash_size = i - 1;
 2504         for (i = 0; i <= ump->indir_hash_size; i++)
 2505                 TAILQ_INIT(&ump->indir_hashtbl[i]);
 2506         ACQUIRE_GBLLOCK(&lk);
 2507         TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
 2508         FREE_GBLLOCK(&lk);
 2509         if ((fs->fs_flags & FS_SUJ) &&
 2510             (error = journal_mount(mp, fs, cred)) != 0) {
 2511                 printf("Failed to start journal: %d\n", error);
 2512                 softdep_unmount(mp);
 2513                 return (error);
 2514         }
 2515         /*
 2516          * Start our flushing thread in the bufdaemon process.
 2517          */
 2518         ACQUIRE_LOCK(ump);
 2519         ump->softdep_flags |= FLUSH_STARTING;
 2520         FREE_LOCK(ump);
 2521         kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
 2522             &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
 2523             mp->mnt_stat.f_mntonname);
 2524         ACQUIRE_LOCK(ump);
 2525         while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
 2526                 msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
 2527                     hz / 2);
 2528         }
 2529         FREE_LOCK(ump);
 2530         /*
 2531          * When doing soft updates, the counters in the
 2532          * superblock may have gotten out of sync. Recomputation
 2533          * can take a long time and can be deferred for background
 2534          * fsck.  However, the old behavior of scanning the cylinder
 2535          * groups and recalculating them at mount time is available
 2536          * by setting vfs.ffs.compute_summary_at_mount to one.
 2537          */
 2538         if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
 2539                 return (0);
 2540         bzero(&cstotal, sizeof cstotal);
 2541         for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
 2542                 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
 2543                     fs->fs_cgsize, cred, &bp)) != 0) {
 2544                         brelse(bp);
 2545                         softdep_unmount(mp);
 2546                         return (error);
 2547                 }
 2548                 cgp = (struct cg *)bp->b_data;
 2549                 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
 2550                 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
 2551                 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
 2552                 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
 2553                 fs->fs_cs(fs, cyl) = cgp->cg_cs;
 2554                 brelse(bp);
 2555         }
 2556 #ifdef DEBUG
 2557         if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
 2558                 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
 2559 #endif
 2560         bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
 2561         return (0);
 2562 }
 2563 
 2564 void
 2565 softdep_unmount(mp)
 2566         struct mount *mp;
 2567 {
 2568         struct ufsmount *ump;
 2569 #ifdef INVARIANTS
 2570         int i;
 2571 #endif
 2572 
 2573         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 2574             ("softdep_unmount called on non-softdep filesystem"));
 2575         ump = VFSTOUFS(mp);
 2576         MNT_ILOCK(mp);
 2577         mp->mnt_flag &= ~MNT_SOFTDEP;
 2578         if (MOUNTEDSUJ(mp) == 0) {
 2579                 MNT_IUNLOCK(mp);
 2580         } else {
 2581                 mp->mnt_flag &= ~MNT_SUJ;
 2582                 MNT_IUNLOCK(mp);
 2583                 journal_unmount(ump);
 2584         }
 2585         /*
 2586          * Shut down our flushing thread. Check for NULL is if
 2587          * softdep_mount errors out before the thread has been created.
 2588          */
 2589         if (ump->softdep_flushtd != NULL) {
 2590                 ACQUIRE_LOCK(ump);
 2591                 ump->softdep_flags |= FLUSH_EXIT;
 2592                 wakeup(&ump->softdep_flushtd);
 2593                 msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
 2594                     "sdwait", 0);
 2595                 KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
 2596                     ("Thread shutdown failed"));
 2597         }
 2598         /*
 2599          * Free up our resources.
 2600          */
 2601         ACQUIRE_GBLLOCK(&lk);
 2602         TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
 2603         FREE_GBLLOCK(&lk);
 2604         rw_destroy(LOCK_PTR(ump));
 2605         hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
 2606         hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
 2607         hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
 2608         hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
 2609             ump->bmsafemap_hash_size);
 2610         free(ump->indir_hashtbl, M_FREEWORK);
 2611 #ifdef INVARIANTS
 2612         for (i = 0; i <= D_LAST; i++)
 2613                 KASSERT(ump->softdep_curdeps[i] == 0,
 2614                     ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
 2615                     TYPENAME(i), ump->softdep_curdeps[i]));
 2616 #endif
 2617         free(ump->um_softdep, M_MOUNTDATA);
 2618 }
 2619 
 2620 static struct jblocks *
 2621 jblocks_create(void)
 2622 {
 2623         struct jblocks *jblocks;
 2624 
 2625         jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
 2626         TAILQ_INIT(&jblocks->jb_segs);
 2627         jblocks->jb_avail = 10;
 2628         jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
 2629             M_JBLOCKS, M_WAITOK | M_ZERO);
 2630 
 2631         return (jblocks);
 2632 }
 2633 
 2634 static ufs2_daddr_t
 2635 jblocks_alloc(jblocks, bytes, actual)
 2636         struct jblocks *jblocks;
 2637         int bytes;
 2638         int *actual;
 2639 {
 2640         ufs2_daddr_t daddr;
 2641         struct jextent *jext;
 2642         int freecnt;
 2643         int blocks;
 2644 
 2645         blocks = bytes / DEV_BSIZE;
 2646         jext = &jblocks->jb_extent[jblocks->jb_head];
 2647         freecnt = jext->je_blocks - jblocks->jb_off;
 2648         if (freecnt == 0) {
 2649                 jblocks->jb_off = 0;
 2650                 if (++jblocks->jb_head > jblocks->jb_used)
 2651                         jblocks->jb_head = 0;
 2652                 jext = &jblocks->jb_extent[jblocks->jb_head];
 2653                 freecnt = jext->je_blocks;
 2654         }
 2655         if (freecnt > blocks)
 2656                 freecnt = blocks;
 2657         *actual = freecnt * DEV_BSIZE;
 2658         daddr = jext->je_daddr + jblocks->jb_off;
 2659         jblocks->jb_off += freecnt;
 2660         jblocks->jb_free -= freecnt;
 2661 
 2662         return (daddr);
 2663 }
 2664 
 2665 static void
 2666 jblocks_free(jblocks, mp, bytes)
 2667         struct jblocks *jblocks;
 2668         struct mount *mp;
 2669         int bytes;
 2670 {
 2671 
 2672         LOCK_OWNED(VFSTOUFS(mp));
 2673         jblocks->jb_free += bytes / DEV_BSIZE;
 2674         if (jblocks->jb_suspended)
 2675                 worklist_speedup(mp);
 2676         wakeup(jblocks);
 2677 }
 2678 
 2679 static void
 2680 jblocks_destroy(jblocks)
 2681         struct jblocks *jblocks;
 2682 {
 2683 
 2684         if (jblocks->jb_extent)
 2685                 free(jblocks->jb_extent, M_JBLOCKS);
 2686         free(jblocks, M_JBLOCKS);
 2687 }
 2688 
 2689 static void
 2690 jblocks_add(jblocks, daddr, blocks)
 2691         struct jblocks *jblocks;
 2692         ufs2_daddr_t daddr;
 2693         int blocks;
 2694 {
 2695         struct jextent *jext;
 2696 
 2697         jblocks->jb_blocks += blocks;
 2698         jblocks->jb_free += blocks;
 2699         jext = &jblocks->jb_extent[jblocks->jb_used];
 2700         /* Adding the first block. */
 2701         if (jext->je_daddr == 0) {
 2702                 jext->je_daddr = daddr;
 2703                 jext->je_blocks = blocks;
 2704                 return;
 2705         }
 2706         /* Extending the last extent. */
 2707         if (jext->je_daddr + jext->je_blocks == daddr) {
 2708                 jext->je_blocks += blocks;
 2709                 return;
 2710         }
 2711         /* Adding a new extent. */
 2712         if (++jblocks->jb_used == jblocks->jb_avail) {
 2713                 jblocks->jb_avail *= 2;
 2714                 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
 2715                     M_JBLOCKS, M_WAITOK | M_ZERO);
 2716                 memcpy(jext, jblocks->jb_extent,
 2717                     sizeof(struct jextent) * jblocks->jb_used);
 2718                 free(jblocks->jb_extent, M_JBLOCKS);
 2719                 jblocks->jb_extent = jext;
 2720         }
 2721         jext = &jblocks->jb_extent[jblocks->jb_used];
 2722         jext->je_daddr = daddr;
 2723         jext->je_blocks = blocks;
 2724         return;
 2725 }
 2726 
 2727 int
 2728 softdep_journal_lookup(mp, vpp)
 2729         struct mount *mp;
 2730         struct vnode **vpp;
 2731 {
 2732         struct componentname cnp;
 2733         struct vnode *dvp;
 2734         ino_t sujournal;
 2735         int error;
 2736 
 2737         error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
 2738         if (error)
 2739                 return (error);
 2740         bzero(&cnp, sizeof(cnp));
 2741         cnp.cn_nameiop = LOOKUP;
 2742         cnp.cn_flags = ISLASTCN;
 2743         cnp.cn_thread = curthread;
 2744         cnp.cn_cred = curthread->td_ucred;
 2745         cnp.cn_pnbuf = SUJ_FILE;
 2746         cnp.cn_nameptr = SUJ_FILE;
 2747         cnp.cn_namelen = strlen(SUJ_FILE);
 2748         error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
 2749         vput(dvp);
 2750         if (error != 0)
 2751                 return (error);
 2752         error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
 2753         return (error);
 2754 }
 2755 
 2756 /*
 2757  * Open and verify the journal file.
 2758  */
 2759 static int
 2760 journal_mount(mp, fs, cred)
 2761         struct mount *mp;
 2762         struct fs *fs;
 2763         struct ucred *cred;
 2764 {
 2765         struct jblocks *jblocks;
 2766         struct ufsmount *ump;
 2767         struct vnode *vp;
 2768         struct inode *ip;
 2769         ufs2_daddr_t blkno;
 2770         int bcount;
 2771         int error;
 2772         int i;
 2773 
 2774         ump = VFSTOUFS(mp);
 2775         ump->softdep_journal_tail = NULL;
 2776         ump->softdep_on_journal = 0;
 2777         ump->softdep_accdeps = 0;
 2778         ump->softdep_req = 0;
 2779         ump->softdep_jblocks = NULL;
 2780         error = softdep_journal_lookup(mp, &vp);
 2781         if (error != 0) {
 2782                 printf("Failed to find journal.  Use tunefs to create one\n");
 2783                 return (error);
 2784         }
 2785         ip = VTOI(vp);
 2786         if (ip->i_size < SUJ_MIN) {
 2787                 error = ENOSPC;
 2788                 goto out;
 2789         }
 2790         bcount = lblkno(fs, ip->i_size);        /* Only use whole blocks. */
 2791         jblocks = jblocks_create();
 2792         for (i = 0; i < bcount; i++) {
 2793                 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
 2794                 if (error)
 2795                         break;
 2796                 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
 2797         }
 2798         if (error) {
 2799                 jblocks_destroy(jblocks);
 2800                 goto out;
 2801         }
 2802         jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
 2803         jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
 2804         ump->softdep_jblocks = jblocks;
 2805 out:
 2806         if (error == 0) {
 2807                 MNT_ILOCK(mp);
 2808                 mp->mnt_flag |= MNT_SUJ;
 2809                 mp->mnt_flag &= ~MNT_SOFTDEP;
 2810                 MNT_IUNLOCK(mp);
 2811                 /*
 2812                  * Only validate the journal contents if the
 2813                  * filesystem is clean, otherwise we write the logs
 2814                  * but they'll never be used.  If the filesystem was
 2815                  * still dirty when we mounted it the journal is
 2816                  * invalid and a new journal can only be valid if it
 2817                  * starts from a clean mount.
 2818                  */
 2819                 if (fs->fs_clean) {
 2820                         DIP_SET(ip, i_modrev, fs->fs_mtime);
 2821                         ip->i_flags |= IN_MODIFIED;
 2822                         ffs_update(vp, 1);
 2823                 }
 2824         }
 2825         vput(vp);
 2826         return (error);
 2827 }
 2828 
 2829 static void
 2830 journal_unmount(ump)
 2831         struct ufsmount *ump;
 2832 {
 2833 
 2834         if (ump->softdep_jblocks)
 2835                 jblocks_destroy(ump->softdep_jblocks);
 2836         ump->softdep_jblocks = NULL;
 2837 }
 2838 
 2839 /*
 2840  * Called when a journal record is ready to be written.  Space is allocated
 2841  * and the journal entry is created when the journal is flushed to stable
 2842  * store.
 2843  */
 2844 static void
 2845 add_to_journal(wk)
 2846         struct worklist *wk;
 2847 {
 2848         struct ufsmount *ump;
 2849 
 2850         ump = VFSTOUFS(wk->wk_mp);
 2851         LOCK_OWNED(ump);
 2852         if (wk->wk_state & ONWORKLIST)
 2853                 panic("add_to_journal: %s(0x%X) already on list",
 2854                     TYPENAME(wk->wk_type), wk->wk_state);
 2855         wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
 2856         if (LIST_EMPTY(&ump->softdep_journal_pending)) {
 2857                 ump->softdep_jblocks->jb_age = ticks;
 2858                 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
 2859         } else
 2860                 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
 2861         ump->softdep_journal_tail = wk;
 2862         ump->softdep_on_journal += 1;
 2863 }
 2864 
 2865 /*
 2866  * Remove an arbitrary item for the journal worklist maintain the tail
 2867  * pointer.  This happens when a new operation obviates the need to
 2868  * journal an old operation.
 2869  */
 2870 static void
 2871 remove_from_journal(wk)
 2872         struct worklist *wk;
 2873 {
 2874         struct ufsmount *ump;
 2875 
 2876         ump = VFSTOUFS(wk->wk_mp);
 2877         LOCK_OWNED(ump);
 2878 #ifdef SUJ_DEBUG
 2879         {
 2880                 struct worklist *wkn;
 2881 
 2882                 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
 2883                         if (wkn == wk)
 2884                                 break;
 2885                 if (wkn == NULL)
 2886                         panic("remove_from_journal: %p is not in journal", wk);
 2887         }
 2888 #endif
 2889         /*
 2890          * We emulate a TAILQ to save space in most structures which do not
 2891          * require TAILQ semantics.  Here we must update the tail position
 2892          * when removing the tail which is not the final entry. This works
 2893          * only if the worklist linkage are at the beginning of the structure.
 2894          */
 2895         if (ump->softdep_journal_tail == wk)
 2896                 ump->softdep_journal_tail =
 2897                     (struct worklist *)wk->wk_list.le_prev;
 2898         WORKLIST_REMOVE(wk);
 2899         ump->softdep_on_journal -= 1;
 2900 }
 2901 
 2902 /*
 2903  * Check for journal space as well as dependency limits so the prelink
 2904  * code can throttle both journaled and non-journaled filesystems.
 2905  * Threshold is 0 for low and 1 for min.
 2906  */
 2907 static int
 2908 journal_space(ump, thresh)
 2909         struct ufsmount *ump;
 2910         int thresh;
 2911 {
 2912         struct jblocks *jblocks;
 2913         int limit, avail;
 2914 
 2915         jblocks = ump->softdep_jblocks;
 2916         if (jblocks == NULL)
 2917                 return (1);
 2918         /*
 2919          * We use a tighter restriction here to prevent request_cleanup()
 2920          * running in threads from running into locks we currently hold.
 2921          * We have to be over the limit and our filesystem has to be
 2922          * responsible for more than our share of that usage.
 2923          */
 2924         limit = (max_softdeps / 10) * 9;
 2925         if (dep_current[D_INODEDEP] > limit &&
 2926             ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
 2927                 return (0);
 2928         if (thresh)
 2929                 thresh = jblocks->jb_min;
 2930         else
 2931                 thresh = jblocks->jb_low;
 2932         avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
 2933         avail = jblocks->jb_free - avail;
 2934 
 2935         return (avail > thresh);
 2936 }
 2937 
 2938 static void
 2939 journal_suspend(ump)
 2940         struct ufsmount *ump;
 2941 {
 2942         struct jblocks *jblocks;
 2943         struct mount *mp;
 2944 
 2945         mp = UFSTOVFS(ump);
 2946         jblocks = ump->softdep_jblocks;
 2947         MNT_ILOCK(mp);
 2948         if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 2949                 stat_journal_min++;
 2950                 mp->mnt_kern_flag |= MNTK_SUSPEND;
 2951                 mp->mnt_susp_owner = ump->softdep_flushtd;
 2952         }
 2953         jblocks->jb_suspended = 1;
 2954         MNT_IUNLOCK(mp);
 2955 }
 2956 
 2957 static int
 2958 journal_unsuspend(struct ufsmount *ump)
 2959 {
 2960         struct jblocks *jblocks;
 2961         struct mount *mp;
 2962 
 2963         mp = UFSTOVFS(ump);
 2964         jblocks = ump->softdep_jblocks;
 2965 
 2966         if (jblocks != NULL && jblocks->jb_suspended &&
 2967             journal_space(ump, jblocks->jb_min)) {
 2968                 jblocks->jb_suspended = 0;
 2969                 FREE_LOCK(ump);
 2970                 mp->mnt_susp_owner = curthread;
 2971                 vfs_write_resume(mp, 0);
 2972                 ACQUIRE_LOCK(ump);
 2973                 return (1);
 2974         }
 2975         return (0);
 2976 }
 2977 
 2978 /*
 2979  * Called before any allocation function to be certain that there is
 2980  * sufficient space in the journal prior to creating any new records.
 2981  * Since in the case of block allocation we may have multiple locked
 2982  * buffers at the time of the actual allocation we can not block
 2983  * when the journal records are created.  Doing so would create a deadlock
 2984  * if any of these buffers needed to be flushed to reclaim space.  Instead
 2985  * we require a sufficiently large amount of available space such that
 2986  * each thread in the system could have passed this allocation check and
 2987  * still have sufficient free space.  With 20% of a minimum journal size
 2988  * of 1MB we have 6553 records available.
 2989  */
 2990 int
 2991 softdep_prealloc(vp, waitok)
 2992         struct vnode *vp;
 2993         int waitok;
 2994 {
 2995         struct ufsmount *ump;
 2996 
 2997         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
 2998             ("softdep_prealloc called on non-softdep filesystem"));
 2999         /*
 3000          * Nothing to do if we are not running journaled soft updates.
 3001          * If we currently hold the snapshot lock, we must avoid
 3002          * handling other resources that could cause deadlock.  Do not
 3003          * touch quotas vnode since it is typically recursed with
 3004          * other vnode locks held.
 3005          */
 3006         if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
 3007             (vp->v_vflag & VV_SYSTEM) != 0)
 3008                 return (0);
 3009         ump = VFSTOUFS(vp->v_mount);
 3010         ACQUIRE_LOCK(ump);
 3011         if (journal_space(ump, 0)) {
 3012                 FREE_LOCK(ump);
 3013                 return (0);
 3014         }
 3015         stat_journal_low++;
 3016         FREE_LOCK(ump);
 3017         if (waitok == MNT_NOWAIT)
 3018                 return (ENOSPC);
 3019         /*
 3020          * Attempt to sync this vnode once to flush any journal
 3021          * work attached to it.
 3022          */
 3023         if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
 3024                 ffs_syncvnode(vp, waitok, 0);
 3025         ACQUIRE_LOCK(ump);
 3026         process_removes(vp);
 3027         process_truncates(vp);
 3028         if (journal_space(ump, 0) == 0) {
 3029                 softdep_speedup(ump);
 3030                 if (journal_space(ump, 1) == 0)
 3031                         journal_suspend(ump);
 3032         }
 3033         FREE_LOCK(ump);
 3034 
 3035         return (0);
 3036 }
 3037 
 3038 /*
 3039  * Before adjusting a link count on a vnode verify that we have sufficient
 3040  * journal space.  If not, process operations that depend on the currently
 3041  * locked pair of vnodes to try to flush space as the syncer, buf daemon,
 3042  * and softdep flush threads can not acquire these locks to reclaim space.
 3043  */
 3044 static void
 3045 softdep_prelink(dvp, vp)
 3046         struct vnode *dvp;
 3047         struct vnode *vp;
 3048 {
 3049         struct ufsmount *ump;
 3050 
 3051         ump = VFSTOUFS(dvp->v_mount);
 3052         LOCK_OWNED(ump);
 3053         /*
 3054          * Nothing to do if we have sufficient journal space.
 3055          * If we currently hold the snapshot lock, we must avoid
 3056          * handling other resources that could cause deadlock.
 3057          */
 3058         if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
 3059                 return;
 3060         stat_journal_low++;
 3061         FREE_LOCK(ump);
 3062         if (vp)
 3063                 ffs_syncvnode(vp, MNT_NOWAIT, 0);
 3064         ffs_syncvnode(dvp, MNT_WAIT, 0);
 3065         ACQUIRE_LOCK(ump);
 3066         /* Process vp before dvp as it may create .. removes. */
 3067         if (vp) {
 3068                 process_removes(vp);
 3069                 process_truncates(vp);
 3070         }
 3071         process_removes(dvp);
 3072         process_truncates(dvp);
 3073         softdep_speedup(ump);
 3074         process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
 3075         if (journal_space(ump, 0) == 0) {
 3076                 softdep_speedup(ump);
 3077                 if (journal_space(ump, 1) == 0)
 3078                         journal_suspend(ump);
 3079         }
 3080 }
 3081 
 3082 static void
 3083 jseg_write(ump, jseg, data)
 3084         struct ufsmount *ump;
 3085         struct jseg *jseg;
 3086         uint8_t *data;
 3087 {
 3088         struct jsegrec *rec;
 3089 
 3090         rec = (struct jsegrec *)data;
 3091         rec->jsr_seq = jseg->js_seq;
 3092         rec->jsr_oldest = jseg->js_oldseq;
 3093         rec->jsr_cnt = jseg->js_cnt;
 3094         rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
 3095         rec->jsr_crc = 0;
 3096         rec->jsr_time = ump->um_fs->fs_mtime;
 3097 }
 3098 
 3099 static inline void
 3100 inoref_write(inoref, jseg, rec)
 3101         struct inoref *inoref;
 3102         struct jseg *jseg;
 3103         struct jrefrec *rec;
 3104 {
 3105 
 3106         inoref->if_jsegdep->jd_seg = jseg;
 3107         rec->jr_ino = inoref->if_ino;
 3108         rec->jr_parent = inoref->if_parent;
 3109         rec->jr_nlink = inoref->if_nlink;
 3110         rec->jr_mode = inoref->if_mode;
 3111         rec->jr_diroff = inoref->if_diroff;
 3112 }
 3113 
 3114 static void
 3115 jaddref_write(jaddref, jseg, data)
 3116         struct jaddref *jaddref;
 3117         struct jseg *jseg;
 3118         uint8_t *data;
 3119 {
 3120         struct jrefrec *rec;
 3121 
 3122         rec = (struct jrefrec *)data;
 3123         rec->jr_op = JOP_ADDREF;
 3124         inoref_write(&jaddref->ja_ref, jseg, rec);
 3125 }
 3126 
 3127 static void
 3128 jremref_write(jremref, jseg, data)
 3129         struct jremref *jremref;
 3130         struct jseg *jseg;
 3131         uint8_t *data;
 3132 {
 3133         struct jrefrec *rec;
 3134 
 3135         rec = (struct jrefrec *)data;
 3136         rec->jr_op = JOP_REMREF;
 3137         inoref_write(&jremref->jr_ref, jseg, rec);
 3138 }
 3139 
 3140 static void
 3141 jmvref_write(jmvref, jseg, data)
 3142         struct jmvref *jmvref;
 3143         struct jseg *jseg;
 3144         uint8_t *data;
 3145 {
 3146         struct jmvrec *rec;
 3147 
 3148         rec = (struct jmvrec *)data;
 3149         rec->jm_op = JOP_MVREF;
 3150         rec->jm_ino = jmvref->jm_ino;
 3151         rec->jm_parent = jmvref->jm_parent;
 3152         rec->jm_oldoff = jmvref->jm_oldoff;
 3153         rec->jm_newoff = jmvref->jm_newoff;
 3154 }
 3155 
 3156 static void
 3157 jnewblk_write(jnewblk, jseg, data)
 3158         struct jnewblk *jnewblk;
 3159         struct jseg *jseg;
 3160         uint8_t *data;
 3161 {
 3162         struct jblkrec *rec;
 3163 
 3164         jnewblk->jn_jsegdep->jd_seg = jseg;
 3165         rec = (struct jblkrec *)data;
 3166         rec->jb_op = JOP_NEWBLK;
 3167         rec->jb_ino = jnewblk->jn_ino;
 3168         rec->jb_blkno = jnewblk->jn_blkno;
 3169         rec->jb_lbn = jnewblk->jn_lbn;
 3170         rec->jb_frags = jnewblk->jn_frags;
 3171         rec->jb_oldfrags = jnewblk->jn_oldfrags;
 3172 }
 3173 
 3174 static void
 3175 jfreeblk_write(jfreeblk, jseg, data)
 3176         struct jfreeblk *jfreeblk;
 3177         struct jseg *jseg;
 3178         uint8_t *data;
 3179 {
 3180         struct jblkrec *rec;
 3181 
 3182         jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
 3183         rec = (struct jblkrec *)data;
 3184         rec->jb_op = JOP_FREEBLK;
 3185         rec->jb_ino = jfreeblk->jf_ino;
 3186         rec->jb_blkno = jfreeblk->jf_blkno;
 3187         rec->jb_lbn = jfreeblk->jf_lbn;
 3188         rec->jb_frags = jfreeblk->jf_frags;
 3189         rec->jb_oldfrags = 0;
 3190 }
 3191 
 3192 static void
 3193 jfreefrag_write(jfreefrag, jseg, data)
 3194         struct jfreefrag *jfreefrag;
 3195         struct jseg *jseg;
 3196         uint8_t *data;
 3197 {
 3198         struct jblkrec *rec;
 3199 
 3200         jfreefrag->fr_jsegdep->jd_seg = jseg;
 3201         rec = (struct jblkrec *)data;
 3202         rec->jb_op = JOP_FREEBLK;
 3203         rec->jb_ino = jfreefrag->fr_ino;
 3204         rec->jb_blkno = jfreefrag->fr_blkno;
 3205         rec->jb_lbn = jfreefrag->fr_lbn;
 3206         rec->jb_frags = jfreefrag->fr_frags;
 3207         rec->jb_oldfrags = 0;
 3208 }
 3209 
 3210 static void
 3211 jtrunc_write(jtrunc, jseg, data)
 3212         struct jtrunc *jtrunc;
 3213         struct jseg *jseg;
 3214         uint8_t *data;
 3215 {
 3216         struct jtrncrec *rec;
 3217 
 3218         jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
 3219         rec = (struct jtrncrec *)data;
 3220         rec->jt_op = JOP_TRUNC;
 3221         rec->jt_ino = jtrunc->jt_ino;
 3222         rec->jt_size = jtrunc->jt_size;
 3223         rec->jt_extsize = jtrunc->jt_extsize;
 3224 }
 3225 
 3226 static void
 3227 jfsync_write(jfsync, jseg, data)
 3228         struct jfsync *jfsync;
 3229         struct jseg *jseg;
 3230         uint8_t *data;
 3231 {
 3232         struct jtrncrec *rec;
 3233 
 3234         rec = (struct jtrncrec *)data;
 3235         rec->jt_op = JOP_SYNC;
 3236         rec->jt_ino = jfsync->jfs_ino;
 3237         rec->jt_size = jfsync->jfs_size;
 3238         rec->jt_extsize = jfsync->jfs_extsize;
 3239 }
 3240 
 3241 static void
 3242 softdep_flushjournal(mp)
 3243         struct mount *mp;
 3244 {
 3245         struct jblocks *jblocks;
 3246         struct ufsmount *ump;
 3247 
 3248         if (MOUNTEDSUJ(mp) == 0)
 3249                 return;
 3250         ump = VFSTOUFS(mp);
 3251         jblocks = ump->softdep_jblocks;
 3252         ACQUIRE_LOCK(ump);
 3253         while (ump->softdep_on_journal) {
 3254                 jblocks->jb_needseg = 1;
 3255                 softdep_process_journal(mp, NULL, MNT_WAIT);
 3256         }
 3257         FREE_LOCK(ump);
 3258 }
 3259 
 3260 static void softdep_synchronize_completed(struct bio *);
 3261 static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
 3262 
 3263 static void
 3264 softdep_synchronize_completed(bp)
 3265         struct bio *bp;
 3266 {
 3267         struct jseg *oldest;
 3268         struct jseg *jseg;
 3269         struct ufsmount *ump;
 3270 
 3271         /*
 3272          * caller1 marks the last segment written before we issued the
 3273          * synchronize cache.
 3274          */
 3275         jseg = bp->bio_caller1;
 3276         if (jseg == NULL) {
 3277                 g_destroy_bio(bp);
 3278                 return;
 3279         }
 3280         ump = VFSTOUFS(jseg->js_list.wk_mp);
 3281         ACQUIRE_LOCK(ump);
 3282         oldest = NULL;
 3283         /*
 3284          * Mark all the journal entries waiting on the synchronize cache
 3285          * as completed so they may continue on.
 3286          */
 3287         while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
 3288                 jseg->js_state |= COMPLETE;
 3289                 oldest = jseg;
 3290                 jseg = TAILQ_PREV(jseg, jseglst, js_next);
 3291         }
 3292         /*
 3293          * Restart deferred journal entry processing from the oldest
 3294          * completed jseg.
 3295          */
 3296         if (oldest)
 3297                 complete_jsegs(oldest);
 3298 
 3299         FREE_LOCK(ump);
 3300         g_destroy_bio(bp);
 3301 }
 3302 
 3303 /*
 3304  * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
 3305  * barriers.  The journal must be written prior to any blocks that depend
 3306  * on it and the journal can not be released until the blocks have be
 3307  * written.  This code handles both barriers simultaneously.
 3308  */
 3309 static void
 3310 softdep_synchronize(bp, ump, caller1)
 3311         struct bio *bp;
 3312         struct ufsmount *ump;
 3313         void *caller1;
 3314 {
 3315 
 3316         bp->bio_cmd = BIO_FLUSH;
 3317         bp->bio_flags |= BIO_ORDERED;
 3318         bp->bio_data = NULL;
 3319         bp->bio_offset = ump->um_cp->provider->mediasize;
 3320         bp->bio_length = 0;
 3321         bp->bio_done = softdep_synchronize_completed;
 3322         bp->bio_caller1 = caller1;
 3323         g_io_request(bp,
 3324             (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
 3325 }
 3326 
 3327 /*
 3328  * Flush some journal records to disk.
 3329  */
 3330 static void
 3331 softdep_process_journal(mp, needwk, flags)
 3332         struct mount *mp;
 3333         struct worklist *needwk;
 3334         int flags;
 3335 {
 3336         struct jblocks *jblocks;
 3337         struct ufsmount *ump;
 3338         struct worklist *wk;
 3339         struct jseg *jseg;
 3340         struct buf *bp;
 3341         struct bio *bio;
 3342         uint8_t *data;
 3343         struct fs *fs;
 3344         int shouldflush;
 3345         int segwritten;
 3346         int jrecmin;    /* Minimum records per block. */
 3347         int jrecmax;    /* Maximum records per block. */
 3348         int size;
 3349         int cnt;
 3350         int off;
 3351         int devbsize;
 3352 
 3353         if (MOUNTEDSUJ(mp) == 0)
 3354                 return;
 3355         shouldflush = softdep_flushcache;
 3356         bio = NULL;
 3357         jseg = NULL;
 3358         ump = VFSTOUFS(mp);
 3359         LOCK_OWNED(ump);
 3360         fs = ump->um_fs;
 3361         jblocks = ump->softdep_jblocks;
 3362         devbsize = ump->um_devvp->v_bufobj.bo_bsize;
 3363         /*
 3364          * We write anywhere between a disk block and fs block.  The upper
 3365          * bound is picked to prevent buffer cache fragmentation and limit
 3366          * processing time per I/O.
 3367          */
 3368         jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
 3369         jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
 3370         segwritten = 0;
 3371         for (;;) {
 3372                 cnt = ump->softdep_on_journal;
 3373                 /*
 3374                  * Criteria for writing a segment:
 3375                  * 1) We have a full block.
 3376                  * 2) We're called from jwait() and haven't found the
 3377                  *    journal item yet.
 3378                  * 3) Always write if needseg is set.
 3379                  * 4) If we are called from process_worklist and have
 3380                  *    not yet written anything we write a partial block
 3381                  *    to enforce a 1 second maximum latency on journal
 3382                  *    entries.
 3383                  */
 3384                 if (cnt < (jrecmax - 1) && needwk == NULL &&
 3385                     jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
 3386                         break;
 3387                 cnt++;
 3388                 /*
 3389                  * Verify some free journal space.  softdep_prealloc() should
 3390                  * guarantee that we don't run out so this is indicative of
 3391                  * a problem with the flow control.  Try to recover
 3392                  * gracefully in any event.
 3393                  */
 3394                 while (jblocks->jb_free == 0) {
 3395                         if (flags != MNT_WAIT)
 3396                                 break;
 3397                         printf("softdep: Out of journal space!\n");
 3398                         softdep_speedup(ump);
 3399                         msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
 3400                 }
 3401                 FREE_LOCK(ump);
 3402                 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
 3403                 workitem_alloc(&jseg->js_list, D_JSEG, mp);
 3404                 LIST_INIT(&jseg->js_entries);
 3405                 LIST_INIT(&jseg->js_indirs);
 3406                 jseg->js_state = ATTACHED;
 3407                 if (shouldflush == 0)
 3408                         jseg->js_state |= COMPLETE;
 3409                 else if (bio == NULL)
 3410                         bio = g_alloc_bio();
 3411                 jseg->js_jblocks = jblocks;
 3412                 bp = geteblk(fs->fs_bsize, 0);
 3413                 ACQUIRE_LOCK(ump);
 3414                 /*
 3415                  * If there was a race while we were allocating the block
 3416                  * and jseg the entry we care about was likely written.
 3417                  * We bail out in both the WAIT and NOWAIT case and assume
 3418                  * the caller will loop if the entry it cares about is
 3419                  * not written.
 3420                  */
 3421                 cnt = ump->softdep_on_journal;
 3422                 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
 3423                         bp->b_flags |= B_INVAL | B_NOCACHE;
 3424                         WORKITEM_FREE(jseg, D_JSEG);
 3425                         FREE_LOCK(ump);
 3426                         brelse(bp);
 3427                         ACQUIRE_LOCK(ump);
 3428                         break;
 3429                 }
 3430                 /*
 3431                  * Calculate the disk block size required for the available
 3432                  * records rounded to the min size.
 3433                  */
 3434                 if (cnt == 0)
 3435                         size = devbsize;
 3436                 else if (cnt < jrecmax)
 3437                         size = howmany(cnt, jrecmin) * devbsize;
 3438                 else
 3439                         size = fs->fs_bsize;
 3440                 /*
 3441                  * Allocate a disk block for this journal data and account
 3442                  * for truncation of the requested size if enough contiguous
 3443                  * space was not available.
 3444                  */
 3445                 bp->b_blkno = jblocks_alloc(jblocks, size, &size);
 3446                 bp->b_lblkno = bp->b_blkno;
 3447                 bp->b_offset = bp->b_blkno * DEV_BSIZE;
 3448                 bp->b_bcount = size;
 3449                 bp->b_flags &= ~B_INVAL;
 3450                 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
 3451                 /*
 3452                  * Initialize our jseg with cnt records.  Assign the next
 3453                  * sequence number to it and link it in-order.
 3454                  */
 3455                 cnt = MIN(cnt, (size / devbsize) * jrecmin);
 3456                 jseg->js_buf = bp;
 3457                 jseg->js_cnt = cnt;
 3458                 jseg->js_refs = cnt + 1;        /* Self ref. */
 3459                 jseg->js_size = size;
 3460                 jseg->js_seq = jblocks->jb_nextseq++;
 3461                 if (jblocks->jb_oldestseg == NULL)
 3462                         jblocks->jb_oldestseg = jseg;
 3463                 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
 3464                 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
 3465                 if (jblocks->jb_writeseg == NULL)
 3466                         jblocks->jb_writeseg = jseg;
 3467                 /*
 3468                  * Start filling in records from the pending list.
 3469                  */
 3470                 data = bp->b_data;
 3471                 off = 0;
 3472 
 3473                 /*
 3474                  * Always put a header on the first block.
 3475                  * XXX As with below, there might not be a chance to get
 3476                  * into the loop.  Ensure that something valid is written.
 3477                  */
 3478                 jseg_write(ump, jseg, data);
 3479                 off += JREC_SIZE;
 3480                 data = bp->b_data + off;
 3481 
 3482                 /*
 3483                  * XXX Something is wrong here.  There's no work to do,
 3484                  * but we need to perform and I/O and allow it to complete
 3485                  * anyways.
 3486                  */
 3487                 if (LIST_EMPTY(&ump->softdep_journal_pending))
 3488                         stat_emptyjblocks++;
 3489 
 3490                 while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
 3491                     != NULL) {
 3492                         if (cnt == 0)
 3493                                 break;
 3494                         /* Place a segment header on every device block. */
 3495                         if ((off % devbsize) == 0) {
 3496                                 jseg_write(ump, jseg, data);
 3497                                 off += JREC_SIZE;
 3498                                 data = bp->b_data + off;
 3499                         }
 3500                         if (wk == needwk)
 3501                                 needwk = NULL;
 3502                         remove_from_journal(wk);
 3503                         wk->wk_state |= INPROGRESS;
 3504                         WORKLIST_INSERT(&jseg->js_entries, wk);
 3505                         switch (wk->wk_type) {
 3506                         case D_JADDREF:
 3507                                 jaddref_write(WK_JADDREF(wk), jseg, data);
 3508                                 break;
 3509                         case D_JREMREF:
 3510                                 jremref_write(WK_JREMREF(wk), jseg, data);
 3511                                 break;
 3512                         case D_JMVREF:
 3513                                 jmvref_write(WK_JMVREF(wk), jseg, data);
 3514                                 break;
 3515                         case D_JNEWBLK:
 3516                                 jnewblk_write(WK_JNEWBLK(wk), jseg, data);
 3517                                 break;
 3518                         case D_JFREEBLK:
 3519                                 jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
 3520                                 break;
 3521                         case D_JFREEFRAG:
 3522                                 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
 3523                                 break;
 3524                         case D_JTRUNC:
 3525                                 jtrunc_write(WK_JTRUNC(wk), jseg, data);
 3526                                 break;
 3527                         case D_JFSYNC:
 3528                                 jfsync_write(WK_JFSYNC(wk), jseg, data);
 3529                                 break;
 3530                         default:
 3531                                 panic("process_journal: Unknown type %s",
 3532                                     TYPENAME(wk->wk_type));
 3533                                 /* NOTREACHED */
 3534                         }
 3535                         off += JREC_SIZE;
 3536                         data = bp->b_data + off;
 3537                         cnt--;
 3538                 }
 3539 
 3540                 /* Clear any remaining space so we don't leak kernel data */
 3541                 if (size > off)
 3542                         bzero(data, size - off);
 3543 
 3544                 /*
 3545                  * Write this one buffer and continue.
 3546                  */
 3547                 segwritten = 1;
 3548                 jblocks->jb_needseg = 0;
 3549                 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
 3550                 FREE_LOCK(ump);
 3551                 pbgetvp(ump->um_devvp, bp);
 3552                 /*
 3553                  * We only do the blocking wait once we find the journal
 3554                  * entry we're looking for.
 3555                  */
 3556                 if (needwk == NULL && flags == MNT_WAIT)
 3557                         bwrite(bp);
 3558                 else
 3559                         bawrite(bp);
 3560                 ACQUIRE_LOCK(ump);
 3561         }
 3562         /*
 3563          * If we wrote a segment issue a synchronize cache so the journal
 3564          * is reflected on disk before the data is written.  Since reclaiming
 3565          * journal space also requires writing a journal record this
 3566          * process also enforces a barrier before reclamation.
 3567          */
 3568         if (segwritten && shouldflush) {
 3569                 softdep_synchronize(bio, ump, 
 3570                     TAILQ_LAST(&jblocks->jb_segs, jseglst));
 3571         } else if (bio)
 3572                 g_destroy_bio(bio);
 3573         /*
 3574          * If we've suspended the filesystem because we ran out of journal
 3575          * space either try to sync it here to make some progress or
 3576          * unsuspend it if we already have.
 3577          */
 3578         if (flags == 0 && jblocks->jb_suspended) {
 3579                 if (journal_unsuspend(ump))
 3580                         return;
 3581                 FREE_LOCK(ump);
 3582                 VFS_SYNC(mp, MNT_NOWAIT);
 3583                 ffs_sbupdate(ump, MNT_WAIT, 0);
 3584                 ACQUIRE_LOCK(ump);
 3585         }
 3586 }
 3587 
 3588 /*
 3589  * Complete a jseg, allowing all dependencies awaiting journal writes
 3590  * to proceed.  Each journal dependency also attaches a jsegdep to dependent
 3591  * structures so that the journal segment can be freed to reclaim space.
 3592  */
 3593 static void
 3594 complete_jseg(jseg)
 3595         struct jseg *jseg;
 3596 {
 3597         struct worklist *wk;
 3598         struct jmvref *jmvref;
 3599 #ifdef INVARIANTS
 3600         int i = 0;
 3601 #endif
 3602 
 3603         while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
 3604                 WORKLIST_REMOVE(wk);
 3605                 wk->wk_state &= ~INPROGRESS;
 3606                 wk->wk_state |= COMPLETE;
 3607                 KASSERT(i++ < jseg->js_cnt,
 3608                     ("handle_written_jseg: overflow %d >= %d",
 3609                     i - 1, jseg->js_cnt));
 3610                 switch (wk->wk_type) {
 3611                 case D_JADDREF:
 3612                         handle_written_jaddref(WK_JADDREF(wk));
 3613                         break;
 3614                 case D_JREMREF:
 3615                         handle_written_jremref(WK_JREMREF(wk));
 3616                         break;
 3617                 case D_JMVREF:
 3618                         rele_jseg(jseg);        /* No jsegdep. */
 3619                         jmvref = WK_JMVREF(wk);
 3620                         LIST_REMOVE(jmvref, jm_deps);
 3621                         if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
 3622                                 free_pagedep(jmvref->jm_pagedep);
 3623                         WORKITEM_FREE(jmvref, D_JMVREF);
 3624                         break;
 3625                 case D_JNEWBLK:
 3626                         handle_written_jnewblk(WK_JNEWBLK(wk));
 3627                         break;
 3628                 case D_JFREEBLK:
 3629                         handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
 3630                         break;
 3631                 case D_JTRUNC:
 3632                         handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
 3633                         break;
 3634                 case D_JFSYNC:
 3635                         rele_jseg(jseg);        /* No jsegdep. */
 3636                         WORKITEM_FREE(wk, D_JFSYNC);
 3637                         break;
 3638                 case D_JFREEFRAG:
 3639                         handle_written_jfreefrag(WK_JFREEFRAG(wk));
 3640                         break;
 3641                 default:
 3642                         panic("handle_written_jseg: Unknown type %s",
 3643                             TYPENAME(wk->wk_type));
 3644                         /* NOTREACHED */
 3645                 }
 3646         }
 3647         /* Release the self reference so the structure may be freed. */
 3648         rele_jseg(jseg);
 3649 }
 3650 
 3651 /*
 3652  * Determine which jsegs are ready for completion processing.  Waits for
 3653  * synchronize cache to complete as well as forcing in-order completion
 3654  * of journal entries.
 3655  */
 3656 static void
 3657 complete_jsegs(jseg)
 3658         struct jseg *jseg;
 3659 {
 3660         struct jblocks *jblocks;
 3661         struct jseg *jsegn;
 3662 
 3663         jblocks = jseg->js_jblocks;
 3664         /*
 3665          * Don't allow out of order completions.  If this isn't the first
 3666          * block wait for it to write before we're done.
 3667          */
 3668         if (jseg != jblocks->jb_writeseg)
 3669                 return;
 3670         /* Iterate through available jsegs processing their entries. */
 3671         while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
 3672                 jblocks->jb_oldestwrseq = jseg->js_oldseq;
 3673                 jsegn = TAILQ_NEXT(jseg, js_next);
 3674                 complete_jseg(jseg);
 3675                 jseg = jsegn;
 3676         }
 3677         jblocks->jb_writeseg = jseg;
 3678         /*
 3679          * Attempt to free jsegs now that oldestwrseq may have advanced. 
 3680          */
 3681         free_jsegs(jblocks);
 3682 }
 3683 
 3684 /*
 3685  * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
 3686  * the final completions.
 3687  */
 3688 static void
 3689 handle_written_jseg(jseg, bp)
 3690         struct jseg *jseg;
 3691         struct buf *bp;
 3692 {
 3693 
 3694         if (jseg->js_refs == 0)
 3695                 panic("handle_written_jseg: No self-reference on %p", jseg);
 3696         jseg->js_state |= DEPCOMPLETE;
 3697         /*
 3698          * We'll never need this buffer again, set flags so it will be
 3699          * discarded.
 3700          */
 3701         bp->b_flags |= B_INVAL | B_NOCACHE;
 3702         pbrelvp(bp);
 3703         complete_jsegs(jseg);
 3704 }
 3705 
 3706 static inline struct jsegdep *
 3707 inoref_jseg(inoref)
 3708         struct inoref *inoref;
 3709 {
 3710         struct jsegdep *jsegdep;
 3711 
 3712         jsegdep = inoref->if_jsegdep;
 3713         inoref->if_jsegdep = NULL;
 3714 
 3715         return (jsegdep);
 3716 }
 3717 
 3718 /*
 3719  * Called once a jremref has made it to stable store.  The jremref is marked
 3720  * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
 3721  * for the jremref to complete will be awoken by free_jremref.
 3722  */
 3723 static void
 3724 handle_written_jremref(jremref)
 3725         struct jremref *jremref;
 3726 {
 3727         struct inodedep *inodedep;
 3728         struct jsegdep *jsegdep;
 3729         struct dirrem *dirrem;
 3730 
 3731         /* Grab the jsegdep. */
 3732         jsegdep = inoref_jseg(&jremref->jr_ref);
 3733         /*
 3734          * Remove us from the inoref list.
 3735          */
 3736         if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
 3737             0, &inodedep) == 0)
 3738                 panic("handle_written_jremref: Lost inodedep");
 3739         TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
 3740         /*
 3741          * Complete the dirrem.
 3742          */
 3743         dirrem = jremref->jr_dirrem;
 3744         jremref->jr_dirrem = NULL;
 3745         LIST_REMOVE(jremref, jr_deps);
 3746         jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
 3747         jwork_insert(&dirrem->dm_jwork, jsegdep);
 3748         if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
 3749             (dirrem->dm_state & COMPLETE) != 0)
 3750                 add_to_worklist(&dirrem->dm_list, 0);
 3751         free_jremref(jremref);
 3752 }
 3753 
 3754 /*
 3755  * Called once a jaddref has made it to stable store.  The dependency is
 3756  * marked complete and any dependent structures are added to the inode
 3757  * bufwait list to be completed as soon as it is written.  If a bitmap write
 3758  * depends on this entry we move the inode into the inodedephd of the
 3759  * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
 3760  */
 3761 static void
 3762 handle_written_jaddref(jaddref)
 3763         struct jaddref *jaddref;
 3764 {
 3765         struct jsegdep *jsegdep;
 3766         struct inodedep *inodedep;
 3767         struct diradd *diradd;
 3768         struct mkdir *mkdir;
 3769 
 3770         /* Grab the jsegdep. */
 3771         jsegdep = inoref_jseg(&jaddref->ja_ref);
 3772         mkdir = NULL;
 3773         diradd = NULL;
 3774         if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
 3775             0, &inodedep) == 0)
 3776                 panic("handle_written_jaddref: Lost inodedep.");
 3777         if (jaddref->ja_diradd == NULL)
 3778                 panic("handle_written_jaddref: No dependency");
 3779         if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
 3780                 diradd = jaddref->ja_diradd;
 3781                 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
 3782         } else if (jaddref->ja_state & MKDIR_PARENT) {
 3783                 mkdir = jaddref->ja_mkdir;
 3784                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
 3785         } else if (jaddref->ja_state & MKDIR_BODY)
 3786                 mkdir = jaddref->ja_mkdir;
 3787         else
 3788                 panic("handle_written_jaddref: Unknown dependency %p",
 3789                     jaddref->ja_diradd);
 3790         jaddref->ja_diradd = NULL;      /* also clears ja_mkdir */
 3791         /*
 3792          * Remove us from the inode list.
 3793          */
 3794         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
 3795         /*
 3796          * The mkdir may be waiting on the jaddref to clear before freeing.
 3797          */
 3798         if (mkdir) {
 3799                 KASSERT(mkdir->md_list.wk_type == D_MKDIR,
 3800                     ("handle_written_jaddref: Incorrect type for mkdir %s",
 3801                     TYPENAME(mkdir->md_list.wk_type)));
 3802                 mkdir->md_jaddref = NULL;
 3803                 diradd = mkdir->md_diradd;
 3804                 mkdir->md_state |= DEPCOMPLETE;
 3805                 complete_mkdir(mkdir);
 3806         }
 3807         jwork_insert(&diradd->da_jwork, jsegdep);
 3808         if (jaddref->ja_state & NEWBLOCK) {
 3809                 inodedep->id_state |= ONDEPLIST;
 3810                 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
 3811                     inodedep, id_deps);
 3812         }
 3813         free_jaddref(jaddref);
 3814 }
 3815 
 3816 /*
 3817  * Called once a jnewblk journal is written.  The allocdirect or allocindir
 3818  * is placed in the bmsafemap to await notification of a written bitmap.  If
 3819  * the operation was canceled we add the segdep to the appropriate
 3820  * dependency to free the journal space once the canceling operation
 3821  * completes.
 3822  */
 3823 static void
 3824 handle_written_jnewblk(jnewblk)
 3825         struct jnewblk *jnewblk;
 3826 {
 3827         struct bmsafemap *bmsafemap;
 3828         struct freefrag *freefrag;
 3829         struct freework *freework;
 3830         struct jsegdep *jsegdep;
 3831         struct newblk *newblk;
 3832 
 3833         /* Grab the jsegdep. */
 3834         jsegdep = jnewblk->jn_jsegdep;
 3835         jnewblk->jn_jsegdep = NULL;
 3836         if (jnewblk->jn_dep == NULL) 
 3837                 panic("handle_written_jnewblk: No dependency for the segdep.");
 3838         switch (jnewblk->jn_dep->wk_type) {
 3839         case D_NEWBLK:
 3840         case D_ALLOCDIRECT:
 3841         case D_ALLOCINDIR:
 3842                 /*
 3843                  * Add the written block to the bmsafemap so it can
 3844                  * be notified when the bitmap is on disk.
 3845                  */
 3846                 newblk = WK_NEWBLK(jnewblk->jn_dep);
 3847                 newblk->nb_jnewblk = NULL;
 3848                 if ((newblk->nb_state & GOINGAWAY) == 0) {
 3849                         bmsafemap = newblk->nb_bmsafemap;
 3850                         newblk->nb_state |= ONDEPLIST;
 3851                         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
 3852                             nb_deps);
 3853                 }
 3854                 jwork_insert(&newblk->nb_jwork, jsegdep);
 3855                 break;
 3856         case D_FREEFRAG:
 3857                 /*
 3858                  * A newblock being removed by a freefrag when replaced by
 3859                  * frag extension.
 3860                  */
 3861                 freefrag = WK_FREEFRAG(jnewblk->jn_dep);
 3862                 freefrag->ff_jdep = NULL;
 3863                 jwork_insert(&freefrag->ff_jwork, jsegdep);
 3864                 break;
 3865         case D_FREEWORK:
 3866                 /*
 3867                  * A direct block was removed by truncate.
 3868                  */
 3869                 freework = WK_FREEWORK(jnewblk->jn_dep);
 3870                 freework->fw_jnewblk = NULL;
 3871                 jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
 3872                 break;
 3873         default:
 3874                 panic("handle_written_jnewblk: Unknown type %d.",
 3875                     jnewblk->jn_dep->wk_type);
 3876         }
 3877         jnewblk->jn_dep = NULL;
 3878         free_jnewblk(jnewblk);
 3879 }
 3880 
 3881 /*
 3882  * Cancel a jfreefrag that won't be needed, probably due to colliding with
 3883  * an in-flight allocation that has not yet been committed.  Divorce us
 3884  * from the freefrag and mark it DEPCOMPLETE so that it may be added
 3885  * to the worklist.
 3886  */
 3887 static void
 3888 cancel_jfreefrag(jfreefrag)
 3889         struct jfreefrag *jfreefrag;
 3890 {
 3891         struct freefrag *freefrag;
 3892 
 3893         if (jfreefrag->fr_jsegdep) {
 3894                 free_jsegdep(jfreefrag->fr_jsegdep);
 3895                 jfreefrag->fr_jsegdep = NULL;
 3896         }
 3897         freefrag = jfreefrag->fr_freefrag;
 3898         jfreefrag->fr_freefrag = NULL;
 3899         free_jfreefrag(jfreefrag);
 3900         freefrag->ff_state |= DEPCOMPLETE;
 3901         CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
 3902 }
 3903 
 3904 /*
 3905  * Free a jfreefrag when the parent freefrag is rendered obsolete.
 3906  */
 3907 static void
 3908 free_jfreefrag(jfreefrag)
 3909         struct jfreefrag *jfreefrag;
 3910 {
 3911 
 3912         if (jfreefrag->fr_state & INPROGRESS)
 3913                 WORKLIST_REMOVE(&jfreefrag->fr_list);
 3914         else if (jfreefrag->fr_state & ONWORKLIST)
 3915                 remove_from_journal(&jfreefrag->fr_list);
 3916         if (jfreefrag->fr_freefrag != NULL)
 3917                 panic("free_jfreefrag:  Still attached to a freefrag.");
 3918         WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
 3919 }
 3920 
 3921 /*
 3922  * Called when the journal write for a jfreefrag completes.  The parent
 3923  * freefrag is added to the worklist if this completes its dependencies.
 3924  */
 3925 static void
 3926 handle_written_jfreefrag(jfreefrag)
 3927         struct jfreefrag *jfreefrag;
 3928 {
 3929         struct jsegdep *jsegdep;
 3930         struct freefrag *freefrag;
 3931 
 3932         /* Grab the jsegdep. */
 3933         jsegdep = jfreefrag->fr_jsegdep;
 3934         jfreefrag->fr_jsegdep = NULL;
 3935         freefrag = jfreefrag->fr_freefrag;
 3936         if (freefrag == NULL)
 3937                 panic("handle_written_jfreefrag: No freefrag.");
 3938         freefrag->ff_state |= DEPCOMPLETE;
 3939         freefrag->ff_jdep = NULL;
 3940         jwork_insert(&freefrag->ff_jwork, jsegdep);
 3941         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
 3942                 add_to_worklist(&freefrag->ff_list, 0);
 3943         jfreefrag->fr_freefrag = NULL;
 3944         free_jfreefrag(jfreefrag);
 3945 }
 3946 
 3947 /*
 3948  * Called when the journal write for a jfreeblk completes.  The jfreeblk
 3949  * is removed from the freeblks list of pending journal writes and the
 3950  * jsegdep is moved to the freeblks jwork to be completed when all blocks
 3951  * have been reclaimed.
 3952  */
 3953 static void
 3954 handle_written_jblkdep(jblkdep)
 3955         struct jblkdep *jblkdep;
 3956 {
 3957         struct freeblks *freeblks;
 3958         struct jsegdep *jsegdep;
 3959 
 3960         /* Grab the jsegdep. */
 3961         jsegdep = jblkdep->jb_jsegdep;
 3962         jblkdep->jb_jsegdep = NULL;
 3963         freeblks = jblkdep->jb_freeblks;
 3964         LIST_REMOVE(jblkdep, jb_deps);
 3965         jwork_insert(&freeblks->fb_jwork, jsegdep);
 3966         /*
 3967          * If the freeblks is all journaled, we can add it to the worklist.
 3968          */
 3969         if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
 3970             (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
 3971                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
 3972 
 3973         free_jblkdep(jblkdep);
 3974 }
 3975 
 3976 static struct jsegdep *
 3977 newjsegdep(struct worklist *wk)
 3978 {
 3979         struct jsegdep *jsegdep;
 3980 
 3981         jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
 3982         workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
 3983         jsegdep->jd_seg = NULL;
 3984 
 3985         return (jsegdep);
 3986 }
 3987 
 3988 static struct jmvref *
 3989 newjmvref(dp, ino, oldoff, newoff)
 3990         struct inode *dp;
 3991         ino_t ino;
 3992         off_t oldoff;
 3993         off_t newoff;
 3994 {
 3995         struct jmvref *jmvref;
 3996 
 3997         jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
 3998         workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp));
 3999         jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
 4000         jmvref->jm_parent = dp->i_number;
 4001         jmvref->jm_ino = ino;
 4002         jmvref->jm_oldoff = oldoff;
 4003         jmvref->jm_newoff = newoff;
 4004 
 4005         return (jmvref);
 4006 }
 4007 
 4008 /*
 4009  * Allocate a new jremref that tracks the removal of ip from dp with the
 4010  * directory entry offset of diroff.  Mark the entry as ATTACHED and
 4011  * DEPCOMPLETE as we have all the information required for the journal write
 4012  * and the directory has already been removed from the buffer.  The caller
 4013  * is responsible for linking the jremref into the pagedep and adding it
 4014  * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
 4015  * a DOTDOT addition so handle_workitem_remove() can properly assign
 4016  * the jsegdep when we're done.
 4017  */
 4018 static struct jremref *
 4019 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
 4020     off_t diroff, nlink_t nlink)
 4021 {
 4022         struct jremref *jremref;
 4023 
 4024         jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
 4025         workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
 4026         jremref->jr_state = ATTACHED;
 4027         newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
 4028            nlink, ip->i_mode);
 4029         jremref->jr_dirrem = dirrem;
 4030 
 4031         return (jremref);
 4032 }
 4033 
 4034 static inline void
 4035 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
 4036     nlink_t nlink, uint16_t mode)
 4037 {
 4038 
 4039         inoref->if_jsegdep = newjsegdep(&inoref->if_list);
 4040         inoref->if_diroff = diroff;
 4041         inoref->if_ino = ino;
 4042         inoref->if_parent = parent;
 4043         inoref->if_nlink = nlink;
 4044         inoref->if_mode = mode;
 4045 }
 4046 
 4047 /*
 4048  * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
 4049  * directory offset may not be known until later.  The caller is responsible
 4050  * adding the entry to the journal when this information is available.  nlink
 4051  * should be the link count prior to the addition and mode is only required
 4052  * to have the correct FMT.
 4053  */
 4054 static struct jaddref *
 4055 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
 4056     uint16_t mode)
 4057 {
 4058         struct jaddref *jaddref;
 4059 
 4060         jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
 4061         workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
 4062         jaddref->ja_state = ATTACHED;
 4063         jaddref->ja_mkdir = NULL;
 4064         newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
 4065 
 4066         return (jaddref);
 4067 }
 4068 
 4069 /*
 4070  * Create a new free dependency for a freework.  The caller is responsible
 4071  * for adjusting the reference count when it has the lock held.  The freedep
 4072  * will track an outstanding bitmap write that will ultimately clear the
 4073  * freework to continue.
 4074  */
 4075 static struct freedep *
 4076 newfreedep(struct freework *freework)
 4077 {
 4078         struct freedep *freedep;
 4079 
 4080         freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
 4081         workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
 4082         freedep->fd_freework = freework;
 4083 
 4084         return (freedep);
 4085 }
 4086 
 4087 /*
 4088  * Free a freedep structure once the buffer it is linked to is written.  If
 4089  * this is the last reference to the freework schedule it for completion.
 4090  */
 4091 static void
 4092 free_freedep(freedep)
 4093         struct freedep *freedep;
 4094 {
 4095         struct freework *freework;
 4096 
 4097         freework = freedep->fd_freework;
 4098         freework->fw_freeblks->fb_cgwait--;
 4099         if (--freework->fw_ref == 0)
 4100                 freework_enqueue(freework);
 4101         WORKITEM_FREE(freedep, D_FREEDEP);
 4102 }
 4103 
 4104 /*
 4105  * Allocate a new freework structure that may be a level in an indirect
 4106  * when parent is not NULL or a top level block when it is.  The top level
 4107  * freework structures are allocated without the per-filesystem lock held
 4108  * and before the freeblks is visible outside of softdep_setup_freeblocks().
 4109  */
 4110 static struct freework *
 4111 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
 4112         struct ufsmount *ump;
 4113         struct freeblks *freeblks;
 4114         struct freework *parent;
 4115         ufs_lbn_t lbn;
 4116         ufs2_daddr_t nb;
 4117         int frags;
 4118         int off;
 4119         int journal;
 4120 {
 4121         struct freework *freework;
 4122 
 4123         freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
 4124         workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
 4125         freework->fw_state = ATTACHED;
 4126         freework->fw_jnewblk = NULL;
 4127         freework->fw_freeblks = freeblks;
 4128         freework->fw_parent = parent;
 4129         freework->fw_lbn = lbn;
 4130         freework->fw_blkno = nb;
 4131         freework->fw_frags = frags;
 4132         freework->fw_indir = NULL;
 4133         freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
 4134                 ? 0 : NINDIR(ump->um_fs) + 1;
 4135         freework->fw_start = freework->fw_off = off;
 4136         if (journal)
 4137                 newjfreeblk(freeblks, lbn, nb, frags);
 4138         if (parent == NULL) {
 4139                 ACQUIRE_LOCK(ump);
 4140                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
 4141                 freeblks->fb_ref++;
 4142                 FREE_LOCK(ump);
 4143         }
 4144 
 4145         return (freework);
 4146 }
 4147 
 4148 /*
 4149  * Eliminate a jfreeblk for a block that does not need journaling.
 4150  */
 4151 static void
 4152 cancel_jfreeblk(freeblks, blkno)
 4153         struct freeblks *freeblks;
 4154         ufs2_daddr_t blkno;
 4155 {
 4156         struct jfreeblk *jfreeblk;
 4157         struct jblkdep *jblkdep;
 4158 
 4159         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
 4160                 if (jblkdep->jb_list.wk_type != D_JFREEBLK)
 4161                         continue;
 4162                 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
 4163                 if (jfreeblk->jf_blkno == blkno)
 4164                         break;
 4165         }
 4166         if (jblkdep == NULL)
 4167                 return;
 4168         CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
 4169         free_jsegdep(jblkdep->jb_jsegdep);
 4170         LIST_REMOVE(jblkdep, jb_deps);
 4171         WORKITEM_FREE(jfreeblk, D_JFREEBLK);
 4172 }
 4173 
 4174 /*
 4175  * Allocate a new jfreeblk to journal top level block pointer when truncating
 4176  * a file.  The caller must add this to the worklist when the per-filesystem
 4177  * lock is held.
 4178  */
 4179 static struct jfreeblk *
 4180 newjfreeblk(freeblks, lbn, blkno, frags)
 4181         struct freeblks *freeblks;
 4182         ufs_lbn_t lbn;
 4183         ufs2_daddr_t blkno;
 4184         int frags;
 4185 {
 4186         struct jfreeblk *jfreeblk;
 4187 
 4188         jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
 4189         workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
 4190             freeblks->fb_list.wk_mp);
 4191         jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
 4192         jfreeblk->jf_dep.jb_freeblks = freeblks;
 4193         jfreeblk->jf_ino = freeblks->fb_inum;
 4194         jfreeblk->jf_lbn = lbn;
 4195         jfreeblk->jf_blkno = blkno;
 4196         jfreeblk->jf_frags = frags;
 4197         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
 4198 
 4199         return (jfreeblk);
 4200 }
 4201 
 4202 /*
 4203  * The journal is only prepared to handle full-size block numbers, so we
 4204  * have to adjust the record to reflect the change to a full-size block.
 4205  * For example, suppose we have a block made up of fragments 8-15 and
 4206  * want to free its last two fragments. We are given a request that says:
 4207  *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
 4208  * where frags are the number of fragments to free and oldfrags are the
 4209  * number of fragments to keep. To block align it, we have to change it to
 4210  * have a valid full-size blkno, so it becomes:
 4211  *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
 4212  */
 4213 static void
 4214 adjust_newfreework(freeblks, frag_offset)
 4215         struct freeblks *freeblks;
 4216         int frag_offset;
 4217 {
 4218         struct jfreeblk *jfreeblk;
 4219 
 4220         KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
 4221             LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
 4222             ("adjust_newfreework: Missing freeblks dependency"));
 4223 
 4224         jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
 4225         jfreeblk->jf_blkno -= frag_offset;
 4226         jfreeblk->jf_frags += frag_offset;
 4227 }
 4228 
 4229 /*
 4230  * Allocate a new jtrunc to track a partial truncation.
 4231  */
 4232 static struct jtrunc *
 4233 newjtrunc(freeblks, size, extsize)
 4234         struct freeblks *freeblks;
 4235         off_t size;
 4236         int extsize;
 4237 {
 4238         struct jtrunc *jtrunc;
 4239 
 4240         jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
 4241         workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
 4242             freeblks->fb_list.wk_mp);
 4243         jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
 4244         jtrunc->jt_dep.jb_freeblks = freeblks;
 4245         jtrunc->jt_ino = freeblks->fb_inum;
 4246         jtrunc->jt_size = size;
 4247         jtrunc->jt_extsize = extsize;
 4248         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
 4249 
 4250         return (jtrunc);
 4251 }
 4252 
 4253 /*
 4254  * If we're canceling a new bitmap we have to search for another ref
 4255  * to move into the bmsafemap dep.  This might be better expressed
 4256  * with another structure.
 4257  */
 4258 static void
 4259 move_newblock_dep(jaddref, inodedep)
 4260         struct jaddref *jaddref;
 4261         struct inodedep *inodedep;
 4262 {
 4263         struct inoref *inoref;
 4264         struct jaddref *jaddrefn;
 4265 
 4266         jaddrefn = NULL;
 4267         for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
 4268             inoref = TAILQ_NEXT(inoref, if_deps)) {
 4269                 if ((jaddref->ja_state & NEWBLOCK) &&
 4270                     inoref->if_list.wk_type == D_JADDREF) {
 4271                         jaddrefn = (struct jaddref *)inoref;
 4272                         break;
 4273                 }
 4274         }
 4275         if (jaddrefn == NULL)
 4276                 return;
 4277         jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
 4278         jaddrefn->ja_state |= jaddref->ja_state &
 4279             (ATTACHED | UNDONE | NEWBLOCK);
 4280         jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
 4281         jaddref->ja_state |= ATTACHED;
 4282         LIST_REMOVE(jaddref, ja_bmdeps);
 4283         LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
 4284             ja_bmdeps);
 4285 }
 4286 
 4287 /*
 4288  * Cancel a jaddref either before it has been written or while it is being
 4289  * written.  This happens when a link is removed before the add reaches
 4290  * the disk.  The jaddref dependency is kept linked into the bmsafemap
 4291  * and inode to prevent the link count or bitmap from reaching the disk
 4292  * until handle_workitem_remove() re-adjusts the counts and bitmaps as
 4293  * required.
 4294  *
 4295  * Returns 1 if the canceled addref requires journaling of the remove and
 4296  * 0 otherwise.
 4297  */
 4298 static int
 4299 cancel_jaddref(jaddref, inodedep, wkhd)
 4300         struct jaddref *jaddref;
 4301         struct inodedep *inodedep;
 4302         struct workhead *wkhd;
 4303 {
 4304         struct inoref *inoref;
 4305         struct jsegdep *jsegdep;
 4306         int needsj;
 4307 
 4308         KASSERT((jaddref->ja_state & COMPLETE) == 0,
 4309             ("cancel_jaddref: Canceling complete jaddref"));
 4310         if (jaddref->ja_state & (INPROGRESS | COMPLETE))
 4311                 needsj = 1;
 4312         else
 4313                 needsj = 0;
 4314         if (inodedep == NULL)
 4315                 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
 4316                     0, &inodedep) == 0)
 4317                         panic("cancel_jaddref: Lost inodedep");
 4318         /*
 4319          * We must adjust the nlink of any reference operation that follows
 4320          * us so that it is consistent with the in-memory reference.  This
 4321          * ensures that inode nlink rollbacks always have the correct link.
 4322          */
 4323         if (needsj == 0) {
 4324                 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
 4325                     inoref = TAILQ_NEXT(inoref, if_deps)) {
 4326                         if (inoref->if_state & GOINGAWAY)
 4327                                 break;
 4328                         inoref->if_nlink--;
 4329                 }
 4330         }
 4331         jsegdep = inoref_jseg(&jaddref->ja_ref);
 4332         if (jaddref->ja_state & NEWBLOCK)
 4333                 move_newblock_dep(jaddref, inodedep);
 4334         wake_worklist(&jaddref->ja_list);
 4335         jaddref->ja_mkdir = NULL;
 4336         if (jaddref->ja_state & INPROGRESS) {
 4337                 jaddref->ja_state &= ~INPROGRESS;
 4338                 WORKLIST_REMOVE(&jaddref->ja_list);
 4339                 jwork_insert(wkhd, jsegdep);
 4340         } else {
 4341                 free_jsegdep(jsegdep);
 4342                 if (jaddref->ja_state & DEPCOMPLETE)
 4343                         remove_from_journal(&jaddref->ja_list);
 4344         }
 4345         jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
 4346         /*
 4347          * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
 4348          * can arrange for them to be freed with the bitmap.  Otherwise we
 4349          * no longer need this addref attached to the inoreflst and it
 4350          * will incorrectly adjust nlink if we leave it.
 4351          */
 4352         if ((jaddref->ja_state & NEWBLOCK) == 0) {
 4353                 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
 4354                     if_deps);
 4355                 jaddref->ja_state |= COMPLETE;
 4356                 free_jaddref(jaddref);
 4357                 return (needsj);
 4358         }
 4359         /*
 4360          * Leave the head of the list for jsegdeps for fast merging.
 4361          */
 4362         if (LIST_FIRST(wkhd) != NULL) {
 4363                 jaddref->ja_state |= ONWORKLIST;
 4364                 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
 4365         } else
 4366                 WORKLIST_INSERT(wkhd, &jaddref->ja_list);
 4367 
 4368         return (needsj);
 4369 }
 4370 
 4371 /* 
 4372  * Attempt to free a jaddref structure when some work completes.  This
 4373  * should only succeed once the entry is written and all dependencies have
 4374  * been notified.
 4375  */
 4376 static void
 4377 free_jaddref(jaddref)
 4378         struct jaddref *jaddref;
 4379 {
 4380 
 4381         if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
 4382                 return;
 4383         if (jaddref->ja_ref.if_jsegdep)
 4384                 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
 4385                     jaddref, jaddref->ja_state);
 4386         if (jaddref->ja_state & NEWBLOCK)
 4387                 LIST_REMOVE(jaddref, ja_bmdeps);
 4388         if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
 4389                 panic("free_jaddref: Bad state %p(0x%X)",
 4390                     jaddref, jaddref->ja_state);
 4391         if (jaddref->ja_mkdir != NULL)
 4392                 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
 4393         WORKITEM_FREE(jaddref, D_JADDREF);
 4394 }
 4395 
 4396 /*
 4397  * Free a jremref structure once it has been written or discarded.
 4398  */
 4399 static void
 4400 free_jremref(jremref)
 4401         struct jremref *jremref;
 4402 {
 4403 
 4404         if (jremref->jr_ref.if_jsegdep)
 4405                 free_jsegdep(jremref->jr_ref.if_jsegdep);
 4406         if (jremref->jr_state & INPROGRESS)
 4407                 panic("free_jremref: IO still pending");
 4408         WORKITEM_FREE(jremref, D_JREMREF);
 4409 }
 4410 
 4411 /*
 4412  * Free a jnewblk structure.
 4413  */
 4414 static void
 4415 free_jnewblk(jnewblk)
 4416         struct jnewblk *jnewblk;
 4417 {
 4418 
 4419         if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
 4420                 return;
 4421         LIST_REMOVE(jnewblk, jn_deps);
 4422         if (jnewblk->jn_dep != NULL)
 4423                 panic("free_jnewblk: Dependency still attached.");
 4424         WORKITEM_FREE(jnewblk, D_JNEWBLK);
 4425 }
 4426 
 4427 /*
 4428  * Cancel a jnewblk which has been been made redundant by frag extension.
 4429  */
 4430 static void
 4431 cancel_jnewblk(jnewblk, wkhd)
 4432         struct jnewblk *jnewblk;
 4433         struct workhead *wkhd;
 4434 {
 4435         struct jsegdep *jsegdep;
 4436 
 4437         CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
 4438         jsegdep = jnewblk->jn_jsegdep;
 4439         if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
 4440                 panic("cancel_jnewblk: Invalid state");
 4441         jnewblk->jn_jsegdep  = NULL;
 4442         jnewblk->jn_dep = NULL;
 4443         jnewblk->jn_state |= GOINGAWAY;
 4444         if (jnewblk->jn_state & INPROGRESS) {
 4445                 jnewblk->jn_state &= ~INPROGRESS;
 4446                 WORKLIST_REMOVE(&jnewblk->jn_list);
 4447                 jwork_insert(wkhd, jsegdep);
 4448         } else {
 4449                 free_jsegdep(jsegdep);
 4450                 remove_from_journal(&jnewblk->jn_list);
 4451         }
 4452         wake_worklist(&jnewblk->jn_list);
 4453         WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
 4454 }
 4455 
 4456 static void
 4457 free_jblkdep(jblkdep)
 4458         struct jblkdep *jblkdep;
 4459 {
 4460 
 4461         if (jblkdep->jb_list.wk_type == D_JFREEBLK)
 4462                 WORKITEM_FREE(jblkdep, D_JFREEBLK);
 4463         else if (jblkdep->jb_list.wk_type == D_JTRUNC)
 4464                 WORKITEM_FREE(jblkdep, D_JTRUNC);
 4465         else
 4466                 panic("free_jblkdep: Unexpected type %s",
 4467                     TYPENAME(jblkdep->jb_list.wk_type));
 4468 }
 4469 
 4470 /*
 4471  * Free a single jseg once it is no longer referenced in memory or on
 4472  * disk.  Reclaim journal blocks and dependencies waiting for the segment
 4473  * to disappear.
 4474  */
 4475 static void
 4476 free_jseg(jseg, jblocks)
 4477         struct jseg *jseg;
 4478         struct jblocks *jblocks;
 4479 {
 4480         struct freework *freework;
 4481 
 4482         /*
 4483          * Free freework structures that were lingering to indicate freed
 4484          * indirect blocks that forced journal write ordering on reallocate.
 4485          */
 4486         while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
 4487                 indirblk_remove(freework);
 4488         if (jblocks->jb_oldestseg == jseg)
 4489                 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
 4490         TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
 4491         jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
 4492         KASSERT(LIST_EMPTY(&jseg->js_entries),
 4493             ("free_jseg: Freed jseg has valid entries."));
 4494         WORKITEM_FREE(jseg, D_JSEG);
 4495 }
 4496 
 4497 /*
 4498  * Free all jsegs that meet the criteria for being reclaimed and update
 4499  * oldestseg.
 4500  */
 4501 static void
 4502 free_jsegs(jblocks)
 4503         struct jblocks *jblocks;
 4504 {
 4505         struct jseg *jseg;
 4506 
 4507         /*
 4508          * Free only those jsegs which have none allocated before them to
 4509          * preserve the journal space ordering.
 4510          */
 4511         while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
 4512                 /*
 4513                  * Only reclaim space when nothing depends on this journal
 4514                  * set and another set has written that it is no longer
 4515                  * valid.
 4516                  */
 4517                 if (jseg->js_refs != 0) {
 4518                         jblocks->jb_oldestseg = jseg;
 4519                         return;
 4520                 }
 4521                 if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
 4522                         break;
 4523                 if (jseg->js_seq > jblocks->jb_oldestwrseq)
 4524                         break;
 4525                 /*
 4526                  * We can free jsegs that didn't write entries when
 4527                  * oldestwrseq == js_seq.
 4528                  */
 4529                 if (jseg->js_seq == jblocks->jb_oldestwrseq &&
 4530                     jseg->js_cnt != 0)
 4531                         break;
 4532                 free_jseg(jseg, jblocks);
 4533         }
 4534         /*
 4535          * If we exited the loop above we still must discover the
 4536          * oldest valid segment.
 4537          */
 4538         if (jseg)
 4539                 for (jseg = jblocks->jb_oldestseg; jseg != NULL;
 4540                      jseg = TAILQ_NEXT(jseg, js_next))
 4541                         if (jseg->js_refs != 0)
 4542                                 break;
 4543         jblocks->jb_oldestseg = jseg;
 4544         /*
 4545          * The journal has no valid records but some jsegs may still be
 4546          * waiting on oldestwrseq to advance.  We force a small record
 4547          * out to permit these lingering records to be reclaimed.
 4548          */
 4549         if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
 4550                 jblocks->jb_needseg = 1;
 4551 }
 4552 
 4553 /*
 4554  * Release one reference to a jseg and free it if the count reaches 0.  This
 4555  * should eventually reclaim journal space as well.
 4556  */
 4557 static void
 4558 rele_jseg(jseg)
 4559         struct jseg *jseg;
 4560 {
 4561 
 4562         KASSERT(jseg->js_refs > 0,
 4563             ("free_jseg: Invalid refcnt %d", jseg->js_refs));
 4564         if (--jseg->js_refs != 0)
 4565                 return;
 4566         free_jsegs(jseg->js_jblocks);
 4567 }
 4568 
 4569 /*
 4570  * Release a jsegdep and decrement the jseg count.
 4571  */
 4572 static void
 4573 free_jsegdep(jsegdep)
 4574         struct jsegdep *jsegdep;
 4575 {
 4576 
 4577         if (jsegdep->jd_seg)
 4578                 rele_jseg(jsegdep->jd_seg);
 4579         WORKITEM_FREE(jsegdep, D_JSEGDEP);
 4580 }
 4581 
 4582 /*
 4583  * Wait for a journal item to make it to disk.  Initiate journal processing
 4584  * if required.
 4585  */
 4586 static int
 4587 jwait(wk, waitfor)
 4588         struct worklist *wk;
 4589         int waitfor;
 4590 {
 4591 
 4592         LOCK_OWNED(VFSTOUFS(wk->wk_mp));
 4593         /*
 4594          * Blocking journal waits cause slow synchronous behavior.  Record
 4595          * stats on the frequency of these blocking operations.
 4596          */
 4597         if (waitfor == MNT_WAIT) {
 4598                 stat_journal_wait++;
 4599                 switch (wk->wk_type) {
 4600                 case D_JREMREF:
 4601                 case D_JMVREF:
 4602                         stat_jwait_filepage++;
 4603                         break;
 4604                 case D_JTRUNC:
 4605                 case D_JFREEBLK:
 4606                         stat_jwait_freeblks++;
 4607                         break;
 4608                 case D_JNEWBLK:
 4609                         stat_jwait_newblk++;
 4610                         break;
 4611                 case D_JADDREF:
 4612                         stat_jwait_inode++;
 4613                         break;
 4614                 default:
 4615                         break;
 4616                 }
 4617         }
 4618         /*
 4619          * If IO has not started we process the journal.  We can't mark the
 4620          * worklist item as IOWAITING because we drop the lock while
 4621          * processing the journal and the worklist entry may be freed after
 4622          * this point.  The caller may call back in and re-issue the request.
 4623          */
 4624         if ((wk->wk_state & INPROGRESS) == 0) {
 4625                 softdep_process_journal(wk->wk_mp, wk, waitfor);
 4626                 if (waitfor != MNT_WAIT)
 4627                         return (EBUSY);
 4628                 return (0);
 4629         }
 4630         if (waitfor != MNT_WAIT)
 4631                 return (EBUSY);
 4632         wait_worklist(wk, "jwait");
 4633         return (0);
 4634 }
 4635 
 4636 /*
 4637  * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
 4638  * appropriate.  This is a convenience function to reduce duplicate code
 4639  * for the setup and revert functions below.
 4640  */
 4641 static struct inodedep *
 4642 inodedep_lookup_ip(ip)
 4643         struct inode *ip;
 4644 {
 4645         struct inodedep *inodedep;
 4646 
 4647         KASSERT(ip->i_nlink >= ip->i_effnlink,
 4648             ("inodedep_lookup_ip: bad delta"));
 4649         (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
 4650             &inodedep);
 4651         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 4652         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
 4653 
 4654         return (inodedep);
 4655 }
 4656 
 4657 /*
 4658  * Called prior to creating a new inode and linking it to a directory.  The
 4659  * jaddref structure must already be allocated by softdep_setup_inomapdep
 4660  * and it is discovered here so we can initialize the mode and update
 4661  * nlinkdelta.
 4662  */
 4663 void
 4664 softdep_setup_create(dp, ip)
 4665         struct inode *dp;
 4666         struct inode *ip;
 4667 {
 4668         struct inodedep *inodedep;
 4669         struct jaddref *jaddref;
 4670         struct vnode *dvp;
 4671 
 4672         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4673             ("softdep_setup_create called on non-softdep filesystem"));
 4674         KASSERT(ip->i_nlink == 1,
 4675             ("softdep_setup_create: Invalid link count."));
 4676         dvp = ITOV(dp);
 4677         ACQUIRE_LOCK(ITOUMP(dp));
 4678         inodedep = inodedep_lookup_ip(ip);
 4679         if (DOINGSUJ(dvp)) {
 4680                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4681                     inoreflst);
 4682                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
 4683                     ("softdep_setup_create: No addref structure present."));
 4684         }
 4685         softdep_prelink(dvp, NULL);
 4686         FREE_LOCK(ITOUMP(dp));
 4687 }
 4688 
 4689 /*
 4690  * Create a jaddref structure to track the addition of a DOTDOT link when
 4691  * we are reparenting an inode as part of a rename.  This jaddref will be
 4692  * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
 4693  * non-journaling softdep.
 4694  */
 4695 void
 4696 softdep_setup_dotdot_link(dp, ip)
 4697         struct inode *dp;
 4698         struct inode *ip;
 4699 {
 4700         struct inodedep *inodedep;
 4701         struct jaddref *jaddref;
 4702         struct vnode *dvp;
 4703 
 4704         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4705             ("softdep_setup_dotdot_link called on non-softdep filesystem"));
 4706         dvp = ITOV(dp);
 4707         jaddref = NULL;
 4708         /*
 4709          * We don't set MKDIR_PARENT as this is not tied to a mkdir and
 4710          * is used as a normal link would be.
 4711          */
 4712         if (DOINGSUJ(dvp))
 4713                 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
 4714                     dp->i_effnlink - 1, dp->i_mode);
 4715         ACQUIRE_LOCK(ITOUMP(dp));
 4716         inodedep = inodedep_lookup_ip(dp);
 4717         if (jaddref)
 4718                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 4719                     if_deps);
 4720         softdep_prelink(dvp, ITOV(ip));
 4721         FREE_LOCK(ITOUMP(dp));
 4722 }
 4723 
 4724 /*
 4725  * Create a jaddref structure to track a new link to an inode.  The directory
 4726  * offset is not known until softdep_setup_directory_add or
 4727  * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
 4728  * softdep.
 4729  */
 4730 void
 4731 softdep_setup_link(dp, ip)
 4732         struct inode *dp;
 4733         struct inode *ip;
 4734 {
 4735         struct inodedep *inodedep;
 4736         struct jaddref *jaddref;
 4737         struct vnode *dvp;
 4738 
 4739         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4740             ("softdep_setup_link called on non-softdep filesystem"));
 4741         dvp = ITOV(dp);
 4742         jaddref = NULL;
 4743         if (DOINGSUJ(dvp))
 4744                 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
 4745                     ip->i_mode);
 4746         ACQUIRE_LOCK(ITOUMP(dp));
 4747         inodedep = inodedep_lookup_ip(ip);
 4748         if (jaddref)
 4749                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 4750                     if_deps);
 4751         softdep_prelink(dvp, ITOV(ip));
 4752         FREE_LOCK(ITOUMP(dp));
 4753 }
 4754 
 4755 /*
 4756  * Called to create the jaddref structures to track . and .. references as
 4757  * well as lookup and further initialize the incomplete jaddref created
 4758  * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
 4759  * nlinkdelta for non-journaling softdep.
 4760  */
 4761 void
 4762 softdep_setup_mkdir(dp, ip)
 4763         struct inode *dp;
 4764         struct inode *ip;
 4765 {
 4766         struct inodedep *inodedep;
 4767         struct jaddref *dotdotaddref;
 4768         struct jaddref *dotaddref;
 4769         struct jaddref *jaddref;
 4770         struct vnode *dvp;
 4771 
 4772         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4773             ("softdep_setup_mkdir called on non-softdep filesystem"));
 4774         dvp = ITOV(dp);
 4775         dotaddref = dotdotaddref = NULL;
 4776         if (DOINGSUJ(dvp)) {
 4777                 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
 4778                     ip->i_mode);
 4779                 dotaddref->ja_state |= MKDIR_BODY;
 4780                 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
 4781                     dp->i_effnlink - 1, dp->i_mode);
 4782                 dotdotaddref->ja_state |= MKDIR_PARENT;
 4783         }
 4784         ACQUIRE_LOCK(ITOUMP(dp));
 4785         inodedep = inodedep_lookup_ip(ip);
 4786         if (DOINGSUJ(dvp)) {
 4787                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4788                     inoreflst);
 4789                 KASSERT(jaddref != NULL,
 4790                     ("softdep_setup_mkdir: No addref structure present."));
 4791                 KASSERT(jaddref->ja_parent == dp->i_number, 
 4792                     ("softdep_setup_mkdir: bad parent %ju",
 4793                     (uintmax_t)jaddref->ja_parent));
 4794                 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
 4795                     if_deps);
 4796         }
 4797         inodedep = inodedep_lookup_ip(dp);
 4798         if (DOINGSUJ(dvp))
 4799                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
 4800                     &dotdotaddref->ja_ref, if_deps);
 4801         softdep_prelink(ITOV(dp), NULL);
 4802         FREE_LOCK(ITOUMP(dp));
 4803 }
 4804 
 4805 /*
 4806  * Called to track nlinkdelta of the inode and parent directories prior to
 4807  * unlinking a directory.
 4808  */
 4809 void
 4810 softdep_setup_rmdir(dp, ip)
 4811         struct inode *dp;
 4812         struct inode *ip;
 4813 {
 4814         struct vnode *dvp;
 4815 
 4816         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4817             ("softdep_setup_rmdir called on non-softdep filesystem"));
 4818         dvp = ITOV(dp);
 4819         ACQUIRE_LOCK(ITOUMP(dp));
 4820         (void) inodedep_lookup_ip(ip);
 4821         (void) inodedep_lookup_ip(dp);
 4822         softdep_prelink(dvp, ITOV(ip));
 4823         FREE_LOCK(ITOUMP(dp));
 4824 }
 4825 
 4826 /*
 4827  * Called to track nlinkdelta of the inode and parent directories prior to
 4828  * unlink.
 4829  */
 4830 void
 4831 softdep_setup_unlink(dp, ip)
 4832         struct inode *dp;
 4833         struct inode *ip;
 4834 {
 4835         struct vnode *dvp;
 4836 
 4837         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4838             ("softdep_setup_unlink called on non-softdep filesystem"));
 4839         dvp = ITOV(dp);
 4840         ACQUIRE_LOCK(ITOUMP(dp));
 4841         (void) inodedep_lookup_ip(ip);
 4842         (void) inodedep_lookup_ip(dp);
 4843         softdep_prelink(dvp, ITOV(ip));
 4844         FREE_LOCK(ITOUMP(dp));
 4845 }
 4846 
 4847 /*
 4848  * Called to release the journal structures created by a failed non-directory
 4849  * creation.  Adjusts nlinkdelta for non-journaling softdep.
 4850  */
 4851 void
 4852 softdep_revert_create(dp, ip)
 4853         struct inode *dp;
 4854         struct inode *ip;
 4855 {
 4856         struct inodedep *inodedep;
 4857         struct jaddref *jaddref;
 4858         struct vnode *dvp;
 4859 
 4860         KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
 4861             ("softdep_revert_create called on non-softdep filesystem"));
 4862         dvp = ITOV(dp);
 4863         ACQUIRE_LOCK(ITOUMP(dp));
 4864         inodedep = inodedep_lookup_ip(ip);
 4865         if (DOINGSUJ(dvp)) {
 4866                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4867                     inoreflst);
 4868                 KASSERT(jaddref->ja_parent == dp->i_number,
 4869                     ("softdep_revert_create: addref parent mismatch"));
 4870                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
 4871         }
 4872         FREE_LOCK(ITOUMP(dp));
 4873 }
 4874 
 4875 /*
 4876  * Called to release the journal structures created by a failed link
 4877  * addition.  Adjusts nlinkdelta for non-journaling softdep.
 4878  */
 4879 void
 4880 softdep_revert_link(dp, ip)
 4881         struct inode *dp;
 4882         struct inode *ip;
 4883 {
 4884         struct inodedep *inodedep;
 4885         struct jaddref *jaddref;
 4886         struct vnode *dvp;
 4887 
 4888         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4889             ("softdep_revert_link called on non-softdep filesystem"));
 4890         dvp = ITOV(dp);
 4891         ACQUIRE_LOCK(ITOUMP(dp));
 4892         inodedep = inodedep_lookup_ip(ip);
 4893         if (DOINGSUJ(dvp)) {
 4894                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4895                     inoreflst);
 4896                 KASSERT(jaddref->ja_parent == dp->i_number,
 4897                     ("softdep_revert_link: addref parent mismatch"));
 4898                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
 4899         }
 4900         FREE_LOCK(ITOUMP(dp));
 4901 }
 4902 
 4903 /*
 4904  * Called to release the journal structures created by a failed mkdir
 4905  * attempt.  Adjusts nlinkdelta for non-journaling softdep.
 4906  */
 4907 void
 4908 softdep_revert_mkdir(dp, ip)
 4909         struct inode *dp;
 4910         struct inode *ip;
 4911 {
 4912         struct inodedep *inodedep;
 4913         struct jaddref *jaddref;
 4914         struct jaddref *dotaddref;
 4915         struct vnode *dvp;
 4916 
 4917         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4918             ("softdep_revert_mkdir called on non-softdep filesystem"));
 4919         dvp = ITOV(dp);
 4920 
 4921         ACQUIRE_LOCK(ITOUMP(dp));
 4922         inodedep = inodedep_lookup_ip(dp);
 4923         if (DOINGSUJ(dvp)) {
 4924                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4925                     inoreflst);
 4926                 KASSERT(jaddref->ja_parent == ip->i_number,
 4927                     ("softdep_revert_mkdir: dotdot addref parent mismatch"));
 4928                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
 4929         }
 4930         inodedep = inodedep_lookup_ip(ip);
 4931         if (DOINGSUJ(dvp)) {
 4932                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4933                     inoreflst);
 4934                 KASSERT(jaddref->ja_parent == dp->i_number,
 4935                     ("softdep_revert_mkdir: addref parent mismatch"));
 4936                 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
 4937                     inoreflst, if_deps);
 4938                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
 4939                 KASSERT(dotaddref->ja_parent == ip->i_number,
 4940                     ("softdep_revert_mkdir: dot addref parent mismatch"));
 4941                 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
 4942         }
 4943         FREE_LOCK(ITOUMP(dp));
 4944 }
 4945 
 4946 /* 
 4947  * Called to correct nlinkdelta after a failed rmdir.
 4948  */
 4949 void
 4950 softdep_revert_rmdir(dp, ip)
 4951         struct inode *dp;
 4952         struct inode *ip;
 4953 {
 4954 
 4955         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4956             ("softdep_revert_rmdir called on non-softdep filesystem"));
 4957         ACQUIRE_LOCK(ITOUMP(dp));
 4958         (void) inodedep_lookup_ip(ip);
 4959         (void) inodedep_lookup_ip(dp);
 4960         FREE_LOCK(ITOUMP(dp));
 4961 }
 4962 
 4963 /*
 4964  * Protecting the freemaps (or bitmaps).
 4965  * 
 4966  * To eliminate the need to execute fsck before mounting a filesystem
 4967  * after a power failure, one must (conservatively) guarantee that the
 4968  * on-disk copy of the bitmaps never indicate that a live inode or block is
 4969  * free.  So, when a block or inode is allocated, the bitmap should be
 4970  * updated (on disk) before any new pointers.  When a block or inode is
 4971  * freed, the bitmap should not be updated until all pointers have been
 4972  * reset.  The latter dependency is handled by the delayed de-allocation
 4973  * approach described below for block and inode de-allocation.  The former
 4974  * dependency is handled by calling the following procedure when a block or
 4975  * inode is allocated. When an inode is allocated an "inodedep" is created
 4976  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 4977  * Each "inodedep" is also inserted into the hash indexing structure so
 4978  * that any additional link additions can be made dependent on the inode
 4979  * allocation.
 4980  * 
 4981  * The ufs filesystem maintains a number of free block counts (e.g., per
 4982  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 4983  * in addition to the bitmaps.  These counts are used to improve efficiency
 4984  * during allocation and therefore must be consistent with the bitmaps.
 4985  * There is no convenient way to guarantee post-crash consistency of these
 4986  * counts with simple update ordering, for two main reasons: (1) The counts
 4987  * and bitmaps for a single cylinder group block are not in the same disk
 4988  * sector.  If a disk write is interrupted (e.g., by power failure), one may
 4989  * be written and the other not.  (2) Some of the counts are located in the
 4990  * superblock rather than the cylinder group block. So, we focus our soft
 4991  * updates implementation on protecting the bitmaps. When mounting a
 4992  * filesystem, we recompute the auxiliary counts from the bitmaps.
 4993  */
 4994 
 4995 /*
 4996  * Called just after updating the cylinder group block to allocate an inode.
 4997  */
 4998 void
 4999 softdep_setup_inomapdep(bp, ip, newinum, mode)
 5000         struct buf *bp;         /* buffer for cylgroup block with inode map */
 5001         struct inode *ip;       /* inode related to allocation */
 5002         ino_t newinum;          /* new inode number being allocated */
 5003         int mode;
 5004 {
 5005         struct inodedep *inodedep;
 5006         struct bmsafemap *bmsafemap;
 5007         struct jaddref *jaddref;
 5008         struct mount *mp;
 5009         struct fs *fs;
 5010 
 5011         mp = ITOVFS(ip);
 5012         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 5013             ("softdep_setup_inomapdep called on non-softdep filesystem"));
 5014         fs = VFSTOUFS(mp)->um_fs;
 5015         jaddref = NULL;
 5016 
 5017         /*
 5018          * Allocate the journal reference add structure so that the bitmap
 5019          * can be dependent on it.
 5020          */
 5021         if (MOUNTEDSUJ(mp)) {
 5022                 jaddref = newjaddref(ip, newinum, 0, 0, mode);
 5023                 jaddref->ja_state |= NEWBLOCK;
 5024         }
 5025 
 5026         /*
 5027          * Create a dependency for the newly allocated inode.
 5028          * Panic if it already exists as something is seriously wrong.
 5029          * Otherwise add it to the dependency list for the buffer holding
 5030          * the cylinder group map from which it was allocated.
 5031          *
 5032          * We have to preallocate a bmsafemap entry in case it is needed
 5033          * in bmsafemap_lookup since once we allocate the inodedep, we
 5034          * have to finish initializing it before we can FREE_LOCK().
 5035          * By preallocating, we avoid FREE_LOCK() while doing a malloc
 5036          * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
 5037          * creating the inodedep as it can be freed during the time
 5038          * that we FREE_LOCK() while allocating the inodedep. We must
 5039          * call workitem_alloc() before entering the locked section as
 5040          * it also acquires the lock and we must avoid trying doing so
 5041          * recursively.
 5042          */
 5043         bmsafemap = malloc(sizeof(struct bmsafemap),
 5044             M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 5045         workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
 5046         ACQUIRE_LOCK(ITOUMP(ip));
 5047         if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
 5048                 panic("softdep_setup_inomapdep: dependency %p for new"
 5049                     "inode already exists", inodedep);
 5050         bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
 5051         if (jaddref) {
 5052                 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
 5053                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 5054                     if_deps);
 5055         } else {
 5056                 inodedep->id_state |= ONDEPLIST;
 5057                 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
 5058         }
 5059         inodedep->id_bmsafemap = bmsafemap;
 5060         inodedep->id_state &= ~DEPCOMPLETE;
 5061         FREE_LOCK(ITOUMP(ip));
 5062 }
 5063 
 5064 /*
 5065  * Called just after updating the cylinder group block to
 5066  * allocate block or fragment.
 5067  */
 5068 void
 5069 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
 5070         struct buf *bp;         /* buffer for cylgroup block with block map */
 5071         struct mount *mp;       /* filesystem doing allocation */
 5072         ufs2_daddr_t newblkno;  /* number of newly allocated block */
 5073         int frags;              /* Number of fragments. */
 5074         int oldfrags;           /* Previous number of fragments for extend. */
 5075 {
 5076         struct newblk *newblk;
 5077         struct bmsafemap *bmsafemap;
 5078         struct jnewblk *jnewblk;
 5079         struct ufsmount *ump;
 5080         struct fs *fs;
 5081 
 5082         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 5083             ("softdep_setup_blkmapdep called on non-softdep filesystem"));
 5084         ump = VFSTOUFS(mp);
 5085         fs = ump->um_fs;
 5086         jnewblk = NULL;
 5087         /*
 5088          * Create a dependency for the newly allocated block.
 5089          * Add it to the dependency list for the buffer holding
 5090          * the cylinder group map from which it was allocated.
 5091          */
 5092         if (MOUNTEDSUJ(mp)) {
 5093                 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
 5094                 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
 5095                 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
 5096                 jnewblk->jn_state = ATTACHED;
 5097                 jnewblk->jn_blkno = newblkno;
 5098                 jnewblk->jn_frags = frags;
 5099                 jnewblk->jn_oldfrags = oldfrags;
 5100 #ifdef SUJ_DEBUG
 5101                 {
 5102                         struct cg *cgp;
 5103                         uint8_t *blksfree;
 5104                         long bno;
 5105                         int i;
 5106         
 5107                         cgp = (struct cg *)bp->b_data;
 5108                         blksfree = cg_blksfree(cgp);
 5109                         bno = dtogd(fs, jnewblk->jn_blkno);
 5110                         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
 5111                             i++) {
 5112                                 if (isset(blksfree, bno + i))
 5113                                         panic("softdep_setup_blkmapdep: "
 5114                                             "free fragment %d from %d-%d "
 5115                                             "state 0x%X dep %p", i,
 5116                                             jnewblk->jn_oldfrags,
 5117                                             jnewblk->jn_frags,
 5118                                             jnewblk->jn_state,
 5119                                             jnewblk->jn_dep);
 5120                         }
 5121                 }
 5122 #endif
 5123         }
 5124 
 5125         CTR3(KTR_SUJ,
 5126             "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
 5127             newblkno, frags, oldfrags);
 5128         ACQUIRE_LOCK(ump);
 5129         if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
 5130                 panic("softdep_setup_blkmapdep: found block");
 5131         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
 5132             dtog(fs, newblkno), NULL);
 5133         if (jnewblk) {
 5134                 jnewblk->jn_dep = (struct worklist *)newblk;
 5135                 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
 5136         } else {
 5137                 newblk->nb_state |= ONDEPLIST;
 5138                 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
 5139         }
 5140         newblk->nb_bmsafemap = bmsafemap;
 5141         newblk->nb_jnewblk = jnewblk;
 5142         FREE_LOCK(ump);
 5143 }
 5144 
 5145 #define BMSAFEMAP_HASH(ump, cg) \
 5146       (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
 5147 
 5148 static int
 5149 bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
 5150         struct bmsafemap_hashhead *bmsafemaphd;
 5151         int cg;
 5152         struct bmsafemap **bmsafemapp;
 5153 {
 5154         struct bmsafemap *bmsafemap;
 5155 
 5156         LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
 5157                 if (bmsafemap->sm_cg == cg)
 5158                         break;
 5159         if (bmsafemap) {
 5160                 *bmsafemapp = bmsafemap;
 5161                 return (1);
 5162         }
 5163         *bmsafemapp = NULL;
 5164 
 5165         return (0);
 5166 }
 5167 
 5168 /*
 5169  * Find the bmsafemap associated with a cylinder group buffer.
 5170  * If none exists, create one. The buffer must be locked when
 5171  * this routine is called and this routine must be called with
 5172  * the softdep lock held. To avoid giving up the lock while
 5173  * allocating a new bmsafemap, a preallocated bmsafemap may be
 5174  * provided. If it is provided but not needed, it is freed.
 5175  */
 5176 static struct bmsafemap *
 5177 bmsafemap_lookup(mp, bp, cg, newbmsafemap)
 5178         struct mount *mp;
 5179         struct buf *bp;
 5180         int cg;
 5181         struct bmsafemap *newbmsafemap;
 5182 {
 5183         struct bmsafemap_hashhead *bmsafemaphd;
 5184         struct bmsafemap *bmsafemap, *collision;
 5185         struct worklist *wk;
 5186         struct ufsmount *ump;
 5187 
 5188         ump = VFSTOUFS(mp);
 5189         LOCK_OWNED(ump);
 5190         KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
 5191         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 5192                 if (wk->wk_type == D_BMSAFEMAP) {
 5193                         if (newbmsafemap)
 5194                                 WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
 5195                         return (WK_BMSAFEMAP(wk));
 5196                 }
 5197         }
 5198         bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
 5199         if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
 5200                 if (newbmsafemap)
 5201                         WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
 5202                 return (bmsafemap);
 5203         }
 5204         if (newbmsafemap) {
 5205                 bmsafemap = newbmsafemap;
 5206         } else {
 5207                 FREE_LOCK(ump);
 5208                 bmsafemap = malloc(sizeof(struct bmsafemap),
 5209                         M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 5210                 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
 5211                 ACQUIRE_LOCK(ump);
 5212         }
 5213         bmsafemap->sm_buf = bp;
 5214         LIST_INIT(&bmsafemap->sm_inodedephd);
 5215         LIST_INIT(&bmsafemap->sm_inodedepwr);
 5216         LIST_INIT(&bmsafemap->sm_newblkhd);
 5217         LIST_INIT(&bmsafemap->sm_newblkwr);
 5218         LIST_INIT(&bmsafemap->sm_jaddrefhd);
 5219         LIST_INIT(&bmsafemap->sm_jnewblkhd);
 5220         LIST_INIT(&bmsafemap->sm_freehd);
 5221         LIST_INIT(&bmsafemap->sm_freewr);
 5222         if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
 5223                 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
 5224                 return (collision);
 5225         }
 5226         bmsafemap->sm_cg = cg;
 5227         LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
 5228         LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
 5229         WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
 5230         return (bmsafemap);
 5231 }
 5232 
 5233 /*
 5234  * Direct block allocation dependencies.
 5235  * 
 5236  * When a new block is allocated, the corresponding disk locations must be
 5237  * initialized (with zeros or new data) before the on-disk inode points to
 5238  * them.  Also, the freemap from which the block was allocated must be
 5239  * updated (on disk) before the inode's pointer. These two dependencies are
 5240  * independent of each other and are needed for all file blocks and indirect
 5241  * blocks that are pointed to directly by the inode.  Just before the
 5242  * "in-core" version of the inode is updated with a newly allocated block
 5243  * number, a procedure (below) is called to setup allocation dependency
 5244  * structures.  These structures are removed when the corresponding
 5245  * dependencies are satisfied or when the block allocation becomes obsolete
 5246  * (i.e., the file is deleted, the block is de-allocated, or the block is a
 5247  * fragment that gets upgraded).  All of these cases are handled in
 5248  * procedures described later.
 5249  * 
 5250  * When a file extension causes a fragment to be upgraded, either to a larger
 5251  * fragment or to a full block, the on-disk location may change (if the
 5252  * previous fragment could not simply be extended). In this case, the old
 5253  * fragment must be de-allocated, but not until after the inode's pointer has
 5254  * been updated. In most cases, this is handled by later procedures, which
 5255  * will construct a "freefrag" structure to be added to the workitem queue
 5256  * when the inode update is complete (or obsolete).  The main exception to
 5257  * this is when an allocation occurs while a pending allocation dependency
 5258  * (for the same block pointer) remains.  This case is handled in the main
 5259  * allocation dependency setup procedure by immediately freeing the
 5260  * unreferenced fragments.
 5261  */ 
 5262 void 
 5263 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
 5264         struct inode *ip;       /* inode to which block is being added */
 5265         ufs_lbn_t off;          /* block pointer within inode */
 5266         ufs2_daddr_t newblkno;  /* disk block number being added */
 5267         ufs2_daddr_t oldblkno;  /* previous block number, 0 unless frag */
 5268         long newsize;           /* size of new block */
 5269         long oldsize;           /* size of new block */
 5270         struct buf *bp;         /* bp for allocated block */
 5271 {
 5272         struct allocdirect *adp, *oldadp;
 5273         struct allocdirectlst *adphead;
 5274         struct freefrag *freefrag;
 5275         struct inodedep *inodedep;
 5276         struct pagedep *pagedep;
 5277         struct jnewblk *jnewblk;
 5278         struct newblk *newblk;
 5279         struct mount *mp;
 5280         ufs_lbn_t lbn;
 5281 
 5282         lbn = bp->b_lblkno;
 5283         mp = ITOVFS(ip);
 5284         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 5285             ("softdep_setup_allocdirect called on non-softdep filesystem"));
 5286         if (oldblkno && oldblkno != newblkno)
 5287                 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
 5288         else
 5289                 freefrag = NULL;
 5290 
 5291         CTR6(KTR_SUJ,
 5292             "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
 5293             "off %jd newsize %ld oldsize %d",
 5294             ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
 5295         ACQUIRE_LOCK(ITOUMP(ip));
 5296         if (off >= NDADDR) {
 5297                 if (lbn > 0)
 5298                         panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
 5299                             lbn, off);
 5300                 /* allocating an indirect block */
 5301                 if (oldblkno != 0)
 5302                         panic("softdep_setup_allocdirect: non-zero indir");
 5303         } else {
 5304                 if (off != lbn)
 5305                         panic("softdep_setup_allocdirect: lbn %jd != off %jd",
 5306                             lbn, off);
 5307                 /*
 5308                  * Allocating a direct block.
 5309                  *
 5310                  * If we are allocating a directory block, then we must
 5311                  * allocate an associated pagedep to track additions and
 5312                  * deletions.
 5313                  */
 5314                 if ((ip->i_mode & IFMT) == IFDIR)
 5315                         pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
 5316                             &pagedep);
 5317         }
 5318         if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
 5319                 panic("softdep_setup_allocdirect: lost block");
 5320         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
 5321             ("softdep_setup_allocdirect: newblk already initialized"));
 5322         /*
 5323          * Convert the newblk to an allocdirect.
 5324          */
 5325         WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
 5326         adp = (struct allocdirect *)newblk;
 5327         newblk->nb_freefrag = freefrag;
 5328         adp->ad_offset = off;
 5329         adp->ad_oldblkno = oldblkno;
 5330         adp->ad_newsize = newsize;
 5331         adp->ad_oldsize = oldsize;
 5332 
 5333         /*
 5334          * Finish initializing the journal.
 5335          */
 5336         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
 5337                 jnewblk->jn_ino = ip->i_number;
 5338                 jnewblk->jn_lbn = lbn;
 5339                 add_to_journal(&jnewblk->jn_list);
 5340         }
 5341         if (freefrag && freefrag->ff_jdep != NULL &&
 5342             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
 5343                 add_to_journal(freefrag->ff_jdep);
 5344         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 5345         adp->ad_inodedep = inodedep;
 5346 
 5347         WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
 5348         /*
 5349          * The list of allocdirects must be kept in sorted and ascending
 5350          * order so that the rollback routines can quickly determine the
 5351          * first uncommitted block (the size of the file stored on disk
 5352          * ends at the end of the lowest committed fragment, or if there
 5353          * are no fragments, at the end of the highest committed block).
 5354          * Since files generally grow, the typical case is that the new
 5355          * block is to be added at the end of the list. We speed this
 5356          * special case by checking against the last allocdirect in the
 5357          * list before laboriously traversing the list looking for the
 5358          * insertion point.
 5359          */
 5360         adphead = &inodedep->id_newinoupdt;
 5361         oldadp = TAILQ_LAST(adphead, allocdirectlst);
 5362         if (oldadp == NULL || oldadp->ad_offset <= off) {
 5363                 /* insert at end of list */
 5364                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 5365                 if (oldadp != NULL && oldadp->ad_offset == off)
 5366                         allocdirect_merge(adphead, adp, oldadp);
 5367                 FREE_LOCK(ITOUMP(ip));
 5368                 return;
 5369         }
 5370         TAILQ_FOREACH(oldadp, adphead, ad_next) {
 5371                 if (oldadp->ad_offset >= off)
 5372                         break;
 5373         }
 5374         if (oldadp == NULL)
 5375                 panic("softdep_setup_allocdirect: lost entry");
 5376         /* insert in middle of list */
 5377         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 5378         if (oldadp->ad_offset == off)
 5379                 allocdirect_merge(adphead, adp, oldadp);
 5380 
 5381         FREE_LOCK(ITOUMP(ip));
 5382 }
 5383 
 5384 /*
 5385  * Merge a newer and older journal record to be stored either in a
 5386  * newblock or freefrag.  This handles aggregating journal records for
 5387  * fragment allocation into a second record as well as replacing a
 5388  * journal free with an aborted journal allocation.  A segment for the
 5389  * oldest record will be placed on wkhd if it has been written.  If not
 5390  * the segment for the newer record will suffice.
 5391  */
 5392 static struct worklist *
 5393 jnewblk_merge(new, old, wkhd)
 5394         struct worklist *new;
 5395         struct worklist *old;
 5396         struct workhead *wkhd;
 5397 {
 5398         struct jnewblk *njnewblk;
 5399         struct jnewblk *jnewblk;
 5400 
 5401         /* Handle NULLs to simplify callers. */
 5402         if (new == NULL)
 5403                 return (old);
 5404         if (old == NULL)
 5405                 return (new);
 5406         /* Replace a jfreefrag with a jnewblk. */
 5407         if (new->wk_type == D_JFREEFRAG) {
 5408                 if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
 5409                         panic("jnewblk_merge: blkno mismatch: %p, %p",
 5410                             old, new);
 5411                 cancel_jfreefrag(WK_JFREEFRAG(new));
 5412                 return (old);
 5413         }
 5414         if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
 5415                 panic("jnewblk_merge: Bad type: old %d new %d\n",
 5416                     old->wk_type, new->wk_type);
 5417         /*
 5418          * Handle merging of two jnewblk records that describe
 5419          * different sets of fragments in the same block.
 5420          */
 5421         jnewblk = WK_JNEWBLK(old);
 5422         njnewblk = WK_JNEWBLK(new);
 5423         if (jnewblk->jn_blkno != njnewblk->jn_blkno)
 5424                 panic("jnewblk_merge: Merging disparate blocks.");
 5425         /*
 5426          * The record may be rolled back in the cg.
 5427          */
 5428         if (jnewblk->jn_state & UNDONE) {
 5429                 jnewblk->jn_state &= ~UNDONE;
 5430                 njnewblk->jn_state |= UNDONE;
 5431                 njnewblk->jn_state &= ~ATTACHED;
 5432         }
 5433         /*
 5434          * We modify the newer addref and free the older so that if neither
 5435          * has been written the most up-to-date copy will be on disk.  If
 5436          * both have been written but rolled back we only temporarily need
 5437          * one of them to fix the bits when the cg write completes.
 5438          */
 5439         jnewblk->jn_state |= ATTACHED | COMPLETE;
 5440         njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
 5441         cancel_jnewblk(jnewblk, wkhd);
 5442         WORKLIST_REMOVE(&jnewblk->jn_list);
 5443         free_jnewblk(jnewblk);
 5444         return (new);
 5445 }
 5446 
 5447 /*
 5448  * Replace an old allocdirect dependency with a newer one.
 5449  * This routine must be called with splbio interrupts blocked.
 5450  */
 5451 static void
 5452 allocdirect_merge(adphead, newadp, oldadp)
 5453         struct allocdirectlst *adphead; /* head of list holding allocdirects */
 5454         struct allocdirect *newadp;     /* allocdirect being added */
 5455         struct allocdirect *oldadp;     /* existing allocdirect being checked */
 5456 {
 5457         struct worklist *wk;
 5458         struct freefrag *freefrag;
 5459 
 5460         freefrag = NULL;
 5461         LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
 5462         if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
 5463             newadp->ad_oldsize != oldadp->ad_newsize ||
 5464             newadp->ad_offset >= NDADDR)
 5465                 panic("%s %jd != new %jd || old size %ld != new %ld",
 5466                     "allocdirect_merge: old blkno",
 5467                     (intmax_t)newadp->ad_oldblkno,
 5468                     (intmax_t)oldadp->ad_newblkno,
 5469                     newadp->ad_oldsize, oldadp->ad_newsize);
 5470         newadp->ad_oldblkno = oldadp->ad_oldblkno;
 5471         newadp->ad_oldsize = oldadp->ad_oldsize;
 5472         /*
 5473          * If the old dependency had a fragment to free or had never
 5474          * previously had a block allocated, then the new dependency
 5475          * can immediately post its freefrag and adopt the old freefrag.
 5476          * This action is done by swapping the freefrag dependencies.
 5477          * The new dependency gains the old one's freefrag, and the
 5478          * old one gets the new one and then immediately puts it on
 5479          * the worklist when it is freed by free_newblk. It is
 5480          * not possible to do this swap when the old dependency had a
 5481          * non-zero size but no previous fragment to free. This condition
 5482          * arises when the new block is an extension of the old block.
 5483          * Here, the first part of the fragment allocated to the new
 5484          * dependency is part of the block currently claimed on disk by
 5485          * the old dependency, so cannot legitimately be freed until the
 5486          * conditions for the new dependency are fulfilled.
 5487          */
 5488         freefrag = newadp->ad_freefrag;
 5489         if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
 5490                 newadp->ad_freefrag = oldadp->ad_freefrag;
 5491                 oldadp->ad_freefrag = freefrag;
 5492         }
 5493         /*
 5494          * If we are tracking a new directory-block allocation,
 5495          * move it from the old allocdirect to the new allocdirect.
 5496          */
 5497         if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
 5498                 WORKLIST_REMOVE(wk);
 5499                 if (!LIST_EMPTY(&oldadp->ad_newdirblk))
 5500                         panic("allocdirect_merge: extra newdirblk");
 5501                 WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
 5502         }
 5503         TAILQ_REMOVE(adphead, oldadp, ad_next);
 5504         /*
 5505          * We need to move any journal dependencies over to the freefrag
 5506          * that releases this block if it exists.  Otherwise we are
 5507          * extending an existing block and we'll wait until that is
 5508          * complete to release the journal space and extend the
 5509          * new journal to cover this old space as well.
 5510          */
 5511         if (freefrag == NULL) {
 5512                 if (oldadp->ad_newblkno != newadp->ad_newblkno)
 5513                         panic("allocdirect_merge: %jd != %jd",
 5514                             oldadp->ad_newblkno, newadp->ad_newblkno);
 5515                 newadp->ad_block.nb_jnewblk = (struct jnewblk *)
 5516                     jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, 
 5517                     &oldadp->ad_block.nb_jnewblk->jn_list,
 5518                     &newadp->ad_block.nb_jwork);
 5519                 oldadp->ad_block.nb_jnewblk = NULL;
 5520                 cancel_newblk(&oldadp->ad_block, NULL,
 5521                     &newadp->ad_block.nb_jwork);
 5522         } else {
 5523                 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
 5524                     &freefrag->ff_list, &freefrag->ff_jwork);
 5525                 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
 5526                     &freefrag->ff_jwork);
 5527         }
 5528         free_newblk(&oldadp->ad_block);
 5529 }
 5530 
 5531 /*
 5532  * Allocate a jfreefrag structure to journal a single block free.
 5533  */
 5534 static struct jfreefrag *
 5535 newjfreefrag(freefrag, ip, blkno, size, lbn)
 5536         struct freefrag *freefrag;
 5537         struct inode *ip;
 5538         ufs2_daddr_t blkno;
 5539         long size;
 5540         ufs_lbn_t lbn;
 5541 {
 5542         struct jfreefrag *jfreefrag;
 5543         struct fs *fs;
 5544 
 5545         fs = ITOFS(ip);
 5546         jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
 5547             M_SOFTDEP_FLAGS);
 5548         workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip));
 5549         jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
 5550         jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
 5551         jfreefrag->fr_ino = ip->i_number;
 5552         jfreefrag->fr_lbn = lbn;
 5553         jfreefrag->fr_blkno = blkno;
 5554         jfreefrag->fr_frags = numfrags(fs, size);
 5555         jfreefrag->fr_freefrag = freefrag;
 5556 
 5557         return (jfreefrag);
 5558 }
 5559 
 5560 /*
 5561  * Allocate a new freefrag structure.
 5562  */
 5563 static struct freefrag *
 5564 newfreefrag(ip, blkno, size, lbn)
 5565         struct inode *ip;
 5566         ufs2_daddr_t blkno;
 5567         long size;
 5568         ufs_lbn_t lbn;
 5569 {
 5570         struct freefrag *freefrag;
 5571         struct ufsmount *ump;
 5572         struct fs *fs;
 5573 
 5574         CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
 5575             ip->i_number, blkno, size, lbn);
 5576         ump = ITOUMP(ip);
 5577         fs = ump->um_fs;
 5578         if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
 5579                 panic("newfreefrag: frag size");
 5580         freefrag = malloc(sizeof(struct freefrag),
 5581             M_FREEFRAG, M_SOFTDEP_FLAGS);
 5582         workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump));
 5583         freefrag->ff_state = ATTACHED;
 5584         LIST_INIT(&freefrag->ff_jwork);
 5585         freefrag->ff_inum = ip->i_number;
 5586         freefrag->ff_vtype = ITOV(ip)->v_type;
 5587         freefrag->ff_blkno = blkno;
 5588         freefrag->ff_fragsize = size;
 5589 
 5590         if (MOUNTEDSUJ(UFSTOVFS(ump))) {
 5591                 freefrag->ff_jdep = (struct worklist *)
 5592                     newjfreefrag(freefrag, ip, blkno, size, lbn);
 5593         } else {
 5594                 freefrag->ff_state |= DEPCOMPLETE;
 5595                 freefrag->ff_jdep = NULL;
 5596         }
 5597 
 5598         return (freefrag);
 5599 }
 5600 
 5601 /*
 5602  * This workitem de-allocates fragments that were replaced during
 5603  * file block allocation.
 5604  */
 5605 static void 
 5606 handle_workitem_freefrag(freefrag)
 5607         struct freefrag *freefrag;
 5608 {
 5609         struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
 5610         struct workhead wkhd;
 5611 
 5612         CTR3(KTR_SUJ,
 5613             "handle_workitem_freefrag: ino %d blkno %jd size %ld",
 5614             freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
 5615         /*
 5616          * It would be illegal to add new completion items to the
 5617          * freefrag after it was schedule to be done so it must be
 5618          * safe to modify the list head here.
 5619          */
 5620         LIST_INIT(&wkhd);
 5621         ACQUIRE_LOCK(ump);
 5622         LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
 5623         /*
 5624          * If the journal has not been written we must cancel it here.
 5625          */
 5626         if (freefrag->ff_jdep) {
 5627                 if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
 5628                         panic("handle_workitem_freefrag: Unexpected type %d\n",
 5629                             freefrag->ff_jdep->wk_type);
 5630                 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
 5631         }
 5632         FREE_LOCK(ump);
 5633         ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
 5634            freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
 5635         ACQUIRE_LOCK(ump);
 5636         WORKITEM_FREE(freefrag, D_FREEFRAG);
 5637         FREE_LOCK(ump);
 5638 }
 5639 
 5640 /*
 5641  * Set up a dependency structure for an external attributes data block.
 5642  * This routine follows much of the structure of softdep_setup_allocdirect.
 5643  * See the description of softdep_setup_allocdirect above for details.
 5644  */
 5645 void 
 5646 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
 5647         struct inode *ip;
 5648         ufs_lbn_t off;
 5649         ufs2_daddr_t newblkno;
 5650         ufs2_daddr_t oldblkno;
 5651         long newsize;
 5652         long oldsize;
 5653         struct buf *bp;
 5654 {
 5655         struct allocdirect *adp, *oldadp;
 5656         struct allocdirectlst *adphead;
 5657         struct freefrag *freefrag;
 5658         struct inodedep *inodedep;
 5659         struct jnewblk *jnewblk;
 5660         struct newblk *newblk;
 5661         struct mount *mp;
 5662         struct ufsmount *ump;
 5663         ufs_lbn_t lbn;
 5664 
 5665         mp = ITOVFS(ip);
 5666         ump = VFSTOUFS(mp);
 5667         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 5668             ("softdep_setup_allocext called on non-softdep filesystem"));
 5669         KASSERT(off < NXADDR, ("softdep_setup_allocext: lbn %lld > NXADDR",
 5670                     (long long)off));
 5671 
 5672         lbn = bp->b_lblkno;
 5673         if (oldblkno && oldblkno != newblkno)
 5674                 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
 5675         else
 5676                 freefrag = NULL;
 5677 
 5678         ACQUIRE_LOCK(ump);
 5679         if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
 5680                 panic("softdep_setup_allocext: lost block");
 5681         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
 5682             ("softdep_setup_allocext: newblk already initialized"));
 5683         /*
 5684          * Convert the newblk to an allocdirect.
 5685          */
 5686         WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
 5687         adp = (struct allocdirect *)newblk;
 5688         newblk->nb_freefrag = freefrag;
 5689         adp->ad_offset = off;
 5690         adp->ad_oldblkno = oldblkno;
 5691         adp->ad_newsize = newsize;
 5692         adp->ad_oldsize = oldsize;
 5693         adp->ad_state |=  EXTDATA;
 5694 
 5695         /*
 5696          * Finish initializing the journal.
 5697          */
 5698         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
 5699                 jnewblk->jn_ino = ip->i_number;
 5700                 jnewblk->jn_lbn = lbn;
 5701                 add_to_journal(&jnewblk->jn_list);
 5702         }
 5703         if (freefrag && freefrag->ff_jdep != NULL &&
 5704             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
 5705                 add_to_journal(freefrag->ff_jdep);
 5706         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 5707         adp->ad_inodedep = inodedep;
 5708 
 5709         WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
 5710         /*
 5711          * The list of allocdirects must be kept in sorted and ascending
 5712          * order so that the rollback routines can quickly determine the
 5713          * first uncommitted block (the size of the file stored on disk
 5714          * ends at the end of the lowest committed fragment, or if there
 5715          * are no fragments, at the end of the highest committed block).
 5716          * Since files generally grow, the typical case is that the new
 5717          * block is to be added at the end of the list. We speed this
 5718          * special case by checking against the last allocdirect in the
 5719          * list before laboriously traversing the list looking for the
 5720          * insertion point.
 5721          */
 5722         adphead = &inodedep->id_newextupdt;
 5723         oldadp = TAILQ_LAST(adphead, allocdirectlst);
 5724         if (oldadp == NULL || oldadp->ad_offset <= off) {
 5725                 /* insert at end of list */
 5726                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 5727                 if (oldadp != NULL && oldadp->ad_offset == off)
 5728                         allocdirect_merge(adphead, adp, oldadp);
 5729                 FREE_LOCK(ump);
 5730                 return;
 5731         }
 5732         TAILQ_FOREACH(oldadp, adphead, ad_next) {
 5733                 if (oldadp->ad_offset >= off)
 5734                         break;
 5735         }
 5736         if (oldadp == NULL)
 5737                 panic("softdep_setup_allocext: lost entry");
 5738         /* insert in middle of list */
 5739         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 5740         if (oldadp->ad_offset == off)
 5741                 allocdirect_merge(adphead, adp, oldadp);
 5742         FREE_LOCK(ump);
 5743 }
 5744 
 5745 /*
 5746  * Indirect block allocation dependencies.
 5747  * 
 5748  * The same dependencies that exist for a direct block also exist when
 5749  * a new block is allocated and pointed to by an entry in a block of
 5750  * indirect pointers. The undo/redo states described above are also
 5751  * used here. Because an indirect block contains many pointers that
 5752  * may have dependencies, a second copy of the entire in-memory indirect
 5753  * block is kept. The buffer cache copy is always completely up-to-date.
 5754  * The second copy, which is used only as a source for disk writes,
 5755  * contains only the safe pointers (i.e., those that have no remaining
 5756  * update dependencies). The second copy is freed when all pointers
 5757  * are safe. The cache is not allowed to replace indirect blocks with
 5758  * pending update dependencies. If a buffer containing an indirect
 5759  * block with dependencies is written, these routines will mark it
 5760  * dirty again. It can only be successfully written once all the
 5761  * dependencies are removed. The ffs_fsync routine in conjunction with
 5762  * softdep_sync_metadata work together to get all the dependencies
 5763  * removed so that a file can be successfully written to disk. Three
 5764  * procedures are used when setting up indirect block pointer
 5765  * dependencies. The division is necessary because of the organization
 5766  * of the "balloc" routine and because of the distinction between file
 5767  * pages and file metadata blocks.
 5768  */
 5769 
 5770 /*
 5771  * Allocate a new allocindir structure.
 5772  */
 5773 static struct allocindir *
 5774 newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
 5775         struct inode *ip;       /* inode for file being extended */
 5776         int ptrno;              /* offset of pointer in indirect block */
 5777         ufs2_daddr_t newblkno;  /* disk block number being added */
 5778         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
 5779         ufs_lbn_t lbn;
 5780 {
 5781         struct newblk *newblk;
 5782         struct allocindir *aip;
 5783         struct freefrag *freefrag;
 5784         struct jnewblk *jnewblk;
 5785 
 5786         if (oldblkno)
 5787                 freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn);
 5788         else
 5789                 freefrag = NULL;
 5790         ACQUIRE_LOCK(ITOUMP(ip));
 5791         if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0)
 5792                 panic("new_allocindir: lost block");
 5793         KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
 5794             ("newallocindir: newblk already initialized"));
 5795         WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
 5796         newblk->nb_freefrag = freefrag;
 5797         aip = (struct allocindir *)newblk;
 5798         aip->ai_offset = ptrno;
 5799         aip->ai_oldblkno = oldblkno;
 5800         aip->ai_lbn = lbn;
 5801         if ((jnewblk = newblk->nb_jnewblk) != NULL) {
 5802                 jnewblk->jn_ino = ip->i_number;
 5803                 jnewblk->jn_lbn = lbn;
 5804                 add_to_journal(&jnewblk->jn_list);
 5805         }
 5806         if (freefrag && freefrag->ff_jdep != NULL &&
 5807             freefrag->ff_jdep->wk_type == D_JFREEFRAG)
 5808                 add_to_journal(freefrag->ff_jdep);
 5809         return (aip);
 5810 }
 5811 
 5812 /*
 5813  * Called just before setting an indirect block pointer
 5814  * to a newly allocated file page.
 5815  */
 5816 void
 5817 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 5818         struct inode *ip;       /* inode for file being extended */
 5819         ufs_lbn_t lbn;          /* allocated block number within file */
 5820         struct buf *bp;         /* buffer with indirect blk referencing page */
 5821         int ptrno;              /* offset of pointer in indirect block */
 5822         ufs2_daddr_t newblkno;  /* disk block number being added */
 5823         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
 5824         struct buf *nbp;        /* buffer holding allocated page */
 5825 {
 5826         struct inodedep *inodedep;
 5827         struct freefrag *freefrag;
 5828         struct allocindir *aip;
 5829         struct pagedep *pagedep;
 5830         struct mount *mp;
 5831         struct ufsmount *ump;
 5832 
 5833         mp = ITOVFS(ip);
 5834         ump = VFSTOUFS(mp);
 5835         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 5836             ("softdep_setup_allocindir_page called on non-softdep filesystem"));
 5837         KASSERT(lbn == nbp->b_lblkno,
 5838             ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
 5839             lbn, bp->b_lblkno));
 5840         CTR4(KTR_SUJ,
 5841             "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
 5842             "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
 5843         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
 5844         aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
 5845         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 5846         /*
 5847          * If we are allocating a directory page, then we must
 5848          * allocate an associated pagedep to track additions and
 5849          * deletions.
 5850          */
 5851         if ((ip->i_mode & IFMT) == IFDIR)
 5852                 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
 5853         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
 5854         freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
 5855         FREE_LOCK(ump);
 5856         if (freefrag)
 5857                 handle_workitem_freefrag(freefrag);
 5858 }
 5859 
 5860 /*
 5861  * Called just before setting an indirect block pointer to a
 5862  * newly allocated indirect block.
 5863  */
 5864 void
 5865 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 5866         struct buf *nbp;        /* newly allocated indirect block */
 5867         struct inode *ip;       /* inode for file being extended */
 5868         struct buf *bp;         /* indirect block referencing allocated block */
 5869         int ptrno;              /* offset of pointer in indirect block */
 5870         ufs2_daddr_t newblkno;  /* disk block number being added */
 5871 {
 5872         struct inodedep *inodedep;
 5873         struct allocindir *aip;
 5874         struct ufsmount *ump;
 5875         ufs_lbn_t lbn;
 5876 
 5877         ump = ITOUMP(ip);
 5878         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 5879             ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
 5880         CTR3(KTR_SUJ,
 5881             "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
 5882             ip->i_number, newblkno, ptrno);
 5883         lbn = nbp->b_lblkno;
 5884         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
 5885         aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
 5886         inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
 5887         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
 5888         if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
 5889                 panic("softdep_setup_allocindir_meta: Block already existed");
 5890         FREE_LOCK(ump);
 5891 }
 5892 
 5893 static void
 5894 indirdep_complete(indirdep)
 5895         struct indirdep *indirdep;
 5896 {
 5897         struct allocindir *aip;
 5898 
 5899         LIST_REMOVE(indirdep, ir_next);
 5900         indirdep->ir_state |= DEPCOMPLETE;
 5901 
 5902         while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
 5903                 LIST_REMOVE(aip, ai_next);
 5904                 free_newblk(&aip->ai_block);
 5905         }
 5906         /*
 5907          * If this indirdep is not attached to a buf it was simply waiting
 5908          * on completion to clear completehd.  free_indirdep() asserts
 5909          * that nothing is dangling.
 5910          */
 5911         if ((indirdep->ir_state & ONWORKLIST) == 0)
 5912                 free_indirdep(indirdep);
 5913 }
 5914 
 5915 static struct indirdep *
 5916 indirdep_lookup(mp, ip, bp)
 5917         struct mount *mp;
 5918         struct inode *ip;
 5919         struct buf *bp;
 5920 {
 5921         struct indirdep *indirdep, *newindirdep;
 5922         struct newblk *newblk;
 5923         struct ufsmount *ump;
 5924         struct worklist *wk;
 5925         struct fs *fs;
 5926         ufs2_daddr_t blkno;
 5927 
 5928         ump = VFSTOUFS(mp);
 5929         LOCK_OWNED(ump);
 5930         indirdep = NULL;
 5931         newindirdep = NULL;
 5932         fs = ump->um_fs;
 5933         for (;;) {
 5934                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 5935                         if (wk->wk_type != D_INDIRDEP)
 5936                                 continue;
 5937                         indirdep = WK_INDIRDEP(wk);
 5938                         break;
 5939                 }
 5940                 /* Found on the buffer worklist, no new structure to free. */
 5941                 if (indirdep != NULL && newindirdep == NULL)
 5942                         return (indirdep);
 5943                 if (indirdep != NULL && newindirdep != NULL)
 5944                         panic("indirdep_lookup: simultaneous create");
 5945                 /* None found on the buffer and a new structure is ready. */
 5946                 if (indirdep == NULL && newindirdep != NULL)
 5947                         break;
 5948                 /* None found and no new structure available. */
 5949                 FREE_LOCK(ump);
 5950                 newindirdep = malloc(sizeof(struct indirdep),
 5951                     M_INDIRDEP, M_SOFTDEP_FLAGS);
 5952                 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
 5953                 newindirdep->ir_state = ATTACHED;
 5954                 if (I_IS_UFS1(ip))
 5955                         newindirdep->ir_state |= UFS1FMT;
 5956                 TAILQ_INIT(&newindirdep->ir_trunc);
 5957                 newindirdep->ir_saveddata = NULL;
 5958                 LIST_INIT(&newindirdep->ir_deplisthd);
 5959                 LIST_INIT(&newindirdep->ir_donehd);
 5960                 LIST_INIT(&newindirdep->ir_writehd);
 5961                 LIST_INIT(&newindirdep->ir_completehd);
 5962                 if (bp->b_blkno == bp->b_lblkno) {
 5963                         ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
 5964                             NULL, NULL);
 5965                         bp->b_blkno = blkno;
 5966                 }
 5967                 newindirdep->ir_freeblks = NULL;
 5968                 newindirdep->ir_savebp =
 5969                     getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
 5970                 newindirdep->ir_bp = bp;
 5971                 BUF_KERNPROC(newindirdep->ir_savebp);
 5972                 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
 5973                 ACQUIRE_LOCK(ump);
 5974         }
 5975         indirdep = newindirdep;
 5976         WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
 5977         /*
 5978          * If the block is not yet allocated we don't set DEPCOMPLETE so
 5979          * that we don't free dependencies until the pointers are valid.
 5980          * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
 5981          * than using the hash.
 5982          */
 5983         if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
 5984                 LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
 5985         else
 5986                 indirdep->ir_state |= DEPCOMPLETE;
 5987         return (indirdep);
 5988 }
 5989 
 5990 /*
 5991  * Called to finish the allocation of the "aip" allocated
 5992  * by one of the two routines above.
 5993  */
 5994 static struct freefrag *
 5995 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
 5996         struct buf *bp;         /* in-memory copy of the indirect block */
 5997         struct inode *ip;       /* inode for file being extended */
 5998         struct inodedep *inodedep; /* Inodedep for ip */
 5999         struct allocindir *aip; /* allocindir allocated by the above routines */
 6000         ufs_lbn_t lbn;          /* Logical block number for this block. */
 6001 {
 6002         struct fs *fs;
 6003         struct indirdep *indirdep;
 6004         struct allocindir *oldaip;
 6005         struct freefrag *freefrag;
 6006         struct mount *mp;
 6007         struct ufsmount *ump;
 6008 
 6009         mp = ITOVFS(ip);
 6010         ump = VFSTOUFS(mp);
 6011         LOCK_OWNED(ump);
 6012         fs = ump->um_fs;
 6013         if (bp->b_lblkno >= 0)
 6014                 panic("setup_allocindir_phase2: not indir blk");
 6015         KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
 6016             ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
 6017         indirdep = indirdep_lookup(mp, ip, bp);
 6018         KASSERT(indirdep->ir_savebp != NULL,
 6019             ("setup_allocindir_phase2 NULL ir_savebp"));
 6020         aip->ai_indirdep = indirdep;
 6021         /*
 6022          * Check for an unwritten dependency for this indirect offset.  If
 6023          * there is, merge the old dependency into the new one.  This happens
 6024          * as a result of reallocblk only.
 6025          */
 6026         freefrag = NULL;
 6027         if (aip->ai_oldblkno != 0) {
 6028                 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
 6029                         if (oldaip->ai_offset == aip->ai_offset) {
 6030                                 freefrag = allocindir_merge(aip, oldaip);
 6031                                 goto done;
 6032                         }
 6033                 }
 6034                 LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
 6035                         if (oldaip->ai_offset == aip->ai_offset) {
 6036                                 freefrag = allocindir_merge(aip, oldaip);
 6037                                 goto done;
 6038                         }
 6039                 }
 6040         }
 6041 done:
 6042         LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
 6043         return (freefrag);
 6044 }
 6045 
 6046 /*
 6047  * Merge two allocindirs which refer to the same block.  Move newblock
 6048  * dependencies and setup the freefrags appropriately.
 6049  */
 6050 static struct freefrag *
 6051 allocindir_merge(aip, oldaip)
 6052         struct allocindir *aip;
 6053         struct allocindir *oldaip;
 6054 {
 6055         struct freefrag *freefrag;
 6056         struct worklist *wk;
 6057 
 6058         if (oldaip->ai_newblkno != aip->ai_oldblkno)
 6059                 panic("allocindir_merge: blkno");
 6060         aip->ai_oldblkno = oldaip->ai_oldblkno;
 6061         freefrag = aip->ai_freefrag;
 6062         aip->ai_freefrag = oldaip->ai_freefrag;
 6063         oldaip->ai_freefrag = NULL;
 6064         KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
 6065         /*
 6066          * If we are tracking a new directory-block allocation,
 6067          * move it from the old allocindir to the new allocindir.
 6068          */
 6069         if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
 6070                 WORKLIST_REMOVE(wk);
 6071                 if (!LIST_EMPTY(&oldaip->ai_newdirblk))
 6072                         panic("allocindir_merge: extra newdirblk");
 6073                 WORKLIST_INSERT(&aip->ai_newdirblk, wk);
 6074         }
 6075         /*
 6076          * We can skip journaling for this freefrag and just complete
 6077          * any pending journal work for the allocindir that is being
 6078          * removed after the freefrag completes.
 6079          */
 6080         if (freefrag->ff_jdep)
 6081                 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
 6082         LIST_REMOVE(oldaip, ai_next);
 6083         freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
 6084             &freefrag->ff_list, &freefrag->ff_jwork);
 6085         free_newblk(&oldaip->ai_block);
 6086 
 6087         return (freefrag);
 6088 }
 6089 
 6090 static inline void
 6091 setup_freedirect(freeblks, ip, i, needj)
 6092         struct freeblks *freeblks;
 6093         struct inode *ip;
 6094         int i;
 6095         int needj;
 6096 {
 6097         struct ufsmount *ump;
 6098         ufs2_daddr_t blkno;
 6099         int frags;
 6100 
 6101         blkno = DIP(ip, i_db[i]);
 6102         if (blkno == 0)
 6103                 return;
 6104         DIP_SET(ip, i_db[i], 0);
 6105         ump = ITOUMP(ip);
 6106         frags = sblksize(ump->um_fs, ip->i_size, i);
 6107         frags = numfrags(ump->um_fs, frags);
 6108         newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj);
 6109 }
 6110 
 6111 static inline void
 6112 setup_freeext(freeblks, ip, i, needj)
 6113         struct freeblks *freeblks;
 6114         struct inode *ip;
 6115         int i;
 6116         int needj;
 6117 {
 6118         struct ufsmount *ump;
 6119         ufs2_daddr_t blkno;
 6120         int frags;
 6121 
 6122         blkno = ip->i_din2->di_extb[i];
 6123         if (blkno == 0)
 6124                 return;
 6125         ip->i_din2->di_extb[i] = 0;
 6126         ump = ITOUMP(ip);
 6127         frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i);
 6128         frags = numfrags(ump->um_fs, frags);
 6129         newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
 6130 }
 6131 
 6132 static inline void
 6133 setup_freeindir(freeblks, ip, i, lbn, needj)
 6134         struct freeblks *freeblks;
 6135         struct inode *ip;
 6136         int i;
 6137         ufs_lbn_t lbn;
 6138         int needj;
 6139 {
 6140         struct ufsmount *ump;
 6141         ufs2_daddr_t blkno;
 6142 
 6143         blkno = DIP(ip, i_ib[i]);
 6144         if (blkno == 0)
 6145                 return;
 6146         DIP_SET(ip, i_ib[i], 0);
 6147         ump = ITOUMP(ip);
 6148         newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag,
 6149             0, needj);
 6150 }
 6151 
 6152 static inline struct freeblks *
 6153 newfreeblks(mp, ip)
 6154         struct mount *mp;
 6155         struct inode *ip;
 6156 {
 6157         struct freeblks *freeblks;
 6158 
 6159         freeblks = malloc(sizeof(struct freeblks),
 6160                 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
 6161         workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
 6162         LIST_INIT(&freeblks->fb_jblkdephd);
 6163         LIST_INIT(&freeblks->fb_jwork);
 6164         freeblks->fb_ref = 0;
 6165         freeblks->fb_cgwait = 0;
 6166         freeblks->fb_state = ATTACHED;
 6167         freeblks->fb_uid = ip->i_uid;
 6168         freeblks->fb_inum = ip->i_number;
 6169         freeblks->fb_vtype = ITOV(ip)->v_type;
 6170         freeblks->fb_modrev = DIP(ip, i_modrev);
 6171         freeblks->fb_devvp = ITODEVVP(ip);
 6172         freeblks->fb_chkcnt = 0;
 6173         freeblks->fb_len = 0;
 6174 
 6175         return (freeblks);
 6176 }
 6177 
 6178 static void
 6179 trunc_indirdep(indirdep, freeblks, bp, off)
 6180         struct indirdep *indirdep;
 6181         struct freeblks *freeblks;
 6182         struct buf *bp;
 6183         int off;
 6184 {
 6185         struct allocindir *aip, *aipn;
 6186 
 6187         /*
 6188          * The first set of allocindirs won't be in savedbp.
 6189          */
 6190         LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
 6191                 if (aip->ai_offset > off)
 6192                         cancel_allocindir(aip, bp, freeblks, 1);
 6193         LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
 6194                 if (aip->ai_offset > off)
 6195                         cancel_allocindir(aip, bp, freeblks, 1);
 6196         /*
 6197          * These will exist in savedbp.
 6198          */
 6199         LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
 6200                 if (aip->ai_offset > off)
 6201                         cancel_allocindir(aip, NULL, freeblks, 0);
 6202         LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
 6203                 if (aip->ai_offset > off)
 6204                         cancel_allocindir(aip, NULL, freeblks, 0);
 6205 }
 6206 
 6207 /*
 6208  * Follow the chain of indirects down to lastlbn creating a freework
 6209  * structure for each.  This will be used to start indir_trunc() at
 6210  * the right offset and create the journal records for the parrtial
 6211  * truncation.  A second step will handle the truncated dependencies.
 6212  */
 6213 static int
 6214 setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
 6215         struct freeblks *freeblks;
 6216         struct inode *ip;
 6217         ufs_lbn_t lbn;
 6218         ufs_lbn_t lastlbn;
 6219         ufs2_daddr_t blkno;
 6220 {
 6221         struct indirdep *indirdep;
 6222         struct indirdep *indirn;
 6223         struct freework *freework;
 6224         struct newblk *newblk;
 6225         struct mount *mp;
 6226         struct ufsmount *ump;
 6227         struct buf *bp;
 6228         uint8_t *start;
 6229         uint8_t *end;
 6230         ufs_lbn_t lbnadd;
 6231         int level;
 6232         int error;
 6233         int off;
 6234 
 6235 
 6236         freework = NULL;
 6237         if (blkno == 0)
 6238                 return (0);
 6239         mp = freeblks->fb_list.wk_mp;
 6240         ump = VFSTOUFS(mp);
 6241         bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
 6242         if ((bp->b_flags & B_CACHE) == 0) {
 6243                 bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
 6244                 bp->b_iocmd = BIO_READ;
 6245                 bp->b_flags &= ~B_INVAL;
 6246                 bp->b_ioflags &= ~BIO_ERROR;
 6247                 vfs_busy_pages(bp, 0);
 6248                 bp->b_iooffset = dbtob(bp->b_blkno);
 6249                 bstrategy(bp);
 6250 #ifdef RACCT
 6251                 if (racct_enable) {
 6252                         PROC_LOCK(curproc);
 6253                         racct_add_buf(curproc, bp, 0);
 6254                         PROC_UNLOCK(curproc);
 6255                 }
 6256 #endif /* RACCT */
 6257                 curthread->td_ru.ru_inblock++;
 6258                 error = bufwait(bp);
 6259                 if (error) {
 6260                         brelse(bp);
 6261                         return (error);
 6262                 }
 6263         }
 6264         level = lbn_level(lbn);
 6265         lbnadd = lbn_offset(ump->um_fs, level);
 6266         /*
 6267          * Compute the offset of the last block we want to keep.  Store
 6268          * in the freework the first block we want to completely free.
 6269          */
 6270         off = (lastlbn - -(lbn + level)) / lbnadd;
 6271         if (off + 1 == NINDIR(ump->um_fs))
 6272                 goto nowork;
 6273         freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0);
 6274         /*
 6275          * Link the freework into the indirdep.  This will prevent any new
 6276          * allocations from proceeding until we are finished with the
 6277          * truncate and the block is written.
 6278          */
 6279         ACQUIRE_LOCK(ump);
 6280         indirdep = indirdep_lookup(mp, ip, bp);
 6281         if (indirdep->ir_freeblks)
 6282                 panic("setup_trunc_indir: indirdep already truncated.");
 6283         TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
 6284         freework->fw_indir = indirdep;
 6285         /*
 6286          * Cancel any allocindirs that will not make it to disk.
 6287          * We have to do this for all copies of the indirdep that
 6288          * live on this newblk.
 6289          */
 6290         if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
 6291                 newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0, &newblk);
 6292                 LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
 6293                         trunc_indirdep(indirn, freeblks, bp, off);
 6294         } else
 6295                 trunc_indirdep(indirdep, freeblks, bp, off);
 6296         FREE_LOCK(ump);
 6297         /*
 6298          * Creation is protected by the buf lock. The saveddata is only
 6299          * needed if a full truncation follows a partial truncation but it
 6300          * is difficult to allocate in that case so we fetch it anyway.
 6301          */
 6302         if (indirdep->ir_saveddata == NULL)
 6303                 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
 6304                     M_SOFTDEP_FLAGS);
 6305 nowork:
 6306         /* Fetch the blkno of the child and the zero start offset. */
 6307         if (I_IS_UFS1(ip)) {
 6308                 blkno = ((ufs1_daddr_t *)bp->b_data)[off];
 6309                 start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
 6310         } else {
 6311                 blkno = ((ufs2_daddr_t *)bp->b_data)[off];
 6312                 start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
 6313         }
 6314         if (freework) {
 6315                 /* Zero the truncated pointers. */
 6316                 end = bp->b_data + bp->b_bcount;
 6317                 bzero(start, end - start);
 6318                 bdwrite(bp);
 6319         } else
 6320                 bqrelse(bp);
 6321         if (level == 0)
 6322                 return (0);
 6323         lbn++; /* adjust level */
 6324         lbn -= (off * lbnadd);
 6325         return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
 6326 }
 6327 
 6328 /*
 6329  * Complete the partial truncation of an indirect block setup by
 6330  * setup_trunc_indir().  This zeros the truncated pointers in the saved
 6331  * copy and writes them to disk before the freeblks is allowed to complete.
 6332  */
 6333 static void
 6334 complete_trunc_indir(freework)
 6335         struct freework *freework;
 6336 {
 6337         struct freework *fwn;
 6338         struct indirdep *indirdep;
 6339         struct ufsmount *ump;
 6340         struct buf *bp;
 6341         uintptr_t start;
 6342         int count;
 6343 
 6344         ump = VFSTOUFS(freework->fw_list.wk_mp);
 6345         LOCK_OWNED(ump);
 6346         indirdep = freework->fw_indir;
 6347         for (;;) {
 6348                 bp = indirdep->ir_bp;
 6349                 /* See if the block was discarded. */
 6350                 if (bp == NULL)
 6351                         break;
 6352                 /* Inline part of getdirtybuf().  We dont want bremfree. */
 6353                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
 6354                         break;
 6355                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 6356                     LOCK_PTR(ump)) == 0)
 6357                         BUF_UNLOCK(bp);
 6358                 ACQUIRE_LOCK(ump);
 6359         }
 6360         freework->fw_state |= DEPCOMPLETE;
 6361         TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
 6362         /*
 6363          * Zero the pointers in the saved copy.
 6364          */
 6365         if (indirdep->ir_state & UFS1FMT)
 6366                 start = sizeof(ufs1_daddr_t);
 6367         else
 6368                 start = sizeof(ufs2_daddr_t);
 6369         start *= freework->fw_start;
 6370         count = indirdep->ir_savebp->b_bcount - start;
 6371         start += (uintptr_t)indirdep->ir_savebp->b_data;
 6372         bzero((char *)start, count);
 6373         /*
 6374          * We need to start the next truncation in the list if it has not
 6375          * been started yet.
 6376          */
 6377         fwn = TAILQ_FIRST(&indirdep->ir_trunc);
 6378         if (fwn != NULL) {
 6379                 if (fwn->fw_freeblks == indirdep->ir_freeblks)
 6380                         TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
 6381                 if ((fwn->fw_state & ONWORKLIST) == 0)
 6382                         freework_enqueue(fwn);
 6383         }
 6384         /*
 6385          * If bp is NULL the block was fully truncated, restore
 6386          * the saved block list otherwise free it if it is no
 6387          * longer needed.
 6388          */
 6389         if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
 6390                 if (bp == NULL)
 6391                         bcopy(indirdep->ir_saveddata,
 6392                             indirdep->ir_savebp->b_data,
 6393                             indirdep->ir_savebp->b_bcount);
 6394                 free(indirdep->ir_saveddata, M_INDIRDEP);
 6395                 indirdep->ir_saveddata = NULL;
 6396         }
 6397         /*
 6398          * When bp is NULL there is a full truncation pending.  We
 6399          * must wait for this full truncation to be journaled before
 6400          * we can release this freework because the disk pointers will
 6401          * never be written as zero.
 6402          */
 6403         if (bp == NULL)  {
 6404                 if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
 6405                         handle_written_freework(freework);
 6406                 else
 6407                         WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
 6408                            &freework->fw_list);
 6409         } else {
 6410                 /* Complete when the real copy is written. */
 6411                 WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
 6412                 BUF_UNLOCK(bp);
 6413         }
 6414 }
 6415 
 6416 /*
 6417  * Calculate the number of blocks we are going to release where datablocks
 6418  * is the current total and length is the new file size.
 6419  */
 6420 static ufs2_daddr_t
 6421 blkcount(fs, datablocks, length)
 6422         struct fs *fs;
 6423         ufs2_daddr_t datablocks;
 6424         off_t length;
 6425 {
 6426         off_t totblks, numblks;
 6427 
 6428         totblks = 0;
 6429         numblks = howmany(length, fs->fs_bsize);
 6430         if (numblks <= NDADDR) {
 6431                 totblks = howmany(length, fs->fs_fsize);
 6432                 goto out;
 6433         }
 6434         totblks = blkstofrags(fs, numblks);
 6435         numblks -= NDADDR;
 6436         /*
 6437          * Count all single, then double, then triple indirects required.
 6438          * Subtracting one indirects worth of blocks for each pass
 6439          * acknowledges one of each pointed to by the inode.
 6440          */
 6441         for (;;) {
 6442                 totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
 6443                 numblks -= NINDIR(fs);
 6444                 if (numblks <= 0)
 6445                         break;
 6446                 numblks = howmany(numblks, NINDIR(fs));
 6447         }
 6448 out:
 6449         totblks = fsbtodb(fs, totblks);
 6450         /*
 6451          * Handle sparse files.  We can't reclaim more blocks than the inode
 6452          * references.  We will correct it later in handle_complete_freeblks()
 6453          * when we know the real count.
 6454          */
 6455         if (totblks > datablocks)
 6456                 return (0);
 6457         return (datablocks - totblks);
 6458 }
 6459 
 6460 /*
 6461  * Handle freeblocks for journaled softupdate filesystems.
 6462  *
 6463  * Contrary to normal softupdates, we must preserve the block pointers in
 6464  * indirects until their subordinates are free.  This is to avoid journaling
 6465  * every block that is freed which may consume more space than the journal
 6466  * itself.  The recovery program will see the free block journals at the
 6467  * base of the truncated area and traverse them to reclaim space.  The
 6468  * pointers in the inode may be cleared immediately after the journal
 6469  * records are written because each direct and indirect pointer in the
 6470  * inode is recorded in a journal.  This permits full truncation to proceed
 6471  * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
 6472  *
 6473  * The algorithm is as follows:
 6474  * 1) Traverse the in-memory state and create journal entries to release
 6475  *    the relevant blocks and full indirect trees.
 6476  * 2) Traverse the indirect block chain adding partial truncation freework
 6477  *    records to indirects in the path to lastlbn.  The freework will
 6478  *    prevent new allocation dependencies from being satisfied in this
 6479  *    indirect until the truncation completes.
 6480  * 3) Read and lock the inode block, performing an update with the new size
 6481  *    and pointers.  This prevents truncated data from becoming valid on
 6482  *    disk through step 4.
 6483  * 4) Reap unsatisfied dependencies that are beyond the truncated area,
 6484  *    eliminate journal work for those records that do not require it.
 6485  * 5) Schedule the journal records to be written followed by the inode block.
 6486  * 6) Allocate any necessary frags for the end of file.
 6487  * 7) Zero any partially truncated blocks.
 6488  *
 6489  * From this truncation proceeds asynchronously using the freework and
 6490  * indir_trunc machinery.  The file will not be extended again into a
 6491  * partially truncated indirect block until all work is completed but
 6492  * the normal dependency mechanism ensures that it is rolled back/forward
 6493  * as appropriate.  Further truncation may occur without delay and is
 6494  * serialized in indir_trunc().
 6495  */
 6496 void
 6497 softdep_journal_freeblocks(ip, cred, length, flags)
 6498         struct inode *ip;       /* The inode whose length is to be reduced */
 6499         struct ucred *cred;
 6500         off_t length;           /* The new length for the file */
 6501         int flags;              /* IO_EXT and/or IO_NORMAL */
 6502 {
 6503         struct freeblks *freeblks, *fbn;
 6504         struct worklist *wk, *wkn;
 6505         struct inodedep *inodedep;
 6506         struct jblkdep *jblkdep;
 6507         struct allocdirect *adp, *adpn;
 6508         struct ufsmount *ump;
 6509         struct fs *fs;
 6510         struct buf *bp;
 6511         struct vnode *vp;
 6512         struct mount *mp;
 6513         ufs2_daddr_t extblocks, datablocks;
 6514         ufs_lbn_t tmpval, lbn, lastlbn;
 6515         int frags, lastoff, iboff, allocblock, needj, error, i;
 6516 
 6517         ump = ITOUMP(ip);
 6518         mp = UFSTOVFS(ump);
 6519         fs = ump->um_fs;
 6520         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 6521             ("softdep_journal_freeblocks called on non-softdep filesystem"));
 6522         vp = ITOV(ip);
 6523         needj = 1;
 6524         iboff = -1;
 6525         allocblock = 0;
 6526         extblocks = 0;
 6527         datablocks = 0;
 6528         frags = 0;
 6529         freeblks = newfreeblks(mp, ip);
 6530         ACQUIRE_LOCK(ump);
 6531         /*
 6532          * If we're truncating a removed file that will never be written
 6533          * we don't need to journal the block frees.  The canceled journals
 6534          * for the allocations will suffice.
 6535          */
 6536         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 6537         if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
 6538             length == 0)
 6539                 needj = 0;
 6540         CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
 6541             ip->i_number, length, needj);
 6542         FREE_LOCK(ump);
 6543         /*
 6544          * Calculate the lbn that we are truncating to.  This results in -1
 6545          * if we're truncating the 0 bytes.  So it is the last lbn we want
 6546          * to keep, not the first lbn we want to truncate.
 6547          */
 6548         lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
 6549         lastoff = blkoff(fs, length);
 6550         /*
 6551          * Compute frags we are keeping in lastlbn.  0 means all.
 6552          */
 6553         if (lastlbn >= 0 && lastlbn < NDADDR) {
 6554                 frags = fragroundup(fs, lastoff);
 6555                 /* adp offset of last valid allocdirect. */
 6556                 iboff = lastlbn;
 6557         } else if (lastlbn > 0)
 6558                 iboff = NDADDR;
 6559         if (fs->fs_magic == FS_UFS2_MAGIC)
 6560                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 6561         /*
 6562          * Handle normal data blocks and indirects.  This section saves
 6563          * values used after the inode update to complete frag and indirect
 6564          * truncation.
 6565          */
 6566         if ((flags & IO_NORMAL) != 0) {
 6567                 /*
 6568                  * Handle truncation of whole direct and indirect blocks.
 6569                  */
 6570                 for (i = iboff + 1; i < NDADDR; i++)
 6571                         setup_freedirect(freeblks, ip, i, needj);
 6572                 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
 6573                     i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
 6574                         /* Release a whole indirect tree. */
 6575                         if (lbn > lastlbn) {
 6576                                 setup_freeindir(freeblks, ip, i, -lbn -i,
 6577                                     needj);
 6578                                 continue;
 6579                         }
 6580                         iboff = i + NDADDR;
 6581                         /*
 6582                          * Traverse partially truncated indirect tree.
 6583                          */
 6584                         if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
 6585                                 setup_trunc_indir(freeblks, ip, -lbn - i,
 6586                                     lastlbn, DIP(ip, i_ib[i]));
 6587                 }
 6588                 /*
 6589                  * Handle partial truncation to a frag boundary.
 6590                  */
 6591                 if (frags) {
 6592                         ufs2_daddr_t blkno;
 6593                         long oldfrags;
 6594 
 6595                         oldfrags = blksize(fs, ip, lastlbn);
 6596                         blkno = DIP(ip, i_db[lastlbn]);
 6597                         if (blkno && oldfrags != frags) {
 6598                                 oldfrags -= frags;
 6599                                 oldfrags = numfrags(fs, oldfrags);
 6600                                 blkno += numfrags(fs, frags);
 6601                                 newfreework(ump, freeblks, NULL, lastlbn,
 6602                                     blkno, oldfrags, 0, needj);
 6603                                 if (needj)
 6604                                         adjust_newfreework(freeblks,
 6605                                             numfrags(fs, frags));
 6606                         } else if (blkno == 0)
 6607                                 allocblock = 1;
 6608                 }
 6609                 /*
 6610                  * Add a journal record for partial truncate if we are
 6611                  * handling indirect blocks.  Non-indirects need no extra
 6612                  * journaling.
 6613                  */
 6614                 if (length != 0 && lastlbn >= NDADDR) {
 6615                         ip->i_flag |= IN_TRUNCATED;
 6616                         newjtrunc(freeblks, length, 0);
 6617                 }
 6618                 ip->i_size = length;
 6619                 DIP_SET(ip, i_size, ip->i_size);
 6620                 ip->i_flag |= IN_SIZEMOD | IN_CHANGE;
 6621                 datablocks = DIP(ip, i_blocks) - extblocks;
 6622                 if (length != 0)
 6623                         datablocks = blkcount(fs, datablocks, length);
 6624                 freeblks->fb_len = length;
 6625         }
 6626         if ((flags & IO_EXT) != 0) {
 6627                 for (i = 0; i < NXADDR; i++)
 6628                         setup_freeext(freeblks, ip, i, needj);
 6629                 ip->i_din2->di_extsize = 0;
 6630                 datablocks += extblocks;
 6631                 ip->i_flag |= IN_SIZEMOD | IN_CHANGE;
 6632         }
 6633 #ifdef QUOTA
 6634         /* Reference the quotas in case the block count is wrong in the end. */
 6635         quotaref(vp, freeblks->fb_quota);
 6636         (void) chkdq(ip, -datablocks, NOCRED, 0);
 6637 #endif
 6638         freeblks->fb_chkcnt = -datablocks;
 6639         UFS_LOCK(ump);
 6640         fs->fs_pendingblocks += datablocks;
 6641         UFS_UNLOCK(ump);
 6642         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
 6643         /*
 6644          * Handle truncation of incomplete alloc direct dependencies.  We
 6645          * hold the inode block locked to prevent incomplete dependencies
 6646          * from reaching the disk while we are eliminating those that
 6647          * have been truncated.  This is a partially inlined ffs_update().
 6648          */
 6649         ufs_itimes(vp);
 6650         ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
 6651         error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 6652             (int)fs->fs_bsize, cred, &bp);
 6653         if (error) {
 6654                 brelse(bp);
 6655                 softdep_error("softdep_journal_freeblocks", error);
 6656                 return;
 6657         }
 6658         if (bp->b_bufsize == fs->fs_bsize)
 6659                 bp->b_flags |= B_CLUSTEROK;
 6660         softdep_update_inodeblock(ip, bp, 0);
 6661         if (ump->um_fstype == UFS1)
 6662                 *((struct ufs1_dinode *)bp->b_data +
 6663                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
 6664         else
 6665                 *((struct ufs2_dinode *)bp->b_data +
 6666                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
 6667         ACQUIRE_LOCK(ump);
 6668         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 6669         if ((inodedep->id_state & IOSTARTED) != 0)
 6670                 panic("softdep_setup_freeblocks: inode busy");
 6671         /*
 6672          * Add the freeblks structure to the list of operations that
 6673          * must await the zero'ed inode being written to disk. If we
 6674          * still have a bitmap dependency (needj), then the inode
 6675          * has never been written to disk, so we can process the
 6676          * freeblks below once we have deleted the dependencies.
 6677          */
 6678         if (needj)
 6679                 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
 6680         else
 6681                 freeblks->fb_state |= COMPLETE;
 6682         if ((flags & IO_NORMAL) != 0) {
 6683                 TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
 6684                         if (adp->ad_offset > iboff)
 6685                                 cancel_allocdirect(&inodedep->id_inoupdt, adp,
 6686                                     freeblks);
 6687                         /*
 6688                          * Truncate the allocdirect.  We could eliminate
 6689                          * or modify journal records as well.
 6690                          */
 6691                         else if (adp->ad_offset == iboff && frags)
 6692                                 adp->ad_newsize = frags;
 6693                 }
 6694         }
 6695         if ((flags & IO_EXT) != 0)
 6696                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
 6697                         cancel_allocdirect(&inodedep->id_extupdt, adp,
 6698                             freeblks);
 6699         /*
 6700          * Scan the bufwait list for newblock dependencies that will never
 6701          * make it to disk.
 6702          */
 6703         LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
 6704                 if (wk->wk_type != D_ALLOCDIRECT)
 6705                         continue;
 6706                 adp = WK_ALLOCDIRECT(wk);
 6707                 if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
 6708                     ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
 6709                         cancel_jfreeblk(freeblks, adp->ad_newblkno);
 6710                         cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
 6711                         WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
 6712                 }
 6713         }
 6714         /*
 6715          * Add journal work.
 6716          */
 6717         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
 6718                 add_to_journal(&jblkdep->jb_list);
 6719         FREE_LOCK(ump);
 6720         bdwrite(bp);
 6721         /*
 6722          * Truncate dependency structures beyond length.
 6723          */
 6724         trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
 6725         /*
 6726          * This is only set when we need to allocate a fragment because
 6727          * none existed at the end of a frag-sized file.  It handles only
 6728          * allocating a new, zero filled block.
 6729          */
 6730         if (allocblock) {
 6731                 ip->i_size = length - lastoff;
 6732                 DIP_SET(ip, i_size, ip->i_size);
 6733                 error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
 6734                 if (error != 0) {
 6735                         softdep_error("softdep_journal_freeblks", error);
 6736                         return;
 6737                 }
 6738                 ip->i_size = length;
 6739                 DIP_SET(ip, i_size, length);
 6740                 ip->i_flag |= IN_SIZEMOD | IN_CHANGE | IN_UPDATE;
 6741                 allocbuf(bp, frags);
 6742                 ffs_update(vp, 0);
 6743                 bawrite(bp);
 6744         } else if (lastoff != 0 && vp->v_type != VDIR) {
 6745                 int size;
 6746 
 6747                 /*
 6748                  * Zero the end of a truncated frag or block.
 6749                  */
 6750                 size = sblksize(fs, length, lastlbn);
 6751                 error = bread(vp, lastlbn, size, cred, &bp);
 6752                 if (error) {
 6753                         softdep_error("softdep_journal_freeblks", error);
 6754                         return;
 6755                 }
 6756                 bzero((char *)bp->b_data + lastoff, size - lastoff);
 6757                 bawrite(bp);
 6758 
 6759         }
 6760         ACQUIRE_LOCK(ump);
 6761         inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 6762         TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
 6763         freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
 6764         /*
 6765          * We zero earlier truncations so they don't erroneously
 6766          * update i_blocks.
 6767          */
 6768         if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
 6769                 TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
 6770                         fbn->fb_len = 0;
 6771         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
 6772             LIST_EMPTY(&freeblks->fb_jblkdephd))
 6773                 freeblks->fb_state |= INPROGRESS;
 6774         else
 6775                 freeblks = NULL;
 6776         FREE_LOCK(ump);
 6777         if (freeblks)
 6778                 handle_workitem_freeblocks(freeblks, 0);
 6779         trunc_pages(ip, length, extblocks, flags);
 6780 
 6781 }
 6782 
 6783 /*
 6784  * Flush a JOP_SYNC to the journal.
 6785  */
 6786 void
 6787 softdep_journal_fsync(ip)
 6788         struct inode *ip;
 6789 {
 6790         struct jfsync *jfsync;
 6791         struct ufsmount *ump;
 6792 
 6793         ump = ITOUMP(ip);
 6794         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 6795             ("softdep_journal_fsync called on non-softdep filesystem"));
 6796         if ((ip->i_flag & IN_TRUNCATED) == 0)
 6797                 return;
 6798         ip->i_flag &= ~IN_TRUNCATED;
 6799         jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
 6800         workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump));
 6801         jfsync->jfs_size = ip->i_size;
 6802         jfsync->jfs_ino = ip->i_number;
 6803         ACQUIRE_LOCK(ump);
 6804         add_to_journal(&jfsync->jfs_list);
 6805         jwait(&jfsync->jfs_list, MNT_WAIT);
 6806         FREE_LOCK(ump);
 6807 }
 6808 
 6809 /*
 6810  * Block de-allocation dependencies.
 6811  * 
 6812  * When blocks are de-allocated, the on-disk pointers must be nullified before
 6813  * the blocks are made available for use by other files.  (The true
 6814  * requirement is that old pointers must be nullified before new on-disk
 6815  * pointers are set.  We chose this slightly more stringent requirement to
 6816  * reduce complexity.) Our implementation handles this dependency by updating
 6817  * the inode (or indirect block) appropriately but delaying the actual block
 6818  * de-allocation (i.e., freemap and free space count manipulation) until
 6819  * after the updated versions reach stable storage.  After the disk is
 6820  * updated, the blocks can be safely de-allocated whenever it is convenient.
 6821  * This implementation handles only the common case of reducing a file's
 6822  * length to zero. Other cases are handled by the conventional synchronous
 6823  * write approach.
 6824  *
 6825  * The ffs implementation with which we worked double-checks
 6826  * the state of the block pointers and file size as it reduces
 6827  * a file's length.  Some of this code is replicated here in our
 6828  * soft updates implementation.  The freeblks->fb_chkcnt field is
 6829  * used to transfer a part of this information to the procedure
 6830  * that eventually de-allocates the blocks.
 6831  *
 6832  * This routine should be called from the routine that shortens
 6833  * a file's length, before the inode's size or block pointers
 6834  * are modified. It will save the block pointer information for
 6835  * later release and zero the inode so that the calling routine
 6836  * can release it.
 6837  */
 6838 void
 6839 softdep_setup_freeblocks(ip, length, flags)
 6840         struct inode *ip;       /* The inode whose length is to be reduced */
 6841         off_t length;           /* The new length for the file */
 6842         int flags;              /* IO_EXT and/or IO_NORMAL */
 6843 {
 6844         struct ufs1_dinode *dp1;
 6845         struct ufs2_dinode *dp2;
 6846         struct freeblks *freeblks;
 6847         struct inodedep *inodedep;
 6848         struct allocdirect *adp;
 6849         struct ufsmount *ump;
 6850         struct buf *bp;
 6851         struct fs *fs;
 6852         ufs2_daddr_t extblocks, datablocks;
 6853         struct mount *mp;
 6854         int i, delay, error;
 6855         ufs_lbn_t tmpval;
 6856         ufs_lbn_t lbn;
 6857 
 6858         ump = ITOUMP(ip);
 6859         mp = UFSTOVFS(ump);
 6860         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 6861             ("softdep_setup_freeblocks called on non-softdep filesystem"));
 6862         CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
 6863             ip->i_number, length);
 6864         KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
 6865         fs = ump->um_fs;
 6866         if ((error = bread(ump->um_devvp,
 6867             fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 6868             (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
 6869                 brelse(bp);
 6870                 softdep_error("softdep_setup_freeblocks", error);
 6871                 return;
 6872         }
 6873         freeblks = newfreeblks(mp, ip);
 6874         extblocks = 0;
 6875         datablocks = 0;
 6876         if (fs->fs_magic == FS_UFS2_MAGIC)
 6877                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 6878         if ((flags & IO_NORMAL) != 0) {
 6879                 for (i = 0; i < NDADDR; i++)
 6880                         setup_freedirect(freeblks, ip, i, 0);
 6881                 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
 6882                     i++, lbn += tmpval, tmpval *= NINDIR(fs))
 6883                         setup_freeindir(freeblks, ip, i, -lbn -i, 0);
 6884                 ip->i_size = 0;
 6885                 DIP_SET(ip, i_size, 0);
 6886                 ip->i_flag |= IN_SIZEMOD | IN_CHANGE;
 6887                 datablocks = DIP(ip, i_blocks) - extblocks;
 6888         }
 6889         if ((flags & IO_EXT) != 0) {
 6890                 for (i = 0; i < NXADDR; i++)
 6891                         setup_freeext(freeblks, ip, i, 0);
 6892                 ip->i_din2->di_extsize = 0;
 6893                 datablocks += extblocks;
 6894                 ip->i_flag |= IN_SIZEMOD | IN_CHANGE;
 6895         }
 6896 #ifdef QUOTA
 6897         /* Reference the quotas in case the block count is wrong in the end. */
 6898         quotaref(ITOV(ip), freeblks->fb_quota);
 6899         (void) chkdq(ip, -datablocks, NOCRED, 0);
 6900 #endif
 6901         freeblks->fb_chkcnt = -datablocks;
 6902         UFS_LOCK(ump);
 6903         fs->fs_pendingblocks += datablocks;
 6904         UFS_UNLOCK(ump);
 6905         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
 6906         /*
 6907          * Push the zero'ed inode to its disk buffer so that we are free
 6908          * to delete its dependencies below. Once the dependencies are gone
 6909          * the buffer can be safely released.
 6910          */
 6911         if (ump->um_fstype == UFS1) {
 6912                 dp1 = ((struct ufs1_dinode *)bp->b_data +
 6913                     ino_to_fsbo(fs, ip->i_number));
 6914                 ip->i_din1->di_freelink = dp1->di_freelink;
 6915                 *dp1 = *ip->i_din1;
 6916         } else {
 6917                 dp2 = ((struct ufs2_dinode *)bp->b_data +
 6918                     ino_to_fsbo(fs, ip->i_number));
 6919                 ip->i_din2->di_freelink = dp2->di_freelink;
 6920                 *dp2 = *ip->i_din2;
 6921         }
 6922         /*
 6923          * Find and eliminate any inode dependencies.
 6924          */
 6925         ACQUIRE_LOCK(ump);
 6926         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 6927         if ((inodedep->id_state & IOSTARTED) != 0)
 6928                 panic("softdep_setup_freeblocks: inode busy");
 6929         /*
 6930          * Add the freeblks structure to the list of operations that
 6931          * must await the zero'ed inode being written to disk. If we
 6932          * still have a bitmap dependency (delay == 0), then the inode
 6933          * has never been written to disk, so we can process the
 6934          * freeblks below once we have deleted the dependencies.
 6935          */
 6936         delay = (inodedep->id_state & DEPCOMPLETE);
 6937         if (delay)
 6938                 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
 6939         else
 6940                 freeblks->fb_state |= COMPLETE;
 6941         /*
 6942          * Because the file length has been truncated to zero, any
 6943          * pending block allocation dependency structures associated
 6944          * with this inode are obsolete and can simply be de-allocated.
 6945          * We must first merge the two dependency lists to get rid of
 6946          * any duplicate freefrag structures, then purge the merged list.
 6947          * If we still have a bitmap dependency, then the inode has never
 6948          * been written to disk, so we can free any fragments without delay.
 6949          */
 6950         if (flags & IO_NORMAL) {
 6951                 merge_inode_lists(&inodedep->id_newinoupdt,
 6952                     &inodedep->id_inoupdt);
 6953                 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
 6954                         cancel_allocdirect(&inodedep->id_inoupdt, adp,
 6955                             freeblks);
 6956         }
 6957         if (flags & IO_EXT) {
 6958                 merge_inode_lists(&inodedep->id_newextupdt,
 6959                     &inodedep->id_extupdt);
 6960                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
 6961                         cancel_allocdirect(&inodedep->id_extupdt, adp,
 6962                             freeblks);
 6963         }
 6964         FREE_LOCK(ump);
 6965         bdwrite(bp);
 6966         trunc_dependencies(ip, freeblks, -1, 0, flags);
 6967         ACQUIRE_LOCK(ump);
 6968         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 6969                 (void) free_inodedep(inodedep);
 6970         freeblks->fb_state |= DEPCOMPLETE;
 6971         /*
 6972          * If the inode with zeroed block pointers is now on disk
 6973          * we can start freeing blocks.
 6974          */  
 6975         if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
 6976                 freeblks->fb_state |= INPROGRESS;
 6977         else
 6978                 freeblks = NULL;
 6979         FREE_LOCK(ump);
 6980         if (freeblks)
 6981                 handle_workitem_freeblocks(freeblks, 0);
 6982         trunc_pages(ip, length, extblocks, flags);
 6983 }
 6984 
 6985 /*
 6986  * Eliminate pages from the page cache that back parts of this inode and
 6987  * adjust the vnode pager's idea of our size.  This prevents stale data
 6988  * from hanging around in the page cache.
 6989  */
 6990 static void
 6991 trunc_pages(ip, length, extblocks, flags)
 6992         struct inode *ip;
 6993         off_t length;
 6994         ufs2_daddr_t extblocks;
 6995         int flags;
 6996 {
 6997         struct vnode *vp;
 6998         struct fs *fs;
 6999         ufs_lbn_t lbn;
 7000         off_t end, extend;
 7001 
 7002         vp = ITOV(ip);
 7003         fs = ITOFS(ip);
 7004         extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
 7005         if ((flags & IO_EXT) != 0)
 7006                 vn_pages_remove(vp, extend, 0);
 7007         if ((flags & IO_NORMAL) == 0)
 7008                 return;
 7009         BO_LOCK(&vp->v_bufobj);
 7010         drain_output(vp);
 7011         BO_UNLOCK(&vp->v_bufobj);
 7012         /*
 7013          * The vnode pager eliminates file pages we eliminate indirects
 7014          * below.
 7015          */
 7016         vnode_pager_setsize(vp, length);
 7017         /*
 7018          * Calculate the end based on the last indirect we want to keep.  If
 7019          * the block extends into indirects we can just use the negative of
 7020          * its lbn.  Doubles and triples exist at lower numbers so we must
 7021          * be careful not to remove those, if they exist.  double and triple
 7022          * indirect lbns do not overlap with others so it is not important
 7023          * to verify how many levels are required.
 7024          */
 7025         lbn = lblkno(fs, length);
 7026         if (lbn >= NDADDR) {
 7027                 /* Calculate the virtual lbn of the triple indirect. */
 7028                 lbn = -lbn - (NIADDR - 1);
 7029                 end = OFF_TO_IDX(lblktosize(fs, lbn));
 7030         } else
 7031                 end = extend;
 7032         vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
 7033 }
 7034 
 7035 /*
 7036  * See if the buf bp is in the range eliminated by truncation.
 7037  */
 7038 static int
 7039 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
 7040         struct buf *bp;
 7041         int *blkoffp;
 7042         ufs_lbn_t lastlbn;
 7043         int lastoff;
 7044         int flags;
 7045 {
 7046         ufs_lbn_t lbn;
 7047 
 7048         *blkoffp = 0;
 7049         /* Only match ext/normal blocks as appropriate. */
 7050         if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
 7051             ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
 7052                 return (0);
 7053         /* ALTDATA is always a full truncation. */
 7054         if ((bp->b_xflags & BX_ALTDATA) != 0)
 7055                 return (1);
 7056         /* -1 is full truncation. */
 7057         if (lastlbn == -1)
 7058                 return (1);
 7059         /*
 7060          * If this is a partial truncate we only want those
 7061          * blocks and indirect blocks that cover the range
 7062          * we're after.
 7063          */
 7064         lbn = bp->b_lblkno;
 7065         if (lbn < 0)
 7066                 lbn = -(lbn + lbn_level(lbn));
 7067         if (lbn < lastlbn)
 7068                 return (0);
 7069         /* Here we only truncate lblkno if it's partial. */
 7070         if (lbn == lastlbn) {
 7071                 if (lastoff == 0)
 7072                         return (0);
 7073                 *blkoffp = lastoff;
 7074         }
 7075         return (1);
 7076 }
 7077 
 7078 /*
 7079  * Eliminate any dependencies that exist in memory beyond lblkno:off
 7080  */
 7081 static void
 7082 trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
 7083         struct inode *ip;
 7084         struct freeblks *freeblks;
 7085         ufs_lbn_t lastlbn;
 7086         int lastoff;
 7087         int flags;
 7088 {
 7089         struct bufobj *bo;
 7090         struct vnode *vp;
 7091         struct buf *bp;
 7092         int blkoff;
 7093 
 7094         /*
 7095          * We must wait for any I/O in progress to finish so that
 7096          * all potential buffers on the dirty list will be visible.
 7097          * Once they are all there, walk the list and get rid of
 7098          * any dependencies.
 7099          */
 7100         vp = ITOV(ip);
 7101         bo = &vp->v_bufobj;
 7102         BO_LOCK(bo);
 7103         drain_output(vp);
 7104         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
 7105                 bp->b_vflags &= ~BV_SCANNED;
 7106 restart:
 7107         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
 7108                 if (bp->b_vflags & BV_SCANNED)
 7109                         continue;
 7110                 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
 7111                         bp->b_vflags |= BV_SCANNED;
 7112                         continue;
 7113                 }
 7114                 KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
 7115                 if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
 7116                         goto restart;
 7117                 BO_UNLOCK(bo);
 7118                 if (deallocate_dependencies(bp, freeblks, blkoff))
 7119                         bqrelse(bp);
 7120                 else
 7121                         brelse(bp);
 7122                 BO_LOCK(bo);
 7123                 goto restart;
 7124         }
 7125         /*
 7126          * Now do the work of vtruncbuf while also matching indirect blocks.
 7127          */
 7128         TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
 7129                 bp->b_vflags &= ~BV_SCANNED;
 7130 cleanrestart:
 7131         TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
 7132                 if (bp->b_vflags & BV_SCANNED)
 7133                         continue;
 7134                 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
 7135                         bp->b_vflags |= BV_SCANNED;
 7136                         continue;
 7137                 }
 7138                 if (BUF_LOCK(bp,
 7139                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 7140                     BO_LOCKPTR(bo)) == ENOLCK) {
 7141                         BO_LOCK(bo);
 7142                         goto cleanrestart;
 7143                 }
 7144                 bp->b_vflags |= BV_SCANNED;
 7145                 bremfree(bp);
 7146                 if (blkoff != 0) {
 7147                         allocbuf(bp, blkoff);
 7148                         bqrelse(bp);
 7149                 } else {
 7150                         bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
 7151                         brelse(bp);
 7152                 }
 7153                 BO_LOCK(bo);
 7154                 goto cleanrestart;
 7155         }
 7156         drain_output(vp);
 7157         BO_UNLOCK(bo);
 7158 }
 7159 
 7160 static int
 7161 cancel_pagedep(pagedep, freeblks, blkoff)
 7162         struct pagedep *pagedep;
 7163         struct freeblks *freeblks;
 7164         int blkoff;
 7165 {
 7166         struct jremref *jremref;
 7167         struct jmvref *jmvref;
 7168         struct dirrem *dirrem, *tmp;
 7169         int i;
 7170 
 7171         /*
 7172          * Copy any directory remove dependencies to the list
 7173          * to be processed after the freeblks proceeds.  If
 7174          * directory entry never made it to disk they
 7175          * can be dumped directly onto the work list.
 7176          */
 7177         LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
 7178                 /* Skip this directory removal if it is intended to remain. */
 7179                 if (dirrem->dm_offset < blkoff)
 7180                         continue;
 7181                 /*
 7182                  * If there are any dirrems we wait for the journal write
 7183                  * to complete and then restart the buf scan as the lock
 7184                  * has been dropped.
 7185                  */
 7186                 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
 7187                         jwait(&jremref->jr_list, MNT_WAIT);
 7188                         return (ERESTART);
 7189                 }
 7190                 LIST_REMOVE(dirrem, dm_next);
 7191                 dirrem->dm_dirinum = pagedep->pd_ino;
 7192                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
 7193         }
 7194         while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
 7195                 jwait(&jmvref->jm_list, MNT_WAIT);
 7196                 return (ERESTART);
 7197         }
 7198         /*
 7199          * When we're partially truncating a pagedep we just want to flush
 7200          * journal entries and return.  There can not be any adds in the
 7201          * truncated portion of the directory and newblk must remain if
 7202          * part of the block remains.
 7203          */
 7204         if (blkoff != 0) {
 7205                 struct diradd *dap;
 7206 
 7207                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
 7208                         if (dap->da_offset > blkoff)
 7209                                 panic("cancel_pagedep: diradd %p off %d > %d",
 7210                                     dap, dap->da_offset, blkoff);
 7211                 for (i = 0; i < DAHASHSZ; i++)
 7212                         LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
 7213                                 if (dap->da_offset > blkoff)
 7214                                         panic("cancel_pagedep: diradd %p off %d > %d",
 7215                                             dap, dap->da_offset, blkoff);
 7216                 return (0);
 7217         }
 7218         /*
 7219          * There should be no directory add dependencies present
 7220          * as the directory could not be truncated until all
 7221          * children were removed.
 7222          */
 7223         KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
 7224             ("deallocate_dependencies: pendinghd != NULL"));
 7225         for (i = 0; i < DAHASHSZ; i++)
 7226                 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
 7227                     ("deallocate_dependencies: diraddhd != NULL"));
 7228         if ((pagedep->pd_state & NEWBLOCK) != 0)
 7229                 free_newdirblk(pagedep->pd_newdirblk);
 7230         if (free_pagedep(pagedep) == 0)
 7231                 panic("Failed to free pagedep %p", pagedep);
 7232         return (0);
 7233 }
 7234 
 7235 /*
 7236  * Reclaim any dependency structures from a buffer that is about to
 7237  * be reallocated to a new vnode. The buffer must be locked, thus,
 7238  * no I/O completion operations can occur while we are manipulating
 7239  * its associated dependencies. The mutex is held so that other I/O's
 7240  * associated with related dependencies do not occur.
 7241  */
 7242 static int
 7243 deallocate_dependencies(bp, freeblks, off)
 7244         struct buf *bp;
 7245         struct freeblks *freeblks;
 7246         int off;
 7247 {
 7248         struct indirdep *indirdep;
 7249         struct pagedep *pagedep;
 7250         struct worklist *wk, *wkn;
 7251         struct ufsmount *ump;
 7252 
 7253         ump = softdep_bp_to_mp(bp);
 7254         if (ump == NULL)
 7255                 goto done;
 7256         ACQUIRE_LOCK(ump);
 7257         LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
 7258                 switch (wk->wk_type) {
 7259                 case D_INDIRDEP:
 7260                         indirdep = WK_INDIRDEP(wk);
 7261                         if (bp->b_lblkno >= 0 ||
 7262                             bp->b_blkno != indirdep->ir_savebp->b_lblkno)
 7263                                 panic("deallocate_dependencies: not indir");
 7264                         cancel_indirdep(indirdep, bp, freeblks);
 7265                         continue;
 7266 
 7267                 case D_PAGEDEP:
 7268                         pagedep = WK_PAGEDEP(wk);
 7269                         if (cancel_pagedep(pagedep, freeblks, off)) {
 7270                                 FREE_LOCK(ump);
 7271                                 return (ERESTART);
 7272                         }
 7273                         continue;
 7274 
 7275                 case D_ALLOCINDIR:
 7276                         /*
 7277                          * Simply remove the allocindir, we'll find it via
 7278                          * the indirdep where we can clear pointers if
 7279                          * needed.
 7280                          */
 7281                         WORKLIST_REMOVE(wk);
 7282                         continue;
 7283 
 7284                 case D_FREEWORK:
 7285                         /*
 7286                          * A truncation is waiting for the zero'd pointers
 7287                          * to be written.  It can be freed when the freeblks
 7288                          * is journaled.
 7289                          */
 7290                         WORKLIST_REMOVE(wk);
 7291                         wk->wk_state |= ONDEPLIST;
 7292                         WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
 7293                         break;
 7294 
 7295                 case D_ALLOCDIRECT:
 7296                         if (off != 0)
 7297                                 continue;
 7298                         /* FALLTHROUGH */
 7299                 default:
 7300                         panic("deallocate_dependencies: Unexpected type %s",
 7301                             TYPENAME(wk->wk_type));
 7302                         /* NOTREACHED */
 7303                 }
 7304         }
 7305         FREE_LOCK(ump);
 7306 done:
 7307         /*
 7308          * Don't throw away this buf, we were partially truncating and
 7309          * some deps may always remain.
 7310          */
 7311         if (off) {
 7312                 allocbuf(bp, off);
 7313                 bp->b_vflags |= BV_SCANNED;
 7314                 return (EBUSY);
 7315         }
 7316         bp->b_flags |= B_INVAL | B_NOCACHE;
 7317 
 7318         return (0);
 7319 }
 7320 
 7321 /*
 7322  * An allocdirect is being canceled due to a truncate.  We must make sure
 7323  * the journal entry is released in concert with the blkfree that releases
 7324  * the storage.  Completed journal entries must not be released until the
 7325  * space is no longer pointed to by the inode or in the bitmap.
 7326  */
 7327 static void
 7328 cancel_allocdirect(adphead, adp, freeblks)
 7329         struct allocdirectlst *adphead;
 7330         struct allocdirect *adp;
 7331         struct freeblks *freeblks;
 7332 {
 7333         struct freework *freework;
 7334         struct newblk *newblk;
 7335         struct worklist *wk;
 7336 
 7337         TAILQ_REMOVE(adphead, adp, ad_next);
 7338         newblk = (struct newblk *)adp;
 7339         freework = NULL;
 7340         /*
 7341          * Find the correct freework structure.
 7342          */
 7343         LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
 7344                 if (wk->wk_type != D_FREEWORK)
 7345                         continue;
 7346                 freework = WK_FREEWORK(wk);
 7347                 if (freework->fw_blkno == newblk->nb_newblkno)
 7348                         break;
 7349         }
 7350         if (freework == NULL)
 7351                 panic("cancel_allocdirect: Freework not found");
 7352         /*
 7353          * If a newblk exists at all we still have the journal entry that
 7354          * initiated the allocation so we do not need to journal the free.
 7355          */
 7356         cancel_jfreeblk(freeblks, freework->fw_blkno);
 7357         /*
 7358          * If the journal hasn't been written the jnewblk must be passed
 7359          * to the call to ffs_blkfree that reclaims the space.  We accomplish
 7360          * this by linking the journal dependency into the freework to be
 7361          * freed when freework_freeblock() is called.  If the journal has
 7362          * been written we can simply reclaim the journal space when the
 7363          * freeblks work is complete.
 7364          */
 7365         freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
 7366             &freeblks->fb_jwork);
 7367         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
 7368 }
 7369 
 7370 
 7371 /*
 7372  * Cancel a new block allocation.  May be an indirect or direct block.  We
 7373  * remove it from various lists and return any journal record that needs to
 7374  * be resolved by the caller.
 7375  *
 7376  * A special consideration is made for indirects which were never pointed
 7377  * at on disk and will never be found once this block is released.
 7378  */
 7379 static struct jnewblk *
 7380 cancel_newblk(newblk, wk, wkhd)
 7381         struct newblk *newblk;
 7382         struct worklist *wk;
 7383         struct workhead *wkhd;
 7384 {
 7385         struct jnewblk *jnewblk;
 7386 
 7387         CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
 7388             
 7389         newblk->nb_state |= GOINGAWAY;
 7390         /*
 7391          * Previously we traversed the completedhd on each indirdep
 7392          * attached to this newblk to cancel them and gather journal
 7393          * work.  Since we need only the oldest journal segment and
 7394          * the lowest point on the tree will always have the oldest
 7395          * journal segment we are free to release the segments
 7396          * of any subordinates and may leave the indirdep list to
 7397          * indirdep_complete() when this newblk is freed.
 7398          */
 7399         if (newblk->nb_state & ONDEPLIST) {
 7400                 newblk->nb_state &= ~ONDEPLIST;
 7401                 LIST_REMOVE(newblk, nb_deps);
 7402         }
 7403         if (newblk->nb_state & ONWORKLIST)
 7404                 WORKLIST_REMOVE(&newblk->nb_list);
 7405         /*
 7406          * If the journal entry hasn't been written we save a pointer to
 7407          * the dependency that frees it until it is written or the
 7408          * superseding operation completes.
 7409          */
 7410         jnewblk = newblk->nb_jnewblk;
 7411         if (jnewblk != NULL && wk != NULL) {
 7412                 newblk->nb_jnewblk = NULL;
 7413                 jnewblk->jn_dep = wk;
 7414         }
 7415         if (!LIST_EMPTY(&newblk->nb_jwork))
 7416                 jwork_move(wkhd, &newblk->nb_jwork);
 7417         /*
 7418          * When truncating we must free the newdirblk early to remove
 7419          * the pagedep from the hash before returning.
 7420          */
 7421         if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
 7422                 free_newdirblk(WK_NEWDIRBLK(wk));
 7423         if (!LIST_EMPTY(&newblk->nb_newdirblk))
 7424                 panic("cancel_newblk: extra newdirblk");
 7425 
 7426         return (jnewblk);
 7427 }
 7428 
 7429 /*
 7430  * Schedule the freefrag associated with a newblk to be released once
 7431  * the pointers are written and the previous block is no longer needed.
 7432  */
 7433 static void
 7434 newblk_freefrag(newblk)
 7435         struct newblk *newblk;
 7436 {
 7437         struct freefrag *freefrag;
 7438 
 7439         if (newblk->nb_freefrag == NULL)
 7440                 return;
 7441         freefrag = newblk->nb_freefrag;
 7442         newblk->nb_freefrag = NULL;
 7443         freefrag->ff_state |= COMPLETE;
 7444         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
 7445                 add_to_worklist(&freefrag->ff_list, 0);
 7446 }
 7447 
 7448 /*
 7449  * Free a newblk. Generate a new freefrag work request if appropriate.
 7450  * This must be called after the inode pointer and any direct block pointers
 7451  * are valid or fully removed via truncate or frag extension.
 7452  */
 7453 static void
 7454 free_newblk(newblk)
 7455         struct newblk *newblk;
 7456 {
 7457         struct indirdep *indirdep;
 7458         struct worklist *wk;
 7459 
 7460         KASSERT(newblk->nb_jnewblk == NULL,
 7461             ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
 7462         KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
 7463             ("free_newblk: unclaimed newblk"));
 7464         LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
 7465         newblk_freefrag(newblk);
 7466         if (newblk->nb_state & ONDEPLIST)
 7467                 LIST_REMOVE(newblk, nb_deps);
 7468         if (newblk->nb_state & ONWORKLIST)
 7469                 WORKLIST_REMOVE(&newblk->nb_list);
 7470         LIST_REMOVE(newblk, nb_hash);
 7471         if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
 7472                 free_newdirblk(WK_NEWDIRBLK(wk));
 7473         if (!LIST_EMPTY(&newblk->nb_newdirblk))
 7474                 panic("free_newblk: extra newdirblk");
 7475         while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
 7476                 indirdep_complete(indirdep);
 7477         handle_jwork(&newblk->nb_jwork);
 7478         WORKITEM_FREE(newblk, D_NEWBLK);
 7479 }
 7480 
 7481 /*
 7482  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
 7483  * This routine must be called with splbio interrupts blocked.
 7484  */
 7485 static void
 7486 free_newdirblk(newdirblk)
 7487         struct newdirblk *newdirblk;
 7488 {
 7489         struct pagedep *pagedep;
 7490         struct diradd *dap;
 7491         struct worklist *wk;
 7492 
 7493         LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
 7494         WORKLIST_REMOVE(&newdirblk->db_list);
 7495         /*
 7496          * If the pagedep is still linked onto the directory buffer
 7497          * dependency chain, then some of the entries on the
 7498          * pd_pendinghd list may not be committed to disk yet. In
 7499          * this case, we will simply clear the NEWBLOCK flag and
 7500          * let the pd_pendinghd list be processed when the pagedep
 7501          * is next written. If the pagedep is no longer on the buffer
 7502          * dependency chain, then all the entries on the pd_pending
 7503          * list are committed to disk and we can free them here.
 7504          */
 7505         pagedep = newdirblk->db_pagedep;
 7506         pagedep->pd_state &= ~NEWBLOCK;
 7507         if ((pagedep->pd_state & ONWORKLIST) == 0) {
 7508                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 7509                         free_diradd(dap, NULL);
 7510                 /*
 7511                  * If no dependencies remain, the pagedep will be freed.
 7512                  */
 7513                 free_pagedep(pagedep);
 7514         }
 7515         /* Should only ever be one item in the list. */
 7516         while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
 7517                 WORKLIST_REMOVE(wk);
 7518                 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
 7519         }
 7520         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 7521 }
 7522 
 7523 /*
 7524  * Prepare an inode to be freed. The actual free operation is not
 7525  * done until the zero'ed inode has been written to disk.
 7526  */
 7527 void
 7528 softdep_freefile(pvp, ino, mode)
 7529         struct vnode *pvp;
 7530         ino_t ino;
 7531         int mode;
 7532 {
 7533         struct inode *ip = VTOI(pvp);
 7534         struct inodedep *inodedep;
 7535         struct freefile *freefile;
 7536         struct freeblks *freeblks;
 7537         struct ufsmount *ump;
 7538 
 7539         ump = ITOUMP(ip);
 7540         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 7541             ("softdep_freefile called on non-softdep filesystem"));
 7542         /*
 7543          * This sets up the inode de-allocation dependency.
 7544          */
 7545         freefile = malloc(sizeof(struct freefile),
 7546                 M_FREEFILE, M_SOFTDEP_FLAGS);
 7547         workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
 7548         freefile->fx_mode = mode;
 7549         freefile->fx_oldinum = ino;
 7550         freefile->fx_devvp = ump->um_devvp;
 7551         LIST_INIT(&freefile->fx_jwork);
 7552         UFS_LOCK(ump);
 7553         ump->um_fs->fs_pendinginodes += 1;
 7554         UFS_UNLOCK(ump);
 7555 
 7556         /*
 7557          * If the inodedep does not exist, then the zero'ed inode has
 7558          * been written to disk. If the allocated inode has never been
 7559          * written to disk, then the on-disk inode is zero'ed. In either
 7560          * case we can free the file immediately.  If the journal was
 7561          * canceled before being written the inode will never make it to
 7562          * disk and we must send the canceled journal entrys to
 7563          * ffs_freefile() to be cleared in conjunction with the bitmap.
 7564          * Any blocks waiting on the inode to write can be safely freed
 7565          * here as it will never been written.
 7566          */
 7567         ACQUIRE_LOCK(ump);
 7568         inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
 7569         if (inodedep) {
 7570                 /*
 7571                  * Clear out freeblks that no longer need to reference
 7572                  * this inode.
 7573                  */
 7574                 while ((freeblks =
 7575                     TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
 7576                         TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
 7577                             fb_next);
 7578                         freeblks->fb_state &= ~ONDEPLIST;
 7579                 }
 7580                 /*
 7581                  * Remove this inode from the unlinked list.
 7582                  */
 7583                 if (inodedep->id_state & UNLINKED) {
 7584                         /*
 7585                          * Save the journal work to be freed with the bitmap
 7586                          * before we clear UNLINKED.  Otherwise it can be lost
 7587                          * if the inode block is written.
 7588                          */
 7589                         handle_bufwait(inodedep, &freefile->fx_jwork);
 7590                         clear_unlinked_inodedep(inodedep);
 7591                         /*
 7592                          * Re-acquire inodedep as we've dropped the
 7593                          * per-filesystem lock in clear_unlinked_inodedep().
 7594                          */
 7595                         inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
 7596                 }
 7597         }
 7598         if (inodedep == NULL || check_inode_unwritten(inodedep)) {
 7599                 FREE_LOCK(ump);
 7600                 handle_workitem_freefile(freefile);
 7601                 return;
 7602         }
 7603         if ((inodedep->id_state & DEPCOMPLETE) == 0)
 7604                 inodedep->id_state |= GOINGAWAY;
 7605         WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
 7606         FREE_LOCK(ump);
 7607         if (ip->i_number == ino)
 7608                 ip->i_flag |= IN_MODIFIED;
 7609 }
 7610 
 7611 /*
 7612  * Check to see if an inode has never been written to disk. If
 7613  * so free the inodedep and return success, otherwise return failure.
 7614  * This routine must be called with splbio interrupts blocked.
 7615  *
 7616  * If we still have a bitmap dependency, then the inode has never
 7617  * been written to disk. Drop the dependency as it is no longer
 7618  * necessary since the inode is being deallocated. We set the
 7619  * ALLCOMPLETE flags since the bitmap now properly shows that the
 7620  * inode is not allocated. Even if the inode is actively being
 7621  * written, it has been rolled back to its zero'ed state, so we
 7622  * are ensured that a zero inode is what is on the disk. For short
 7623  * lived files, this change will usually result in removing all the
 7624  * dependencies from the inode so that it can be freed immediately.
 7625  */
 7626 static int
 7627 check_inode_unwritten(inodedep)
 7628         struct inodedep *inodedep;
 7629 {
 7630 
 7631         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
 7632 
 7633         if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
 7634             !LIST_EMPTY(&inodedep->id_dirremhd) ||
 7635             !LIST_EMPTY(&inodedep->id_pendinghd) ||
 7636             !LIST_EMPTY(&inodedep->id_bufwait) ||
 7637             !LIST_EMPTY(&inodedep->id_inowait) ||
 7638             !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
 7639             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 7640             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
 7641             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 7642             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 7643             !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
 7644             inodedep->id_mkdiradd != NULL || 
 7645             inodedep->id_nlinkdelta != 0)
 7646                 return (0);
 7647         /*
 7648          * Another process might be in initiate_write_inodeblock_ufs[12]
 7649          * trying to allocate memory without holding "Softdep Lock".
 7650          */
 7651         if ((inodedep->id_state & IOSTARTED) != 0 &&
 7652             inodedep->id_savedino1 == NULL)
 7653                 return (0);
 7654 
 7655         if (inodedep->id_state & ONDEPLIST)
 7656                 LIST_REMOVE(inodedep, id_deps);
 7657         inodedep->id_state &= ~ONDEPLIST;
 7658         inodedep->id_state |= ALLCOMPLETE;
 7659         inodedep->id_bmsafemap = NULL;
 7660         if (inodedep->id_state & ONWORKLIST)
 7661                 WORKLIST_REMOVE(&inodedep->id_list);
 7662         if (inodedep->id_savedino1 != NULL) {
 7663                 free(inodedep->id_savedino1, M_SAVEDINO);
 7664                 inodedep->id_savedino1 = NULL;
 7665         }
 7666         if (free_inodedep(inodedep) == 0)
 7667                 panic("check_inode_unwritten: busy inode");
 7668         return (1);
 7669 }
 7670 
 7671 static int
 7672 check_inodedep_free(inodedep)
 7673         struct inodedep *inodedep;
 7674 {
 7675 
 7676         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
 7677         if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
 7678             !LIST_EMPTY(&inodedep->id_dirremhd) ||
 7679             !LIST_EMPTY(&inodedep->id_pendinghd) ||
 7680             !LIST_EMPTY(&inodedep->id_bufwait) ||
 7681             !LIST_EMPTY(&inodedep->id_inowait) ||
 7682             !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
 7683             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 7684             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
 7685             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 7686             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 7687             !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
 7688             inodedep->id_mkdiradd != NULL ||
 7689             inodedep->id_nlinkdelta != 0 ||
 7690             inodedep->id_savedino1 != NULL)
 7691                 return (0);
 7692         return (1);
 7693 }
 7694 
 7695 /*
 7696  * Try to free an inodedep structure. Return 1 if it could be freed.
 7697  */
 7698 static int
 7699 free_inodedep(inodedep)
 7700         struct inodedep *inodedep;
 7701 {
 7702 
 7703         LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
 7704         if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
 7705             !check_inodedep_free(inodedep))
 7706                 return (0);
 7707         if (inodedep->id_state & ONDEPLIST)
 7708                 LIST_REMOVE(inodedep, id_deps);
 7709         LIST_REMOVE(inodedep, id_hash);
 7710         WORKITEM_FREE(inodedep, D_INODEDEP);
 7711         return (1);
 7712 }
 7713 
 7714 /*
 7715  * Free the block referenced by a freework structure.  The parent freeblks
 7716  * structure is released and completed when the final cg bitmap reaches
 7717  * the disk.  This routine may be freeing a jnewblk which never made it to
 7718  * disk in which case we do not have to wait as the operation is undone
 7719  * in memory immediately.
 7720  */
 7721 static void
 7722 freework_freeblock(freework)
 7723         struct freework *freework;
 7724 {
 7725         struct freeblks *freeblks;
 7726         struct jnewblk *jnewblk;
 7727         struct ufsmount *ump;
 7728         struct workhead wkhd;
 7729         struct fs *fs;
 7730         int bsize;
 7731         int needj;
 7732 
 7733         ump = VFSTOUFS(freework->fw_list.wk_mp);
 7734         LOCK_OWNED(ump);
 7735         /*
 7736          * Handle partial truncate separately.
 7737          */
 7738         if (freework->fw_indir) {
 7739                 complete_trunc_indir(freework);
 7740                 return;
 7741         }
 7742         freeblks = freework->fw_freeblks;
 7743         fs = ump->um_fs;
 7744         needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
 7745         bsize = lfragtosize(fs, freework->fw_frags);
 7746         LIST_INIT(&wkhd);
 7747         /*
 7748          * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
 7749          * on the indirblk hashtable and prevents premature freeing.
 7750          */
 7751         freework->fw_state |= DEPCOMPLETE;
 7752         /*
 7753          * SUJ needs to wait for the segment referencing freed indirect
 7754          * blocks to expire so that we know the checker will not confuse
 7755          * a re-allocated indirect block with its old contents.
 7756          */
 7757         if (needj && freework->fw_lbn <= -NDADDR)
 7758                 indirblk_insert(freework);
 7759         /*
 7760          * If we are canceling an existing jnewblk pass it to the free
 7761          * routine, otherwise pass the freeblk which will ultimately
 7762          * release the freeblks.  If we're not journaling, we can just
 7763          * free the freeblks immediately.
 7764          */
 7765         jnewblk = freework->fw_jnewblk;
 7766         if (jnewblk != NULL) {
 7767                 cancel_jnewblk(jnewblk, &wkhd);
 7768                 needj = 0;
 7769         } else if (needj) {
 7770                 freework->fw_state |= DELAYEDFREE;
 7771                 freeblks->fb_cgwait++;
 7772                 WORKLIST_INSERT(&wkhd, &freework->fw_list);
 7773         }
 7774         FREE_LOCK(ump);
 7775         freeblks_free(ump, freeblks, btodb(bsize));
 7776         CTR4(KTR_SUJ,
 7777             "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
 7778             freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
 7779         ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
 7780             freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
 7781         ACQUIRE_LOCK(ump);
 7782         /*
 7783          * The jnewblk will be discarded and the bits in the map never
 7784          * made it to disk.  We can immediately free the freeblk.
 7785          */
 7786         if (needj == 0)
 7787                 handle_written_freework(freework);
 7788 }
 7789 
 7790 /*
 7791  * We enqueue freework items that need processing back on the freeblks and
 7792  * add the freeblks to the worklist.  This makes it easier to find all work
 7793  * required to flush a truncation in process_truncates().
 7794  */
 7795 static void
 7796 freework_enqueue(freework)
 7797         struct freework *freework;
 7798 {
 7799         struct freeblks *freeblks;
 7800 
 7801         freeblks = freework->fw_freeblks;
 7802         if ((freework->fw_state & INPROGRESS) == 0)
 7803                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
 7804         if ((freeblks->fb_state &
 7805             (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
 7806             LIST_EMPTY(&freeblks->fb_jblkdephd))
 7807                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
 7808 }
 7809 
 7810 /*
 7811  * Start, continue, or finish the process of freeing an indirect block tree.
 7812  * The free operation may be paused at any point with fw_off containing the
 7813  * offset to restart from.  This enables us to implement some flow control
 7814  * for large truncates which may fan out and generate a huge number of
 7815  * dependencies.
 7816  */
 7817 static void
 7818 handle_workitem_indirblk(freework)
 7819         struct freework *freework;
 7820 {
 7821         struct freeblks *freeblks;
 7822         struct ufsmount *ump;
 7823         struct fs *fs;
 7824 
 7825         freeblks = freework->fw_freeblks;
 7826         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 7827         fs = ump->um_fs;
 7828         if (freework->fw_state & DEPCOMPLETE) {
 7829                 handle_written_freework(freework);
 7830                 return;
 7831         }
 7832         if (freework->fw_off == NINDIR(fs)) {
 7833                 freework_freeblock(freework);
 7834                 return;
 7835         }
 7836         freework->fw_state |= INPROGRESS;
 7837         FREE_LOCK(ump);
 7838         indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
 7839             freework->fw_lbn);
 7840         ACQUIRE_LOCK(ump);
 7841 }
 7842 
 7843 /*
 7844  * Called when a freework structure attached to a cg buf is written.  The
 7845  * ref on either the parent or the freeblks structure is released and
 7846  * the freeblks is added back to the worklist if there is more work to do.
 7847  */
 7848 static void
 7849 handle_written_freework(freework)
 7850         struct freework *freework;
 7851 {
 7852         struct freeblks *freeblks;
 7853         struct freework *parent;
 7854 
 7855         freeblks = freework->fw_freeblks;
 7856         parent = freework->fw_parent;
 7857         if (freework->fw_state & DELAYEDFREE)
 7858                 freeblks->fb_cgwait--;
 7859         freework->fw_state |= COMPLETE;
 7860         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
 7861                 WORKITEM_FREE(freework, D_FREEWORK);
 7862         if (parent) {
 7863                 if (--parent->fw_ref == 0)
 7864                         freework_enqueue(parent);
 7865                 return;
 7866         }
 7867         if (--freeblks->fb_ref != 0)
 7868                 return;
 7869         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
 7870             ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) 
 7871                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
 7872 }
 7873 
 7874 /*
 7875  * This workitem routine performs the block de-allocation.
 7876  * The workitem is added to the pending list after the updated
 7877  * inode block has been written to disk.  As mentioned above,
 7878  * checks regarding the number of blocks de-allocated (compared
 7879  * to the number of blocks allocated for the file) are also
 7880  * performed in this function.
 7881  */
 7882 static int
 7883 handle_workitem_freeblocks(freeblks, flags)
 7884         struct freeblks *freeblks;
 7885         int flags;
 7886 {
 7887         struct freework *freework;
 7888         struct newblk *newblk;
 7889         struct allocindir *aip;
 7890         struct ufsmount *ump;
 7891         struct worklist *wk;
 7892 
 7893         KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
 7894             ("handle_workitem_freeblocks: Journal entries not written."));
 7895         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 7896         ACQUIRE_LOCK(ump);
 7897         while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
 7898                 WORKLIST_REMOVE(wk);
 7899                 switch (wk->wk_type) {
 7900                 case D_DIRREM:
 7901                         wk->wk_state |= COMPLETE;
 7902                         add_to_worklist(wk, 0);
 7903                         continue;
 7904 
 7905                 case D_ALLOCDIRECT:
 7906                         free_newblk(WK_NEWBLK(wk));
 7907                         continue;
 7908 
 7909                 case D_ALLOCINDIR:
 7910                         aip = WK_ALLOCINDIR(wk);
 7911                         freework = NULL;
 7912                         if (aip->ai_state & DELAYEDFREE) {
 7913                                 FREE_LOCK(ump);
 7914                                 freework = newfreework(ump, freeblks, NULL,
 7915                                     aip->ai_lbn, aip->ai_newblkno,
 7916                                     ump->um_fs->fs_frag, 0, 0);
 7917                                 ACQUIRE_LOCK(ump);
 7918                         }
 7919                         newblk = WK_NEWBLK(wk);
 7920                         if (newblk->nb_jnewblk) {
 7921                                 freework->fw_jnewblk = newblk->nb_jnewblk;
 7922                                 newblk->nb_jnewblk->jn_dep = &freework->fw_list;
 7923                                 newblk->nb_jnewblk = NULL;
 7924                         }
 7925                         free_newblk(newblk);
 7926                         continue;
 7927 
 7928                 case D_FREEWORK:
 7929                         freework = WK_FREEWORK(wk);
 7930                         if (freework->fw_lbn <= -NDADDR)
 7931                                 handle_workitem_indirblk(freework);
 7932                         else
 7933                                 freework_freeblock(freework);
 7934                         continue;
 7935                 default:
 7936                         panic("handle_workitem_freeblocks: Unknown type %s",
 7937                             TYPENAME(wk->wk_type));
 7938                 }
 7939         }
 7940         if (freeblks->fb_ref != 0) {
 7941                 freeblks->fb_state &= ~INPROGRESS;
 7942                 wake_worklist(&freeblks->fb_list);
 7943                 freeblks = NULL;
 7944         }
 7945         FREE_LOCK(ump);
 7946         if (freeblks)
 7947                 return handle_complete_freeblocks(freeblks, flags);
 7948         return (0);
 7949 }
 7950 
 7951 /*
 7952  * Handle completion of block free via truncate.  This allows fs_pending
 7953  * to track the actual free block count more closely than if we only updated
 7954  * it at the end.  We must be careful to handle cases where the block count
 7955  * on free was incorrect.
 7956  */
 7957 static void
 7958 freeblks_free(ump, freeblks, blocks)
 7959         struct ufsmount *ump;
 7960         struct freeblks *freeblks;
 7961         int blocks;
 7962 {
 7963         struct fs *fs;
 7964         ufs2_daddr_t remain;
 7965 
 7966         UFS_LOCK(ump);
 7967         remain = -freeblks->fb_chkcnt;
 7968         freeblks->fb_chkcnt += blocks;
 7969         if (remain > 0) {
 7970                 if (remain < blocks)
 7971                         blocks = remain;
 7972                 fs = ump->um_fs;
 7973                 fs->fs_pendingblocks -= blocks;
 7974         }
 7975         UFS_UNLOCK(ump);
 7976 }
 7977 
 7978 /*
 7979  * Once all of the freework workitems are complete we can retire the
 7980  * freeblocks dependency and any journal work awaiting completion.  This
 7981  * can not be called until all other dependencies are stable on disk.
 7982  */
 7983 static int
 7984 handle_complete_freeblocks(freeblks, flags)
 7985         struct freeblks *freeblks;
 7986         int flags;
 7987 {
 7988         struct inodedep *inodedep;
 7989         struct inode *ip;
 7990         struct vnode *vp;
 7991         struct fs *fs;
 7992         struct ufsmount *ump;
 7993         ufs2_daddr_t spare;
 7994 
 7995         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 7996         fs = ump->um_fs;
 7997         flags = LK_EXCLUSIVE | flags;
 7998         spare = freeblks->fb_chkcnt;
 7999 
 8000         /*
 8001          * If we did not release the expected number of blocks we may have
 8002          * to adjust the inode block count here.  Only do so if it wasn't
 8003          * a truncation to zero and the modrev still matches.
 8004          */
 8005         if (spare && freeblks->fb_len != 0) {
 8006                 if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
 8007                     flags, &vp, FFSV_FORCEINSMQ) != 0)
 8008                         return (EBUSY);
 8009                 ip = VTOI(vp);
 8010                 if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
 8011                         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
 8012                         ip->i_flag |= IN_CHANGE;
 8013                         /*
 8014                          * We must wait so this happens before the
 8015                          * journal is reclaimed.
 8016                          */
 8017                         ffs_update(vp, 1);
 8018                 }
 8019                 vput(vp);
 8020         }
 8021         if (spare < 0) {
 8022                 UFS_LOCK(ump);
 8023                 fs->fs_pendingblocks += spare;
 8024                 UFS_UNLOCK(ump);
 8025         }
 8026 #ifdef QUOTA
 8027         /* Handle spare. */
 8028         if (spare)
 8029                 quotaadj(freeblks->fb_quota, ump, -spare);
 8030         quotarele(freeblks->fb_quota);
 8031 #endif
 8032         ACQUIRE_LOCK(ump);
 8033         if (freeblks->fb_state & ONDEPLIST) {
 8034                 inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
 8035                     0, &inodedep);
 8036                 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
 8037                 freeblks->fb_state &= ~ONDEPLIST;
 8038                 if (TAILQ_EMPTY(&inodedep->id_freeblklst))
 8039                         free_inodedep(inodedep);
 8040         }
 8041         /*
 8042          * All of the freeblock deps must be complete prior to this call
 8043          * so it's now safe to complete earlier outstanding journal entries.
 8044          */
 8045         handle_jwork(&freeblks->fb_jwork);
 8046         WORKITEM_FREE(freeblks, D_FREEBLKS);
 8047         FREE_LOCK(ump);
 8048         return (0);
 8049 }
 8050 
 8051 /*
 8052  * Release blocks associated with the freeblks and stored in the indirect
 8053  * block dbn. If level is greater than SINGLE, the block is an indirect block
 8054  * and recursive calls to indirtrunc must be used to cleanse other indirect
 8055  * blocks.
 8056  *
 8057  * This handles partial and complete truncation of blocks.  Partial is noted
 8058  * with goingaway == 0.  In this case the freework is completed after the
 8059  * zero'd indirects are written to disk.  For full truncation the freework
 8060  * is completed after the block is freed.
 8061  */
 8062 static void
 8063 indir_trunc(freework, dbn, lbn)
 8064         struct freework *freework;
 8065         ufs2_daddr_t dbn;
 8066         ufs_lbn_t lbn;
 8067 {
 8068         struct freework *nfreework;
 8069         struct workhead wkhd;
 8070         struct freeblks *freeblks;
 8071         struct buf *bp;
 8072         struct fs *fs;
 8073         struct indirdep *indirdep;
 8074         struct ufsmount *ump;
 8075         ufs1_daddr_t *bap1;
 8076         ufs2_daddr_t nb, nnb, *bap2;
 8077         ufs_lbn_t lbnadd, nlbn;
 8078         int i, nblocks, ufs1fmt;
 8079         int freedblocks;
 8080         int goingaway;
 8081         int freedeps;
 8082         int needj;
 8083         int level;
 8084         int cnt;
 8085 
 8086         freeblks = freework->fw_freeblks;
 8087         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 8088         fs = ump->um_fs;
 8089         /*
 8090          * Get buffer of block pointers to be freed.  There are three cases:
 8091          * 
 8092          * 1) Partial truncate caches the indirdep pointer in the freework
 8093          *    which provides us a back copy to the save bp which holds the
 8094          *    pointers we want to clear.  When this completes the zero
 8095          *    pointers are written to the real copy.
 8096          * 2) The indirect is being completely truncated, cancel_indirdep()
 8097          *    eliminated the real copy and placed the indirdep on the saved
 8098          *    copy.  The indirdep and buf are discarded when this completes.
 8099          * 3) The indirect was not in memory, we read a copy off of the disk
 8100          *    using the devvp and drop and invalidate the buffer when we're
 8101          *    done.
 8102          */
 8103         goingaway = 1;
 8104         indirdep = NULL;
 8105         if (freework->fw_indir != NULL) {
 8106                 goingaway = 0;
 8107                 indirdep = freework->fw_indir;
 8108                 bp = indirdep->ir_savebp;
 8109                 if (bp == NULL || bp->b_blkno != dbn)
 8110                         panic("indir_trunc: Bad saved buf %p blkno %jd",
 8111                             bp, (intmax_t)dbn);
 8112         } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
 8113                 /*
 8114                  * The lock prevents the buf dep list from changing and
 8115                  * indirects on devvp should only ever have one dependency.
 8116                  */
 8117                 indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
 8118                 if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
 8119                         panic("indir_trunc: Bad indirdep %p from buf %p",
 8120                             indirdep, bp);
 8121         } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
 8122             NOCRED, &bp) != 0) {
 8123                 brelse(bp);
 8124                 return;
 8125         }
 8126         ACQUIRE_LOCK(ump);
 8127         /* Protects against a race with complete_trunc_indir(). */
 8128         freework->fw_state &= ~INPROGRESS;
 8129         /*
 8130          * If we have an indirdep we need to enforce the truncation order
 8131          * and discard it when it is complete.
 8132          */
 8133         if (indirdep) {
 8134                 if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
 8135                     !TAILQ_EMPTY(&indirdep->ir_trunc)) {
 8136                         /*
 8137                          * Add the complete truncate to the list on the
 8138                          * indirdep to enforce in-order processing.
 8139                          */
 8140                         if (freework->fw_indir == NULL)
 8141                                 TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
 8142                                     freework, fw_next);
 8143                         FREE_LOCK(ump);
 8144                         return;
 8145                 }
 8146                 /*
 8147                  * If we're goingaway, free the indirdep.  Otherwise it will
 8148                  * linger until the write completes.
 8149                  */
 8150                 if (goingaway)
 8151                         free_indirdep(indirdep);
 8152         }
 8153         FREE_LOCK(ump);
 8154         /* Initialize pointers depending on block size. */
 8155         if (ump->um_fstype == UFS1) {
 8156                 bap1 = (ufs1_daddr_t *)bp->b_data;
 8157                 nb = bap1[freework->fw_off];
 8158                 ufs1fmt = 1;
 8159                 bap2 = NULL;
 8160         } else {
 8161                 bap2 = (ufs2_daddr_t *)bp->b_data;
 8162                 nb = bap2[freework->fw_off];
 8163                 ufs1fmt = 0;
 8164                 bap1 = NULL;
 8165         }
 8166         level = lbn_level(lbn);
 8167         needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
 8168         lbnadd = lbn_offset(fs, level);
 8169         nblocks = btodb(fs->fs_bsize);
 8170         nfreework = freework;
 8171         freedeps = 0;
 8172         cnt = 0;
 8173         /*
 8174          * Reclaim blocks.  Traverses into nested indirect levels and
 8175          * arranges for the current level to be freed when subordinates
 8176          * are free when journaling.
 8177          */
 8178         for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
 8179                 if (i != NINDIR(fs) - 1) {
 8180                         if (ufs1fmt)
 8181                                 nnb = bap1[i+1];
 8182                         else
 8183                                 nnb = bap2[i+1];
 8184                 } else
 8185                         nnb = 0;
 8186                 if (nb == 0)
 8187                         continue;
 8188                 cnt++;
 8189                 if (level != 0) {
 8190                         nlbn = (lbn + 1) - (i * lbnadd);
 8191                         if (needj != 0) {
 8192                                 nfreework = newfreework(ump, freeblks, freework,
 8193                                     nlbn, nb, fs->fs_frag, 0, 0);
 8194                                 freedeps++;
 8195                         }
 8196                         indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
 8197                 } else {
 8198                         struct freedep *freedep;
 8199 
 8200                         /*
 8201                          * Attempt to aggregate freedep dependencies for
 8202                          * all blocks being released to the same CG.
 8203                          */
 8204                         LIST_INIT(&wkhd);
 8205                         if (needj != 0 &&
 8206                             (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
 8207                                 freedep = newfreedep(freework);
 8208                                 WORKLIST_INSERT_UNLOCKED(&wkhd,
 8209                                     &freedep->fd_list);
 8210                                 freedeps++;
 8211                         }
 8212                         CTR3(KTR_SUJ,
 8213                             "indir_trunc: ino %d blkno %jd size %ld",
 8214                             freeblks->fb_inum, nb, fs->fs_bsize);
 8215                         ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
 8216                             fs->fs_bsize, freeblks->fb_inum,
 8217                             freeblks->fb_vtype, &wkhd);
 8218                 }
 8219         }
 8220         if (goingaway) {
 8221                 bp->b_flags |= B_INVAL | B_NOCACHE;
 8222                 brelse(bp);
 8223         }
 8224         freedblocks = 0;
 8225         if (level == 0)
 8226                 freedblocks = (nblocks * cnt);
 8227         if (needj == 0)
 8228                 freedblocks += nblocks;
 8229         freeblks_free(ump, freeblks, freedblocks);
 8230         /*
 8231          * If we are journaling set up the ref counts and offset so this
 8232          * indirect can be completed when its children are free.
 8233          */
 8234         if (needj) {
 8235                 ACQUIRE_LOCK(ump);
 8236                 freework->fw_off = i;
 8237                 freework->fw_ref += freedeps;
 8238                 freework->fw_ref -= NINDIR(fs) + 1;
 8239                 if (level == 0)
 8240                         freeblks->fb_cgwait += freedeps;
 8241                 if (freework->fw_ref == 0)
 8242                         freework_freeblock(freework);
 8243                 FREE_LOCK(ump);
 8244                 return;
 8245         }
 8246         /*
 8247          * If we're not journaling we can free the indirect now.
 8248          */
 8249         dbn = dbtofsb(fs, dbn);
 8250         CTR3(KTR_SUJ,
 8251             "indir_trunc 2: ino %d blkno %jd size %ld",
 8252             freeblks->fb_inum, dbn, fs->fs_bsize);
 8253         ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
 8254             freeblks->fb_inum, freeblks->fb_vtype, NULL);
 8255         /* Non SUJ softdep does single-threaded truncations. */
 8256         if (freework->fw_blkno == dbn) {
 8257                 freework->fw_state |= ALLCOMPLETE;
 8258                 ACQUIRE_LOCK(ump);
 8259                 handle_written_freework(freework);
 8260                 FREE_LOCK(ump);
 8261         }
 8262         return;
 8263 }
 8264 
 8265 /*
 8266  * Cancel an allocindir when it is removed via truncation.  When bp is not
 8267  * NULL the indirect never appeared on disk and is scheduled to be freed
 8268  * independently of the indir so we can more easily track journal work.
 8269  */
 8270 static void
 8271 cancel_allocindir(aip, bp, freeblks, trunc)
 8272         struct allocindir *aip;
 8273         struct buf *bp;
 8274         struct freeblks *freeblks;
 8275         int trunc;
 8276 {
 8277         struct indirdep *indirdep;
 8278         struct freefrag *freefrag;
 8279         struct newblk *newblk;
 8280 
 8281         newblk = (struct newblk *)aip;
 8282         LIST_REMOVE(aip, ai_next);
 8283         /*
 8284          * We must eliminate the pointer in bp if it must be freed on its
 8285          * own due to partial truncate or pending journal work.
 8286          */
 8287         if (bp && (trunc || newblk->nb_jnewblk)) {
 8288                 /*
 8289                  * Clear the pointer and mark the aip to be freed
 8290                  * directly if it never existed on disk.
 8291                  */
 8292                 aip->ai_state |= DELAYEDFREE;
 8293                 indirdep = aip->ai_indirdep;
 8294                 if (indirdep->ir_state & UFS1FMT)
 8295                         ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
 8296                 else
 8297                         ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
 8298         }
 8299         /*
 8300          * When truncating the previous pointer will be freed via
 8301          * savedbp.  Eliminate the freefrag which would dup free.
 8302          */
 8303         if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
 8304                 newblk->nb_freefrag = NULL;
 8305                 if (freefrag->ff_jdep)
 8306                         cancel_jfreefrag(
 8307                             WK_JFREEFRAG(freefrag->ff_jdep));
 8308                 jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
 8309                 WORKITEM_FREE(freefrag, D_FREEFRAG);
 8310         }
 8311         /*
 8312          * If the journal hasn't been written the jnewblk must be passed
 8313          * to the call to ffs_blkfree that reclaims the space.  We accomplish
 8314          * this by leaving the journal dependency on the newblk to be freed
 8315          * when a freework is created in handle_workitem_freeblocks().
 8316          */
 8317         cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
 8318         WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
 8319 }
 8320 
 8321 /*
 8322  * Create the mkdir dependencies for . and .. in a new directory.  Link them
 8323  * in to a newdirblk so any subsequent additions are tracked properly.  The
 8324  * caller is responsible for adding the mkdir1 dependency to the journal
 8325  * and updating id_mkdiradd.  This function returns with the per-filesystem
 8326  * lock held.
 8327  */
 8328 static struct mkdir *
 8329 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
 8330         struct diradd *dap;
 8331         ino_t newinum;
 8332         ino_t dinum;
 8333         struct buf *newdirbp;
 8334         struct mkdir **mkdirp;
 8335 {
 8336         struct newblk *newblk;
 8337         struct pagedep *pagedep;
 8338         struct inodedep *inodedep;
 8339         struct newdirblk *newdirblk;
 8340         struct mkdir *mkdir1, *mkdir2;
 8341         struct worklist *wk;
 8342         struct jaddref *jaddref;
 8343         struct ufsmount *ump;
 8344         struct mount *mp;
 8345 
 8346         mp = dap->da_list.wk_mp;
 8347         ump = VFSTOUFS(mp);
 8348         newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
 8349             M_SOFTDEP_FLAGS);
 8350         workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
 8351         LIST_INIT(&newdirblk->db_mkdir);
 8352         mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
 8353         workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
 8354         mkdir1->md_state = ATTACHED | MKDIR_BODY;
 8355         mkdir1->md_diradd = dap;
 8356         mkdir1->md_jaddref = NULL;
 8357         mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
 8358         workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
 8359         mkdir2->md_state = ATTACHED | MKDIR_PARENT;
 8360         mkdir2->md_diradd = dap;
 8361         mkdir2->md_jaddref = NULL;
 8362         if (MOUNTEDSUJ(mp) == 0) {
 8363                 mkdir1->md_state |= DEPCOMPLETE;
 8364                 mkdir2->md_state |= DEPCOMPLETE;
 8365         }
 8366         /*
 8367          * Dependency on "." and ".." being written to disk.
 8368          */
 8369         mkdir1->md_buf = newdirbp;
 8370         ACQUIRE_LOCK(VFSTOUFS(mp));
 8371         LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
 8372         /*
 8373          * We must link the pagedep, allocdirect, and newdirblk for
 8374          * the initial file page so the pointer to the new directory
 8375          * is not written until the directory contents are live and
 8376          * any subsequent additions are not marked live until the
 8377          * block is reachable via the inode.
 8378          */
 8379         if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
 8380                 panic("setup_newdir: lost pagedep");
 8381         LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
 8382                 if (wk->wk_type == D_ALLOCDIRECT)
 8383                         break;
 8384         if (wk == NULL)
 8385                 panic("setup_newdir: lost allocdirect");
 8386         if (pagedep->pd_state & NEWBLOCK)
 8387                 panic("setup_newdir: NEWBLOCK already set");
 8388         newblk = WK_NEWBLK(wk);
 8389         pagedep->pd_state |= NEWBLOCK;
 8390         pagedep->pd_newdirblk = newdirblk;
 8391         newdirblk->db_pagedep = pagedep;
 8392         WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
 8393         WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
 8394         /*
 8395          * Look up the inodedep for the parent directory so that we
 8396          * can link mkdir2 into the pending dotdot jaddref or
 8397          * the inode write if there is none.  If the inode is
 8398          * ALLCOMPLETE and no jaddref is present all dependencies have
 8399          * been satisfied and mkdir2 can be freed.
 8400          */
 8401         inodedep_lookup(mp, dinum, 0, &inodedep);
 8402         if (MOUNTEDSUJ(mp)) {
 8403                 if (inodedep == NULL)
 8404                         panic("setup_newdir: Lost parent.");
 8405                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 8406                     inoreflst);
 8407                 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
 8408                     (jaddref->ja_state & MKDIR_PARENT),
 8409                     ("setup_newdir: bad dotdot jaddref %p", jaddref));
 8410                 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
 8411                 mkdir2->md_jaddref = jaddref;
 8412                 jaddref->ja_mkdir = mkdir2;
 8413         } else if (inodedep == NULL ||
 8414             (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 8415                 dap->da_state &= ~MKDIR_PARENT;
 8416                 WORKITEM_FREE(mkdir2, D_MKDIR);
 8417                 mkdir2 = NULL;
 8418         } else {
 8419                 LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
 8420                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
 8421         }
 8422         *mkdirp = mkdir2;
 8423 
 8424         return (mkdir1);
 8425 }
 8426 
 8427 /*
 8428  * Directory entry addition dependencies.
 8429  * 
 8430  * When adding a new directory entry, the inode (with its incremented link
 8431  * count) must be written to disk before the directory entry's pointer to it.
 8432  * Also, if the inode is newly allocated, the corresponding freemap must be
 8433  * updated (on disk) before the directory entry's pointer. These requirements
 8434  * are met via undo/redo on the directory entry's pointer, which consists
 8435  * simply of the inode number.
 8436  * 
 8437  * As directory entries are added and deleted, the free space within a
 8438  * directory block can become fragmented.  The ufs filesystem will compact
 8439  * a fragmented directory block to make space for a new entry. When this
 8440  * occurs, the offsets of previously added entries change. Any "diradd"
 8441  * dependency structures corresponding to these entries must be updated with
 8442  * the new offsets.
 8443  */
 8444 
 8445 /*
 8446  * This routine is called after the in-memory inode's link
 8447  * count has been incremented, but before the directory entry's
 8448  * pointer to the inode has been set.
 8449  */
 8450 int
 8451 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 8452         struct buf *bp;         /* buffer containing directory block */
 8453         struct inode *dp;       /* inode for directory */
 8454         off_t diroffset;        /* offset of new entry in directory */
 8455         ino_t newinum;          /* inode referenced by new directory entry */
 8456         struct buf *newdirbp;   /* non-NULL => contents of new mkdir */
 8457         int isnewblk;           /* entry is in a newly allocated block */
 8458 {
 8459         int offset;             /* offset of new entry within directory block */
 8460         ufs_lbn_t lbn;          /* block in directory containing new entry */
 8461         struct fs *fs;
 8462         struct diradd *dap;
 8463         struct newblk *newblk;
 8464         struct pagedep *pagedep;
 8465         struct inodedep *inodedep;
 8466         struct newdirblk *newdirblk;
 8467         struct mkdir *mkdir1, *mkdir2;
 8468         struct jaddref *jaddref;
 8469         struct ufsmount *ump;
 8470         struct mount *mp;
 8471         int isindir;
 8472 
 8473         mp = ITOVFS(dp);
 8474         ump = VFSTOUFS(mp);
 8475         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 8476             ("softdep_setup_directory_add called on non-softdep filesystem"));
 8477         /*
 8478          * Whiteouts have no dependencies.
 8479          */
 8480         if (newinum == WINO) {
 8481                 if (newdirbp != NULL)
 8482                         bdwrite(newdirbp);
 8483                 return (0);
 8484         }
 8485         jaddref = NULL;
 8486         mkdir1 = mkdir2 = NULL;
 8487         fs = ump->um_fs;
 8488         lbn = lblkno(fs, diroffset);
 8489         offset = blkoff(fs, diroffset);
 8490         dap = malloc(sizeof(struct diradd), M_DIRADD,
 8491                 M_SOFTDEP_FLAGS|M_ZERO);
 8492         workitem_alloc(&dap->da_list, D_DIRADD, mp);
 8493         dap->da_offset = offset;
 8494         dap->da_newinum = newinum;
 8495         dap->da_state = ATTACHED;
 8496         LIST_INIT(&dap->da_jwork);
 8497         isindir = bp->b_lblkno >= NDADDR;
 8498         newdirblk = NULL;
 8499         if (isnewblk &&
 8500             (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
 8501                 newdirblk = malloc(sizeof(struct newdirblk),
 8502                     M_NEWDIRBLK, M_SOFTDEP_FLAGS);
 8503                 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
 8504                 LIST_INIT(&newdirblk->db_mkdir);
 8505         }
 8506         /*
 8507          * If we're creating a new directory setup the dependencies and set
 8508          * the dap state to wait for them.  Otherwise it's COMPLETE and
 8509          * we can move on.
 8510          */
 8511         if (newdirbp == NULL) {
 8512                 dap->da_state |= DEPCOMPLETE;
 8513                 ACQUIRE_LOCK(ump);
 8514         } else {
 8515                 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
 8516                 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
 8517                     &mkdir2);
 8518         }
 8519         /*
 8520          * Link into parent directory pagedep to await its being written.
 8521          */
 8522         pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
 8523 #ifdef DEBUG
 8524         if (diradd_lookup(pagedep, offset) != NULL)
 8525                 panic("softdep_setup_directory_add: %p already at off %d\n",
 8526                     diradd_lookup(pagedep, offset), offset);
 8527 #endif
 8528         dap->da_pagedep = pagedep;
 8529         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
 8530             da_pdlist);
 8531         inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
 8532         /*
 8533          * If we're journaling, link the diradd into the jaddref so it
 8534          * may be completed after the journal entry is written.  Otherwise,
 8535          * link the diradd into its inodedep.  If the inode is not yet
 8536          * written place it on the bufwait list, otherwise do the post-inode
 8537          * write processing to put it on the id_pendinghd list.
 8538          */
 8539         if (MOUNTEDSUJ(mp)) {
 8540                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 8541                     inoreflst);
 8542                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
 8543                     ("softdep_setup_directory_add: bad jaddref %p", jaddref));
 8544                 jaddref->ja_diroff = diroffset;
 8545                 jaddref->ja_diradd = dap;
 8546                 add_to_journal(&jaddref->ja_list);
 8547         } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
 8548                 diradd_inode_written(dap, inodedep);
 8549         else
 8550                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 8551         /*
 8552          * Add the journal entries for . and .. links now that the primary
 8553          * link is written.
 8554          */
 8555         if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
 8556                 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
 8557                     inoreflst, if_deps);
 8558                 KASSERT(jaddref != NULL &&
 8559                     jaddref->ja_ino == jaddref->ja_parent &&
 8560                     (jaddref->ja_state & MKDIR_BODY),
 8561                     ("softdep_setup_directory_add: bad dot jaddref %p",
 8562                     jaddref));
 8563                 mkdir1->md_jaddref = jaddref;
 8564                 jaddref->ja_mkdir = mkdir1;
 8565                 /*
 8566                  * It is important that the dotdot journal entry
 8567                  * is added prior to the dot entry since dot writes
 8568                  * both the dot and dotdot links.  These both must
 8569                  * be added after the primary link for the journal
 8570                  * to remain consistent.
 8571                  */
 8572                 add_to_journal(&mkdir2->md_jaddref->ja_list);
 8573                 add_to_journal(&jaddref->ja_list);
 8574         }
 8575         /*
 8576          * If we are adding a new directory remember this diradd so that if
 8577          * we rename it we can keep the dot and dotdot dependencies.  If
 8578          * we are adding a new name for an inode that has a mkdiradd we
 8579          * must be in rename and we have to move the dot and dotdot
 8580          * dependencies to this new name.  The old name is being orphaned
 8581          * soon.
 8582          */
 8583         if (mkdir1 != NULL) {
 8584                 if (inodedep->id_mkdiradd != NULL)
 8585                         panic("softdep_setup_directory_add: Existing mkdir");
 8586                 inodedep->id_mkdiradd = dap;
 8587         } else if (inodedep->id_mkdiradd)
 8588                 merge_diradd(inodedep, dap);
 8589         if (newdirblk != NULL) {
 8590                 /*
 8591                  * There is nothing to do if we are already tracking
 8592                  * this block.
 8593                  */
 8594                 if ((pagedep->pd_state & NEWBLOCK) != 0) {
 8595                         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 8596                         FREE_LOCK(ump);
 8597                         return (0);
 8598                 }
 8599                 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
 8600                     == 0)
 8601                         panic("softdep_setup_directory_add: lost entry");
 8602                 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
 8603                 pagedep->pd_state |= NEWBLOCK;
 8604                 pagedep->pd_newdirblk = newdirblk;
 8605                 newdirblk->db_pagedep = pagedep;
 8606                 FREE_LOCK(ump);
 8607                 /*
 8608                  * If we extended into an indirect signal direnter to sync.
 8609                  */
 8610                 if (isindir)
 8611                         return (1);
 8612                 return (0);
 8613         }
 8614         FREE_LOCK(ump);
 8615         return (0);
 8616 }
 8617 
 8618 /*
 8619  * This procedure is called to change the offset of a directory
 8620  * entry when compacting a directory block which must be owned
 8621  * exclusively by the caller. Note that the actual entry movement
 8622  * must be done in this procedure to ensure that no I/O completions
 8623  * occur while the move is in progress.
 8624  */
 8625 void 
 8626 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
 8627         struct buf *bp;         /* Buffer holding directory block. */
 8628         struct inode *dp;       /* inode for directory */
 8629         caddr_t base;           /* address of dp->i_offset */
 8630         caddr_t oldloc;         /* address of old directory location */
 8631         caddr_t newloc;         /* address of new directory location */
 8632         int entrysize;          /* size of directory entry */
 8633 {
 8634         int offset, oldoffset, newoffset;
 8635         struct pagedep *pagedep;
 8636         struct jmvref *jmvref;
 8637         struct diradd *dap;
 8638         struct direct *de;
 8639         struct mount *mp;
 8640         struct ufsmount *ump;
 8641         ufs_lbn_t lbn;
 8642         int flags;
 8643 
 8644         mp = ITOVFS(dp);
 8645         ump = VFSTOUFS(mp);
 8646         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 8647             ("softdep_change_directoryentry_offset called on "
 8648              "non-softdep filesystem"));
 8649         de = (struct direct *)oldloc;
 8650         jmvref = NULL;
 8651         flags = 0;
 8652         /*
 8653          * Moves are always journaled as it would be too complex to
 8654          * determine if any affected adds or removes are present in the
 8655          * journal.
 8656          */
 8657         if (MOUNTEDSUJ(mp)) {
 8658                 flags = DEPALLOC;
 8659                 jmvref = newjmvref(dp, de->d_ino,
 8660                     dp->i_offset + (oldloc - base),
 8661                     dp->i_offset + (newloc - base));
 8662         }
 8663         lbn = lblkno(ump->um_fs, dp->i_offset);
 8664         offset = blkoff(ump->um_fs, dp->i_offset);
 8665         oldoffset = offset + (oldloc - base);
 8666         newoffset = offset + (newloc - base);
 8667         ACQUIRE_LOCK(ump);
 8668         if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
 8669                 goto done;
 8670         dap = diradd_lookup(pagedep, oldoffset);
 8671         if (dap) {
 8672                 dap->da_offset = newoffset;
 8673                 newoffset = DIRADDHASH(newoffset);
 8674                 oldoffset = DIRADDHASH(oldoffset);
 8675                 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
 8676                     newoffset != oldoffset) {
 8677                         LIST_REMOVE(dap, da_pdlist);
 8678                         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
 8679                             dap, da_pdlist);
 8680                 }
 8681         }
 8682 done:
 8683         if (jmvref) {
 8684                 jmvref->jm_pagedep = pagedep;
 8685                 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
 8686                 add_to_journal(&jmvref->jm_list);
 8687         }
 8688         bcopy(oldloc, newloc, entrysize);
 8689         FREE_LOCK(ump);
 8690 }
 8691 
 8692 /*
 8693  * Move the mkdir dependencies and journal work from one diradd to another
 8694  * when renaming a directory.  The new name must depend on the mkdir deps
 8695  * completing as the old name did.  Directories can only have one valid link
 8696  * at a time so one must be canonical.
 8697  */
 8698 static void
 8699 merge_diradd(inodedep, newdap)
 8700         struct inodedep *inodedep;
 8701         struct diradd *newdap;
 8702 {
 8703         struct diradd *olddap;
 8704         struct mkdir *mkdir, *nextmd;
 8705         struct ufsmount *ump;
 8706         short state;
 8707 
 8708         olddap = inodedep->id_mkdiradd;
 8709         inodedep->id_mkdiradd = newdap;
 8710         if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
 8711                 newdap->da_state &= ~DEPCOMPLETE;
 8712                 ump = VFSTOUFS(inodedep->id_list.wk_mp);
 8713                 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
 8714                      mkdir = nextmd) {
 8715                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
 8716                         if (mkdir->md_diradd != olddap)
 8717                                 continue;
 8718                         mkdir->md_diradd = newdap;
 8719                         state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
 8720                         newdap->da_state |= state;
 8721                         olddap->da_state &= ~state;
 8722                         if ((olddap->da_state &
 8723                             (MKDIR_PARENT | MKDIR_BODY)) == 0)
 8724                                 break;
 8725                 }
 8726                 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
 8727                         panic("merge_diradd: unfound ref");
 8728         }
 8729         /*
 8730          * Any mkdir related journal items are not safe to be freed until
 8731          * the new name is stable.
 8732          */
 8733         jwork_move(&newdap->da_jwork, &olddap->da_jwork);
 8734         olddap->da_state |= DEPCOMPLETE;
 8735         complete_diradd(olddap);
 8736 }
 8737 
 8738 /*
 8739  * Move the diradd to the pending list when all diradd dependencies are
 8740  * complete.
 8741  */
 8742 static void
 8743 complete_diradd(dap)
 8744         struct diradd *dap;
 8745 {
 8746         struct pagedep *pagedep;
 8747 
 8748         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 8749                 if (dap->da_state & DIRCHG)
 8750                         pagedep = dap->da_previous->dm_pagedep;
 8751                 else
 8752                         pagedep = dap->da_pagedep;
 8753                 LIST_REMOVE(dap, da_pdlist);
 8754                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 8755         }
 8756 }
 8757 
 8758 /*
 8759  * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
 8760  * add entries and conditonally journal the remove.
 8761  */
 8762 static void
 8763 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
 8764         struct diradd *dap;
 8765         struct dirrem *dirrem;
 8766         struct jremref *jremref;
 8767         struct jremref *dotremref;
 8768         struct jremref *dotdotremref;
 8769 {
 8770         struct inodedep *inodedep;
 8771         struct jaddref *jaddref;
 8772         struct inoref *inoref;
 8773         struct ufsmount *ump;
 8774         struct mkdir *mkdir;
 8775 
 8776         /*
 8777          * If no remove references were allocated we're on a non-journaled
 8778          * filesystem and can skip the cancel step.
 8779          */
 8780         if (jremref == NULL) {
 8781                 free_diradd(dap, NULL);
 8782                 return;
 8783         }
 8784         /*
 8785          * Cancel the primary name an free it if it does not require
 8786          * journaling.
 8787          */
 8788         if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
 8789             0, &inodedep) != 0) {
 8790                 /* Abort the addref that reference this diradd.  */
 8791                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
 8792                         if (inoref->if_list.wk_type != D_JADDREF)
 8793                                 continue;
 8794                         jaddref = (struct jaddref *)inoref;
 8795                         if (jaddref->ja_diradd != dap)
 8796                                 continue;
 8797                         if (cancel_jaddref(jaddref, inodedep,
 8798                             &dirrem->dm_jwork) == 0) {
 8799                                 free_jremref(jremref);
 8800                                 jremref = NULL;
 8801                         }
 8802                         break;
 8803                 }
 8804         }
 8805         /*
 8806          * Cancel subordinate names and free them if they do not require
 8807          * journaling.
 8808          */
 8809         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
 8810                 ump = VFSTOUFS(dap->da_list.wk_mp);
 8811                 LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
 8812                         if (mkdir->md_diradd != dap)
 8813                                 continue;
 8814                         if ((jaddref = mkdir->md_jaddref) == NULL)
 8815                                 continue;
 8816                         mkdir->md_jaddref = NULL;
 8817                         if (mkdir->md_state & MKDIR_PARENT) {
 8818                                 if (cancel_jaddref(jaddref, NULL,
 8819                                     &dirrem->dm_jwork) == 0) {
 8820                                         free_jremref(dotdotremref);
 8821                                         dotdotremref = NULL;
 8822                                 }
 8823                         } else {
 8824                                 if (cancel_jaddref(jaddref, inodedep,
 8825                                     &dirrem->dm_jwork) == 0) {
 8826                                         free_jremref(dotremref);
 8827                                         dotremref = NULL;
 8828                                 }
 8829                         }
 8830                 }
 8831         }
 8832 
 8833         if (jremref)
 8834                 journal_jremref(dirrem, jremref, inodedep);
 8835         if (dotremref)
 8836                 journal_jremref(dirrem, dotremref, inodedep);
 8837         if (dotdotremref)
 8838                 journal_jremref(dirrem, dotdotremref, NULL);
 8839         jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
 8840         free_diradd(dap, &dirrem->dm_jwork);
 8841 }
 8842 
 8843 /*
 8844  * Free a diradd dependency structure. This routine must be called
 8845  * with splbio interrupts blocked.
 8846  */
 8847 static void
 8848 free_diradd(dap, wkhd)
 8849         struct diradd *dap;
 8850         struct workhead *wkhd;
 8851 {
 8852         struct dirrem *dirrem;
 8853         struct pagedep *pagedep;
 8854         struct inodedep *inodedep;
 8855         struct mkdir *mkdir, *nextmd;
 8856         struct ufsmount *ump;
 8857 
 8858         ump = VFSTOUFS(dap->da_list.wk_mp);
 8859         LOCK_OWNED(ump);
 8860         LIST_REMOVE(dap, da_pdlist);
 8861         if (dap->da_state & ONWORKLIST)
 8862                 WORKLIST_REMOVE(&dap->da_list);
 8863         if ((dap->da_state & DIRCHG) == 0) {
 8864                 pagedep = dap->da_pagedep;
 8865         } else {
 8866                 dirrem = dap->da_previous;
 8867                 pagedep = dirrem->dm_pagedep;
 8868                 dirrem->dm_dirinum = pagedep->pd_ino;
 8869                 dirrem->dm_state |= COMPLETE;
 8870                 if (LIST_EMPTY(&dirrem->dm_jremrefhd))
 8871                         add_to_worklist(&dirrem->dm_list, 0);
 8872         }
 8873         if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
 8874             0, &inodedep) != 0)
 8875                 if (inodedep->id_mkdiradd == dap)
 8876                         inodedep->id_mkdiradd = NULL;
 8877         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
 8878                 for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
 8879                      mkdir = nextmd) {
 8880                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
 8881                         if (mkdir->md_diradd != dap)
 8882                                 continue;
 8883                         dap->da_state &=
 8884                             ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
 8885                         LIST_REMOVE(mkdir, md_mkdirs);
 8886                         if (mkdir->md_state & ONWORKLIST)
 8887                                 WORKLIST_REMOVE(&mkdir->md_list);
 8888                         if (mkdir->md_jaddref != NULL)
 8889                                 panic("free_diradd: Unexpected jaddref");
 8890                         WORKITEM_FREE(mkdir, D_MKDIR);
 8891                         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
 8892                                 break;
 8893                 }
 8894                 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
 8895                         panic("free_diradd: unfound ref");
 8896         }
 8897         if (inodedep)
 8898                 free_inodedep(inodedep);
 8899         /*
 8900          * Free any journal segments waiting for the directory write.
 8901          */
 8902         handle_jwork(&dap->da_jwork);
 8903         WORKITEM_FREE(dap, D_DIRADD);
 8904 }
 8905 
 8906 /*
 8907  * Directory entry removal dependencies.
 8908  * 
 8909  * When removing a directory entry, the entry's inode pointer must be
 8910  * zero'ed on disk before the corresponding inode's link count is decremented
 8911  * (possibly freeing the inode for re-use). This dependency is handled by
 8912  * updating the directory entry but delaying the inode count reduction until
 8913  * after the directory block has been written to disk. After this point, the
 8914  * inode count can be decremented whenever it is convenient.
 8915  */
 8916 
 8917 /*
 8918  * This routine should be called immediately after removing
 8919  * a directory entry.  The inode's link count should not be
 8920  * decremented by the calling procedure -- the soft updates
 8921  * code will do this task when it is safe.
 8922  */
 8923 void 
 8924 softdep_setup_remove(bp, dp, ip, isrmdir)
 8925         struct buf *bp;         /* buffer containing directory block */
 8926         struct inode *dp;       /* inode for the directory being modified */
 8927         struct inode *ip;       /* inode for directory entry being removed */
 8928         int isrmdir;            /* indicates if doing RMDIR */
 8929 {
 8930         struct dirrem *dirrem, *prevdirrem;
 8931         struct inodedep *inodedep;
 8932         struct ufsmount *ump;
 8933         int direct;
 8934 
 8935         ump = ITOUMP(ip);
 8936         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 8937             ("softdep_setup_remove called on non-softdep filesystem"));
 8938         /*
 8939          * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
 8940          * newdirrem() to setup the full directory remove which requires
 8941          * isrmdir > 1.
 8942          */
 8943         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 8944         /*
 8945          * Add the dirrem to the inodedep's pending remove list for quick
 8946          * discovery later.
 8947          */
 8948         if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0)
 8949                 panic("softdep_setup_remove: Lost inodedep.");
 8950         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
 8951         dirrem->dm_state |= ONDEPLIST;
 8952         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
 8953 
 8954         /*
 8955          * If the COMPLETE flag is clear, then there were no active
 8956          * entries and we want to roll back to a zeroed entry until
 8957          * the new inode is committed to disk. If the COMPLETE flag is
 8958          * set then we have deleted an entry that never made it to
 8959          * disk. If the entry we deleted resulted from a name change,
 8960          * then the old name still resides on disk. We cannot delete
 8961          * its inode (returned to us in prevdirrem) until the zeroed
 8962          * directory entry gets to disk. The new inode has never been
 8963          * referenced on the disk, so can be deleted immediately.
 8964          */
 8965         if ((dirrem->dm_state & COMPLETE) == 0) {
 8966                 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
 8967                     dm_next);
 8968                 FREE_LOCK(ump);
 8969         } else {
 8970                 if (prevdirrem != NULL)
 8971                         LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
 8972                             prevdirrem, dm_next);
 8973                 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
 8974                 direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
 8975                 FREE_LOCK(ump);
 8976                 if (direct)
 8977                         handle_workitem_remove(dirrem, 0);
 8978         }
 8979 }
 8980 
 8981 /*
 8982  * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
 8983  * pd_pendinghd list of a pagedep.
 8984  */
 8985 static struct diradd *
 8986 diradd_lookup(pagedep, offset)
 8987         struct pagedep *pagedep;
 8988         int offset;
 8989 {
 8990         struct diradd *dap;
 8991 
 8992         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
 8993                 if (dap->da_offset == offset)
 8994                         return (dap);
 8995         LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
 8996                 if (dap->da_offset == offset)
 8997                         return (dap);
 8998         return (NULL);
 8999 }
 9000 
 9001 /*
 9002  * Search for a .. diradd dependency in a directory that is being removed.
 9003  * If the directory was renamed to a new parent we have a diradd rather
 9004  * than a mkdir for the .. entry.  We need to cancel it now before
 9005  * it is found in truncate().
 9006  */
 9007 static struct jremref *
 9008 cancel_diradd_dotdot(ip, dirrem, jremref)
 9009         struct inode *ip;
 9010         struct dirrem *dirrem;
 9011         struct jremref *jremref;
 9012 {
 9013         struct pagedep *pagedep;
 9014         struct diradd *dap;
 9015         struct worklist *wk;
 9016 
 9017         if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0)
 9018                 return (jremref);
 9019         dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
 9020         if (dap == NULL)
 9021                 return (jremref);
 9022         cancel_diradd(dap, dirrem, jremref, NULL, NULL);
 9023         /*
 9024          * Mark any journal work as belonging to the parent so it is freed
 9025          * with the .. reference.
 9026          */
 9027         LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
 9028                 wk->wk_state |= MKDIR_PARENT;
 9029         return (NULL);
 9030 }
 9031 
 9032 /*
 9033  * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
 9034  * replace it with a dirrem/diradd pair as a result of re-parenting a
 9035  * directory.  This ensures that we don't simultaneously have a mkdir and
 9036  * a diradd for the same .. entry.
 9037  */
 9038 static struct jremref *
 9039 cancel_mkdir_dotdot(ip, dirrem, jremref)
 9040         struct inode *ip;
 9041         struct dirrem *dirrem;
 9042         struct jremref *jremref;
 9043 {
 9044         struct inodedep *inodedep;
 9045         struct jaddref *jaddref;
 9046         struct ufsmount *ump;
 9047         struct mkdir *mkdir;
 9048         struct diradd *dap;
 9049         struct mount *mp;
 9050 
 9051         mp = ITOVFS(ip);
 9052         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
 9053                 return (jremref);
 9054         dap = inodedep->id_mkdiradd;
 9055         if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
 9056                 return (jremref);
 9057         ump = VFSTOUFS(inodedep->id_list.wk_mp);
 9058         for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
 9059             mkdir = LIST_NEXT(mkdir, md_mkdirs))
 9060                 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
 9061                         break;
 9062         if (mkdir == NULL)
 9063                 panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
 9064         if ((jaddref = mkdir->md_jaddref) != NULL) {
 9065                 mkdir->md_jaddref = NULL;
 9066                 jaddref->ja_state &= ~MKDIR_PARENT;
 9067                 if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0)
 9068                         panic("cancel_mkdir_dotdot: Lost parent inodedep");
 9069                 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
 9070                         journal_jremref(dirrem, jremref, inodedep);
 9071                         jremref = NULL;
 9072                 }
 9073         }
 9074         if (mkdir->md_state & ONWORKLIST)
 9075                 WORKLIST_REMOVE(&mkdir->md_list);
 9076         mkdir->md_state |= ALLCOMPLETE;
 9077         complete_mkdir(mkdir);
 9078         return (jremref);
 9079 }
 9080 
 9081 static void
 9082 journal_jremref(dirrem, jremref, inodedep)
 9083         struct dirrem *dirrem;
 9084         struct jremref *jremref;
 9085         struct inodedep *inodedep;
 9086 {
 9087 
 9088         if (inodedep == NULL)
 9089                 if (inodedep_lookup(jremref->jr_list.wk_mp,
 9090                     jremref->jr_ref.if_ino, 0, &inodedep) == 0)
 9091                         panic("journal_jremref: Lost inodedep");
 9092         LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
 9093         TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
 9094         add_to_journal(&jremref->jr_list);
 9095 }
 9096 
 9097 static void
 9098 dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
 9099         struct dirrem *dirrem;
 9100         struct jremref *jremref;
 9101         struct jremref *dotremref;
 9102         struct jremref *dotdotremref;
 9103 {
 9104         struct inodedep *inodedep;
 9105 
 9106 
 9107         if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
 9108             &inodedep) == 0)
 9109                 panic("dirrem_journal: Lost inodedep");
 9110         journal_jremref(dirrem, jremref, inodedep);
 9111         if (dotremref)
 9112                 journal_jremref(dirrem, dotremref, inodedep);
 9113         if (dotdotremref)
 9114                 journal_jremref(dirrem, dotdotremref, NULL);
 9115 }
 9116 
 9117 /*
 9118  * Allocate a new dirrem if appropriate and return it along with
 9119  * its associated pagedep. Called without a lock, returns with lock.
 9120  */
 9121 static struct dirrem *
 9122 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
 9123         struct buf *bp;         /* buffer containing directory block */
 9124         struct inode *dp;       /* inode for the directory being modified */
 9125         struct inode *ip;       /* inode for directory entry being removed */
 9126         int isrmdir;            /* indicates if doing RMDIR */
 9127         struct dirrem **prevdirremp; /* previously referenced inode, if any */
 9128 {
 9129         int offset;
 9130         ufs_lbn_t lbn;
 9131         struct diradd *dap;
 9132         struct dirrem *dirrem;
 9133         struct pagedep *pagedep;
 9134         struct jremref *jremref;
 9135         struct jremref *dotremref;
 9136         struct jremref *dotdotremref;
 9137         struct vnode *dvp;
 9138         struct ufsmount *ump;
 9139 
 9140         /*
 9141          * Whiteouts have no deletion dependencies.
 9142          */
 9143         if (ip == NULL)
 9144                 panic("newdirrem: whiteout");
 9145         dvp = ITOV(dp);
 9146         ump = ITOUMP(dp);
 9147 
 9148         /*
 9149          * If the system is over its limit and our filesystem is
 9150          * responsible for more than our share of that usage and
 9151          * we are not a snapshot, request some inodedep cleanup.
 9152          * Limiting the number of dirrem structures will also limit
 9153          * the number of freefile and freeblks structures.
 9154          */
 9155         ACQUIRE_LOCK(ump);
 9156         if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM))
 9157                 schedule_cleanup(UFSTOVFS(ump));
 9158         else
 9159                 FREE_LOCK(ump);
 9160         dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
 9161             M_ZERO);
 9162         workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
 9163         LIST_INIT(&dirrem->dm_jremrefhd);
 9164         LIST_INIT(&dirrem->dm_jwork);
 9165         dirrem->dm_state = isrmdir ? RMDIR : 0;
 9166         dirrem->dm_oldinum = ip->i_number;
 9167         *prevdirremp = NULL;
 9168         /*
 9169          * Allocate remove reference structures to track journal write
 9170          * dependencies.  We will always have one for the link and
 9171          * when doing directories we will always have one more for dot.
 9172          * When renaming a directory we skip the dotdot link change so
 9173          * this is not needed.
 9174          */
 9175         jremref = dotremref = dotdotremref = NULL;
 9176         if (DOINGSUJ(dvp)) {
 9177                 if (isrmdir) {
 9178                         jremref = newjremref(dirrem, dp, ip, dp->i_offset,
 9179                             ip->i_effnlink + 2);
 9180                         dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
 9181                             ip->i_effnlink + 1);
 9182                         dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
 9183                             dp->i_effnlink + 1);
 9184                         dotdotremref->jr_state |= MKDIR_PARENT;
 9185                 } else
 9186                         jremref = newjremref(dirrem, dp, ip, dp->i_offset,
 9187                             ip->i_effnlink + 1);
 9188         }
 9189         ACQUIRE_LOCK(ump);
 9190         lbn = lblkno(ump->um_fs, dp->i_offset);
 9191         offset = blkoff(ump->um_fs, dp->i_offset);
 9192         pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC,
 9193             &pagedep);
 9194         dirrem->dm_pagedep = pagedep;
 9195         dirrem->dm_offset = offset;
 9196         /*
 9197          * If we're renaming a .. link to a new directory, cancel any
 9198          * existing MKDIR_PARENT mkdir.  If it has already been canceled
 9199          * the jremref is preserved for any potential diradd in this
 9200          * location.  This can not coincide with a rmdir.
 9201          */
 9202         if (dp->i_offset == DOTDOT_OFFSET) {
 9203                 if (isrmdir)
 9204                         panic("newdirrem: .. directory change during remove?");
 9205                 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
 9206         }
 9207         /*
 9208          * If we're removing a directory search for the .. dependency now and
 9209          * cancel it.  Any pending journal work will be added to the dirrem
 9210          * to be completed when the workitem remove completes.
 9211          */
 9212         if (isrmdir)
 9213                 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
 9214         /*
 9215          * Check for a diradd dependency for the same directory entry.
 9216          * If present, then both dependencies become obsolete and can
 9217          * be de-allocated.
 9218          */
 9219         dap = diradd_lookup(pagedep, offset);
 9220         if (dap == NULL) {
 9221                 /*
 9222                  * Link the jremref structures into the dirrem so they are
 9223                  * written prior to the pagedep.
 9224                  */
 9225                 if (jremref)
 9226                         dirrem_journal(dirrem, jremref, dotremref,
 9227                             dotdotremref);
 9228                 return (dirrem);
 9229         }
 9230         /*
 9231          * Must be ATTACHED at this point.
 9232          */
 9233         if ((dap->da_state & ATTACHED) == 0)
 9234                 panic("newdirrem: not ATTACHED");
 9235         if (dap->da_newinum != ip->i_number)
 9236                 panic("newdirrem: inum %ju should be %ju",
 9237                     (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
 9238         /*
 9239          * If we are deleting a changed name that never made it to disk,
 9240          * then return the dirrem describing the previous inode (which
 9241          * represents the inode currently referenced from this entry on disk).
 9242          */
 9243         if ((dap->da_state & DIRCHG) != 0) {
 9244                 *prevdirremp = dap->da_previous;
 9245                 dap->da_state &= ~DIRCHG;
 9246                 dap->da_pagedep = pagedep;
 9247         }
 9248         /*
 9249          * We are deleting an entry that never made it to disk.
 9250          * Mark it COMPLETE so we can delete its inode immediately.
 9251          */
 9252         dirrem->dm_state |= COMPLETE;
 9253         cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
 9254 #ifdef SUJ_DEBUG
 9255         if (isrmdir == 0) {
 9256                 struct worklist *wk;
 9257 
 9258                 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
 9259                         if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
 9260                                 panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
 9261         }
 9262 #endif
 9263 
 9264         return (dirrem);
 9265 }
 9266 
 9267 /*
 9268  * Directory entry change dependencies.
 9269  * 
 9270  * Changing an existing directory entry requires that an add operation
 9271  * be completed first followed by a deletion. The semantics for the addition
 9272  * are identical to the description of adding a new entry above except
 9273  * that the rollback is to the old inode number rather than zero. Once
 9274  * the addition dependency is completed, the removal is done as described
 9275  * in the removal routine above.
 9276  */
 9277 
 9278 /*
 9279  * This routine should be called immediately after changing
 9280  * a directory entry.  The inode's link count should not be
 9281  * decremented by the calling procedure -- the soft updates
 9282  * code will perform this task when it is safe.
 9283  */
 9284 void 
 9285 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 9286         struct buf *bp;         /* buffer containing directory block */
 9287         struct inode *dp;       /* inode for the directory being modified */
 9288         struct inode *ip;       /* inode for directory entry being removed */
 9289         ino_t newinum;          /* new inode number for changed entry */
 9290         int isrmdir;            /* indicates if doing RMDIR */
 9291 {
 9292         int offset;
 9293         struct diradd *dap = NULL;
 9294         struct dirrem *dirrem, *prevdirrem;
 9295         struct pagedep *pagedep;
 9296         struct inodedep *inodedep;
 9297         struct jaddref *jaddref;
 9298         struct mount *mp;
 9299         struct ufsmount *ump;
 9300 
 9301         mp = ITOVFS(dp);
 9302         ump = VFSTOUFS(mp);
 9303         offset = blkoff(ump->um_fs, dp->i_offset);
 9304         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 9305            ("softdep_setup_directory_change called on non-softdep filesystem"));
 9306 
 9307         /*
 9308          * Whiteouts do not need diradd dependencies.
 9309          */
 9310         if (newinum != WINO) {
 9311                 dap = malloc(sizeof(struct diradd),
 9312                     M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
 9313                 workitem_alloc(&dap->da_list, D_DIRADD, mp);
 9314                 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
 9315                 dap->da_offset = offset;
 9316                 dap->da_newinum = newinum;
 9317                 LIST_INIT(&dap->da_jwork);
 9318         }
 9319 
 9320         /*
 9321          * Allocate a new dirrem and ACQUIRE_LOCK.
 9322          */
 9323         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 9324         pagedep = dirrem->dm_pagedep;
 9325         /*
 9326          * The possible values for isrmdir:
 9327          *      0 - non-directory file rename
 9328          *      1 - directory rename within same directory
 9329          *   inum - directory rename to new directory of given inode number
 9330          * When renaming to a new directory, we are both deleting and
 9331          * creating a new directory entry, so the link count on the new
 9332          * directory should not change. Thus we do not need the followup
 9333          * dirrem which is usually done in handle_workitem_remove. We set
 9334          * the DIRCHG flag to tell handle_workitem_remove to skip the 
 9335          * followup dirrem.
 9336          */
 9337         if (isrmdir > 1)
 9338                 dirrem->dm_state |= DIRCHG;
 9339 
 9340         /*
 9341          * Whiteouts have no additional dependencies,
 9342          * so just put the dirrem on the correct list.
 9343          */
 9344         if (newinum == WINO) {
 9345                 if ((dirrem->dm_state & COMPLETE) == 0) {
 9346                         LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
 9347                             dm_next);
 9348                 } else {
 9349                         dirrem->dm_dirinum = pagedep->pd_ino;
 9350                         if (LIST_EMPTY(&dirrem->dm_jremrefhd))
 9351                                 add_to_worklist(&dirrem->dm_list, 0);
 9352                 }
 9353                 FREE_LOCK(ump);
 9354                 return;
 9355         }
 9356         /*
 9357          * Add the dirrem to the inodedep's pending remove list for quick
 9358          * discovery later.  A valid nlinkdelta ensures that this lookup
 9359          * will not fail.
 9360          */
 9361         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
 9362                 panic("softdep_setup_directory_change: Lost inodedep.");
 9363         dirrem->dm_state |= ONDEPLIST;
 9364         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
 9365 
 9366         /*
 9367          * If the COMPLETE flag is clear, then there were no active
 9368          * entries and we want to roll back to the previous inode until
 9369          * the new inode is committed to disk. If the COMPLETE flag is
 9370          * set, then we have deleted an entry that never made it to disk.
 9371          * If the entry we deleted resulted from a name change, then the old
 9372          * inode reference still resides on disk. Any rollback that we do
 9373          * needs to be to that old inode (returned to us in prevdirrem). If
 9374          * the entry we deleted resulted from a create, then there is
 9375          * no entry on the disk, so we want to roll back to zero rather
 9376          * than the uncommitted inode. In either of the COMPLETE cases we
 9377          * want to immediately free the unwritten and unreferenced inode.
 9378          */
 9379         if ((dirrem->dm_state & COMPLETE) == 0) {
 9380                 dap->da_previous = dirrem;
 9381         } else {
 9382                 if (prevdirrem != NULL) {
 9383                         dap->da_previous = prevdirrem;
 9384                 } else {
 9385                         dap->da_state &= ~DIRCHG;
 9386                         dap->da_pagedep = pagedep;
 9387                 }
 9388                 dirrem->dm_dirinum = pagedep->pd_ino;
 9389                 if (LIST_EMPTY(&dirrem->dm_jremrefhd))
 9390                         add_to_worklist(&dirrem->dm_list, 0);
 9391         }
 9392         /*
 9393          * Lookup the jaddref for this journal entry.  We must finish
 9394          * initializing it and make the diradd write dependent on it.
 9395          * If we're not journaling, put it on the id_bufwait list if the
 9396          * inode is not yet written. If it is written, do the post-inode
 9397          * write processing to put it on the id_pendinghd list.
 9398          */
 9399         inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
 9400         if (MOUNTEDSUJ(mp)) {
 9401                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 9402                     inoreflst);
 9403                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
 9404                     ("softdep_setup_directory_change: bad jaddref %p",
 9405                     jaddref));
 9406                 jaddref->ja_diroff = dp->i_offset;
 9407                 jaddref->ja_diradd = dap;
 9408                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
 9409                     dap, da_pdlist);
 9410                 add_to_journal(&jaddref->ja_list);
 9411         } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 9412                 dap->da_state |= COMPLETE;
 9413                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 9414                 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 9415         } else {
 9416                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
 9417                     dap, da_pdlist);
 9418                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 9419         }
 9420         /*
 9421          * If we're making a new name for a directory that has not been
 9422          * committed when need to move the dot and dotdot references to
 9423          * this new name.
 9424          */
 9425         if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
 9426                 merge_diradd(inodedep, dap);
 9427         FREE_LOCK(ump);
 9428 }
 9429 
 9430 /*
 9431  * Called whenever the link count on an inode is changed.
 9432  * It creates an inode dependency so that the new reference(s)
 9433  * to the inode cannot be committed to disk until the updated
 9434  * inode has been written.
 9435  */
 9436 void
 9437 softdep_change_linkcnt(ip)
 9438         struct inode *ip;       /* the inode with the increased link count */
 9439 {
 9440         struct inodedep *inodedep;
 9441         struct ufsmount *ump;
 9442 
 9443         ump = ITOUMP(ip);
 9444         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 9445             ("softdep_change_linkcnt called on non-softdep filesystem"));
 9446         ACQUIRE_LOCK(ump);
 9447         inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
 9448         if (ip->i_nlink < ip->i_effnlink)
 9449                 panic("softdep_change_linkcnt: bad delta");
 9450         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 9451         FREE_LOCK(ump);
 9452 }
 9453 
 9454 /*
 9455  * Attach a sbdep dependency to the superblock buf so that we can keep
 9456  * track of the head of the linked list of referenced but unlinked inodes.
 9457  */
 9458 void
 9459 softdep_setup_sbupdate(ump, fs, bp)
 9460         struct ufsmount *ump;
 9461         struct fs *fs;
 9462         struct buf *bp;
 9463 {
 9464         struct sbdep *sbdep;
 9465         struct worklist *wk;
 9466 
 9467         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
 9468             ("softdep_setup_sbupdate called on non-softdep filesystem"));
 9469         LIST_FOREACH(wk, &bp->b_dep, wk_list)
 9470                 if (wk->wk_type == D_SBDEP)
 9471                         break;
 9472         if (wk != NULL)
 9473                 return;
 9474         sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
 9475         workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
 9476         sbdep->sb_fs = fs;
 9477         sbdep->sb_ump = ump;
 9478         ACQUIRE_LOCK(ump);
 9479         WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
 9480         FREE_LOCK(ump);
 9481 }
 9482 
 9483 /*
 9484  * Return the first unlinked inodedep which is ready to be the head of the
 9485  * list.  The inodedep and all those after it must have valid next pointers.
 9486  */
 9487 static struct inodedep *
 9488 first_unlinked_inodedep(ump)
 9489         struct ufsmount *ump;
 9490 {
 9491         struct inodedep *inodedep;
 9492         struct inodedep *idp;
 9493 
 9494         LOCK_OWNED(ump);
 9495         for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
 9496             inodedep; inodedep = idp) {
 9497                 if ((inodedep->id_state & UNLINKNEXT) == 0)
 9498                         return (NULL);
 9499                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
 9500                 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
 9501                         break;
 9502                 if ((inodedep->id_state & UNLINKPREV) == 0)
 9503                         break;
 9504         }
 9505         return (inodedep);
 9506 }
 9507 
 9508 /*
 9509  * Set the sujfree unlinked head pointer prior to writing a superblock.
 9510  */
 9511 static void
 9512 initiate_write_sbdep(sbdep)
 9513         struct sbdep *sbdep;
 9514 {
 9515         struct inodedep *inodedep;
 9516         struct fs *bpfs;
 9517         struct fs *fs;
 9518 
 9519         bpfs = sbdep->sb_fs;
 9520         fs = sbdep->sb_ump->um_fs;
 9521         inodedep = first_unlinked_inodedep(sbdep->sb_ump);
 9522         if (inodedep) {
 9523                 fs->fs_sujfree = inodedep->id_ino;
 9524                 inodedep->id_state |= UNLINKPREV;
 9525         } else
 9526                 fs->fs_sujfree = 0;
 9527         bpfs->fs_sujfree = fs->fs_sujfree;
 9528 }
 9529 
 9530 /*
 9531  * After a superblock is written determine whether it must be written again
 9532  * due to a changing unlinked list head.
 9533  */
 9534 static int
 9535 handle_written_sbdep(sbdep, bp)
 9536         struct sbdep *sbdep;
 9537         struct buf *bp;
 9538 {
 9539         struct inodedep *inodedep;
 9540         struct fs *fs;
 9541 
 9542         LOCK_OWNED(sbdep->sb_ump);
 9543         fs = sbdep->sb_fs;
 9544         /*
 9545          * If the superblock doesn't match the in-memory list start over.
 9546          */
 9547         inodedep = first_unlinked_inodedep(sbdep->sb_ump);
 9548         if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
 9549             (inodedep == NULL && fs->fs_sujfree != 0)) {
 9550                 bdirty(bp);
 9551                 return (1);
 9552         }
 9553         WORKITEM_FREE(sbdep, D_SBDEP);
 9554         if (fs->fs_sujfree == 0)
 9555                 return (0);
 9556         /*
 9557          * Now that we have a record of this inode in stable store allow it
 9558          * to be written to free up pending work.  Inodes may see a lot of
 9559          * write activity after they are unlinked which we must not hold up.
 9560          */
 9561         for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
 9562                 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
 9563                         panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
 9564                             inodedep, inodedep->id_state);
 9565                 if (inodedep->id_state & UNLINKONLIST)
 9566                         break;
 9567                 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
 9568         }
 9569 
 9570         return (0);
 9571 }
 9572 
 9573 /*
 9574  * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
 9575  */
 9576 static void
 9577 unlinked_inodedep(mp, inodedep)
 9578         struct mount *mp;
 9579         struct inodedep *inodedep;
 9580 {
 9581         struct ufsmount *ump;
 9582 
 9583         ump = VFSTOUFS(mp);
 9584         LOCK_OWNED(ump);
 9585         if (MOUNTEDSUJ(mp) == 0)
 9586                 return;
 9587         ump->um_fs->fs_fmod = 1;
 9588         if (inodedep->id_state & UNLINKED)
 9589                 panic("unlinked_inodedep: %p already unlinked\n", inodedep);
 9590         inodedep->id_state |= UNLINKED;
 9591         TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
 9592 }
 9593 
 9594 /*
 9595  * Remove an inodedep from the unlinked inodedep list.  This may require
 9596  * disk writes if the inode has made it that far.
 9597  */
 9598 static void
 9599 clear_unlinked_inodedep(inodedep)
 9600         struct inodedep *inodedep;
 9601 {
 9602         struct ufsmount *ump;
 9603         struct inodedep *idp;
 9604         struct inodedep *idn;
 9605         struct fs *fs;
 9606         struct buf *bp;
 9607         ino_t ino;
 9608         ino_t nino;
 9609         ino_t pino;
 9610         int error;
 9611 
 9612         ump = VFSTOUFS(inodedep->id_list.wk_mp);
 9613         fs = ump->um_fs;
 9614         ino = inodedep->id_ino;
 9615         error = 0;
 9616         for (;;) {
 9617                 LOCK_OWNED(ump);
 9618                 KASSERT((inodedep->id_state & UNLINKED) != 0,
 9619                     ("clear_unlinked_inodedep: inodedep %p not unlinked",
 9620                     inodedep));
 9621                 /*
 9622                  * If nothing has yet been written simply remove us from
 9623                  * the in memory list and return.  This is the most common
 9624                  * case where handle_workitem_remove() loses the final
 9625                  * reference.
 9626                  */
 9627                 if ((inodedep->id_state & UNLINKLINKS) == 0)
 9628                         break;
 9629                 /*
 9630                  * If we have a NEXT pointer and no PREV pointer we can simply
 9631                  * clear NEXT's PREV and remove ourselves from the list.  Be
 9632                  * careful not to clear PREV if the superblock points at
 9633                  * next as well.
 9634                  */
 9635                 idn = TAILQ_NEXT(inodedep, id_unlinked);
 9636                 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
 9637                         if (idn && fs->fs_sujfree != idn->id_ino)
 9638                                 idn->id_state &= ~UNLINKPREV;
 9639                         break;
 9640                 }
 9641                 /*
 9642                  * Here we have an inodedep which is actually linked into
 9643                  * the list.  We must remove it by forcing a write to the
 9644                  * link before us, whether it be the superblock or an inode.
 9645                  * Unfortunately the list may change while we're waiting
 9646                  * on the buf lock for either resource so we must loop until
 9647                  * we lock the right one.  If both the superblock and an
 9648                  * inode point to this inode we must clear the inode first
 9649                  * followed by the superblock.
 9650                  */
 9651                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
 9652                 pino = 0;
 9653                 if (idp && (idp->id_state & UNLINKNEXT))
 9654                         pino = idp->id_ino;
 9655                 FREE_LOCK(ump);
 9656                 if (pino == 0) {
 9657                         bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
 9658                             (int)fs->fs_sbsize, 0, 0, 0);
 9659                 } else {
 9660                         error = bread(ump->um_devvp,
 9661                             fsbtodb(fs, ino_to_fsba(fs, pino)),
 9662                             (int)fs->fs_bsize, NOCRED, &bp);
 9663                         if (error)
 9664                                 brelse(bp);
 9665                 }
 9666                 ACQUIRE_LOCK(ump);
 9667                 if (error)
 9668                         break;
 9669                 /* If the list has changed restart the loop. */
 9670                 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
 9671                 nino = 0;
 9672                 if (idp && (idp->id_state & UNLINKNEXT))
 9673                         nino = idp->id_ino;
 9674                 if (nino != pino ||
 9675                     (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
 9676                         FREE_LOCK(ump);
 9677                         brelse(bp);
 9678                         ACQUIRE_LOCK(ump);
 9679                         continue;
 9680                 }
 9681                 nino = 0;
 9682                 idn = TAILQ_NEXT(inodedep, id_unlinked);
 9683                 if (idn)
 9684                         nino = idn->id_ino;
 9685                 /*
 9686                  * Remove us from the in memory list.  After this we cannot
 9687                  * access the inodedep.
 9688                  */
 9689                 KASSERT((inodedep->id_state & UNLINKED) != 0,
 9690                     ("clear_unlinked_inodedep: inodedep %p not unlinked",
 9691                     inodedep));
 9692                 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
 9693                 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
 9694                 FREE_LOCK(ump);
 9695                 /*
 9696                  * The predecessor's next pointer is manually updated here
 9697                  * so that the NEXT flag is never cleared for an element
 9698                  * that is in the list.
 9699                  */
 9700                 if (pino == 0) {
 9701                         bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
 9702                         ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
 9703                         softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
 9704                             bp);
 9705                 } else if (fs->fs_magic == FS_UFS1_MAGIC)
 9706                         ((struct ufs1_dinode *)bp->b_data +
 9707                             ino_to_fsbo(fs, pino))->di_freelink = nino;
 9708                 else
 9709                         ((struct ufs2_dinode *)bp->b_data +
 9710                             ino_to_fsbo(fs, pino))->di_freelink = nino;
 9711                 /*
 9712                  * If the bwrite fails we have no recourse to recover.  The
 9713                  * filesystem is corrupted already.
 9714                  */
 9715                 bwrite(bp);
 9716                 ACQUIRE_LOCK(ump);
 9717                 /*
 9718                  * If the superblock pointer still needs to be cleared force
 9719                  * a write here.
 9720                  */
 9721                 if (fs->fs_sujfree == ino) {
 9722                         FREE_LOCK(ump);
 9723                         bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
 9724                             (int)fs->fs_sbsize, 0, 0, 0);
 9725                         bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
 9726                         ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
 9727                         softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
 9728                             bp);
 9729                         bwrite(bp);
 9730                         ACQUIRE_LOCK(ump);
 9731                 }
 9732 
 9733                 if (fs->fs_sujfree != ino)
 9734                         return;
 9735                 panic("clear_unlinked_inodedep: Failed to clear free head");
 9736         }
 9737         if (inodedep->id_ino == fs->fs_sujfree)
 9738                 panic("clear_unlinked_inodedep: Freeing head of free list");
 9739         inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
 9740         TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
 9741         return;
 9742 }
 9743 
 9744 /*
 9745  * This workitem decrements the inode's link count.
 9746  * If the link count reaches zero, the file is removed.
 9747  */
 9748 static int
 9749 handle_workitem_remove(dirrem, flags)
 9750         struct dirrem *dirrem;
 9751         int flags;
 9752 {
 9753         struct inodedep *inodedep;
 9754         struct workhead dotdotwk;
 9755         struct worklist *wk;
 9756         struct ufsmount *ump;
 9757         struct mount *mp;
 9758         struct vnode *vp;
 9759         struct inode *ip;
 9760         ino_t oldinum;
 9761 
 9762         if (dirrem->dm_state & ONWORKLIST)
 9763                 panic("handle_workitem_remove: dirrem %p still on worklist",
 9764                     dirrem);
 9765         oldinum = dirrem->dm_oldinum;
 9766         mp = dirrem->dm_list.wk_mp;
 9767         ump = VFSTOUFS(mp);
 9768         flags |= LK_EXCLUSIVE;
 9769         if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
 9770                 return (EBUSY);
 9771         ip = VTOI(vp);
 9772         ACQUIRE_LOCK(ump);
 9773         if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
 9774                 panic("handle_workitem_remove: lost inodedep");
 9775         if (dirrem->dm_state & ONDEPLIST)
 9776                 LIST_REMOVE(dirrem, dm_inonext);
 9777         KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
 9778             ("handle_workitem_remove:  Journal entries not written."));
 9779 
 9780         /*
 9781          * Move all dependencies waiting on the remove to complete
 9782          * from the dirrem to the inode inowait list to be completed
 9783          * after the inode has been updated and written to disk.
 9784          *
 9785          * Any marked MKDIR_PARENT are saved to be completed when the 
 9786          * dotdot ref is removed unless DIRCHG is specified.  For
 9787          * directory change operations there will be no further
 9788          * directory writes and the jsegdeps need to be moved along
 9789          * with the rest to be completed when the inode is free or
 9790          * stable in the inode free list.
 9791          */
 9792         LIST_INIT(&dotdotwk);
 9793         while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
 9794                 WORKLIST_REMOVE(wk);
 9795                 if ((dirrem->dm_state & DIRCHG) == 0 &&
 9796                     wk->wk_state & MKDIR_PARENT) {
 9797                         wk->wk_state &= ~MKDIR_PARENT;
 9798                         WORKLIST_INSERT(&dotdotwk, wk);
 9799                         continue;
 9800                 }
 9801                 WORKLIST_INSERT(&inodedep->id_inowait, wk);
 9802         }
 9803         LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
 9804         /*
 9805          * Normal file deletion.
 9806          */
 9807         if ((dirrem->dm_state & RMDIR) == 0) {
 9808                 ip->i_nlink--;
 9809                 DIP_SET(ip, i_nlink, ip->i_nlink);
 9810                 ip->i_flag |= IN_CHANGE;
 9811                 if (ip->i_nlink < ip->i_effnlink)
 9812                         panic("handle_workitem_remove: bad file delta");
 9813                 if (ip->i_nlink == 0) 
 9814                         unlinked_inodedep(mp, inodedep);
 9815                 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 9816                 KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
 9817                     ("handle_workitem_remove: worklist not empty. %s",
 9818                     TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
 9819                 WORKITEM_FREE(dirrem, D_DIRREM);
 9820                 FREE_LOCK(ump);
 9821                 goto out;
 9822         }
 9823         /*
 9824          * Directory deletion. Decrement reference count for both the
 9825          * just deleted parent directory entry and the reference for ".".
 9826          * Arrange to have the reference count on the parent decremented
 9827          * to account for the loss of "..".
 9828          */
 9829         ip->i_nlink -= 2;
 9830         DIP_SET(ip, i_nlink, ip->i_nlink);
 9831         ip->i_flag |= IN_CHANGE;
 9832         if (ip->i_nlink < ip->i_effnlink)
 9833                 panic("handle_workitem_remove: bad dir delta");
 9834         if (ip->i_nlink == 0)
 9835                 unlinked_inodedep(mp, inodedep);
 9836         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 9837         /*
 9838          * Rename a directory to a new parent. Since, we are both deleting
 9839          * and creating a new directory entry, the link count on the new
 9840          * directory should not change. Thus we skip the followup dirrem.
 9841          */
 9842         if (dirrem->dm_state & DIRCHG) {
 9843                 KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
 9844                     ("handle_workitem_remove: DIRCHG and worklist not empty."));
 9845                 WORKITEM_FREE(dirrem, D_DIRREM);
 9846                 FREE_LOCK(ump);
 9847                 goto out;
 9848         }
 9849         dirrem->dm_state = ONDEPLIST;
 9850         dirrem->dm_oldinum = dirrem->dm_dirinum;
 9851         /*
 9852          * Place the dirrem on the parent's diremhd list.
 9853          */
 9854         if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
 9855                 panic("handle_workitem_remove: lost dir inodedep");
 9856         LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
 9857         /*
 9858          * If the allocated inode has never been written to disk, then
 9859          * the on-disk inode is zero'ed and we can remove the file
 9860          * immediately.  When journaling if the inode has been marked
 9861          * unlinked and not DEPCOMPLETE we know it can never be written.
 9862          */
 9863         inodedep_lookup(mp, oldinum, 0, &inodedep);
 9864         if (inodedep == NULL ||
 9865             (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
 9866             check_inode_unwritten(inodedep)) {
 9867                 FREE_LOCK(ump);
 9868                 vput(vp);
 9869                 return handle_workitem_remove(dirrem, flags);
 9870         }
 9871         WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
 9872         FREE_LOCK(ump);
 9873         ip->i_flag |= IN_CHANGE;
 9874 out:
 9875         ffs_update(vp, 0);
 9876         vput(vp);
 9877         return (0);
 9878 }
 9879 
 9880 /*
 9881  * Inode de-allocation dependencies.
 9882  * 
 9883  * When an inode's link count is reduced to zero, it can be de-allocated. We
 9884  * found it convenient to postpone de-allocation until after the inode is
 9885  * written to disk with its new link count (zero).  At this point, all of the
 9886  * on-disk inode's block pointers are nullified and, with careful dependency
 9887  * list ordering, all dependencies related to the inode will be satisfied and
 9888  * the corresponding dependency structures de-allocated.  So, if/when the
 9889  * inode is reused, there will be no mixing of old dependencies with new
 9890  * ones.  This artificial dependency is set up by the block de-allocation
 9891  * procedure above (softdep_setup_freeblocks) and completed by the
 9892  * following procedure.
 9893  */
 9894 static void 
 9895 handle_workitem_freefile(freefile)
 9896         struct freefile *freefile;
 9897 {
 9898         struct workhead wkhd;
 9899         struct fs *fs;
 9900         struct inodedep *idp;
 9901         struct ufsmount *ump;
 9902         int error;
 9903 
 9904         ump = VFSTOUFS(freefile->fx_list.wk_mp);
 9905         fs = ump->um_fs;
 9906 #ifdef DEBUG
 9907         ACQUIRE_LOCK(ump);
 9908         error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
 9909         FREE_LOCK(ump);
 9910         if (error)
 9911                 panic("handle_workitem_freefile: inodedep %p survived", idp);
 9912 #endif
 9913         UFS_LOCK(ump);
 9914         fs->fs_pendinginodes -= 1;
 9915         UFS_UNLOCK(ump);
 9916         LIST_INIT(&wkhd);
 9917         LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
 9918         if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
 9919             freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
 9920                 softdep_error("handle_workitem_freefile", error);
 9921         ACQUIRE_LOCK(ump);
 9922         WORKITEM_FREE(freefile, D_FREEFILE);
 9923         FREE_LOCK(ump);
 9924 }
 9925 
 9926 
 9927 /*
 9928  * Helper function which unlinks marker element from work list and returns
 9929  * the next element on the list.
 9930  */
 9931 static __inline struct worklist *
 9932 markernext(struct worklist *marker)
 9933 {
 9934         struct worklist *next;
 9935         
 9936         next = LIST_NEXT(marker, wk_list);
 9937         LIST_REMOVE(marker, wk_list);
 9938         return next;
 9939 }
 9940 
 9941 /*
 9942  * Disk writes.
 9943  * 
 9944  * The dependency structures constructed above are most actively used when file
 9945  * system blocks are written to disk.  No constraints are placed on when a
 9946  * block can be written, but unsatisfied update dependencies are made safe by
 9947  * modifying (or replacing) the source memory for the duration of the disk
 9948  * write.  When the disk write completes, the memory block is again brought
 9949  * up-to-date.
 9950  *
 9951  * In-core inode structure reclamation.
 9952  * 
 9953  * Because there are a finite number of "in-core" inode structures, they are
 9954  * reused regularly.  By transferring all inode-related dependencies to the
 9955  * in-memory inode block and indexing them separately (via "inodedep"s), we
 9956  * can allow "in-core" inode structures to be reused at any time and avoid
 9957  * any increase in contention.
 9958  *
 9959  * Called just before entering the device driver to initiate a new disk I/O.
 9960  * The buffer must be locked, thus, no I/O completion operations can occur
 9961  * while we are manipulating its associated dependencies.
 9962  */
 9963 static void 
 9964 softdep_disk_io_initiation(bp)
 9965         struct buf *bp;         /* structure describing disk write to occur */
 9966 {
 9967         struct worklist *wk;
 9968         struct worklist marker;
 9969         struct inodedep *inodedep;
 9970         struct freeblks *freeblks;
 9971         struct jblkdep *jblkdep;
 9972         struct newblk *newblk;
 9973         struct ufsmount *ump;
 9974 
 9975         /*
 9976          * We only care about write operations. There should never
 9977          * be dependencies for reads.
 9978          */
 9979         if (bp->b_iocmd != BIO_WRITE)
 9980                 panic("softdep_disk_io_initiation: not write");
 9981 
 9982         if (bp->b_vflags & BV_BKGRDINPROG)
 9983                 panic("softdep_disk_io_initiation: Writing buffer with "
 9984                     "background write in progress: %p", bp);
 9985 
 9986         ump = softdep_bp_to_mp(bp);
 9987         if (ump == NULL)
 9988                 return;
 9989 
 9990         marker.wk_type = D_LAST + 1;    /* Not a normal workitem */
 9991         PHOLD(curproc);                 /* Don't swap out kernel stack */
 9992         ACQUIRE_LOCK(ump);
 9993         /*
 9994          * Do any necessary pre-I/O processing.
 9995          */
 9996         for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
 9997              wk = markernext(&marker)) {
 9998                 LIST_INSERT_AFTER(wk, &marker, wk_list);
 9999                 switch (wk->wk_type) {
10000 
10001                 case D_PAGEDEP:
10002                         initiate_write_filepage(WK_PAGEDEP(wk), bp);
10003                         continue;
10004 
10005                 case D_INODEDEP:
10006                         inodedep = WK_INODEDEP(wk);
10007                         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
10008                                 initiate_write_inodeblock_ufs1(inodedep, bp);
10009                         else
10010                                 initiate_write_inodeblock_ufs2(inodedep, bp);
10011                         continue;
10012 
10013                 case D_INDIRDEP:
10014                         initiate_write_indirdep(WK_INDIRDEP(wk), bp);
10015                         continue;
10016 
10017                 case D_BMSAFEMAP:
10018                         initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
10019                         continue;
10020 
10021                 case D_JSEG:
10022                         WK_JSEG(wk)->js_buf = NULL;
10023                         continue;
10024 
10025                 case D_FREEBLKS:
10026                         freeblks = WK_FREEBLKS(wk);
10027                         jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
10028                         /*
10029                          * We have to wait for the freeblks to be journaled
10030                          * before we can write an inodeblock with updated
10031                          * pointers.  Be careful to arrange the marker so
10032                          * we revisit the freeblks if it's not removed by
10033                          * the first jwait().
10034                          */
10035                         if (jblkdep != NULL) {
10036                                 LIST_REMOVE(&marker, wk_list);
10037                                 LIST_INSERT_BEFORE(wk, &marker, wk_list);
10038                                 jwait(&jblkdep->jb_list, MNT_WAIT);
10039                         }
10040                         continue;
10041                 case D_ALLOCDIRECT:
10042                 case D_ALLOCINDIR:
10043                         /*
10044                          * We have to wait for the jnewblk to be journaled
10045                          * before we can write to a block if the contents
10046                          * may be confused with an earlier file's indirect
10047                          * at recovery time.  Handle the marker as described
10048                          * above.
10049                          */
10050                         newblk = WK_NEWBLK(wk);
10051                         if (newblk->nb_jnewblk != NULL &&
10052                             indirblk_lookup(newblk->nb_list.wk_mp,
10053                             newblk->nb_newblkno)) {
10054                                 LIST_REMOVE(&marker, wk_list);
10055                                 LIST_INSERT_BEFORE(wk, &marker, wk_list);
10056                                 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10057                         }
10058                         continue;
10059 
10060                 case D_SBDEP:
10061                         initiate_write_sbdep(WK_SBDEP(wk));
10062                         continue;
10063 
10064                 case D_MKDIR:
10065                 case D_FREEWORK:
10066                 case D_FREEDEP:
10067                 case D_JSEGDEP:
10068                         continue;
10069 
10070                 default:
10071                         panic("handle_disk_io_initiation: Unexpected type %s",
10072                             TYPENAME(wk->wk_type));
10073                         /* NOTREACHED */
10074                 }
10075         }
10076         FREE_LOCK(ump);
10077         PRELE(curproc);                 /* Allow swapout of kernel stack */
10078 }
10079 
10080 /*
10081  * Called from within the procedure above to deal with unsatisfied
10082  * allocation dependencies in a directory. The buffer must be locked,
10083  * thus, no I/O completion operations can occur while we are
10084  * manipulating its associated dependencies.
10085  */
10086 static void
10087 initiate_write_filepage(pagedep, bp)
10088         struct pagedep *pagedep;
10089         struct buf *bp;
10090 {
10091         struct jremref *jremref;
10092         struct jmvref *jmvref;
10093         struct dirrem *dirrem;
10094         struct diradd *dap;
10095         struct direct *ep;
10096         int i;
10097 
10098         if (pagedep->pd_state & IOSTARTED) {
10099                 /*
10100                  * This can only happen if there is a driver that does not
10101                  * understand chaining. Here biodone will reissue the call
10102                  * to strategy for the incomplete buffers.
10103                  */
10104                 printf("initiate_write_filepage: already started\n");
10105                 return;
10106         }
10107         pagedep->pd_state |= IOSTARTED;
10108         /*
10109          * Wait for all journal remove dependencies to hit the disk.
10110          * We can not allow any potentially conflicting directory adds
10111          * to be visible before removes and rollback is too difficult.
10112          * The per-filesystem lock may be dropped and re-acquired, however 
10113          * we hold the buf locked so the dependency can not go away.
10114          */
10115         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10116                 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10117                         jwait(&jremref->jr_list, MNT_WAIT);
10118         while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10119                 jwait(&jmvref->jm_list, MNT_WAIT);
10120         for (i = 0; i < DAHASHSZ; i++) {
10121                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
10122                         ep = (struct direct *)
10123                             ((char *)bp->b_data + dap->da_offset);
10124                         if (ep->d_ino != dap->da_newinum)
10125                                 panic("%s: dir inum %ju != new %ju",
10126                                     "initiate_write_filepage",
10127                                     (uintmax_t)ep->d_ino,
10128                                     (uintmax_t)dap->da_newinum);
10129                         if (dap->da_state & DIRCHG)
10130                                 ep->d_ino = dap->da_previous->dm_oldinum;
10131                         else
10132                                 ep->d_ino = 0;
10133                         dap->da_state &= ~ATTACHED;
10134                         dap->da_state |= UNDONE;
10135                 }
10136         }
10137 }
10138 
10139 /*
10140  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10141  * Note that any bug fixes made to this routine must be done in the
10142  * version found below.
10143  *
10144  * Called from within the procedure above to deal with unsatisfied
10145  * allocation dependencies in an inodeblock. The buffer must be
10146  * locked, thus, no I/O completion operations can occur while we
10147  * are manipulating its associated dependencies.
10148  */
10149 static void 
10150 initiate_write_inodeblock_ufs1(inodedep, bp)
10151         struct inodedep *inodedep;
10152         struct buf *bp;                 /* The inode block */
10153 {
10154         struct allocdirect *adp, *lastadp;
10155         struct ufs1_dinode *dp;
10156         struct ufs1_dinode *sip;
10157         struct inoref *inoref;
10158         struct ufsmount *ump;
10159         struct fs *fs;
10160         ufs_lbn_t i;
10161 #ifdef INVARIANTS
10162         ufs_lbn_t prevlbn = 0;
10163 #endif
10164         int deplist;
10165 
10166         if (inodedep->id_state & IOSTARTED)
10167                 panic("initiate_write_inodeblock_ufs1: already started");
10168         inodedep->id_state |= IOSTARTED;
10169         fs = inodedep->id_fs;
10170         ump = VFSTOUFS(inodedep->id_list.wk_mp);
10171         LOCK_OWNED(ump);
10172         dp = (struct ufs1_dinode *)bp->b_data +
10173             ino_to_fsbo(fs, inodedep->id_ino);
10174 
10175         /*
10176          * If we're on the unlinked list but have not yet written our
10177          * next pointer initialize it here.
10178          */
10179         if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10180                 struct inodedep *inon;
10181 
10182                 inon = TAILQ_NEXT(inodedep, id_unlinked);
10183                 dp->di_freelink = inon ? inon->id_ino : 0;
10184         }
10185         /*
10186          * If the bitmap is not yet written, then the allocated
10187          * inode cannot be written to disk.
10188          */
10189         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10190                 if (inodedep->id_savedino1 != NULL)
10191                         panic("initiate_write_inodeblock_ufs1: I/O underway");
10192                 FREE_LOCK(ump);
10193                 sip = malloc(sizeof(struct ufs1_dinode),
10194                     M_SAVEDINO, M_SOFTDEP_FLAGS);
10195                 ACQUIRE_LOCK(ump);
10196                 inodedep->id_savedino1 = sip;
10197                 *inodedep->id_savedino1 = *dp;
10198                 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10199                 dp->di_gen = inodedep->id_savedino1->di_gen;
10200                 dp->di_freelink = inodedep->id_savedino1->di_freelink;
10201                 return;
10202         }
10203         /*
10204          * If no dependencies, then there is nothing to roll back.
10205          */
10206         inodedep->id_savedsize = dp->di_size;
10207         inodedep->id_savedextsize = 0;
10208         inodedep->id_savednlink = dp->di_nlink;
10209         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10210             TAILQ_EMPTY(&inodedep->id_inoreflst))
10211                 return;
10212         /*
10213          * Revert the link count to that of the first unwritten journal entry.
10214          */
10215         inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10216         if (inoref)
10217                 dp->di_nlink = inoref->if_nlink;
10218         /*
10219          * Set the dependencies to busy.
10220          */
10221         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10222              adp = TAILQ_NEXT(adp, ad_next)) {
10223 #ifdef INVARIANTS
10224                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10225                         panic("softdep_write_inodeblock: lbn order");
10226                 prevlbn = adp->ad_offset;
10227                 if (adp->ad_offset < NDADDR &&
10228                     dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10229                         panic("initiate_write_inodeblock_ufs1: "
10230                             "direct pointer #%jd mismatch %d != %jd",
10231                             (intmax_t)adp->ad_offset,
10232                             dp->di_db[adp->ad_offset],
10233                             (intmax_t)adp->ad_newblkno);
10234                 if (adp->ad_offset >= NDADDR &&
10235                     dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10236                         panic("initiate_write_inodeblock_ufs1: "
10237                             "indirect pointer #%jd mismatch %d != %jd",
10238                             (intmax_t)adp->ad_offset - NDADDR,
10239                             dp->di_ib[adp->ad_offset - NDADDR],
10240                             (intmax_t)adp->ad_newblkno);
10241                 deplist |= 1 << adp->ad_offset;
10242                 if ((adp->ad_state & ATTACHED) == 0)
10243                         panic("initiate_write_inodeblock_ufs1: "
10244                             "Unknown state 0x%x", adp->ad_state);
10245 #endif /* INVARIANTS */
10246                 adp->ad_state &= ~ATTACHED;
10247                 adp->ad_state |= UNDONE;
10248         }
10249         /*
10250          * The on-disk inode cannot claim to be any larger than the last
10251          * fragment that has been written. Otherwise, the on-disk inode
10252          * might have fragments that were not the last block in the file
10253          * which would corrupt the filesystem.
10254          */
10255         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10256              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10257                 if (adp->ad_offset >= NDADDR)
10258                         break;
10259                 dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10260                 /* keep going until hitting a rollback to a frag */
10261                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10262                         continue;
10263                 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10264                 for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10265 #ifdef INVARIANTS
10266                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10267                                 panic("initiate_write_inodeblock_ufs1: "
10268                                     "lost dep1");
10269 #endif /* INVARIANTS */
10270                         dp->di_db[i] = 0;
10271                 }
10272                 for (i = 0; i < NIADDR; i++) {
10273 #ifdef INVARIANTS
10274                         if (dp->di_ib[i] != 0 &&
10275                             (deplist & ((1 << NDADDR) << i)) == 0)
10276                                 panic("initiate_write_inodeblock_ufs1: "
10277                                     "lost dep2");
10278 #endif /* INVARIANTS */
10279                         dp->di_ib[i] = 0;
10280                 }
10281                 return;
10282         }
10283         /*
10284          * If we have zero'ed out the last allocated block of the file,
10285          * roll back the size to the last currently allocated block.
10286          * We know that this last allocated block is a full-sized as
10287          * we already checked for fragments in the loop above.
10288          */
10289         if (lastadp != NULL &&
10290             dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10291                 for (i = lastadp->ad_offset; i >= 0; i--)
10292                         if (dp->di_db[i] != 0)
10293                                 break;
10294                 dp->di_size = (i + 1) * fs->fs_bsize;
10295         }
10296         /*
10297          * The only dependencies are for indirect blocks.
10298          *
10299          * The file size for indirect block additions is not guaranteed.
10300          * Such a guarantee would be non-trivial to achieve. The conventional
10301          * synchronous write implementation also does not make this guarantee.
10302          * Fsck should catch and fix discrepancies. Arguably, the file size
10303          * can be over-estimated without destroying integrity when the file
10304          * moves into the indirect blocks (i.e., is large). If we want to
10305          * postpone fsck, we are stuck with this argument.
10306          */
10307         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10308                 dp->di_ib[adp->ad_offset - NDADDR] = 0;
10309 }
10310                 
10311 /*
10312  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10313  * Note that any bug fixes made to this routine must be done in the
10314  * version found above.
10315  *
10316  * Called from within the procedure above to deal with unsatisfied
10317  * allocation dependencies in an inodeblock. The buffer must be
10318  * locked, thus, no I/O completion operations can occur while we
10319  * are manipulating its associated dependencies.
10320  */
10321 static void 
10322 initiate_write_inodeblock_ufs2(inodedep, bp)
10323         struct inodedep *inodedep;
10324         struct buf *bp;                 /* The inode block */
10325 {
10326         struct allocdirect *adp, *lastadp;
10327         struct ufs2_dinode *dp;
10328         struct ufs2_dinode *sip;
10329         struct inoref *inoref;
10330         struct ufsmount *ump;
10331         struct fs *fs;
10332         ufs_lbn_t i;
10333 #ifdef INVARIANTS
10334         ufs_lbn_t prevlbn = 0;
10335 #endif
10336         int deplist;
10337 
10338         if (inodedep->id_state & IOSTARTED)
10339                 panic("initiate_write_inodeblock_ufs2: already started");
10340         inodedep->id_state |= IOSTARTED;
10341         fs = inodedep->id_fs;
10342         ump = VFSTOUFS(inodedep->id_list.wk_mp);
10343         LOCK_OWNED(ump);
10344         dp = (struct ufs2_dinode *)bp->b_data +
10345             ino_to_fsbo(fs, inodedep->id_ino);
10346 
10347         /*
10348          * If we're on the unlinked list but have not yet written our
10349          * next pointer initialize it here.
10350          */
10351         if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10352                 struct inodedep *inon;
10353 
10354                 inon = TAILQ_NEXT(inodedep, id_unlinked);
10355                 dp->di_freelink = inon ? inon->id_ino : 0;
10356         }
10357         /*
10358          * If the bitmap is not yet written, then the allocated
10359          * inode cannot be written to disk.
10360          */
10361         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10362                 if (inodedep->id_savedino2 != NULL)
10363                         panic("initiate_write_inodeblock_ufs2: I/O underway");
10364                 FREE_LOCK(ump);
10365                 sip = malloc(sizeof(struct ufs2_dinode),
10366                     M_SAVEDINO, M_SOFTDEP_FLAGS);
10367                 ACQUIRE_LOCK(ump);
10368                 inodedep->id_savedino2 = sip;
10369                 *inodedep->id_savedino2 = *dp;
10370                 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10371                 dp->di_gen = inodedep->id_savedino2->di_gen;
10372                 dp->di_freelink = inodedep->id_savedino2->di_freelink;
10373                 return;
10374         }
10375         /*
10376          * If no dependencies, then there is nothing to roll back.
10377          */
10378         inodedep->id_savedsize = dp->di_size;
10379         inodedep->id_savedextsize = dp->di_extsize;
10380         inodedep->id_savednlink = dp->di_nlink;
10381         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10382             TAILQ_EMPTY(&inodedep->id_extupdt) &&
10383             TAILQ_EMPTY(&inodedep->id_inoreflst))
10384                 return;
10385         /*
10386          * Revert the link count to that of the first unwritten journal entry.
10387          */
10388         inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10389         if (inoref)
10390                 dp->di_nlink = inoref->if_nlink;
10391 
10392         /*
10393          * Set the ext data dependencies to busy.
10394          */
10395         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10396              adp = TAILQ_NEXT(adp, ad_next)) {
10397 #ifdef INVARIANTS
10398                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10399                         panic("initiate_write_inodeblock_ufs2: lbn order");
10400                 prevlbn = adp->ad_offset;
10401                 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10402                         panic("initiate_write_inodeblock_ufs2: "
10403                             "ext pointer #%jd mismatch %jd != %jd",
10404                             (intmax_t)adp->ad_offset,
10405                             (intmax_t)dp->di_extb[adp->ad_offset],
10406                             (intmax_t)adp->ad_newblkno);
10407                 deplist |= 1 << adp->ad_offset;
10408                 if ((adp->ad_state & ATTACHED) == 0)
10409                         panic("initiate_write_inodeblock_ufs2: Unknown "
10410                             "state 0x%x", adp->ad_state);
10411 #endif /* INVARIANTS */
10412                 adp->ad_state &= ~ATTACHED;
10413                 adp->ad_state |= UNDONE;
10414         }
10415         /*
10416          * The on-disk inode cannot claim to be any larger than the last
10417          * fragment that has been written. Otherwise, the on-disk inode
10418          * might have fragments that were not the last block in the ext
10419          * data which would corrupt the filesystem.
10420          */
10421         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10422              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10423                 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10424                 /* keep going until hitting a rollback to a frag */
10425                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10426                         continue;
10427                 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10428                 for (i = adp->ad_offset + 1; i < NXADDR; i++) {
10429 #ifdef INVARIANTS
10430                         if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10431                                 panic("initiate_write_inodeblock_ufs2: "
10432                                     "lost dep1");
10433 #endif /* INVARIANTS */
10434                         dp->di_extb[i] = 0;
10435                 }
10436                 lastadp = NULL;
10437                 break;
10438         }
10439         /*
10440          * If we have zero'ed out the last allocated block of the ext
10441          * data, roll back the size to the last currently allocated block.
10442          * We know that this last allocated block is a full-sized as
10443          * we already checked for fragments in the loop above.
10444          */
10445         if (lastadp != NULL &&
10446             dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10447                 for (i = lastadp->ad_offset; i >= 0; i--)
10448                         if (dp->di_extb[i] != 0)
10449                                 break;
10450                 dp->di_extsize = (i + 1) * fs->fs_bsize;
10451         }
10452         /*
10453          * Set the file data dependencies to busy.
10454          */
10455         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10456              adp = TAILQ_NEXT(adp, ad_next)) {
10457 #ifdef INVARIANTS
10458                 if (deplist != 0 && prevlbn >= adp->ad_offset)
10459                         panic("softdep_write_inodeblock: lbn order");
10460                 if ((adp->ad_state & ATTACHED) == 0)
10461                         panic("inodedep %p and adp %p not attached", inodedep, adp);
10462                 prevlbn = adp->ad_offset;
10463                 if (adp->ad_offset < NDADDR &&
10464                     dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10465                         panic("initiate_write_inodeblock_ufs2: "
10466                             "direct pointer #%jd mismatch %jd != %jd",
10467                             (intmax_t)adp->ad_offset,
10468                             (intmax_t)dp->di_db[adp->ad_offset],
10469                             (intmax_t)adp->ad_newblkno);
10470                 if (adp->ad_offset >= NDADDR &&
10471                     dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10472                         panic("initiate_write_inodeblock_ufs2: "
10473                             "indirect pointer #%jd mismatch %jd != %jd",
10474                             (intmax_t)adp->ad_offset - NDADDR,
10475                             (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
10476                             (intmax_t)adp->ad_newblkno);
10477                 deplist |= 1 << adp->ad_offset;
10478                 if ((adp->ad_state & ATTACHED) == 0)
10479                         panic("initiate_write_inodeblock_ufs2: Unknown "
10480                              "state 0x%x", adp->ad_state);
10481 #endif /* INVARIANTS */
10482                 adp->ad_state &= ~ATTACHED;
10483                 adp->ad_state |= UNDONE;
10484         }
10485         /*
10486          * The on-disk inode cannot claim to be any larger than the last
10487          * fragment that has been written. Otherwise, the on-disk inode
10488          * might have fragments that were not the last block in the file
10489          * which would corrupt the filesystem.
10490          */
10491         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10492              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10493                 if (adp->ad_offset >= NDADDR)
10494                         break;
10495                 dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10496                 /* keep going until hitting a rollback to a frag */
10497                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10498                         continue;
10499                 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10500                 for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10501 #ifdef INVARIANTS
10502                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10503                                 panic("initiate_write_inodeblock_ufs2: "
10504                                     "lost dep2");
10505 #endif /* INVARIANTS */
10506                         dp->di_db[i] = 0;
10507                 }
10508                 for (i = 0; i < NIADDR; i++) {
10509 #ifdef INVARIANTS
10510                         if (dp->di_ib[i] != 0 &&
10511                             (deplist & ((1 << NDADDR) << i)) == 0)
10512                                 panic("initiate_write_inodeblock_ufs2: "
10513                                     "lost dep3");
10514 #endif /* INVARIANTS */
10515                         dp->di_ib[i] = 0;
10516                 }
10517                 return;
10518         }
10519         /*
10520          * If we have zero'ed out the last allocated block of the file,
10521          * roll back the size to the last currently allocated block.
10522          * We know that this last allocated block is a full-sized as
10523          * we already checked for fragments in the loop above.
10524          */
10525         if (lastadp != NULL &&
10526             dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10527                 for (i = lastadp->ad_offset; i >= 0; i--)
10528                         if (dp->di_db[i] != 0)
10529                                 break;
10530                 dp->di_size = (i + 1) * fs->fs_bsize;
10531         }
10532         /*
10533          * The only dependencies are for indirect blocks.
10534          *
10535          * The file size for indirect block additions is not guaranteed.
10536          * Such a guarantee would be non-trivial to achieve. The conventional
10537          * synchronous write implementation also does not make this guarantee.
10538          * Fsck should catch and fix discrepancies. Arguably, the file size
10539          * can be over-estimated without destroying integrity when the file
10540          * moves into the indirect blocks (i.e., is large). If we want to
10541          * postpone fsck, we are stuck with this argument.
10542          */
10543         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10544                 dp->di_ib[adp->ad_offset - NDADDR] = 0;
10545 }
10546 
10547 /*
10548  * Cancel an indirdep as a result of truncation.  Release all of the
10549  * children allocindirs and place their journal work on the appropriate
10550  * list.
10551  */
10552 static void
10553 cancel_indirdep(indirdep, bp, freeblks)
10554         struct indirdep *indirdep;
10555         struct buf *bp;
10556         struct freeblks *freeblks;
10557 {
10558         struct allocindir *aip;
10559 
10560         /*
10561          * None of the indirect pointers will ever be visible,
10562          * so they can simply be tossed. GOINGAWAY ensures
10563          * that allocated pointers will be saved in the buffer
10564          * cache until they are freed. Note that they will
10565          * only be able to be found by their physical address
10566          * since the inode mapping the logical address will
10567          * be gone. The save buffer used for the safe copy
10568          * was allocated in setup_allocindir_phase2 using
10569          * the physical address so it could be used for this
10570          * purpose. Hence we swap the safe copy with the real
10571          * copy, allowing the safe copy to be freed and holding
10572          * on to the real copy for later use in indir_trunc.
10573          */
10574         if (indirdep->ir_state & GOINGAWAY)
10575                 panic("cancel_indirdep: already gone");
10576         if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10577                 indirdep->ir_state |= DEPCOMPLETE;
10578                 LIST_REMOVE(indirdep, ir_next);
10579         }
10580         indirdep->ir_state |= GOINGAWAY;
10581         /*
10582          * Pass in bp for blocks still have journal writes
10583          * pending so we can cancel them on their own.
10584          */
10585         while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
10586                 cancel_allocindir(aip, bp, freeblks, 0);
10587         while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL)
10588                 cancel_allocindir(aip, NULL, freeblks, 0);
10589         while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL)
10590                 cancel_allocindir(aip, NULL, freeblks, 0);
10591         while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL)
10592                 cancel_allocindir(aip, NULL, freeblks, 0);
10593         /*
10594          * If there are pending partial truncations we need to keep the
10595          * old block copy around until they complete.  This is because
10596          * the current b_data is not a perfect superset of the available
10597          * blocks.
10598          */
10599         if (TAILQ_EMPTY(&indirdep->ir_trunc))
10600                 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10601         else
10602                 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10603         WORKLIST_REMOVE(&indirdep->ir_list);
10604         WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10605         indirdep->ir_bp = NULL;
10606         indirdep->ir_freeblks = freeblks;
10607 }
10608 
10609 /*
10610  * Free an indirdep once it no longer has new pointers to track.
10611  */
10612 static void
10613 free_indirdep(indirdep)
10614         struct indirdep *indirdep;
10615 {
10616 
10617         KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10618             ("free_indirdep: Indir trunc list not empty."));
10619         KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10620             ("free_indirdep: Complete head not empty."));
10621         KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10622             ("free_indirdep: write head not empty."));
10623         KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10624             ("free_indirdep: done head not empty."));
10625         KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10626             ("free_indirdep: deplist head not empty."));
10627         KASSERT((indirdep->ir_state & DEPCOMPLETE),
10628             ("free_indirdep: %p still on newblk list.", indirdep));
10629         KASSERT(indirdep->ir_saveddata == NULL,
10630             ("free_indirdep: %p still has saved data.", indirdep));
10631         if (indirdep->ir_state & ONWORKLIST)
10632                 WORKLIST_REMOVE(&indirdep->ir_list);
10633         WORKITEM_FREE(indirdep, D_INDIRDEP);
10634 }
10635 
10636 /*
10637  * Called before a write to an indirdep.  This routine is responsible for
10638  * rolling back pointers to a safe state which includes only those
10639  * allocindirs which have been completed.
10640  */
10641 static void
10642 initiate_write_indirdep(indirdep, bp)
10643         struct indirdep *indirdep;
10644         struct buf *bp;
10645 {
10646         struct ufsmount *ump;
10647 
10648         indirdep->ir_state |= IOSTARTED;
10649         if (indirdep->ir_state & GOINGAWAY)
10650                 panic("disk_io_initiation: indirdep gone");
10651         /*
10652          * If there are no remaining dependencies, this will be writing
10653          * the real pointers.
10654          */
10655         if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10656             TAILQ_EMPTY(&indirdep->ir_trunc))
10657                 return;
10658         /*
10659          * Replace up-to-date version with safe version.
10660          */
10661         if (indirdep->ir_saveddata == NULL) {
10662                 ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10663                 LOCK_OWNED(ump);
10664                 FREE_LOCK(ump);
10665                 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10666                     M_SOFTDEP_FLAGS);
10667                 ACQUIRE_LOCK(ump);
10668         }
10669         indirdep->ir_state &= ~ATTACHED;
10670         indirdep->ir_state |= UNDONE;
10671         bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10672         bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10673             bp->b_bcount);
10674 }
10675 
10676 /*
10677  * Called when an inode has been cleared in a cg bitmap.  This finally
10678  * eliminates any canceled jaddrefs
10679  */
10680 void
10681 softdep_setup_inofree(mp, bp, ino, wkhd)
10682         struct mount *mp;
10683         struct buf *bp;
10684         ino_t ino;
10685         struct workhead *wkhd;
10686 {
10687         struct worklist *wk, *wkn;
10688         struct inodedep *inodedep;
10689         struct ufsmount *ump;
10690         uint8_t *inosused;
10691         struct cg *cgp;
10692         struct fs *fs;
10693 
10694         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10695             ("softdep_setup_inofree called on non-softdep filesystem"));
10696         ump = VFSTOUFS(mp);
10697         ACQUIRE_LOCK(ump);
10698         fs = ump->um_fs;
10699         cgp = (struct cg *)bp->b_data;
10700         inosused = cg_inosused(cgp);
10701         if (isset(inosused, ino % fs->fs_ipg))
10702                 panic("softdep_setup_inofree: inode %ju not freed.",
10703                     (uintmax_t)ino);
10704         if (inodedep_lookup(mp, ino, 0, &inodedep))
10705                 panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10706                     (uintmax_t)ino, inodedep);
10707         if (wkhd) {
10708                 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10709                         if (wk->wk_type != D_JADDREF)
10710                                 continue;
10711                         WORKLIST_REMOVE(wk);
10712                         /*
10713                          * We can free immediately even if the jaddref
10714                          * isn't attached in a background write as now
10715                          * the bitmaps are reconciled.
10716                          */
10717                         wk->wk_state |= COMPLETE | ATTACHED;
10718                         free_jaddref(WK_JADDREF(wk));
10719                 }
10720                 jwork_move(&bp->b_dep, wkhd);
10721         }
10722         FREE_LOCK(ump);
10723 }
10724 
10725 
10726 /*
10727  * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10728  * map.  Any dependencies waiting for the write to clear are added to the
10729  * buf's list and any jnewblks that are being canceled are discarded
10730  * immediately.
10731  */
10732 void
10733 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10734         struct mount *mp;
10735         struct buf *bp;
10736         ufs2_daddr_t blkno;
10737         int frags;
10738         struct workhead *wkhd;
10739 {
10740         struct bmsafemap *bmsafemap;
10741         struct jnewblk *jnewblk;
10742         struct ufsmount *ump;
10743         struct worklist *wk;
10744         struct fs *fs;
10745 #ifdef SUJ_DEBUG
10746         uint8_t *blksfree;
10747         struct cg *cgp;
10748         ufs2_daddr_t jstart;
10749         ufs2_daddr_t jend;
10750         ufs2_daddr_t end;
10751         long bno;
10752         int i;
10753 #endif
10754 
10755         CTR3(KTR_SUJ,
10756             "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10757             blkno, frags, wkhd);
10758 
10759         ump = VFSTOUFS(mp);
10760         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
10761             ("softdep_setup_blkfree called on non-softdep filesystem"));
10762         ACQUIRE_LOCK(ump);
10763         /* Lookup the bmsafemap so we track when it is dirty. */
10764         fs = ump->um_fs;
10765         bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10766         /*
10767          * Detach any jnewblks which have been canceled.  They must linger
10768          * until the bitmap is cleared again by ffs_blkfree() to prevent
10769          * an unjournaled allocation from hitting the disk.
10770          */
10771         if (wkhd) {
10772                 while ((wk = LIST_FIRST(wkhd)) != NULL) {
10773                         CTR2(KTR_SUJ,
10774                             "softdep_setup_blkfree: blkno %jd wk type %d",
10775                             blkno, wk->wk_type);
10776                         WORKLIST_REMOVE(wk);
10777                         if (wk->wk_type != D_JNEWBLK) {
10778                                 WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10779                                 continue;
10780                         }
10781                         jnewblk = WK_JNEWBLK(wk);
10782                         KASSERT(jnewblk->jn_state & GOINGAWAY,
10783                             ("softdep_setup_blkfree: jnewblk not canceled."));
10784 #ifdef SUJ_DEBUG
10785                         /*
10786                          * Assert that this block is free in the bitmap
10787                          * before we discard the jnewblk.
10788                          */
10789                         cgp = (struct cg *)bp->b_data;
10790                         blksfree = cg_blksfree(cgp);
10791                         bno = dtogd(fs, jnewblk->jn_blkno);
10792                         for (i = jnewblk->jn_oldfrags;
10793                             i < jnewblk->jn_frags; i++) {
10794                                 if (isset(blksfree, bno + i))
10795                                         continue;
10796                                 panic("softdep_setup_blkfree: not free");
10797                         }
10798 #endif
10799                         /*
10800                          * Even if it's not attached we can free immediately
10801                          * as the new bitmap is correct.
10802                          */
10803                         wk->wk_state |= COMPLETE | ATTACHED;
10804                         free_jnewblk(jnewblk);
10805                 }
10806         }
10807 
10808 #ifdef SUJ_DEBUG
10809         /*
10810          * Assert that we are not freeing a block which has an outstanding
10811          * allocation dependency.
10812          */
10813         fs = VFSTOUFS(mp)->um_fs;
10814         bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10815         end = blkno + frags;
10816         LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10817                 /*
10818                  * Don't match against blocks that will be freed when the
10819                  * background write is done.
10820                  */
10821                 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10822                     (COMPLETE | DEPCOMPLETE))
10823                         continue;
10824                 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10825                 jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10826                 if ((blkno >= jstart && blkno < jend) ||
10827                     (end > jstart && end <= jend)) {
10828                         printf("state 0x%X %jd - %d %d dep %p\n",
10829                             jnewblk->jn_state, jnewblk->jn_blkno,
10830                             jnewblk->jn_oldfrags, jnewblk->jn_frags,
10831                             jnewblk->jn_dep);
10832                         panic("softdep_setup_blkfree: "
10833                             "%jd-%jd(%d) overlaps with %jd-%jd",
10834                             blkno, end, frags, jstart, jend);
10835                 }
10836         }
10837 #endif
10838         FREE_LOCK(ump);
10839 }
10840 
10841 /*
10842  * Revert a block allocation when the journal record that describes it
10843  * is not yet written.
10844  */
10845 static int
10846 jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10847         struct jnewblk *jnewblk;
10848         struct fs *fs;
10849         struct cg *cgp;
10850         uint8_t *blksfree;
10851 {
10852         ufs1_daddr_t fragno;
10853         long cgbno, bbase;
10854         int frags, blk;
10855         int i;
10856 
10857         frags = 0;
10858         cgbno = dtogd(fs, jnewblk->jn_blkno);
10859         /*
10860          * We have to test which frags need to be rolled back.  We may
10861          * be operating on a stale copy when doing background writes.
10862          */
10863         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10864                 if (isclr(blksfree, cgbno + i))
10865                         frags++;
10866         if (frags == 0)
10867                 return (0);
10868         /*
10869          * This is mostly ffs_blkfree() sans some validation and
10870          * superblock updates.
10871          */
10872         if (frags == fs->fs_frag) {
10873                 fragno = fragstoblks(fs, cgbno);
10874                 ffs_setblock(fs, blksfree, fragno);
10875                 ffs_clusteracct(fs, cgp, fragno, 1);
10876                 cgp->cg_cs.cs_nbfree++;
10877         } else {
10878                 cgbno += jnewblk->jn_oldfrags;
10879                 bbase = cgbno - fragnum(fs, cgbno);
10880                 /* Decrement the old frags.  */
10881                 blk = blkmap(fs, blksfree, bbase);
10882                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10883                 /* Deallocate the fragment */
10884                 for (i = 0; i < frags; i++)
10885                         setbit(blksfree, cgbno + i);
10886                 cgp->cg_cs.cs_nffree += frags;
10887                 /* Add back in counts associated with the new frags */
10888                 blk = blkmap(fs, blksfree, bbase);
10889                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10890                 /* If a complete block has been reassembled, account for it. */
10891                 fragno = fragstoblks(fs, bbase);
10892                 if (ffs_isblock(fs, blksfree, fragno)) {
10893                         cgp->cg_cs.cs_nffree -= fs->fs_frag;
10894                         ffs_clusteracct(fs, cgp, fragno, 1);
10895                         cgp->cg_cs.cs_nbfree++;
10896                 }
10897         }
10898         stat_jnewblk++;
10899         jnewblk->jn_state &= ~ATTACHED;
10900         jnewblk->jn_state |= UNDONE;
10901 
10902         return (frags);
10903 }
10904 
10905 static void
10906 initiate_write_bmsafemap(bmsafemap, bp)
10907         struct bmsafemap *bmsafemap;
10908         struct buf *bp;                 /* The cg block. */
10909 {
10910         struct jaddref *jaddref;
10911         struct jnewblk *jnewblk;
10912         uint8_t *inosused;
10913         uint8_t *blksfree;
10914         struct cg *cgp;
10915         struct fs *fs;
10916         ino_t ino;
10917 
10918         /*
10919          * If this is a background write, we did this at the time that
10920          * the copy was made, so do not need to do it again.
10921          */
10922         if (bmsafemap->sm_state & IOSTARTED)
10923                 return;
10924         bmsafemap->sm_state |= IOSTARTED;
10925         /*
10926          * Clear any inode allocations which are pending journal writes.
10927          */
10928         if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10929                 cgp = (struct cg *)bp->b_data;
10930                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10931                 inosused = cg_inosused(cgp);
10932                 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10933                         ino = jaddref->ja_ino % fs->fs_ipg;
10934                         if (isset(inosused, ino)) {
10935                                 if ((jaddref->ja_mode & IFMT) == IFDIR)
10936                                         cgp->cg_cs.cs_ndir--;
10937                                 cgp->cg_cs.cs_nifree++;
10938                                 clrbit(inosused, ino);
10939                                 jaddref->ja_state &= ~ATTACHED;
10940                                 jaddref->ja_state |= UNDONE;
10941                                 stat_jaddref++;
10942                         } else
10943                                 panic("initiate_write_bmsafemap: inode %ju "
10944                                     "marked free", (uintmax_t)jaddref->ja_ino);
10945                 }
10946         }
10947         /*
10948          * Clear any block allocations which are pending journal writes.
10949          */
10950         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10951                 cgp = (struct cg *)bp->b_data;
10952                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10953                 blksfree = cg_blksfree(cgp);
10954                 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10955                         if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10956                                 continue;
10957                         panic("initiate_write_bmsafemap: block %jd "
10958                             "marked free", jnewblk->jn_blkno);
10959                 }
10960         }
10961         /*
10962          * Move allocation lists to the written lists so they can be
10963          * cleared once the block write is complete.
10964          */
10965         LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10966             inodedep, id_deps);
10967         LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10968             newblk, nb_deps);
10969         LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10970             wk_list);
10971 }
10972 
10973 /*
10974  * This routine is called during the completion interrupt
10975  * service routine for a disk write (from the procedure called
10976  * by the device driver to inform the filesystem caches of
10977  * a request completion).  It should be called early in this
10978  * procedure, before the block is made available to other
10979  * processes or other routines are called.
10980  *
10981  */
10982 static void 
10983 softdep_disk_write_complete(bp)
10984         struct buf *bp;         /* describes the completed disk write */
10985 {
10986         struct worklist *wk;
10987         struct worklist *owk;
10988         struct ufsmount *ump;
10989         struct workhead reattach;
10990         struct freeblks *freeblks;
10991         struct buf *sbp;
10992 
10993         ump = softdep_bp_to_mp(bp);
10994         if (ump == NULL)
10995                 return;
10996 
10997         /*
10998          * If an error occurred while doing the write, then the data
10999          * has not hit the disk and the dependencies cannot be processed.
11000          * But we do have to go through and roll forward any dependencies
11001          * that were rolled back before the disk write.
11002          */
11003         ACQUIRE_LOCK(ump);
11004         if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
11005                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11006                         switch (wk->wk_type) {
11007 
11008                         case D_PAGEDEP:
11009                                 handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
11010                                 continue;
11011 
11012                         case D_INODEDEP:
11013                                 handle_written_inodeblock(WK_INODEDEP(wk),
11014                                     bp, 0);
11015                                 continue;
11016 
11017                         case D_BMSAFEMAP:
11018                                 handle_written_bmsafemap(WK_BMSAFEMAP(wk),
11019                                     bp, 0);
11020                                 continue;
11021 
11022                         case D_INDIRDEP:
11023                                 handle_written_indirdep(WK_INDIRDEP(wk),
11024                                     bp, &sbp, 0);
11025                                 continue;
11026                         default:
11027                                 /* nothing to roll forward */
11028                                 continue;
11029                         }
11030                 }
11031                 FREE_LOCK(ump);
11032                 return;
11033         }
11034         LIST_INIT(&reattach);
11035 
11036         /*
11037          * Ump SU lock must not be released anywhere in this code segment.
11038          */
11039         sbp = NULL;
11040         owk = NULL;
11041         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
11042                 WORKLIST_REMOVE(wk);
11043                 atomic_add_long(&dep_write[wk->wk_type], 1);
11044                 if (wk == owk)
11045                         panic("duplicate worklist: %p\n", wk);
11046                 owk = wk;
11047                 switch (wk->wk_type) {
11048 
11049                 case D_PAGEDEP:
11050                         if (handle_written_filepage(WK_PAGEDEP(wk), bp,
11051                             WRITESUCCEEDED))
11052                                 WORKLIST_INSERT(&reattach, wk);
11053                         continue;
11054 
11055                 case D_INODEDEP:
11056                         if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
11057                             WRITESUCCEEDED))
11058                                 WORKLIST_INSERT(&reattach, wk);
11059                         continue;
11060 
11061                 case D_BMSAFEMAP:
11062                         if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
11063                             WRITESUCCEEDED))
11064                                 WORKLIST_INSERT(&reattach, wk);
11065                         continue;
11066 
11067                 case D_MKDIR:
11068                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
11069                         continue;
11070 
11071                 case D_ALLOCDIRECT:
11072                         wk->wk_state |= COMPLETE;
11073                         handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
11074                         continue;
11075 
11076                 case D_ALLOCINDIR:
11077                         wk->wk_state |= COMPLETE;
11078                         handle_allocindir_partdone(WK_ALLOCINDIR(wk));
11079                         continue;
11080 
11081                 case D_INDIRDEP:
11082                         if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
11083                             WRITESUCCEEDED))
11084                                 WORKLIST_INSERT(&reattach, wk);
11085                         continue;
11086 
11087                 case D_FREEBLKS:
11088                         wk->wk_state |= COMPLETE;
11089                         freeblks = WK_FREEBLKS(wk);
11090                         if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
11091                             LIST_EMPTY(&freeblks->fb_jblkdephd))
11092                                 add_to_worklist(wk, WK_NODELAY);
11093                         continue;
11094 
11095                 case D_FREEWORK:
11096                         handle_written_freework(WK_FREEWORK(wk));
11097                         break;
11098 
11099                 case D_JSEGDEP:
11100                         free_jsegdep(WK_JSEGDEP(wk));
11101                         continue;
11102 
11103                 case D_JSEG:
11104                         handle_written_jseg(WK_JSEG(wk), bp);
11105                         continue;
11106 
11107                 case D_SBDEP:
11108                         if (handle_written_sbdep(WK_SBDEP(wk), bp))
11109                                 WORKLIST_INSERT(&reattach, wk);
11110                         continue;
11111 
11112                 case D_FREEDEP:
11113                         free_freedep(WK_FREEDEP(wk));
11114                         continue;
11115 
11116                 default:
11117                         panic("handle_disk_write_complete: Unknown type %s",
11118                             TYPENAME(wk->wk_type));
11119                         /* NOTREACHED */
11120                 }
11121         }
11122         /*
11123          * Reattach any requests that must be redone.
11124          */
11125         while ((wk = LIST_FIRST(&reattach)) != NULL) {
11126                 WORKLIST_REMOVE(wk);
11127                 WORKLIST_INSERT(&bp->b_dep, wk);
11128         }
11129         FREE_LOCK(ump);
11130         if (sbp)
11131                 brelse(sbp);
11132 }
11133 
11134 /*
11135  * Called from within softdep_disk_write_complete above. Note that
11136  * this routine is always called from interrupt level with further
11137  * splbio interrupts blocked.
11138  */
11139 static void 
11140 handle_allocdirect_partdone(adp, wkhd)
11141         struct allocdirect *adp;        /* the completed allocdirect */
11142         struct workhead *wkhd;          /* Work to do when inode is writtne. */
11143 {
11144         struct allocdirectlst *listhead;
11145         struct allocdirect *listadp;
11146         struct inodedep *inodedep;
11147         long bsize;
11148 
11149         if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11150                 return;
11151         /*
11152          * The on-disk inode cannot claim to be any larger than the last
11153          * fragment that has been written. Otherwise, the on-disk inode
11154          * might have fragments that were not the last block in the file
11155          * which would corrupt the filesystem. Thus, we cannot free any
11156          * allocdirects after one whose ad_oldblkno claims a fragment as
11157          * these blocks must be rolled back to zero before writing the inode.
11158          * We check the currently active set of allocdirects in id_inoupdt
11159          * or id_extupdt as appropriate.
11160          */
11161         inodedep = adp->ad_inodedep;
11162         bsize = inodedep->id_fs->fs_bsize;
11163         if (adp->ad_state & EXTDATA)
11164                 listhead = &inodedep->id_extupdt;
11165         else
11166                 listhead = &inodedep->id_inoupdt;
11167         TAILQ_FOREACH(listadp, listhead, ad_next) {
11168                 /* found our block */
11169                 if (listadp == adp)
11170                         break;
11171                 /* continue if ad_oldlbn is not a fragment */
11172                 if (listadp->ad_oldsize == 0 ||
11173                     listadp->ad_oldsize == bsize)
11174                         continue;
11175                 /* hit a fragment */
11176                 return;
11177         }
11178         /*
11179          * If we have reached the end of the current list without
11180          * finding the just finished dependency, then it must be
11181          * on the future dependency list. Future dependencies cannot
11182          * be freed until they are moved to the current list.
11183          */
11184         if (listadp == NULL) {
11185 #ifdef DEBUG
11186                 if (adp->ad_state & EXTDATA)
11187                         listhead = &inodedep->id_newextupdt;
11188                 else
11189                         listhead = &inodedep->id_newinoupdt;
11190                 TAILQ_FOREACH(listadp, listhead, ad_next)
11191                         /* found our block */
11192                         if (listadp == adp)
11193                                 break;
11194                 if (listadp == NULL)
11195                         panic("handle_allocdirect_partdone: lost dep");
11196 #endif /* DEBUG */
11197                 return;
11198         }
11199         /*
11200          * If we have found the just finished dependency, then queue
11201          * it along with anything that follows it that is complete.
11202          * Since the pointer has not yet been written in the inode
11203          * as the dependency prevents it, place the allocdirect on the
11204          * bufwait list where it will be freed once the pointer is
11205          * valid.
11206          */
11207         if (wkhd == NULL)
11208                 wkhd = &inodedep->id_bufwait;
11209         for (; adp; adp = listadp) {
11210                 listadp = TAILQ_NEXT(adp, ad_next);
11211                 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11212                         return;
11213                 TAILQ_REMOVE(listhead, adp, ad_next);
11214                 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11215         }
11216 }
11217 
11218 /*
11219  * Called from within softdep_disk_write_complete above.  This routine
11220  * completes successfully written allocindirs.
11221  */
11222 static void
11223 handle_allocindir_partdone(aip)
11224         struct allocindir *aip;         /* the completed allocindir */
11225 {
11226         struct indirdep *indirdep;
11227 
11228         if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11229                 return;
11230         indirdep = aip->ai_indirdep;
11231         LIST_REMOVE(aip, ai_next);
11232         /*
11233          * Don't set a pointer while the buffer is undergoing IO or while
11234          * we have active truncations.
11235          */
11236         if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
11237                 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
11238                 return;
11239         }
11240         if (indirdep->ir_state & UFS1FMT)
11241                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11242                     aip->ai_newblkno;
11243         else
11244                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11245                     aip->ai_newblkno;
11246         /*
11247          * Await the pointer write before freeing the allocindir.
11248          */
11249         LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
11250 }
11251 
11252 /*
11253  * Release segments held on a jwork list.
11254  */
11255 static void
11256 handle_jwork(wkhd)
11257         struct workhead *wkhd;
11258 {
11259         struct worklist *wk;
11260 
11261         while ((wk = LIST_FIRST(wkhd)) != NULL) {
11262                 WORKLIST_REMOVE(wk);
11263                 switch (wk->wk_type) {
11264                 case D_JSEGDEP:
11265                         free_jsegdep(WK_JSEGDEP(wk));
11266                         continue;
11267                 case D_FREEDEP:
11268                         free_freedep(WK_FREEDEP(wk));
11269                         continue;
11270                 case D_FREEFRAG:
11271                         rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11272                         WORKITEM_FREE(wk, D_FREEFRAG);
11273                         continue;
11274                 case D_FREEWORK:
11275                         handle_written_freework(WK_FREEWORK(wk));
11276                         continue;
11277                 default:
11278                         panic("handle_jwork: Unknown type %s\n",
11279                             TYPENAME(wk->wk_type));
11280                 }
11281         }
11282 }
11283 
11284 /*
11285  * Handle the bufwait list on an inode when it is safe to release items
11286  * held there.  This normally happens after an inode block is written but
11287  * may be delayed and handled later if there are pending journal items that
11288  * are not yet safe to be released.
11289  */
11290 static struct freefile *
11291 handle_bufwait(inodedep, refhd)
11292         struct inodedep *inodedep;
11293         struct workhead *refhd;
11294 {
11295         struct jaddref *jaddref;
11296         struct freefile *freefile;
11297         struct worklist *wk;
11298 
11299         freefile = NULL;
11300         while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11301                 WORKLIST_REMOVE(wk);
11302                 switch (wk->wk_type) {
11303                 case D_FREEFILE:
11304                         /*
11305                          * We defer adding freefile to the worklist
11306                          * until all other additions have been made to
11307                          * ensure that it will be done after all the
11308                          * old blocks have been freed.
11309                          */
11310                         if (freefile != NULL)
11311                                 panic("handle_bufwait: freefile");
11312                         freefile = WK_FREEFILE(wk);
11313                         continue;
11314 
11315                 case D_MKDIR:
11316                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11317                         continue;
11318 
11319                 case D_DIRADD:
11320                         diradd_inode_written(WK_DIRADD(wk), inodedep);
11321                         continue;
11322 
11323                 case D_FREEFRAG:
11324                         wk->wk_state |= COMPLETE;
11325                         if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11326                                 add_to_worklist(wk, 0);
11327                         continue;
11328 
11329                 case D_DIRREM:
11330                         wk->wk_state |= COMPLETE;
11331                         add_to_worklist(wk, 0);
11332                         continue;
11333 
11334                 case D_ALLOCDIRECT:
11335                 case D_ALLOCINDIR:
11336                         free_newblk(WK_NEWBLK(wk));
11337                         continue;
11338 
11339                 case D_JNEWBLK:
11340                         wk->wk_state |= COMPLETE;
11341                         free_jnewblk(WK_JNEWBLK(wk));
11342                         continue;
11343 
11344                 /*
11345                  * Save freed journal segments and add references on
11346                  * the supplied list which will delay their release
11347                  * until the cg bitmap is cleared on disk.
11348                  */
11349                 case D_JSEGDEP:
11350                         if (refhd == NULL)
11351                                 free_jsegdep(WK_JSEGDEP(wk));
11352                         else
11353                                 WORKLIST_INSERT(refhd, wk);
11354                         continue;
11355 
11356                 case D_JADDREF:
11357                         jaddref = WK_JADDREF(wk);
11358                         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11359                             if_deps);
11360                         /*
11361                          * Transfer any jaddrefs to the list to be freed with
11362                          * the bitmap if we're handling a removed file.
11363                          */
11364                         if (refhd == NULL) {
11365                                 wk->wk_state |= COMPLETE;
11366                                 free_jaddref(jaddref);
11367                         } else
11368                                 WORKLIST_INSERT(refhd, wk);
11369                         continue;
11370 
11371                 default:
11372                         panic("handle_bufwait: Unknown type %p(%s)",
11373                             wk, TYPENAME(wk->wk_type));
11374                         /* NOTREACHED */
11375                 }
11376         }
11377         return (freefile);
11378 }
11379 /*
11380  * Called from within softdep_disk_write_complete above to restore
11381  * in-memory inode block contents to their most up-to-date state. Note
11382  * that this routine is always called from interrupt level with further
11383  * interrupts from this device blocked.
11384  *
11385  * If the write did not succeed, we will do all the roll-forward
11386  * operations, but we will not take the actions that will allow its
11387  * dependencies to be processed.
11388  */
11389 static int 
11390 handle_written_inodeblock(inodedep, bp, flags)
11391         struct inodedep *inodedep;
11392         struct buf *bp;         /* buffer containing the inode block */
11393         int flags;
11394 {
11395         struct freefile *freefile;
11396         struct allocdirect *adp, *nextadp;
11397         struct ufs1_dinode *dp1 = NULL;
11398         struct ufs2_dinode *dp2 = NULL;
11399         struct workhead wkhd;
11400         int hadchanges, fstype;
11401         ino_t freelink;
11402 
11403         LIST_INIT(&wkhd);
11404         hadchanges = 0;
11405         freefile = NULL;
11406         if ((inodedep->id_state & IOSTARTED) == 0)
11407                 panic("handle_written_inodeblock: not started");
11408         inodedep->id_state &= ~IOSTARTED;
11409         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11410                 fstype = UFS1;
11411                 dp1 = (struct ufs1_dinode *)bp->b_data +
11412                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11413                 freelink = dp1->di_freelink;
11414         } else {
11415                 fstype = UFS2;
11416                 dp2 = (struct ufs2_dinode *)bp->b_data +
11417                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11418                 freelink = dp2->di_freelink;
11419         }
11420         /*
11421          * Leave this inodeblock dirty until it's in the list.
11422          */
11423         if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
11424             (flags & WRITESUCCEEDED)) {
11425                 struct inodedep *inon;
11426 
11427                 inon = TAILQ_NEXT(inodedep, id_unlinked);
11428                 if ((inon == NULL && freelink == 0) ||
11429                     (inon && inon->id_ino == freelink)) {
11430                         if (inon)
11431                                 inon->id_state |= UNLINKPREV;
11432                         inodedep->id_state |= UNLINKNEXT;
11433                 }
11434                 hadchanges = 1;
11435         }
11436         /*
11437          * If we had to rollback the inode allocation because of
11438          * bitmaps being incomplete, then simply restore it.
11439          * Keep the block dirty so that it will not be reclaimed until
11440          * all associated dependencies have been cleared and the
11441          * corresponding updates written to disk.
11442          */
11443         if (inodedep->id_savedino1 != NULL) {
11444                 hadchanges = 1;
11445                 if (fstype == UFS1)
11446                         *dp1 = *inodedep->id_savedino1;
11447                 else
11448                         *dp2 = *inodedep->id_savedino2;
11449                 free(inodedep->id_savedino1, M_SAVEDINO);
11450                 inodedep->id_savedino1 = NULL;
11451                 if ((bp->b_flags & B_DELWRI) == 0)
11452                         stat_inode_bitmap++;
11453                 bdirty(bp);
11454                 /*
11455                  * If the inode is clear here and GOINGAWAY it will never
11456                  * be written.  Process the bufwait and clear any pending
11457                  * work which may include the freefile.
11458                  */
11459                 if (inodedep->id_state & GOINGAWAY)
11460                         goto bufwait;
11461                 return (1);
11462         }
11463         if (flags & WRITESUCCEEDED)
11464                 inodedep->id_state |= COMPLETE;
11465         /*
11466          * Roll forward anything that had to be rolled back before 
11467          * the inode could be updated.
11468          */
11469         for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11470                 nextadp = TAILQ_NEXT(adp, ad_next);
11471                 if (adp->ad_state & ATTACHED)
11472                         panic("handle_written_inodeblock: new entry");
11473                 if (fstype == UFS1) {
11474                         if (adp->ad_offset < NDADDR) {
11475                                 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11476                                         panic("%s %s #%jd mismatch %d != %jd",
11477                                             "handle_written_inodeblock:",
11478                                             "direct pointer",
11479                                             (intmax_t)adp->ad_offset,
11480                                             dp1->di_db[adp->ad_offset],
11481                                             (intmax_t)adp->ad_oldblkno);
11482                                 dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11483                         } else {
11484                                 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
11485                                         panic("%s: %s #%jd allocated as %d",
11486                                             "handle_written_inodeblock",
11487                                             "indirect pointer",
11488                                             (intmax_t)adp->ad_offset - NDADDR,
11489                                             dp1->di_ib[adp->ad_offset - NDADDR]);
11490                                 dp1->di_ib[adp->ad_offset - NDADDR] =
11491                                     adp->ad_newblkno;
11492                         }
11493                 } else {
11494                         if (adp->ad_offset < NDADDR) {
11495                                 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11496                                         panic("%s: %s #%jd %s %jd != %jd",
11497                                             "handle_written_inodeblock",
11498                                             "direct pointer",
11499                                             (intmax_t)adp->ad_offset, "mismatch",
11500                                             (intmax_t)dp2->di_db[adp->ad_offset],
11501                                             (intmax_t)adp->ad_oldblkno);
11502                                 dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11503                         } else {
11504                                 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
11505                                         panic("%s: %s #%jd allocated as %jd",
11506                                             "handle_written_inodeblock",
11507                                             "indirect pointer",
11508                                             (intmax_t)adp->ad_offset - NDADDR,
11509                                             (intmax_t)
11510                                             dp2->di_ib[adp->ad_offset - NDADDR]);
11511                                 dp2->di_ib[adp->ad_offset - NDADDR] =
11512                                     adp->ad_newblkno;
11513                         }
11514                 }
11515                 adp->ad_state &= ~UNDONE;
11516                 adp->ad_state |= ATTACHED;
11517                 hadchanges = 1;
11518         }
11519         for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11520                 nextadp = TAILQ_NEXT(adp, ad_next);
11521                 if (adp->ad_state & ATTACHED)
11522                         panic("handle_written_inodeblock: new entry");
11523                 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11524                         panic("%s: direct pointers #%jd %s %jd != %jd",
11525                             "handle_written_inodeblock",
11526                             (intmax_t)adp->ad_offset, "mismatch",
11527                             (intmax_t)dp2->di_extb[adp->ad_offset],
11528                             (intmax_t)adp->ad_oldblkno);
11529                 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11530                 adp->ad_state &= ~UNDONE;
11531                 adp->ad_state |= ATTACHED;
11532                 hadchanges = 1;
11533         }
11534         if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11535                 stat_direct_blk_ptrs++;
11536         /*
11537          * Reset the file size to its most up-to-date value.
11538          */
11539         if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11540                 panic("handle_written_inodeblock: bad size");
11541         if (inodedep->id_savednlink > LINK_MAX)
11542                 panic("handle_written_inodeblock: Invalid link count "
11543                     "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink,
11544                     inodedep);
11545         if (fstype == UFS1) {
11546                 if (dp1->di_nlink != inodedep->id_savednlink) { 
11547                         dp1->di_nlink = inodedep->id_savednlink;
11548                         hadchanges = 1;
11549                 }
11550                 if (dp1->di_size != inodedep->id_savedsize) {
11551                         dp1->di_size = inodedep->id_savedsize;
11552                         hadchanges = 1;
11553                 }
11554         } else {
11555                 if (dp2->di_nlink != inodedep->id_savednlink) { 
11556                         dp2->di_nlink = inodedep->id_savednlink;
11557                         hadchanges = 1;
11558                 }
11559                 if (dp2->di_size != inodedep->id_savedsize) {
11560                         dp2->di_size = inodedep->id_savedsize;
11561                         hadchanges = 1;
11562                 }
11563                 if (dp2->di_extsize != inodedep->id_savedextsize) {
11564                         dp2->di_extsize = inodedep->id_savedextsize;
11565                         hadchanges = 1;
11566                 }
11567         }
11568         inodedep->id_savedsize = -1;
11569         inodedep->id_savedextsize = -1;
11570         inodedep->id_savednlink = -1;
11571         /*
11572          * If there were any rollbacks in the inode block, then it must be
11573          * marked dirty so that its will eventually get written back in
11574          * its correct form.
11575          */
11576         if (hadchanges)
11577                 bdirty(bp);
11578 bufwait:
11579         /*
11580          * If the write did not succeed, we have done all the roll-forward
11581          * operations, but we cannot take the actions that will allow its
11582          * dependencies to be processed.
11583          */
11584         if ((flags & WRITESUCCEEDED) == 0)
11585                 return (hadchanges);
11586         /*
11587          * Process any allocdirects that completed during the update.
11588          */
11589         if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11590                 handle_allocdirect_partdone(adp, &wkhd);
11591         if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11592                 handle_allocdirect_partdone(adp, &wkhd);
11593         /*
11594          * Process deallocations that were held pending until the
11595          * inode had been written to disk. Freeing of the inode
11596          * is delayed until after all blocks have been freed to
11597          * avoid creation of new <vfsid, inum, lbn> triples
11598          * before the old ones have been deleted.  Completely
11599          * unlinked inodes are not processed until the unlinked
11600          * inode list is written or the last reference is removed.
11601          */
11602         if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11603                 freefile = handle_bufwait(inodedep, NULL);
11604                 if (freefile && !LIST_EMPTY(&wkhd)) {
11605                         WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11606                         freefile = NULL;
11607                 }
11608         }
11609         /*
11610          * Move rolled forward dependency completions to the bufwait list
11611          * now that those that were already written have been processed.
11612          */
11613         if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11614                 panic("handle_written_inodeblock: bufwait but no changes");
11615         jwork_move(&inodedep->id_bufwait, &wkhd);
11616 
11617         if (freefile != NULL) {
11618                 /*
11619                  * If the inode is goingaway it was never written.  Fake up
11620                  * the state here so free_inodedep() can succeed.
11621                  */
11622                 if (inodedep->id_state & GOINGAWAY)
11623                         inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11624                 if (free_inodedep(inodedep) == 0)
11625                         panic("handle_written_inodeblock: live inodedep %p",
11626                             inodedep);
11627                 add_to_worklist(&freefile->fx_list, 0);
11628                 return (0);
11629         }
11630 
11631         /*
11632          * If no outstanding dependencies, free it.
11633          */
11634         if (free_inodedep(inodedep) ||
11635             (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11636              TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11637              TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11638              LIST_FIRST(&inodedep->id_bufwait) == 0))
11639                 return (0);
11640         return (hadchanges);
11641 }
11642 
11643 /*
11644  * Perform needed roll-forwards and kick off any dependencies that
11645  * can now be processed.
11646  *
11647  * If the write did not succeed, we will do all the roll-forward
11648  * operations, but we will not take the actions that will allow its
11649  * dependencies to be processed.
11650  */
11651 static int
11652 handle_written_indirdep(indirdep, bp, bpp, flags)
11653         struct indirdep *indirdep;
11654         struct buf *bp;
11655         struct buf **bpp;
11656         int flags;
11657 {
11658         struct allocindir *aip;
11659         struct buf *sbp;
11660         int chgs;
11661 
11662         if (indirdep->ir_state & GOINGAWAY)
11663                 panic("handle_written_indirdep: indirdep gone");
11664         if ((indirdep->ir_state & IOSTARTED) == 0)
11665                 panic("handle_written_indirdep: IO not started");
11666         chgs = 0;
11667         /*
11668          * If there were rollbacks revert them here.
11669          */
11670         if (indirdep->ir_saveddata) {
11671                 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11672                 if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11673                         free(indirdep->ir_saveddata, M_INDIRDEP);
11674                         indirdep->ir_saveddata = NULL;
11675                 }
11676                 chgs = 1;
11677         }
11678         indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11679         indirdep->ir_state |= ATTACHED;
11680         /*
11681          * If the write did not succeed, we have done all the roll-forward
11682          * operations, but we cannot take the actions that will allow its
11683          * dependencies to be processed.
11684          */
11685         if ((flags & WRITESUCCEEDED) == 0) {
11686                 stat_indir_blk_ptrs++;
11687                 bdirty(bp);
11688                 return (1);
11689         }
11690         /*
11691          * Move allocindirs with written pointers to the completehd if
11692          * the indirdep's pointer is not yet written.  Otherwise
11693          * free them here.
11694          */
11695         while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) {
11696                 LIST_REMOVE(aip, ai_next);
11697                 if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11698                         LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11699                             ai_next);
11700                         newblk_freefrag(&aip->ai_block);
11701                         continue;
11702                 }
11703                 free_newblk(&aip->ai_block);
11704         }
11705         /*
11706          * Move allocindirs that have finished dependency processing from
11707          * the done list to the write list after updating the pointers.
11708          */
11709         if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11710                 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
11711                         handle_allocindir_partdone(aip);
11712                         if (aip == LIST_FIRST(&indirdep->ir_donehd))
11713                                 panic("disk_write_complete: not gone");
11714                         chgs = 1;
11715                 }
11716         }
11717         /*
11718          * Preserve the indirdep if there were any changes or if it is not
11719          * yet valid on disk.
11720          */
11721         if (chgs) {
11722                 stat_indir_blk_ptrs++;
11723                 bdirty(bp);
11724                 return (1);
11725         }
11726         /*
11727          * If there were no changes we can discard the savedbp and detach
11728          * ourselves from the buf.  We are only carrying completed pointers
11729          * in this case.
11730          */
11731         sbp = indirdep->ir_savebp;
11732         sbp->b_flags |= B_INVAL | B_NOCACHE;
11733         indirdep->ir_savebp = NULL;
11734         indirdep->ir_bp = NULL;
11735         if (*bpp != NULL)
11736                 panic("handle_written_indirdep: bp already exists.");
11737         *bpp = sbp;
11738         /*
11739          * The indirdep may not be freed until its parent points at it.
11740          */
11741         if (indirdep->ir_state & DEPCOMPLETE)
11742                 free_indirdep(indirdep);
11743 
11744         return (0);
11745 }
11746 
11747 /*
11748  * Process a diradd entry after its dependent inode has been written.
11749  * This routine must be called with splbio interrupts blocked.
11750  */
11751 static void
11752 diradd_inode_written(dap, inodedep)
11753         struct diradd *dap;
11754         struct inodedep *inodedep;
11755 {
11756 
11757         dap->da_state |= COMPLETE;
11758         complete_diradd(dap);
11759         WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11760 }
11761 
11762 /*
11763  * Returns true if the bmsafemap will have rollbacks when written.  Must only
11764  * be called with the per-filesystem lock and the buf lock on the cg held.
11765  */
11766 static int
11767 bmsafemap_backgroundwrite(bmsafemap, bp)
11768         struct bmsafemap *bmsafemap;
11769         struct buf *bp;
11770 {
11771         int dirty;
11772 
11773         LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
11774         dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 
11775             !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11776         /*
11777          * If we're initiating a background write we need to process the
11778          * rollbacks as they exist now, not as they exist when IO starts.
11779          * No other consumers will look at the contents of the shadowed
11780          * buf so this is safe to do here.
11781          */
11782         if (bp->b_xflags & BX_BKGRDMARKER)
11783                 initiate_write_bmsafemap(bmsafemap, bp);
11784 
11785         return (dirty);
11786 }
11787 
11788 /*
11789  * Re-apply an allocation when a cg write is complete.
11790  */
11791 static int
11792 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11793         struct jnewblk *jnewblk;
11794         struct fs *fs;
11795         struct cg *cgp;
11796         uint8_t *blksfree;
11797 {
11798         ufs1_daddr_t fragno;
11799         ufs2_daddr_t blkno;
11800         long cgbno, bbase;
11801         int frags, blk;
11802         int i;
11803 
11804         frags = 0;
11805         cgbno = dtogd(fs, jnewblk->jn_blkno);
11806         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11807                 if (isclr(blksfree, cgbno + i))
11808                         panic("jnewblk_rollforward: re-allocated fragment");
11809                 frags++;
11810         }
11811         if (frags == fs->fs_frag) {
11812                 blkno = fragstoblks(fs, cgbno);
11813                 ffs_clrblock(fs, blksfree, (long)blkno);
11814                 ffs_clusteracct(fs, cgp, blkno, -1);
11815                 cgp->cg_cs.cs_nbfree--;
11816         } else {
11817                 bbase = cgbno - fragnum(fs, cgbno);
11818                 cgbno += jnewblk->jn_oldfrags;
11819                 /* If a complete block had been reassembled, account for it. */
11820                 fragno = fragstoblks(fs, bbase);
11821                 if (ffs_isblock(fs, blksfree, fragno)) {
11822                         cgp->cg_cs.cs_nffree += fs->fs_frag;
11823                         ffs_clusteracct(fs, cgp, fragno, -1);
11824                         cgp->cg_cs.cs_nbfree--;
11825                 }
11826                 /* Decrement the old frags.  */
11827                 blk = blkmap(fs, blksfree, bbase);
11828                 ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11829                 /* Allocate the fragment */
11830                 for (i = 0; i < frags; i++)
11831                         clrbit(blksfree, cgbno + i);
11832                 cgp->cg_cs.cs_nffree -= frags;
11833                 /* Add back in counts associated with the new frags */
11834                 blk = blkmap(fs, blksfree, bbase);
11835                 ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11836         }
11837         return (frags);
11838 }
11839 
11840 /*
11841  * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11842  * changes if it's not a background write.  Set all written dependencies 
11843  * to DEPCOMPLETE and free the structure if possible.
11844  *
11845  * If the write did not succeed, we will do all the roll-forward
11846  * operations, but we will not take the actions that will allow its
11847  * dependencies to be processed.
11848  */
11849 static int
11850 handle_written_bmsafemap(bmsafemap, bp, flags)
11851         struct bmsafemap *bmsafemap;
11852         struct buf *bp;
11853         int flags;
11854 {
11855         struct newblk *newblk;
11856         struct inodedep *inodedep;
11857         struct jaddref *jaddref, *jatmp;
11858         struct jnewblk *jnewblk, *jntmp;
11859         struct ufsmount *ump;
11860         uint8_t *inosused;
11861         uint8_t *blksfree;
11862         struct cg *cgp;
11863         struct fs *fs;
11864         ino_t ino;
11865         int foreground;
11866         int chgs;
11867 
11868         if ((bmsafemap->sm_state & IOSTARTED) == 0)
11869                 panic("handle_written_bmsafemap: Not started\n");
11870         ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11871         chgs = 0;
11872         bmsafemap->sm_state &= ~IOSTARTED;
11873         foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
11874         /*
11875          * If write was successful, release journal work that was waiting
11876          * on the write. Otherwise move the work back.
11877          */
11878         if (flags & WRITESUCCEEDED)
11879                 handle_jwork(&bmsafemap->sm_freewr);
11880         else
11881                 LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
11882                     worklist, wk_list);
11883 
11884         /*
11885          * Restore unwritten inode allocation pending jaddref writes.
11886          */
11887         if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11888                 cgp = (struct cg *)bp->b_data;
11889                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11890                 inosused = cg_inosused(cgp);
11891                 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11892                     ja_bmdeps, jatmp) {
11893                         if ((jaddref->ja_state & UNDONE) == 0)
11894                                 continue;
11895                         ino = jaddref->ja_ino % fs->fs_ipg;
11896                         if (isset(inosused, ino))
11897                                 panic("handle_written_bmsafemap: "
11898                                     "re-allocated inode");
11899                         /* Do the roll-forward only if it's a real copy. */
11900                         if (foreground) {
11901                                 if ((jaddref->ja_mode & IFMT) == IFDIR)
11902                                         cgp->cg_cs.cs_ndir++;
11903                                 cgp->cg_cs.cs_nifree--;
11904                                 setbit(inosused, ino);
11905                                 chgs = 1;
11906                         }
11907                         jaddref->ja_state &= ~UNDONE;
11908                         jaddref->ja_state |= ATTACHED;
11909                         free_jaddref(jaddref);
11910                 }
11911         }
11912         /*
11913          * Restore any block allocations which are pending journal writes.
11914          */
11915         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11916                 cgp = (struct cg *)bp->b_data;
11917                 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11918                 blksfree = cg_blksfree(cgp);
11919                 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11920                     jntmp) {
11921                         if ((jnewblk->jn_state & UNDONE) == 0)
11922                                 continue;
11923                         /* Do the roll-forward only if it's a real copy. */
11924                         if (foreground &&
11925                             jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11926                                 chgs = 1;
11927                         jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11928                         jnewblk->jn_state |= ATTACHED;
11929                         free_jnewblk(jnewblk);
11930                 }
11931         }
11932         /*
11933          * If the write did not succeed, we have done all the roll-forward
11934          * operations, but we cannot take the actions that will allow its
11935          * dependencies to be processed.
11936          */
11937         if ((flags & WRITESUCCEEDED) == 0) {
11938                 LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
11939                     newblk, nb_deps);
11940                 LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
11941                     worklist, wk_list);
11942                 if (foreground)
11943                         bdirty(bp);
11944                 return (1);
11945         }
11946         while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11947                 newblk->nb_state |= DEPCOMPLETE;
11948                 newblk->nb_state &= ~ONDEPLIST;
11949                 newblk->nb_bmsafemap = NULL;
11950                 LIST_REMOVE(newblk, nb_deps);
11951                 if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11952                         handle_allocdirect_partdone(
11953                             WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11954                 else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11955                         handle_allocindir_partdone(
11956                             WK_ALLOCINDIR(&newblk->nb_list));
11957                 else if (newblk->nb_list.wk_type != D_NEWBLK)
11958                         panic("handle_written_bmsafemap: Unexpected type: %s",
11959                             TYPENAME(newblk->nb_list.wk_type));
11960         }
11961         while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11962                 inodedep->id_state |= DEPCOMPLETE;
11963                 inodedep->id_state &= ~ONDEPLIST;
11964                 LIST_REMOVE(inodedep, id_deps);
11965                 inodedep->id_bmsafemap = NULL;
11966         }
11967         LIST_REMOVE(bmsafemap, sm_next);
11968         if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11969             LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11970             LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11971             LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11972             LIST_EMPTY(&bmsafemap->sm_freehd)) {
11973                 LIST_REMOVE(bmsafemap, sm_hash);
11974                 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11975                 return (0);
11976         }
11977         LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11978         if (foreground)
11979                 bdirty(bp);
11980         return (1);
11981 }
11982 
11983 /*
11984  * Try to free a mkdir dependency.
11985  */
11986 static void
11987 complete_mkdir(mkdir)
11988         struct mkdir *mkdir;
11989 {
11990         struct diradd *dap;
11991 
11992         if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11993                 return;
11994         LIST_REMOVE(mkdir, md_mkdirs);
11995         dap = mkdir->md_diradd;
11996         dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11997         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11998                 dap->da_state |= DEPCOMPLETE;
11999                 complete_diradd(dap);
12000         }
12001         WORKITEM_FREE(mkdir, D_MKDIR);
12002 }
12003 
12004 /*
12005  * Handle the completion of a mkdir dependency.
12006  */
12007 static void
12008 handle_written_mkdir(mkdir, type)
12009         struct mkdir *mkdir;
12010         int type;
12011 {
12012 
12013         if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
12014                 panic("handle_written_mkdir: bad type");
12015         mkdir->md_state |= COMPLETE;
12016         complete_mkdir(mkdir);
12017 }
12018 
12019 static int
12020 free_pagedep(pagedep)
12021         struct pagedep *pagedep;
12022 {
12023         int i;
12024 
12025         if (pagedep->pd_state & NEWBLOCK)
12026                 return (0);
12027         if (!LIST_EMPTY(&pagedep->pd_dirremhd))
12028                 return (0);
12029         for (i = 0; i < DAHASHSZ; i++)
12030                 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
12031                         return (0);
12032         if (!LIST_EMPTY(&pagedep->pd_pendinghd))
12033                 return (0);
12034         if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
12035                 return (0);
12036         if (pagedep->pd_state & ONWORKLIST)
12037                 WORKLIST_REMOVE(&pagedep->pd_list);
12038         LIST_REMOVE(pagedep, pd_hash);
12039         WORKITEM_FREE(pagedep, D_PAGEDEP);
12040 
12041         return (1);
12042 }
12043 
12044 /*
12045  * Called from within softdep_disk_write_complete above.
12046  * A write operation was just completed. Removed inodes can
12047  * now be freed and associated block pointers may be committed.
12048  * Note that this routine is always called from interrupt level
12049  * with further interrupts from this device blocked.
12050  *
12051  * If the write did not succeed, we will do all the roll-forward
12052  * operations, but we will not take the actions that will allow its
12053  * dependencies to be processed.
12054  */
12055 static int 
12056 handle_written_filepage(pagedep, bp, flags)
12057         struct pagedep *pagedep;
12058         struct buf *bp;         /* buffer containing the written page */
12059         int flags;
12060 {
12061         struct dirrem *dirrem;
12062         struct diradd *dap, *nextdap;
12063         struct direct *ep;
12064         int i, chgs;
12065 
12066         if ((pagedep->pd_state & IOSTARTED) == 0)
12067                 panic("handle_written_filepage: not started");
12068         pagedep->pd_state &= ~IOSTARTED;
12069         if ((flags & WRITESUCCEEDED) == 0)
12070                 goto rollforward;
12071         /*
12072          * Process any directory removals that have been committed.
12073          */
12074         while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
12075                 LIST_REMOVE(dirrem, dm_next);
12076                 dirrem->dm_state |= COMPLETE;
12077                 dirrem->dm_dirinum = pagedep->pd_ino;
12078                 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
12079                     ("handle_written_filepage: Journal entries not written."));
12080                 add_to_worklist(&dirrem->dm_list, 0);
12081         }
12082         /*
12083          * Free any directory additions that have been committed.
12084          * If it is a newly allocated block, we have to wait until
12085          * the on-disk directory inode claims the new block.
12086          */
12087         if ((pagedep->pd_state & NEWBLOCK) == 0)
12088                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
12089                         free_diradd(dap, NULL);
12090 rollforward:
12091         /*
12092          * Uncommitted directory entries must be restored.
12093          */
12094         for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
12095                 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
12096                      dap = nextdap) {
12097                         nextdap = LIST_NEXT(dap, da_pdlist);
12098                         if (dap->da_state & ATTACHED)
12099                                 panic("handle_written_filepage: attached");
12100                         ep = (struct direct *)
12101                             ((char *)bp->b_data + dap->da_offset);
12102                         ep->d_ino = dap->da_newinum;
12103                         dap->da_state &= ~UNDONE;
12104                         dap->da_state |= ATTACHED;
12105                         chgs = 1;
12106                         /*
12107                          * If the inode referenced by the directory has
12108                          * been written out, then the dependency can be
12109                          * moved to the pending list.
12110                          */
12111                         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
12112                                 LIST_REMOVE(dap, da_pdlist);
12113                                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
12114                                     da_pdlist);
12115                         }
12116                 }
12117         }
12118         /*
12119          * If there were any rollbacks in the directory, then it must be
12120          * marked dirty so that its will eventually get written back in
12121          * its correct form.
12122          */
12123         if (chgs || (flags & WRITESUCCEEDED) == 0) {
12124                 if ((bp->b_flags & B_DELWRI) == 0)
12125                         stat_dir_entry++;
12126                 bdirty(bp);
12127                 return (1);
12128         }
12129         /*
12130          * If we are not waiting for a new directory block to be
12131          * claimed by its inode, then the pagedep will be freed.
12132          * Otherwise it will remain to track any new entries on
12133          * the page in case they are fsync'ed.
12134          */
12135         free_pagedep(pagedep);
12136         return (0);
12137 }
12138 
12139 /*
12140  * Writing back in-core inode structures.
12141  * 
12142  * The filesystem only accesses an inode's contents when it occupies an
12143  * "in-core" inode structure.  These "in-core" structures are separate from
12144  * the page frames used to cache inode blocks.  Only the latter are
12145  * transferred to/from the disk.  So, when the updated contents of the
12146  * "in-core" inode structure are copied to the corresponding in-memory inode
12147  * block, the dependencies are also transferred.  The following procedure is
12148  * called when copying a dirty "in-core" inode to a cached inode block.
12149  */
12150 
12151 /*
12152  * Called when an inode is loaded from disk. If the effective link count
12153  * differed from the actual link count when it was last flushed, then we
12154  * need to ensure that the correct effective link count is put back.
12155  */
12156 void 
12157 softdep_load_inodeblock(ip)
12158         struct inode *ip;       /* the "in_core" copy of the inode */
12159 {
12160         struct inodedep *inodedep;
12161         struct ufsmount *ump;
12162 
12163         ump = ITOUMP(ip);
12164         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
12165             ("softdep_load_inodeblock called on non-softdep filesystem"));
12166         /*
12167          * Check for alternate nlink count.
12168          */
12169         ip->i_effnlink = ip->i_nlink;
12170         ACQUIRE_LOCK(ump);
12171         if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) {
12172                 FREE_LOCK(ump);
12173                 return;
12174         }
12175         ip->i_effnlink -= inodedep->id_nlinkdelta;
12176         FREE_LOCK(ump);
12177 }
12178 
12179 /*
12180  * This routine is called just before the "in-core" inode
12181  * information is to be copied to the in-memory inode block.
12182  * Recall that an inode block contains several inodes. If
12183  * the force flag is set, then the dependencies will be
12184  * cleared so that the update can always be made. Note that
12185  * the buffer is locked when this routine is called, so we
12186  * will never be in the middle of writing the inode block 
12187  * to disk.
12188  */
12189 void 
12190 softdep_update_inodeblock(ip, bp, waitfor)
12191         struct inode *ip;       /* the "in_core" copy of the inode */
12192         struct buf *bp;         /* the buffer containing the inode block */
12193         int waitfor;            /* nonzero => update must be allowed */
12194 {
12195         struct inodedep *inodedep;
12196         struct inoref *inoref;
12197         struct ufsmount *ump;
12198         struct worklist *wk;
12199         struct mount *mp;
12200         struct buf *ibp;
12201         struct fs *fs;
12202         int error;
12203 
12204         ump = ITOUMP(ip);
12205         mp = UFSTOVFS(ump);
12206         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12207             ("softdep_update_inodeblock called on non-softdep filesystem"));
12208         fs = ump->um_fs;
12209         /*
12210          * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12211          * does not have access to the in-core ip so must write directly into
12212          * the inode block buffer when setting freelink.
12213          */
12214         if (fs->fs_magic == FS_UFS1_MAGIC)
12215                 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
12216                     ino_to_fsbo(fs, ip->i_number))->di_freelink);
12217         else
12218                 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
12219                     ino_to_fsbo(fs, ip->i_number))->di_freelink);
12220         /*
12221          * If the effective link count is not equal to the actual link
12222          * count, then we must track the difference in an inodedep while
12223          * the inode is (potentially) tossed out of the cache. Otherwise,
12224          * if there is no existing inodedep, then there are no dependencies
12225          * to track.
12226          */
12227         ACQUIRE_LOCK(ump);
12228 again:
12229         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12230                 FREE_LOCK(ump);
12231                 if (ip->i_effnlink != ip->i_nlink)
12232                         panic("softdep_update_inodeblock: bad link count");
12233                 return;
12234         }
12235         if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12236                 panic("softdep_update_inodeblock: bad delta");
12237         /*
12238          * If we're flushing all dependencies we must also move any waiting
12239          * for journal writes onto the bufwait list prior to I/O.
12240          */
12241         if (waitfor) {
12242                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12243                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12244                             == DEPCOMPLETE) {
12245                                 jwait(&inoref->if_list, MNT_WAIT);
12246                                 goto again;
12247                         }
12248                 }
12249         }
12250         /*
12251          * Changes have been initiated. Anything depending on these
12252          * changes cannot occur until this inode has been written.
12253          */
12254         inodedep->id_state &= ~COMPLETE;
12255         if ((inodedep->id_state & ONWORKLIST) == 0)
12256                 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12257         /*
12258          * Any new dependencies associated with the incore inode must 
12259          * now be moved to the list associated with the buffer holding
12260          * the in-memory copy of the inode. Once merged process any
12261          * allocdirects that are completed by the merger.
12262          */
12263         merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12264         if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12265                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12266                     NULL);
12267         merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12268         if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12269                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12270                     NULL);
12271         /*
12272          * Now that the inode has been pushed into the buffer, the
12273          * operations dependent on the inode being written to disk
12274          * can be moved to the id_bufwait so that they will be
12275          * processed when the buffer I/O completes.
12276          */
12277         while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12278                 WORKLIST_REMOVE(wk);
12279                 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12280         }
12281         /*
12282          * Newly allocated inodes cannot be written until the bitmap
12283          * that allocates them have been written (indicated by
12284          * DEPCOMPLETE being set in id_state). If we are doing a
12285          * forced sync (e.g., an fsync on a file), we force the bitmap
12286          * to be written so that the update can be done.
12287          */
12288         if (waitfor == 0) {
12289                 FREE_LOCK(ump);
12290                 return;
12291         }
12292 retry:
12293         if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12294                 FREE_LOCK(ump);
12295                 return;
12296         }
12297         ibp = inodedep->id_bmsafemap->sm_buf;
12298         ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12299         if (ibp == NULL) {
12300                 /*
12301                  * If ibp came back as NULL, the dependency could have been
12302                  * freed while we slept.  Look it up again, and check to see
12303                  * that it has completed.
12304                  */
12305                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12306                         goto retry;
12307                 FREE_LOCK(ump);
12308                 return;
12309         }
12310         FREE_LOCK(ump);
12311         if ((error = bwrite(ibp)) != 0)
12312                 softdep_error("softdep_update_inodeblock: bwrite", error);
12313 }
12314 
12315 /*
12316  * Merge the a new inode dependency list (such as id_newinoupdt) into an
12317  * old inode dependency list (such as id_inoupdt). This routine must be
12318  * called with splbio interrupts blocked.
12319  */
12320 static void
12321 merge_inode_lists(newlisthead, oldlisthead)
12322         struct allocdirectlst *newlisthead;
12323         struct allocdirectlst *oldlisthead;
12324 {
12325         struct allocdirect *listadp, *newadp;
12326 
12327         newadp = TAILQ_FIRST(newlisthead);
12328         for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
12329                 if (listadp->ad_offset < newadp->ad_offset) {
12330                         listadp = TAILQ_NEXT(listadp, ad_next);
12331                         continue;
12332                 }
12333                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
12334                 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12335                 if (listadp->ad_offset == newadp->ad_offset) {
12336                         allocdirect_merge(oldlisthead, newadp,
12337                             listadp);
12338                         listadp = newadp;
12339                 }
12340                 newadp = TAILQ_FIRST(newlisthead);
12341         }
12342         while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12343                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
12344                 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12345         }
12346 }
12347 
12348 /*
12349  * If we are doing an fsync, then we must ensure that any directory
12350  * entries for the inode have been written after the inode gets to disk.
12351  */
12352 int
12353 softdep_fsync(vp)
12354         struct vnode *vp;       /* the "in_core" copy of the inode */
12355 {
12356         struct inodedep *inodedep;
12357         struct pagedep *pagedep;
12358         struct inoref *inoref;
12359         struct ufsmount *ump;
12360         struct worklist *wk;
12361         struct diradd *dap;
12362         struct mount *mp;
12363         struct vnode *pvp;
12364         struct inode *ip;
12365         struct buf *bp;
12366         struct fs *fs;
12367         struct thread *td = curthread;
12368         int error, flushparent, pagedep_new_block;
12369         ino_t parentino;
12370         ufs_lbn_t lbn;
12371 
12372         ip = VTOI(vp);
12373         mp = vp->v_mount;
12374         ump = VFSTOUFS(mp);
12375         fs = ump->um_fs;
12376         if (MOUNTEDSOFTDEP(mp) == 0)
12377                 return (0);
12378         ACQUIRE_LOCK(ump);
12379 restart:
12380         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12381                 FREE_LOCK(ump);
12382                 return (0);
12383         }
12384         TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12385                 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12386                     == DEPCOMPLETE) {
12387                         jwait(&inoref->if_list, MNT_WAIT);
12388                         goto restart;
12389                 }
12390         }
12391         if (!LIST_EMPTY(&inodedep->id_inowait) ||
12392             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12393             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12394             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12395             !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12396                 panic("softdep_fsync: pending ops %p", inodedep);
12397         for (error = 0, flushparent = 0; ; ) {
12398                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12399                         break;
12400                 if (wk->wk_type != D_DIRADD)
12401                         panic("softdep_fsync: Unexpected type %s",
12402                             TYPENAME(wk->wk_type));
12403                 dap = WK_DIRADD(wk);
12404                 /*
12405                  * Flush our parent if this directory entry has a MKDIR_PARENT
12406                  * dependency or is contained in a newly allocated block.
12407                  */
12408                 if (dap->da_state & DIRCHG)
12409                         pagedep = dap->da_previous->dm_pagedep;
12410                 else
12411                         pagedep = dap->da_pagedep;
12412                 parentino = pagedep->pd_ino;
12413                 lbn = pagedep->pd_lbn;
12414                 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12415                         panic("softdep_fsync: dirty");
12416                 if ((dap->da_state & MKDIR_PARENT) ||
12417                     (pagedep->pd_state & NEWBLOCK))
12418                         flushparent = 1;
12419                 else
12420                         flushparent = 0;
12421                 /*
12422                  * If we are being fsync'ed as part of vgone'ing this vnode,
12423                  * then we will not be able to release and recover the
12424                  * vnode below, so we just have to give up on writing its
12425                  * directory entry out. It will eventually be written, just
12426                  * not now, but then the user was not asking to have it
12427                  * written, so we are not breaking any promises.
12428                  */
12429                 if (vp->v_iflag & VI_DOOMED)
12430                         break;
12431                 /*
12432                  * We prevent deadlock by always fetching inodes from the
12433                  * root, moving down the directory tree. Thus, when fetching
12434                  * our parent directory, we first try to get the lock. If
12435                  * that fails, we must unlock ourselves before requesting
12436                  * the lock on our parent. See the comment in ufs_lookup
12437                  * for details on possible races.
12438                  */
12439                 FREE_LOCK(ump);
12440                 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
12441                     FFSV_FORCEINSMQ)) {
12442                         /*
12443                          * Unmount cannot proceed after unlock because
12444                          * caller must have called vn_start_write().
12445                          */
12446                         VOP_UNLOCK(vp, 0);
12447                         error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12448                             &pvp, FFSV_FORCEINSMQ);
12449                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12450                         if (vp->v_iflag & VI_DOOMED) {
12451                                 if (error == 0)
12452                                         vput(pvp);
12453                                 error = ENOENT;
12454                         }
12455                         if (error != 0)
12456                                 return (error);
12457                 }
12458                 /*
12459                  * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12460                  * that are contained in direct blocks will be resolved by 
12461                  * doing a ffs_update. Pagedeps contained in indirect blocks
12462                  * may require a complete sync'ing of the directory. So, we
12463                  * try the cheap and fast ffs_update first, and if that fails,
12464                  * then we do the slower ffs_syncvnode of the directory.
12465                  */
12466                 if (flushparent) {
12467                         int locked;
12468 
12469                         if ((error = ffs_update(pvp, 1)) != 0) {
12470                                 vput(pvp);
12471                                 return (error);
12472                         }
12473                         ACQUIRE_LOCK(ump);
12474                         locked = 1;
12475                         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12476                                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12477                                         if (wk->wk_type != D_DIRADD)
12478                                                 panic("softdep_fsync: Unexpected type %s",
12479                                                       TYPENAME(wk->wk_type));
12480                                         dap = WK_DIRADD(wk);
12481                                         if (dap->da_state & DIRCHG)
12482                                                 pagedep = dap->da_previous->dm_pagedep;
12483                                         else
12484                                                 pagedep = dap->da_pagedep;
12485                                         pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12486                                         FREE_LOCK(ump);
12487                                         locked = 0;
12488                                         if (pagedep_new_block && (error =
12489                                             ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12490                                                 vput(pvp);
12491                                                 return (error);
12492                                         }
12493                                 }
12494                         }
12495                         if (locked)
12496                                 FREE_LOCK(ump);
12497                 }
12498                 /*
12499                  * Flush directory page containing the inode's name.
12500                  */
12501                 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12502                     &bp);
12503                 if (error == 0)
12504                         error = bwrite(bp);
12505                 else
12506                         brelse(bp);
12507                 vput(pvp);
12508                 if (error != 0)
12509                         return (error);
12510                 ACQUIRE_LOCK(ump);
12511                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12512                         break;
12513         }
12514         FREE_LOCK(ump);
12515         return (0);
12516 }
12517 
12518 /*
12519  * Flush all the dirty bitmaps associated with the block device
12520  * before flushing the rest of the dirty blocks so as to reduce
12521  * the number of dependencies that will have to be rolled back.
12522  *
12523  * XXX Unused?
12524  */
12525 void
12526 softdep_fsync_mountdev(vp)
12527         struct vnode *vp;
12528 {
12529         struct buf *bp, *nbp;
12530         struct worklist *wk;
12531         struct bufobj *bo;
12532 
12533         if (!vn_isdisk(vp, NULL))
12534                 panic("softdep_fsync_mountdev: vnode not a disk");
12535         bo = &vp->v_bufobj;
12536 restart:
12537         BO_LOCK(bo);
12538         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12539                 /* 
12540                  * If it is already scheduled, skip to the next buffer.
12541                  */
12542                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12543                         continue;
12544 
12545                 if ((bp->b_flags & B_DELWRI) == 0)
12546                         panic("softdep_fsync_mountdev: not dirty");
12547                 /*
12548                  * We are only interested in bitmaps with outstanding
12549                  * dependencies.
12550                  */
12551                 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12552                     wk->wk_type != D_BMSAFEMAP ||
12553                     (bp->b_vflags & BV_BKGRDINPROG)) {
12554                         BUF_UNLOCK(bp);
12555                         continue;
12556                 }
12557                 BO_UNLOCK(bo);
12558                 bremfree(bp);
12559                 (void) bawrite(bp);
12560                 goto restart;
12561         }
12562         drain_output(vp);
12563         BO_UNLOCK(bo);
12564 }
12565 
12566 /*
12567  * Sync all cylinder groups that were dirty at the time this function is
12568  * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12569  * is used to flush freedep activity that may be holding up writes to a
12570  * indirect block.
12571  */
12572 static int
12573 sync_cgs(mp, waitfor)
12574         struct mount *mp;
12575         int waitfor;
12576 {
12577         struct bmsafemap *bmsafemap;
12578         struct bmsafemap *sentinel;
12579         struct ufsmount *ump;
12580         struct buf *bp;
12581         int error;
12582 
12583         sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12584         sentinel->sm_cg = -1;
12585         ump = VFSTOUFS(mp);
12586         error = 0;
12587         ACQUIRE_LOCK(ump);
12588         LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
12589         for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12590             bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12591                 /* Skip sentinels and cgs with no work to release. */
12592                 if (bmsafemap->sm_cg == -1 ||
12593                     (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12594                     LIST_EMPTY(&bmsafemap->sm_freewr))) {
12595                         LIST_REMOVE(sentinel, sm_next);
12596                         LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12597                         continue;
12598                 }
12599                 /*
12600                  * If we don't get the lock and we're waiting try again, if
12601                  * not move on to the next buf and try to sync it.
12602                  */
12603                 bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
12604                 if (bp == NULL && waitfor == MNT_WAIT)
12605                         continue;
12606                 LIST_REMOVE(sentinel, sm_next);
12607                 LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12608                 if (bp == NULL)
12609                         continue;
12610                 FREE_LOCK(ump);
12611                 if (waitfor == MNT_NOWAIT)
12612                         bawrite(bp);
12613                 else
12614                         error = bwrite(bp);
12615                 ACQUIRE_LOCK(ump);
12616                 if (error)
12617                         break;
12618         }
12619         LIST_REMOVE(sentinel, sm_next);
12620         FREE_LOCK(ump);
12621         free(sentinel, M_BMSAFEMAP);
12622         return (error);
12623 }
12624 
12625 /*
12626  * This routine is called when we are trying to synchronously flush a
12627  * file. This routine must eliminate any filesystem metadata dependencies
12628  * so that the syncing routine can succeed.
12629  */
12630 int
12631 softdep_sync_metadata(struct vnode *vp)
12632 {
12633         struct inode *ip;
12634         int error;
12635 
12636         ip = VTOI(vp);
12637         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12638             ("softdep_sync_metadata called on non-softdep filesystem"));
12639         /*
12640          * Ensure that any direct block dependencies have been cleared,
12641          * truncations are started, and inode references are journaled.
12642          */
12643         ACQUIRE_LOCK(VFSTOUFS(vp->v_mount));
12644         /*
12645          * Write all journal records to prevent rollbacks on devvp.
12646          */
12647         if (vp->v_type == VCHR)
12648                 softdep_flushjournal(vp->v_mount);
12649         error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12650         /*
12651          * Ensure that all truncates are written so we won't find deps on
12652          * indirect blocks.
12653          */
12654         process_truncates(vp);
12655         FREE_LOCK(VFSTOUFS(vp->v_mount));
12656 
12657         return (error);
12658 }
12659 
12660 /*
12661  * This routine is called when we are attempting to sync a buf with
12662  * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12663  * other IO it can but returns EBUSY if the buffer is not yet able to
12664  * be written.  Dependencies which will not cause rollbacks will always
12665  * return 0.
12666  */
12667 int
12668 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12669 {
12670         struct indirdep *indirdep;
12671         struct pagedep *pagedep;
12672         struct allocindir *aip;
12673         struct newblk *newblk;
12674         struct ufsmount *ump;
12675         struct buf *nbp;
12676         struct worklist *wk;
12677         int i, error;
12678 
12679         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12680             ("softdep_sync_buf called on non-softdep filesystem"));
12681         /*
12682          * For VCHR we just don't want to force flush any dependencies that
12683          * will cause rollbacks.
12684          */
12685         if (vp->v_type == VCHR) {
12686                 if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12687                         return (EBUSY);
12688                 return (0);
12689         }
12690         ump = VFSTOUFS(vp->v_mount);
12691         ACQUIRE_LOCK(ump);
12692         /*
12693          * As we hold the buffer locked, none of its dependencies
12694          * will disappear.
12695          */
12696         error = 0;
12697 top:
12698         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12699                 switch (wk->wk_type) {
12700 
12701                 case D_ALLOCDIRECT:
12702                 case D_ALLOCINDIR:
12703                         newblk = WK_NEWBLK(wk);
12704                         if (newblk->nb_jnewblk != NULL) {
12705                                 if (waitfor == MNT_NOWAIT) {
12706                                         error = EBUSY;
12707                                         goto out_unlock;
12708                                 }
12709                                 jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12710                                 goto top;
12711                         }
12712                         if (newblk->nb_state & DEPCOMPLETE ||
12713                             waitfor == MNT_NOWAIT)
12714                                 continue;
12715                         nbp = newblk->nb_bmsafemap->sm_buf;
12716                         nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12717                         if (nbp == NULL)
12718                                 goto top;
12719                         FREE_LOCK(ump);
12720                         if ((error = bwrite(nbp)) != 0)
12721                                 goto out;
12722                         ACQUIRE_LOCK(ump);
12723                         continue;
12724 
12725                 case D_INDIRDEP:
12726                         indirdep = WK_INDIRDEP(wk);
12727                         if (waitfor == MNT_NOWAIT) {
12728                                 if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12729                                     !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12730                                         error = EBUSY;
12731                                         goto out_unlock;
12732                                 }
12733                         }
12734                         if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12735                                 panic("softdep_sync_buf: truncation pending.");
12736                 restart:
12737                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12738                                 newblk = (struct newblk *)aip;
12739                                 if (newblk->nb_jnewblk != NULL) {
12740                                         jwait(&newblk->nb_jnewblk->jn_list,
12741                                             waitfor);
12742                                         goto restart;
12743                                 }
12744                                 if (newblk->nb_state & DEPCOMPLETE)
12745                                         continue;
12746                                 nbp = newblk->nb_bmsafemap->sm_buf;
12747                                 nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12748                                 if (nbp == NULL)
12749                                         goto restart;
12750                                 FREE_LOCK(ump);
12751                                 if ((error = bwrite(nbp)) != 0)
12752                                         goto out;
12753                                 ACQUIRE_LOCK(ump);
12754                                 goto restart;
12755                         }
12756                         continue;
12757 
12758                 case D_PAGEDEP:
12759                         /*
12760                          * Only flush directory entries in synchronous passes.
12761                          */
12762                         if (waitfor != MNT_WAIT) {
12763                                 error = EBUSY;
12764                                 goto out_unlock;
12765                         }
12766                         /*
12767                          * While syncing snapshots, we must allow recursive
12768                          * lookups.
12769                          */
12770                         BUF_AREC(bp);
12771                         /*
12772                          * We are trying to sync a directory that may
12773                          * have dependencies on both its own metadata
12774                          * and/or dependencies on the inodes of any
12775                          * recently allocated files. We walk its diradd
12776                          * lists pushing out the associated inode.
12777                          */
12778                         pagedep = WK_PAGEDEP(wk);
12779                         for (i = 0; i < DAHASHSZ; i++) {
12780                                 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12781                                         continue;
12782                                 if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12783                                     &pagedep->pd_diraddhd[i]))) {
12784                                         BUF_NOREC(bp);
12785                                         goto out_unlock;
12786                                 }
12787                         }
12788                         BUF_NOREC(bp);
12789                         continue;
12790 
12791                 case D_FREEWORK:
12792                 case D_FREEDEP:
12793                 case D_JSEGDEP:
12794                 case D_JNEWBLK:
12795                         continue;
12796 
12797                 default:
12798                         panic("softdep_sync_buf: Unknown type %s",
12799                             TYPENAME(wk->wk_type));
12800                         /* NOTREACHED */
12801                 }
12802         }
12803 out_unlock:
12804         FREE_LOCK(ump);
12805 out:
12806         return (error);
12807 }
12808 
12809 /*
12810  * Flush the dependencies associated with an inodedep.
12811  * Called with splbio blocked.
12812  */
12813 static int
12814 flush_inodedep_deps(vp, mp, ino)
12815         struct vnode *vp;
12816         struct mount *mp;
12817         ino_t ino;
12818 {
12819         struct inodedep *inodedep;
12820         struct inoref *inoref;
12821         struct ufsmount *ump;
12822         int error, waitfor;
12823 
12824         /*
12825          * This work is done in two passes. The first pass grabs most
12826          * of the buffers and begins asynchronously writing them. The
12827          * only way to wait for these asynchronous writes is to sleep
12828          * on the filesystem vnode which may stay busy for a long time
12829          * if the filesystem is active. So, instead, we make a second
12830          * pass over the dependencies blocking on each write. In the
12831          * usual case we will be blocking against a write that we
12832          * initiated, so when it is done the dependency will have been
12833          * resolved. Thus the second pass is expected to end quickly.
12834          * We give a brief window at the top of the loop to allow
12835          * any pending I/O to complete.
12836          */
12837         ump = VFSTOUFS(mp);
12838         LOCK_OWNED(ump);
12839         for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12840                 if (error)
12841                         return (error);
12842                 FREE_LOCK(ump);
12843                 ACQUIRE_LOCK(ump);
12844 restart:
12845                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12846                         return (0);
12847                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12848                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12849                             == DEPCOMPLETE) {
12850                                 jwait(&inoref->if_list, MNT_WAIT);
12851                                 goto restart;
12852                         }
12853                 }
12854                 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12855                     flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12856                     flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12857                     flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12858                         continue;
12859                 /*
12860                  * If pass2, we are done, otherwise do pass 2.
12861                  */
12862                 if (waitfor == MNT_WAIT)
12863                         break;
12864                 waitfor = MNT_WAIT;
12865         }
12866         /*
12867          * Try freeing inodedep in case all dependencies have been removed.
12868          */
12869         if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12870                 (void) free_inodedep(inodedep);
12871         return (0);
12872 }
12873 
12874 /*
12875  * Flush an inode dependency list.
12876  * Called with splbio blocked.
12877  */
12878 static int
12879 flush_deplist(listhead, waitfor, errorp)
12880         struct allocdirectlst *listhead;
12881         int waitfor;
12882         int *errorp;
12883 {
12884         struct allocdirect *adp;
12885         struct newblk *newblk;
12886         struct ufsmount *ump;
12887         struct buf *bp;
12888 
12889         if ((adp = TAILQ_FIRST(listhead)) == NULL)
12890                 return (0);
12891         ump = VFSTOUFS(adp->ad_list.wk_mp);
12892         LOCK_OWNED(ump);
12893         TAILQ_FOREACH(adp, listhead, ad_next) {
12894                 newblk = (struct newblk *)adp;
12895                 if (newblk->nb_jnewblk != NULL) {
12896                         jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12897                         return (1);
12898                 }
12899                 if (newblk->nb_state & DEPCOMPLETE)
12900                         continue;
12901                 bp = newblk->nb_bmsafemap->sm_buf;
12902                 bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
12903                 if (bp == NULL) {
12904                         if (waitfor == MNT_NOWAIT)
12905                                 continue;
12906                         return (1);
12907                 }
12908                 FREE_LOCK(ump);
12909                 if (waitfor == MNT_NOWAIT)
12910                         bawrite(bp);
12911                 else 
12912                         *errorp = bwrite(bp);
12913                 ACQUIRE_LOCK(ump);
12914                 return (1);
12915         }
12916         return (0);
12917 }
12918 
12919 /*
12920  * Flush dependencies associated with an allocdirect block.
12921  */
12922 static int
12923 flush_newblk_dep(vp, mp, lbn)
12924         struct vnode *vp;
12925         struct mount *mp;
12926         ufs_lbn_t lbn;
12927 {
12928         struct newblk *newblk;
12929         struct ufsmount *ump;
12930         struct bufobj *bo;
12931         struct inode *ip;
12932         struct buf *bp;
12933         ufs2_daddr_t blkno;
12934         int error;
12935 
12936         error = 0;
12937         bo = &vp->v_bufobj;
12938         ip = VTOI(vp);
12939         blkno = DIP(ip, i_db[lbn]);
12940         if (blkno == 0)
12941                 panic("flush_newblk_dep: Missing block");
12942         ump = VFSTOUFS(mp);
12943         ACQUIRE_LOCK(ump);
12944         /*
12945          * Loop until all dependencies related to this block are satisfied.
12946          * We must be careful to restart after each sleep in case a write
12947          * completes some part of this process for us.
12948          */
12949         for (;;) {
12950                 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12951                         FREE_LOCK(ump);
12952                         break;
12953                 }
12954                 if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12955                         panic("flush_newblk_deps: Bad newblk %p", newblk);
12956                 /*
12957                  * Flush the journal.
12958                  */
12959                 if (newblk->nb_jnewblk != NULL) {
12960                         jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12961                         continue;
12962                 }
12963                 /*
12964                  * Write the bitmap dependency.
12965                  */
12966                 if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12967                         bp = newblk->nb_bmsafemap->sm_buf;
12968                         bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
12969                         if (bp == NULL)
12970                                 continue;
12971                         FREE_LOCK(ump);
12972                         error = bwrite(bp);
12973                         if (error)
12974                                 break;
12975                         ACQUIRE_LOCK(ump);
12976                         continue;
12977                 }
12978                 /*
12979                  * Write the buffer.
12980                  */
12981                 FREE_LOCK(ump);
12982                 BO_LOCK(bo);
12983                 bp = gbincore(bo, lbn);
12984                 if (bp != NULL) {
12985                         error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12986                             LK_INTERLOCK, BO_LOCKPTR(bo));
12987                         if (error == ENOLCK) {
12988                                 ACQUIRE_LOCK(ump);
12989                                 error = 0;
12990                                 continue; /* Slept, retry */
12991                         }
12992                         if (error != 0)
12993                                 break;  /* Failed */
12994                         if (bp->b_flags & B_DELWRI) {
12995                                 bremfree(bp);
12996                                 error = bwrite(bp);
12997                                 if (error)
12998                                         break;
12999                         } else
13000                                 BUF_UNLOCK(bp);
13001                 } else
13002                         BO_UNLOCK(bo);
13003                 /*
13004                  * We have to wait for the direct pointers to
13005                  * point at the newdirblk before the dependency
13006                  * will go away.
13007                  */
13008                 error = ffs_update(vp, 1);
13009                 if (error)
13010                         break;
13011                 ACQUIRE_LOCK(ump);
13012         }
13013         return (error);
13014 }
13015 
13016 /*
13017  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
13018  * Called with splbio blocked.
13019  */
13020 static int
13021 flush_pagedep_deps(pvp, mp, diraddhdp)
13022         struct vnode *pvp;
13023         struct mount *mp;
13024         struct diraddhd *diraddhdp;
13025 {
13026         struct inodedep *inodedep;
13027         struct inoref *inoref;
13028         struct ufsmount *ump;
13029         struct diradd *dap;
13030         struct vnode *vp;
13031         int error = 0;
13032         struct buf *bp;
13033         ino_t inum;
13034         struct diraddhd unfinished;
13035 
13036         LIST_INIT(&unfinished);
13037         ump = VFSTOUFS(mp);
13038         LOCK_OWNED(ump);
13039 restart:
13040         while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
13041                 /*
13042                  * Flush ourselves if this directory entry
13043                  * has a MKDIR_PARENT dependency.
13044                  */
13045                 if (dap->da_state & MKDIR_PARENT) {
13046                         FREE_LOCK(ump);
13047                         if ((error = ffs_update(pvp, 1)) != 0)
13048                                 break;
13049                         ACQUIRE_LOCK(ump);
13050                         /*
13051                          * If that cleared dependencies, go on to next.
13052                          */
13053                         if (dap != LIST_FIRST(diraddhdp))
13054                                 continue;
13055                         /*
13056                          * All MKDIR_PARENT dependencies and all the
13057                          * NEWBLOCK pagedeps that are contained in direct
13058                          * blocks were resolved by doing above ffs_update.
13059                          * Pagedeps contained in indirect blocks may
13060                          * require a complete sync'ing of the directory.
13061                          * We are in the midst of doing a complete sync,
13062                          * so if they are not resolved in this pass we
13063                          * defer them for now as they will be sync'ed by
13064                          * our caller shortly.
13065                          */
13066                         LIST_REMOVE(dap, da_pdlist);
13067                         LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
13068                         continue;
13069                 }
13070                 /*
13071                  * A newly allocated directory must have its "." and
13072                  * ".." entries written out before its name can be
13073                  * committed in its parent. 
13074                  */
13075                 inum = dap->da_newinum;
13076                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13077                         panic("flush_pagedep_deps: lost inode1");
13078                 /*
13079                  * Wait for any pending journal adds to complete so we don't
13080                  * cause rollbacks while syncing.
13081                  */
13082                 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
13083                         if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
13084                             == DEPCOMPLETE) {
13085                                 jwait(&inoref->if_list, MNT_WAIT);
13086                                 goto restart;
13087                         }
13088                 }
13089                 if (dap->da_state & MKDIR_BODY) {
13090                         FREE_LOCK(ump);
13091                         if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
13092                             FFSV_FORCEINSMQ)))
13093                                 break;
13094                         error = flush_newblk_dep(vp, mp, 0);
13095                         /*
13096                          * If we still have the dependency we might need to
13097                          * update the vnode to sync the new link count to
13098                          * disk.
13099                          */
13100                         if (error == 0 && dap == LIST_FIRST(diraddhdp))
13101                                 error = ffs_update(vp, 1);
13102                         vput(vp);
13103                         if (error != 0)
13104                                 break;
13105                         ACQUIRE_LOCK(ump);
13106                         /*
13107                          * If that cleared dependencies, go on to next.
13108                          */
13109                         if (dap != LIST_FIRST(diraddhdp))
13110                                 continue;
13111                         if (dap->da_state & MKDIR_BODY) {
13112                                 inodedep_lookup(UFSTOVFS(ump), inum, 0,
13113                                     &inodedep);
13114                                 panic("flush_pagedep_deps: MKDIR_BODY "
13115                                     "inodedep %p dap %p vp %p",
13116                                     inodedep, dap, vp);
13117                         }
13118                 }
13119                 /*
13120                  * Flush the inode on which the directory entry depends.
13121                  * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
13122                  * the only remaining dependency is that the updated inode
13123                  * count must get pushed to disk. The inode has already
13124                  * been pushed into its inode buffer (via VOP_UPDATE) at
13125                  * the time of the reference count change. So we need only
13126                  * locate that buffer, ensure that there will be no rollback
13127                  * caused by a bitmap dependency, then write the inode buffer.
13128                  */
13129 retry:
13130                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13131                         panic("flush_pagedep_deps: lost inode");
13132                 /*
13133                  * If the inode still has bitmap dependencies,
13134                  * push them to disk.
13135                  */
13136                 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
13137                         bp = inodedep->id_bmsafemap->sm_buf;
13138                         bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13139                         if (bp == NULL)
13140                                 goto retry;
13141                         FREE_LOCK(ump);
13142                         if ((error = bwrite(bp)) != 0)
13143                                 break;
13144                         ACQUIRE_LOCK(ump);
13145                         if (dap != LIST_FIRST(diraddhdp))
13146                                 continue;
13147                 }
13148                 /*
13149                  * If the inode is still sitting in a buffer waiting
13150                  * to be written or waiting for the link count to be
13151                  * adjusted update it here to flush it to disk.
13152                  */
13153                 if (dap == LIST_FIRST(diraddhdp)) {
13154                         FREE_LOCK(ump);
13155                         if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
13156                             FFSV_FORCEINSMQ)))
13157                                 break;
13158                         error = ffs_update(vp, 1);
13159                         vput(vp);
13160                         if (error)
13161                                 break;
13162                         ACQUIRE_LOCK(ump);
13163                 }
13164                 /*
13165                  * If we have failed to get rid of all the dependencies
13166                  * then something is seriously wrong.
13167                  */
13168                 if (dap == LIST_FIRST(diraddhdp)) {
13169                         inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
13170                         panic("flush_pagedep_deps: failed to flush " 
13171                             "inodedep %p ino %ju dap %p",
13172                             inodedep, (uintmax_t)inum, dap);
13173                 }
13174         }
13175         if (error)
13176                 ACQUIRE_LOCK(ump);
13177         while ((dap = LIST_FIRST(&unfinished)) != NULL) {
13178                 LIST_REMOVE(dap, da_pdlist);
13179                 LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
13180         }
13181         return (error);
13182 }
13183 
13184 /*
13185  * A large burst of file addition or deletion activity can drive the
13186  * memory load excessively high. First attempt to slow things down
13187  * using the techniques below. If that fails, this routine requests
13188  * the offending operations to fall back to running synchronously
13189  * until the memory load returns to a reasonable level.
13190  */
13191 int
13192 softdep_slowdown(vp)
13193         struct vnode *vp;
13194 {
13195         struct ufsmount *ump;
13196         int jlow;
13197         int max_softdeps_hard;
13198 
13199         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13200             ("softdep_slowdown called on non-softdep filesystem"));
13201         ump = VFSTOUFS(vp->v_mount);
13202         ACQUIRE_LOCK(ump);
13203         jlow = 0;
13204         /*
13205          * Check for journal space if needed.
13206          */
13207         if (DOINGSUJ(vp)) {
13208                 if (journal_space(ump, 0) == 0)
13209                         jlow = 1;
13210         }
13211         /*
13212          * If the system is under its limits and our filesystem is
13213          * not responsible for more than our share of the usage and
13214          * we are not low on journal space, then no need to slow down.
13215          */
13216         max_softdeps_hard = max_softdeps * 11 / 10;
13217         if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13218             dep_current[D_INODEDEP] < max_softdeps_hard &&
13219             dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13220             dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13221             ump->softdep_curdeps[D_DIRREM] <
13222             (max_softdeps_hard / 2) / stat_flush_threads &&
13223             ump->softdep_curdeps[D_INODEDEP] <
13224             max_softdeps_hard / stat_flush_threads &&
13225             ump->softdep_curdeps[D_INDIRDEP] <
13226             (max_softdeps_hard / 1000) / stat_flush_threads &&
13227             ump->softdep_curdeps[D_FREEBLKS] <
13228             max_softdeps_hard / stat_flush_threads) {
13229                 FREE_LOCK(ump);
13230                 return (0);
13231         }
13232         /*
13233          * If the journal is low or our filesystem is over its limit
13234          * then speedup the cleanup.
13235          */
13236         if (ump->softdep_curdeps[D_INDIRDEP] <
13237             (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13238                 softdep_speedup(ump);
13239         stat_sync_limit_hit += 1;
13240         FREE_LOCK(ump);
13241         /*
13242          * We only slow down the rate at which new dependencies are
13243          * generated if we are not using journaling. With journaling,
13244          * the cleanup should always be sufficient to keep things
13245          * under control.
13246          */
13247         if (DOINGSUJ(vp))
13248                 return (0);
13249         return (1);
13250 }
13251 
13252 /*
13253  * Called by the allocation routines when they are about to fail
13254  * in the hope that we can free up the requested resource (inodes
13255  * or disk space).
13256  * 
13257  * First check to see if the work list has anything on it. If it has,
13258  * clean up entries until we successfully free the requested resource.
13259  * Because this process holds inodes locked, we cannot handle any remove
13260  * requests that might block on a locked inode as that could lead to
13261  * deadlock. If the worklist yields none of the requested resource,
13262  * start syncing out vnodes to free up the needed space.
13263  */
13264 int
13265 softdep_request_cleanup(fs, vp, cred, resource)
13266         struct fs *fs;
13267         struct vnode *vp;
13268         struct ucred *cred;
13269         int resource;
13270 {
13271         struct ufsmount *ump;
13272         struct mount *mp;
13273         long starttime;
13274         ufs2_daddr_t needed;
13275         int error, failed_vnode;
13276 
13277         /*
13278          * If we are being called because of a process doing a
13279          * copy-on-write, then it is not safe to process any
13280          * worklist items as we will recurse into the copyonwrite
13281          * routine.  This will result in an incoherent snapshot.
13282          * If the vnode that we hold is a snapshot, we must avoid
13283          * handling other resources that could cause deadlock.
13284          */
13285         if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13286                 return (0);
13287 
13288         if (resource == FLUSH_BLOCKS_WAIT)
13289                 stat_cleanup_blkrequests += 1;
13290         else
13291                 stat_cleanup_inorequests += 1;
13292 
13293         mp = vp->v_mount;
13294         ump = VFSTOUFS(mp);
13295         mtx_assert(UFS_MTX(ump), MA_OWNED);
13296         UFS_UNLOCK(ump);
13297         error = ffs_update(vp, 1);
13298         if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13299                 UFS_LOCK(ump);
13300                 return (0);
13301         }
13302         /*
13303          * If we are in need of resources, start by cleaning up
13304          * any block removals associated with our inode.
13305          */
13306         ACQUIRE_LOCK(ump);
13307         process_removes(vp);
13308         process_truncates(vp);
13309         FREE_LOCK(ump);
13310         /*
13311          * Now clean up at least as many resources as we will need.
13312          *
13313          * When requested to clean up inodes, the number that are needed
13314          * is set by the number of simultaneous writers (mnt_writeopcount)
13315          * plus a bit of slop (2) in case some more writers show up while
13316          * we are cleaning.
13317          *
13318          * When requested to free up space, the amount of space that
13319          * we need is enough blocks to allocate a full-sized segment
13320          * (fs_contigsumsize). The number of such segments that will
13321          * be needed is set by the number of simultaneous writers
13322          * (mnt_writeopcount) plus a bit of slop (2) in case some more
13323          * writers show up while we are cleaning.
13324          *
13325          * Additionally, if we are unpriviledged and allocating space,
13326          * we need to ensure that we clean up enough blocks to get the
13327          * needed number of blocks over the threshold of the minimum
13328          * number of blocks required to be kept free by the filesystem
13329          * (fs_minfree).
13330          */
13331         if (resource == FLUSH_INODES_WAIT) {
13332                 needed = vp->v_mount->mnt_writeopcount + 2;
13333         } else if (resource == FLUSH_BLOCKS_WAIT) {
13334                 needed = (vp->v_mount->mnt_writeopcount + 2) *
13335                     fs->fs_contigsumsize;
13336                 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
13337                         needed += fragstoblks(fs,
13338                             roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13339                             fs->fs_cstotal.cs_nffree, fs->fs_frag));
13340         } else {
13341                 UFS_LOCK(ump);
13342                 printf("softdep_request_cleanup: Unknown resource type %d\n",
13343                     resource);
13344                 return (0);
13345         }
13346         starttime = time_second;
13347 retry:
13348         if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13349             fs->fs_cstotal.cs_nbfree <= needed) ||
13350             (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13351             fs->fs_cstotal.cs_nifree <= needed)) {
13352                 ACQUIRE_LOCK(ump);
13353                 if (ump->softdep_on_worklist > 0 &&
13354                     process_worklist_item(UFSTOVFS(ump),
13355                     ump->softdep_on_worklist, LK_NOWAIT) != 0)
13356                         stat_worklist_push += 1;
13357                 FREE_LOCK(ump);
13358         }
13359         /*
13360          * If we still need resources and there are no more worklist
13361          * entries to process to obtain them, we have to start flushing
13362          * the dirty vnodes to force the release of additional requests
13363          * to the worklist that we can then process to reap addition
13364          * resources. We walk the vnodes associated with the mount point
13365          * until we get the needed worklist requests that we can reap.
13366          *
13367          * If there are several threads all needing to clean the same
13368          * mount point, only one is allowed to walk the mount list.
13369          * When several threads all try to walk the same mount list,
13370          * they end up competing with each other and often end up in
13371          * livelock. This approach ensures that forward progress is
13372          * made at the cost of occational ENOSPC errors being returned
13373          * that might otherwise have been avoided.
13374          */
13375         error = 1;
13376         if ((resource == FLUSH_BLOCKS_WAIT && 
13377              fs->fs_cstotal.cs_nbfree <= needed) ||
13378             (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13379              fs->fs_cstotal.cs_nifree <= needed)) {
13380                 ACQUIRE_LOCK(ump);
13381                 if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) {
13382                         ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE;
13383                         FREE_LOCK(ump);
13384                         failed_vnode = softdep_request_cleanup_flush(mp, ump);
13385                         ACQUIRE_LOCK(ump);
13386                         ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE;
13387                         FREE_LOCK(ump);
13388                         if (ump->softdep_on_worklist > 0) {
13389                                 stat_cleanup_retries += 1;
13390                                 if (!failed_vnode)
13391                                         goto retry;
13392                         }
13393                 } else {
13394                         FREE_LOCK(ump);
13395                         error = 0;
13396                 }
13397                 stat_cleanup_failures += 1;
13398         }
13399         if (time_second - starttime > stat_cleanup_high_delay)
13400                 stat_cleanup_high_delay = time_second - starttime;
13401         UFS_LOCK(ump);
13402         return (error);
13403 }
13404 
13405 /*
13406  * Scan the vnodes for the specified mount point flushing out any
13407  * vnodes that can be locked without waiting. Finally, try to flush
13408  * the device associated with the mount point if it can be locked
13409  * without waiting.
13410  *
13411  * We return 0 if we were able to lock every vnode in our scan.
13412  * If we had to skip one or more vnodes, we return 1.
13413  */
13414 static int
13415 softdep_request_cleanup_flush(mp, ump)
13416         struct mount *mp;
13417         struct ufsmount *ump;
13418 {
13419         struct thread *td;
13420         struct vnode *lvp, *mvp;
13421         int failed_vnode;
13422 
13423         failed_vnode = 0;
13424         td = curthread;
13425         MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13426                 if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13427                         VI_UNLOCK(lvp);
13428                         continue;
13429                 }
13430                 if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
13431                     td) != 0) {
13432                         failed_vnode = 1;
13433                         continue;
13434                 }
13435                 if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */
13436                         vput(lvp);
13437                         continue;
13438                 }
13439                 (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13440                 vput(lvp);
13441         }
13442         lvp = ump->um_devvp;
13443         if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13444                 VOP_FSYNC(lvp, MNT_NOWAIT, td);
13445                 VOP_UNLOCK(lvp, 0);
13446         }
13447         return (failed_vnode);
13448 }
13449 
13450 static bool
13451 softdep_excess_items(struct ufsmount *ump, int item)
13452 {
13453 
13454         KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
13455         return (dep_current[item] > max_softdeps &&
13456             ump->softdep_curdeps[item] > max_softdeps /
13457             stat_flush_threads);
13458 }
13459 
13460 static void
13461 schedule_cleanup(struct mount *mp)
13462 {
13463         struct ufsmount *ump;
13464         struct thread *td;
13465 
13466         ump = VFSTOUFS(mp);
13467         LOCK_OWNED(ump);
13468         FREE_LOCK(ump);
13469         td = curthread;
13470         if ((td->td_pflags & TDP_KTHREAD) != 0 &&
13471             (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
13472                 /*
13473                  * No ast is delivered to kernel threads, so nobody
13474                  * would deref the mp.  Some kernel threads
13475                  * explicitely check for AST, e.g. NFS daemon does
13476                  * this in the serving loop.
13477                  */
13478                 return;
13479         }
13480         if (td->td_su != NULL)
13481                 vfs_rel(td->td_su);
13482         vfs_ref(mp);
13483         td->td_su = mp;
13484         thread_lock(td);
13485         td->td_flags |= TDF_ASTPENDING;
13486         thread_unlock(td);
13487 }
13488 
13489 static void
13490 softdep_ast_cleanup_proc(struct thread *td)
13491 {
13492         struct mount *mp;
13493         struct ufsmount *ump;
13494         int error;
13495         bool req;
13496 
13497         while ((mp = td->td_su) != NULL) {
13498                 td->td_su = NULL;
13499                 error = vfs_busy(mp, MBF_NOWAIT);
13500                 vfs_rel(mp);
13501                 if (error != 0)
13502                         return;
13503                 if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
13504                         ump = VFSTOUFS(mp);
13505                         for (;;) {
13506                                 req = false;
13507                                 ACQUIRE_LOCK(ump);
13508                                 if (softdep_excess_items(ump, D_INODEDEP)) {
13509                                         req = true;
13510                                         request_cleanup(mp, FLUSH_INODES);
13511                                 }
13512                                 if (softdep_excess_items(ump, D_DIRREM)) {
13513                                         req = true;
13514                                         request_cleanup(mp, FLUSH_BLOCKS);
13515                                 }
13516                                 FREE_LOCK(ump);
13517                                 if (softdep_excess_items(ump, D_NEWBLK) ||
13518                                     softdep_excess_items(ump, D_ALLOCDIRECT) ||
13519                                     softdep_excess_items(ump, D_ALLOCINDIR)) {
13520                                         error = vn_start_write(NULL, &mp,
13521                                             V_WAIT);
13522                                         if (error == 0) {
13523                                                 req = true;
13524                                                 VFS_SYNC(mp, MNT_WAIT);
13525                                                 vn_finished_write(mp);
13526                                         }
13527                                 }
13528                                 if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
13529                                         break;
13530                         }
13531                 }
13532                 vfs_unbusy(mp);
13533         }
13534         if ((mp = td->td_su) != NULL) {
13535                 td->td_su = NULL;
13536                 vfs_rel(mp);
13537         }
13538 }
13539 
13540 /*
13541  * If memory utilization has gotten too high, deliberately slow things
13542  * down and speed up the I/O processing.
13543  */
13544 static int
13545 request_cleanup(mp, resource)
13546         struct mount *mp;
13547         int resource;
13548 {
13549         struct thread *td = curthread;
13550         struct ufsmount *ump;
13551 
13552         ump = VFSTOUFS(mp);
13553         LOCK_OWNED(ump);
13554         /*
13555          * We never hold up the filesystem syncer or buf daemon.
13556          */
13557         if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
13558                 return (0);
13559         /*
13560          * First check to see if the work list has gotten backlogged.
13561          * If it has, co-opt this process to help clean up two entries.
13562          * Because this process may hold inodes locked, we cannot
13563          * handle any remove requests that might block on a locked
13564          * inode as that could lead to deadlock.  We set TDP_SOFTDEP
13565          * to avoid recursively processing the worklist.
13566          */
13567         if (ump->softdep_on_worklist > max_softdeps / 10) {
13568                 td->td_pflags |= TDP_SOFTDEP;
13569                 process_worklist_item(mp, 2, LK_NOWAIT);
13570                 td->td_pflags &= ~TDP_SOFTDEP;
13571                 stat_worklist_push += 2;
13572                 return(1);
13573         }
13574         /*
13575          * Next, we attempt to speed up the syncer process. If that
13576          * is successful, then we allow the process to continue.
13577          */
13578         if (softdep_speedup(ump) &&
13579             resource != FLUSH_BLOCKS_WAIT &&
13580             resource != FLUSH_INODES_WAIT)
13581                 return(0);
13582         /*
13583          * If we are resource constrained on inode dependencies, try
13584          * flushing some dirty inodes. Otherwise, we are constrained
13585          * by file deletions, so try accelerating flushes of directories
13586          * with removal dependencies. We would like to do the cleanup
13587          * here, but we probably hold an inode locked at this point and 
13588          * that might deadlock against one that we try to clean. So,
13589          * the best that we can do is request the syncer daemon to do
13590          * the cleanup for us.
13591          */
13592         switch (resource) {
13593 
13594         case FLUSH_INODES:
13595         case FLUSH_INODES_WAIT:
13596                 ACQUIRE_GBLLOCK(&lk);
13597                 stat_ino_limit_push += 1;
13598                 req_clear_inodedeps += 1;
13599                 FREE_GBLLOCK(&lk);
13600                 stat_countp = &stat_ino_limit_hit;
13601                 break;
13602 
13603         case FLUSH_BLOCKS:
13604         case FLUSH_BLOCKS_WAIT:
13605                 ACQUIRE_GBLLOCK(&lk);
13606                 stat_blk_limit_push += 1;
13607                 req_clear_remove += 1;
13608                 FREE_GBLLOCK(&lk);
13609                 stat_countp = &stat_blk_limit_hit;
13610                 break;
13611 
13612         default:
13613                 panic("request_cleanup: unknown type");
13614         }
13615         /*
13616          * Hopefully the syncer daemon will catch up and awaken us.
13617          * We wait at most tickdelay before proceeding in any case.
13618          */
13619         ACQUIRE_GBLLOCK(&lk);
13620         FREE_LOCK(ump);
13621         proc_waiting += 1;
13622         if (callout_pending(&softdep_callout) == FALSE)
13623                 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13624                     pause_timer, 0);
13625 
13626         if ((td->td_pflags & TDP_KTHREAD) == 0)
13627                 msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13628         proc_waiting -= 1;
13629         FREE_GBLLOCK(&lk);
13630         ACQUIRE_LOCK(ump);
13631         return (1);
13632 }
13633 
13634 /*
13635  * Awaken processes pausing in request_cleanup and clear proc_waiting
13636  * to indicate that there is no longer a timer running. Pause_timer
13637  * will be called with the global softdep mutex (&lk) locked.
13638  */
13639 static void
13640 pause_timer(arg)
13641         void *arg;
13642 {
13643 
13644         GBLLOCK_OWNED(&lk);
13645         /*
13646          * The callout_ API has acquired mtx and will hold it around this
13647          * function call.
13648          */
13649         *stat_countp += proc_waiting;
13650         wakeup(&proc_waiting);
13651 }
13652 
13653 /*
13654  * If requested, try removing inode or removal dependencies.
13655  */
13656 static void
13657 check_clear_deps(mp)
13658         struct mount *mp;
13659 {
13660 
13661         /*
13662          * If we are suspended, it may be because of our using
13663          * too many inodedeps, so help clear them out.
13664          */
13665         if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)
13666                 clear_inodedeps(mp);
13667         /*
13668          * General requests for cleanup of backed up dependencies
13669          */
13670         ACQUIRE_GBLLOCK(&lk);
13671         if (req_clear_inodedeps) {
13672                 req_clear_inodedeps -= 1;
13673                 FREE_GBLLOCK(&lk);
13674                 clear_inodedeps(mp);
13675                 ACQUIRE_GBLLOCK(&lk);
13676                 wakeup(&proc_waiting);
13677         }
13678         if (req_clear_remove) {
13679                 req_clear_remove -= 1;
13680                 FREE_GBLLOCK(&lk);
13681                 clear_remove(mp);
13682                 ACQUIRE_GBLLOCK(&lk);
13683                 wakeup(&proc_waiting);
13684         }
13685         FREE_GBLLOCK(&lk);
13686 }
13687 
13688 /*
13689  * Flush out a directory with at least one removal dependency in an effort to
13690  * reduce the number of dirrem, freefile, and freeblks dependency structures.
13691  */
13692 static void
13693 clear_remove(mp)
13694         struct mount *mp;
13695 {
13696         struct pagedep_hashhead *pagedephd;
13697         struct pagedep *pagedep;
13698         struct ufsmount *ump;
13699         struct vnode *vp;
13700         struct bufobj *bo;
13701         int error, cnt;
13702         ino_t ino;
13703 
13704         ump = VFSTOUFS(mp);
13705         LOCK_OWNED(ump);
13706 
13707         for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
13708                 pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
13709                 if (ump->pagedep_nextclean > ump->pagedep_hash_size)
13710                         ump->pagedep_nextclean = 0;
13711                 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13712                         if (LIST_EMPTY(&pagedep->pd_dirremhd))
13713                                 continue;
13714                         ino = pagedep->pd_ino;
13715                         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13716                                 continue;
13717                         FREE_LOCK(ump);
13718 
13719                         /*
13720                          * Let unmount clear deps
13721                          */
13722                         error = vfs_busy(mp, MBF_NOWAIT);
13723                         if (error != 0)
13724                                 goto finish_write;
13725                         error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13726                              FFSV_FORCEINSMQ);
13727                         vfs_unbusy(mp);
13728                         if (error != 0) {
13729                                 softdep_error("clear_remove: vget", error);
13730                                 goto finish_write;
13731                         }
13732                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13733                                 softdep_error("clear_remove: fsync", error);
13734                         bo = &vp->v_bufobj;
13735                         BO_LOCK(bo);
13736                         drain_output(vp);
13737                         BO_UNLOCK(bo);
13738                         vput(vp);
13739                 finish_write:
13740                         vn_finished_write(mp);
13741                         ACQUIRE_LOCK(ump);
13742                         return;
13743                 }
13744         }
13745 }
13746 
13747 /*
13748  * Clear out a block of dirty inodes in an effort to reduce
13749  * the number of inodedep dependency structures.
13750  */
13751 static void
13752 clear_inodedeps(mp)
13753         struct mount *mp;
13754 {
13755         struct inodedep_hashhead *inodedephd;
13756         struct inodedep *inodedep;
13757         struct ufsmount *ump;
13758         struct vnode *vp;
13759         struct fs *fs;
13760         int error, cnt;
13761         ino_t firstino, lastino, ino;
13762 
13763         ump = VFSTOUFS(mp);
13764         fs = ump->um_fs;
13765         LOCK_OWNED(ump);
13766         /*
13767          * Pick a random inode dependency to be cleared.
13768          * We will then gather up all the inodes in its block 
13769          * that have dependencies and flush them out.
13770          */
13771         for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
13772                 inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
13773                 if (ump->inodedep_nextclean > ump->inodedep_hash_size)
13774                         ump->inodedep_nextclean = 0;
13775                 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13776                         break;
13777         }
13778         if (inodedep == NULL)
13779                 return;
13780         /*
13781          * Find the last inode in the block with dependencies.
13782          */
13783         firstino = rounddown2(inodedep->id_ino, INOPB(fs));
13784         for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13785                 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13786                         break;
13787         /*
13788          * Asynchronously push all but the last inode with dependencies.
13789          * Synchronously push the last inode with dependencies to ensure
13790          * that the inode block gets written to free up the inodedeps.
13791          */
13792         for (ino = firstino; ino <= lastino; ino++) {
13793                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13794                         continue;
13795                 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13796                         continue;
13797                 FREE_LOCK(ump);
13798                 error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13799                 if (error != 0) {
13800                         vn_finished_write(mp);
13801                         ACQUIRE_LOCK(ump);
13802                         return;
13803                 }
13804                 if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13805                     FFSV_FORCEINSMQ)) != 0) {
13806                         softdep_error("clear_inodedeps: vget", error);
13807                         vfs_unbusy(mp);
13808                         vn_finished_write(mp);
13809                         ACQUIRE_LOCK(ump);
13810                         return;
13811                 }
13812                 vfs_unbusy(mp);
13813                 if (ino == lastino) {
13814                         if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13815                                 softdep_error("clear_inodedeps: fsync1", error);
13816                 } else {
13817                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13818                                 softdep_error("clear_inodedeps: fsync2", error);
13819                         BO_LOCK(&vp->v_bufobj);
13820                         drain_output(vp);
13821                         BO_UNLOCK(&vp->v_bufobj);
13822                 }
13823                 vput(vp);
13824                 vn_finished_write(mp);
13825                 ACQUIRE_LOCK(ump);
13826         }
13827 }
13828 
13829 void
13830 softdep_buf_append(bp, wkhd)
13831         struct buf *bp;
13832         struct workhead *wkhd;
13833 {
13834         struct worklist *wk;
13835         struct ufsmount *ump;
13836 
13837         if ((wk = LIST_FIRST(wkhd)) == NULL)
13838                 return;
13839         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13840             ("softdep_buf_append called on non-softdep filesystem"));
13841         ump = VFSTOUFS(wk->wk_mp);
13842         ACQUIRE_LOCK(ump);
13843         while ((wk = LIST_FIRST(wkhd)) != NULL) {
13844                 WORKLIST_REMOVE(wk);
13845                 WORKLIST_INSERT(&bp->b_dep, wk);
13846         }
13847         FREE_LOCK(ump);
13848 
13849 }
13850 
13851 void
13852 softdep_inode_append(ip, cred, wkhd)
13853         struct inode *ip;
13854         struct ucred *cred;
13855         struct workhead *wkhd;
13856 {
13857         struct buf *bp;
13858         struct fs *fs;
13859         struct ufsmount *ump;
13860         int error;
13861 
13862         ump = ITOUMP(ip);
13863         KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
13864             ("softdep_inode_append called on non-softdep filesystem"));
13865         fs = ump->um_fs;
13866         error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13867             (int)fs->fs_bsize, cred, &bp);
13868         if (error) {
13869                 bqrelse(bp);
13870                 softdep_freework(wkhd);
13871                 return;
13872         }
13873         softdep_buf_append(bp, wkhd);
13874         bqrelse(bp);
13875 }
13876 
13877 void
13878 softdep_freework(wkhd)
13879         struct workhead *wkhd;
13880 {
13881         struct worklist *wk;
13882         struct ufsmount *ump;
13883 
13884         if ((wk = LIST_FIRST(wkhd)) == NULL)
13885                 return;
13886         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13887             ("softdep_freework called on non-softdep filesystem"));
13888         ump = VFSTOUFS(wk->wk_mp);
13889         ACQUIRE_LOCK(ump);
13890         handle_jwork(wkhd);
13891         FREE_LOCK(ump);
13892 }
13893 
13894 static struct ufsmount *
13895 softdep_bp_to_mp(bp)
13896         struct buf *bp;
13897 {
13898         struct mount *mp;
13899         struct vnode *vp;
13900 
13901         if (LIST_EMPTY(&bp->b_dep))
13902                 return (NULL);
13903         vp = bp->b_vp;
13904         KASSERT(vp != NULL,
13905             ("%s, buffer with dependencies lacks vnode", __func__));
13906 
13907         /*
13908          * The ump mount point is stable after we get a correct
13909          * pointer, since bp is locked and this prevents unmount from
13910          * proceeding.  But to get to it, we cannot dereference bp->b_dep
13911          * head wk_mp, because we do not yet own SU ump lock and
13912          * workitem might be freed while dereferenced.
13913          */
13914 retry:
13915         switch (vp->v_type) {
13916         case VCHR:
13917                 VI_LOCK(vp);
13918                 mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL;
13919                 VI_UNLOCK(vp);
13920                 if (mp == NULL)
13921                         goto retry;
13922                 break;
13923         case VREG:
13924         case VDIR:
13925         case VLNK:
13926         case VFIFO:
13927         case VSOCK:
13928                 mp = vp->v_mount;
13929                 break;
13930         case VBLK:
13931                 vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n");
13932                 /* FALLTHROUGH */
13933         case VNON:
13934         case VBAD:
13935         case VMARKER:
13936                 mp = NULL;
13937                 break;
13938         default:
13939                 vn_printf(vp, "unknown vnode type");
13940                 mp = NULL;
13941                 break;
13942         }
13943         return (VFSTOUFS(mp));
13944 }
13945 
13946 /*
13947  * Function to determine if the buffer has outstanding dependencies
13948  * that will cause a roll-back if the buffer is written. If wantcount
13949  * is set, return number of dependencies, otherwise just yes or no.
13950  */
13951 static int
13952 softdep_count_dependencies(bp, wantcount)
13953         struct buf *bp;
13954         int wantcount;
13955 {
13956         struct worklist *wk;
13957         struct ufsmount *ump;
13958         struct bmsafemap *bmsafemap;
13959         struct freework *freework;
13960         struct inodedep *inodedep;
13961         struct indirdep *indirdep;
13962         struct freeblks *freeblks;
13963         struct allocindir *aip;
13964         struct pagedep *pagedep;
13965         struct dirrem *dirrem;
13966         struct newblk *newblk;
13967         struct mkdir *mkdir;
13968         struct diradd *dap;
13969         int i, retval;
13970 
13971         ump = softdep_bp_to_mp(bp);
13972         if (ump == NULL)
13973                 return (0);
13974         retval = 0;
13975         ACQUIRE_LOCK(ump);
13976         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13977                 switch (wk->wk_type) {
13978 
13979                 case D_INODEDEP:
13980                         inodedep = WK_INODEDEP(wk);
13981                         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13982                                 /* bitmap allocation dependency */
13983                                 retval += 1;
13984                                 if (!wantcount)
13985                                         goto out;
13986                         }
13987                         if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13988                                 /* direct block pointer dependency */
13989                                 retval += 1;
13990                                 if (!wantcount)
13991                                         goto out;
13992                         }
13993                         if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13994                                 /* direct block pointer dependency */
13995                                 retval += 1;
13996                                 if (!wantcount)
13997                                         goto out;
13998                         }
13999                         if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
14000                                 /* Add reference dependency. */
14001                                 retval += 1;
14002                                 if (!wantcount)
14003                                         goto out;
14004                         }
14005                         continue;
14006 
14007                 case D_INDIRDEP:
14008                         indirdep = WK_INDIRDEP(wk);
14009 
14010                         TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
14011                                 /* indirect truncation dependency */
14012                                 retval += 1;
14013                                 if (!wantcount)
14014                                         goto out;
14015                         }
14016 
14017                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
14018                                 /* indirect block pointer dependency */
14019                                 retval += 1;
14020                                 if (!wantcount)
14021                                         goto out;
14022                         }
14023                         continue;
14024 
14025                 case D_PAGEDEP:
14026                         pagedep = WK_PAGEDEP(wk);
14027                         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
14028                                 if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
14029                                         /* Journal remove ref dependency. */
14030                                         retval += 1;
14031                                         if (!wantcount)
14032                                                 goto out;
14033                                 }
14034                         }
14035                         for (i = 0; i < DAHASHSZ; i++) {
14036 
14037                                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
14038                                         /* directory entry dependency */
14039                                         retval += 1;
14040                                         if (!wantcount)
14041                                                 goto out;
14042                                 }
14043                         }
14044                         continue;
14045 
14046                 case D_BMSAFEMAP:
14047                         bmsafemap = WK_BMSAFEMAP(wk);
14048                         if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
14049                                 /* Add reference dependency. */
14050                                 retval += 1;
14051                                 if (!wantcount)
14052                                         goto out;
14053                         }
14054                         if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
14055                                 /* Allocate block dependency. */
14056                                 retval += 1;
14057                                 if (!wantcount)
14058                                         goto out;
14059                         }
14060                         continue;
14061 
14062                 case D_FREEBLKS:
14063                         freeblks = WK_FREEBLKS(wk);
14064                         if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
14065                                 /* Freeblk journal dependency. */
14066                                 retval += 1;
14067                                 if (!wantcount)
14068                                         goto out;
14069                         }
14070                         continue;
14071 
14072                 case D_ALLOCDIRECT:
14073                 case D_ALLOCINDIR:
14074                         newblk = WK_NEWBLK(wk);
14075                         if (newblk->nb_jnewblk) {
14076                                 /* Journal allocate dependency. */
14077                                 retval += 1;
14078                                 if (!wantcount)
14079                                         goto out;
14080                         }
14081                         continue;
14082 
14083                 case D_MKDIR:
14084                         mkdir = WK_MKDIR(wk);
14085                         if (mkdir->md_jaddref) {
14086                                 /* Journal reference dependency. */
14087                                 retval += 1;
14088                                 if (!wantcount)
14089                                         goto out;
14090                         }
14091                         continue;
14092 
14093                 case D_FREEWORK:
14094                 case D_FREEDEP:
14095                 case D_JSEGDEP:
14096                 case D_JSEG:
14097                 case D_SBDEP:
14098                         /* never a dependency on these blocks */
14099                         continue;
14100 
14101                 default:
14102                         panic("softdep_count_dependencies: Unexpected type %s",
14103                             TYPENAME(wk->wk_type));
14104                         /* NOTREACHED */
14105                 }
14106         }
14107 out:
14108         FREE_LOCK(ump);
14109         return (retval);
14110 }
14111 
14112 /*
14113  * Acquire exclusive access to a buffer.
14114  * Must be called with a locked mtx parameter.
14115  * Return acquired buffer or NULL on failure.
14116  */
14117 static struct buf *
14118 getdirtybuf(bp, lock, waitfor)
14119         struct buf *bp;
14120         struct rwlock *lock;
14121         int waitfor;
14122 {
14123         int error;
14124 
14125         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
14126                 if (waitfor != MNT_WAIT)
14127                         return (NULL);
14128                 error = BUF_LOCK(bp,
14129                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
14130                 /*
14131                  * Even if we successfully acquire bp here, we have dropped
14132                  * lock, which may violates our guarantee.
14133                  */
14134                 if (error == 0)
14135                         BUF_UNLOCK(bp);
14136                 else if (error != ENOLCK)
14137                         panic("getdirtybuf: inconsistent lock: %d", error);
14138                 rw_wlock(lock);
14139                 return (NULL);
14140         }
14141         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14142                 if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
14143                         rw_wunlock(lock);
14144                         BO_LOCK(bp->b_bufobj);
14145                         BUF_UNLOCK(bp);
14146                         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14147                                 bp->b_vflags |= BV_BKGRDWAIT;
14148                                 msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
14149                                        PRIBIO | PDROP, "getbuf", 0);
14150                         } else
14151                                 BO_UNLOCK(bp->b_bufobj);
14152                         rw_wlock(lock);
14153                         return (NULL);
14154                 }
14155                 BUF_UNLOCK(bp);
14156                 if (waitfor != MNT_WAIT)
14157                         return (NULL);
14158 #ifdef DEBUG_VFS_LOCKS
14159                 if (bp->b_vp->v_type != VCHR)
14160                         ASSERT_BO_WLOCKED(bp->b_bufobj);
14161 #endif
14162                 bp->b_vflags |= BV_BKGRDWAIT;
14163                 rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
14164                 return (NULL);
14165         }
14166         if ((bp->b_flags & B_DELWRI) == 0) {
14167                 BUF_UNLOCK(bp);
14168                 return (NULL);
14169         }
14170         bremfree(bp);
14171         return (bp);
14172 }
14173 
14174 
14175 /*
14176  * Check if it is safe to suspend the file system now.  On entry,
14177  * the vnode interlock for devvp should be held.  Return 0 with
14178  * the mount interlock held if the file system can be suspended now,
14179  * otherwise return EAGAIN with the mount interlock held.
14180  */
14181 int
14182 softdep_check_suspend(struct mount *mp,
14183                       struct vnode *devvp,
14184                       int softdep_depcnt,
14185                       int softdep_accdepcnt,
14186                       int secondary_writes,
14187                       int secondary_accwrites)
14188 {
14189         struct bufobj *bo;
14190         struct ufsmount *ump;
14191         struct inodedep *inodedep;
14192         int error, unlinked;
14193 
14194         bo = &devvp->v_bufobj;
14195         ASSERT_BO_WLOCKED(bo);
14196 
14197         /*
14198          * If we are not running with soft updates, then we need only
14199          * deal with secondary writes as we try to suspend.
14200          */
14201         if (MOUNTEDSOFTDEP(mp) == 0) {
14202                 MNT_ILOCK(mp);
14203                 while (mp->mnt_secondary_writes != 0) {
14204                         BO_UNLOCK(bo);
14205                         msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
14206                             (PUSER - 1) | PDROP, "secwr", 0);
14207                         BO_LOCK(bo);
14208                         MNT_ILOCK(mp);
14209                 }
14210 
14211                 /*
14212                  * Reasons for needing more work before suspend:
14213                  * - Dirty buffers on devvp.
14214                  * - Secondary writes occurred after start of vnode sync loop
14215                  */
14216                 error = 0;
14217                 if (bo->bo_numoutput > 0 ||
14218                     bo->bo_dirty.bv_cnt > 0 ||
14219                     secondary_writes != 0 ||
14220                     mp->mnt_secondary_writes != 0 ||
14221                     secondary_accwrites != mp->mnt_secondary_accwrites)
14222                         error = EAGAIN;
14223                 BO_UNLOCK(bo);
14224                 return (error);
14225         }
14226 
14227         /*
14228          * If we are running with soft updates, then we need to coordinate
14229          * with them as we try to suspend.
14230          */
14231         ump = VFSTOUFS(mp);
14232         for (;;) {
14233                 if (!TRY_ACQUIRE_LOCK(ump)) {
14234                         BO_UNLOCK(bo);
14235                         ACQUIRE_LOCK(ump);
14236                         FREE_LOCK(ump);
14237                         BO_LOCK(bo);
14238                         continue;
14239                 }
14240                 MNT_ILOCK(mp);
14241                 if (mp->mnt_secondary_writes != 0) {
14242                         FREE_LOCK(ump);
14243                         BO_UNLOCK(bo);
14244                         msleep(&mp->mnt_secondary_writes,
14245                                MNT_MTX(mp),
14246                                (PUSER - 1) | PDROP, "secwr", 0);
14247                         BO_LOCK(bo);
14248                         continue;
14249                 }
14250                 break;
14251         }
14252 
14253         unlinked = 0;
14254         if (MOUNTEDSUJ(mp)) {
14255                 for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
14256                     inodedep != NULL;
14257                     inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
14258                         if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
14259                             UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
14260                             UNLINKONLIST) ||
14261                             !check_inodedep_free(inodedep))
14262                                 continue;
14263                         unlinked++;
14264                 }
14265         }
14266 
14267         /*
14268          * Reasons for needing more work before suspend:
14269          * - Dirty buffers on devvp.
14270          * - Softdep activity occurred after start of vnode sync loop
14271          * - Secondary writes occurred after start of vnode sync loop
14272          */
14273         error = 0;
14274         if (bo->bo_numoutput > 0 ||
14275             bo->bo_dirty.bv_cnt > 0 ||
14276             softdep_depcnt != unlinked ||
14277             ump->softdep_deps != unlinked ||
14278             softdep_accdepcnt != ump->softdep_accdeps ||
14279             secondary_writes != 0 ||
14280             mp->mnt_secondary_writes != 0 ||
14281             secondary_accwrites != mp->mnt_secondary_accwrites)
14282                 error = EAGAIN;
14283         FREE_LOCK(ump);
14284         BO_UNLOCK(bo);
14285         return (error);
14286 }
14287 
14288 
14289 /*
14290  * Get the number of dependency structures for the file system, both
14291  * the current number and the total number allocated.  These will
14292  * later be used to detect that softdep processing has occurred.
14293  */
14294 void
14295 softdep_get_depcounts(struct mount *mp,
14296                       int *softdep_depsp,
14297                       int *softdep_accdepsp)
14298 {
14299         struct ufsmount *ump;
14300 
14301         if (MOUNTEDSOFTDEP(mp) == 0) {
14302                 *softdep_depsp = 0;
14303                 *softdep_accdepsp = 0;
14304                 return;
14305         }
14306         ump = VFSTOUFS(mp);
14307         ACQUIRE_LOCK(ump);
14308         *softdep_depsp = ump->softdep_deps;
14309         *softdep_accdepsp = ump->softdep_accdeps;
14310         FREE_LOCK(ump);
14311 }
14312 
14313 /*
14314  * Wait for pending output on a vnode to complete.
14315  */
14316 static void
14317 drain_output(vp)
14318         struct vnode *vp;
14319 {
14320 
14321         ASSERT_VOP_LOCKED(vp, "drain_output");
14322         (void)bufobj_wwait(&vp->v_bufobj, 0, 0);
14323 }
14324 
14325 /*
14326  * Called whenever a buffer that is being invalidated or reallocated
14327  * contains dependencies. This should only happen if an I/O error has
14328  * occurred. The routine is called with the buffer locked.
14329  */ 
14330 static void
14331 softdep_deallocate_dependencies(bp)
14332         struct buf *bp;
14333 {
14334 
14335         if ((bp->b_ioflags & BIO_ERROR) == 0)
14336                 panic("softdep_deallocate_dependencies: dangling deps");
14337         if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
14338                 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
14339         else
14340                 printf("softdep_deallocate_dependencies: "
14341                     "got error %d while accessing filesystem\n", bp->b_error);
14342         if (bp->b_error != ENXIO)
14343                 panic("softdep_deallocate_dependencies: unrecovered I/O error");
14344 }
14345 
14346 /*
14347  * Function to handle asynchronous write errors in the filesystem.
14348  */
14349 static void
14350 softdep_error(func, error)
14351         char *func;
14352         int error;
14353 {
14354 
14355         /* XXX should do something better! */
14356         printf("%s: got error %d while accessing filesystem\n", func, error);
14357 }
14358 
14359 #ifdef DDB
14360 
14361 static void
14362 inodedep_print(struct inodedep *inodedep, int verbose)
14363 {
14364         db_printf("%p fs %p st %x ino %jd inoblk %jd delta %jd nlink %jd"
14365             " saveino %p\n",
14366             inodedep, inodedep->id_fs, inodedep->id_state,
14367             (intmax_t)inodedep->id_ino,
14368             (intmax_t)fsbtodb(inodedep->id_fs,
14369             ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14370             (intmax_t)inodedep->id_nlinkdelta,
14371             (intmax_t)inodedep->id_savednlink,
14372             inodedep->id_savedino1);
14373 
14374         if (verbose == 0)
14375                 return;
14376 
14377         db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
14378             "mkdiradd %p\n",
14379             LIST_FIRST(&inodedep->id_pendinghd),
14380             LIST_FIRST(&inodedep->id_bufwait),
14381             LIST_FIRST(&inodedep->id_inowait),
14382             TAILQ_FIRST(&inodedep->id_inoreflst),
14383             inodedep->id_mkdiradd);
14384         db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
14385             TAILQ_FIRST(&inodedep->id_inoupdt),
14386             TAILQ_FIRST(&inodedep->id_newinoupdt),
14387             TAILQ_FIRST(&inodedep->id_extupdt),
14388             TAILQ_FIRST(&inodedep->id_newextupdt));
14389 }
14390 
14391 DB_SHOW_COMMAND(inodedep, db_show_inodedep)
14392 {
14393 
14394         if (have_addr == 0) {
14395                 db_printf("Address required\n");
14396                 return;
14397         }
14398         inodedep_print((struct inodedep*)addr, 1);
14399 }
14400 
14401 DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
14402 {
14403         struct inodedep_hashhead *inodedephd;
14404         struct inodedep *inodedep;
14405         struct ufsmount *ump;
14406         int cnt;
14407 
14408         if (have_addr == 0) {
14409                 db_printf("Address required\n");
14410                 return;
14411         }
14412         ump = (struct ufsmount *)addr;
14413         for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
14414                 inodedephd = &ump->inodedep_hashtbl[cnt];
14415                 LIST_FOREACH(inodedep, inodedephd, id_hash) {
14416                         inodedep_print(inodedep, 0);
14417                 }
14418         }
14419 }
14420 
14421 DB_SHOW_COMMAND(worklist, db_show_worklist)
14422 {
14423         struct worklist *wk;
14424 
14425         if (have_addr == 0) {
14426                 db_printf("Address required\n");
14427                 return;
14428         }
14429         wk = (struct worklist *)addr;
14430         printf("worklist: %p type %s state 0x%X\n",
14431             wk, TYPENAME(wk->wk_type), wk->wk_state);
14432 }
14433 
14434 DB_SHOW_COMMAND(workhead, db_show_workhead)
14435 {
14436         struct workhead *wkhd;
14437         struct worklist *wk;
14438         int i;
14439 
14440         if (have_addr == 0) {
14441                 db_printf("Address required\n");
14442                 return;
14443         }
14444         wkhd = (struct workhead *)addr;
14445         wk = LIST_FIRST(wkhd);
14446         for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
14447                 db_printf("worklist: %p type %s state 0x%X",
14448                     wk, TYPENAME(wk->wk_type), wk->wk_state);
14449         if (i == 100)
14450                 db_printf("workhead overflow");
14451         printf("\n");
14452 }
14453 
14454 
14455 DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
14456 {
14457         struct mkdirlist *mkdirlisthd;
14458         struct jaddref *jaddref;
14459         struct diradd *diradd;
14460         struct mkdir *mkdir;
14461 
14462         if (have_addr == 0) {
14463                 db_printf("Address required\n");
14464                 return;
14465         }
14466         mkdirlisthd = (struct mkdirlist *)addr;
14467         LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
14468                 diradd = mkdir->md_diradd;
14469                 db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
14470                     mkdir, mkdir->md_state, diradd, diradd->da_state);
14471                 if ((jaddref = mkdir->md_jaddref) != NULL)
14472                         db_printf(" jaddref %p jaddref state 0x%X",
14473                             jaddref, jaddref->ja_state);
14474                 db_printf("\n");
14475         }
14476 }
14477 
14478 /* exported to ffs_vfsops.c */
14479 extern void db_print_ffs(struct ufsmount *ump);
14480 void
14481 db_print_ffs(struct ufsmount *ump)
14482 {
14483         db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
14484             ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
14485             ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
14486             ump->softdep_deps, ump->softdep_req);
14487 }
14488 
14489 #endif /* DDB */
14490 
14491 #endif /* SOFTUPDATES */

Cache object: c1e527735145d97a7d36eda6347fa279


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.