FreeBSD/Linux Kernel Cross Reference
sys/ufs/ffs/ffs_softdep.c


    1 /*-
    2  * Copyright 1998, 2000 Marshall Kirk McKusick.
    3  * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
    4  * All rights reserved.
    5  *
    6  * The soft updates code is derived from the appendix of a University
    7  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
    8  * "Soft Updates: A Solution to the Metadata Update Problem in File
    9  * Systems", CSE-TR-254-95, August 1995).
   10  *
   11  * Further information about soft updates can be obtained from:
   12  *
   13  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
   14  *      1614 Oxford Street              mckusick@mckusick.com
   15  *      Berkeley, CA 94709-1608         +1-510-843-9542
   16  *      USA
   17  *
   18  * Redistribution and use in source and binary forms, with or without
   19  * modification, are permitted provided that the following conditions
   20  * are met:
   21  *
   22  * 1. Redistributions of source code must retain the above copyright
   23  *    notice, this list of conditions and the following disclaimer.
   24  * 2. Redistributions in binary form must reproduce the above copyright
   25  *    notice, this list of conditions and the following disclaimer in the
   26  *    documentation and/or other materials provided with the distribution.
   27  *
   28  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
   29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   30  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   31  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
   34  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   35  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
   36  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
   37  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   38  *
   39  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
   40  */
   41 
   42 #include <sys/cdefs.h>
   43 __FBSDID("$FreeBSD$");
   44 
   45 #include "opt_ffs.h"
   46 #include "opt_quota.h"
   47 #include "opt_ddb.h"
   48 
   49 /*
   50  * For now we want the safety net that the DEBUG flag provides.
   51  */
   52 #ifndef DEBUG
   53 #define DEBUG
   54 #endif
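       /*
        * Editorial note: with DEBUG in effect, the WORKLIST_INSERT() and
        * WORKLIST_REMOVE() macros later in this file expand to the checked
        * worklist_insert()/worklist_remove() routines, and workitem_free()
        * performs additional state and type sanity checks.
        */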
   55 
   56 #include <sys/param.h>
   57 #include <sys/kernel.h>
   58 #include <sys/systm.h>
   59 #include <sys/bio.h>
   60 #include <sys/buf.h>
   61 #include <sys/kdb.h>
   62 #include <sys/kthread.h>
   63 #include <sys/ktr.h>
   64 #include <sys/limits.h>
   65 #include <sys/lock.h>
   66 #include <sys/malloc.h>
   67 #include <sys/mount.h>
   68 #include <sys/mutex.h>
   69 #include <sys/namei.h>
   70 #include <sys/priv.h>
   71 #include <sys/proc.h>
   72 #include <sys/racct.h>
   73 #include <sys/rwlock.h>
   74 #include <sys/stat.h>
   75 #include <sys/sysctl.h>
   76 #include <sys/syslog.h>
   77 #include <sys/vnode.h>
   78 #include <sys/conf.h>
   79 
   80 #include <ufs/ufs/dir.h>
   81 #include <ufs/ufs/extattr.h>
   82 #include <ufs/ufs/quota.h>
   83 #include <ufs/ufs/inode.h>
   84 #include <ufs/ufs/ufsmount.h>
   85 #include <ufs/ffs/fs.h>
   86 #include <ufs/ffs/softdep.h>
   87 #include <ufs/ffs/ffs_extern.h>
   88 #include <ufs/ufs/ufs_extern.h>
   89 
   90 #include <vm/vm.h>
   91 #include <vm/vm_extern.h>
   92 #include <vm/vm_object.h>
   93 
   94 #include <geom/geom.h>
   95 
   96 #include <ddb/ddb.h>
   97 
   98 #define KTR_SUJ 0       /* Define to KTR_SPARE. */
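       /*
        * Illustrative only: to actually collect SUJ trace records one could
        * define this to a spare KTR class from <sys/ktr.h>, e.g.
        *
        *	#define KTR_SUJ	KTR_SPARE2
        *
        * in a kernel built with KTR support.
        */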
   99 
  100 #ifndef SOFTUPDATES
  101 
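       /*
        * Kernels built without "options SOFTUPDATES" still link against
        * these entry points from the rest of FFS.  The stubs below satisfy
        * those references: paths reachable only when soft updates are
        * active panic(), while informational hooks return benign defaults.
        */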
  102 int
  103 softdep_flushfiles(oldmnt, flags, td)
  104         struct mount *oldmnt;
  105         int flags;
  106         struct thread *td;
  107 {
  108 
  109         panic("softdep_flushfiles called");
  110 }
  111 
  112 int
  113 softdep_mount(devvp, mp, fs, cred)
  114         struct vnode *devvp;
  115         struct mount *mp;
  116         struct fs *fs;
  117         struct ucred *cred;
  118 {
  119 
  120         return (0);
  121 }
  122 
  123 void
  124 softdep_initialize()
  125 {
  126 
  127         return;
  128 }
  129 
  130 void
  131 softdep_uninitialize()
  132 {
  133 
  134         return;
  135 }
  136 
  137 void
  138 softdep_unmount(mp)
  139         struct mount *mp;
  140 {
  141 
  142         panic("softdep_unmount called");
  143 }
  144 
  145 void
  146 softdep_setup_sbupdate(ump, fs, bp)
  147         struct ufsmount *ump;
  148         struct fs *fs;
  149         struct buf *bp;
  150 {
  151 
  152         panic("softdep_setup_sbupdate called");
  153 }
  154 
  155 void
  156 softdep_setup_inomapdep(bp, ip, newinum, mode)
  157         struct buf *bp;
  158         struct inode *ip;
  159         ino_t newinum;
  160         int mode;
  161 {
  162 
  163         panic("softdep_setup_inomapdep called");
  164 }
  165 
  166 void
  167 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
  168         struct buf *bp;
  169         struct mount *mp;
  170         ufs2_daddr_t newblkno;
  171         int frags;
  172         int oldfrags;
  173 {
  174 
  175         panic("softdep_setup_blkmapdep called");
  176 }
  177 
  178 void
  179 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
  180         struct inode *ip;
  181         ufs_lbn_t lbn;
  182         ufs2_daddr_t newblkno;
  183         ufs2_daddr_t oldblkno;
  184         long newsize;
  185         long oldsize;
  186         struct buf *bp;
  187 {
  188         
  189         panic("softdep_setup_allocdirect called");
  190 }
  191 
  192 void
  193 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
  194         struct inode *ip;
  195         ufs_lbn_t lbn;
  196         ufs2_daddr_t newblkno;
  197         ufs2_daddr_t oldblkno;
  198         long newsize;
  199         long oldsize;
  200         struct buf *bp;
  201 {
  202         
  203         panic("softdep_setup_allocext called");
  204 }
  205 
  206 void
  207 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
  208         struct inode *ip;
  209         ufs_lbn_t lbn;
  210         struct buf *bp;
  211         int ptrno;
  212         ufs2_daddr_t newblkno;
  213         ufs2_daddr_t oldblkno;
  214         struct buf *nbp;
  215 {
  216 
  217         panic("softdep_setup_allocindir_page called");
  218 }
  219 
  220 void
  221 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
  222         struct buf *nbp;
  223         struct inode *ip;
  224         struct buf *bp;
  225         int ptrno;
  226         ufs2_daddr_t newblkno;
  227 {
  228 
  229         panic("softdep_setup_allocindir_meta called");
  230 }
  231 
  232 void
  233 softdep_journal_freeblocks(ip, cred, length, flags)
  234         struct inode *ip;
  235         struct ucred *cred;
  236         off_t length;
  237         int flags;
  238 {
  239         
  240         panic("softdep_journal_freeblocks called");
  241 }
  242 
  243 void
  244 softdep_journal_fsync(ip)
  245         struct inode *ip;
  246 {
  247 
  248         panic("softdep_journal_fsync called");
  249 }
  250 
  251 void
  252 softdep_setup_freeblocks(ip, length, flags)
  253         struct inode *ip;
  254         off_t length;
  255         int flags;
  256 {
  257         
  258         panic("softdep_setup_freeblocks called");
  259 }
  260 
  261 void
  262 softdep_freefile(pvp, ino, mode)
  263                 struct vnode *pvp;
  264                 ino_t ino;
  265                 int mode;
  266 {
  267 
  268         panic("softdep_freefile called");
  269 }
  270 
  271 int
  272 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
  273         struct buf *bp;
  274         struct inode *dp;
  275         off_t diroffset;
  276         ino_t newinum;
  277         struct buf *newdirbp;
  278         int isnewblk;
  279 {
  280 
  281         panic("softdep_setup_directory_add called");
  282 }
  283 
  284 void
  285 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
  286         struct buf *bp;
  287         struct inode *dp;
  288         caddr_t base;
  289         caddr_t oldloc;
  290         caddr_t newloc;
  291         int entrysize;
  292 {
  293 
  294         panic("softdep_change_directoryentry_offset called");
  295 }
  296 
  297 void
  298 softdep_setup_remove(bp, dp, ip, isrmdir)
  299         struct buf *bp;
  300         struct inode *dp;
  301         struct inode *ip;
  302         int isrmdir;
  303 {
  304         
  305         panic("softdep_setup_remove called");
  306 }
  307 
  308 void
  309 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
  310         struct buf *bp;
  311         struct inode *dp;
  312         struct inode *ip;
  313         ino_t newinum;
  314         int isrmdir;
  315 {
  316 
  317         panic("softdep_setup_directory_change called");
  318 }
  319 
  320 void
  321 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
  322         struct mount *mp;
  323         struct buf *bp;
  324         ufs2_daddr_t blkno;
  325         int frags;
  326         struct workhead *wkhd;
  327 {
  328 
  329         panic("%s called", __FUNCTION__);
  330 }
  331 
  332 void
  333 softdep_setup_inofree(mp, bp, ino, wkhd)
  334         struct mount *mp;
  335         struct buf *bp;
  336         ino_t ino;
  337         struct workhead *wkhd;
  338 {
  339 
  340         panic("%s called", __FUNCTION__);
  341 }
  342 
  343 void
  344 softdep_setup_unlink(dp, ip)
  345         struct inode *dp;
  346         struct inode *ip;
  347 {
  348 
  349         panic("%s called", __FUNCTION__);
  350 }
  351 
  352 void
  353 softdep_setup_link(dp, ip)
  354         struct inode *dp;
  355         struct inode *ip;
  356 {
  357 
  358         panic("%s called", __FUNCTION__);
  359 }
  360 
  361 void
  362 softdep_revert_link(dp, ip)
  363         struct inode *dp;
  364         struct inode *ip;
  365 {
  366 
  367         panic("%s called", __FUNCTION__);
  368 }
  369 
  370 void
  371 softdep_setup_rmdir(dp, ip)
  372         struct inode *dp;
  373         struct inode *ip;
  374 {
  375 
  376         panic("%s called", __FUNCTION__);
  377 }
  378 
  379 void
  380 softdep_revert_rmdir(dp, ip)
  381         struct inode *dp;
  382         struct inode *ip;
  383 {
  384 
  385         panic("%s called", __FUNCTION__);
  386 }
  387 
  388 void
  389 softdep_setup_create(dp, ip)
  390         struct inode *dp;
  391         struct inode *ip;
  392 {
  393 
  394         panic("%s called", __FUNCTION__);
  395 }
  396 
  397 void
  398 softdep_revert_create(dp, ip)
  399         struct inode *dp;
  400         struct inode *ip;
  401 {
  402 
  403         panic("%s called", __FUNCTION__);
  404 }
  405 
  406 void
  407 softdep_setup_mkdir(dp, ip)
  408         struct inode *dp;
  409         struct inode *ip;
  410 {
  411 
  412         panic("%s called", __FUNCTION__);
  413 }
  414 
  415 void
  416 softdep_revert_mkdir(dp, ip)
  417         struct inode *dp;
  418         struct inode *ip;
  419 {
  420 
  421         panic("%s called", __FUNCTION__);
  422 }
  423 
  424 void
  425 softdep_setup_dotdot_link(dp, ip)
  426         struct inode *dp;
  427         struct inode *ip;
  428 {
  429 
  430         panic("%s called", __FUNCTION__);
  431 }
  432 
  433 int
  434 softdep_prealloc(vp, waitok)
  435         struct vnode *vp;
  436         int waitok;
  437 {
  438 
  439         panic("%s called", __FUNCTION__);
  440 }
  441 
  442 int
  443 softdep_journal_lookup(mp, vpp)
  444         struct mount *mp;
  445         struct vnode **vpp;
  446 {
  447 
  448         return (ENOENT);
  449 }
  450 
  451 void
  452 softdep_change_linkcnt(ip)
  453         struct inode *ip;
  454 {
  455 
  456         panic("softdep_change_linkcnt called");
  457 }
  458 
  459 void 
  460 softdep_load_inodeblock(ip)
  461         struct inode *ip;
  462 {
  463 
  464         panic("softdep_load_inodeblock called");
  465 }
  466 
  467 void
  468 softdep_update_inodeblock(ip, bp, waitfor)
  469         struct inode *ip;
  470         struct buf *bp;
  471         int waitfor;
  472 {
  473 
  474         panic("softdep_update_inodeblock called");
  475 }
  476 
  477 int
  478 softdep_fsync(vp)
  479         struct vnode *vp;       /* the "in_core" copy of the inode */
  480 {
  481 
  482         return (0);
  483 }
  484 
  485 void
  486 softdep_fsync_mountdev(vp)
  487         struct vnode *vp;
  488 {
  489 
  490         return;
  491 }
  492 
  493 int
  494 softdep_flushworklist(oldmnt, countp, td)
  495         struct mount *oldmnt;
  496         int *countp;
  497         struct thread *td;
  498 {
  499 
  500         *countp = 0;
  501         return (0);
  502 }
  503 
  504 int
  505 softdep_sync_metadata(struct vnode *vp)
  506 {
  507 
  508         panic("softdep_sync_metadata called");
  509 }
  510 
  511 int
  512 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
  513 {
  514 
  515         panic("softdep_sync_buf called");
  516 }
  517 
  518 int
  519 softdep_slowdown(vp)
  520         struct vnode *vp;
  521 {
  522 
  523         panic("softdep_slowdown called");
  524 }
  525 
  526 int
  527 softdep_request_cleanup(fs, vp, cred, resource)
  528         struct fs *fs;
  529         struct vnode *vp;
  530         struct ucred *cred;
  531         int resource;
  532 {
  533 
  534         return (0);
  535 }
  536 
  537 int
  538 softdep_check_suspend(struct mount *mp,
  539                       struct vnode *devvp,
  540                       int softdep_depcnt,
  541                       int softdep_accdepcnt,
  542                       int secondary_writes,
  543                       int secondary_accwrites)
  544 {
  545         struct bufobj *bo;
  546         int error;
  547         
   548         (void) softdep_depcnt;
  549         (void) softdep_accdepcnt;
  550 
  551         bo = &devvp->v_bufobj;
  552         ASSERT_BO_WLOCKED(bo);
  553 
  554         MNT_ILOCK(mp);
  555         while (mp->mnt_secondary_writes != 0) {
  556                 BO_UNLOCK(bo);
  557                 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
  558                     (PUSER - 1) | PDROP, "secwr", 0);
  559                 BO_LOCK(bo);
  560                 MNT_ILOCK(mp);
  561         }
  562 
  563         /*
  564          * Reasons for needing more work before suspend:
  565          * - Dirty buffers on devvp.
   566          * - Secondary writes occurred after start of vnode sync loop.
  567          */
  568         error = 0;
  569         if (bo->bo_numoutput > 0 ||
  570             bo->bo_dirty.bv_cnt > 0 ||
  571             secondary_writes != 0 ||
  572             mp->mnt_secondary_writes != 0 ||
  573             secondary_accwrites != mp->mnt_secondary_accwrites)
  574                 error = EAGAIN;
  575         BO_UNLOCK(bo);
  576         return (error);
  577 }
  578 
  579 void
  580 softdep_get_depcounts(struct mount *mp,
  581                       int *softdepactivep,
  582                       int *softdepactiveaccp)
  583 {
  584         (void) mp;
  585         *softdepactivep = 0;
  586         *softdepactiveaccp = 0;
  587 }
  588 
  589 void
  590 softdep_buf_append(bp, wkhd)
  591         struct buf *bp;
  592         struct workhead *wkhd;
  593 {
  594 
   595         panic("softdep_buf_append called");
  596 }
  597 
  598 void
  599 softdep_inode_append(ip, cred, wkhd)
  600         struct inode *ip;
  601         struct ucred *cred;
  602         struct workhead *wkhd;
  603 {
  604 
   605         panic("softdep_inode_append called");
  606 }
  607 
  608 void
  609 softdep_freework(wkhd)
  610         struct workhead *wkhd;
  611 {
  612 
  613         panic("softdep_freework called");
  614 }
  615 
  616 #else
  617 
  618 FEATURE(softupdates, "FFS soft-updates support");
  619 
  620 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
  621     "soft updates stats");
  622 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
  623     "total dependencies allocated");
  624 static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
  625     "high use dependencies allocated");
  626 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
  627     "current dependencies allocated");
  628 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
  629     "current dependencies written");
  630 
  631 unsigned long dep_current[D_LAST + 1];
  632 unsigned long dep_highuse[D_LAST + 1];
  633 unsigned long dep_total[D_LAST + 1];
  634 unsigned long dep_write[D_LAST + 1];
  635 
  636 #define SOFTDEP_TYPE(type, str, long)                                   \
  637     static MALLOC_DEFINE(M_ ## type, #str, long);                       \
  638     SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,       \
  639         &dep_total[D_ ## type], 0, "");                                 \
  640     SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD,     \
  641         &dep_current[D_ ## type], 0, "");                               \
  642     SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD,     \
  643         &dep_highuse[D_ ## type], 0, "");                               \
  644     SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD,       \
  645         &dep_write[D_ ## type], 0, "");
  646 
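       /*
        * For example, the first invocation below,
        * SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"),
        * defines the malloc type M_PAGEDEP and exports the read-only
        * counters debug.softdep.total.pagedep, .current.pagedep,
        * .highuse.pagedep, and .write.pagedep.
        */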
  647 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 
  648 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
  649 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
  650     "Block or frag allocated from cyl group map");
  651 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
  652 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
  653 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
  654 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
  655 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
  656 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
  657 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
  658 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
  659 SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
  660 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
  661 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
   662 SOFTDEP_TYPE(FREEWORK, freework, "Free an inode block");
   663 SOFTDEP_TYPE(FREEDEP, freedep, "Track a block free");
  664 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
  665 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
  666 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
  667 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
  668 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
  669 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
  670 SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
  671 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
  672 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
  673 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
  674 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
  675 
  676 static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
  677 
  678 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
  679 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
  680 static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
  681 
  682 #define M_SOFTDEP_FLAGS (M_WAITOK)
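       /*
        * M_WAITOK allocations may sleep but never fail, so the
        * dependency-allocation paths below need no NULL checks.
        */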
  683 
   684 /*
   685  * Translate from workitem type to memory type.
   686  * MUST match the defines above, such that memtype[D_XXX] == M_XXX.
   687  */
  688 static struct malloc_type *memtype[] = {
  689         M_PAGEDEP,
  690         M_INODEDEP,
  691         M_BMSAFEMAP,
  692         M_NEWBLK,
  693         M_ALLOCDIRECT,
  694         M_INDIRDEP,
  695         M_ALLOCINDIR,
  696         M_FREEFRAG,
  697         M_FREEBLKS,
  698         M_FREEFILE,
  699         M_DIRADD,
  700         M_MKDIR,
  701         M_DIRREM,
  702         M_NEWDIRBLK,
  703         M_FREEWORK,
  704         M_FREEDEP,
  705         M_JADDREF,
  706         M_JREMREF,
  707         M_JMVREF,
  708         M_JNEWBLK,
  709         M_JFREEBLK,
  710         M_JFREEFRAG,
  711         M_JSEG,
  712         M_JSEGDEP,
  713         M_SBDEP,
  714         M_JTRUNC,
  715         M_JFSYNC,
  716         M_SENTINEL
  717 };
  718 
  719 #define DtoM(type) (memtype[type])
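       /*
        * Editorial sketch (not in the original source): the memtype[]
        * table must stay in lockstep with the D_* workitem constants.
        * A compile-time guard such as
        *
        *	CTASSERT(nitems(memtype) == D_LAST + 1);
        *
        * would catch a table/constant size mismatch at build time.
        */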
  720 
  721 /*
  722  * Names of malloc types.
  723  */
  724 #define TYPENAME(type)  \
  725         ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
  726 /*
  727  * End system adaptation definitions.
  728  */
  729 
  730 #define DOTDOT_OFFSET   offsetof(struct dirtemplate, dotdot_ino)
  731 #define DOT_OFFSET      offsetof(struct dirtemplate, dot_ino)
  732 
  733 /*
  734  * Internal function prototypes.
  735  */
  736 static  void check_clear_deps(struct mount *);
  737 static  void softdep_error(char *, int);
  738 static  int softdep_process_worklist(struct mount *, int);
  739 static  int softdep_waitidle(struct mount *, int);
  740 static  void drain_output(struct vnode *);
  741 static  struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
  742 static  int check_inodedep_free(struct inodedep *);
  743 static  void clear_remove(struct mount *);
  744 static  void clear_inodedeps(struct mount *);
  745 static  void unlinked_inodedep(struct mount *, struct inodedep *);
  746 static  void clear_unlinked_inodedep(struct inodedep *);
  747 static  struct inodedep *first_unlinked_inodedep(struct ufsmount *);
  748 static  int flush_pagedep_deps(struct vnode *, struct mount *,
  749             struct diraddhd *);
  750 static  int free_pagedep(struct pagedep *);
  751 static  int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
  752 static  int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
  753 static  int flush_deplist(struct allocdirectlst *, int, int *);
  754 static  int sync_cgs(struct mount *, int);
  755 static  int handle_written_filepage(struct pagedep *, struct buf *, int);
  756 static  int handle_written_sbdep(struct sbdep *, struct buf *);
  757 static  void initiate_write_sbdep(struct sbdep *);
  758 static  void diradd_inode_written(struct diradd *, struct inodedep *);
  759 static  int handle_written_indirdep(struct indirdep *, struct buf *,
  760             struct buf**, int);
  761 static  int handle_written_inodeblock(struct inodedep *, struct buf *, int);
  762 static  int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
  763             uint8_t *);
  764 static  int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
  765 static  void handle_written_jaddref(struct jaddref *);
  766 static  void handle_written_jremref(struct jremref *);
  767 static  void handle_written_jseg(struct jseg *, struct buf *);
  768 static  void handle_written_jnewblk(struct jnewblk *);
  769 static  void handle_written_jblkdep(struct jblkdep *);
  770 static  void handle_written_jfreefrag(struct jfreefrag *);
  771 static  void complete_jseg(struct jseg *);
  772 static  void complete_jsegs(struct jseg *);
  773 static  void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
  774 static  void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
  775 static  void jremref_write(struct jremref *, struct jseg *, uint8_t *);
  776 static  void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
  777 static  void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
  778 static  void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
  779 static  void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
  780 static  void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
  781 static  void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
  782 static  inline void inoref_write(struct inoref *, struct jseg *,
  783             struct jrefrec *);
  784 static  void handle_allocdirect_partdone(struct allocdirect *,
  785             struct workhead *);
  786 static  struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
  787             struct workhead *);
  788 static  void indirdep_complete(struct indirdep *);
  789 static  int indirblk_lookup(struct mount *, ufs2_daddr_t);
  790 static  void indirblk_insert(struct freework *);
  791 static  void indirblk_remove(struct freework *);
  792 static  void handle_allocindir_partdone(struct allocindir *);
  793 static  void initiate_write_filepage(struct pagedep *, struct buf *);
  794 static  void initiate_write_indirdep(struct indirdep*, struct buf *);
  795 static  void handle_written_mkdir(struct mkdir *, int);
  796 static  int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
  797             uint8_t *);
  798 static  void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
  799 static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
  800 static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
  801 static  void handle_workitem_freefile(struct freefile *);
  802 static  int handle_workitem_remove(struct dirrem *, int);
  803 static  struct dirrem *newdirrem(struct buf *, struct inode *,
  804             struct inode *, int, struct dirrem **);
  805 static  struct indirdep *indirdep_lookup(struct mount *, struct inode *,
  806             struct buf *);
  807 static  void cancel_indirdep(struct indirdep *, struct buf *,
  808             struct freeblks *);
  809 static  void free_indirdep(struct indirdep *);
  810 static  void free_diradd(struct diradd *, struct workhead *);
  811 static  void merge_diradd(struct inodedep *, struct diradd *);
  812 static  void complete_diradd(struct diradd *);
  813 static  struct diradd *diradd_lookup(struct pagedep *, int);
  814 static  struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
  815             struct jremref *);
  816 static  struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
  817             struct jremref *);
  818 static  void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
  819             struct jremref *, struct jremref *);
  820 static  void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
  821             struct jremref *);
  822 static  void cancel_allocindir(struct allocindir *, struct buf *bp,
  823             struct freeblks *, int);
  824 static  int setup_trunc_indir(struct freeblks *, struct inode *,
  825             ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
  826 static  void complete_trunc_indir(struct freework *);
  827 static  void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
  828             int);
  829 static  void complete_mkdir(struct mkdir *);
  830 static  void free_newdirblk(struct newdirblk *);
  831 static  void free_jremref(struct jremref *);
  832 static  void free_jaddref(struct jaddref *);
  833 static  void free_jsegdep(struct jsegdep *);
  834 static  void free_jsegs(struct jblocks *);
  835 static  void rele_jseg(struct jseg *);
  836 static  void free_jseg(struct jseg *, struct jblocks *);
  837 static  void free_jnewblk(struct jnewblk *);
  838 static  void free_jblkdep(struct jblkdep *);
  839 static  void free_jfreefrag(struct jfreefrag *);
  840 static  void free_freedep(struct freedep *);
  841 static  void journal_jremref(struct dirrem *, struct jremref *,
  842             struct inodedep *);
  843 static  void cancel_jnewblk(struct jnewblk *, struct workhead *);
  844 static  int cancel_jaddref(struct jaddref *, struct inodedep *,
  845             struct workhead *);
  846 static  void cancel_jfreefrag(struct jfreefrag *);
  847 static  inline void setup_freedirect(struct freeblks *, struct inode *,
  848             int, int);
  849 static  inline void setup_freeext(struct freeblks *, struct inode *, int, int);
  850 static  inline void setup_freeindir(struct freeblks *, struct inode *, int,
  851             ufs_lbn_t, int);
  852 static  inline struct freeblks *newfreeblks(struct mount *, struct inode *);
  853 static  void freeblks_free(struct ufsmount *, struct freeblks *, int);
  854 static  void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
  855 static  ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
  856 static  int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
  857 static  void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
  858             int, int);
  859 static  void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
  860 static  int cancel_pagedep(struct pagedep *, struct freeblks *, int);
  861 static  int deallocate_dependencies(struct buf *, struct freeblks *, int);
  862 static  void newblk_freefrag(struct newblk*);
  863 static  void free_newblk(struct newblk *);
  864 static  void cancel_allocdirect(struct allocdirectlst *,
  865             struct allocdirect *, struct freeblks *);
  866 static  int check_inode_unwritten(struct inodedep *);
  867 static  int free_inodedep(struct inodedep *);
  868 static  void freework_freeblock(struct freework *);
  869 static  void freework_enqueue(struct freework *);
  870 static  int handle_workitem_freeblocks(struct freeblks *, int);
  871 static  int handle_complete_freeblocks(struct freeblks *, int);
  872 static  void handle_workitem_indirblk(struct freework *);
  873 static  void handle_written_freework(struct freework *);
  874 static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
  875 static  struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
  876             struct workhead *);
  877 static  struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
  878             struct inodedep *, struct allocindir *, ufs_lbn_t);
  879 static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
  880             ufs2_daddr_t, ufs_lbn_t);
  881 static  void handle_workitem_freefrag(struct freefrag *);
  882 static  struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
  883             ufs_lbn_t);
  884 static  void allocdirect_merge(struct allocdirectlst *,
  885             struct allocdirect *, struct allocdirect *);
  886 static  struct freefrag *allocindir_merge(struct allocindir *,
  887             struct allocindir *);
  888 static  int bmsafemap_find(struct bmsafemap_hashhead *, int,
  889             struct bmsafemap **);
  890 static  struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
  891             int cg, struct bmsafemap *);
  892 static  int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
  893             struct newblk **);
  894 static  int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
  895 static  int inodedep_find(struct inodedep_hashhead *, ino_t,
  896             struct inodedep **);
  897 static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
  898 static  int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
  899             int, struct pagedep **);
  900 static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
  901             struct pagedep **);
  902 static  void pause_timer(void *);
  903 static  int request_cleanup(struct mount *, int);
  904 static  int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
  905 static  void schedule_cleanup(struct mount *);
  906 static void softdep_ast_cleanup_proc(struct thread *);
  907 static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
  908 static  int process_worklist_item(struct mount *, int, int);
  909 static  void process_removes(struct vnode *);
  910 static  void process_truncates(struct vnode *);
  911 static  void jwork_move(struct workhead *, struct workhead *);
  912 static  void jwork_insert(struct workhead *, struct jsegdep *);
  913 static  void add_to_worklist(struct worklist *, int);
  914 static  void wake_worklist(struct worklist *);
  915 static  void wait_worklist(struct worklist *, char *);
  916 static  void remove_from_worklist(struct worklist *);
  917 static  void softdep_flush(void *);
  918 static  void softdep_flushjournal(struct mount *);
  919 static  int softdep_speedup(struct ufsmount *);
  920 static  void worklist_speedup(struct mount *);
  921 static  int journal_mount(struct mount *, struct fs *, struct ucred *);
  922 static  void journal_unmount(struct ufsmount *);
  923 static  int journal_space(struct ufsmount *, int);
  924 static  void journal_suspend(struct ufsmount *);
  925 static  int journal_unsuspend(struct ufsmount *ump);
  926 static  void softdep_prelink(struct vnode *, struct vnode *);
  927 static  void add_to_journal(struct worklist *);
  928 static  void remove_from_journal(struct worklist *);
  929 static  bool softdep_excess_items(struct ufsmount *, int);
  930 static  void softdep_process_journal(struct mount *, struct worklist *, int);
  931 static  struct jremref *newjremref(struct dirrem *, struct inode *,
  932             struct inode *ip, off_t, nlink_t);
  933 static  struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
  934             uint16_t);
  935 static  inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
  936             uint16_t);
  937 static  inline struct jsegdep *inoref_jseg(struct inoref *);
  938 static  struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
  939 static  struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
  940             ufs2_daddr_t, int);
  941 static  void adjust_newfreework(struct freeblks *, int);
  942 static  struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
  943 static  void move_newblock_dep(struct jaddref *, struct inodedep *);
  944 static  void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
  945 static  struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
  946             ufs2_daddr_t, long, ufs_lbn_t);
  947 static  struct freework *newfreework(struct ufsmount *, struct freeblks *,
  948             struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
  949 static  int jwait(struct worklist *, int);
  950 static  struct inodedep *inodedep_lookup_ip(struct inode *);
  951 static  int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
  952 static  struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
  953 static  void handle_jwork(struct workhead *);
  954 static  struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
  955             struct mkdir **);
  956 static  struct jblocks *jblocks_create(void);
  957 static  ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
  958 static  void jblocks_free(struct jblocks *, struct mount *, int);
  959 static  void jblocks_destroy(struct jblocks *);
  960 static  void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
  961 
  962 /*
  963  * Exported softdep operations.
  964  */
  965 static  void softdep_disk_io_initiation(struct buf *);
  966 static  void softdep_disk_write_complete(struct buf *);
  967 static  void softdep_deallocate_dependencies(struct buf *);
  968 static  int softdep_count_dependencies(struct buf *bp, int);
  969 
  970 /*
  971  * Global lock over all of soft updates.
  972  */
  973 static struct mtx lk;
  974 MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
  975 
  976 #define ACQUIRE_GBLLOCK(lk)     mtx_lock(lk)
  977 #define FREE_GBLLOCK(lk)        mtx_unlock(lk)
  978 #define GBLLOCK_OWNED(lk)       mtx_assert((lk), MA_OWNED)
  979 
  980 /*
  981  * Per-filesystem soft-updates locking.
  982  */
  983 #define LOCK_PTR(ump)           (&(ump)->um_softdep->sd_fslock)
  984 #define TRY_ACQUIRE_LOCK(ump)   rw_try_wlock(&(ump)->um_softdep->sd_fslock)
  985 #define ACQUIRE_LOCK(ump)       rw_wlock(&(ump)->um_softdep->sd_fslock)
  986 #define FREE_LOCK(ump)          rw_wunlock(&(ump)->um_softdep->sd_fslock)
  987 #define LOCK_OWNED(ump)         rw_assert(&(ump)->um_softdep->sd_fslock, \
  988                                     RA_WLOCKED)
  989 
  990 #define BUF_AREC(bp)            lockallowrecurse(&(bp)->b_lock)
  991 #define BUF_NOREC(bp)           lockdisablerecurse(&(bp)->b_lock)
  992 
  993 /*
  994  * Worklist queue management.
  995  * These routines require that the lock be held.
  996  */
  997 #ifndef /* NOT */ DEBUG
  998 #define WORKLIST_INSERT(head, item) do {        \
  999         (item)->wk_state |= ONWORKLIST;         \
 1000         LIST_INSERT_HEAD(head, item, wk_list);  \
 1001 } while (0)
 1002 #define WORKLIST_REMOVE(item) do {              \
 1003         (item)->wk_state &= ~ONWORKLIST;        \
 1004         LIST_REMOVE(item, wk_list);             \
 1005 } while (0)
 1006 #define WORKLIST_INSERT_UNLOCKED        WORKLIST_INSERT
 1007 #define WORKLIST_REMOVE_UNLOCKED        WORKLIST_REMOVE
 1008 
 1009 #else /* DEBUG */
 1010 static  void worklist_insert(struct workhead *, struct worklist *, int);
 1011 static  void worklist_remove(struct worklist *, int);
 1012 
 1013 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
 1014 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
 1015 #define WORKLIST_REMOVE(item) worklist_remove(item, 1)
 1016 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
 1017 
 1018 static void
 1019 worklist_insert(head, item, locked)
 1020         struct workhead *head;
 1021         struct worklist *item;
 1022         int locked;
 1023 {
 1024 
 1025         if (locked)
 1026                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
 1027         if (item->wk_state & ONWORKLIST)
 1028                 panic("worklist_insert: %p %s(0x%X) already on list",
 1029                     item, TYPENAME(item->wk_type), item->wk_state);
 1030         item->wk_state |= ONWORKLIST;
 1031         LIST_INSERT_HEAD(head, item, wk_list);
 1032 }
 1033 
 1034 static void
 1035 worklist_remove(item, locked)
 1036         struct worklist *item;
 1037         int locked;
 1038 {
 1039 
 1040         if (locked)
 1041                 LOCK_OWNED(VFSTOUFS(item->wk_mp));
 1042         if ((item->wk_state & ONWORKLIST) == 0)
 1043                 panic("worklist_remove: %p %s(0x%X) not on list",
 1044                     item, TYPENAME(item->wk_type), item->wk_state);
 1045         item->wk_state &= ~ONWORKLIST;
 1046         LIST_REMOVE(item, wk_list);
 1047 }
 1048 #endif /* DEBUG */
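       /*
        * Representative usage (a sketch): dependency structures are queued
        * on a buffer's dependency list through their embedded worklist, e.g.
        *
        *	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
        *
        * and are removed with WORKLIST_REMOVE() once the write they track
        * completes.
        */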
 1049 
 1050 /*
 1051  * Merge two jsegdeps keeping only the oldest one as newer references
 1052  * can't be discarded until after older references.
 1053  */
 1054 static inline struct jsegdep *
 1055 jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
 1056 {
 1057         struct jsegdep *swp;
 1058 
 1059         if (two == NULL)
 1060                 return (one);
 1061 
 1062         if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
 1063                 swp = one;
 1064                 one = two;
 1065                 two = swp;
 1066         }
 1067         WORKLIST_REMOVE(&two->jd_list);
 1068         free_jsegdep(two);
 1069 
 1070         return (one);
 1071 }
 1072 
 1073 /*
 1074  * If two freedeps are compatible free one to reduce list size.
 1075  */
 1076 static inline struct freedep *
 1077 freedep_merge(struct freedep *one, struct freedep *two)
 1078 {
 1079         if (two == NULL)
 1080                 return (one);
 1081 
 1082         if (one->fd_freework == two->fd_freework) {
 1083                 WORKLIST_REMOVE(&two->fd_list);
 1084                 free_freedep(two);
 1085         }
 1086         return (one);
 1087 }
 1088 
 1089 /*
 1090  * Move journal work from one list to another.  Duplicate freedeps and
 1091  * jsegdeps are coalesced to keep the lists as small as possible.
 1092  */
 1093 static void
 1094 jwork_move(dst, src)
 1095         struct workhead *dst;
 1096         struct workhead *src;
 1097 {
 1098         struct freedep *freedep;
 1099         struct jsegdep *jsegdep;
 1100         struct worklist *wkn;
 1101         struct worklist *wk;
 1102 
 1103         KASSERT(dst != src,
 1104             ("jwork_move: dst == src"));
 1105         freedep = NULL;
 1106         jsegdep = NULL;
 1107         LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
 1108                 if (wk->wk_type == D_JSEGDEP)
 1109                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
 1110                 else if (wk->wk_type == D_FREEDEP)
 1111                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
 1112         }
 1113 
 1114         while ((wk = LIST_FIRST(src)) != NULL) {
 1115                 WORKLIST_REMOVE(wk);
 1116                 WORKLIST_INSERT(dst, wk);
 1117                 if (wk->wk_type == D_JSEGDEP) {
 1118                         jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
 1119                         continue;
 1120                 }
 1121                 if (wk->wk_type == D_FREEDEP)
 1122                         freedep = freedep_merge(WK_FREEDEP(wk), freedep);
 1123         }
 1124 }
 1125 
 1126 static void
 1127 jwork_insert(dst, jsegdep)
 1128         struct workhead *dst;
 1129         struct jsegdep *jsegdep;
 1130 {
 1131         struct jsegdep *jsegdepn;
 1132         struct worklist *wk;
 1133 
 1134         LIST_FOREACH(wk, dst, wk_list)
 1135                 if (wk->wk_type == D_JSEGDEP)
 1136                         break;
 1137         if (wk == NULL) {
 1138                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
 1139                 return;
 1140         }
 1141         jsegdepn = WK_JSEGDEP(wk);
 1142         if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
 1143                 WORKLIST_REMOVE(wk);
 1144                 free_jsegdep(jsegdepn);
 1145                 WORKLIST_INSERT(dst, &jsegdep->jd_list);
 1146         } else
 1147                 free_jsegdep(jsegdep);
 1148 }
 1149 
 1150 /*
 1151  * Routines for tracking and managing workitems.
 1152  */
 1153 static  void workitem_free(struct worklist *, int);
 1154 static  void workitem_alloc(struct worklist *, int, struct mount *);
 1155 static  void workitem_reassign(struct worklist *, int);
 1156 
 1157 #define WORKITEM_FREE(item, type) \
 1158         workitem_free((struct worklist *)(item), (type))
 1159 #define WORKITEM_REASSIGN(item, type) \
 1160         workitem_reassign((struct worklist *)(item), (type))
 1161 
 1162 static void
 1163 workitem_free(item, type)
 1164         struct worklist *item;
 1165         int type;
 1166 {
 1167         struct ufsmount *ump;
 1168 
 1169 #ifdef DEBUG
 1170         if (item->wk_state & ONWORKLIST)
 1171                 panic("workitem_free: %s(0x%X) still on list",
 1172                     TYPENAME(item->wk_type), item->wk_state);
 1173         if (item->wk_type != type && type != D_NEWBLK)
 1174                 panic("workitem_free: type mismatch %s != %s",
 1175                     TYPENAME(item->wk_type), TYPENAME(type));
 1176 #endif
 1177         if (item->wk_state & IOWAITING)
 1178                 wakeup(item);
 1179         ump = VFSTOUFS(item->wk_mp);
 1180         LOCK_OWNED(ump);
 1181         KASSERT(ump->softdep_deps > 0,
 1182             ("workitem_free: %s: softdep_deps going negative",
 1183             ump->um_fs->fs_fsmnt));
 1184         if (--ump->softdep_deps == 0 && ump->softdep_req)
 1185                 wakeup(&ump->softdep_deps);
 1186         KASSERT(dep_current[item->wk_type] > 0,
 1187             ("workitem_free: %s: dep_current[%s] going negative",
 1188             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 1189         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
 1190             ("workitem_free: %s: softdep_curdeps[%s] going negative",
 1191             ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 1192         atomic_subtract_long(&dep_current[item->wk_type], 1);
 1193         ump->softdep_curdeps[item->wk_type] -= 1;
 1194         free(item, DtoM(type));
 1195 }
 1196 
 1197 static void
 1198 workitem_alloc(item, type, mp)
 1199         struct worklist *item;
 1200         int type;
 1201         struct mount *mp;
 1202 {
 1203         struct ufsmount *ump;
 1204 
 1205         item->wk_type = type;
 1206         item->wk_mp = mp;
 1207         item->wk_state = 0;
 1208 
 1209         ump = VFSTOUFS(mp);
 1210         ACQUIRE_GBLLOCK(&lk);
 1211         dep_current[type]++;
 1212         if (dep_current[type] > dep_highuse[type])
 1213                 dep_highuse[type] = dep_current[type];
 1214         dep_total[type]++;
 1215         FREE_GBLLOCK(&lk);
 1216         ACQUIRE_LOCK(ump);
 1217         ump->softdep_curdeps[type] += 1;
 1218         ump->softdep_deps++;
 1219         ump->softdep_accdeps++;
 1220         FREE_LOCK(ump);
 1221 }
 1222 
 1223 static void
 1224 workitem_reassign(item, newtype)
 1225         struct worklist *item;
 1226         int newtype;
 1227 {
 1228         struct ufsmount *ump;
 1229 
 1230         ump = VFSTOUFS(item->wk_mp);
 1231         LOCK_OWNED(ump);
 1232         KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
 1233             ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
 1234             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 1235         ump->softdep_curdeps[item->wk_type] -= 1;
 1236         ump->softdep_curdeps[newtype] += 1;
 1237         KASSERT(dep_current[item->wk_type] > 0,
 1238             ("workitem_reassign: %s: dep_current[%s] going negative",
 1239             VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
 1240         ACQUIRE_GBLLOCK(&lk);
 1241         dep_current[newtype]++;
 1242         dep_current[item->wk_type]--;
 1243         if (dep_current[newtype] > dep_highuse[newtype])
 1244                 dep_highuse[newtype] = dep_current[newtype];
 1245         dep_total[newtype]++;
 1246         FREE_GBLLOCK(&lk);
 1247         item->wk_type = newtype;
 1248 }
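       /*
        * Locking note (editorial): the global dep_* statistics are covered
        * by the global lock "lk" (or updated atomically in workitem_free()),
        * while the per-mount softdep counters are covered by the per-ump
        * soft-updates lock.  Where both are needed, as in
        * workitem_reassign() above, the global lock is taken while the
        * per-ump lock is already held.
        */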
 1249 
 1250 /*
 1251  * Workitem queue management
 1252  */
 1253 static int max_softdeps;        /* maximum number of structs before slowdown */
 1254 static int tickdelay = 2;       /* number of ticks to pause during slowdown */
 1255 static int proc_waiting;        /* tracks whether we have a timeout posted */
 1256 static int *stat_countp;        /* statistic to count in proc_waiting timeout */
 1257 static struct callout softdep_callout;
 1258 static int req_clear_inodedeps; /* syncer process flush some inodedeps */
 1259 static int req_clear_remove;    /* syncer process flush some freeblks */
 1260 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
 1261 
 1262 /*
 1263  * runtime statistics
 1264  */
 1265 static int stat_flush_threads;  /* number of softdep flushing threads */
 1266 static int stat_worklist_push;  /* number of worklist cleanups */
 1267 static int stat_blk_limit_push; /* number of times block limit neared */
 1268 static int stat_ino_limit_push; /* number of times inode limit neared */
 1269 static int stat_blk_limit_hit;  /* number of times block slowdown imposed */
 1270 static int stat_ino_limit_hit;  /* number of times inode slowdown imposed */
 1271 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
 1272 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
 1273 static int stat_inode_bitmap;   /* bufs redirtied as inode bitmap not written */
 1274 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
 1275 static int stat_dir_entry;      /* bufs redirtied as dir entry cannot write */
  1276 static int stat_jaddref;        /* bufs redirtied as ino bitmap cannot write */
  1277 static int stat_jnewblk;        /* bufs redirtied as blk bitmap cannot write */
 1278 static int stat_journal_min;    /* Times hit journal min threshold */
 1279 static int stat_journal_low;    /* Times hit journal low threshold */
 1280 static int stat_journal_wait;   /* Times blocked in jwait(). */
 1281 static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */
 1282 static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */
 1283 static int stat_jwait_inode;    /* Times blocked in jwait() for inodes. */
 1284 static int stat_jwait_newblk;   /* Times blocked in jwait() for newblks. */
 1285 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
 1286 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
 1287 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
 1288 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
 1289 static int stat_cleanup_failures; /* Number of cleanup requests that failed */
 1290 static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
 1291 
 1292 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
 1293     &max_softdeps, 0, "");
 1294 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
 1295     &tickdelay, 0, "");
 1296 SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
 1297     &stat_flush_threads, 0, "");
 1298 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
 1299     &stat_worklist_push, 0,"");
 1300 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
 1301     &stat_blk_limit_push, 0,"");
 1302 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
 1303     &stat_ino_limit_push, 0,"");
 1304 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
 1305     &stat_blk_limit_hit, 0, "");
 1306 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
 1307     &stat_ino_limit_hit, 0, "");
 1308 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
 1309     &stat_sync_limit_hit, 0, "");
 1310 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
 1311     &stat_indir_blk_ptrs, 0, "");
 1312 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
 1313     &stat_inode_bitmap, 0, "");
 1314 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
 1315     &stat_direct_blk_ptrs, 0, "");
 1316 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
 1317     &stat_dir_entry, 0, "");
 1318 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
 1319     &stat_jaddref, 0, "");
 1320 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
 1321     &stat_jnewblk, 0, "");
 1322 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
 1323     &stat_journal_low, 0, "");
 1324 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
 1325     &stat_journal_min, 0, "");
 1326 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
 1327     &stat_journal_wait, 0, "");
 1328 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
 1329     &stat_jwait_filepage, 0, "");
 1330 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
 1331     &stat_jwait_freeblks, 0, "");
 1332 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
 1333     &stat_jwait_inode, 0, "");
 1334 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
 1335     &stat_jwait_newblk, 0, "");
 1336 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
 1337     &stat_cleanup_blkrequests, 0, "");
 1338 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
 1339     &stat_cleanup_inorequests, 0, "");
 1340 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
 1341     &stat_cleanup_high_delay, 0, "");
 1342 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
 1343     &stat_cleanup_retries, 0, "");
 1344 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
 1345     &stat_cleanup_failures, 0, "");
 1346 SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
 1347     &softdep_flushcache, 0, "");
 1348 SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
 1349     &stat_emptyjblocks, 0, "");
 1350 
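       /*
        * These knobs and counters appear under the debug.softdep sysctl
        * tree and can be inspected from userland, e.g. (illustrative):
        *
        *	# sysctl debug.softdep.max_softdeps debug.softdep.flush_threads
        */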
 1351 SYSCTL_DECL(_vfs_ffs);
 1352 
 1353 /* Whether to recompute the summary at mount time */
 1354 static int compute_summary_at_mount = 0;
 1355 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
 1356            &compute_summary_at_mount, 0, "Recompute summary at mount");
 1357 static int print_threads = 0;
 1358 SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
 1359     &print_threads, 0, "Notify flusher thread start/stop");
 1360 
 1361 /* List of all filesystems mounted with soft updates */
 1362 static TAILQ_HEAD(, mount_softdeps) softdepmounts;
 1363 
 1364 /*
 1365  * This function cleans the worklist for a filesystem.
 1366  * Each filesystem running with soft dependencies gets its own
 1367  * thread to run in this function. The thread is started up in
 1368  * softdep_mount and shutdown in softdep_unmount. They show up
 1369  * as part of the kernel "bufdaemon" process whose process
 1370  * entry is available in bufdaemonproc.
 1371  */
 1372 static int searchfailed;
 1373 extern struct proc *bufdaemonproc;
 1374 static void
 1375 softdep_flush(addr)
 1376         void *addr;
 1377 {
 1378         struct mount *mp;
 1379         struct thread *td;
 1380         struct ufsmount *ump;
 1381 
 1382         td = curthread;
 1383         td->td_pflags |= TDP_NORUNNINGBUF;
 1384         mp = (struct mount *)addr;
 1385         ump = VFSTOUFS(mp);
 1386         atomic_add_int(&stat_flush_threads, 1);
 1387         ACQUIRE_LOCK(ump);
 1388         ump->softdep_flags &= ~FLUSH_STARTING;
 1389         wakeup(&ump->softdep_flushtd);
 1390         FREE_LOCK(ump);
 1391         if (print_threads) {
 1392                 if (stat_flush_threads == 1)
 1393                         printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
 1394                             bufdaemonproc->p_pid);
 1395                 printf("Start thread %s\n", td->td_name);
 1396         }
 1397         for (;;) {      
 1398                 while (softdep_process_worklist(mp, 0) > 0 ||
 1399                     (MOUNTEDSUJ(mp) &&
 1400                     VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
 1401                         kthread_suspend_check();
 1402                 ACQUIRE_LOCK(ump);
 1403                 if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
 1404                         msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
 1405                             "sdflush", hz / 2);
 1406                 ump->softdep_flags &= ~FLUSH_CLEANUP;
 1407                 /*
 1408                  * Check to see if we are done and need to exit.
 1409                  */
 1410                 if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
 1411                         FREE_LOCK(ump);
 1412                         continue;
 1413                 }
 1414                 ump->softdep_flags &= ~FLUSH_EXIT;
 1415                 FREE_LOCK(ump);
 1416                 wakeup(&ump->softdep_flags);
 1417                 if (print_threads)
  1418                         printf("Stop thread %s: searchfailed %d, "
                                    "did cleanups %d\n", td->td_name, searchfailed,
                                    ump->um_softdep->sd_cleanups);
 1419                 atomic_subtract_int(&stat_flush_threads, 1);
 1420                 kthread_exit();
 1421                 panic("kthread_exit failed\n");
 1422         }
 1423 }
 1424 
 1425 static void
 1426 worklist_speedup(mp)
 1427         struct mount *mp;
 1428 {
 1429         struct ufsmount *ump;
 1430 
 1431         ump = VFSTOUFS(mp);
 1432         LOCK_OWNED(ump);
 1433         if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
 1434                 ump->softdep_flags |= FLUSH_CLEANUP;
 1435         wakeup(&ump->softdep_flushtd);
 1436 }
 1437 
 1438 static int
 1439 softdep_speedup(ump)
 1440         struct ufsmount *ump;
 1441 {
 1442         struct ufsmount *altump;
 1443         struct mount_softdeps *sdp;
 1444 
 1445         LOCK_OWNED(ump);
 1446         worklist_speedup(ump->um_mountp);
 1447         bd_speedup();
 1448         /*
 1449          * If we have global shortages, then we need other
 1450          * filesystems to help with the cleanup. Here we wakeup a
 1451          * flusher thread for a filesystem that is over its fair
 1452          * share of resources.
 1453          */
 1454         if (req_clear_inodedeps || req_clear_remove) {
 1455                 ACQUIRE_GBLLOCK(&lk);
 1456                 TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
 1457                         if ((altump = sdp->sd_ump) == ump)
 1458                                 continue;
 1459                         if (((req_clear_inodedeps &&
 1460                             altump->softdep_curdeps[D_INODEDEP] >
 1461                             max_softdeps / stat_flush_threads) ||
 1462                             (req_clear_remove &&
 1463                             altump->softdep_curdeps[D_DIRREM] >
 1464                             (max_softdeps / 2) / stat_flush_threads)) &&
 1465                             TRY_ACQUIRE_LOCK(altump))
 1466                                 break;
 1467                 }
 1468                 if (sdp == NULL) {
 1469                         searchfailed++;
 1470                         FREE_GBLLOCK(&lk);
 1471                 } else {
 1472                         /*
 1473                          * Move to the end of the list so we pick a
 1474                          * different one on our next try.
 1475                          */
 1476                         TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
 1477                         TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
 1478                         FREE_GBLLOCK(&lk);
 1479                         if ((altump->softdep_flags &
 1480                             (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
 1481                                 altump->softdep_flags |= FLUSH_CLEANUP;
 1482                         altump->um_softdep->sd_cleanups++;
 1483                         wakeup(&altump->softdep_flushtd);
 1484                         FREE_LOCK(altump);
 1485                 }
 1486         }
 1487         return (speedup_syncer());
 1488 }
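
      /*
       * Worked example for the fair-share test above (illustrative, with
       * assumed tunable values): with max_softdeps == 80000 and four
       * flusher threads running (stat_flush_threads == 4), another
       * filesystem is asked to help clear inodedeps once it holds more
       * than 80000 / 4 == 20000 of them, and to help clear dirrems once
       * it holds more than (80000 / 2) / 4 == 10000.
       */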
 1489 
 1490 /*
 1491  * Add an item to the end of the work queue.
 1492  * This routine requires that the lock be held.
 1493  * This is the only routine that adds items to the list.
 1494  * The following routine is the only one that removes items
 1495  * and does so in order from first to last.
 1496  */
 1497 
 1498 #define WK_HEAD         0x0001  /* Add to HEAD. */
 1499 #define WK_NODELAY      0x0002  /* Process immediately. */
 1500 
 1501 static void
 1502 add_to_worklist(wk, flags)
 1503         struct worklist *wk;
 1504         int flags;
 1505 {
 1506         struct ufsmount *ump;
 1507 
 1508         ump = VFSTOUFS(wk->wk_mp);
 1509         LOCK_OWNED(ump);
 1510         if (wk->wk_state & ONWORKLIST)
 1511                 panic("add_to_worklist: %s(0x%X) already on list",
 1512                     TYPENAME(wk->wk_type), wk->wk_state);
 1513         wk->wk_state |= ONWORKLIST;
 1514         if (ump->softdep_on_worklist == 0) {
 1515                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 1516                 ump->softdep_worklist_tail = wk;
 1517         } else if (flags & WK_HEAD) {
 1518                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 1519         } else {
 1520                 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
 1521                 ump->softdep_worklist_tail = wk;
 1522         }
 1523         ump->softdep_on_worklist += 1;
 1524         if (flags & WK_NODELAY)
 1525                 worklist_speedup(wk->wk_mp);
 1526 }
 1527 
 1528 /*
 1529  * Remove the item to be processed. If we are removing the last
 1530  * item on the list, we need to recalculate the tail pointer.
 1531  */
 1532 static void
 1533 remove_from_worklist(wk)
 1534         struct worklist *wk;
 1535 {
 1536         struct ufsmount *ump;
 1537 
 1538         ump = VFSTOUFS(wk->wk_mp);
 1539         if (ump->softdep_worklist_tail == wk)
 1540                 ump->softdep_worklist_tail =
 1541                     (struct worklist *)wk->wk_list.le_prev;
 1542         WORKLIST_REMOVE(wk);
 1543         ump->softdep_on_worklist -= 1;
 1544 }
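
      /*
       * Illustrative sketch (not part of this file): why the
       * "(struct worklist *)wk->wk_list.le_prev" cast above recovers the
       * previous element.  le_prev points at the previous element's
       * le_next field; when the LIST_ENTRY is the first member of its
       * structure, that address is the address of the previous element
       * itself.  A minimal standalone demonstration with hypothetical
       * names, excluded from compilation:
       */
      #if 0
      #include <sys/queue.h>
      #include <assert.h>

      struct item {
              LIST_ENTRY(item) link;          /* must be the first member */
              int value;
      };

      int
      main(void)
      {
              LIST_HEAD(, item) head = LIST_HEAD_INITIALIZER(head);
              struct item a, b;

              LIST_INSERT_HEAD(&head, &a, link);
              LIST_INSERT_AFTER(&a, &b, link);
              /* b's le_prev points at a's le_next, which is at offset 0. */
              assert((struct item *)b.link.le_prev == &a);
              return (0);
      }
      #endif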
 1545 
 1546 static void
 1547 wake_worklist(wk)
 1548         struct worklist *wk;
 1549 {
 1550         if (wk->wk_state & IOWAITING) {
 1551                 wk->wk_state &= ~IOWAITING;
 1552                 wakeup(wk);
 1553         }
 1554 }
 1555 
 1556 static void
 1557 wait_worklist(wk, wmesg)
 1558         struct worklist *wk;
 1559         char *wmesg;
 1560 {
 1561         struct ufsmount *ump;
 1562 
 1563         ump = VFSTOUFS(wk->wk_mp);
 1564         wk->wk_state |= IOWAITING;
 1565         msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
 1566 }
 1567 
 1568 /*
 1569  * Process that runs once per second to handle items in the background queue.
 1570  *
 1571  * Note that we ensure that items are processed in the order in which they
 1572  * appear in the queue. The code below depends on this property to ensure
 1573  * that blocks of a file are freed before the inode itself is freed. This
 1574  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
 1575  * until all the old ones have been purged from the dependency lists.
 1576  */
 1577 static int 
 1578 softdep_process_worklist(mp, full)
 1579         struct mount *mp;
 1580         int full;
 1581 {
 1582         int cnt, matchcnt;
 1583         struct ufsmount *ump;
 1584         long starttime;
 1585 
 1586         KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
 1587         if (MOUNTEDSOFTDEP(mp) == 0)
 1588                 return (0);
 1589         matchcnt = 0;
 1590         ump = VFSTOUFS(mp);
 1591         ACQUIRE_LOCK(ump);
 1592         starttime = time_second;
 1593         softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
 1594         check_clear_deps(mp);
 1595         while (ump->softdep_on_worklist > 0) {
 1596                 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
 1597                         break;
 1598                 else
 1599                         matchcnt += cnt;
 1600                 check_clear_deps(mp);
 1601                 /*
 1602                  * We do not generally want to stop for buffer space, but if
 1603                  * we are really being a buffer hog, we will stop and wait.
 1604                  */
 1605                 if (should_yield()) {
 1606                         FREE_LOCK(ump);
 1607                         kern_yield(PRI_USER);
 1608                         bwillwrite();
 1609                         ACQUIRE_LOCK(ump);
 1610                 }
 1611                 /*
 1612                  * Never allow processing to run for more than one
 1613                  * second. This gives the syncer thread the opportunity
 1614                  * to pause if appropriate.
 1615                  */
 1616                 if (!full && starttime != time_second)
 1617                         break;
 1618         }
 1619         if (full == 0)
 1620                 journal_unsuspend(ump);
 1621         FREE_LOCK(ump);
 1622         return (matchcnt);
 1623 }
 1624 
 1625 /*
 1626  * Process all removes associated with a vnode if we are running out of
 1627  * journal space.  Any other process which attempts to flush these will
 1628  * be unable to do so because we hold the vnode locked.
 1629  */
 1630 static void
 1631 process_removes(vp)
 1632         struct vnode *vp;
 1633 {
 1634         struct inodedep *inodedep;
 1635         struct dirrem *dirrem;
 1636         struct ufsmount *ump;
 1637         struct mount *mp;
 1638         ino_t inum;
 1639 
 1640         mp = vp->v_mount;
 1641         ump = VFSTOUFS(mp);
 1642         LOCK_OWNED(ump);
 1643         inum = VTOI(vp)->i_number;
 1644         for (;;) {
 1645 top:
 1646                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
 1647                         return;
 1648                 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
 1649                         /*
 1650                          * If another thread is trying to lock this vnode
 1651                          * it will fail but we must wait for it to do so
 1652                          * before we can proceed.
 1653                          */
 1654                         if (dirrem->dm_state & INPROGRESS) {
 1655                                 wait_worklist(&dirrem->dm_list, "pwrwait");
 1656                                 goto top;
 1657                         }
 1658                         if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == 
 1659                             (COMPLETE | ONWORKLIST))
 1660                                 break;
 1661                 }
 1662                 if (dirrem == NULL)
 1663                         return;
 1664                 remove_from_worklist(&dirrem->dm_list);
 1665                 FREE_LOCK(ump);
 1666                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
 1667                         panic("process_removes: suspended filesystem");
 1668                 handle_workitem_remove(dirrem, 0);
 1669                 vn_finished_secondary_write(mp);
 1670                 ACQUIRE_LOCK(ump);
 1671         }
 1672 }
 1673 
 1674 /*
 1675  * Process all truncations associated with a vnode if we are running out
 1676  * of journal space.  This is called when the vnode lock is already held
 1677  * and no other process can clear the truncation.  This function loops
 1678  * until all of the vnode's truncations have been processed.
 1679  */
 1680 static void
 1681 process_truncates(vp)
 1682         struct vnode *vp;
 1683 {
 1684         struct inodedep *inodedep;
 1685         struct freeblks *freeblks;
 1686         struct ufsmount *ump;
 1687         struct mount *mp;
 1688         ino_t inum;
 1689         int cgwait;
 1690 
 1691         mp = vp->v_mount;
 1692         ump = VFSTOUFS(mp);
 1693         LOCK_OWNED(ump);
 1694         inum = VTOI(vp)->i_number;
 1695         for (;;) {
 1696                 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
 1697                         return;
 1698                 cgwait = 0;
 1699                 TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
 1700                         /* Journal entries not yet written.  */
 1701                         if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
 1702                                 jwait(&LIST_FIRST(
 1703                                     &freeblks->fb_jblkdephd)->jb_list,
 1704                                     MNT_WAIT);
 1705                                 break;
 1706                         }
 1707                         /* Another thread is executing this item. */
 1708                         if (freeblks->fb_state & INPROGRESS) {
 1709                                 wait_worklist(&freeblks->fb_list, "ptrwait");
 1710                                 break;
 1711                         }
 1712                         /* Freeblks is waiting on an inode write. */
 1713                         if ((freeblks->fb_state & COMPLETE) == 0) {
 1714                                 FREE_LOCK(ump);
 1715                                 ffs_update(vp, 1);
 1716                                 ACQUIRE_LOCK(ump);
 1717                                 break;
 1718                         }
 1719                         if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
 1720                             (ALLCOMPLETE | ONWORKLIST)) {
 1721                                 remove_from_worklist(&freeblks->fb_list);
 1722                                 freeblks->fb_state |= INPROGRESS;
 1723                                 FREE_LOCK(ump);
 1724                                 if (vn_start_secondary_write(NULL, &mp,
 1725                                     V_NOWAIT))
 1726                                         panic("process_truncates: "
 1727                                             "suspended filesystem");
 1728                                 handle_workitem_freeblocks(freeblks, 0);
 1729                                 vn_finished_secondary_write(mp);
 1730                                 ACQUIRE_LOCK(ump);
 1731                                 break;
 1732                         }
 1733                         if (freeblks->fb_cgwait)
 1734                                 cgwait++;
 1735                 }
 1736                 if (cgwait) {
 1737                         FREE_LOCK(ump);
 1738                         sync_cgs(mp, MNT_WAIT);
 1739                         ffs_sync_snap(mp, MNT_WAIT);
 1740                         ACQUIRE_LOCK(ump);
 1741                         continue;
 1742                 }
 1743                 if (freeblks == NULL)
 1744                         break;
 1745         }
 1746         return;
 1747 }
 1748 
 1749 /*
 1750  * Process one item on the worklist.
 1751  */
 1752 static int
 1753 process_worklist_item(mp, target, flags)
 1754         struct mount *mp;
 1755         int target;
 1756         int flags;
 1757 {
 1758         struct worklist sentinel;
 1759         struct worklist *wk;
 1760         struct ufsmount *ump;
 1761         int matchcnt;
 1762         int error;
 1763 
 1764         KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
 1765         /*
 1766          * If we are being called because of a process doing a
 1767          * copy-on-write, then it is not safe to write as we may
 1768          * recurse into the copy-on-write routine.
 1769          */
 1770         if (curthread->td_pflags & TDP_COWINPROGRESS)
 1771                 return (-1);
 1772         PHOLD(curproc); /* Don't let the stack go away. */
 1773         ump = VFSTOUFS(mp);
 1774         LOCK_OWNED(ump);
 1775         matchcnt = 0;
 1776         sentinel.wk_mp = NULL;
 1777         sentinel.wk_type = D_SENTINEL;
 1778         LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
 1779         for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
 1780             wk = LIST_NEXT(&sentinel, wk_list)) {
 1781                 if (wk->wk_type == D_SENTINEL) {
 1782                         LIST_REMOVE(&sentinel, wk_list);
 1783                         LIST_INSERT_AFTER(wk, &sentinel, wk_list);
 1784                         continue;
 1785                 }
 1786                 if (wk->wk_state & INPROGRESS)
 1787                         panic("process_worklist_item: %p already in progress.",
 1788                             wk);
 1789                 wk->wk_state |= INPROGRESS;
 1790                 remove_from_worklist(wk);
 1791                 FREE_LOCK(ump);
 1792                 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
 1793                         panic("process_worklist_item: suspended filesystem");
 1794                 switch (wk->wk_type) {
 1795                 case D_DIRREM:
 1796                         /* removal of a directory entry */
 1797                         error = handle_workitem_remove(WK_DIRREM(wk), flags);
 1798                         break;
 1799 
 1800                 case D_FREEBLKS:
 1801                         /* releasing blocks and/or fragments from a file */
 1802                         error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
 1803                             flags);
 1804                         break;
 1805 
 1806                 case D_FREEFRAG:
 1807                         /* releasing a fragment when replaced as a file grows */
 1808                         handle_workitem_freefrag(WK_FREEFRAG(wk));
 1809                         error = 0;
 1810                         break;
 1811 
 1812                 case D_FREEFILE:
 1813                         /* releasing an inode when its link count drops to 0 */
 1814                         handle_workitem_freefile(WK_FREEFILE(wk));
 1815                         error = 0;
 1816                         break;
 1817 
 1818                 default:
 1819                         panic("%s_process_worklist: Unknown type %s",
 1820                             "softdep", TYPENAME(wk->wk_type));
 1821                         /* NOTREACHED */
 1822                 }
 1823                 vn_finished_secondary_write(mp);
 1824                 ACQUIRE_LOCK(ump);
 1825                 if (error == 0) {
 1826                         if (++matchcnt == target)
 1827                                 break;
 1828                         continue;
 1829                 }
 1830                 /*
 1831                  * We have to retry the worklist item later.  Wake up any
 1832                  * waiters who may be able to complete it immediately and
 1833                  * add the item back to the head so we don't try to execute
 1834                  * it again.
 1835                  */
 1836                 wk->wk_state &= ~INPROGRESS;
 1837                 wake_worklist(wk);
 1838                 add_to_worklist(wk, WK_HEAD);
 1839         }
 1840         /* The sentinel could have become the tail in remove_from_worklist(). */
 1841         if (ump->softdep_worklist_tail == &sentinel)
 1842                 ump->softdep_worklist_tail =
 1843                     (struct worklist *)sentinel.wk_list.le_prev;
 1844         LIST_REMOVE(&sentinel, wk_list);
 1845         PRELE(curproc);
 1846         return (matchcnt);
 1847 }
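
      /*
       * Illustrative sketch (not kernel source) of the sentinel pattern
       * used in process_worklist_item() above.  The per-filesystem lock
       * is dropped while each item is handled, so neighbouring entries
       * may be removed in the meantime; the thread-owned sentinel is the
       * one list position guaranteed to stay valid across the
       * unlock/relock.  All names below (pending, it, process(), lock(),
       * unlock()) are hypothetical:
       */
      #if 0
              LIST_INSERT_HEAD(&pending, &sentinel, link);
              while ((it = LIST_NEXT(&sentinel, link)) != NULL) {
                      /* Advance the sentinel past the item being taken. */
                      LIST_REMOVE(&sentinel, link);
                      LIST_INSERT_AFTER(it, &sentinel, link);
                      LIST_REMOVE(it, link);
                      unlock();
                      process(it);    /* may sleep; the list may change */
                      lock();
              }
              LIST_REMOVE(&sentinel, link);
      #endif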
 1848 
 1849 /*
 1850  * Move dependencies from one buffer to another.
 1851  */
 1852 int
 1853 softdep_move_dependencies(oldbp, newbp)
 1854         struct buf *oldbp;
 1855         struct buf *newbp;
 1856 {
 1857         struct worklist *wk, *wktail;
 1858         struct ufsmount *ump;
 1859         int dirty;
 1860 
 1861         if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
 1862                 return (0);
 1863         KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
 1864             ("softdep_move_dependencies called on non-softdep filesystem"));
 1865         dirty = 0;
 1866         wktail = NULL;
 1867         ump = VFSTOUFS(wk->wk_mp);
 1868         ACQUIRE_LOCK(ump);
 1869         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
 1870                 LIST_REMOVE(wk, wk_list);
 1871                 if (wk->wk_type == D_BMSAFEMAP &&
 1872                     bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
 1873                         dirty = 1;
 1874                 if (wktail == NULL)
 1875                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
 1876                 else
 1877                         LIST_INSERT_AFTER(wktail, wk, wk_list);
 1878                 wktail = wk;
 1879         }
 1880         FREE_LOCK(ump);
 1881 
 1882         return (dirty);
 1883 }
 1884 
 1885 /*
 1886  * Purge the work list of all items associated with a particular mount point.
 1887  */
 1888 int
 1889 softdep_flushworklist(oldmnt, countp, td)
 1890         struct mount *oldmnt;
 1891         int *countp;
 1892         struct thread *td;
 1893 {
 1894         struct vnode *devvp;
 1895         struct ufsmount *ump;
 1896         int count, error;
 1897 
 1898         /*
 1899          * Alternately flush the block device associated with the mount
 1900          * point and process any dependencies that the flushing
 1901          * creates. We continue until no more worklist dependencies
 1902          * are found.
 1903          */
 1904         *countp = 0;
 1905         error = 0;
 1906         ump = VFSTOUFS(oldmnt);
 1907         devvp = ump->um_devvp;
 1908         while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
 1909                 *countp += count;
 1910                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 1911                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
 1912                 VOP_UNLOCK(devvp, 0);
 1913                 if (error != 0)
 1914                         break;
 1915         }
 1916         return (error);
 1917 }
 1918 
 1919 #define SU_WAITIDLE_RETRIES     20
 1920 static int
 1921 softdep_waitidle(struct mount *mp, int flags __unused)
 1922 {
 1923         struct ufsmount *ump;
 1924         struct vnode *devvp;
 1925         struct thread *td;
 1926         int error, i;
 1927 
 1928         ump = VFSTOUFS(mp);
 1929         devvp = ump->um_devvp;
 1930         td = curthread;
 1931         error = 0;
 1932         ACQUIRE_LOCK(ump);
 1933         for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
 1934                 ump->softdep_req = 1;
 1935                 KASSERT((flags & FORCECLOSE) == 0 ||
 1936                     ump->softdep_on_worklist == 0,
 1937                     ("softdep_waitidle: work added after flush"));
 1938                 msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
 1939                     "softdeps", 10 * hz);
 1940                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 1941                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
 1942                 VOP_UNLOCK(devvp, 0);
 1943                 ACQUIRE_LOCK(ump);
 1944                 if (error != 0)
 1945                         break;
 1946         }
 1947         ump->softdep_req = 0;
 1948         if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
 1949                 error = EBUSY;
 1950                 printf("softdep_waitidle: Failed to flush worklist for %p\n",
 1951                     mp);
 1952         }
 1953         FREE_LOCK(ump);
 1954         return (error);
 1955 }
 1956 
 1957 /*
 1958  * Flush all vnodes and worklist items associated with a specified mount point.
 1959  */
 1960 int
 1961 softdep_flushfiles(oldmnt, flags, td)
 1962         struct mount *oldmnt;
 1963         int flags;
 1964         struct thread *td;
 1965 {
 1966 #ifdef QUOTA
 1967         struct ufsmount *ump;
 1968         int i;
 1969 #endif
 1970         int error, early, depcount, loopcnt, retry_flush_count, retry;
 1971         int morework;
 1972 
 1973         KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
 1974             ("softdep_flushfiles called on non-softdep filesystem"));
 1975         loopcnt = 10;
 1976         retry_flush_count = 3;
 1977 retry_flush:
 1978         error = 0;
 1979 
 1980         /*
 1981          * Alternately flush the vnodes associated with the mount
 1982          * point and process any dependencies that the flushing
 1983          * creates. In theory, this loop can happen at most twice,
 1984          * but we give it a few extra passes just to be sure.
 1985          */
 1986         for (; loopcnt > 0; loopcnt--) {
 1987                 /*
 1988                  * Do another flush in case any vnodes were brought in
 1989                  * as part of the cleanup operations.
 1990                  */
 1991                 early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
 1992                     MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
 1993                 if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
 1994                         break;
 1995                 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
 1996                     depcount == 0)
 1997                         break;
 1998         }
 1999         /*
 2000          * If we are unmounting then it is an error to fail. If we
 2001          * are simply trying to downgrade to read-only, then filesystem
 2002          * activity can keep us busy forever, so we just fail with EBUSY.
 2003          */
 2004         if (loopcnt == 0) {
 2005                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
 2006                         panic("softdep_flushfiles: looping");
 2007                 error = EBUSY;
 2008         }
 2009         if (!error)
 2010                 error = softdep_waitidle(oldmnt, flags);
 2011         if (!error) {
 2012                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
 2013                         retry = 0;
 2014                         MNT_ILOCK(oldmnt);
 2015                         KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
 2016                             ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
 2017                         morework = oldmnt->mnt_nvnodelistsize > 0;
 2018 #ifdef QUOTA
 2019                         ump = VFSTOUFS(oldmnt);
 2020                         UFS_LOCK(ump);
 2021                         for (i = 0; i < MAXQUOTAS; i++) {
 2022                                 if (ump->um_quotas[i] != NULLVP)
 2023                                         morework = 1;
 2024                         }
 2025                         UFS_UNLOCK(ump);
 2026 #endif
 2027                         if (morework) {
 2028                                 if (--retry_flush_count > 0) {
 2029                                         retry = 1;
 2030                                         loopcnt = 3;
 2031                                 } else
 2032                                         error = EBUSY;
 2033                         }
 2034                         MNT_IUNLOCK(oldmnt);
 2035                         if (retry)
 2036                                 goto retry_flush;
 2037                 }
 2038         }
 2039         return (error);
 2040 }
 2041 
 2042 /*
 2043  * Structure hashing.
 2044  * 
 2045  * There are four types of structures that can be looked up:
 2046  *      1) pagedep structures identified by mount point, inode number,
 2047  *         and logical block.
 2048  *      2) inodedep structures identified by mount point and inode number.
 2049  *      3) newblk structures identified by mount point and
 2050  *         physical block number.
 2051  *      4) bmsafemap structures identified by mount point and
 2052  *         cylinder group number.
 2053  *
 2054  * The "pagedep" and "inodedep" dependency structures are hashed
 2055  * separately from the file blocks and inodes to which they correspond.
 2056  * This separation helps when the in-memory copy of an inode or
 2057  * file block must be replaced. It also obviates the need to access
 2058  * an inode or file page when simply updating (or de-allocating)
 2059  * dependency structures. Lookup of newblk structures is needed to
 2060  * find newly allocated blocks when trying to associate them with
 2061  * their allocdirect or allocindir structure.
 2062  *
 2063  * The lookup routines optionally create and hash a new instance when
 2064  * an existing entry is not found. The bmsafemap lookup routine always
 2065  * allocates a new structure if an existing one is not found.
 2066  */
 2067 #define DEPALLOC        0x0001  /* allocate structure if lookup fails */
 2068 
 2069 /*
 2070  * Structures and routines associated with pagedep caching.
 2071  */
 2072 #define PAGEDEP_HASH(ump, inum, lbn) \
 2073         (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
 2074 
 2075 static int
 2076 pagedep_find(pagedephd, ino, lbn, pagedeppp)
 2077         struct pagedep_hashhead *pagedephd;
 2078         ino_t ino;
 2079         ufs_lbn_t lbn;
 2080         struct pagedep **pagedeppp;
 2081 {
 2082         struct pagedep *pagedep;
 2083 
 2084         LIST_FOREACH(pagedep, pagedephd, pd_hash) {
 2085                 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
 2086                         *pagedeppp = pagedep;
 2087                         return (1);
 2088                 }
 2089         }
 2090         *pagedeppp = NULL;
 2091         return (0);
 2092 }
 2093 /*
 2094  * Look up a pagedep. Return 1 if found, 0 otherwise.
 2095  * If not found, allocate if DEPALLOC flag is passed.
 2096  * Found or allocated entry is returned in pagedeppp.
 2097  * This routine must be called with the per-filesystem lock held.
 2098  */
 2099 static int
 2100 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
 2101         struct mount *mp;
 2102         struct buf *bp;
 2103         ino_t ino;
 2104         ufs_lbn_t lbn;
 2105         int flags;
 2106         struct pagedep **pagedeppp;
 2107 {
 2108         struct pagedep *pagedep;
 2109         struct pagedep_hashhead *pagedephd;
 2110         struct worklist *wk;
 2111         struct ufsmount *ump;
 2112         int ret;
 2113         int i;
 2114 
 2115         ump = VFSTOUFS(mp);
 2116         LOCK_OWNED(ump);
 2117         if (bp) {
 2118                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 2119                         if (wk->wk_type == D_PAGEDEP) {
 2120                                 *pagedeppp = WK_PAGEDEP(wk);
 2121                                 return (1);
 2122                         }
 2123                 }
 2124         }
 2125         pagedephd = PAGEDEP_HASH(ump, ino, lbn);
 2126         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
 2127         if (ret) {
 2128                 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
 2129                         WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
 2130                 return (1);
 2131         }
 2132         if ((flags & DEPALLOC) == 0)
 2133                 return (0);
 2134         FREE_LOCK(ump);
 2135         pagedep = malloc(sizeof(struct pagedep),
 2136             M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
 2137         workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
 2138         ACQUIRE_LOCK(ump);
 2139         ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
 2140         if (*pagedeppp) {
 2141                 /*
 2142                  * This should never happen since we only create pagedeps
 2143                  * with the vnode lock held.  Could be an assert.
 2144                  */
 2145                 WORKITEM_FREE(pagedep, D_PAGEDEP);
 2146                 return (ret);
 2147         }
 2148         pagedep->pd_ino = ino;
 2149         pagedep->pd_lbn = lbn;
 2150         LIST_INIT(&pagedep->pd_dirremhd);
 2151         LIST_INIT(&pagedep->pd_pendinghd);
 2152         for (i = 0; i < DAHASHSZ; i++)
 2153                 LIST_INIT(&pagedep->pd_diraddhd[i]);
 2154         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
 2155         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 2156         *pagedeppp = pagedep;
 2157         return (0);
 2158 }
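
      /*
       * Illustrative sketch (not kernel source) of the allocate-and-recheck
       * pattern shared by pagedep_lookup(), inodedep_lookup() and
       * newblk_lookup().  Because malloc() may sleep, the per-filesystem
       * lock is dropped around the allocation and the hash chain is
       * searched again once the lock is re-acquired; a racing thread may
       * have installed the entry first, in which case the fresh
       * allocation is discarded.  find(), alloc_dep(), free_dep() and
       * insert() are hypothetical names:
       */
      #if 0
              if (find(hash, key, depp))      /* fast path, lock held */
                      return (1);
              if ((flags & DEPALLOC) == 0)
                      return (0);
              FREE_LOCK(ump);
              dep = alloc_dep();              /* may sleep */
              ACQUIRE_LOCK(ump);
              if (find(hash, key, depp)) {    /* lost the race */
                      free_dep(dep);
                      return (1);
              }
              insert(hash, dep);              /* won the race */
              *depp = dep;
              return (0);
      #endif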
 2159 
 2160 /*
 2161  * Structures and routines associated with inodedep caching.
 2162  */
 2163 #define INODEDEP_HASH(ump, inum) \
 2164       (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
 2165 
 2166 static int
 2167 inodedep_find(inodedephd, inum, inodedeppp)
 2168         struct inodedep_hashhead *inodedephd;
 2169         ino_t inum;
 2170         struct inodedep **inodedeppp;
 2171 {
 2172         struct inodedep *inodedep;
 2173 
 2174         LIST_FOREACH(inodedep, inodedephd, id_hash)
 2175                 if (inum == inodedep->id_ino)
 2176                         break;
 2177         if (inodedep) {
 2178                 *inodedeppp = inodedep;
 2179                 return (1);
 2180         }
 2181         *inodedeppp = NULL;
 2182 
 2183         return (0);
 2184 }
 2185 /*
 2186  * Look up an inodedep. Return 1 if found, 0 if not found.
 2187  * If not found, allocate if DEPALLOC flag is passed.
 2188  * Found or allocated entry is returned in inodedeppp.
 2189  * This routine must be called with the per-filesystem lock held.
 2190  */
 2191 static int
 2192 inodedep_lookup(mp, inum, flags, inodedeppp)
 2193         struct mount *mp;
 2194         ino_t inum;
 2195         int flags;
 2196         struct inodedep **inodedeppp;
 2197 {
 2198         struct inodedep *inodedep;
 2199         struct inodedep_hashhead *inodedephd;
 2200         struct ufsmount *ump;
 2201         struct fs *fs;
 2202 
 2203         ump = VFSTOUFS(mp);
 2204         LOCK_OWNED(ump);
 2205         fs = ump->um_fs;
 2206         inodedephd = INODEDEP_HASH(ump, inum);
 2207 
 2208         if (inodedep_find(inodedephd, inum, inodedeppp))
 2209                 return (1);
 2210         if ((flags & DEPALLOC) == 0)
 2211                 return (0);
 2212         /*
 2213          * If the system is over its limit and our filesystem is
 2214          * responsible for more than our share of that usage and
 2215          * we are not in a rush, request some inodedep cleanup.
 2216          */
 2217         if (softdep_excess_items(ump, D_INODEDEP))
 2218                 schedule_cleanup(mp);
 2219         else
 2220                 FREE_LOCK(ump);
 2221         inodedep = malloc(sizeof(struct inodedep),
 2222                 M_INODEDEP, M_SOFTDEP_FLAGS);
 2223         workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
 2224         ACQUIRE_LOCK(ump);
 2225         if (inodedep_find(inodedephd, inum, inodedeppp)) {
 2226                 WORKITEM_FREE(inodedep, D_INODEDEP);
 2227                 return (1);
 2228         }
 2229         inodedep->id_fs = fs;
 2230         inodedep->id_ino = inum;
 2231         inodedep->id_state = ALLCOMPLETE;
 2232         inodedep->id_nlinkdelta = 0;
 2233         inodedep->id_savedino1 = NULL;
 2234         inodedep->id_savedsize = -1;
 2235         inodedep->id_savedextsize = -1;
 2236         inodedep->id_savednlink = -1;
 2237         inodedep->id_bmsafemap = NULL;
 2238         inodedep->id_mkdiradd = NULL;
 2239         LIST_INIT(&inodedep->id_dirremhd);
 2240         LIST_INIT(&inodedep->id_pendinghd);
 2241         LIST_INIT(&inodedep->id_inowait);
 2242         LIST_INIT(&inodedep->id_bufwait);
 2243         TAILQ_INIT(&inodedep->id_inoreflst);
 2244         TAILQ_INIT(&inodedep->id_inoupdt);
 2245         TAILQ_INIT(&inodedep->id_newinoupdt);
 2246         TAILQ_INIT(&inodedep->id_extupdt);
 2247         TAILQ_INIT(&inodedep->id_newextupdt);
 2248         TAILQ_INIT(&inodedep->id_freeblklst);
 2249         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
 2250         *inodedeppp = inodedep;
 2251         return (0);
 2252 }
 2253 
 2254 /*
 2255  * Structures and routines associated with newblk caching.
 2256  */
 2257 #define NEWBLK_HASH(ump, inum) \
 2258         (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
 2259 
 2260 static int
 2261 newblk_find(newblkhd, newblkno, flags, newblkpp)
 2262         struct newblk_hashhead *newblkhd;
 2263         ufs2_daddr_t newblkno;
 2264         int flags;
 2265         struct newblk **newblkpp;
 2266 {
 2267         struct newblk *newblk;
 2268 
 2269         LIST_FOREACH(newblk, newblkhd, nb_hash) {
 2270                 if (newblkno != newblk->nb_newblkno)
 2271                         continue;
 2272                 /*
 2273                  * If we're creating a new dependency don't match those that
 2274                  * have already been converted to allocdirects.  This is for
 2275                  * a frag extend.
 2276                  */
 2277                 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
 2278                         continue;
 2279                 break;
 2280         }
 2281         if (newblk) {
 2282                 *newblkpp = newblk;
 2283                 return (1);
 2284         }
 2285         *newblkpp = NULL;
 2286         return (0);
 2287 }
 2288 
 2289 /*
 2290  * Look up a newblk. Return 1 if found, 0 if not found.
 2291  * If not found, allocate if DEPALLOC flag is passed.
 2292  * Found or allocated entry is returned in newblkpp.
 2293  */
 2294 static int
 2295 newblk_lookup(mp, newblkno, flags, newblkpp)
 2296         struct mount *mp;
 2297         ufs2_daddr_t newblkno;
 2298         int flags;
 2299         struct newblk **newblkpp;
 2300 {
 2301         struct newblk *newblk;
 2302         struct newblk_hashhead *newblkhd;
 2303         struct ufsmount *ump;
 2304 
 2305         ump = VFSTOUFS(mp);
 2306         LOCK_OWNED(ump);
 2307         newblkhd = NEWBLK_HASH(ump, newblkno);
 2308         if (newblk_find(newblkhd, newblkno, flags, newblkpp))
 2309                 return (1);
 2310         if ((flags & DEPALLOC) == 0)
 2311                 return (0);
 2312         if (softdep_excess_items(ump, D_NEWBLK) ||
 2313             softdep_excess_items(ump, D_ALLOCDIRECT) ||
 2314             softdep_excess_items(ump, D_ALLOCINDIR))
 2315                 schedule_cleanup(mp);
 2316         else
 2317                 FREE_LOCK(ump);
 2318         newblk = malloc(sizeof(union allblk), M_NEWBLK,
 2319             M_SOFTDEP_FLAGS | M_ZERO);
 2320         workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
 2321         ACQUIRE_LOCK(ump);
 2322         if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
 2323                 WORKITEM_FREE(newblk, D_NEWBLK);
 2324                 return (1);
 2325         }
 2326         newblk->nb_freefrag = NULL;
 2327         LIST_INIT(&newblk->nb_indirdeps);
 2328         LIST_INIT(&newblk->nb_newdirblk);
 2329         LIST_INIT(&newblk->nb_jwork);
 2330         newblk->nb_state = ATTACHED;
 2331         newblk->nb_newblkno = newblkno;
 2332         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
 2333         *newblkpp = newblk;
 2334         return (0);
 2335 }
 2336 
 2337 /*
 2338  * Structures and routines associated with freed indirect block caching.
 2339  */
 2340 #define INDIR_HASH(ump, blkno) \
 2341         (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
 2342 
 2343 /*
 2344  * Look up an indirect block in the indir hash table.  The freework is
 2345  * removed and potentially freed.  The caller must do a blocking journal
 2346  * write before writing to the blkno.
 2347  */
 2348 static int
 2349 indirblk_lookup(mp, blkno)
 2350         struct mount *mp;
 2351         ufs2_daddr_t blkno;
 2352 {
 2353         struct freework *freework;
 2354         struct indir_hashhead *wkhd;
 2355         struct ufsmount *ump;
 2356 
 2357         ump = VFSTOUFS(mp);
 2358         wkhd = INDIR_HASH(ump, blkno);
 2359         TAILQ_FOREACH(freework, wkhd, fw_next) {
 2360                 if (freework->fw_blkno != blkno)
 2361                         continue;
 2362                 indirblk_remove(freework);
 2363                 return (1);
 2364         }
 2365         return (0);
 2366 }
 2367 
 2368 /*
 2369  * Insert an indirect block represented by freework into the indirblk
 2370  * hash table so that the block cannot be re-used before the journal
 2371  * entry recording its release has been written.
 2372  */
 2373 static void
 2374 indirblk_insert(freework)
 2375         struct freework *freework;
 2376 {
 2377         struct jblocks *jblocks;
 2378         struct jseg *jseg;
 2379         struct ufsmount *ump;
 2380 
 2381         ump = VFSTOUFS(freework->fw_list.wk_mp);
 2382         jblocks = ump->softdep_jblocks;
 2383         jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
 2384         if (jseg == NULL)
 2385                 return;
 2386         
 2387         LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
 2388         TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
 2389             fw_next);
 2390         freework->fw_state &= ~DEPCOMPLETE;
 2391 }
 2392 
 2393 static void
 2394 indirblk_remove(freework)
 2395         struct freework *freework;
 2396 {
 2397         struct ufsmount *ump;
 2398 
 2399         ump = VFSTOUFS(freework->fw_list.wk_mp);
 2400         LIST_REMOVE(freework, fw_segs);
 2401         TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
 2402         freework->fw_state |= DEPCOMPLETE;
 2403         if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
 2404                 WORKITEM_FREE(freework, D_FREEWORK);
 2405 }
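
      /*
       * Illustrative note on the hazard the indir hash guards against:
       * if a freed indirect block were re-used and rewritten before the
       * journal record describing its release reached stable storage, a
       * crash could leave recovery acting on a stale record against the
       * block's new contents.  Keeping the freework hashed (and
       * !DEPCOMPLETE) until its journal segment is written prevents that
       * re-use.
       */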
 2406 
 2407 /*
 2408  * Executed during filesystem initialization before
 2409  * mounting any filesystems.
 2410  */
 2411 void 
 2412 softdep_initialize()
 2413 {
 2414 
 2415         TAILQ_INIT(&softdepmounts);
 2416 #ifdef __LP64__
 2417         max_softdeps = desiredvnodes * 4;
 2418 #else
 2419         max_softdeps = desiredvnodes * 2;
 2420 #endif
 2421 
 2422         /* Initialize the bioops hack. */
 2423         bioops.io_start = softdep_disk_io_initiation;
 2424         bioops.io_complete = softdep_disk_write_complete;
 2425         bioops.io_deallocate = softdep_deallocate_dependencies;
 2426         bioops.io_countdeps = softdep_count_dependencies;
 2427         softdep_ast_cleanup = softdep_ast_cleanup_proc;
 2428 
 2429         /* Initialize the callout with an mtx. */
 2430         callout_init_mtx(&softdep_callout, &lk, 0);
 2431 }
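
      /*
       * Illustrative note: the buffer cache dispatches to the hooks
       * registered above through wrappers of roughly this shape (a
       * sketch of the dispatch, not the exact sys/buf.h source):
       */
      #if 0
              if (bioops.io_start != NULL)
                      (*bioops.io_start)(bp);         /* buf_start() */
              if (bioops.io_complete != NULL)
                      (*bioops.io_complete)(bp);      /* buf_complete() */
      #endif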
 2432 
 2433 /*
 2434  * Executed after all filesystems have been unmounted during
 2435  * filesystem module unload.
 2436  */
 2437 void
 2438 softdep_uninitialize()
 2439 {
 2440 
 2441         /* Clear the bioops hack. */
 2442         bioops.io_start = NULL;
 2443         bioops.io_complete = NULL;
 2444         bioops.io_deallocate = NULL;
 2445         bioops.io_countdeps = NULL;
 2446         softdep_ast_cleanup = NULL;
 2447 
 2448         callout_drain(&softdep_callout);
 2449 }
 2450 
 2451 /*
 2452  * Called at mount time to notify the dependency code that a
 2453  * filesystem wishes to use it.
 2454  */
 2455 int
 2456 softdep_mount(devvp, mp, fs, cred)
 2457         struct vnode *devvp;
 2458         struct mount *mp;
 2459         struct fs *fs;
 2460         struct ucred *cred;
 2461 {
 2462         struct csum_total cstotal;
 2463         struct mount_softdeps *sdp;
 2464         struct ufsmount *ump;
 2465         struct cg *cgp;
 2466         struct buf *bp;
 2467         int i, error, cyl;
 2468 
 2469         sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
 2470             M_WAITOK | M_ZERO);
 2471         MNT_ILOCK(mp);
 2472         mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
 2473         if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
 2474                 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 
 2475                         MNTK_SOFTDEP | MNTK_NOASYNC;
 2476         }
 2477         ump = VFSTOUFS(mp);
 2478         ump->um_softdep = sdp;
 2479         MNT_IUNLOCK(mp);
 2480         rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
 2481         sdp->sd_ump = ump;
 2482         LIST_INIT(&ump->softdep_workitem_pending);
 2483         LIST_INIT(&ump->softdep_journal_pending);
 2484         TAILQ_INIT(&ump->softdep_unlinked);
 2485         LIST_INIT(&ump->softdep_dirtycg);
 2486         ump->softdep_worklist_tail = NULL;
 2487         ump->softdep_on_worklist = 0;
 2488         ump->softdep_deps = 0;
 2489         LIST_INIT(&ump->softdep_mkdirlisthd);
 2490         ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
 2491             &ump->pagedep_hash_size);
 2492         ump->pagedep_nextclean = 0;
 2493         ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
 2494             &ump->inodedep_hash_size);
 2495         ump->inodedep_nextclean = 0;
 2496         ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
 2497             &ump->newblk_hash_size);
 2498         ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
 2499             &ump->bmsafemap_hash_size);
 2500         i = 1 << (ffs(desiredvnodes / 10) - 1);
 2501         ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
 2502             M_FREEWORK, M_WAITOK);
 2503         ump->indir_hash_size = i - 1;
 2504         for (i = 0; i <= ump->indir_hash_size; i++)
 2505                 TAILQ_INIT(&ump->indir_hashtbl[i]);
 2506         ACQUIRE_GBLLOCK(&lk);
 2507         TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
 2508         FREE_GBLLOCK(&lk);
 2509         if ((fs->fs_flags & FS_SUJ) &&
 2510             (error = journal_mount(mp, fs, cred)) != 0) {
 2511                 printf("Failed to start journal: %d\n", error);
 2512                 softdep_unmount(mp);
 2513                 return (error);
 2514         }
 2515         /*
 2516          * Start our flushing thread in the bufdaemon process.
 2517          */
 2518         ACQUIRE_LOCK(ump);
 2519         ump->softdep_flags |= FLUSH_STARTING;
 2520         FREE_LOCK(ump);
 2521         kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
 2522             &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
 2523             mp->mnt_stat.f_mntonname);
 2524         ACQUIRE_LOCK(ump);
 2525         while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
 2526                 msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
 2527                     hz / 2);
 2528         }
 2529         FREE_LOCK(ump);
 2530         /*
 2531          * When doing soft updates, the counters in the
 2532          * superblock may have gotten out of sync. Recomputation
 2533          * can take a long time and can be deferred for background
 2534          * fsck.  However, the old behavior of scanning the cylinder
 2535          * groups and recalculating them at mount time is available
 2536          * by setting vfs.ffs.compute_summary_at_mount to one.
 2537          */
 2538         if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
 2539                 return (0);
 2540         bzero(&cstotal, sizeof cstotal);
 2541         for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
 2542                 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
 2543                     fs->fs_cgsize, cred, &bp)) != 0) {
 2544                         brelse(bp);
 2545                         softdep_unmount(mp);
 2546                         return (error);
 2547                 }
 2548                 cgp = (struct cg *)bp->b_data;
 2549                 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
 2550                 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
 2551                 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
 2552                 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
 2553                 fs->fs_cs(fs, cyl) = cgp->cg_cs;
 2554                 brelse(bp);
 2555         }
 2556 #ifdef DEBUG
 2557         if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
 2558                 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
 2559 #endif
 2560         bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
 2561         return (0);
 2562 }
 2563 
 2564 void
 2565 softdep_unmount(mp)
 2566         struct mount *mp;
 2567 {
 2568         struct ufsmount *ump;
 2569 #ifdef INVARIANTS
 2570         int i;
 2571 #endif
 2572 
 2573         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 2574             ("softdep_unmount called on non-softdep filesystem"));
 2575         ump = VFSTOUFS(mp);
 2576         MNT_ILOCK(mp);
 2577         mp->mnt_flag &= ~MNT_SOFTDEP;
 2578         if (MOUNTEDSUJ(mp) == 0) {
 2579                 MNT_IUNLOCK(mp);
 2580         } else {
 2581                 mp->mnt_flag &= ~MNT_SUJ;
 2582                 MNT_IUNLOCK(mp);
 2583                 journal_unmount(ump);
 2584         }
 2585         /*
 2586          * Shut down our flushing thread. The NULL check handles the case
 2587          * where softdep_mount errored out before the thread was created.
 2588          */
 2589         if (ump->softdep_flushtd != NULL) {
 2590                 ACQUIRE_LOCK(ump);
 2591                 ump->softdep_flags |= FLUSH_EXIT;
 2592                 wakeup(&ump->softdep_flushtd);
 2593                 msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
 2594                     "sdwait", 0);
 2595                 KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
 2596                     ("Thread shutdown failed"));
 2597         }
 2598         /*
 2599          * Free up our resources.
 2600          */
 2601         ACQUIRE_GBLLOCK(&lk);
 2602         TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
 2603         FREE_GBLLOCK(&lk);
 2604         rw_destroy(LOCK_PTR(ump));
 2605         hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
 2606         hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
 2607         hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
 2608         hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
 2609             ump->bmsafemap_hash_size);
 2610         free(ump->indir_hashtbl, M_FREEWORK);
 2611 #ifdef INVARIANTS
 2612         for (i = 0; i <= D_LAST; i++)
 2613                 KASSERT(ump->softdep_curdeps[i] == 0,
 2614                     ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
 2615                     TYPENAME(i), ump->softdep_curdeps[i]));
 2616 #endif
 2617         free(ump->um_softdep, M_MOUNTDATA);
 2618 }
 2619 
 2620 static struct jblocks *
 2621 jblocks_create(void)
 2622 {
 2623         struct jblocks *jblocks;
 2624 
 2625         jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
 2626         TAILQ_INIT(&jblocks->jb_segs);
 2627         jblocks->jb_avail = 10;
 2628         jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
 2629             M_JBLOCKS, M_WAITOK | M_ZERO);
 2630 
 2631         return (jblocks);
 2632 }
 2633 
 2634 static ufs2_daddr_t
 2635 jblocks_alloc(jblocks, bytes, actual)
 2636         struct jblocks *jblocks;
 2637         int bytes;
 2638         int *actual;
 2639 {
 2640         ufs2_daddr_t daddr;
 2641         struct jextent *jext;
 2642         int freecnt;
 2643         int blocks;
 2644 
 2645         blocks = bytes / DEV_BSIZE;
 2646         jext = &jblocks->jb_extent[jblocks->jb_head];
 2647         freecnt = jext->je_blocks - jblocks->jb_off;
 2648         if (freecnt == 0) {
 2649                 jblocks->jb_off = 0;
 2650                 if (++jblocks->jb_head > jblocks->jb_used)
 2651                         jblocks->jb_head = 0;
 2652                 jext = &jblocks->jb_extent[jblocks->jb_head];
 2653                 freecnt = jext->je_blocks;
 2654         }
 2655         if (freecnt > blocks)
 2656                 freecnt = blocks;
 2657         *actual = freecnt * DEV_BSIZE;
 2658         daddr = jext->je_daddr + jblocks->jb_off;
 2659         jblocks->jb_off += freecnt;
 2660         jblocks->jb_free -= freecnt;
 2661 
 2662         return (daddr);
 2663 }
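
      /*
       * Worked example for jblocks_alloc() (assumed numbers): with a
       * single 16-block extent and jb_off == 14, a request for 4 blocks
       * (bytes == 2048) finds only 2 blocks left in the extent and
       * returns those (*actual == 1024); the next call finds freecnt ==
       * 0, resets jb_off and advances jb_head, wrapping to the start of
       * the extent list.  Callers are expected to cope with such short
       * allocations via *actual.
       */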
 2664 
 2665 static void
 2666 jblocks_free(jblocks, mp, bytes)
 2667         struct jblocks *jblocks;
 2668         struct mount *mp;
 2669         int bytes;
 2670 {
 2671 
 2672         LOCK_OWNED(VFSTOUFS(mp));
 2673         jblocks->jb_free += bytes / DEV_BSIZE;
 2674         if (jblocks->jb_suspended)
 2675                 worklist_speedup(mp);
 2676         wakeup(jblocks);
 2677 }
 2678 
 2679 static void
 2680 jblocks_destroy(jblocks)
 2681         struct jblocks *jblocks;
 2682 {
 2683 
 2684         if (jblocks->jb_extent)
 2685                 free(jblocks->jb_extent, M_JBLOCKS);
 2686         free(jblocks, M_JBLOCKS);
 2687 }
 2688 
 2689 static void
 2690 jblocks_add(jblocks, daddr, blocks)
 2691         struct jblocks *jblocks;
 2692         ufs2_daddr_t daddr;
 2693         int blocks;
 2694 {
 2695         struct jextent *jext;
 2696 
 2697         jblocks->jb_blocks += blocks;
 2698         jblocks->jb_free += blocks;
 2699         jext = &jblocks->jb_extent[jblocks->jb_used];
 2700         /* Adding the first block. */
 2701         if (jext->je_daddr == 0) {
 2702                 jext->je_daddr = daddr;
 2703                 jext->je_blocks = blocks;
 2704                 return;
 2705         }
 2706         /* Extending the last extent. */
 2707         if (jext->je_daddr + jext->je_blocks == daddr) {
 2708                 jext->je_blocks += blocks;
 2709                 return;
 2710         }
 2711         /* Adding a new extent. */
 2712         if (++jblocks->jb_used == jblocks->jb_avail) {
 2713                 jblocks->jb_avail *= 2;
 2714                 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
 2715                     M_JBLOCKS, M_WAITOK | M_ZERO);
 2716                 memcpy(jext, jblocks->jb_extent,
 2717                     sizeof(struct jextent) * jblocks->jb_used);
 2718                 free(jblocks->jb_extent, M_JBLOCKS);
 2719                 jblocks->jb_extent = jext;
 2720         }
 2721         jext = &jblocks->jb_extent[jblocks->jb_used];
 2722         jext->je_daddr = daddr;
 2723         jext->je_blocks = blocks;
 2724         return;
 2725 }
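
      /*
       * Worked example for jblocks_add() (assumed disk addresses):
       * adding (daddr 1000, 8 blocks) fills the first extent; adding
       * (daddr 1008, 8 blocks) extends it to 16 blocks because it
       * begins exactly where the previous extent ends; adding (daddr
       * 2000, 8 blocks) then starts a new extent, doubling the jb_extent
       * array if it is full.
       */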
 2726 
 2727 int
 2728 softdep_journal_lookup(mp, vpp)
 2729         struct mount *mp;
 2730         struct vnode **vpp;
 2731 {
 2732         struct componentname cnp;
 2733         struct vnode *dvp;
 2734         ino_t sujournal;
 2735         int error;
 2736 
 2737         error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
 2738         if (error)
 2739                 return (error);
 2740         bzero(&cnp, sizeof(cnp));
 2741         cnp.cn_nameiop = LOOKUP;
 2742         cnp.cn_flags = ISLASTCN;
 2743         cnp.cn_thread = curthread;
 2744         cnp.cn_cred = curthread->td_ucred;
 2745         cnp.cn_pnbuf = SUJ_FILE;
 2746         cnp.cn_nameptr = SUJ_FILE;
 2747         cnp.cn_namelen = strlen(SUJ_FILE);
 2748         error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
 2749         vput(dvp);
 2750         if (error != 0)
 2751                 return (error);
 2752         error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
 2753         return (error);
 2754 }
 2755 
 2756 /*
 2757  * Open and verify the journal file.
 2758  */
 2759 static int
 2760 journal_mount(mp, fs, cred)
 2761         struct mount *mp;
 2762         struct fs *fs;
 2763         struct ucred *cred;
 2764 {
 2765         struct jblocks *jblocks;
 2766         struct ufsmount *ump;
 2767         struct vnode *vp;
 2768         struct inode *ip;
 2769         ufs2_daddr_t blkno;
 2770         int bcount;
 2771         int error;
 2772         int i;
 2773 
 2774         ump = VFSTOUFS(mp);
 2775         ump->softdep_journal_tail = NULL;
 2776         ump->softdep_on_journal = 0;
 2777         ump->softdep_accdeps = 0;
 2778         ump->softdep_req = 0;
 2779         ump->softdep_jblocks = NULL;
 2780         error = softdep_journal_lookup(mp, &vp);
 2781         if (error != 0) {
 2782                 printf("Failed to find journal.  Use tunefs to create one\n");
 2783                 return (error);
 2784         }
 2785         ip = VTOI(vp);
 2786         if (ip->i_size < SUJ_MIN) {
 2787                 error = ENOSPC;
 2788                 goto out;
 2789         }
 2790         bcount = lblkno(fs, ip->i_size);        /* Only use whole blocks. */
 2791         jblocks = jblocks_create();
 2792         for (i = 0; i < bcount; i++) {
 2793                 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
 2794                 if (error)
 2795                         break;
 2796                 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
 2797         }
 2798         if (error) {
 2799                 jblocks_destroy(jblocks);
 2800                 goto out;
 2801         }
 2802         jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
 2803         jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
 2804         ump->softdep_jblocks = jblocks;
 2805 out:
 2806         if (error == 0) {
 2807                 MNT_ILOCK(mp);
 2808                 mp->mnt_flag |= MNT_SUJ;
 2809                 mp->mnt_flag &= ~MNT_SOFTDEP;
 2810                 MNT_IUNLOCK(mp);
 2811                 /*
 2812                  * Only validate the journal contents if the
 2813                  * filesystem is clean, otherwise we write the logs
 2814                  * but they'll never be used.  If the filesystem was
 2815                  * still dirty when we mounted it the journal is
 2816                  * invalid and a new journal can only be valid if it
 2817                  * starts from a clean mount.
 2818                  */
 2819                 if (fs->fs_clean) {
 2820                         DIP_SET(ip, i_modrev, fs->fs_mtime);
 2821                         ip->i_flags |= IN_MODIFIED;
 2822                         ffs_update(vp, 1);
 2823                 }
 2824         }
 2825         vput(vp);
 2826         return (error);
 2827 }
 2828 
 2829 static void
 2830 journal_unmount(ump)
 2831         struct ufsmount *ump;
 2832 {
 2833 
 2834         if (ump->softdep_jblocks)
 2835                 jblocks_destroy(ump->softdep_jblocks);
 2836         ump->softdep_jblocks = NULL;
 2837 }
 2838 
 2839 /*
 2840  * Called when a journal record is ready to be written.  Space is allocated
 2841  * and the journal entry is created when the journal is flushed to stable
 2842  * store.
 2843  */
 2844 static void
 2845 add_to_journal(wk)
 2846         struct worklist *wk;
 2847 {
 2848         struct ufsmount *ump;
 2849 
 2850         ump = VFSTOUFS(wk->wk_mp);
 2851         LOCK_OWNED(ump);
 2852         if (wk->wk_state & ONWORKLIST)
 2853                 panic("add_to_journal: %s(0x%X) already on list",
 2854                     TYPENAME(wk->wk_type), wk->wk_state);
 2855         wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
 2856         if (LIST_EMPTY(&ump->softdep_journal_pending)) {
 2857                 ump->softdep_jblocks->jb_age = ticks;
 2858                 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
 2859         } else
 2860                 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
 2861         ump->softdep_journal_tail = wk;
 2862         ump->softdep_on_journal += 1;
 2863 }
 2864 
 2865 /*
 2866  * Remove an arbitrary item from the journal worklist while maintaining
 2867  * the tail pointer.  This happens when a new operation obviates the need to
 2868  * journal an old operation.
 2869  */
 2870 static void
 2871 remove_from_journal(wk)
 2872         struct worklist *wk;
 2873 {
 2874         struct ufsmount *ump;
 2875 
 2876         ump = VFSTOUFS(wk->wk_mp);
 2877         LOCK_OWNED(ump);
 2878 #ifdef SUJ_DEBUG
 2879         {
 2880                 struct worklist *wkn;
 2881 
 2882                 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
 2883                         if (wkn == wk)
 2884                                 break;
 2885                 if (wkn == NULL)
 2886                         panic("remove_from_journal: %p is not in journal", wk);
 2887         }
 2888 #endif
 2889         /*
 2890          * We emulate a TAILQ to save space in most structures which do not
 2891          * require TAILQ semantics.  Here we must update the tail position
 2892          * when the entry being removed is the current tail.  This works
 2893          * only if the worklist linkage is at the beginning of the structure.
 2894          */
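        /*
         * Added sketch (illustrative, not original source): the cast below
         * recovers the previous element because the LIST_ENTRY is the first
         * member of struct worklist.  le_prev points at the previous
         * element's le_next field, which sits at offset 0 of that element:
         *
         *        struct worklist {
         *                LIST_ENTRY(worklist) wk_list;   <- le_next at offset 0
         *                ...
         *        };
         *        prev = (struct worklist *)wk->wk_list.le_prev;
         *
         * Were the linkage moved, the recovered tail pointer would be
         * silently wrong.
         */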
 2895         if (ump->softdep_journal_tail == wk)
 2896                 ump->softdep_journal_tail =
 2897                     (struct worklist *)wk->wk_list.le_prev;
 2898         WORKLIST_REMOVE(wk);
 2899         ump->softdep_on_journal -= 1;
 2900 }
 2901 
 2902 /*
 2903  * Check for journal space as well as dependency limits so the prelink
 2904  * code can throttle both journaled and non-journaled filesystems.
 2905  * Threshold is 0 for low and 1 for min.
 2906  */
 2907 static int
 2908 journal_space(ump, thresh)
 2909         struct ufsmount *ump;
 2910         int thresh;
 2911 {
 2912         struct jblocks *jblocks;
 2913         int limit, avail;
 2914 
 2915         jblocks = ump->softdep_jblocks;
 2916         if (jblocks == NULL)
 2917                 return (1);
 2918         /*
 2919          * We use a tighter restriction here to prevent request_cleanup()
 2920          * in other threads from running into locks we currently hold.
 2921          * We have to be over the limit and our filesystem has to be
 2922          * responsible for more than our share of that usage.
 2923          */
 2924         limit = (max_softdeps / 10) * 9;
 2925         if (dep_current[D_INODEDEP] > limit &&
 2926             ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
 2927                 return (0);
 2928         if (thresh)
 2929                 thresh = jblocks->jb_min;
 2930         else
 2931                 thresh = jblocks->jb_low;
 2932         avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
 2933         avail = jblocks->jb_free - avail;
 2934 
 2935         return (avail > thresh);
 2936 }
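      /*
       * Worked example (added; assumes a 32-byte journal record size and a
       * 512-byte DEV_BSIZE): with 1000 records pending on the journal
       * worklist,
       *
       *        avail = (1000 * 32) / 512 = 62 sectors already committed,
       *
       * so space is reported available only if jb_free - 62 exceeds jb_low
       * (thresh 0) or jb_min (thresh 1) as computed in journal_mount().
       */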
 2937 
 2938 static void
 2939 journal_suspend(ump)
 2940         struct ufsmount *ump;
 2941 {
 2942         struct jblocks *jblocks;
 2943         struct mount *mp;
 2944 
 2945         mp = UFSTOVFS(ump);
 2946         jblocks = ump->softdep_jblocks;
 2947         MNT_ILOCK(mp);
 2948         if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 2949                 stat_journal_min++;
 2950                 mp->mnt_kern_flag |= MNTK_SUSPEND;
 2951                 mp->mnt_susp_owner = ump->softdep_flushtd;
 2952         }
 2953         jblocks->jb_suspended = 1;
 2954         MNT_IUNLOCK(mp);
 2955 }
 2956 
 2957 static int
 2958 journal_unsuspend(struct ufsmount *ump)
 2959 {
 2960         struct jblocks *jblocks;
 2961         struct mount *mp;
 2962 
 2963         mp = UFSTOVFS(ump);
 2964         jblocks = ump->softdep_jblocks;
 2965 
 2966         if (jblocks != NULL && jblocks->jb_suspended &&
 2967             journal_space(ump, jblocks->jb_min)) {
 2968                 jblocks->jb_suspended = 0;
 2969                 FREE_LOCK(ump);
 2970                 mp->mnt_susp_owner = curthread;
 2971                 vfs_write_resume(mp, 0);
 2972                 ACQUIRE_LOCK(ump);
 2973                 return (1);
 2974         }
 2975         return (0);
 2976 }
 2977 
 2978 /*
 2979  * Called before any allocation function to be certain that there is
 2980  * sufficient space in the journal prior to creating any new records.
 2981  * Since, in the case of block allocation, we may have multiple locked
 2982  * buffers at the time of the actual allocation, we cannot block
 2983  * when the journal records are created.  Doing so would create a deadlock
 2984  * if any of these buffers needed to be flushed to reclaim space.  Instead
 2985  * we require a sufficiently large amount of available space such that
 2986  * each thread in the system could have passed this allocation check and
 2987  * still have sufficient free space.  With 20% of a minimum journal size
 2988  * of 1MB we have 6553 records available.
 2989  */
 2990 int
 2991 softdep_prealloc(vp, waitok)
 2992         struct vnode *vp;
 2993         int waitok;
 2994 {
 2995         struct ufsmount *ump;
 2996 
 2997         KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
 2998             ("softdep_prealloc called on non-softdep filesystem"));
 2999         /*
 3000          * Nothing to do if we are not running journaled soft updates.
 3001          * If we currently hold the snapshot lock, we must avoid
 3002          * handling other resources that could cause deadlock.  Do not
 3003          * touch quotas vnode since it is typically recursed with
 3004          * other vnode locks held.
 3005          */
 3006         if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
 3007             (vp->v_vflag & VV_SYSTEM) != 0)
 3008                 return (0);
 3009         ump = VFSTOUFS(vp->v_mount);
 3010         ACQUIRE_LOCK(ump);
 3011         if (journal_space(ump, 0)) {
 3012                 FREE_LOCK(ump);
 3013                 return (0);
 3014         }
 3015         stat_journal_low++;
 3016         FREE_LOCK(ump);
 3017         if (waitok == MNT_NOWAIT)
 3018                 return (ENOSPC);
 3019         /*
 3020          * Attempt to sync this vnode once to flush any journal
 3021          * work attached to it.
 3022          */
 3023         if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
 3024                 ffs_syncvnode(vp, waitok, 0);
 3025         ACQUIRE_LOCK(ump);
 3026         process_removes(vp);
 3027         process_truncates(vp);
 3028         if (journal_space(ump, 0) == 0) {
 3029                 softdep_speedup(ump);
 3030                 if (journal_space(ump, 1) == 0)
 3031                         journal_suspend(ump);
 3032         }
 3033         FREE_LOCK(ump);
 3034 
 3035         return (0);
 3036 }
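      /*
       * Checking the figure quoted above (added arithmetic, assuming
       * 32-byte journal records): 20% of a 1MB journal is 1048576 / 5 =
       * 209715 bytes, and 209715 / 32 = 6553 records.
       */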
 3037 
 3038 /*
 3039  * Before adjusting a link count on a vnode verify that we have sufficient
 3040  * journal space.  If not, process operations that depend on the currently
 3041  * locked pair of vnodes to try to flush space, since the syncer, buf daemon,
 3042  * and softdep flush threads cannot acquire these locks to reclaim space.
 3043  */
 3044 static void
 3045 softdep_prelink(dvp, vp)
 3046         struct vnode *dvp;
 3047         struct vnode *vp;
 3048 {
 3049         struct ufsmount *ump;
 3050 
 3051         ump = VFSTOUFS(dvp->v_mount);
 3052         LOCK_OWNED(ump);
 3053         /*
 3054          * Nothing to do if we have sufficient journal space.
 3055          * If we currently hold the snapshot lock, we must avoid
 3056          * handling other resources that could cause deadlock.
 3057          */
 3058         if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
 3059                 return;
 3060         stat_journal_low++;
 3061         FREE_LOCK(ump);
 3062         if (vp)
 3063                 ffs_syncvnode(vp, MNT_NOWAIT, 0);
 3064         ffs_syncvnode(dvp, MNT_WAIT, 0);
 3065         ACQUIRE_LOCK(ump);
 3066         /* Process vp before dvp as it may create .. removes. */
 3067         if (vp) {
 3068                 process_removes(vp);
 3069                 process_truncates(vp);
 3070         }
 3071         process_removes(dvp);
 3072         process_truncates(dvp);
 3073         softdep_speedup(ump);
 3074         process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
 3075         if (journal_space(ump, 0) == 0) {
 3076                 softdep_speedup(ump);
 3077                 if (journal_space(ump, 1) == 0)
 3078                         journal_suspend(ump);
 3079         }
 3080 }
 3081 
 3082 static void
 3083 jseg_write(ump, jseg, data)
 3084         struct ufsmount *ump;
 3085         struct jseg *jseg;
 3086         uint8_t *data;
 3087 {
 3088         struct jsegrec *rec;
 3089 
 3090         rec = (struct jsegrec *)data;
 3091         rec->jsr_seq = jseg->js_seq;
 3092         rec->jsr_oldest = jseg->js_oldseq;
 3093         rec->jsr_cnt = jseg->js_cnt;
 3094         rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
 3095         rec->jsr_crc = 0;
 3096         rec->jsr_time = ump->um_fs->fs_mtime;
 3097 }
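      /*
       * Sizing note (added; assumes a 512-byte device block): a jseg
       * spanning 4096 bytes is recorded as jsr_blocks = 4096 / 512 = 8,
       * which lets a journal reader skip to the next segment header.
       */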
 3098 
 3099 static inline void
 3100 inoref_write(inoref, jseg, rec)
 3101         struct inoref *inoref;
 3102         struct jseg *jseg;
 3103         struct jrefrec *rec;
 3104 {
 3105 
 3106         inoref->if_jsegdep->jd_seg = jseg;
 3107         rec->jr_ino = inoref->if_ino;
 3108         rec->jr_parent = inoref->if_parent;
 3109         rec->jr_nlink = inoref->if_nlink;
 3110         rec->jr_mode = inoref->if_mode;
 3111         rec->jr_diroff = inoref->if_diroff;
 3112 }
 3113 
 3114 static void
 3115 jaddref_write(jaddref, jseg, data)
 3116         struct jaddref *jaddref;
 3117         struct jseg *jseg;
 3118         uint8_t *data;
 3119 {
 3120         struct jrefrec *rec;
 3121 
 3122         rec = (struct jrefrec *)data;
 3123         rec->jr_op = JOP_ADDREF;
 3124         inoref_write(&jaddref->ja_ref, jseg, rec);
 3125 }
 3126 
 3127 static void
 3128 jremref_write(jremref, jseg, data)
 3129         struct jremref *jremref;
 3130         struct jseg *jseg;
 3131         uint8_t *data;
 3132 {
 3133         struct jrefrec *rec;
 3134 
 3135         rec = (struct jrefrec *)data;
 3136         rec->jr_op = JOP_REMREF;
 3137         inoref_write(&jremref->jr_ref, jseg, rec);
 3138 }
 3139 
 3140 static void
 3141 jmvref_write(jmvref, jseg, data)
 3142         struct jmvref *jmvref;
 3143         struct jseg *jseg;
 3144         uint8_t *data;
 3145 {
 3146         struct jmvrec *rec;
 3147 
 3148         rec = (struct jmvrec *)data;
 3149         rec->jm_op = JOP_MVREF;
 3150         rec->jm_ino = jmvref->jm_ino;
 3151         rec->jm_parent = jmvref->jm_parent;
 3152         rec->jm_oldoff = jmvref->jm_oldoff;
 3153         rec->jm_newoff = jmvref->jm_newoff;
 3154 }
 3155 
 3156 static void
 3157 jnewblk_write(jnewblk, jseg, data)
 3158         struct jnewblk *jnewblk;
 3159         struct jseg *jseg;
 3160         uint8_t *data;
 3161 {
 3162         struct jblkrec *rec;
 3163 
 3164         jnewblk->jn_jsegdep->jd_seg = jseg;
 3165         rec = (struct jblkrec *)data;
 3166         rec->jb_op = JOP_NEWBLK;
 3167         rec->jb_ino = jnewblk->jn_ino;
 3168         rec->jb_blkno = jnewblk->jn_blkno;
 3169         rec->jb_lbn = jnewblk->jn_lbn;
 3170         rec->jb_frags = jnewblk->jn_frags;
 3171         rec->jb_oldfrags = jnewblk->jn_oldfrags;
 3172 }
 3173 
 3174 static void
 3175 jfreeblk_write(jfreeblk, jseg, data)
 3176         struct jfreeblk *jfreeblk;
 3177         struct jseg *jseg;
 3178         uint8_t *data;
 3179 {
 3180         struct jblkrec *rec;
 3181 
 3182         jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
 3183         rec = (struct jblkrec *)data;
 3184         rec->jb_op = JOP_FREEBLK;
 3185         rec->jb_ino = jfreeblk->jf_ino;
 3186         rec->jb_blkno = jfreeblk->jf_blkno;
 3187         rec->jb_lbn = jfreeblk->jf_lbn;
 3188         rec->jb_frags = jfreeblk->jf_frags;
 3189         rec->jb_oldfrags = 0;
 3190 }
 3191 
 3192 static void
 3193 jfreefrag_write(jfreefrag, jseg, data)
 3194         struct jfreefrag *jfreefrag;
 3195         struct jseg *jseg;
 3196         uint8_t *data;
 3197 {
 3198         struct jblkrec *rec;
 3199 
 3200         jfreefrag->fr_jsegdep->jd_seg = jseg;
 3201         rec = (struct jblkrec *)data;
 3202         rec->jb_op = JOP_FREEBLK;
 3203         rec->jb_ino = jfreefrag->fr_ino;
 3204         rec->jb_blkno = jfreefrag->fr_blkno;
 3205         rec->jb_lbn = jfreefrag->fr_lbn;
 3206         rec->jb_frags = jfreefrag->fr_frags;
 3207         rec->jb_oldfrags = 0;
 3208 }
 3209 
 3210 static void
 3211 jtrunc_write(jtrunc, jseg, data)
 3212         struct jtrunc *jtrunc;
 3213         struct jseg *jseg;
 3214         uint8_t *data;
 3215 {
 3216         struct jtrncrec *rec;
 3217 
 3218         jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
 3219         rec = (struct jtrncrec *)data;
 3220         rec->jt_op = JOP_TRUNC;
 3221         rec->jt_ino = jtrunc->jt_ino;
 3222         rec->jt_size = jtrunc->jt_size;
 3223         rec->jt_extsize = jtrunc->jt_extsize;
 3224 }
 3225 
 3226 static void
 3227 jfsync_write(jfsync, jseg, data)
 3228         struct jfsync *jfsync;
 3229         struct jseg *jseg;
 3230         uint8_t *data;
 3231 {
 3232         struct jtrncrec *rec;
 3233 
 3234         rec = (struct jtrncrec *)data;
 3235         rec->jt_op = JOP_SYNC;
 3236         rec->jt_ino = jfsync->jfs_ino;
 3237         rec->jt_size = jfsync->jfs_size;
 3238         rec->jt_extsize = jfsync->jfs_extsize;
 3239 }
 3240 
 3241 static void
 3242 softdep_flushjournal(mp)
 3243         struct mount *mp;
 3244 {
 3245         struct jblocks *jblocks;
 3246         struct ufsmount *ump;
 3247 
 3248         if (MOUNTEDSUJ(mp) == 0)
 3249                 return;
 3250         ump = VFSTOUFS(mp);
 3251         jblocks = ump->softdep_jblocks;
 3252         ACQUIRE_LOCK(ump);
 3253         while (ump->softdep_on_journal) {
 3254                 jblocks->jb_needseg = 1;
 3255                 softdep_process_journal(mp, NULL, MNT_WAIT);
 3256         }
 3257         FREE_LOCK(ump);
 3258 }
 3259 
 3260 static void softdep_synchronize_completed(struct bio *);
 3261 static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
 3262 
 3263 static void
 3264 softdep_synchronize_completed(bp)
 3265         struct bio *bp;
 3266 {
 3267         struct jseg *oldest;
 3268         struct jseg *jseg;
 3269         struct ufsmount *ump;
 3270 
 3271         /*
 3272          * caller1 marks the last segment written before we issued the
 3273          * synchronize cache.
 3274          */
 3275         jseg = bp->bio_caller1;
 3276         if (jseg == NULL) {
 3277                 g_destroy_bio(bp);
 3278                 return;
 3279         }
 3280         ump = VFSTOUFS(jseg->js_list.wk_mp);
 3281         ACQUIRE_LOCK(ump);
 3282         oldest = NULL;
 3283         /*
 3284          * Mark all the journal entries waiting on the synchronize cache
 3285          * as completed so they may continue on.
 3286          */
 3287         while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
 3288                 jseg->js_state |= COMPLETE;
 3289                 oldest = jseg;
 3290                 jseg = TAILQ_PREV(jseg, jseglst, js_next);
 3291         }
 3292         /*
 3293          * Restart deferred journal entry processing from the oldest
 3294          * completed jseg.
 3295          */
 3296         if (oldest)
 3297                 complete_jsegs(oldest);
 3298 
 3299         FREE_LOCK(ump);
 3300         g_destroy_bio(bp);
 3301 }
 3302 
 3303 /*
 3304  * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
 3305  * barriers.  The journal must be written prior to any blocks that depend
 3306  * on it and the journal cannot be released until the blocks have been
 3307  * written.  This code handles both barriers simultaneously.
 3308  */
 3309 static void
 3310 softdep_synchronize(bp, ump, caller1)
 3311         struct bio *bp;
 3312         struct ufsmount *ump;
 3313         void *caller1;
 3314 {
 3315 
 3316         bp->bio_cmd = BIO_FLUSH;
 3317         bp->bio_flags |= BIO_ORDERED;
 3318         bp->bio_data = NULL;
 3319         bp->bio_offset = ump->um_cp->provider->mediasize;
 3320         bp->bio_length = 0;
 3321         bp->bio_done = softdep_synchronize_completed;
 3322         bp->bio_caller1 = caller1;
 3323         g_io_request(bp,
 3324             (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
 3325 }
 3326 
 3327 /*
 3328  * Flush some journal records to disk.
 3329  */
 3330 static void
 3331 softdep_process_journal(mp, needwk, flags)
 3332         struct mount *mp;
 3333         struct worklist *needwk;
 3334         int flags;
 3335 {
 3336         struct jblocks *jblocks;
 3337         struct ufsmount *ump;
 3338         struct worklist *wk;
 3339         struct jseg *jseg;
 3340         struct buf *bp;
 3341         struct bio *bio;
 3342         uint8_t *data;
 3343         struct fs *fs;
 3344         int shouldflush;
 3345         int segwritten;
 3346         int jrecmin;    /* Minimum records per block. */
 3347         int jrecmax;    /* Maximum records per block. */
 3348         int size;
 3349         int cnt;
 3350         int off;
 3351         int devbsize;
 3352 
 3353         if (MOUNTEDSUJ(mp) == 0)
 3354                 return;
 3355         shouldflush = softdep_flushcache;
 3356         bio = NULL;
 3357         jseg = NULL;
 3358         ump = VFSTOUFS(mp);
 3359         LOCK_OWNED(ump);
 3360         fs = ump->um_fs;
 3361         jblocks = ump->softdep_jblocks;
 3362         devbsize = ump->um_devvp->v_bufobj.bo_bsize;
 3363         /*
 3364          * We write anywhere between a disk block and an fs block.  The upper
 3365          * bound is picked to prevent buffer cache fragmentation and limit
 3366          * processing time per I/O.
 3367          */
 3368         jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
 3369         jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
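        /*
         * Illustrative numbers (added; assuming 512-byte device blocks,
         * 32-byte journal records, and a 32KB fs block): jrecmin =
         * 512 / 32 - 1 = 15 records per device block after the segment
         * header, and jrecmax = (32768 / 512) * 15 = 960 records for a
         * full fs-sized write.
         */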
 3370         segwritten = 0;
 3371         for (;;) {
 3372                 cnt = ump->softdep_on_journal;
 3373                 /*
 3374                  * Criteria for writing a segment:
 3375                  * 1) We have a full block.
 3376                  * 2) We're called from jwait() and haven't found the
 3377                  *    journal item yet.
 3378                  * 3) Always write if needseg is set.
 3379                  * 4) If we are called from process_worklist and have
 3380                  *    not yet written anything we write a partial block
 3381                  *    to enforce a 1 second maximum latency on journal
 3382                  *    entries.
 3383                  */
 3384                 if (cnt < (jrecmax - 1) && needwk == NULL &&
 3385                     jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
 3386                         break;
 3387                 cnt++;
 3388                 /*
 3389                  * Verify that some journal space is free.  softdep_prealloc() should
 3390                  * guarantee that we don't run out so this is indicative of
 3391                  * a problem with the flow control.  Try to recover
 3392                  * gracefully in any event.
 3393                  */
 3394                 while (jblocks->jb_free == 0) {
 3395                         if (flags != MNT_WAIT)
 3396                                 break;
 3397                         printf("softdep: Out of journal space!\n");
 3398                         softdep_speedup(ump);
 3399                         msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
 3400                 }
 3401                 FREE_LOCK(ump);
 3402                 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
 3403                 workitem_alloc(&jseg->js_list, D_JSEG, mp);
 3404                 LIST_INIT(&jseg->js_entries);
 3405                 LIST_INIT(&jseg->js_indirs);
 3406                 jseg->js_state = ATTACHED;
 3407                 if (shouldflush == 0)
 3408                         jseg->js_state |= COMPLETE;
 3409                 else if (bio == NULL)
 3410                         bio = g_alloc_bio();
 3411                 jseg->js_jblocks = jblocks;
 3412                 bp = geteblk(fs->fs_bsize, 0);
 3413                 ACQUIRE_LOCK(ump);
 3414                 /*
 3415                  * If there was a race while we were allocating the block
 3416                  * and jseg, the entry we care about was likely written.
 3417                  * We bail out in both the WAIT and NOWAIT case and assume
 3418                  * the caller will loop if the entry it cares about is
 3419                  * not written.
 3420                  */
 3421                 cnt = ump->softdep_on_journal;
 3422                 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
 3423                         bp->b_flags |= B_INVAL | B_NOCACHE;
 3424                         WORKITEM_FREE(jseg, D_JSEG);
 3425                         FREE_LOCK(ump);
 3426                         brelse(bp);
 3427                         ACQUIRE_LOCK(ump);
 3428                         break;
 3429                 }
 3430                 /*
 3431                  * Calculate the disk block size required for the available
 3432                  * records rounded to the min size.
 3433                  */
 3434                 if (cnt == 0)
 3435                         size = devbsize;
 3436                 else if (cnt < jrecmax)
 3437                         size = howmany(cnt, jrecmin) * devbsize;
 3438                 else
 3439                         size = fs->fs_bsize;
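                /*
                 * For example (added; with 512-byte device blocks and a
                 * jrecmin of 15): 20 available records round up to
                 * howmany(20, 15) = 2 device blocks, a 1024-byte write.
                 */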
 3440                 /*
 3441                  * Allocate a disk block for this journal data and account
 3442                  * for truncation of the requested size if enough contiguous
 3443                  * space was not available.
 3444                  */
 3445                 bp->b_blkno = jblocks_alloc(jblocks, size, &size);
 3446                 bp->b_lblkno = bp->b_blkno;
 3447                 bp->b_offset = bp->b_blkno * DEV_BSIZE;
 3448                 bp->b_bcount = size;
 3449                 bp->b_flags &= ~B_INVAL;
 3450                 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
 3451                 /*
 3452                  * Initialize our jseg with cnt records.  Assign the next
 3453                  * sequence number to it and link it in-order.
 3454                  */
 3455                 cnt = MIN(cnt, (size / devbsize) * jrecmin);
 3456                 jseg->js_buf = bp;
 3457                 jseg->js_cnt = cnt;
 3458                 jseg->js_refs = cnt + 1;        /* Self ref. */
 3459                 jseg->js_size = size;
 3460                 jseg->js_seq = jblocks->jb_nextseq++;
 3461                 if (jblocks->jb_oldestseg == NULL)
 3462                         jblocks->jb_oldestseg = jseg;
 3463                 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
 3464                 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
 3465                 if (jblocks->jb_writeseg == NULL)
 3466                         jblocks->jb_writeseg = jseg;
 3467                 /*
 3468                  * Start filling in records from the pending list.
 3469                  */
 3470                 data = bp->b_data;
 3471                 off = 0;
 3472 
 3473                 /*
 3474                  * Always put a header on the first block.
 3475                  * XXX As with below, there might not be a chance to get
 3476                  * into the loop.  Ensure that something valid is written.
 3477                  */
 3478                 jseg_write(ump, jseg, data);
 3479                 off += JREC_SIZE;
 3480                 data = bp->b_data + off;
 3481 
 3482                 /*
 3483                  * XXX Something is wrong here.  There's no work to do,
 3484                  * but we need to perform an I/O and allow it to complete
 3485                  * anyway.
 3486                  */
 3487                 if (LIST_EMPTY(&ump->softdep_journal_pending))
 3488                         stat_emptyjblocks++;
 3489 
 3490                 while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
 3491                     != NULL) {
 3492                         if (cnt == 0)
 3493                                 break;
 3494                         /* Place a segment header on every device block. */
 3495                         if ((off % devbsize) == 0) {
 3496                                 jseg_write(ump, jseg, data);
 3497                                 off += JREC_SIZE;
 3498                                 data = bp->b_data + off;
 3499                         }
 3500                         if (wk == needwk)
 3501                                 needwk = NULL;
 3502                         remove_from_journal(wk);
 3503                         wk->wk_state |= INPROGRESS;
 3504                         WORKLIST_INSERT(&jseg->js_entries, wk);
 3505                         switch (wk->wk_type) {
 3506                         case D_JADDREF:
 3507                                 jaddref_write(WK_JADDREF(wk), jseg, data);
 3508                                 break;
 3509                         case D_JREMREF:
 3510                                 jremref_write(WK_JREMREF(wk), jseg, data);
 3511                                 break;
 3512                         case D_JMVREF:
 3513                                 jmvref_write(WK_JMVREF(wk), jseg, data);
 3514                                 break;
 3515                         case D_JNEWBLK:
 3516                                 jnewblk_write(WK_JNEWBLK(wk), jseg, data);
 3517                                 break;
 3518                         case D_JFREEBLK:
 3519                                 jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
 3520                                 break;
 3521                         case D_JFREEFRAG:
 3522                                 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
 3523                                 break;
 3524                         case D_JTRUNC:
 3525                                 jtrunc_write(WK_JTRUNC(wk), jseg, data);
 3526                                 break;
 3527                         case D_JFSYNC:
 3528                                 jfsync_write(WK_JFSYNC(wk), jseg, data);
 3529                                 break;
 3530                         default:
 3531                                 panic("process_journal: Unknown type %s",
 3532                                     TYPENAME(wk->wk_type));
 3533                                 /* NOTREACHED */
 3534                         }
 3535                         off += JREC_SIZE;
 3536                         data = bp->b_data + off;
 3537                         cnt--;
 3538                 }
 3539 
 3540                 /* Clear any remaining space so we don't leak kernel data */
 3541                 if (size > off)
 3542                         bzero(data, size - off);
 3543 
 3544                 /*
 3545                  * Write this one buffer and continue.
 3546                  */
 3547                 segwritten = 1;
 3548                 jblocks->jb_needseg = 0;
 3549                 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
 3550                 FREE_LOCK(ump);
 3551                 pbgetvp(ump->um_devvp, bp);
 3552                 /*
 3553                  * We only do the blocking wait once we find the journal
 3554                  * entry we're looking for.
 3555                  */
 3556                 if (needwk == NULL && flags == MNT_WAIT)
 3557                         bwrite(bp);
 3558                 else
 3559                         bawrite(bp);
 3560                 ACQUIRE_LOCK(ump);
 3561         }
 3562         /*
 3563          * If we wrote a segment, issue a synchronize cache so the journal
 3564          * is reflected on disk before the data is written.  Since reclaiming
 3565          * journal space also requires writing a journal record this
 3566          * process also enforces a barrier before reclamation.
 3567          */
 3568         if (segwritten && shouldflush) {
 3569                 softdep_synchronize(bio, ump, 
 3570                     TAILQ_LAST(&jblocks->jb_segs, jseglst));
 3571         } else if (bio)
 3572                 g_destroy_bio(bio);
 3573         /*
 3574          * If we've suspended the filesystem because we ran out of journal
 3575          * space, either try to sync it here to make some progress or
 3576          * unsuspend it if we already have.
 3577          */
 3578         if (flags == 0 && jblocks->jb_suspended) {
 3579                 if (journal_unsuspend(ump))
 3580                         return;
 3581                 FREE_LOCK(ump);
 3582                 VFS_SYNC(mp, MNT_NOWAIT);
 3583                 ffs_sbupdate(ump, MNT_WAIT, 0);
 3584                 ACQUIRE_LOCK(ump);
 3585         }
 3586 }
 3587 
 3588 /*
 3589  * Complete a jseg, allowing all dependencies awaiting journal writes
 3590  * to proceed.  Each journal dependency also attaches a jsegdep to dependent
 3591  * structures so that the journal segment can be freed to reclaim space.
 3592  */
 3593 static void
 3594 complete_jseg(jseg)
 3595         struct jseg *jseg;
 3596 {
 3597         struct worklist *wk;
 3598         struct jmvref *jmvref;
 3599 #ifdef INVARIANTS
 3600         int i = 0;
 3601 #endif
 3602 
 3603         while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
 3604                 WORKLIST_REMOVE(wk);
 3605                 wk->wk_state &= ~INPROGRESS;
 3606                 wk->wk_state |= COMPLETE;
 3607                 KASSERT(i++ < jseg->js_cnt,
 3608                     ("handle_written_jseg: overflow %d >= %d",
 3609                     i - 1, jseg->js_cnt));
 3610                 switch (wk->wk_type) {
 3611                 case D_JADDREF:
 3612                         handle_written_jaddref(WK_JADDREF(wk));
 3613                         break;
 3614                 case D_JREMREF:
 3615                         handle_written_jremref(WK_JREMREF(wk));
 3616                         break;
 3617                 case D_JMVREF:
 3618                         rele_jseg(jseg);        /* No jsegdep. */
 3619                         jmvref = WK_JMVREF(wk);
 3620                         LIST_REMOVE(jmvref, jm_deps);
 3621                         if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
 3622                                 free_pagedep(jmvref->jm_pagedep);
 3623                         WORKITEM_FREE(jmvref, D_JMVREF);
 3624                         break;
 3625                 case D_JNEWBLK:
 3626                         handle_written_jnewblk(WK_JNEWBLK(wk));
 3627                         break;
 3628                 case D_JFREEBLK:
 3629                         handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
 3630                         break;
 3631                 case D_JTRUNC:
 3632                         handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
 3633                         break;
 3634                 case D_JFSYNC:
 3635                         rele_jseg(jseg);        /* No jsegdep. */
 3636                         WORKITEM_FREE(wk, D_JFSYNC);
 3637                         break;
 3638                 case D_JFREEFRAG:
 3639                         handle_written_jfreefrag(WK_JFREEFRAG(wk));
 3640                         break;
 3641                 default:
 3642                         panic("handle_written_jseg: Unknown type %s",
 3643                             TYPENAME(wk->wk_type));
 3644                         /* NOTREACHED */
 3645                 }
 3646         }
 3647         /* Release the self reference so the structure may be freed. */
 3648         rele_jseg(jseg);
 3649 }
 3650 
 3651 /*
 3652  * Determine which jsegs are ready for completion processing.  This waits
 3653  * for the synchronize cache to complete and forces in-order completion
 3654  * of journal entries.
 3655  */
 3656 static void
 3657 complete_jsegs(jseg)
 3658         struct jseg *jseg;
 3659 {
 3660         struct jblocks *jblocks;
 3661         struct jseg *jsegn;
 3662 
 3663         jblocks = jseg->js_jblocks;
 3664         /*
 3665          * Don't allow out of order completions.  If this isn't the first
 3666          * block wait for it to write before we're done.
 3667          */
 3668         if (jseg != jblocks->jb_writeseg)
 3669                 return;
 3670         /* Iterate through available jsegs processing their entries. */
 3671         while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
 3672                 jblocks->jb_oldestwrseq = jseg->js_oldseq;
 3673                 jsegn = TAILQ_NEXT(jseg, js_next);
 3674                 complete_jseg(jseg);
 3675                 jseg = jsegn;
 3676         }
 3677         jblocks->jb_writeseg = jseg;
 3678         /*
 3679          * Attempt to free jsegs now that oldestwrseq may have advanced. 
 3680          */
 3681         free_jsegs(jblocks);
 3682 }
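      /*
       * Ordering example (added): if segments 5, 6, and 7 are in flight
       * and segment 6 completes first, jb_writeseg remains at 5 and
       * nothing is processed.  Once 5 completes, both 5 and 6 are handled
       * in order and jb_writeseg advances to 7.
       */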
 3683 
 3684 /*
 3685  * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
 3686  * the final completions.
 3687  */
 3688 static void
 3689 handle_written_jseg(jseg, bp)
 3690         struct jseg *jseg;
 3691         struct buf *bp;
 3692 {
 3693 
 3694         if (jseg->js_refs == 0)
 3695                 panic("handle_written_jseg: No self-reference on %p", jseg);
 3696         jseg->js_state |= DEPCOMPLETE;
 3697         /*
 3698          * We'll never need this buffer again, set flags so it will be
 3699          * discarded.
 3700          */
 3701         bp->b_flags |= B_INVAL | B_NOCACHE;
 3702         pbrelvp(bp);
 3703         complete_jsegs(jseg);
 3704 }
 3705 
 3706 static inline struct jsegdep *
 3707 inoref_jseg(inoref)
 3708         struct inoref *inoref;
 3709 {
 3710         struct jsegdep *jsegdep;
 3711 
 3712         jsegdep = inoref->if_jsegdep;
 3713         inoref->if_jsegdep = NULL;
 3714 
 3715         return (jsegdep);
 3716 }
 3717 
 3718 /*
 3719  * Called once a jremref has made it to stable store.  The jremref is marked
 3720  * complete and we attempt to free it.  Any pagedep writes sleeping while waiting
 3721  * for the jremref to complete will be awoken by free_jremref.
 3722  */
 3723 static void
 3724 handle_written_jremref(jremref)
 3725         struct jremref *jremref;
 3726 {
 3727         struct inodedep *inodedep;
 3728         struct jsegdep *jsegdep;
 3729         struct dirrem *dirrem;
 3730 
 3731         /* Grab the jsegdep. */
 3732         jsegdep = inoref_jseg(&jremref->jr_ref);
 3733         /*
 3734          * Remove us from the inoref list.
 3735          */
 3736         if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
 3737             0, &inodedep) == 0)
 3738                 panic("handle_written_jremref: Lost inodedep");
 3739         TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
 3740         /*
 3741          * Complete the dirrem.
 3742          */
 3743         dirrem = jremref->jr_dirrem;
 3744         jremref->jr_dirrem = NULL;
 3745         LIST_REMOVE(jremref, jr_deps);
 3746         jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
 3747         jwork_insert(&dirrem->dm_jwork, jsegdep);
 3748         if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
 3749             (dirrem->dm_state & COMPLETE) != 0)
 3750                 add_to_worklist(&dirrem->dm_list, 0);
 3751         free_jremref(jremref);
 3752 }
 3753 
 3754 /*
 3755  * Called once a jaddref has made it to stable store.  The dependency is
 3756  * marked complete and any dependent structures are added to the inode
 3757  * bufwait list to be completed as soon as it is written.  If a bitmap write
 3758  * depends on this entry we move the inode into the inodedephd of the
 3759  * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
 3760  */
 3761 static void
 3762 handle_written_jaddref(jaddref)
 3763         struct jaddref *jaddref;
 3764 {
 3765         struct jsegdep *jsegdep;
 3766         struct inodedep *inodedep;
 3767         struct diradd *diradd;
 3768         struct mkdir *mkdir;
 3769 
 3770         /* Grab the jsegdep. */
 3771         jsegdep = inoref_jseg(&jaddref->ja_ref);
 3772         mkdir = NULL;
 3773         diradd = NULL;
 3774         if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
 3775             0, &inodedep) == 0)
 3776                 panic("handle_written_jaddref: Lost inodedep.");
 3777         if (jaddref->ja_diradd == NULL)
 3778                 panic("handle_written_jaddref: No dependency");
 3779         if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
 3780                 diradd = jaddref->ja_diradd;
 3781                 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
 3782         } else if (jaddref->ja_state & MKDIR_PARENT) {
 3783                 mkdir = jaddref->ja_mkdir;
 3784                 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
 3785         } else if (jaddref->ja_state & MKDIR_BODY)
 3786                 mkdir = jaddref->ja_mkdir;
 3787         else
 3788                 panic("handle_written_jaddref: Unknown dependency %p",
 3789                     jaddref->ja_diradd);
 3790         jaddref->ja_diradd = NULL;      /* also clears ja_mkdir */
 3791         /*
 3792          * Remove us from the inode list.
 3793          */
 3794         TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
 3795         /*
 3796          * The mkdir may be waiting on the jaddref to clear before freeing.
 3797          */
 3798         if (mkdir) {
 3799                 KASSERT(mkdir->md_list.wk_type == D_MKDIR,
 3800                     ("handle_written_jaddref: Incorrect type for mkdir %s",
 3801                     TYPENAME(mkdir->md_list.wk_type)));
 3802                 mkdir->md_jaddref = NULL;
 3803                 diradd = mkdir->md_diradd;
 3804                 mkdir->md_state |= DEPCOMPLETE;
 3805                 complete_mkdir(mkdir);
 3806         }
 3807         jwork_insert(&diradd->da_jwork, jsegdep);
 3808         if (jaddref->ja_state & NEWBLOCK) {
 3809                 inodedep->id_state |= ONDEPLIST;
 3810                 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
 3811                     inodedep, id_deps);
 3812         }
 3813         free_jaddref(jaddref);
 3814 }
 3815 
 3816 /*
 3817  * Called once a jnewblk journal is written.  The allocdirect or allocindir
 3818  * is placed in the bmsafemap to await notification of a written bitmap.  If
 3819  * the operation was canceled we add the segdep to the appropriate
 3820  * dependency to free the journal space once the canceling operation
 3821  * completes.
 3822  */
 3823 static void
 3824 handle_written_jnewblk(jnewblk)
 3825         struct jnewblk *jnewblk;
 3826 {
 3827         struct bmsafemap *bmsafemap;
 3828         struct freefrag *freefrag;
 3829         struct freework *freework;
 3830         struct jsegdep *jsegdep;
 3831         struct newblk *newblk;
 3832 
 3833         /* Grab the jsegdep. */
 3834         jsegdep = jnewblk->jn_jsegdep;
 3835         jnewblk->jn_jsegdep = NULL;
 3836         if (jnewblk->jn_dep == NULL) 
 3837                 panic("handle_written_jnewblk: No dependency for the segdep.");
 3838         switch (jnewblk->jn_dep->wk_type) {
 3839         case D_NEWBLK:
 3840         case D_ALLOCDIRECT:
 3841         case D_ALLOCINDIR:
 3842                 /*
 3843                  * Add the written block to the bmsafemap so it can
 3844                  * be notified when the bitmap is on disk.
 3845                  */
 3846                 newblk = WK_NEWBLK(jnewblk->jn_dep);
 3847                 newblk->nb_jnewblk = NULL;
 3848                 if ((newblk->nb_state & GOINGAWAY) == 0) {
 3849                         bmsafemap = newblk->nb_bmsafemap;
 3850                         newblk->nb_state |= ONDEPLIST;
 3851                         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
 3852                             nb_deps);
 3853                 }
 3854                 jwork_insert(&newblk->nb_jwork, jsegdep);
 3855                 break;
 3856         case D_FREEFRAG:
 3857                 /*
 3858                  * A new block is being removed by a freefrag because it
 3859                  * was replaced by a fragment extension.
 3860                  */
 3861                 freefrag = WK_FREEFRAG(jnewblk->jn_dep);
 3862                 freefrag->ff_jdep = NULL;
 3863                 jwork_insert(&freefrag->ff_jwork, jsegdep);
 3864                 break;
 3865         case D_FREEWORK:
 3866                 /*
 3867                  * A direct block was removed by truncate.
 3868                  */
 3869                 freework = WK_FREEWORK(jnewblk->jn_dep);
 3870                 freework->fw_jnewblk = NULL;
 3871                 jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
 3872                 break;
 3873         default:
 3874                 panic("handle_written_jnewblk: Unknown type %d.",
 3875                     jnewblk->jn_dep->wk_type);
 3876         }
 3877         jnewblk->jn_dep = NULL;
 3878         free_jnewblk(jnewblk);
 3879 }
 3880 
 3881 /*
 3882  * Cancel a jfreefrag that won't be needed, probably due to colliding with
 3883  * an in-flight allocation that has not yet been committed.  Divorce us
 3884  * from the freefrag and mark it DEPCOMPLETE so that it may be added
 3885  * to the worklist.
 3886  */
 3887 static void
 3888 cancel_jfreefrag(jfreefrag)
 3889         struct jfreefrag *jfreefrag;
 3890 {
 3891         struct freefrag *freefrag;
 3892 
 3893         if (jfreefrag->fr_jsegdep) {
 3894                 free_jsegdep(jfreefrag->fr_jsegdep);
 3895                 jfreefrag->fr_jsegdep = NULL;
 3896         }
 3897         freefrag = jfreefrag->fr_freefrag;
 3898         jfreefrag->fr_freefrag = NULL;
 3899         free_jfreefrag(jfreefrag);
 3900         freefrag->ff_state |= DEPCOMPLETE;
 3901         CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
 3902 }
 3903 
 3904 /*
 3905  * Free a jfreefrag when the parent freefrag is rendered obsolete.
 3906  */
 3907 static void
 3908 free_jfreefrag(jfreefrag)
 3909         struct jfreefrag *jfreefrag;
 3910 {
 3911 
 3912         if (jfreefrag->fr_state & INPROGRESS)
 3913                 WORKLIST_REMOVE(&jfreefrag->fr_list);
 3914         else if (jfreefrag->fr_state & ONWORKLIST)
 3915                 remove_from_journal(&jfreefrag->fr_list);
 3916         if (jfreefrag->fr_freefrag != NULL)
 3917                 panic("free_jfreefrag:  Still attached to a freefrag.");
 3918         WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
 3919 }
 3920 
 3921 /*
 3922  * Called when the journal write for a jfreefrag completes.  The parent
 3923  * freefrag is added to the worklist if this completes its dependencies.
 3924  */
 3925 static void
 3926 handle_written_jfreefrag(jfreefrag)
 3927         struct jfreefrag *jfreefrag;
 3928 {
 3929         struct jsegdep *jsegdep;
 3930         struct freefrag *freefrag;
 3931 
 3932         /* Grab the jsegdep. */
 3933         jsegdep = jfreefrag->fr_jsegdep;
 3934         jfreefrag->fr_jsegdep = NULL;
 3935         freefrag = jfreefrag->fr_freefrag;
 3936         if (freefrag == NULL)
 3937                 panic("handle_written_jfreefrag: No freefrag.");
 3938         freefrag->ff_state |= DEPCOMPLETE;
 3939         freefrag->ff_jdep = NULL;
 3940         jwork_insert(&freefrag->ff_jwork, jsegdep);
 3941         if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
 3942                 add_to_worklist(&freefrag->ff_list, 0);
 3943         jfreefrag->fr_freefrag = NULL;
 3944         free_jfreefrag(jfreefrag);
 3945 }
 3946 
 3947 /*
 3948  * Called when the journal write for a jfreeblk completes.  The jfreeblk
 3949  * is removed from the freeblks list of pending journal writes and the
 3950  * jsegdep is moved to the freeblks jwork to be completed when all blocks
 3951  * have been reclaimed.
 3952  */
 3953 static void
 3954 handle_written_jblkdep(jblkdep)
 3955         struct jblkdep *jblkdep;
 3956 {
 3957         struct freeblks *freeblks;
 3958         struct jsegdep *jsegdep;
 3959 
 3960         /* Grab the jsegdep. */
 3961         jsegdep = jblkdep->jb_jsegdep;
 3962         jblkdep->jb_jsegdep = NULL;
 3963         freeblks = jblkdep->jb_freeblks;
 3964         LIST_REMOVE(jblkdep, jb_deps);
 3965         jwork_insert(&freeblks->fb_jwork, jsegdep);
 3966         /*
 3967          * If the freeblks is all journaled, we can add it to the worklist.
 3968          */
 3969         if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
 3970             (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
 3971                 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
 3972 
 3973         free_jblkdep(jblkdep);
 3974 }
 3975 
 3976 static struct jsegdep *
 3977 newjsegdep(struct worklist *wk)
 3978 {
 3979         struct jsegdep *jsegdep;
 3980 
 3981         jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
 3982         workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
 3983         jsegdep->jd_seg = NULL;
 3984 
 3985         return (jsegdep);
 3986 }
 3987 
 3988 static struct jmvref *
 3989 newjmvref(dp, ino, oldoff, newoff)
 3990         struct inode *dp;
 3991         ino_t ino;
 3992         off_t oldoff;
 3993         off_t newoff;
 3994 {
 3995         struct jmvref *jmvref;
 3996 
 3997         jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
 3998         workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp));
 3999         jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
 4000         jmvref->jm_parent = dp->i_number;
 4001         jmvref->jm_ino = ino;
 4002         jmvref->jm_oldoff = oldoff;
 4003         jmvref->jm_newoff = newoff;
 4004 
 4005         return (jmvref);
 4006 }
 4007 
 4008 /*
 4009  * Allocate a new jremref that tracks the removal of ip from dp with the
 4010  * directory entry offset of diroff.  Mark the entry as ATTACHED and
 4011  * DEPCOMPLETE as we have all the information required for the journal write
 4012  * and the directory entry has already been removed from the buffer.  The caller
 4013  * is responsible for linking the jremref into the pagedep and adding it
 4014  * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
 4015  * a DOTDOT addition so handle_workitem_remove() can properly assign
 4016  * the jsegdep when we're done.
 4017  */
 4018 static struct jremref *
 4019 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
 4020     off_t diroff, nlink_t nlink)
 4021 {
 4022         struct jremref *jremref;
 4023 
 4024         jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
 4025         workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
 4026         jremref->jr_state = ATTACHED;
 4027         newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
 4028            nlink, ip->i_mode);
 4029         jremref->jr_dirrem = dirrem;
 4030 
 4031         return (jremref);
 4032 }
 4033 
 4034 static inline void
 4035 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
 4036     nlink_t nlink, uint16_t mode)
 4037 {
 4038 
 4039         inoref->if_jsegdep = newjsegdep(&inoref->if_list);
 4040         inoref->if_diroff = diroff;
 4041         inoref->if_ino = ino;
 4042         inoref->if_parent = parent;
 4043         inoref->if_nlink = nlink;
 4044         inoref->if_mode = mode;
 4045 }
 4046 
 4047 /*
 4048  * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
 4049  * directory offset may not be known until later.  The caller is responsible
 4050  * for adding the entry to the journal when this information is available.  nlink
 4051  * should be the link count prior to the addition and mode is only required
 4052  * to have the correct FMT.
 4053  */
 4054 static struct jaddref *
 4055 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
 4056     uint16_t mode)
 4057 {
 4058         struct jaddref *jaddref;
 4059 
 4060         jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
 4061         workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
 4062         jaddref->ja_state = ATTACHED;
 4063         jaddref->ja_mkdir = NULL;
 4064         newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
 4065 
 4066         return (jaddref);
 4067 }
 4068 
 4069 /*
 4070  * Create a new free dependency for a freework.  The caller is responsible
 4071  * for adjusting the reference count when it has the lock held.  The freedep
 4072  * will track an outstanding bitmap write that will ultimately clear the
 4073  * freework to continue.
 4074  */
 4075 static struct freedep *
 4076 newfreedep(struct freework *freework)
 4077 {
 4078         struct freedep *freedep;
 4079 
 4080         freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
 4081         workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
 4082         freedep->fd_freework = freework;
 4083 
 4084         return (freedep);
 4085 }
 4086 
 4087 /*
 4088  * Free a freedep structure once the buffer it is linked to is written.  If
 4089  * this is the last reference to the freework, schedule it for completion.
 4090  */
 4091 static void
 4092 free_freedep(freedep)
 4093         struct freedep *freedep;
 4094 {
 4095         struct freework *freework;
 4096 
 4097         freework = freedep->fd_freework;
 4098         freework->fw_freeblks->fb_cgwait--;
 4099         if (--freework->fw_ref == 0)
 4100                 freework_enqueue(freework);
 4101         WORKITEM_FREE(freedep, D_FREEDEP);
 4102 }
 4103 
 4104 /*
 4105  * Allocate a new freework structure that represents a level of an indirect
 4106  * block when parent is not NULL, or a top-level block when it is.  The top level
 4107  * freework structures are allocated without the per-filesystem lock held
 4108  * and before the freeblks is visible outside of softdep_setup_freeblocks().
 4109  */
 4110 static struct freework *
 4111 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
 4112         struct ufsmount *ump;
 4113         struct freeblks *freeblks;
 4114         struct freework *parent;
 4115         ufs_lbn_t lbn;
 4116         ufs2_daddr_t nb;
 4117         int frags;
 4118         int off;
 4119         int journal;
 4120 {
 4121         struct freework *freework;
 4122 
 4123         freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
 4124         workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
 4125         freework->fw_state = ATTACHED;
 4126         freework->fw_jnewblk = NULL;
 4127         freework->fw_freeblks = freeblks;
 4128         freework->fw_parent = parent;
 4129         freework->fw_lbn = lbn;
 4130         freework->fw_blkno = nb;
 4131         freework->fw_frags = frags;
 4132         freework->fw_indir = NULL;
 4133         freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
 4134                 ? 0 : NINDIR(ump->um_fs) + 1;
 4135         freework->fw_start = freework->fw_off = off;
 4136         if (journal)
 4137                 newjfreeblk(freeblks, lbn, nb, frags);
 4138         if (parent == NULL) {
 4139                 ACQUIRE_LOCK(ump);
 4140                 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
 4141                 freeblks->fb_ref++;
 4142                 FREE_LOCK(ump);
 4143         }
 4144 
 4145         return (freework);
 4146 }
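      /*
       * Reference-count example (added; assumes UFS2 with 32KB blocks, so
       * 8-byte block pointers): NINDIR = 32768 / 8 = 4096, and a journaled
       * indirect-level freework starts with fw_ref = 4096 + 1 = 4097,
       * i.e. one reference per child pointer plus one for the level itself.
       */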
 4147 
 4148 /*
 4149  * Eliminate a jfreeblk for a block that does not need journaling.
 4150  */
 4151 static void
 4152 cancel_jfreeblk(freeblks, blkno)
 4153         struct freeblks *freeblks;
 4154         ufs2_daddr_t blkno;
 4155 {
 4156         struct jfreeblk *jfreeblk;
 4157         struct jblkdep *jblkdep;
 4158 
 4159         LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
 4160                 if (jblkdep->jb_list.wk_type != D_JFREEBLK)
 4161                         continue;
 4162                 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
 4163                 if (jfreeblk->jf_blkno == blkno)
 4164                         break;
 4165         }
 4166         if (jblkdep == NULL)
 4167                 return;
 4168         CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
 4169         free_jsegdep(jblkdep->jb_jsegdep);
 4170         LIST_REMOVE(jblkdep, jb_deps);
 4171         WORKITEM_FREE(jfreeblk, D_JFREEBLK);
 4172 }
 4173 
 4174 /*
 4175  * Allocate a new jfreeblk to journal a top-level block pointer when truncating
 4176  * a file.  The caller must add this to the worklist when the per-filesystem
 4177  * lock is held.
 4178  */
 4179 static struct jfreeblk *
 4180 newjfreeblk(freeblks, lbn, blkno, frags)
 4181         struct freeblks *freeblks;
 4182         ufs_lbn_t lbn;
 4183         ufs2_daddr_t blkno;
 4184         int frags;
 4185 {
 4186         struct jfreeblk *jfreeblk;
 4187 
 4188         jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
 4189         workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
 4190             freeblks->fb_list.wk_mp);
 4191         jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
 4192         jfreeblk->jf_dep.jb_freeblks = freeblks;
 4193         jfreeblk->jf_ino = freeblks->fb_inum;
 4194         jfreeblk->jf_lbn = lbn;
 4195         jfreeblk->jf_blkno = blkno;
 4196         jfreeblk->jf_frags = frags;
 4197         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
 4198 
 4199         return (jfreeblk);
 4200 }
 4201 
 4202 /*
 4203  * The journal is only prepared to handle full-size block numbers, so we
 4204  * have to adjust the record to reflect the change to a full-size block.
 4205  * For example, suppose we have a block made up of fragments 8-15 and
 4206  * want to free its last two fragments. We are given a request that says:
 4207  *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
 4208  * where frags are the number of fragments to free and oldfrags are the
 4209  * number of fragments to keep. To block align it, we have to change it to
 4210  * have a valid full-size blkno, so it becomes:
 4211  *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
 4212  */
 4213 static void
 4214 adjust_newfreework(freeblks, frag_offset)
 4215         struct freeblks *freeblks;
 4216         int frag_offset;
 4217 {
 4218         struct jfreeblk *jfreeblk;
 4219 
 4220         KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
 4221             LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
 4222             ("adjust_newfreework: Missing freeblks dependency"));
 4223 
 4224         jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
 4225         jfreeblk->jf_blkno -= frag_offset;
 4226         jfreeblk->jf_frags += frag_offset;
 4227 }
 4228 
 4229 /*
 4230  * Allocate a new jtrunc to track a partial truncation.
 4231  */
 4232 static struct jtrunc *
 4233 newjtrunc(freeblks, size, extsize)
 4234         struct freeblks *freeblks;
 4235         off_t size;
 4236         int extsize;
 4237 {
 4238         struct jtrunc *jtrunc;
 4239 
 4240         jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
 4241         workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
 4242             freeblks->fb_list.wk_mp);
 4243         jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
 4244         jtrunc->jt_dep.jb_freeblks = freeblks;
 4245         jtrunc->jt_ino = freeblks->fb_inum;
 4246         jtrunc->jt_size = size;
 4247         jtrunc->jt_extsize = extsize;
 4248         LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
 4249 
 4250         return (jtrunc);
 4251 }
 4252 
 4253 /*
 4254  * If we're canceling a new bitmap we have to search for another ref
 4255  * to move into the bmsafemap dep.  This might be better expressed
 4256  * with another structure.
 4257  */
 4258 static void
 4259 move_newblock_dep(jaddref, inodedep)
 4260         struct jaddref *jaddref;
 4261         struct inodedep *inodedep;
 4262 {
 4263         struct inoref *inoref;
 4264         struct jaddref *jaddrefn;
 4265 
 4266         jaddrefn = NULL;
 4267         for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
 4268             inoref = TAILQ_NEXT(inoref, if_deps)) {
 4269                 if ((jaddref->ja_state & NEWBLOCK) &&
 4270                     inoref->if_list.wk_type == D_JADDREF) {
 4271                         jaddrefn = (struct jaddref *)inoref;
 4272                         break;
 4273                 }
 4274         }
 4275         if (jaddrefn == NULL)
 4276                 return;
 4277         jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
 4278         jaddrefn->ja_state |= jaddref->ja_state &
 4279             (ATTACHED | UNDONE | NEWBLOCK);
 4280         jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
 4281         jaddref->ja_state |= ATTACHED;
 4282         LIST_REMOVE(jaddref, ja_bmdeps);
 4283         LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
 4284             ja_bmdeps);
 4285 }
 4286 
 4287 /*
 4288  * Cancel a jaddref either before it has been written or while it is being
 4289  * written.  This happens when a link is removed before the add reaches
 4290  * the disk.  The jaddref dependency is kept linked into the bmsafemap
 4291  * and inode to prevent the link count or bitmap from reaching the disk
 4292  * until handle_workitem_remove() re-adjusts the counts and bitmaps as
 4293  * required.
 4294  *
 4295  * Returns 1 if the canceled addref requires journaling of the remove and
 4296  * 0 otherwise.
 4297  */
 4298 static int
 4299 cancel_jaddref(jaddref, inodedep, wkhd)
 4300         struct jaddref *jaddref;
 4301         struct inodedep *inodedep;
 4302         struct workhead *wkhd;
 4303 {
 4304         struct inoref *inoref;
 4305         struct jsegdep *jsegdep;
 4306         int needsj;
 4307 
 4308         KASSERT((jaddref->ja_state & COMPLETE) == 0,
 4309             ("cancel_jaddref: Canceling complete jaddref"));
 4310         if (jaddref->ja_state & (INPROGRESS | COMPLETE))
 4311                 needsj = 1;
 4312         else
 4313                 needsj = 0;
 4314         if (inodedep == NULL)
 4315                 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
 4316                     0, &inodedep) == 0)
 4317                         panic("cancel_jaddref: Lost inodedep");
 4318         /*
 4319          * We must adjust the nlink of any reference operation that follows
 4320          * us so that it is consistent with the in-memory reference.  This
 4321          * ensures that inode nlink rollbacks always have the correct link.
 4322          */
 4323         if (needsj == 0) {
 4324                 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
 4325                     inoref = TAILQ_NEXT(inoref, if_deps)) {
 4326                         if (inoref->if_state & GOINGAWAY)
 4327                                 break;
 4328                         inoref->if_nlink--;
 4329                 }
 4330         }
 4331         jsegdep = inoref_jseg(&jaddref->ja_ref);
 4332         if (jaddref->ja_state & NEWBLOCK)
 4333                 move_newblock_dep(jaddref, inodedep);
 4334         wake_worklist(&jaddref->ja_list);
 4335         jaddref->ja_mkdir = NULL;
 4336         if (jaddref->ja_state & INPROGRESS) {
 4337                 jaddref->ja_state &= ~INPROGRESS;
 4338                 WORKLIST_REMOVE(&jaddref->ja_list);
 4339                 jwork_insert(wkhd, jsegdep);
 4340         } else {
 4341                 free_jsegdep(jsegdep);
 4342                 if (jaddref->ja_state & DEPCOMPLETE)
 4343                         remove_from_journal(&jaddref->ja_list);
 4344         }
 4345         jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
 4346         /*
 4347          * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
 4348          * can arrange for them to be freed with the bitmap.  Otherwise we
 4349          * no longer need this addref attached to the inoreflst, and
 4350          * leaving it there would incorrectly adjust nlink.
 4351          */
 4352         if ((jaddref->ja_state & NEWBLOCK) == 0) {
 4353                 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
 4354                     if_deps);
 4355                 jaddref->ja_state |= COMPLETE;
 4356                 free_jaddref(jaddref);
 4357                 return (needsj);
 4358         }
 4359         /*
 4360          * Leave the head of the list for jsegdeps for fast merging.
 4361          */
 4362         if (LIST_FIRST(wkhd) != NULL) {
 4363                 jaddref->ja_state |= ONWORKLIST;
 4364                 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
 4365         } else
 4366                 WORKLIST_INSERT(wkhd, &jaddref->ja_list);
 4367 
 4368         return (needsj);
 4369 }
 4370 
 4371 /* 
 4372  * Attempt to free a jaddref structure when some work completes.  This
 4373  * should only succeed once the entry is written and all dependencies have
 4374  * been notified.
 4375  */
 4376 static void
 4377 free_jaddref(jaddref)
 4378         struct jaddref *jaddref;
 4379 {
 4380 
 4381         if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
 4382                 return;
 4383         if (jaddref->ja_ref.if_jsegdep)
 4384                 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
 4385                     jaddref, jaddref->ja_state);
 4386         if (jaddref->ja_state & NEWBLOCK)
 4387                 LIST_REMOVE(jaddref, ja_bmdeps);
 4388         if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
 4389                 panic("free_jaddref: Bad state %p(0x%X)",
 4390                     jaddref, jaddref->ja_state);
 4391         if (jaddref->ja_mkdir != NULL)
 4392                 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
 4393         WORKITEM_FREE(jaddref, D_JADDREF);
 4394 }
 4395 
 4396 /*
 4397  * Free a jremref structure once it has been written or discarded.
 4398  */
 4399 static void
 4400 free_jremref(jremref)
 4401         struct jremref *jremref;
 4402 {
 4403 
 4404         if (jremref->jr_ref.if_jsegdep)
 4405                 free_jsegdep(jremref->jr_ref.if_jsegdep);
 4406         if (jremref->jr_state & INPROGRESS)
 4407                 panic("free_jremref: IO still pending");
 4408         WORKITEM_FREE(jremref, D_JREMREF);
 4409 }
 4410 
 4411 /*
 4412  * Free a jnewblk structure.
 4413  */
 4414 static void
 4415 free_jnewblk(jnewblk)
 4416         struct jnewblk *jnewblk;
 4417 {
 4418 
 4419         if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
 4420                 return;
 4421         LIST_REMOVE(jnewblk, jn_deps);
 4422         if (jnewblk->jn_dep != NULL)
 4423                 panic("free_jnewblk: Dependency still attached.");
 4424         WORKITEM_FREE(jnewblk, D_JNEWBLK);
 4425 }
 4426 
 4427 /*
 4428  * Cancel a jnewblk which has been made redundant by frag extension.
 4429  */
 4430 static void
 4431 cancel_jnewblk(jnewblk, wkhd)
 4432         struct jnewblk *jnewblk;
 4433         struct workhead *wkhd;
 4434 {
 4435         struct jsegdep *jsegdep;
 4436 
 4437         CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
 4438         jsegdep = jnewblk->jn_jsegdep;
 4439         if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
 4440                 panic("cancel_jnewblk: Invalid state");
 4441         jnewblk->jn_jsegdep = NULL;
 4442         jnewblk->jn_dep = NULL;
 4443         jnewblk->jn_state |= GOINGAWAY;
 4444         if (jnewblk->jn_state & INPROGRESS) {
 4445                 jnewblk->jn_state &= ~INPROGRESS;
 4446                 WORKLIST_REMOVE(&jnewblk->jn_list);
 4447                 jwork_insert(wkhd, jsegdep);
 4448         } else {
 4449                 free_jsegdep(jsegdep);
 4450                 remove_from_journal(&jnewblk->jn_list);
 4451         }
 4452         wake_worklist(&jnewblk->jn_list);
 4453         WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
 4454 }
 4455 
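/*
 * The cancellation pattern shared by cancel_jaddref() and cancel_jnewblk()
 * above, restated as an illustrative fragment (wk and jsegdep stand in for
 * the record-specific fields; cancel_jaddref() additionally checks
 * DEPCOMPLETE before the journal removal):
 */
#if 0
        if (wk->wk_state & INPROGRESS) {
                /* The write was already issued; journal the reversal. */
                wk->wk_state &= ~INPROGRESS;
                WORKLIST_REMOVE(wk);
                jwork_insert(wkhd, jsegdep);
        } else {
                /* The record never reached the journal; discard it. */
                free_jsegdep(jsegdep);
                remove_from_journal(wk);
        }
#endif
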
 4456 static void
 4457 free_jblkdep(jblkdep)
 4458         struct jblkdep *jblkdep;
 4459 {
 4460 
 4461         if (jblkdep->jb_list.wk_type == D_JFREEBLK)
 4462                 WORKITEM_FREE(jblkdep, D_JFREEBLK);
 4463         else if (jblkdep->jb_list.wk_type == D_JTRUNC)
 4464                 WORKITEM_FREE(jblkdep, D_JTRUNC);
 4465         else
 4466                 panic("free_jblkdep: Unexpected type %s",
 4467                     TYPENAME(jblkdep->jb_list.wk_type));
 4468 }
 4469 
 4470 /*
 4471  * Free a single jseg once it is no longer referenced in memory or on
 4472  * disk.  Reclaim journal blocks and dependencies waiting for the segment
 4473  * to disappear.
 4474  */
 4475 static void
 4476 free_jseg(jseg, jblocks)
 4477         struct jseg *jseg;
 4478         struct jblocks *jblocks;
 4479 {
 4480         struct freework *freework;
 4481 
 4482         /*
 4483          * Free freework structures that were lingering to indicate freed
 4484          * indirect blocks that forced journal write ordering on reallocation.
 4485          */
 4486         while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
 4487                 indirblk_remove(freework);
 4488         if (jblocks->jb_oldestseg == jseg)
 4489                 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
 4490         TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
 4491         jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
 4492         KASSERT(LIST_EMPTY(&jseg->js_entries),
 4493             ("free_jseg: Freed jseg has valid entries."));
 4494         WORKITEM_FREE(jseg, D_JSEG);
 4495 }
 4496 
 4497 /*
 4498  * Free all jsegs that meet the criteria for being reclaimed and update
 4499  * oldestseg.
 4500  */
 4501 static void
 4502 free_jsegs(jblocks)
 4503         struct jblocks *jblocks;
 4504 {
 4505         struct jseg *jseg;
 4506 
 4507         /*
 4508          * Free only jsegs with no earlier-allocated jsegs remaining, to
 4509          * preserve the journal space ordering.
 4510          */
 4511         while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
 4512                 /*
 4513                  * Only reclaim space when nothing depends on this journal
 4514                  * set and another set has written that it is no longer
 4515                  * valid.
 4516                  */
 4517                 if (jseg->js_refs != 0) {
 4518                         jblocks->jb_oldestseg = jseg;
 4519                         return;
 4520                 }
 4521                 if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
 4522                         break;
 4523                 if (jseg->js_seq > jblocks->jb_oldestwrseq)
 4524                         break;
 4525                 /*
 4526                  * We can free jsegs that didn't write entries when
 4527                  * oldestwrseq == js_seq.
 4528                  */
 4529                 if (jseg->js_seq == jblocks->jb_oldestwrseq &&
 4530                     jseg->js_cnt != 0)
 4531                         break;
 4532                 free_jseg(jseg, jblocks);
 4533         }
 4534         /*
 4535          * If we exited the loop above we still must discover the
 4536          * oldest valid segment.
 4537          */
 4538         if (jseg)
 4539                 for (jseg = jblocks->jb_oldestseg; jseg != NULL;
 4540                      jseg = TAILQ_NEXT(jseg, js_next))
 4541                         if (jseg->js_refs != 0)
 4542                                 break;
 4543         jblocks->jb_oldestseg = jseg;
 4544         /*
 4545          * The journal has no valid records but some jsegs may still be
 4546          * waiting on oldestwrseq to advance.  We force a small record
 4547          * out to permit these lingering records to be reclaimed.
 4548          */
 4549         if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
 4550                 jblocks->jb_needseg = 1;
 4551 }
 4552 
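/*
 * The reclamation test applied by the loop in free_jsegs() above, factored
 * into a hypothetical predicate for clarity (not used elsewhere):
 */
#if 0
static int
jseg_reclaimable(struct jseg *jseg, struct jblocks *jblocks)
{

        if (jseg->js_refs != 0)
                return (0);     /* Still referenced in memory. */
        if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
                return (0);     /* Not yet fully written. */
        if (jseg->js_seq > jblocks->jb_oldestwrseq)
                return (0);     /* No later set has invalidated it. */
        /* An empty segment may be freed once oldestwrseq catches up. */
        if (jseg->js_seq == jblocks->jb_oldestwrseq && jseg->js_cnt != 0)
                return (0);
        return (1);
}
#endif
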
 4553 /*
 4554  * Release one reference to a jseg and free it if the count reaches 0.  This
 4555  * should eventually reclaim journal space as well.
 4556  */
 4557 static void
 4558 rele_jseg(jseg)
 4559         struct jseg *jseg;
 4560 {
 4561 
 4562         KASSERT(jseg->js_refs > 0,
 4563             ("free_jseg: Invalid refcnt %d", jseg->js_refs));
 4564         if (--jseg->js_refs != 0)
 4565                 return;
 4566         free_jsegs(jseg->js_jblocks);
 4567 }
 4568 
 4569 /*
 4570  * Release a jsegdep and decrement the jseg count.
 4571  */
 4572 static void
 4573 free_jsegdep(jsegdep)
 4574         struct jsegdep *jsegdep;
 4575 {
 4576 
 4577         if (jsegdep->jd_seg)
 4578                 rele_jseg(jsegdep->jd_seg);
 4579         WORKITEM_FREE(jsegdep, D_JSEGDEP);
 4580 }
 4581 
 4582 /*
 4583  * Wait for a journal item to make it to disk.  Initiate journal processing
 4584  * if required.
 4585  */
 4586 static int
 4587 jwait(wk, waitfor)
 4588         struct worklist *wk;
 4589         int waitfor;
 4590 {
 4591 
 4592         LOCK_OWNED(VFSTOUFS(wk->wk_mp));
 4593         /*
 4594          * Blocking journal waits cause slow synchronous behavior.  Record
 4595          * stats on the frequency of these blocking operations.
 4596          */
 4597         if (waitfor == MNT_WAIT) {
 4598                 stat_journal_wait++;
 4599                 switch (wk->wk_type) {
 4600                 case D_JREMREF:
 4601                 case D_JMVREF:
 4602                         stat_jwait_filepage++;
 4603                         break;
 4604                 case D_JTRUNC:
 4605                 case D_JFREEBLK:
 4606                         stat_jwait_freeblks++;
 4607                         break;
 4608                 case D_JNEWBLK:
 4609                         stat_jwait_newblk++;
 4610                         break;
 4611                 case D_JADDREF:
 4612                         stat_jwait_inode++;
 4613                         break;
 4614                 default:
 4615                         break;
 4616                 }
 4617         }
 4618         /*
 4619          * If IO has not started we process the journal.  We can't mark the
 4620          * worklist item as IOWAITING because we drop the lock while
 4621          * processing the journal and the worklist entry may be freed after
 4622          * this point.  The caller may call back in and re-issue the request.
 4623          */
 4624         if ((wk->wk_state & INPROGRESS) == 0) {
 4625                 softdep_process_journal(wk->wk_mp, wk, waitfor);
 4626                 if (waitfor != MNT_WAIT)
 4627                         return (EBUSY);
 4628                 return (0);
 4629         }
 4630         if (waitfor != MNT_WAIT)
 4631                 return (EBUSY);
 4632         wait_worklist(wk, "jwait");
 4633         return (0);
 4634 }
 4635 
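/*
 * A hypothetical caller sketch (not from this file): the per-filesystem
 * lock is held, and a non-blocking caller backs off on EBUSY and re-issues
 * the request later, since the entry may be freed once the lock is dropped.
 */
#if 0
        if ((inoref->if_state & DEPCOMPLETE) == 0) {
                error = jwait(&inoref->if_list, waitfor);
                if (error != 0)
                        return (error);
        }
#endif
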
 4636 /*
 4637  * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
 4638  * appropriate.  This is a convenience function to reduce duplicate code
 4639  * for the setup and revert functions below.
 4640  */
 4641 static struct inodedep *
 4642 inodedep_lookup_ip(ip)
 4643         struct inode *ip;
 4644 {
 4645         struct inodedep *inodedep;
 4646 
 4647         KASSERT(ip->i_nlink >= ip->i_effnlink,
 4648             ("inodedep_lookup_ip: bad delta"));
 4649         (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
 4650             &inodedep);
 4651         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 4652         KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
 4653 
 4654         return (inodedep);
 4655 }
 4656 
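/*
 * A sketch of the invariant maintained above (illustrative only): the
 * effective link count trails i_nlink by exactly the recorded delta, e.g.
 * i_nlink == 3 with one uncommitted unlink gives i_effnlink == 2 and
 * id_nlinkdelta == 1.
 */
#if 0
        KASSERT(ip->i_effnlink == ip->i_nlink - inodedep->id_nlinkdelta,
            ("nlinkdelta out of sync"));
#endif
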
 4657 /*
 4658  * Called prior to creating a new inode and linking it to a directory.  The
 4659  * jaddref structure must already be allocated by softdep_setup_inomapdep
 4660  * and it is discovered here so we can initialize the mode and update
 4661  * nlinkdelta.
 4662  */
 4663 void
 4664 softdep_setup_create(dp, ip)
 4665         struct inode *dp;
 4666         struct inode *ip;
 4667 {
 4668         struct inodedep *inodedep;
 4669         struct jaddref *jaddref;
 4670         struct vnode *dvp;
 4671 
 4672         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4673             ("softdep_setup_create called on non-softdep filesystem"));
 4674         KASSERT(ip->i_nlink == 1,
 4675             ("softdep_setup_create: Invalid link count."));
 4676         dvp = ITOV(dp);
 4677         ACQUIRE_LOCK(ITOUMP(dp));
 4678         inodedep = inodedep_lookup_ip(ip);
 4679         if (DOINGSUJ(dvp)) {
 4680                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4681                     inoreflst);
 4682                 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
 4683                     ("softdep_setup_create: No addref structure present."));
 4684         }
 4685         softdep_prelink(dvp, NULL);
 4686         FREE_LOCK(ITOUMP(dp));
 4687 }
 4688 
 4689 /*
 4690  * Create a jaddref structure to track the addition of a DOTDOT link when
 4691  * we are reparenting an inode as part of a rename.  This jaddref will be
 4692  * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
 4693  * non-journaling softdep.
 4694  */
 4695 void
 4696 softdep_setup_dotdot_link(dp, ip)
 4697         struct inode *dp;
 4698         struct inode *ip;
 4699 {
 4700         struct inodedep *inodedep;
 4701         struct jaddref *jaddref;
 4702         struct vnode *dvp;
 4703 
 4704         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4705             ("softdep_setup_dotdot_link called on non-softdep filesystem"));
 4706         dvp = ITOV(dp);
 4707         jaddref = NULL;
 4708         /*
 4709          * We don't set MKDIR_PARENT as this is not tied to a mkdir and
 4710          * is used as a normal link would be.
 4711          */
 4712         if (DOINGSUJ(dvp))
 4713                 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
 4714                     dp->i_effnlink - 1, dp->i_mode);
 4715         ACQUIRE_LOCK(ITOUMP(dp));
 4716         inodedep = inodedep_lookup_ip(dp);
 4717         if (jaddref)
 4718                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 4719                     if_deps);
 4720         softdep_prelink(dvp, ITOV(ip));
 4721         FREE_LOCK(ITOUMP(dp));
 4722 }
 4723 
 4724 /*
 4725  * Create a jaddref structure to track a new link to an inode.  The directory
 4726  * offset is not known until softdep_setup_directory_add or
 4727  * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
 4728  * softdep.
 4729  */
 4730 void
 4731 softdep_setup_link(dp, ip)
 4732         struct inode *dp;
 4733         struct inode *ip;
 4734 {
 4735         struct inodedep *inodedep;
 4736         struct jaddref *jaddref;
 4737         struct vnode *dvp;
 4738 
 4739         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4740             ("softdep_setup_link called on non-softdep filesystem"));
 4741         dvp = ITOV(dp);
 4742         jaddref = NULL;
 4743         if (DOINGSUJ(dvp))
 4744                 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
 4745                     ip->i_mode);
 4746         ACQUIRE_LOCK(ITOUMP(dp));
 4747         inodedep = inodedep_lookup_ip(ip);
 4748         if (jaddref)
 4749                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 4750                     if_deps);
 4751         softdep_prelink(dvp, ITOV(ip));
 4752         FREE_LOCK(ITOUMP(dp));
 4753 }
 4754 
 4755 /*
 4756  * Called to create the jaddref structures to track . and .. references as
 4757  * well as lookup and further initialize the incomplete jaddref created
 4758  * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
 4759  * nlinkdelta for non-journaling softdep.
 4760  */
 4761 void
 4762 softdep_setup_mkdir(dp, ip)
 4763         struct inode *dp;
 4764         struct inode *ip;
 4765 {
 4766         struct inodedep *inodedep;
 4767         struct jaddref *dotdotaddref;
 4768         struct jaddref *dotaddref;
 4769         struct jaddref *jaddref;
 4770         struct vnode *dvp;
 4771 
 4772         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4773             ("softdep_setup_mkdir called on non-softdep filesystem"));
 4774         dvp = ITOV(dp);
 4775         dotaddref = dotdotaddref = NULL;
 4776         if (DOINGSUJ(dvp)) {
 4777                 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
 4778                     ip->i_mode);
 4779                 dotaddref->ja_state |= MKDIR_BODY;
 4780                 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
 4781                     dp->i_effnlink - 1, dp->i_mode);
 4782                 dotdotaddref->ja_state |= MKDIR_PARENT;
 4783         }
 4784         ACQUIRE_LOCK(ITOUMP(dp));
 4785         inodedep = inodedep_lookup_ip(ip);
 4786         if (DOINGSUJ(dvp)) {
 4787                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4788                     inoreflst);
 4789                 KASSERT(jaddref != NULL,
 4790                     ("softdep_setup_mkdir: No addref structure present."));
 4791                 KASSERT(jaddref->ja_parent == dp->i_number, 
 4792                     ("softdep_setup_mkdir: bad parent %ju",
 4793                     (uintmax_t)jaddref->ja_parent));
 4794                 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
 4795                     if_deps);
 4796         }
 4797         inodedep = inodedep_lookup_ip(dp);
 4798         if (DOINGSUJ(dvp))
 4799                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
 4800                     &dotdotaddref->ja_ref, if_deps);
 4801         softdep_prelink(ITOV(dp), NULL);
 4802         FREE_LOCK(ITOUMP(dp));
 4803 }
 4804 
 4805 /*
 4806  * Called to track nlinkdelta of the inode and parent directories prior to
 4807  * unlinking a directory.
 4808  */
 4809 void
 4810 softdep_setup_rmdir(dp, ip)
 4811         struct inode *dp;
 4812         struct inode *ip;
 4813 {
 4814         struct vnode *dvp;
 4815 
 4816         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4817             ("softdep_setup_rmdir called on non-softdep filesystem"));
 4818         dvp = ITOV(dp);
 4819         ACQUIRE_LOCK(ITOUMP(dp));
 4820         (void) inodedep_lookup_ip(ip);
 4821         (void) inodedep_lookup_ip(dp);
 4822         softdep_prelink(dvp, ITOV(ip));
 4823         FREE_LOCK(ITOUMP(dp));
 4824 }
 4825 
 4826 /*
 4827  * Called to track nlinkdelta of the inode and parent directories prior to
 4828  * unlink.
 4829  */
 4830 void
 4831 softdep_setup_unlink(dp, ip)
 4832         struct inode *dp;
 4833         struct inode *ip;
 4834 {
 4835         struct vnode *dvp;
 4836 
 4837         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4838             ("softdep_setup_unlink called on non-softdep filesystem"));
 4839         dvp = ITOV(dp);
 4840         ACQUIRE_LOCK(ITOUMP(dp));
 4841         (void) inodedep_lookup_ip(ip);
 4842         (void) inodedep_lookup_ip(dp);
 4843         softdep_prelink(dvp, ITOV(ip));
 4844         FREE_LOCK(ITOUMP(dp));
 4845 }
 4846 
 4847 /*
 4848  * Called to release the journal structures created by a failed non-directory
 4849  * creation.  Adjusts nlinkdelta for non-journaling softdep.
 4850  */
 4851 void
 4852 softdep_revert_create(dp, ip)
 4853         struct inode *dp;
 4854         struct inode *ip;
 4855 {
 4856         struct inodedep *inodedep;
 4857         struct jaddref *jaddref;
 4858         struct vnode *dvp;
 4859 
 4860         KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
 4861             ("softdep_revert_create called on non-softdep filesystem"));
 4862         dvp = ITOV(dp);
 4863         ACQUIRE_LOCK(ITOUMP(dp));
 4864         inodedep = inodedep_lookup_ip(ip);
 4865         if (DOINGSUJ(dvp)) {
 4866                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4867                     inoreflst);
 4868                 KASSERT(jaddref->ja_parent == dp->i_number,
 4869                     ("softdep_revert_create: addref parent mismatch"));
 4870                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
 4871         }
 4872         FREE_LOCK(ITOUMP(dp));
 4873 }
 4874 
 4875 /*
 4876  * Called to release the journal structures created by a failed link
 4877  * addition.  Adjusts nlinkdelta for non-journaling softdep.
 4878  */
 4879 void
 4880 softdep_revert_link(dp, ip)
 4881         struct inode *dp;
 4882         struct inode *ip;
 4883 {
 4884         struct inodedep *inodedep;
 4885         struct jaddref *jaddref;
 4886         struct vnode *dvp;
 4887 
 4888         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4889             ("softdep_revert_link called on non-softdep filesystem"));
 4890         dvp = ITOV(dp);
 4891         ACQUIRE_LOCK(ITOUMP(dp));
 4892         inodedep = inodedep_lookup_ip(ip);
 4893         if (DOINGSUJ(dvp)) {
 4894                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4895                     inoreflst);
 4896                 KASSERT(jaddref->ja_parent == dp->i_number,
 4897                     ("softdep_revert_link: addref parent mismatch"));
 4898                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
 4899         }
 4900         FREE_LOCK(ITOUMP(dp));
 4901 }
 4902 
 4903 /*
 4904  * Called to release the journal structures created by a failed mkdir
 4905  * attempt.  Adjusts nlinkdelta for non-journaling softdep.
 4906  */
 4907 void
 4908 softdep_revert_mkdir(dp, ip)
 4909         struct inode *dp;
 4910         struct inode *ip;
 4911 {
 4912         struct inodedep *inodedep;
 4913         struct jaddref *jaddref;
 4914         struct jaddref *dotaddref;
 4915         struct vnode *dvp;
 4916 
 4917         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4918             ("softdep_revert_mkdir called on non-softdep filesystem"));
 4919         dvp = ITOV(dp);
 4920 
 4921         ACQUIRE_LOCK(ITOUMP(dp));
 4922         inodedep = inodedep_lookup_ip(dp);
 4923         if (DOINGSUJ(dvp)) {
 4924                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4925                     inoreflst);
 4926                 KASSERT(jaddref->ja_parent == ip->i_number,
 4927                     ("softdep_revert_mkdir: dotdot addref parent mismatch"));
 4928                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
 4929         }
 4930         inodedep = inodedep_lookup_ip(ip);
 4931         if (DOINGSUJ(dvp)) {
 4932                 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 4933                     inoreflst);
 4934                 KASSERT(jaddref->ja_parent == dp->i_number,
 4935                     ("softdep_revert_mkdir: addref parent mismatch"));
 4936                 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
 4937                     inoreflst, if_deps);
 4938                 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
 4939                 KASSERT(dotaddref->ja_parent == ip->i_number,
 4940                     ("softdep_revert_mkdir: dot addref parent mismatch"));
 4941                 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
 4942         }
 4943         FREE_LOCK(ITOUMP(dp));
 4944 }
 4945 
 4946 /* 
 4947  * Called to correct nlinkdelta after a failed rmdir.
 4948  */
 4949 void
 4950 softdep_revert_rmdir(dp, ip)
 4951         struct inode *dp;
 4952         struct inode *ip;
 4953 {
 4954 
 4955         KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
 4956             ("softdep_revert_rmdir called on non-softdep filesystem"));
 4957         ACQUIRE_LOCK(ITOUMP(dp));
 4958         (void) inodedep_lookup_ip(ip);
 4959         (void) inodedep_lookup_ip(dp);
 4960         FREE_LOCK(ITOUMP(dp));
 4961 }
 4962 
 4963 /*
 4964  * Protecting the freemaps (or bitmaps).
 4965  * 
 4966  * To eliminate the need to execute fsck before mounting a filesystem
 4967  * after a power failure, one must (conservatively) guarantee that the
 4968  * on-disk copy of the bitmaps never indicates that a live inode or block
 4969  * is free.  So, when a block or inode is allocated, the bitmap should be
 4970  * updated (on disk) before any new pointers to it.  When a block or inode is
 4971  * freed, the bitmap should not be updated until all pointers have been
 4972  * reset.  The latter dependency is handled by the delayed de-allocation
 4973  * approach described below for block and inode de-allocation.  The former
 4974  * dependency is handled by calling the following procedure when a block or
 4975  * inode is allocated. When an inode is allocated an "inodedep" is created
 4976  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 4977  * Each "inodedep" is also inserted into the hash indexing structure so
 4978  * that any additional link additions can be made dependent on the inode
 4979  * allocation.
 4980  * 
 4981  * The ufs filesystem maintains a number of free block counts (e.g., per
 4982  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 4983  * in addition to the bitmaps.  These counts are used to improve efficiency
 4984  * during allocation and therefore must be consistent with the bitmaps.
 4985  * There is no convenient way to guarantee post-crash consistency of these
 4986  * counts with simple update ordering, for two main reasons: (1) The counts
 4987  * and bitmaps for a single cylinder group block are not in the same disk
 4988  * sector.  If a disk write is interrupted (e.g., by power failure), one may
 4989  * be written and the other not.  (2) Some of the counts are located in the
 4990  * superblock rather than the cylinder group block. So, we focus our soft
 4991  * updates implementation on protecting the bitmaps. When mounting a
 4992  * filesystem, we recompute the auxiliary counts from the bitmaps.
 4993  */
 4994 
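/*
 * An illustrative sketch of the mount-time recomputation mentioned above:
 * an auxiliary free-fragment count can always be rebuilt from the
 * authoritative bitmap.  The helper is hypothetical; isset() is the
 * standard bitmap test used elsewhere in this file.
 */
#if 0
static int
count_free_frags(uint8_t *blksfree, int nfrags)
{
        int cnt, i;

        cnt = 0;
        for (i = 0; i < nfrags; i++)
                if (isset(blksfree, i))
                        cnt++;
        return (cnt);
}
#endif
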
 4995 /*
 4996  * Called just after updating the cylinder group block to allocate an inode.
 4997  */
 4998 void
 4999 softdep_setup_inomapdep(bp, ip, newinum, mode)
 5000         struct buf *bp;         /* buffer for cylgroup block with inode map */
 5001         struct inode *ip;       /* inode related to allocation */
 5002         ino_t newinum;          /* new inode number being allocated */
 5003         int mode;
 5004 {
 5005         struct inodedep *inodedep;
 5006         struct bmsafemap *bmsafemap;
 5007         struct jaddref *jaddref;
 5008         struct mount *mp;
 5009         struct fs *fs;
 5010 
 5011         mp = ITOVFS(ip);
 5012         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 5013             ("softdep_setup_inomapdep called on non-softdep filesystem"));
 5014         fs = VFSTOUFS(mp)->um_fs;
 5015         jaddref = NULL;
 5016 
 5017         /*
 5018          * Allocate the journal reference add structure so that the bitmap
 5019          * can be dependent on it.
 5020          */
 5021         if (MOUNTEDSUJ(mp)) {
 5022                 jaddref = newjaddref(ip, newinum, 0, 0, mode);
 5023                 jaddref->ja_state |= NEWBLOCK;
 5024         }
 5025 
 5026         /*
 5027          * Create a dependency for the newly allocated inode.
 5028          * Panic if it already exists as something is seriously wrong.
 5029          * Otherwise add it to the dependency list for the buffer holding
 5030          * the cylinder group map from which it was allocated.
 5031          *
 5032          * We have to preallocate a bmsafemap entry in case it is needed
 5033          * in bmsafemap_lookup since once we allocate the inodedep, we
 5034          * have to finish initializing it before we can FREE_LOCK().
 5035          * By preallocating, we avoid FREE_LOCK() while doing a malloc
 5036          * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
 5037          * creating the inodedep as it can be freed during the time
 5038          * that we FREE_LOCK() while allocating the inodedep. We must
 5039          * call workitem_alloc() before entering the locked section as
 5040          * it also acquires the lock and we must avoid trying to do so
 5041          * recursively.
 5042          */
 5043         bmsafemap = malloc(sizeof(struct bmsafemap),
 5044             M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 5045         workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
 5046         ACQUIRE_LOCK(ITOUMP(ip));
 5047         if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
 5048                 panic("softdep_setup_inomapdep: dependency %p for new "
 5049                     "inode already exists", inodedep);
 5050         bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
 5051         if (jaddref) {
 5052                 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
 5053                 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
 5054                     if_deps);
 5055         } else {
 5056                 inodedep->id_state |= ONDEPLIST;
 5057                 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
 5058         }
 5059         inodedep->id_bmsafemap = bmsafemap;
 5060         inodedep->id_state &= ~DEPCOMPLETE;
 5061         FREE_LOCK(ITOUMP(ip));
 5062 }
 5063 
 5064 /*
 5065  * Called just after updating the cylinder group block to
 5066  * allocate block or fragment.
 5067  */
 5068 void
 5069 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
 5070         struct buf *bp;         /* buffer for cylgroup block with block map */
 5071         struct mount *mp;       /* filesystem doing allocation */
 5072         ufs2_daddr_t newblkno;  /* number of newly allocated block */
 5073         int frags;              /* Number of fragments. */
 5074         int oldfrags;           /* Previous number of fragments for extend. */
 5075 {
 5076         struct newblk *newblk;
 5077         struct bmsafemap *bmsafemap;
 5078         struct jnewblk *jnewblk;
 5079         struct ufsmount *ump;
 5080         struct fs *fs;
 5081 
 5082         KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 5083             ("softdep_setup_blkmapdep called on non-softdep filesystem"));
 5084         ump = VFSTOUFS(mp);
 5085         fs = ump->um_fs;
 5086         jnewblk = NULL;
 5087         /*
 5088          * Create a dependency for the newly allocated block.
 5089          * Add it to the dependency list for the buffer holding
 5090          * the cylinder group map from which it was allocated.
 5091          */
 5092         if (MOUNTEDSUJ(mp)) {
 5093                 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
 5094                 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
 5095                 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
 5096                 jnewblk->jn_state = ATTACHED;
 5097                 jnewblk->jn_blkno = newblkno;
 5098                 jnewblk->jn_frags = frags;
 5099                 jnewblk->jn_oldfrags = oldfrags;
 5100 #ifdef SUJ_DEBUG
 5101                 {
 5102                         struct cg *cgp;
 5103                         uint8_t *blksfree;
 5104                         long bno;
 5105                         int i;
 5106         
 5107                         cgp = (struct cg *)bp->b_data;
 5108                         blksfree = cg_blksfree(cgp);
 5109                         bno = dtogd(fs, jnewblk->jn_blkno);
 5110                         for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
 5111                             i++) {
 5112                                 if (isset(blksfree, bno + i))
 5113                                         panic("softdep_setup_blkmapdep: "
 5114                                             "free fragment %d from %d-%d "
 5115                                             "state 0x%X dep %p", i,
 5116                                             jnewblk->jn_oldfrags,
 5117                                             jnewblk->jn_frags,
 5118                                             jnewblk->jn_state,
 5119                                             jnewblk->jn_dep);
 5120                         }
 5121                 }
 5122 #endif
 5123         }
 5124 
 5125         CTR3(KTR_SUJ,
 5126             "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
 5127             newblkno, frags, oldfrags);
 5128         ACQUIRE_LOCK(ump);
 5129         if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
 5130                 panic("softdep_setup_blkmapdep: found block");
 5131         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
 5132             dtog(fs, newblkno), NULL);
 5133         if (jnewblk) {
 5134                 jnewblk->jn_dep = (struct worklist *)newblk;
 5135                 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
 5136         } else {
 5137                 newblk->nb_state |= ONDEPLIST;
 5138                 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
 5139         }
 5141         newblk->nb_jnewblk = jnewblk;
 5142         FREE_LOCK(ump);
 5143 }
 5144 
 5145 #define BMSAFEMAP_HASH(ump, cg) \
 5146       (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
 5147 
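/*
 * A hypothetical illustration, assuming the table is allocated with
 * hashinit(9) so that bmsafemap_hash_size holds size - 1 and the AND
 * above is a cheap power-of-two modulus:
 */
#if 0
        /* With a 16-bucket table (mask 15), cg 35 hashes to bucket 3. */
        bmsafemaphd = BMSAFEMAP_HASH(ump, 35);
#endif
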
 5148 static int
 5149 bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
 5150         struct bmsafemap_hashhead *bmsafemaphd;
 5151         int cg;
 5152         struct bmsafemap **bmsafemapp;
 5153 {
 5154         struct bmsafemap *bmsafemap;
 5155 
 5156         LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
 5157                 if (bmsafemap->sm_cg == cg)
 5158                         break;
 5159         if (bmsafemap) {
 5160                 *bmsafemapp = bmsafemap;
 5161                 return (1);
 5162         }
 5163         *bmsafemapp = NULL;
 5164 
 5165         return (0);
 5166 }
 5167 
 5168 /*
 5169  * Find the bmsafemap associated with a cylinder group buffer.
 5170  * If none exists, create one.  The buffer must be locked and the
 5171  * softdep lock must be held when this routine is called.  To avoid
 5172  * giving up the lock while
 5173  * allocating a new bmsafemap, a preallocated bmsafemap may be
 5174  * provided. If it is provided but not needed, it is freed.
 5175  */
 5176 static struct bmsafemap *
 5177 bmsafemap_lookup(mp, bp, cg, newbmsafemap)
 5178         struct mount *mp;
 5179         struct buf *bp;
 5180         int cg;
 5181         struct bmsafemap *newbmsafemap;
 5182 {
 5183         struct bmsafemap_hashhead *bmsafemaphd;
 5184         struct bmsafemap *bmsafemap, *collision;
 5185         struct worklist *wk;
 5186         struct ufsmount *ump;
 5187 
 5188         ump = VFSTOUFS(mp);
 5189         LOCK_OWNED(ump);
 5190         KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
 5191         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 5192                 if (wk->wk_type == D_BMSAFEMAP) {
 5193                         if (newbmsafemap)
 5194                                 WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
 5195                         return (WK_BMSAFEMAP(wk));
 5196                 }
 5197         }
 5198         bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
 5199         if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
 5200                 if (newbmsafemap)
 5201                         WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
 5202                 return (bmsafemap);
 5203         }
 5204         if (newbmsafemap) {
 5205                 bmsafemap = newbmsafemap;
 5206         } else {
 5207                 FREE_LOCK(ump);
 5208                 bmsafemap = malloc(sizeof(struct bmsafemap),
 5209                         M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 5210                 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
 5211                 ACQUIRE_LOCK(ump);
 5212         }
 5213         bmsafemap->sm_buf = bp;
 5214         LIST_INIT(&bmsafemap->sm_inodedephd);
 5215         LIST_INIT(&bmsafemap->sm_inodedepwr);
 5216         LIST_INIT(&