FreeBSD/Linux Kernel Cross Reference
sys/ufs/ffs/ffs_softdep.c

    1 /*-
    2  * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
    3  *
    4  * The soft updates code is derived from the appendix of a University
    5  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
    6  * "Soft Updates: A Solution to the Metadata Update Problem in File
    7  * Systems", CSE-TR-254-95, August 1995).
    8  *
    9  * Further information about soft updates can be obtained from:
   10  *
   11  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
   12  *      1614 Oxford Street              mckusick@mckusick.com
   13  *      Berkeley, CA 94709-1608         +1-510-843-9542
   14  *      USA
   15  *
   16  * Redistribution and use in source and binary forms, with or without
   17  * modification, are permitted provided that the following conditions
   18  * are met:
   19  *
   20  * 1. Redistributions of source code must retain the above copyright
   21  *    notice, this list of conditions and the following disclaimer.
   22  * 2. Redistributions in binary form must reproduce the above copyright
   23  *    notice, this list of conditions and the following disclaimer in the
   24  *    documentation and/or other materials provided with the distribution.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
   27  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   28  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   29  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
   30  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD$");
   43 
   44 /*
   45  * For now we want the safety net that the DEBUG flag provides.
   46  */
   47 #ifndef DEBUG
   48 #define DEBUG
   49 #endif
   50 
   51 #include <sys/param.h>
   52 #include <sys/kernel.h>
   53 #include <sys/systm.h>
   54 #include <sys/bio.h>
   55 #include <sys/buf.h>
   56 #include <sys/kdb.h>
   57 #include <sys/kthread.h>
   58 #include <sys/lock.h>
   59 #include <sys/malloc.h>
   60 #include <sys/mount.h>
   61 #include <sys/mutex.h>
   62 #include <sys/proc.h>
   63 #include <sys/stat.h>
   64 #include <sys/sysctl.h>
   65 #include <sys/syslog.h>
   66 #include <sys/vnode.h>
   67 #include <sys/conf.h>
   68 #include <ufs/ufs/dir.h>
   69 #include <ufs/ufs/extattr.h>
   70 #include <ufs/ufs/quota.h>
   71 #include <ufs/ufs/inode.h>
   72 #include <ufs/ufs/ufsmount.h>
   73 #include <ufs/ffs/fs.h>
   74 #include <ufs/ffs/softdep.h>
   75 #include <ufs/ffs/ffs_extern.h>
   76 #include <ufs/ufs/ufs_extern.h>
   77 
   78 #include <vm/vm.h>
   79 
   80 #include "opt_ffs.h"
   81 #include "opt_quota.h"
   82 
   83 #ifndef SOFTUPDATES
   84 
   85 int
   86 softdep_flushfiles(oldmnt, flags, td)
   87         struct mount *oldmnt;
   88         int flags;
   89         struct thread *td;
   90 {
   91 
   92         panic("softdep_flushfiles called");
   93 }
   94 
   95 int
   96 softdep_mount(devvp, mp, fs, cred)
   97         struct vnode *devvp;
   98         struct mount *mp;
   99         struct fs *fs;
  100         struct ucred *cred;
  101 {
  102 
  103         return (0);
  104 }
  105 
  106 void 
  107 softdep_initialize()
  108 {
  109 
  110         return;
  111 }
  112 
  113 void
  114 softdep_uninitialize()
  115 {
  116 
  117         return;
  118 }
  119 
  120 void
  121 softdep_setup_inomapdep(bp, ip, newinum)
  122         struct buf *bp;
  123         struct inode *ip;
  124         ino_t newinum;
  125 {
  126 
  127         panic("softdep_setup_inomapdep called");
  128 }
  129 
  130 void
  131 softdep_setup_blkmapdep(bp, mp, newblkno)
  132         struct buf *bp;
  133         struct mount *mp;
  134         ufs2_daddr_t newblkno;
  135 {
  136 
  137         panic("softdep_setup_blkmapdep called");
  138 }
  139 
  140 void 
  141 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
  142         struct inode *ip;
  143         ufs_lbn_t lbn;
  144         ufs2_daddr_t newblkno;
  145         ufs2_daddr_t oldblkno;
  146         long newsize;
  147         long oldsize;
  148         struct buf *bp;
  149 {
  150         
  151         panic("softdep_setup_allocdirect called");
  152 }
  153 
  154 void 
  155 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
  156         struct inode *ip;
  157         ufs_lbn_t lbn;
  158         ufs2_daddr_t newblkno;
  159         ufs2_daddr_t oldblkno;
  160         long newsize;
  161         long oldsize;
  162         struct buf *bp;
  163 {
  164         
  165         panic("softdep_setup_allocext called");
  166 }
  167 
  168 void
  169 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
  170         struct inode *ip;
  171         ufs_lbn_t lbn;
  172         struct buf *bp;
  173         int ptrno;
  174         ufs2_daddr_t newblkno;
  175         ufs2_daddr_t oldblkno;
  176         struct buf *nbp;
  177 {
  178 
  179         panic("softdep_setup_allocindir_page called");
  180 }
  181 
  182 void
  183 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
  184         struct buf *nbp;
  185         struct inode *ip;
  186         struct buf *bp;
  187         int ptrno;
  188         ufs2_daddr_t newblkno;
  189 {
  190 
  191         panic("softdep_setup_allocindir_meta called");
  192 }
  193 
  194 void
  195 softdep_setup_freeblocks(ip, length, flags)
  196         struct inode *ip;
  197         off_t length;
  198         int flags;
  199 {
  200         
  201         panic("softdep_setup_freeblocks called");
  202 }
  203 
  204 void
  205 softdep_freefile(pvp, ino, mode)
  206                 struct vnode *pvp;
  207                 ino_t ino;
  208                 int mode;
  209 {
  210 
  211         panic("softdep_freefile called");
  212 }
  213 
  214 int 
  215 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
  216         struct buf *bp;
  217         struct inode *dp;
  218         off_t diroffset;
  219         ino_t newinum;
  220         struct buf *newdirbp;
  221         int isnewblk;
  222 {
  223 
  224         panic("softdep_setup_directory_add called");
  225 }
  226 
  227 void 
  228 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
  229         struct inode *dp;
  230         caddr_t base;
  231         caddr_t oldloc;
  232         caddr_t newloc;
  233         int entrysize;
  234 {
  235 
  236         panic("softdep_change_directoryentry_offset called");
  237 }
  238 
  239 void 
  240 softdep_setup_remove(bp, dp, ip, isrmdir)
  241         struct buf *bp;
  242         struct inode *dp;
  243         struct inode *ip;
  244         int isrmdir;
  245 {
  246         
  247         panic("softdep_setup_remove called");
  248 }
  249 
  250 void 
  251 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
  252         struct buf *bp;
  253         struct inode *dp;
  254         struct inode *ip;
  255         ino_t newinum;
  256         int isrmdir;
  257 {
  258 
  259         panic("softdep_setup_directory_change called");
  260 }
  261 
  262 void
  263 softdep_change_linkcnt(ip)
  264         struct inode *ip;
  265 {
  266 
  267         panic("softdep_change_linkcnt called");
  268 }
  269 
  270 void 
  271 softdep_load_inodeblock(ip)
  272         struct inode *ip;
  273 {
  274 
  275         panic("softdep_load_inodeblock called");
  276 }
  277 
  278 void 
  279 softdep_update_inodeblock(ip, bp, waitfor)
  280         struct inode *ip;
  281         struct buf *bp;
  282         int waitfor;
  283 {
  284 
  285         panic("softdep_update_inodeblock called");
  286 }
  287 
  288 int
  289 softdep_fsync(vp)
  290         struct vnode *vp;       /* the "in_core" copy of the inode */
  291 {
  292 
  293         return (0);
  294 }
  295 
  296 void
  297 softdep_fsync_mountdev(vp)
  298         struct vnode *vp;
  299 {
  300 
  301         return;
  302 }
  303 
  304 int
  305 softdep_flushworklist(oldmnt, countp, td)
  306         struct mount *oldmnt;
  307         int *countp;
  308         struct thread *td;
  309 {
  310 
  311         *countp = 0;
  312         return (0);
  313 }
  314 
  315 int
  316 softdep_sync_metadata(struct vnode *vp)
  317 {
  318 
  319         return (0);
  320 }
  321 
  322 int
  323 softdep_slowdown(vp)
  324         struct vnode *vp;
  325 {
  326 
  327         panic("softdep_slowdown called");
  328 }
  329 
  330 void
  331 softdep_releasefile(ip)
  332         struct inode *ip;       /* inode with the zero effective link count */
  333 {
  334 
  335         panic("softdep_releasefile called");
  336 }
  337 
  338 int
  339 softdep_request_cleanup(fs, vp)
  340         struct fs *fs;
  341         struct vnode *vp;
  342 {
  343 
  344         return (0);
  345 }
  346 
  347 int
  348 softdep_check_suspend(struct mount *mp,
  349                       struct vnode *devvp,
  350                       int softdep_deps,
  351                       int softdep_accdeps,
  352                       int secondary_writes,
  353                       int secondary_accwrites)
  354 {
  355         struct bufobj *bo;
  356         int error;
  357         
   358         (void) softdep_deps;
  359         (void) softdep_accdeps;
  360 
  361         ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
  362         bo = &devvp->v_bufobj;
  363 
  364         for (;;) {
  365                 if (!MNT_ITRYLOCK(mp)) {
  366                         VI_UNLOCK(devvp);
  367                         MNT_ILOCK(mp);
  368                         MNT_IUNLOCK(mp);
  369                         VI_LOCK(devvp);
  370                         continue;
  371                 }
  372                 if (mp->mnt_secondary_writes != 0) {
  373                         VI_UNLOCK(devvp);
  374                         msleep(&mp->mnt_secondary_writes,
  375                                MNT_MTX(mp),
  376                                (PUSER - 1) | PDROP, "secwr", 0);
  377                         VI_LOCK(devvp);
  378                         continue;
  379                 }
  380                 break;
  381         }
  382 
  383         /*
  384          * Reasons for needing more work before suspend:
  385          * - Dirty buffers on devvp.
  386          * - Secondary writes occurred after start of vnode sync loop
  387          */
  388         error = 0;
  389         if (bo->bo_numoutput > 0 ||
  390             bo->bo_dirty.bv_cnt > 0 ||
  391             secondary_writes != 0 ||
  392             mp->mnt_secondary_writes != 0 ||
  393             secondary_accwrites != mp->mnt_secondary_accwrites)
  394                 error = EAGAIN;
  395         VI_UNLOCK(devvp);
  396         return (error);
  397 }
  398 
  399 void
  400 softdep_get_depcounts(struct mount *mp,
  401                       int *softdepactivep,
  402                       int *softdepactiveaccp)
  403 {
  404         (void) mp;
  405         *softdepactivep = 0;
  406         *softdepactiveaccp = 0;
  407 }
  408 
  409 #else
  410 /*
  411  * These definitions need to be adapted to the system to which
  412  * this file is being ported.
  413  */
  414 /*
  415  * malloc types defined for the softdep system.
  416  */
  417 static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
  418 static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
  419 static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
  420 static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
  421 static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
  422 static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
  423 static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
  424 static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
  425 static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
  426 static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
  427 static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
  428 static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
  429 static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
  430 static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
  431 static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
  432 
  433 #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
  434 
  435 #define D_PAGEDEP       0
  436 #define D_INODEDEP      1
  437 #define D_NEWBLK        2
  438 #define D_BMSAFEMAP     3
  439 #define D_ALLOCDIRECT   4
  440 #define D_INDIRDEP      5
  441 #define D_ALLOCINDIR    6
  442 #define D_FREEFRAG      7
  443 #define D_FREEBLKS      8
  444 #define D_FREEFILE      9
  445 #define D_DIRADD        10
  446 #define D_MKDIR         11
  447 #define D_DIRREM        12
  448 #define D_NEWDIRBLK     13
  449 #define D_LAST          D_NEWDIRBLK
  450 
  451 /* 
  452  * translate from workitem type to memory type
  453  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
  454  */
  455 static struct malloc_type *memtype[] = {
  456         M_PAGEDEP,
  457         M_INODEDEP,
  458         M_NEWBLK,
  459         M_BMSAFEMAP,
  460         M_ALLOCDIRECT,
  461         M_INDIRDEP,
  462         M_ALLOCINDIR,
  463         M_FREEFRAG,
  464         M_FREEBLKS,
  465         M_FREEFILE,
  466         M_DIRADD,
  467         M_MKDIR,
  468         M_DIRREM,
  469         M_NEWDIRBLK
  470 };
  471 
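       /*
        * Illustrative compile-time sanity check (a sketch): memtype[] must
        * stay in lockstep with the D_* defines above.  CTASSERT() from
        * <sys/systm.h> is assumed to be usable at file scope here.
        */
       CTASSERT(sizeof(memtype) / sizeof(memtype[0]) == D_LAST + 1);
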
  472 #define DtoM(type) (memtype[type])
  473 
  474 /*
  475  * Names of malloc types.
  476  */
  477 #define TYPENAME(type)  \
   478         ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
  479 /*
  480  * End system adaptation definitions.
  481  */
  482 
  483 /*
  484  * Forward declarations.
  485  */
  486 struct inodedep_hashhead;
  487 struct newblk_hashhead;
  488 struct pagedep_hashhead;
  489 
  490 /*
  491  * Internal function prototypes.
  492  */
  493 static  void softdep_error(char *, int);
  494 static  void drain_output(struct vnode *);
  495 static  struct buf *getdirtybuf(struct buf *, struct mtx *, int);
  496 static  void clear_remove(struct thread *);
  497 static  void clear_inodedeps(struct thread *);
  498 static  int flush_pagedep_deps(struct vnode *, struct mount *,
  499             struct diraddhd *);
  500 static  int flush_inodedep_deps(struct mount *, ino_t);
  501 static  int flush_deplist(struct allocdirectlst *, int, int *);
  502 static  int handle_written_filepage(struct pagedep *, struct buf *);
  503 static  void diradd_inode_written(struct diradd *, struct inodedep *);
  504 static  int handle_written_inodeblock(struct inodedep *, struct buf *);
  505 static  void handle_allocdirect_partdone(struct allocdirect *);
  506 static  void handle_allocindir_partdone(struct allocindir *);
  507 static  void initiate_write_filepage(struct pagedep *, struct buf *);
  508 static  void handle_written_mkdir(struct mkdir *, int);
  509 static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
  510 static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
  511 static  void handle_workitem_freefile(struct freefile *);
  512 static  void handle_workitem_remove(struct dirrem *, struct vnode *);
  513 static  struct dirrem *newdirrem(struct buf *, struct inode *,
  514             struct inode *, int, struct dirrem **);
  515 static  void free_diradd(struct diradd *);
  516 static  void free_allocindir(struct allocindir *, struct inodedep *);
  517 static  void free_newdirblk(struct newdirblk *);
  518 static  int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
  519             ufs2_daddr_t *);
  520 static  void deallocate_dependencies(struct buf *, struct inodedep *);
  521 static  void free_allocdirect(struct allocdirectlst *,
  522             struct allocdirect *, int);
  523 static  int check_inode_unwritten(struct inodedep *);
  524 static  int free_inodedep(struct inodedep *);
  525 static  void handle_workitem_freeblocks(struct freeblks *, int);
  526 static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
  527 static  void setup_allocindir_phase2(struct buf *, struct inode *,
  528             struct allocindir *);
  529 static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
  530             ufs2_daddr_t);
  531 static  void handle_workitem_freefrag(struct freefrag *);
  532 static  struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
  533 static  void allocdirect_merge(struct allocdirectlst *,
  534             struct allocdirect *, struct allocdirect *);
  535 static  struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
  536 static  int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
  537             struct newblk **);
  538 static  int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
  539 static  int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
  540             struct inodedep **);
  541 static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
  542 static  int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
  543 static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
  544             struct mount *mp, int, struct pagedep **);
  545 static  void pause_timer(void *);
  546 static  int request_cleanup(struct mount *, int);
  547 static  int process_worklist_item(struct mount *, int);
  548 static  void add_to_worklist(struct worklist *);
  549 static  void softdep_flush(void);
  550 static  int softdep_speedup(void);
  551 
  552 /*
  553  * Exported softdep operations.
  554  */
  555 static  void softdep_disk_io_initiation(struct buf *);
  556 static  void softdep_disk_write_complete(struct buf *);
  557 static  void softdep_deallocate_dependencies(struct buf *);
  558 static  int softdep_count_dependencies(struct buf *bp, int);
  559 
  560 static struct mtx lk;
  561 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
  562 
  563 #define TRY_ACQUIRE_LOCK(lk)            mtx_trylock(lk)
  564 #define ACQUIRE_LOCK(lk)                mtx_lock(lk)
  565 #define FREE_LOCK(lk)                   mtx_unlock(lk)
  566 
  567 /*
  568  * Worklist queue management.
  569  * These routines require that the lock be held.
  570  */
  571 #ifndef /* NOT */ DEBUG
  572 #define WORKLIST_INSERT(head, item) do {        \
  573         (item)->wk_state |= ONWORKLIST;         \
  574         LIST_INSERT_HEAD(head, item, wk_list);  \
  575 } while (0)
  576 #define WORKLIST_REMOVE(item) do {              \
  577         (item)->wk_state &= ~ONWORKLIST;        \
  578         LIST_REMOVE(item, wk_list);             \
  579 } while (0)
  580 #else /* DEBUG */
  581 static  void worklist_insert(struct workhead *, struct worklist *);
  582 static  void worklist_remove(struct worklist *);
  583 
  584 #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
  585 #define WORKLIST_REMOVE(item) worklist_remove(item)
  586 
  587 static void
  588 worklist_insert(head, item)
  589         struct workhead *head;
  590         struct worklist *item;
  591 {
  592 
  593         mtx_assert(&lk, MA_OWNED);
  594         if (item->wk_state & ONWORKLIST)
  595                 panic("worklist_insert: already on list");
  596         item->wk_state |= ONWORKLIST;
  597         LIST_INSERT_HEAD(head, item, wk_list);
  598 }
  599 
  600 static void
  601 worklist_remove(item)
  602         struct worklist *item;
  603 {
  604 
  605         mtx_assert(&lk, MA_OWNED);
  606         if ((item->wk_state & ONWORKLIST) == 0)
  607                 panic("worklist_remove: not on list");
  608         item->wk_state &= ~ONWORKLIST;
  609         LIST_REMOVE(item, wk_list);
  610 }
  611 #endif /* DEBUG */
  612 
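       /*
        * Illustrative use of the macros above (a sketch; real callers appear
        * later in this file).  Both forms require the softdep lock, so a
        * typical caller brackets them as:
        *
        *      ACQUIRE_LOCK(&lk);
        *      WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
        *      FREE_LOCK(&lk);
        *
        * attaching the pagedep's worklist entry to the buffer's dependency
        * list.
        */
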
  613 /*
  614  * Routines for tracking and managing workitems.
  615  */
  616 static  void workitem_free(struct worklist *, int);
  617 static  void workitem_alloc(struct worklist *, int, struct mount *);
  618 
  619 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
  620 
  621 static void
  622 workitem_free(item, type)
  623         struct worklist *item;
  624         int type;
  625 {
  626         struct ufsmount *ump;
  627         mtx_assert(&lk, MA_OWNED);
  628 
  629 #ifdef DEBUG
  630         if (item->wk_state & ONWORKLIST)
  631                 panic("workitem_free: still on list");
  632         if (item->wk_type != type)
  633                 panic("workitem_free: type mismatch");
  634 #endif
  635         ump = VFSTOUFS(item->wk_mp);
  636         if (--ump->softdep_deps == 0 && ump->softdep_req)
  637                 wakeup(&ump->softdep_deps);
  638         FREE(item, DtoM(type));
  639 }
  640 
  641 static void
  642 workitem_alloc(item, type, mp)
  643         struct worklist *item;
  644         int type;
  645         struct mount *mp;
  646 {
  647         item->wk_type = type;
  648         item->wk_mp = mp;
  649         item->wk_state = 0;
  650         ACQUIRE_LOCK(&lk);
  651         VFSTOUFS(mp)->softdep_deps++;
  652         VFSTOUFS(mp)->softdep_accdeps++;
  653         FREE_LOCK(&lk);
  654 }
  655 
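       /*
        * Illustrative lifecycle of a work item (a sketch; the freefrag used
        * here is just an example, see the newfreefrag() prototype above):
        *
        *      struct freefrag *freefrag;
        *
        *      MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
        *          M_FREEFRAG, M_SOFTDEP_FLAGS);
        *      workitem_alloc(&freefrag->ff_list, D_FREEFRAG, mp);
        *      ...
        *      ACQUIRE_LOCK(&lk);
        *      WORKITEM_FREE(freefrag, D_FREEFRAG);
        *      FREE_LOCK(&lk);
        *
        * workitem_alloc() charges the dependency to the mount point;
        * WORKITEM_FREE() releases the memory and wakes any waiters once the
        * per-mount dependency count drains to zero.
        */
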
  656 /*
  657  * Workitem queue management
  658  */
  659 static int max_softdeps;        /* maximum number of structs before slowdown */
  660 static int maxindirdeps = 50;   /* max number of indirdeps before slowdown */
  661 static int tickdelay = 2;       /* number of ticks to pause during slowdown */
  662 static int proc_waiting;        /* tracks whether we have a timeout posted */
  663 static int *stat_countp;        /* statistic to count in proc_waiting timeout */
  664 static struct callout_handle handle; /* handle on posted proc_waiting timeout */
  665 static int req_pending;
  666 static int req_clear_inodedeps; /* syncer process flush some inodedeps */
  667 #define FLUSH_INODES            1
  668 static int req_clear_remove;    /* syncer process flush some freeblks */
  669 #define FLUSH_REMOVE            2
  670 #define FLUSH_REMOVE_WAIT       3
  671 static long num_freeblkdep;     /* number of freeblks workitems allocated */
  672 
  673 /*
  674  * runtime statistics
  675  */
  676 static int stat_worklist_push;  /* number of worklist cleanups */
  677 static int stat_blk_limit_push; /* number of times block limit neared */
  678 static int stat_ino_limit_push; /* number of times inode limit neared */
  679 static int stat_blk_limit_hit;  /* number of times block slowdown imposed */
  680 static int stat_ino_limit_hit;  /* number of times inode slowdown imposed */
  681 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
  682 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
  683 static int stat_inode_bitmap;   /* bufs redirtied as inode bitmap not written */
  684 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
  685 static int stat_dir_entry;      /* bufs redirtied as dir entry cannot write */
  686 
  687 SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
  688 SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
  689 SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
  690 SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
  691 SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
  692 SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
  693 SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
  694 SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
  695 SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
  696 SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
  697 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
  698 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
  699 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
  700 /* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */
  701 
  702 SYSCTL_DECL(_vfs_ffs);
  703 
  704 static int compute_summary_at_mount = 0;        /* Whether to recompute the summary at mount time */
  705 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
  706            &compute_summary_at_mount, 0, "Recompute summary at mount");
  707 
  708 static struct proc *softdepproc;
  709 static struct kproc_desc softdep_kp = {
  710         "softdepflush",
  711         softdep_flush,
  712         &softdepproc
  713 };
  714 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, &softdep_kp)
  715 
  716 static void
  717 softdep_flush(void)
  718 {
  719         struct mount *nmp;
  720         struct mount *mp;
  721         struct ufsmount *ump;
  722         struct thread *td;
  723         int remaining;
  724         int vfslocked;
  725 
  726         td = curthread;
  727         td->td_pflags |= TDP_NORUNNINGBUF;
  728 
  729         for (;;) {      
  730                 kthread_suspend_check(softdepproc);
  731                 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
  732                 ACQUIRE_LOCK(&lk);
  733                 /*
  734                  * If requested, try removing inode or removal dependencies.
  735                  */
  736                 if (req_clear_inodedeps) {
  737                         clear_inodedeps(td);
  738                         req_clear_inodedeps -= 1;
  739                         wakeup_one(&proc_waiting);
  740                 }
  741                 if (req_clear_remove) {
  742                         clear_remove(td);
  743                         req_clear_remove -= 1;
  744                         wakeup_one(&proc_waiting);
  745                 }
  746                 FREE_LOCK(&lk);
  747                 VFS_UNLOCK_GIANT(vfslocked);
  748                 remaining = 0;
  749                 mtx_lock(&mountlist_mtx);
  750                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
  751                         nmp = TAILQ_NEXT(mp, mnt_list);
  752                         if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
  753                                 continue;
  754                         if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
  755                                 continue;
  756                         vfslocked = VFS_LOCK_GIANT(mp);
  757                         softdep_process_worklist(mp, 0);
  758                         ump = VFSTOUFS(mp);
  759                         remaining += ump->softdep_on_worklist -
  760                                 ump->softdep_on_worklist_inprogress;
  761                         VFS_UNLOCK_GIANT(vfslocked);
  762                         mtx_lock(&mountlist_mtx);
  763                         nmp = TAILQ_NEXT(mp, mnt_list);
  764                         vfs_unbusy(mp, td);
  765                 }
  766                 mtx_unlock(&mountlist_mtx);
  767                 if (remaining)
  768                         continue;
  769                 ACQUIRE_LOCK(&lk);
  770                 if (!req_pending)
  771                         msleep(&req_pending, &lk, PVM, "sdflush", hz);
  772                 req_pending = 0;
  773                 FREE_LOCK(&lk);
  774         }
  775 }
  776 
  777 static int
  778 softdep_speedup(void)
  779 {
  780 
  781         mtx_assert(&lk, MA_OWNED);
  782         if (req_pending == 0) {
  783                 req_pending = 1;
  784                 wakeup(&req_pending);
  785         }
  786 
  787         return speedup_syncer();
  788 }
  789 
  790 /*
  791  * Add an item to the end of the work queue.
  792  * This routine requires that the lock be held.
  793  * This is the only routine that adds items to the list.
  794  * The following routine is the only one that removes items
  795  * and does so in order from first to last.
  796  */
  797 static void
  798 add_to_worklist(wk)
  799         struct worklist *wk;
  800 {
  801         struct ufsmount *ump;
  802 
  803         mtx_assert(&lk, MA_OWNED);
  804         ump = VFSTOUFS(wk->wk_mp);
  805         if (wk->wk_state & ONWORKLIST)
  806                 panic("add_to_worklist: already on list");
  807         wk->wk_state |= ONWORKLIST;
  808         if (LIST_EMPTY(&ump->softdep_workitem_pending))
  809                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
  810         else
  811                 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
  812         ump->softdep_worklist_tail = wk;
  813         ump->softdep_on_worklist += 1;
  814 }
  815 
  816 /*
  817  * Process that runs once per second to handle items in the background queue.
  818  *
   819  * Note that we ensure that items are processed in the order in which they
  820  * appear in the queue. The code below depends on this property to ensure
  821  * that blocks of a file are freed before the inode itself is freed. This
  822  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
  823  * until all the old ones have been purged from the dependency lists.
  824  */
  825 int 
  826 softdep_process_worklist(mp, full)
  827         struct mount *mp;
  828         int full;
  829 {
  830         struct thread *td = curthread;
  831         int cnt, matchcnt, loopcount;
  832         struct ufsmount *ump;
  833         long starttime;
  834 
  835         KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
  836         /*
  837          * Record the process identifier of our caller so that we can give
  838          * this process preferential treatment in request_cleanup below.
  839          */
  840         matchcnt = 0;
  841         ump = VFSTOUFS(mp);
  842         ACQUIRE_LOCK(&lk);
  843         loopcount = 1;
  844         starttime = time_second;
  845         while (ump->softdep_on_worklist > 0) {
  846                 if ((cnt = process_worklist_item(mp, 0)) == -1)
  847                         break;
  848                 else
  849                         matchcnt += cnt;
  850                 /*
  851                  * If requested, try removing inode or removal dependencies.
  852                  */
  853                 if (req_clear_inodedeps) {
  854                         clear_inodedeps(td);
  855                         req_clear_inodedeps -= 1;
  856                         wakeup_one(&proc_waiting);
  857                 }
  858                 if (req_clear_remove) {
  859                         clear_remove(td);
  860                         req_clear_remove -= 1;
  861                         wakeup_one(&proc_waiting);
  862                 }
  863                 /*
  864                  * We do not generally want to stop for buffer space, but if
  865                  * we are really being a buffer hog, we will stop and wait.
  866                  */
  867                 if (loopcount++ % 128 == 0) {
  868                         FREE_LOCK(&lk);
  869                         bwillwrite();
  870                         ACQUIRE_LOCK(&lk);
  871                 }
  872                 /*
  873                  * Never allow processing to run for more than one
  874                  * second. Otherwise the other mountpoints may get
  875                  * excessively backlogged.
  876                  */
  877                 if (!full && starttime != time_second) {
  878                         matchcnt = -1;
  879                         break;
  880                 }
  881         }
  882         FREE_LOCK(&lk);
  883         return (matchcnt);
  884 }
  885 
  886 /*
  887  * Process one item on the worklist.
  888  */
  889 static int
  890 process_worklist_item(mp, flags)
  891         struct mount *mp;
  892         int flags;
  893 {
  894         struct worklist *wk, *wkend;
  895         struct ufsmount *ump;
  896         struct vnode *vp;
  897         int matchcnt = 0;
  898 
  899         mtx_assert(&lk, MA_OWNED);
  900         KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
  901         /*
  902          * If we are being called because of a process doing a
  903          * copy-on-write, then it is not safe to write as we may
  904          * recurse into the copy-on-write routine.
  905          */
  906         if (curthread->td_pflags & TDP_COWINPROGRESS)
  907                 return (-1);
  908         /*
  909          * Normally we just process each item on the worklist in order.
  910          * However, if we are in a situation where we cannot lock any
  911          * inodes, we have to skip over any dirrem requests whose
  912          * vnodes are resident and locked.
  913          */
  914         ump = VFSTOUFS(mp);
  915         vp = NULL;
  916         LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
  917                 if (wk->wk_state & INPROGRESS)
  918                         continue;
  919                 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
  920                         break;
  921                 wk->wk_state |= INPROGRESS;
  922                 ump->softdep_on_worklist_inprogress++;
  923                 FREE_LOCK(&lk);
  924                 ffs_vget(mp, WK_DIRREM(wk)->dm_oldinum,
  925                     LK_NOWAIT | LK_EXCLUSIVE, &vp);
  926                 ACQUIRE_LOCK(&lk);
  927                 wk->wk_state &= ~INPROGRESS;
  928                 ump->softdep_on_worklist_inprogress--;
  929                 if (vp != NULL)
  930                         break;
  931         }
   932         if (wk == NULL)
  933                 return (-1);
  934         /*
  935          * Remove the item to be processed. If we are removing the last
  936          * item on the list, we need to recalculate the tail pointer.
  937          * As this happens rarely and usually when the list is short,
  938          * we just run down the list to find it rather than tracking it
  939          * in the above loop.
  940          */
  941         WORKLIST_REMOVE(wk);
  942         if (wk == ump->softdep_worklist_tail) {
  943                 LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
  944                         if (LIST_NEXT(wkend, wk_list) == NULL)
  945                                 break;
  946                 ump->softdep_worklist_tail = wkend;
  947         }
  948         ump->softdep_on_worklist -= 1;
  949         FREE_LOCK(&lk);
  950         if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
  951                 panic("process_worklist_item: suspended filesystem");
  952         matchcnt++;
  953         switch (wk->wk_type) {
  954 
  955         case D_DIRREM:
  956                 /* removal of a directory entry */
  957                 handle_workitem_remove(WK_DIRREM(wk), vp);
  958                 break;
  959 
  960         case D_FREEBLKS:
  961                 /* releasing blocks and/or fragments from a file */
  962                 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
  963                 break;
  964 
  965         case D_FREEFRAG:
  966                 /* releasing a fragment when replaced as a file grows */
  967                 handle_workitem_freefrag(WK_FREEFRAG(wk));
  968                 break;
  969 
  970         case D_FREEFILE:
  971                 /* releasing an inode when its link count drops to 0 */
  972                 handle_workitem_freefile(WK_FREEFILE(wk));
  973                 break;
  974 
  975         default:
  976                 panic("%s_process_worklist: Unknown type %s",
  977                     "softdep", TYPENAME(wk->wk_type));
  978                 /* NOTREACHED */
  979         }
  980         vn_finished_secondary_write(mp);
  981         ACQUIRE_LOCK(&lk);
  982         return (matchcnt);
  983 }
  984 
  985 /*
  986  * Move dependencies from one buffer to another.
  987  */
  988 void
  989 softdep_move_dependencies(oldbp, newbp)
  990         struct buf *oldbp;
  991         struct buf *newbp;
  992 {
  993         struct worklist *wk, *wktail;
  994 
  995         if (!LIST_EMPTY(&newbp->b_dep))
  996                 panic("softdep_move_dependencies: need merge code");
   997         wktail = NULL;
  998         ACQUIRE_LOCK(&lk);
  999         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
 1000                 LIST_REMOVE(wk, wk_list);
  1001                 if (wktail == NULL)
 1002                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
 1003                 else
 1004                         LIST_INSERT_AFTER(wktail, wk, wk_list);
 1005                 wktail = wk;
 1006         }
 1007         FREE_LOCK(&lk);
 1008 }
 1009 
 1010 /*
 1011  * Purge the work list of all items associated with a particular mount point.
 1012  */
 1013 int
 1014 softdep_flushworklist(oldmnt, countp, td)
 1015         struct mount *oldmnt;
 1016         int *countp;
 1017         struct thread *td;
 1018 {
 1019         struct vnode *devvp;
 1020         int count, error = 0;
 1021         struct ufsmount *ump;
 1022 
 1023         /*
 1024          * Alternately flush the block device associated with the mount
 1025          * point and process any dependencies that the flushing
 1026          * creates. We continue until no more worklist dependencies
 1027          * are found.
 1028          */
 1029         *countp = 0;
 1030         ump = VFSTOUFS(oldmnt);
 1031         devvp = ump->um_devvp;
 1032         while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
 1033                 *countp += count;
 1034                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 1035                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
 1036                 VOP_UNLOCK(devvp, 0, td);
 1037                 if (error)
 1038                         break;
 1039         }
 1040         return (error);
 1041 }
 1042 
 1043 int
 1044 softdep_waitidle(struct mount *mp)
 1045 {
 1046         struct ufsmount *ump;
 1047         int error;
 1048         int i;
 1049 
 1050         ump = VFSTOUFS(mp);
 1051         ACQUIRE_LOCK(&lk);
 1052         for (i = 0; i < 10 && ump->softdep_deps; i++) {
 1053                 ump->softdep_req = 1;
 1054                 if (ump->softdep_on_worklist)
 1055                         panic("softdep_waitidle: work added after flush.");
 1056                 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
 1057         }
 1058         ump->softdep_req = 0;
 1059         FREE_LOCK(&lk);
 1060         error = 0;
 1061         if (i == 10) {
 1062                 error = EBUSY;
 1063                 printf("softdep_waitidle: Failed to flush worklist for %p\n",
 1064                     mp);
 1065         }
 1066 
 1067         return (error);
 1068 }
 1069 
 1070 /*
 1071  * Flush all vnodes and worklist items associated with a specified mount point.
 1072  */
 1073 int
 1074 softdep_flushfiles(oldmnt, flags, td)
 1075         struct mount *oldmnt;
 1076         int flags;
 1077         struct thread *td;
 1078 {
 1079         int error, count, loopcnt;
 1080 
 1081         error = 0;
 1082 
 1083         /*
 1084          * Alternately flush the vnodes associated with the mount
 1085          * point and process any dependencies that the flushing
 1086          * creates. In theory, this loop can happen at most twice,
 1087          * but we give it a few extra just to be sure.
 1088          */
 1089         for (loopcnt = 10; loopcnt > 0; loopcnt--) {
 1090                 /*
 1091                  * Do another flush in case any vnodes were brought in
 1092                  * as part of the cleanup operations.
 1093                  */
 1094                 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
 1095                         break;
 1096                 if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
 1097                     count == 0)
 1098                         break;
 1099         }
 1100         /*
 1101          * If we are unmounting then it is an error to fail. If we
 1102          * are simply trying to downgrade to read-only, then filesystem
 1103          * activity can keep us busy forever, so we just fail with EBUSY.
 1104          */
 1105         if (loopcnt == 0) {
 1106                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
 1107                         panic("softdep_flushfiles: looping");
 1108                 error = EBUSY;
 1109         }
 1110         if (!error)
 1111                 error = softdep_waitidle(oldmnt);
 1112         return (error);
 1113 }
 1114 
 1115 /*
 1116  * Structure hashing.
 1117  * 
 1118  * There are three types of structures that can be looked up:
 1119  *      1) pagedep structures identified by mount point, inode number,
 1120  *         and logical block.
 1121  *      2) inodedep structures identified by mount point and inode number.
 1122  *      3) newblk structures identified by mount point and
 1123  *         physical block number.
 1124  *
 1125  * The "pagedep" and "inodedep" dependency structures are hashed
 1126  * separately from the file blocks and inodes to which they correspond.
 1127  * This separation helps when the in-memory copy of an inode or
 1128  * file block must be replaced. It also obviates the need to access
 1129  * an inode or file page when simply updating (or de-allocating)
 1130  * dependency structures. Lookup of newblk structures is needed to
 1131  * find newly allocated blocks when trying to associate them with
 1132  * their allocdirect or allocindir structure.
 1133  *
 1134  * The lookup routines optionally create and hash a new instance when
 1135  * an existing entry is not found.
 1136  */
 1137 #define DEPALLOC        0x0001  /* allocate structure if lookup fails */
 1138 #define NODELAY         0x0002  /* cannot do background work */
 1139 
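       /*
        * Illustrative lookup-or-allocate idiom shared by the *_lookup()
        * routines below (a sketch):
        *
        *      ACQUIRE_LOCK(&lk);
        *      if (inodedep_lookup(mp, inum, DEPALLOC, &inodedep) == 0) {
        *              ... inodedep was freshly allocated and hashed ...
        *      }
        *      ... use inodedep while the lock is still held ...
        *      FREE_LOCK(&lk);
        */
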
 1140 /*
 1141  * Structures and routines associated with pagedep caching.
 1142  */
 1143 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
 1144 u_long  pagedep_hash;           /* size of hash table - 1 */
 1145 #define PAGEDEP_HASH(mp, inum, lbn) \
 1146         (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
 1147             pagedep_hash])
 1148 
 1149 static int
 1150 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
 1151         struct pagedep_hashhead *pagedephd;
 1152         ino_t ino;
 1153         ufs_lbn_t lbn;
 1154         struct mount *mp;
 1155         int flags;
 1156         struct pagedep **pagedeppp;
 1157 {
 1158         struct pagedep *pagedep;
 1159 
 1160         LIST_FOREACH(pagedep, pagedephd, pd_hash)
 1161                 if (ino == pagedep->pd_ino &&
 1162                     lbn == pagedep->pd_lbn &&
 1163                     mp == pagedep->pd_list.wk_mp)
 1164                         break;
 1165         if (pagedep) {
 1166                 *pagedeppp = pagedep;
 1167                 if ((flags & DEPALLOC) != 0 &&
 1168                     (pagedep->pd_state & ONWORKLIST) == 0)
 1169                         return (0);
 1170                 return (1);
 1171         }
 1172         *pagedeppp = NULL;
 1173         return (0);
 1174 }
 1175 /*
  1176  * Look up a pagedep. Return 1 if found; return 0 if not found, or if found
  1177  * when asked to allocate (DEPALLOC) but not yet associated with any buffer.
 1178  * If not found, allocate if DEPALLOC flag is passed.
 1179  * Found or allocated entry is returned in pagedeppp.
  1180  * This routine must be called with the soft updates lock held.
 1181  */
 1182 static int
 1183 pagedep_lookup(ip, lbn, flags, pagedeppp)
 1184         struct inode *ip;
 1185         ufs_lbn_t lbn;
 1186         int flags;
 1187         struct pagedep **pagedeppp;
 1188 {
 1189         struct pagedep *pagedep;
 1190         struct pagedep_hashhead *pagedephd;
 1191         struct mount *mp;
 1192         int ret;
 1193         int i;
 1194 
 1195         mtx_assert(&lk, MA_OWNED);
 1196         mp = ITOV(ip)->v_mount;
 1197         pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
 1198 
 1199         ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
 1200         if (*pagedeppp || (flags & DEPALLOC) == 0)
 1201                 return (ret);
 1202         FREE_LOCK(&lk);
 1203         MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
 1204             M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
 1205         workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
 1206         ACQUIRE_LOCK(&lk);
 1207         ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
 1208         if (*pagedeppp) {
 1209                 WORKITEM_FREE(pagedep, D_PAGEDEP);
 1210                 return (ret);
 1211         }
 1212         pagedep->pd_ino = ip->i_number;
 1213         pagedep->pd_lbn = lbn;
 1214         LIST_INIT(&pagedep->pd_dirremhd);
 1215         LIST_INIT(&pagedep->pd_pendinghd);
 1216         for (i = 0; i < DAHASHSZ; i++)
 1217                 LIST_INIT(&pagedep->pd_diraddhd[i]);
 1218         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
 1219         *pagedeppp = pagedep;
 1220         return (0);
 1221 }
 1222 
 1223 /*
 1224  * Structures and routines associated with inodedep caching.
 1225  */
 1226 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
 1227 static u_long   inodedep_hash;  /* size of hash table - 1 */
 1228 static long     num_inodedep;   /* number of inodedep allocated */
 1229 #define INODEDEP_HASH(fs, inum) \
 1230       (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
 1231 
 1232 static int
 1233 inodedep_find(inodedephd, fs, inum, inodedeppp)
 1234         struct inodedep_hashhead *inodedephd;
 1235         struct fs *fs;
 1236         ino_t inum;
 1237         struct inodedep **inodedeppp;
 1238 {
 1239         struct inodedep *inodedep;
 1240 
 1241         LIST_FOREACH(inodedep, inodedephd, id_hash)
 1242                 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
 1243                         break;
 1244         if (inodedep) {
 1245                 *inodedeppp = inodedep;
 1246                 return (1);
 1247         }
 1248         *inodedeppp = NULL;
 1249 
 1250         return (0);
 1251 }
 1252 /*
 1253  * Look up an inodedep. Return 1 if found, 0 if not found.
 1254  * If not found, allocate if DEPALLOC flag is passed.
 1255  * Found or allocated entry is returned in inodedeppp.
  1256  * This routine must be called with the soft updates lock held.
 1257  */
 1258 static int
 1259 inodedep_lookup(mp, inum, flags, inodedeppp)
 1260         struct mount *mp;
 1261         ino_t inum;
 1262         int flags;
 1263         struct inodedep **inodedeppp;
 1264 {
 1265         struct inodedep *inodedep;
 1266         struct inodedep_hashhead *inodedephd;
 1267         struct fs *fs;
 1268 
 1269         mtx_assert(&lk, MA_OWNED);
 1270         fs = VFSTOUFS(mp)->um_fs;
 1271         inodedephd = INODEDEP_HASH(fs, inum);
 1272 
 1273         if (inodedep_find(inodedephd, fs, inum, inodedeppp))
 1274                 return (1);
 1275         if ((flags & DEPALLOC) == 0)
 1276                 return (0);
 1277         /*
 1278          * If we are over our limit, try to improve the situation.
 1279          */
 1280         if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
 1281                 request_cleanup(mp, FLUSH_INODES);
 1282         FREE_LOCK(&lk);
 1283         MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
 1284                 M_INODEDEP, M_SOFTDEP_FLAGS);
 1285         workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
 1286         ACQUIRE_LOCK(&lk);
 1287         if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
 1288                 WORKITEM_FREE(inodedep, D_INODEDEP);
 1289                 return (1);
 1290         }
 1291         num_inodedep += 1;
 1292         inodedep->id_fs = fs;
 1293         inodedep->id_ino = inum;
 1294         inodedep->id_state = ALLCOMPLETE;
 1295         inodedep->id_nlinkdelta = 0;
 1296         inodedep->id_savedino1 = NULL;
 1297         inodedep->id_savedsize = -1;
 1298         inodedep->id_savedextsize = -1;
 1299         inodedep->id_buf = NULL;
 1300         LIST_INIT(&inodedep->id_pendinghd);
 1301         LIST_INIT(&inodedep->id_inowait);
 1302         LIST_INIT(&inodedep->id_bufwait);
 1303         TAILQ_INIT(&inodedep->id_inoupdt);
 1304         TAILQ_INIT(&inodedep->id_newinoupdt);
 1305         TAILQ_INIT(&inodedep->id_extupdt);
 1306         TAILQ_INIT(&inodedep->id_newextupdt);
 1307         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
 1308         *inodedeppp = inodedep;
 1309         return (0);
 1310 }
 1311 
 1312 /*
 1313  * Structures and routines associated with newblk caching.
 1314  */
 1315 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
 1316 u_long  newblk_hash;            /* size of hash table - 1 */
 1317 #define NEWBLK_HASH(fs, inum) \
 1318         (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
 1319 
 1320 static int
 1321 newblk_find(newblkhd, fs, newblkno, newblkpp)
 1322         struct newblk_hashhead *newblkhd;
 1323         struct fs *fs;
 1324         ufs2_daddr_t newblkno;
 1325         struct newblk **newblkpp;
 1326 {
 1327         struct newblk *newblk;
 1328 
 1329         LIST_FOREACH(newblk, newblkhd, nb_hash)
 1330                 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
 1331                         break;
 1332         if (newblk) {
 1333                 *newblkpp = newblk;
 1334                 return (1);
 1335         }
 1336         *newblkpp = NULL;
 1337         return (0);
 1338 }
 1339 
 1340 /*
 1341  * Look up a newblk. Return 1 if found, 0 if not found.
 1342  * If not found, allocate if DEPALLOC flag is passed.
 1343  * Found or allocated entry is returned in newblkpp.
 1344  */
 1345 static int
 1346 newblk_lookup(fs, newblkno, flags, newblkpp)
 1347         struct fs *fs;
 1348         ufs2_daddr_t newblkno;
 1349         int flags;
 1350         struct newblk **newblkpp;
 1351 {
 1352         struct newblk *newblk;
 1353         struct newblk_hashhead *newblkhd;
 1354 
 1355         newblkhd = NEWBLK_HASH(fs, newblkno);
 1356         if (newblk_find(newblkhd, fs, newblkno, newblkpp))
 1357                 return (1);
 1358         if ((flags & DEPALLOC) == 0)
 1359                 return (0);
 1360         FREE_LOCK(&lk);
 1361         MALLOC(newblk, struct newblk *, sizeof(struct newblk),
 1362                 M_NEWBLK, M_SOFTDEP_FLAGS);
 1363         ACQUIRE_LOCK(&lk);
 1364         if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
 1365                 FREE(newblk, M_NEWBLK);
 1366                 return (1);
 1367         }
 1368         newblk->nb_state = 0;
 1369         newblk->nb_fs = fs;
 1370         newblk->nb_newblkno = newblkno;
 1371         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
 1372         *newblkpp = newblk;
 1373         return (0);
 1374 }
 1375 
 1376 /*
 1377  * Executed during filesystem system initialization before
 1378  * mounting any filesystems.
 1379  */
 1380 void 
 1381 softdep_initialize()
 1382 {
 1383 
 1384         LIST_INIT(&mkdirlisthd);
 1385         max_softdeps = desiredvnodes * 4;
 1386         pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
 1387             &pagedep_hash);
 1388         inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
 1389         newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
 1390 
  1391         /* initialize bioops hack */
 1392         bioops.io_start = softdep_disk_io_initiation;
 1393         bioops.io_complete = softdep_disk_write_complete;
 1394         bioops.io_deallocate = softdep_deallocate_dependencies;
 1395         bioops.io_countdeps = softdep_count_dependencies;
 1396 }
 1397 
 1398 /*
 1399  * Executed after all filesystems have been unmounted during
 1400  * filesystem module unload.
 1401  */
 1402 void
 1403 softdep_uninitialize()
 1404 {
 1405 
 1406         hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
 1407         hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
 1408         hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
 1409 }
 1410 
 1411 /*
 1412  * Called at mount time to notify the dependency code that a
 1413  * filesystem wishes to use it.
 1414  */
 1415 int
 1416 softdep_mount(devvp, mp, fs, cred)
 1417         struct vnode *devvp;
 1418         struct mount *mp;
 1419         struct fs *fs;
 1420         struct ucred *cred;
 1421 {
 1422         struct csum_total cstotal;
 1423         struct ufsmount *ump;
 1424         struct cg *cgp;
 1425         struct buf *bp;
 1426         int error, cyl;
 1427 
 1428         MNT_ILOCK(mp);
 1429         mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
 1430         MNT_IUNLOCK(mp);
 1431         ump = VFSTOUFS(mp);
 1432         LIST_INIT(&ump->softdep_workitem_pending);
 1433         ump->softdep_worklist_tail = NULL;
 1434         ump->softdep_on_worklist = 0;
 1435         ump->softdep_deps = 0;
 1436         /*
 1437          * When doing soft updates, the counters in the
 1438          * superblock may have gotten out of sync. Recomputation
 1439          * can take a long time and can be deferred for background
 1440          * fsck.  However, the old behavior of scanning the cylinder
 1441          * groups and recalculating them at mount time is available
 1442          * by setting vfs.ffs.compute_summary_at_mount to one.
 1443          */
 1444         if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
 1445                 return (0);
 1446         bzero(&cstotal, sizeof cstotal);
 1447         for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
 1448                 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
 1449                     fs->fs_cgsize, cred, &bp)) != 0) {
 1450                         brelse(bp);
 1451                         return (error);
 1452                 }
 1453                 cgp = (struct cg *)bp->b_data;
 1454                 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
 1455                 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
 1456                 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
 1457                 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
 1458                 fs->fs_cs(fs, cyl) = cgp->cg_cs;
 1459                 brelse(bp);
 1460         }
 1461 #ifdef DEBUG
 1462         if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
 1463                 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
 1464 #endif
 1465         bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
 1466         return (0);
 1467 }
 1468 
 1469 /*
 1470  * Protecting the freemaps (or bitmaps).
 1471  * 
 1472  * To eliminate the need to execute fsck before mounting a filesystem
 1473  * after a power failure, one must (conservatively) guarantee that the
 1474  * on-disk copy of the bitmaps never indicate that a live inode or block is
 1475  * free.  So, when a block or inode is allocated, the bitmap should be
 1476  * updated (on disk) before any new pointers.  When a block or inode is
 1477  * freed, the bitmap should not be updated until all pointers have been
 1478  * reset.  The latter dependency is handled by the delayed de-allocation
 1479  * approach described below for block and inode de-allocation.  The former
 1480  * dependency is handled by calling the following procedure when a block or
 1481  * inode is allocated. When an inode is allocated an "inodedep" is created
 1482  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 1483  * Each "inodedep" is also inserted into the hash indexing structure so
 1484  * that any additional link additions can be made dependent on the inode
 1485  * allocation.
 1486  * 
 1487  * The ufs filesystem maintains a number of free block counts (e.g., per
 1488  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 1489  * in addition to the bitmaps.  These counts are used to improve efficiency
 1490  * during allocation and therefore must be consistent with the bitmaps.
 1491  * There is no convenient way to guarantee post-crash consistency of these
 1492  * counts with simple update ordering, for two main reasons: (1) The counts
 1493  * and bitmaps for a single cylinder group block are not in the same disk
 1494  * sector.  If a disk write is interrupted (e.g., by power failure), one may
 1495  * be written and the other not.  (2) Some of the counts are located in the
 1496  * superblock rather than the cylinder group block. So, we focus our soft
 1497  * updates implementation on protecting the bitmaps. When mounting a
 1498  * filesystem, we recompute the auxiliary counts from the bitmaps.
 1499  */
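
/*
 * Illustrative sketch of the ordering rules above; the helper name
 * cg_alloc_block() is invented and the sequence is schematic, not code
 * from this file.  An allocation may expose a new on-disk pointer only
 * after its bitmap dependency has been recorded:
 *
 *      blkno = cg_alloc_block(cgbp);              // set bit in-core
 *      softdep_setup_blkmapdep(cgbp, mp, blkno);  // bitmap before pointer
 *      DIP_SET(ip, i_db[lbn], blkno);             // pointer write is now
 *                                                 // held until the bitmap
 *                                                 // buffer reaches disk
 *
 * Symmetrically, a free must let every on-disk pointer be reset before
 * the bitmap write that marks the block or inode free.
 */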
 1500 
 1501 /*
 1502  * Called just after updating the cylinder group block to allocate an inode.
 1503  */
 1504 void
 1505 softdep_setup_inomapdep(bp, ip, newinum)
 1506         struct buf *bp;         /* buffer for cylgroup block with inode map */
 1507         struct inode *ip;       /* inode related to allocation */
 1508         ino_t newinum;          /* new inode number being allocated */
 1509 {
 1510         struct inodedep *inodedep;
 1511         struct bmsafemap *bmsafemap;
 1512 
 1513         /*
 1514          * Create a dependency for the newly allocated inode.
 1515          * Panic if it already exists, as something is seriously wrong.
 1516          * Otherwise add it to the dependency list for the buffer holding
 1517          * the cylinder group map from which it was allocated.
 1518          */
 1519         ACQUIRE_LOCK(&lk);
 1520         if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
 1521             &inodedep)))
 1522                 panic("softdep_setup_inomapdep: dependency for new inode "
 1523                     "already exists");
 1524         inodedep->id_buf = bp;
 1525         inodedep->id_state &= ~DEPCOMPLETE;
 1526         bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
 1527         LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
 1528         FREE_LOCK(&lk);
 1529 }
 1530 
 1531 /*
 1532  * Called just after updating the cylinder group block to
 1533  * allocate a block or fragment.
 1534  */
 1535 void
 1536 softdep_setup_blkmapdep(bp, mp, newblkno)
 1537         struct buf *bp;         /* buffer for cylgroup block with block map */
 1538         struct mount *mp;       /* filesystem doing allocation */
 1539         ufs2_daddr_t newblkno;  /* number of newly allocated block */
 1540 {
 1541         struct newblk *newblk;
 1542         struct bmsafemap *bmsafemap;
 1543         struct fs *fs;
 1544 
 1545         fs = VFSTOUFS(mp)->um_fs;
 1546         /*
 1547          * Create a dependency for the newly allocated block.
 1548          * Add it to the dependency list for the buffer holding
 1549          * the cylinder group map from which it was allocated.
 1550          */
 1551         ACQUIRE_LOCK(&lk);
 1552         if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
 1553                 panic("softdep_setup_blkmapdep: found block");
 1554         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
 1555         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
 1556         FREE_LOCK(&lk);
 1557 }
 1558 
 1559 /*
 1560  * Find the bmsafemap associated with a cylinder group buffer.
 1561  * If none exists, create one. The buffer must be locked when
 1562  * this routine is called, and it must be called with
 1563  * splbio interrupts blocked.
 1564  */
 1565 static struct bmsafemap *
 1566 bmsafemap_lookup(mp, bp)
 1567         struct mount *mp;
 1568         struct buf *bp;
 1569 {
 1570         struct bmsafemap *bmsafemap;
 1571         struct worklist *wk;
 1572 
 1573         mtx_assert(&lk, MA_OWNED);
 1574         LIST_FOREACH(wk, &bp->b_dep, wk_list)
 1575                 if (wk->wk_type == D_BMSAFEMAP)
 1576                         return (WK_BMSAFEMAP(wk));
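        /*
         * No bmsafemap is attached to the buffer yet, so the mutex can
         * be dropped for the allocation: the caller holds the buffer
         * lock, preventing any other thread from attaching a bmsafemap
         * to this buffer while the mutex is released.
         */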
 1577         FREE_LOCK(&lk);
 1578         MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
 1579                 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 1580         workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
 1581         bmsafemap->sm_buf = bp;
 1582         LIST_INIT(&bmsafemap->sm_allocdirecthd);
 1583         LIST_INIT(&bmsafemap->sm_allocindirhd);
 1584         LIST_INIT(&bmsafemap->sm_inodedephd);
 1585         LIST_INIT(&bmsafemap->sm_newblkhd);
 1586         ACQUIRE_LOCK(&lk);
 1587         WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
 1588         return (bmsafemap);
 1589 }
 1590 
 1591 /*
 1592  * Direct block allocation dependencies.
 1593  * 
 1594  * When a new block is allocated, the corresponding disk locations must be
 1595  * initialized (with zeros or new data) before the on-disk inode points to
 1596  * them.  Also, the freemap from which the block was allocated must be
 1597  * updated (on disk) before the inode's pointer. These two dependencies are
 1598  * independent of each other and are needed for all file blocks and indirect
 1599  * blocks that are pointed to directly by the inode.  Just before the
 1600  * "in-core" version of the inode is updated with a newly allocated block
 1601  * number, a procedure (below) is called to setup allocation dependency
 1602  * structures.  These structures are removed when the corresponding
 1603  * dependencies are satisfied or when the block allocation becomes obsolete
 1604  * (i.e., the file is deleted, the block is de-allocated, or the block is a
 1605  * fragment that gets upgraded).  All of these cases are handled in
 1606  * procedures described later.
 1607  * 
 1608  * When a file extension causes a fragment to be upgraded, either to a larger
 1609  * fragment or to a full block, the on-disk location may change (if the
 1610  * previous fragment could not simply be extended). In this case, the old
 1611  * fragment must be de-allocated, but not until after the inode's pointer has
 1612  * been updated. In most cases, this is handled by later procedures, which
 1613  * will construct a "freefrag" structure to be added to the workitem queue
 1614  * when the inode update is complete (or obsolete).  The main exception to
 1615  * this is when an allocation occurs while a pending allocation dependency
 1616  * (for the same block pointer) remains.  This case is handled in the main
 1617  * allocation dependency setup procedure by immediately freeing the
 1618  * unreferenced fragments.
 1619  */ 
 1620 void 
 1621 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 1622         struct inode *ip;       /* inode to which block is being added */
 1623         ufs_lbn_t lbn;          /* block pointer within inode */
 1624         ufs2_daddr_t newblkno;  /* disk block number being added */
 1625         ufs2_daddr_t oldblkno;  /* previous block number, 0 unless frag */
 1626         long newsize;           /* size of new block */
 1627         long oldsize;           /* size of old block */
 1628         struct buf *bp;         /* bp for allocated block */
 1629 {
 1630         struct allocdirect *adp, *oldadp;
 1631         struct allocdirectlst *adphead;
 1632         struct bmsafemap *bmsafemap;
 1633         struct inodedep *inodedep;
 1634         struct pagedep *pagedep;
 1635         struct newblk *newblk;
 1636         struct mount *mp;
 1637 
 1638         mp = UFSTOVFS(ip->i_ump);
 1639         MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
 1640                 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
 1641         workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
 1642         adp->ad_lbn = lbn;
 1643         adp->ad_newblkno = newblkno;
 1644         adp->ad_oldblkno = oldblkno;
 1645         adp->ad_newsize = newsize;
 1646         adp->ad_oldsize = oldsize;
 1647         adp->ad_state = ATTACHED;
 1648         LIST_INIT(&adp->ad_newdirblk);
 1649         if (newblkno == oldblkno)
 1650                 adp->ad_freefrag = NULL;
 1651         else
 1652                 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
 1653 
 1654         ACQUIRE_LOCK(&lk);
 1655         if (lbn >= NDADDR) {
 1656                 /* allocating an indirect block */
 1657                 if (oldblkno != 0)
 1658                         panic("softdep_setup_allocdirect: non-zero indir");
 1659         } else {
 1660                 /*
 1661                  * Allocating a direct block.
 1662                  *
 1663                  * If we are allocating a directory block, then we must
 1664                  * allocate an associated pagedep to track additions and
 1665                  * deletions.
 1666                  */
 1667                 if ((ip->i_mode & IFMT) == IFDIR &&
 1668                     pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
 1669                         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 1670         }
 1671         if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
 1672                 panic("softdep_setup_allocdirect: lost block");
 1673         if (newblk->nb_state == DEPCOMPLETE) {
 1674                 adp->ad_state |= DEPCOMPLETE;
 1675                 adp->ad_buf = NULL;
 1676         } else {
 1677                 bmsafemap = newblk->nb_bmsafemap;
 1678                 adp->ad_buf = bmsafemap->sm_buf;
 1679                 LIST_REMOVE(newblk, nb_deps);
 1680                 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
 1681         }
 1682         LIST_REMOVE(newblk, nb_hash);
 1683         FREE(newblk, M_NEWBLK);
 1684 
 1685         inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
 1686         adp->ad_inodedep = inodedep;
 1687         WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 1688         /*
 1689          * The list of allocdirects must be kept in sorted, ascending
 1690          * order so that the rollback routines can quickly determine the
 1691          * first uncommitted block (the size of the file stored on disk
 1692          * ends at the end of the lowest committed fragment, or if there
 1693          * are no fragments, at the end of the highest committed block).
 1694          * Since files generally grow, the typical case is that the new
 1695          * block is to be added at the end of the list. We speed this
 1696          * special case by checking against the last allocdirect in the
 1697          * list before laboriously traversing the list looking for the
 1698          * insertion point; see the standalone sketch after this function.
 1699          */
 1700         adphead = &inodedep->id_newinoupdt;
 1701         oldadp = TAILQ_LAST(adphead, allocdirectlst);
 1702         if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
 1703                 /* insert at end of list */
 1704                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 1705                 if (oldadp != NULL && oldadp->ad_lbn == lbn)
 1706                         allocdirect_merge(adphead, adp, oldadp);
 1707                 FREE_LOCK(&lk);
 1708                 return;
 1709         }
 1710         TAILQ_FOREACH(oldadp, adphead, ad_next) {
 1711                 if (oldadp->ad_lbn >= lbn)
 1712                         break;
 1713         }
 1714         if (oldadp == NULL)
 1715                 panic("softdep_setup_allocdirect: lost entry");
 1716         /* insert in middle of list */
 1717         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 1718         if (oldadp->ad_lbn == lbn)
 1719                 allocdirect_merge(adphead, adp, oldadp);
 1720         FREE_LOCK(&lk);
 1721 }
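
/*
 * Standalone sketch of the tail-first sorted insertion used by
 * softdep_setup_allocdirect above, reduced to a userland TAILQ of
 * integers.  All names here (demoad, demolst, demo_insert) are invented
 * for illustration, and the merge of equal-lbn entries is omitted.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct demoad {
	long d_lbn;			/* stands in for ad_lbn */
	TAILQ_ENTRY(demoad) d_next;
};
TAILQ_HEAD(demolst, demoad);

static void
demo_insert(struct demolst *head, struct demoad *adp)
{
	struct demoad *oldadp;

	/* Fast path: files generally grow, so check the tail first. */
	oldadp = TAILQ_LAST(head, demolst);
	if (oldadp == NULL || oldadp->d_lbn <= adp->d_lbn) {
		TAILQ_INSERT_TAIL(head, adp, d_next);
		return;
	}
	/* Slow path: find the first entry at or beyond the new lbn. */
	TAILQ_FOREACH(oldadp, head, d_next)
		if (oldadp->d_lbn >= adp->d_lbn)
			break;
	TAILQ_INSERT_BEFORE(oldadp, adp, d_next);
}

int
main(void)
{
	struct demolst head = TAILQ_HEAD_INITIALIZER(head);
	long lbns[] = { 0, 1, 2, 7, 3 };	/* the 3 takes the slow path */
	struct demoad *adp;
	size_t i;

	for (i = 0; i < sizeof(lbns) / sizeof(lbns[0]); i++) {
		adp = malloc(sizeof(*adp));
		adp->d_lbn = lbns[i];
		demo_insert(&head, adp);
	}
	TAILQ_FOREACH(adp, &head, d_next)
		printf("%ld ", adp->d_lbn);	/* prints "0 1 2 3 7 " */
	printf("\n");
	return (0);
}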
 1722 
 1723 /*
 1724  * Replace an old allocdirect dependency with a newer one.
 1725  * This routine must be called with splbio interrupts blocked.
 1726  */
 1727 static void
 1728 allocdirect_merge(adphead, newadp, oldadp)
 1729         struct allocdirectlst *adphead; /* head of list holding allocdirects */
 1730         struct allocdirect *newadp;     /* allocdirect being added */
 1731         struct allocdirect *oldadp;     /* existing allocdirect being checked */
 1732 {
 1733         struct worklist *wk;
 1734         struct freefrag *freefrag;
 1735         struct newdirblk *newdirblk;
 1736 
 1737         mtx_assert(&lk, MA_OWNED);
 1738         if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
 1739             newadp->ad_oldsize != oldadp->ad_newsize ||
 1740             newadp->ad_lbn >= NDADDR)
 1741                 panic("%s %jd != new %jd || old size %ld != new %ld",
 1742                     "allocdirect_merge: old blkno",
 1743                     (intmax_t)newadp->ad_oldblkno,
 1744                     (intmax_t)oldadp->ad_newblkno,
 1745                     newadp->ad_oldsize, oldadp->ad_newsize);
 1746         newadp->ad_oldblkno = oldadp->ad_oldblkno;
 1747         newadp->ad_oldsize = oldadp->ad_oldsize;
 1748         /*
 1749          * If the old dependency had a fragment to free or had never
 1750          * previously had a block allocated, then the new dependency
 1751          * can immediately post its freefrag and adopt the old freefrag.
 1752          * This action is done by swapping the freefrag dependencies.
 1753          * The new dependency gains the old one's freefrag, and the
 1754          * old one gets the new one and then immediately puts it on
 1755          * the worklist when it is freed by free_allocdirect. It is
 1756          * not possible to do this swap when the old dependency had a
 1757          * non-zero size but no previous fragment to free. This condition
 1758          * arises when the new block is an extension of the old block.
 1759          * Here, the first part of the fragment allocated to the new
 1760          * dependency is part of the block currently claimed on disk by
 1761          * the old dependency, so cannot legitimately be freed until the
 1762          * conditions for the new dependency are fulfilled.
 1763          */
 1764         if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
 1765                 freefrag = newadp->ad_freefrag;
 1766                 newadp->ad_freefrag = oldadp->ad_freefrag;
 1767                 oldadp->ad_freefrag = freefrag;
 1768         }
 1769         /*
 1770          * If we are tracking a new directory-block allocation,
 1771          * move it from the old allocdirect to the new allocdirect.
 1772          */
 1773         if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
 1774                 newdirblk = WK_NEWDIRBLK(wk);
 1775                 WORKLIST_REMOVE(&newdirblk->db_list);
 1776                 if (!LIST_EMPTY(&oldadp->ad_newdirblk))
 1777                         panic("allocdirect_merge: extra newdirblk");
 1778                 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
 1779         }
 1780         free_allocdirect(adphead, oldadp, 0);
 1781 }
 1782                 
 1783 /*
 1784  * Allocate a new freefrag structure if needed.
 1785  */
 1786 static struct freefrag *
 1787 newfreefrag(ip, blkno, size)
 1788         struct inode *ip;
 1789         ufs2_daddr_t blkno;
 1790         long size;
 1791 {
 1792         struct freefrag *freefrag;
 1793         struct fs *fs;
 1794 
 1795         if (blkno == 0)
 1796                 return (NULL);
 1797         fs = ip->i_fs;
 1798         if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
 1799                 panic("newfreefrag: frag size");
 1800         MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
 1801                 M_FREEFRAG, M_SOFTDEP_FLAGS);
 1802         workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
 1803         freefrag->ff_inum = ip->i_number;
 1804         freefrag->ff_blkno = blkno;
 1805         freefrag->ff_fragsize = size;
 1806         return (freefrag);
 1807 }
 1808 
 1809 /*
 1810  * This workitem de-allocates fragments that were replaced during
 1811  * file block allocation.
 1812  */
 1813 static void 
 1814 handle_workitem_freefrag(freefrag)
 1815         struct freefrag *freefrag;
 1816 {
 1817         struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
 1818 
 1819         ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
 1820             freefrag->ff_fragsize, freefrag->ff_inum);
 1821         ACQUIRE_LOCK(&lk);
 1822         WORKITEM_FREE(freefrag, D_FREEFRAG);
 1823         FREE_LOCK(&lk);
 1824 }
 1825 
 1826 /*
 1827  * Set up a dependency structure for an external attributes data block.
 1828  * This routine follows much of the structure of softdep_setup_allocdirect.
 1829  * See the description of softdep_setup_allocdirect above for details.
 1830  */
 1831 void 
 1832 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 1833         struct inode *ip;
 1834         ufs_lbn_t lbn;
 1835         ufs2_daddr_t newblkno;
 1836         ufs2_daddr_t oldblkno;
 1837         long newsize;
 1838         long oldsize;
 1839         struct buf *bp;
 1840 {
 1841         struct allocdirect *adp, *oldadp;
 1842         struct allocdirectlst *adphead;
 1843         struct bmsafemap *bmsafemap;
 1844         struct inodedep *inodedep;
 1845         struct newblk *newblk;
 1846         struct mount *mp;
 1847 
 1848         mp = UFSTOVFS(ip->i_ump);
 1849         MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
 1850                 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
 1851         workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
 1852         adp->ad_lbn = lbn;
 1853         adp->ad_newblkno = newblkno;
 1854         adp->ad_oldblkno = oldblkno;
 1855         adp->ad_newsize = newsize;
 1856         adp->ad_oldsize = oldsize;
 1857         adp->ad_state = ATTACHED | EXTDATA;
 1858         LIST_INIT(&adp->ad_newdirblk);
 1859         if (newblkno == oldblkno)
 1860                 adp->ad_freefrag = NULL;
 1861         else
 1862                 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
 1863 
 1864         ACQUIRE_LOCK(&lk);
 1865         if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
 1866                 panic("softdep_setup_allocext: lost block");
 1867 
 1868         inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
 1869         adp->ad_inodedep = inodedep;
 1870 
 1871         if (newblk->nb_state == DEPCOMPLETE) {
 1872                 adp->ad_state |= DEPCOMPLETE;
 1873                 adp->ad_buf = NULL;
 1874         } else {
 1875                 bmsafemap = newblk->nb_bmsafemap;
 1876                 adp->ad_buf = bmsafemap->sm_buf;
 1877                 LIST_REMOVE(newblk, nb_deps);
 1878                 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
 1879         }
 1880         LIST_REMOVE(newblk, nb_hash);
 1881         FREE(newblk, M_NEWBLK);
 1882 
 1883         WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 1884         if (lbn >= NXADDR)
 1885                 panic("softdep_setup_allocext: lbn %lld >= NXADDR",
 1886                     (long long)lbn);
 1887         /*
 1888          * The list of allocdirects must be kept in sorted, ascending
 1889          * order so that the rollback routines can quickly determine the
 1890          * first uncommitted block (the size of the file stored on disk
 1891          * ends at the end of the lowest committed fragment, or if there
 1892          * are no fragments, at the end of the highest committed block).
 1893          * Since files generally grow, the typical case is that the new
 1894          * block is to be added at the end of the list. We speed this
 1895          * special case by checking against the last allocdirect in the
 1896          * list before laboriously traversing the list looking for the
 1897          * insertion point.
 1898          */
 1899         adphead = &inodedep->id_newextupdt;
 1900         oldadp = TAILQ_LAST(adphead, allocdirectlst);
 1901         if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
 1902                 /* insert at end of list */
 1903                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 1904                 if (oldadp != NULL && oldadp->ad_lbn == lbn)
 1905                         allocdirect_merge(adphead, adp, oldadp);
 1906                 FREE_LOCK(&lk);
 1907                 return;
 1908         }
 1909         TAILQ_FOREACH(oldadp, adphead, ad_next) {
 1910                 if (oldadp->ad_lbn >= lbn)
 1911                         break;
 1912         }
 1913         if (oldadp == NULL)
 1914                 panic("softdep_setup_allocext: lost entry");
 1915         /* insert in middle of list */
 1916         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 1917         if (oldadp->ad_lbn == lbn)
 1918                 allocdirect_merge(adphead, adp, oldadp);
 1919         FREE_LOCK(&lk);
 1920 }
 1921 
 1922 /*
 1923  * Indirect block allocation dependencies.
 1924  * 
 1925  * The same dependencies that exist for a direct block also exist when
 1926  * a new block is allocated and pointed to by an entry in a block of
 1927  * indirect pointers. The undo/redo states described above are also
 1928  * used here. Because an indirect block contains many pointers that
 1929  * may have dependencies, a second copy of the entire in-memory indirect
 1930  * block is kept. The buffer cache copy is always completely up-to-date.
 1931  * The second copy, which is used only as a source for disk writes,
 1932  * contains only the safe pointers (i.e., those that have no remaining
 1933  * update dependencies). The second copy is freed when all pointers
 1934  * are safe. The cache is not allowed to replace indirect blocks with
 1935  * pending update dependencies. If a buffer containing an indirect
 1936  * block with dependencies is written, these routines will mark it
 1937  * dirty again. It can only be successfully written once all the
 1938  * dependencies are removed. The ffs_fsync routine in conjunction with
 1939  * softdep_sync_metadata work together to get all the dependencies
 1940  * removed so that a file can be successfully written to disk. Three
 1941  * procedures are used when setting up indirect block pointer
 1942  * dependencies. The division is necessary because of the organization
 1943  * of the "balloc" routine and because of the distinction between file
 1944  * pages and file metadata blocks.
 1945  */
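
/*
 * Illustrative userland sketch of the "second copy" scheme described
 * above.  The names (demo_aip, make_safe_copy, NPTRS) are invented; in
 * the kernel the rollback happens in setup_allocindir_phase2 below,
 * which stores ai_oldblkno into the saved buffer ir_savebp.
 */
#include <stdint.h>
#include <string.h>

#define NPTRS	4		/* pointers per demo indirect block */

struct demo_aip {		/* one pending pointer dependency */
	int	offset;		/* slot in the indirect block */
	int64_t	oldblkno;	/* value that is safe to write to disk */
};

static void
make_safe_copy(const int64_t *cache, int64_t *disk,
    const struct demo_aip *deps, int ndeps)
{
	int i;

	/* The buffer cache copy is always completely up to date. */
	memcpy(disk, cache, NPTRS * sizeof(disk[0]));
	/* Roll unsafe slots back to their old, dependency-free values. */
	for (i = 0; i < ndeps; i++)
		disk[deps[i].offset] = deps[i].oldblkno;
}

int
main(void)
{
	int64_t cache[NPTRS] = { 100, 200, 300, 400 };
	int64_t disk[NPTRS];
	struct demo_aip deps[] = { { 1, 0 }, { 3, 350 } };

	make_safe_copy(cache, disk, deps, 2);
	/* disk == { 100, 0, 300, 350 }: only safe pointers go to disk. */
	return (0);
}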
 1946 
 1947 /*
 1948  * Allocate a new allocindir structure.
 1949  */
 1950 static struct allocindir *
 1951 newallocindir(ip, ptrno, newblkno, oldblkno)
 1952         struct inode *ip;       /* inode for file being extended */
 1953         int ptrno;              /* offset of pointer in indirect block */
 1954         ufs2_daddr_t newblkno;  /* disk block number being added */
 1955         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
 1956 {
 1957         struct allocindir *aip;
 1958 
 1959         MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
 1960                 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
 1961         workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
 1962         aip->ai_state = ATTACHED;
 1963         aip->ai_offset = ptrno;
 1964         aip->ai_newblkno = newblkno;
 1965         aip->ai_oldblkno = oldblkno;
 1966         aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
 1967         return (aip);
 1968 }
 1969 
 1970 /*
 1971  * Called just before setting an indirect block pointer
 1972  * to a newly allocated file page.
 1973  */
 1974 void
 1975 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 1976         struct inode *ip;       /* inode for file being extended */
 1977         ufs_lbn_t lbn;          /* allocated block number within file */
 1978         struct buf *bp;         /* buffer with indirect blk referencing page */
 1979         int ptrno;              /* offset of pointer in indirect block */
 1980         ufs2_daddr_t newblkno;  /* disk block number being added */
 1981         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
 1982         struct buf *nbp;        /* buffer holding allocated page */
 1983 {
 1984         struct allocindir *aip;
 1985         struct pagedep *pagedep;
 1986 
 1987         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
 1988         aip = newallocindir(ip, ptrno, newblkno, oldblkno);
 1989         ACQUIRE_LOCK(&lk);
 1990         /*
 1991          * If we are allocating a directory page, then we must
 1992          * allocate an associated pagedep to track additions and
 1993          * deletions.
 1994          */
 1995         if ((ip->i_mode & IFMT) == IFDIR &&
 1996             pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
 1997                 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
 1998         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
 1999         setup_allocindir_phase2(bp, ip, aip);
 2000         FREE_LOCK(&lk);
 2001 }
 2002 
 2003 /*
 2004  * Called just before setting an indirect block pointer to a
 2005  * newly allocated indirect block.
 2006  */
 2007 void
 2008 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 2009         struct buf *nbp;        /* newly allocated indirect block */
 2010         struct inode *ip;       /* inode for file being extended */
 2011         struct buf *bp;         /* indirect block referencing allocated block */
 2012         int ptrno;              /* offset of pointer in indirect block */
 2013         ufs2_daddr_t newblkno;  /* disk block number being added */
 2014 {
 2015         struct allocindir *aip;
 2016 
 2017         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
 2018         aip = newallocindir(ip, ptrno, newblkno, 0);
 2019         ACQUIRE_LOCK(&lk);
 2020         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
 2021         setup_allocindir_phase2(bp, ip, aip);
 2022         FREE_LOCK(&lk);
 2023 }
 2024 
 2025 /*
 2026  * Called to finish setting up the "aip" allocated
 2027  * by one of the two routines above.
 2028  */
 2029 static void 
 2030 setup_allocindir_phase2(bp, ip, aip)
 2031         struct buf *bp;         /* in-memory copy of the indirect block */
 2032         struct inode *ip;       /* inode for file being extended */
 2033         struct allocindir *aip; /* allocindir allocated by the above routines */
 2034 {
 2035         struct worklist *wk;
 2036         struct indirdep *indirdep, *newindirdep;
 2037         struct bmsafemap *bmsafemap;
 2038         struct allocindir *oldaip;
 2039         struct freefrag *freefrag;
 2040         struct newblk *newblk;
 2041         ufs2_daddr_t blkno;
 2042 
 2043         mtx_assert(&lk, MA_OWNED);
 2044         if (bp->b_lblkno >= 0)
 2045                 panic("setup_allocindir_phase2: not indir blk");
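        /*
         * Retry loop: search the buffer's dependency list for an
         * indirdep.  If none is found, drop the mutex, allocate one
         * together with a saved copy of the indirect block, then
         * reacquire the mutex and search again, since another thread
         * may have attached an indirdep while the mutex was released.
         */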
 2046         for (indirdep = NULL, newindirdep = NULL; ; ) {
 2047                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 2048                         if (wk->wk_type != D_INDIRDEP)
 2049                                 continue;
 2050                         indirdep = WK_INDIRDEP(wk);
 2051                         break;
 2052                 }
 2053                 if (indirdep == NULL && newindirdep) {
 2054                         indirdep = newindirdep;
 2055                         WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
 2056                         newindirdep = NULL;
 2057                 }
 2058                 if (indirdep) {
 2059                         if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
 2060                             &newblk) == 0)
 2061                                 panic("setup_allocindir: lost block");
 2062                         if (newblk->nb_state == DEPCOMPLETE) {
 2063                                 aip->ai_state |= DEPCOMPLETE;
 2064                                 aip->ai_buf = NULL;
 2065                         } else {
 2066                                 bmsafemap = newblk->nb_bmsafemap;
 2067                                 aip->ai_buf = bmsafemap->sm_buf;
 2068                                 LIST_REMOVE(newblk, nb_deps);
 2069                                 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
 2070                                     aip, ai_deps);
 2071                         }
 2072                         LIST_REMOVE(newblk, nb_hash);
 2073                         FREE(newblk, M_NEWBLK);
 2074                         aip->ai_indirdep = indirdep;
 2075                         /*
 2076                          * Check to see if there is an existing dependency
 2077                          * for this block. If there is, merge the old
 2078                          * dependency into the new one.
 2079                          */
 2080                         if (aip->ai_oldblkno == 0)
 2081                                 oldaip = NULL;
 2082                 else
 2083                                 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
 2084                                     ai_next)
 2085                                         if (oldaip->ai_offset == aip->ai_offset)
 2086                                                 break;
 2087                         freefrag = NULL;
 2088                         if (oldaip != NULL) {
 2089                                 if (oldaip->ai_newblkno != aip->ai_oldblkno)
 2090                                         panic("setup_allocindir_phase2: blkno");
 2091                                 aip->ai_oldblkno = oldaip->ai_oldblkno;
 2092                                 freefrag = aip->ai_freefrag;
 2093                                 aip->ai_freefrag = oldaip->ai_freefrag;
 2094                                 oldaip->ai_freefrag = NULL;
 2095                                 free_allocindir(oldaip, NULL);
 2096                         }
 2097                         LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
 2098                         if (ip->i_ump->um_fstype == UFS1)
 2099                                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
 2100                                     [aip->ai_offset] = aip->ai_oldblkno;
 2101                         else
 2102                                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
 2103                                     [aip->ai_offset] = aip->ai_oldblkno;
 2104                         FREE_LOCK(&lk);
 2105                         if (freefrag != NULL)
 2106                                 handle_workitem_freefrag(freefrag);
 2107                 } else
 2108                         FREE_LOCK(&lk);
 2109                 if (newindirdep) {
 2110                         newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
 2111                         brelse(newindirdep->ir_savebp);
 2112                         ACQUIRE_LOCK(&lk);
 2113                         WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
 2114                         if (indirdep)
 2115                                 break;
 2116                         FREE_LOCK(&lk);
 2117                 }
 2118                 if (indirdep) {
 2119                         ACQUIRE_LOCK(&lk);
 2120                         break;
 2121                 }
 2122                 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
 2123                         M_INDIRDEP, M_SOFTDEP_FLAGS);
 2124                 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
 2125                     UFSTOVFS(ip->i_ump));
 2126                 newindirdep->ir_state = ATTACHED;
 2127                 if (ip->i_ump->um_fstype == UFS1)
 2128                         newindirdep->ir_state |= UFS1FMT;
 2129                 LIST_INIT(&newindirdep->ir_deplisthd);
 2130                 LIST_INIT(&newindirdep->ir_donehd);
 2131                 if (bp->b_blkno == bp->b_lblkno) {
 2132                         ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
 2133                             NULL, NULL);
 2134                         bp->b_blkno = blkno;
 2135                 }
 2136                 newindirdep->ir_savebp =
 2137                     getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
 2138                 BUF_KERNPROC(newindirdep->ir_savebp);
 2139                 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
 2140                 ACQUIRE_LOCK(&lk);
 2141         }
 2142 }
 2143 
 2144 /*
 2145  * Block de-allocation dependencies.
 2146  * 
 2147  * When blocks are de-allocated, the on-disk pointers must be nullified before
 2148  * the blocks are made available for use by other files.  (The true
 2149  * requirement is that old pointers must be nullified before new on-disk
 2150  * pointers are set.  We chose this slightly more stringent requirement to
 2151  * reduce complexity.) Our implementation handles this dependency by updating
 2152  * the inode (or indirect block) appropriately but delaying the actual block
 2153  * de-allocation (i.e., freemap and free space count manipulation) until
 2154  * after the updated versions reach stable storage.  After the disk is
 2155  * updated, the blocks can be safely de-allocated whenever it is convenient.
 2156  * This implementation handles only the common case of reducing a file's
 2157  * length to zero. Other cases are handled by the conventional synchronous
 2158  * write approach.
 2159  *
 2160  * The ffs implementation with which we worked double-checks
 2161  * the state of the block pointers and file size as it reduces
 2162  * a file's length.  Some of this code is replicated here in our
 2163  * soft updates implementation.  The freeblks->fb_chkcnt field is
 2164  * used to transfer a part of this information to the procedure
 2165  * that eventually de-allocates the blocks.
 2166  *
 2167  * This routine should be called from the routine that shortens
 2168  * a file's length, before the inode's size or block pointers
 2169  * are modified. It will save the block pointer information for
 2170  * later release and zero the inode so that the calling routine
 2171  * can release it.
 2172  */
 2173 void
 2174 softdep_setup_freeblocks(ip, length, flags)
 2175         struct inode *ip;       /* The inode whose length is to be reduced */
 2176         off_t length;           /* The new length for the file */
 2177         int flags;              /* IO_EXT and/or IO_NORMAL */
 2178 {
 2179         struct freeblks *freeblks;
 2180         struct inodedep *inodedep;
 2181         struct allocdirect *adp;
 2182         struct vnode *vp;
 2183         struct buf *bp;
 2184         struct fs *fs;
 2185         ufs2_daddr_t extblocks, datablocks;
 2186         struct mount *mp;
 2187         int i, delay, error;
 2188 
 2189         fs = ip->i_fs;
 2190         mp = UFSTOVFS(ip->i_ump);
 2191         if (length != 0)
 2192                 panic("softdep_setup_freeblocks: non-zero length");
 2193         MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
 2194                 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
 2195         workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
 2196         freeblks->fb_state = ATTACHED;
 2197         freeblks->fb_uid = ip->i_uid;
 2198         freeblks->fb_previousinum = ip->i_number;
 2199         freeblks->fb_devvp = ip->i_devvp;
 2200         ACQUIRE_LOCK(&lk);
 2201         num_freeblkdep++;
 2202         FREE_LOCK(&lk);
 2203         extblocks = 0;
 2204         if (fs->fs_magic == FS_UFS2_MAGIC)
 2205                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 2206         datablocks = DIP(ip, i_blocks) - extblocks;
 2207         if ((flags & IO_NORMAL) == 0) {
 2208                 freeblks->fb_oldsize = 0;
 2209                 freeblks->fb_chkcnt = 0;
 2210         } else {
 2211                 freeblks->fb_oldsize = ip->i_size;
 2212                 ip->i_size = 0;
 2213                 DIP_SET(ip, i_size, 0);
 2214                 freeblks->fb_chkcnt = datablocks;
 2215                 for (i = 0; i < NDADDR; i++) {
 2216                         freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
 2217                         DIP_SET(ip, i_db[i], 0);
 2218                 }
 2219                 for (i = 0; i < NIADDR; i++) {
 2220                         freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
 2221                         DIP_SET(ip, i_ib[i], 0);
 2222                 }
 2223                 /*
 2224                  * If the file was removed, then the space being freed was
 2225                  * accounted for then (see softdep_releasefile()). If the
 2226                  * file is merely being truncated, then we account for it now.
 2227                  */
 2228                 if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
 2229                         UFS_LOCK(ip->i_ump);
 2230                         fs->fs_pendingblocks += datablocks;
 2231                         UFS_UNLOCK(ip->i_ump);
 2232                 }
 2233         }
 2234         if ((flags & IO_EXT) == 0) {
 2235                 freeblks->fb_oldextsize = 0;
 2236         } else {
 2237                 freeblks->fb_oldextsize = ip->i_din2->di_extsize;
 2238                 ip->i_din2->di_extsize = 0;
 2239                 freeblks->fb_chkcnt += extblocks;
 2240                 for (i = 0; i < NXADDR; i++) {
 2241                         freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
 2242                         ip->i_din2->di_extb[i] = 0;
 2243                 }
 2244         }
 2245         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
 2246         /*
 2247          * Push the zero'ed inode to its disk buffer so that we are free
 2248          * to delete its dependencies below. Once the dependencies are gone
 2249          * the buffer can be safely released.
 2250          */
 2251         if ((error = bread(ip->i_devvp,
 2252             fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 2253             (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
 2254                 brelse(bp);
 2255                 softdep_error("softdep_setup_freeblocks", error);
 2256         }
 2257         if (ip->i_ump->um_fstype == UFS1)
 2258                 *((struct ufs1_dinode *)bp->b_data +
 2259                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
 2260         else
 2261                 *((struct ufs2_dinode *)bp->b_data +
 2262                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
 2263         /*
 2264          * Find and eliminate any inode dependencies.
 2265          */
 2266         ACQUIRE_LOCK(&lk);
 2267         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 2268         if ((inodedep->id_state & IOSTARTED) != 0)
 2269                 panic("softdep_setup_freeblocks: inode busy");
 2270         /*
 2271          * Add the freeblks structure to the list of operations that
 2272          * must await the zero'ed inode being written to disk. If we
 2273          * still have a bitmap dependency (delay == 0), then the inode
 2274          * has never been written to disk, so we can process the
 2275          * freeblks below once we have deleted the dependencies.
 2276          */
 2277         delay = (inodedep->id_state & DEPCOMPLETE);
 2278         if (delay)
 2279                 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
 2280         /*
 2281          * Because the file length has been truncated to zero, any
 2282          * pending block allocation dependency structures associated
 2283          * with this inode are obsolete and can simply be de-allocated.
 2284          * We must first merge the two dependency lists to get rid of
 2285          * any duplicate freefrag structures, then purge the merged list.
 2286          * If we still have a bitmap dependency, then the inode has never
 2287          * been written to disk, so we can free any fragments without delay.
 2288          */
 2289         if (flags & IO_NORMAL) {
 2290                 merge_inode_lists(&inodedep->id_newinoupdt,
 2291                     &inodedep->id_inoupdt);
 2292                 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
 2293                         free_allocdirect(&inodedep->id_inoupdt, adp, delay);
 2294         }
 2295         if (flags & IO_EXT) {
 2296                 merge_inode_lists(&inodedep->id_newextupdt,
 2297                     &inodedep->id_extupdt);
 2298                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
 2299                         free_allocdirect(&inodedep->id_extupdt, adp, delay);
 2300         }
 2301         FREE_LOCK(&lk);
 2302         bdwrite(bp);
 2303         /*
 2304          * We must wait for any I/O in progress to finish so that
 2305          * all potential buffers on the dirty list will be visible.
 2306          * Once they are all there, walk the list and get rid of
 2307          * any dependencies.
 2308          */
 2309         vp = ITOV(ip);
 2310         VI_LOCK(vp);
 2311         drain_output(vp);
 2312 restart:
 2313         TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
 2314                 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
 2315                     ((flags & IO_NORMAL) == 0 &&
 2316                       (bp->b_xflags & BX_ALTDATA) == 0))
 2317                         continue;
 2318                 if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL)
 2319                         goto restart;
 2320                 VI_UNLOCK(vp);
 2321                 ACQUIRE_LOCK(&lk);
 2322                 (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
 2323                 deallocate_dependencies(bp, inodedep);
 2324                 FREE_LOCK(&lk);
 2325                 bp->b_flags |= B_INVAL | B_NOCACHE;
 2326                 brelse(bp);
 2327                 VI_LOCK(vp);
 2328                 goto restart;
 2329         }
 2330         VI_UNLOCK(vp);
 2331         ACQUIRE_LOCK(&lk);
 2332         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 2333                 (void) free_inodedep(inodedep);
 2334 
 2335         if (delay) {
 2336                 freeblks->fb_state |= DEPCOMPLETE;
 2337                 /*
 2338                  * If the inode with zeroed block pointers is now on disk
 2339                  * we can start freeing blocks. Add freeblks to the worklist
 2340          * instead of calling handle_workitem_freeblocks directly, as
 2341                  * it is more likely that additional IO is needed to complete
 2342                  * the request here than in the !delay case.
 2343                  */  
 2344                 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
 2345                         add_to_worklist(&freeblks->fb_list);
 2346         }
 2347 
 2348         FREE_LOCK(&lk);
 2349         /*
 2350          * If the inode has never been written to disk (delay == 0),
 2351          * then we can process the freeblks now that we have deleted
 2352          * the dependencies.
 2353          */
 2354         if (!delay)
 2355                 handle_workitem_freeblocks(freeblks, 0);
 2356 }
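
/*
 * Illustrative recap (invented helper names) of the ordering enforced
 * above: pointers are nullified and the zero'ed inode pushed to disk
 * before any freemap update, so a crash can at worst leak blocks and
 * never mark live data free.
 *
 *      save_block_pointers(ip, freeblks);        // remember what to free
 *      zero_block_pointers(ip);                  // nullify in-core copy
 *      bdwrite(bp);                              // zero'ed inode to disk
 *      ...                                       // once it is on disk:
 *      handle_workitem_freeblocks(freeblks, 0);  // update the freemaps
 */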
 2357 
 2358 /*
 2359  * Reclaim any dependency structures from a buffer that is about to
 2360  * be reallocated to a new vnode. The buffer must be locked; thus,
 2361  * no I/O completion operations can occur while we are manipulating
 2362  * its associated dependencies. The mutex is held so that other I/Os
 2363  * associated with related dependencies do not occur.
 2364  */
 2365 static void
 2366 deallocate_dependencies(bp, inodedep)
 2367         struct buf *bp;
 2368         struct inodedep *inodedep;
 2369 {
 2370         struct worklist *wk;
 2371         struct indirdep *indirdep;
 2372         struct allocindir *aip;
 2373         struct pagedep *pagedep;
 2374         struct dirrem *dirrem;
 2375         struct diradd *dap;
 2376         int i;
 2377 
 2378         mtx_assert(&lk, MA_OWNED);
 2379         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 2380                 switch (wk->wk_type) {
 2381 
 2382                 case D_INDIRDEP:
 2383                         indirdep = WK_INDIRDEP(wk);
 2384                         /*
 2385                          * None of the indirect pointers will ever be visible,
 2386                          * so they can simply be tossed. GOINGAWAY ensures
 2387                          * that allocated pointers will be saved in the buffer
 2388                          * cache until they are freed. Note that they will
 2389                          * only be able to be found by their physical address
 2390                          * since the inode mapping the logical address will
 2391                          * be gone. The save buffer used for the safe copy
 2392                          * was allocated in setup_allocindir_phase2 using
 2393                          * the physical address so it could be used for this
 2394                          * purpose. Hence we swap the safe copy with the real
 2395                          * copy, allowing the safe copy to be freed and holding
 2396                          * on to the real copy for later use in indir_trunc.
 2397                          */
 2398                         if (indirdep->ir_state & GOINGAWAY)
 2399                                 panic("deallocate_dependencies: already gone");
 2400                         indirdep->ir_state |= GOINGAWAY;
 2401                         VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
 2402                         while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
 2403                                 free_allocindir(aip, inodedep);
 2404                         if (bp->b_lblkno >= 0 ||
 2405                             bp->b_blkno != indirdep->ir_savebp->b_lblkno)
 2406                                 panic("deallocate_dependencies: not indir");
 2407                         bcopy(bp->b_data, indirdep->ir_savebp->b_data,
 2408                             bp->b_bcount);
 2409                         WORKLIST_REMOVE(wk);
 2410                         WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
 2411                         continue;
 2412 
 2413                 case D_PAGEDEP:
 2414                         pagedep = WK_PAGEDEP(wk);
 2415                         /*
 2416                          * None of the directory additions will ever be
 2417                          * visible, so they can simply be tossed.
 2418                          */
 2419                         for (i = 0; i < DAHASHSZ; i++)
 2420                                 while ((dap =
 2421                                     LIST_FIRST(&pagedep->pd_diraddhd[i])))
 2422                                         free_diradd(dap);
 2423                         while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
 2424                                 free_diradd(dap);
 2425                         /*
 2426                          * Copy any directory remove dependencies to the list
 2427                          * to be processed after the zero'ed inode is written.
 2428                          * If the inode has already been written, then they 
 2429                          * can be dumped directly onto the work list.
 2430                          */
 2431                         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
 2432                                 LIST_REMOVE(dirrem, dm_next);
 2433                                 dirrem->dm_dirinum = pagedep->pd_ino;
 2434                                 if (inodedep == NULL ||
 2435                                     (inodedep->id_state & ALLCOMPLETE) ==
 2436                                      ALLCOMPLETE)
 2437                                         add_to_worklist(&dirrem->dm_list);
 2438                                 else
 2439                                         WORKLIST_INSERT(&inodedep->id_bufwait,
 2440                                             &dirrem->dm_list);
 2441                         }
 2442                         if ((pagedep->pd_state & NEWBLOCK) != 0) {
 2443                                 LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
 2444                                         if (wk->wk_type == D_NEWDIRBLK &&
 2445                                             WK_NEWDIRBLK(wk)->db_pagedep ==
 2446                                               pagedep)
 2447                                                 break;
 2448                                 if (wk != NULL) {
 2449                                         WORKLIST_REMOVE(wk);
 2450                                         free_newdirblk(WK_NEWDIRBLK(wk));
 2451                                 } else
 2452                                         panic("deallocate_dependencies: "
 2453                                               "lost pagedep");
 2454                         }
 2455                         WORKLIST_REMOVE(&pagedep->pd_list);
 2456                         LIST_REMOVE(pagedep, pd_hash);
 2457                         WORKITEM_FREE(pagedep, D_PAGEDEP);
 2458                         continue;
 2459 
 2460                 case D_ALLOCINDIR:
 2461                         free_allocindir(WK_ALLOCINDIR(wk), inodedep);
 2462                         continue;
 2463 
 2464                 case D_ALLOCDIRECT:
 2465                 case D_INODEDEP:
 2466                         panic("deallocate_dependencies: Unexpected type %s",
 2467                             TYPENAME(wk->wk_type));
 2468                         /* NOTREACHED */
 2469 
 2470                 default:
 2471                         panic("deallocate_dependencies: Unknown type %s",
 2472                             TYPENAME(wk->wk_type));
 2473                         /* NOTREACHED */
 2474                 }
 2475         }
 2476 }
 2477 
 2478 /*
 2479  * Free an allocdirect. Generate a new freefrag work request if appropriate.
 2480  * This routine must be called with splbio interrupts blocked.
 2481  */
 2482 static void
 2483 free_allocdirect(adphead, adp, delay)
 2484         struct allocdirectlst *adphead;
 2485         struct allocdirect *adp;
 2486         int delay;
 2487 {
 2488         struct newdirblk *newdirblk;
 2489         struct worklist *wk;
 2490 
 2491         mtx_assert(&lk, MA_OWNED);
 2492         if ((adp->ad_state & DEPCOMPLETE) == 0)
 2493                 LIST_REMOVE(adp, ad_deps);
 2494         TAILQ_REMOVE(adphead, adp, ad_next);
 2495         if ((adp->ad_state & COMPLETE) == 0)
 2496                 WORKLIST_REMOVE(&adp->ad_list);
 2497         if (adp->ad_freefrag != NULL) {
 2498                 if (delay)
 2499                         WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
 2500                             &adp->ad_freefrag->ff_list);
 2501                 else
 2502                         add_to_worklist(&adp->ad_freefrag->ff_list);
 2503         }
 2504         if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
 2505                 newdirblk = WK_NEWDIRBLK(wk);
 2506                 WORKLIST_REMOVE(&newdirblk->db_list);
 2507                 if (!LIST_EMPTY(&adp->ad_newdirblk))
 2508                         panic("free_allocdirect: extra newdirblk");
 2509                 if (delay)
 2510                         WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
 2511                             &newdirblk->db_list);
 2512                 else
 2513                         free_newdirblk(newdirblk);
 2514         }
 2515         WORKITEM_FREE(adp, D_ALLOCDIRECT);
 2516 }
 2517 
 2518 /*
 2519  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
 2520  * This routine must be called with splbio interrupts blocked.
 2521  */
 2522 static void
 2523 free_newdirblk(newdirblk)
 2524         struct newdirblk *newdirblk;
 2525 {
 2526         struct pagedep *pagedep;
 2527         struct diradd *dap;
 2528         int i;
 2529 
 2530         mtx_assert(&lk, MA_OWNED);
 2531         /*
 2532          * If the pagedep is still linked onto the directory buffer
 2533          * dependency chain, then some of the entries on the
 2534          * pd_pendinghd list may not be committed to disk yet. In
 2535          * this case, we will simply clear the NEWBLOCK flag and
 2536          * let the pd_pendinghd list be processed when the pagedep
 2537          * is next written. If the pagedep is no longer on the buffer
 2538          * dependency chain, then all the entries on the pd_pendinghd
 2539          * list are committed to disk and we can free them here.
 2540          */
 2541         pagedep = newdirblk->db_pagedep;
 2542         pagedep->pd_state &= ~NEWBLOCK;
 2543         if ((pagedep->pd_state & ONWORKLIST) == 0)
 2544                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 2545                         free_diradd(dap);
 2546         /*
 2547          * If no dependencies remain, the pagedep will be freed.
 2548          */
 2549         for (i = 0; i < DAHASHSZ; i++)
 2550                 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
 2551                         break;
 2552         if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
 2553                 LIST_REMOVE(pagedep, pd_hash);
 2554                 WORKITEM_FREE(pagedep, D_PAGEDEP);
 2555         }
 2556         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 2557 }
 2558 
 2559 /*
 2560  * Prepare an inode to be freed. The actual free operation is not
 2561  * done until the zero'ed inode has been written to disk.
 2562  */
 2563 void
 2564 softdep_freefile(pvp, ino, mode)
 2565         struct vnode *pvp;
 2566         ino_t ino;
 2567         int mode;
 2568 {
 2569         struct inode *ip = VTOI(pvp);
 2570         struct inodedep *inodedep;
 2571         struct freefile *freefile;
 2572 
 2573         /*
 2574          * This sets up the inode de-allocation dependency.
 2575          */
 2576         MALLOC(freefile, struct freefile *, sizeof(struct freefile),
 2577                 M_FREEFILE, M_SOFTDEP_FLAGS);
 2578         workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
 2579         freefile->fx_mode = mode;
 2580         freefile->fx_oldinum = ino;
 2581         freefile->fx_devvp = ip->i_devvp;
 2582         if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
 2583                 UFS_LOCK(ip->i_ump);
 2584                 ip->i_fs->fs_pendinginodes += 1;
 2585                 UFS_UNLOCK(ip->i_ump);
 2586         }
 2587 
 2588         /*
 2589          * If the inodedep does not exist, then the zero'ed inode has
 2590          * been written to disk. If the allocated inode has never been
 2591          * written to disk, then the on-disk inode is zero'ed. In either
 2592          * case we can free the file immediately.
 2593          */
 2594         ACQUIRE_LOCK(&lk);
 2595         if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
 2596             check_inode_unwritten(inodedep)) {
 2597                 FREE_LOCK(&lk);
 2598                 handle_workitem_freefile(freefile);
 2599                 return;
 2600         }
 2601         WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
 2602         FREE_LOCK(&lk);
 2603         ip->i_flag |= IN_MODIFIED;
 2604 }
 2605 
 2606 /*
 2607  * Check to see if an inode has never been written to disk. If
 2608  * so free the inodedep and return success, otherwise return failure.
 2609  * This routine must be called with splbio interrupts blocked.
 2610  *
 2611  * If we still have a bitmap dependency, then the inode has never
 2612  * been written to disk. Drop the dependency as it is no longer
 2613  * necessary since the inode is being deallocated. We set the
 2614  * ALLCOMPLETE flags since the bitmap now properly shows that the
 2615  * inode is not allocated. Even if the inode is actively being
 2616  * written, it has been rolled back to its zero'ed state, so we
 2617  * are assured that a zero'ed inode is what is on the disk. For short
 2618  * lived files, this change will usually result in removing all the
 2619  * dependencies from the inode so that it can be freed immediately.
 2620  */
 2621 static int
 2622 check_inode_unwritten(inodedep)
 2623         struct inodedep *inodedep;
 2624 {
 2625 
 2626         mtx_assert(&lk, MA_OWNED);
 2627         if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
 2628             !LIST_EMPTY(&inodedep->id_pendinghd) ||
 2629             !LIST_EMPTY(&inodedep->id_bufwait) ||
 2630             !LIST_EMPTY(&inodedep->id_inowait) ||
 2631             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 2632             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
 2633             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 2634             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 2635             inodedep->id_nlinkdelta != 0)
 2636                 return (0);
 2637 
 2638         /*
 2639          * Another process might be in initiate_write_inodeblock_ufs[12]
 2640          * trying to allocate memory without holding "Softdep Lock".
 2641          */
 2642         if ((inodedep->id_state & IOSTARTED) != 0 &&
 2643             inodedep->id_savedino1 == NULL)
 2644                 return (0);
 2645 
 2646         inodedep->id_state |= ALLCOMPLETE;
 2647         LIST_REMOVE(inodedep, id_deps);
 2648         inodedep->id_buf = NULL;
 2649         if (inodedep->id_state & ONWORKLIST)
 2650                 WORKLIST_REMOVE(&inodedep->id_list);
 2651         if (inodedep->id_savedino1 != NULL) {
 2652                 FREE(inodedep->id_savedino1, M_SAVEDINO);
 2653                 inodedep->id_savedino1 = NULL;
 2654         }
 2655         if (free_inodedep(inodedep) == 0)
 2656                 panic("check_inode_unwritten: busy inode");
 2657         return (1);
 2658 }
 2659 
 2660 /*
 2661  * Try to free an inodedep structure. Return 1 if it could be freed.
 2662  */
 2663 static int
 2664 free_inodedep(inodedep)
 2665         struct inodedep *inodedep;
 2666 {
 2667 
 2668         mtx_assert(&lk, MA_OWNED);
 2669         if ((inodedep->id_state & ONWORKLIST) != 0 ||
 2670             (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
 2671             !LIST_EMPTY(&inodedep->id_pendinghd) ||
 2672             !LIST_EMPTY(&inodedep->id_bufwait) ||
 2673             !LIST_EMPTY(&inodedep->id_inowait) ||
 2674             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 2675             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
 2676             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 2677             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 2678             inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
 2679                 return (0);
 2680         LIST_REMOVE(inodedep, id_hash);
 2681         WORKITEM_FREE(inodedep, D_INODEDEP);
 2682         num_inodedep -= 1;
 2683         return (1);
 2684 }
 2685 
 2686 /*
 2687  * This workitem routine performs the block de-allocation.
 2688  * The workitem is added to the pending list after the updated
 2689  * inode block has been written to disk.  As mentioned above,
 2690  * checks regarding the number of blocks de-allocated (compared
 2691  * to the number of blocks allocated for the file) are also
 2692  * performed in this function.
 2693  */
 2694 static void
 2695 handle_workitem_freeblocks(freeblks, flags)
 2696         struct freeblks *freeblks;
 2697         int flags;
 2698 {
 2699         struct inode *ip;
 2700         struct vnode *vp;
 2701         struct fs *fs;
 2702         struct ufsmount *ump;
 2703         int i, nblocks, level, bsize;
 2704         ufs2_daddr_t bn, blocksreleased = 0;
 2705         int error, allerror = 0;
 2706         ufs_lbn_t baselbns[NIADDR], tmpval;
 2707         int fs_pendingblocks;
 2708 
 2709         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 2710         fs = ump->um_fs;
 2711         fs_pendingblocks = 0;
 2712         tmpval = 1;
 2713         baselbns[0] = NDADDR;
 2714         for (i = 1; i < NIADDR; i++) {
 2715                 tmpval *= NINDIR(fs);
 2716                 baselbns[i] = baselbns[i - 1] + tmpval;
 2717         }
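	/*
	 * Editor's note: baselbns[level] is the first logical block number
	 * mapped through the indirect block at that level.  For example,
	 * with 8K blocks on UFS2, NINDIR(fs) == 8192 / 8 == 1024 and
	 * NDADDR == 12, so baselbns == { 12, 1036, 1049612 }: the single
	 * indirect covers lbns 12..1035, the double indirect 1036..1049611.
	 */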
 2718         nblocks = btodb(fs->fs_bsize);
 2719         blocksreleased = 0;
 2720         /*
 2721          * Release all extended attribute blocks or frags.
 2722          */
 2723         if (freeblks->fb_oldextsize > 0) {
 2724                 for (i = (NXADDR - 1); i >= 0; i--) {
 2725                         if ((bn = freeblks->fb_eblks[i]) == 0)
 2726                                 continue;
 2727                         bsize = sblksize(fs, freeblks->fb_oldextsize, i);
 2728                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
 2729                             freeblks->fb_previousinum);
 2730                         blocksreleased += btodb(bsize);
 2731                 }
 2732         }
 2733         /*
 2734          * Release all data blocks or frags.
 2735          */
 2736         if (freeblks->fb_oldsize > 0) {
 2737                 /*
 2738                  * Indirect blocks first.
 2739                  */
 2740                 for (level = (NIADDR - 1); level >= 0; level--) {
 2741                         if ((bn = freeblks->fb_iblks[level]) == 0)
 2742                                 continue;
 2743                         if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
 2744                             level, baselbns[level], &blocksreleased)) != 0)
 2745                                 allerror = error;
 2746                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
 2747                             fs->fs_bsize, freeblks->fb_previousinum);
 2748                         fs_pendingblocks += nblocks;
 2749                         blocksreleased += nblocks;
 2750                 }
 2751                 /*
 2752                  * All direct blocks or frags.
 2753                  */
 2754                 for (i = (NDADDR - 1); i >= 0; i--) {
 2755                         if ((bn = freeblks->fb_dblks[i]) == 0)
 2756                                 continue;
 2757                         bsize = sblksize(fs, freeblks->fb_oldsize, i);
 2758                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
 2759                             freeblks->fb_previousinum);
 2760                         fs_pendingblocks += btodb(bsize);
 2761                         blocksreleased += btodb(bsize);
 2762                 }
 2763         }
 2764         UFS_LOCK(ump);
 2765         fs->fs_pendingblocks -= fs_pendingblocks;
 2766         UFS_UNLOCK(ump);
 2767         /*
 2768          * If we still have not finished background cleanup, then check
 2769          * to see if the block count needs to be adjusted.
 2770          */
 2771         if (freeblks->fb_chkcnt != blocksreleased &&
 2772             (fs->fs_flags & FS_UNCLEAN) != 0 &&
 2773             ffs_vget(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
 2774             (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
 2775                 ip = VTOI(vp);
 2776                 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) +
 2777                     freeblks->fb_chkcnt - blocksreleased);
 2778                 ip->i_flag |= IN_CHANGE;
 2779                 vput(vp);
 2780         }
 2781 
 2782 #ifdef INVARIANTS
 2783         if (freeblks->fb_chkcnt != blocksreleased &&
 2784             ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
 2785                 printf("handle_workitem_freeblocks: block count\n");
 2786         if (allerror)
 2787                 softdep_error("handle_workitem_freeblocks", allerror);
 2788 #endif /* INVARIANTS */
 2789 
 2790         ACQUIRE_LOCK(&lk);
 2791         WORKITEM_FREE(freeblks, D_FREEBLKS);
 2792         num_freeblkdep--;
 2793         FREE_LOCK(&lk);
 2794 }
 2795 
 2796 /*
 2797  * Release blocks associated with the freeblks request and stored in the
 2798  * indirect block dbn. If level is greater than SINGLE, the block is an
 2799  * indirect block and recursive calls to indir_trunc must be used to
 2800  * cleanse other indirect blocks.
 2801  */
 2802 static int
 2803 indir_trunc(freeblks, dbn, level, lbn, countp)
 2804         struct freeblks *freeblks;
 2805         ufs2_daddr_t dbn;
 2806         int level;
 2807         ufs_lbn_t lbn;
 2808         ufs2_daddr_t *countp;
 2809 {
 2810         struct buf *bp;
 2811         struct fs *fs;
 2812         struct worklist *wk;
 2813         struct indirdep *indirdep;
 2814         struct ufsmount *ump;
 2815         ufs1_daddr_t *bap1 = NULL;
 2816         ufs2_daddr_t nb, *bap2 = NULL;
 2817         ufs_lbn_t lbnadd;
 2818         int i, nblocks, ufs1fmt;
 2819         int error, allerror = 0;
 2820         int fs_pendingblocks;
 2821 
 2822         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 2823         fs = ump->um_fs;
 2824         fs_pendingblocks = 0;
 2825         lbnadd = 1;
 2826         for (i = level; i > 0; i--)
 2827                 lbnadd *= NINDIR(fs);
 2828         /*
 2829          * Get buffer of block pointers to be freed. This routine is not
 2830          * called until the zero'ed inode has been written, so it is safe
 2831          * to free blocks as they are encountered. Because the inode has
 2832          * been zero'ed, calls to bmap on these blocks will fail. So, we
 2833          * have to use the on-disk address and the block device for the
 2834          * filesystem to look them up. If the file was deleted before its
 2835          * indirect blocks were all written to disk, the routine that set
 2836          * us up (deallocate_dependencies) will have arranged to leave
 2837          * a complete copy of the indirect block in memory for our use.
 2838          * Otherwise we have to read the blocks in from the disk.
 2839          */
 2840 #ifdef notyet
 2841         bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
 2842             GB_NOCREAT);
 2843 #else
 2844         bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
 2845 #endif
 2846         ACQUIRE_LOCK(&lk);
 2847         if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 2848                 if (wk->wk_type != D_INDIRDEP ||
 2849                     (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
 2850                     (indirdep->ir_state & GOINGAWAY) == 0)
 2851                         panic("indir_trunc: lost indirdep");
 2852                 WORKLIST_REMOVE(wk);
 2853                 WORKITEM_FREE(indirdep, D_INDIRDEP);
 2854                 if (!LIST_EMPTY(&bp->b_dep))
 2855                         panic("indir_trunc: dangling dep");
 2856                 ump->um_numindirdeps -= 1;
 2857                 FREE_LOCK(&lk);
 2858         } else {
 2859 #ifdef notyet
 2860                 if (bp)
 2861                         brelse(bp);
 2862 #endif
 2863                 FREE_LOCK(&lk);
 2864                 error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
 2865                     NOCRED, &bp);
 2866                 if (error) {
 2867                         brelse(bp);
 2868                         return (error);
 2869                 }
 2870         }
 2871         /*
 2872          * Recursively free indirect blocks.
 2873          */
 2874         if (ump->um_fstype == UFS1) {
 2875                 ufs1fmt = 1;
 2876                 bap1 = (ufs1_daddr_t *)bp->b_data;
 2877         } else {
 2878                 ufs1fmt = 0;
 2879                 bap2 = (ufs2_daddr_t *)bp->b_data;
 2880         }
 2881         nblocks = btodb(fs->fs_bsize);
 2882         for (i = NINDIR(fs) - 1; i >= 0; i--) {
 2883                 if (ufs1fmt)
 2884                         nb = bap1[i];
 2885                 else
 2886                         nb = bap2[i];
 2887                 if (nb == 0)
 2888                         continue;
 2889                 if (level != 0) {
 2890                         if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
 2891                              level - 1, lbn + (i * lbnadd), countp)) != 0)
 2892                                 allerror = error;
 2893                 }
 2894                 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
 2895                     freeblks->fb_previousinum);
 2896                 fs_pendingblocks += nblocks;
 2897                 *countp += nblocks;
 2898         }
 2899         UFS_LOCK(ump);
 2900         fs->fs_pendingblocks -= fs_pendingblocks;
 2901         UFS_UNLOCK(ump);
 2902         bp->b_flags |= B_INVAL | B_NOCACHE;
 2903         brelse(bp);
 2904         return (allerror);
 2905 }
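
/*
 * Editor's sketch (compiled out): the shape of the recursion above with
 * the buffer cache, locking, and dependency bookkeeping stripped away.
 * "readblk" and "blkfree" are hypothetical stand-ins for bread()/incore()
 * and ffs_blkfree(); nindir plays the role of NINDIR(fs).  As above, the
 * root block itself is freed by the caller, not by the recursion.
 */
#if 0
static void
indir_trunc_sketch(uint64_t bn, int nindir, int level,
    uint64_t *(*readblk)(uint64_t bn), void (*blkfree)(uint64_t bn))
{
	uint64_t *bap = readblk(bn);	/* block of block pointers */
	int i;

	for (i = nindir - 1; i >= 0; i--) {
		if (bap[i] == 0)
			continue;
		if (level > 0)
			indir_trunc_sketch(bap[i], nindir, level - 1,
			    readblk, blkfree);
		blkfree(bap[i]);
	}
}
#endif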
 2906 
 2907 /*
 2908  * Free an allocindir.
 2909  * This routine must be called with the soft dependency lock (lk) held.
 2910  */
 2911 static void
 2912 free_allocindir(aip, inodedep)
 2913         struct allocindir *aip;
 2914         struct inodedep *inodedep;
 2915 {
 2916         struct freefrag *freefrag;
 2917 
 2918         mtx_assert(&lk, MA_OWNED);
 2919         if ((aip->ai_state & DEPCOMPLETE) == 0)
 2920                 LIST_REMOVE(aip, ai_deps);
 2921         if (aip->ai_state & ONWORKLIST)
 2922                 WORKLIST_REMOVE(&aip->ai_list);
 2923         LIST_REMOVE(aip, ai_next);
 2924         if ((freefrag = aip->ai_freefrag) != NULL) {
 2925                 if (inodedep == NULL)
 2926                         add_to_worklist(&freefrag->ff_list);
 2927                 else
 2928                         WORKLIST_INSERT(&inodedep->id_bufwait,
 2929                             &freefrag->ff_list);
 2930         }
 2931         WORKITEM_FREE(aip, D_ALLOCINDIR);
 2932 }
 2933 
 2934 /*
 2935  * Directory entry addition dependencies.
 2936  * 
 2937  * When adding a new directory entry, the inode (with its incremented link
 2938  * count) must be written to disk before the directory entry's pointer to it.
 2939  * Also, if the inode is newly allocated, the corresponding freemap must be
 2940  * updated (on disk) before the directory entry's pointer. These requirements
 2941  * are met via undo/redo on the directory entry's pointer, which consists
 2942  * simply of the inode number.
 2943  * 
 2944  * As directory entries are added and deleted, the free space within a
 2945  * directory block can become fragmented.  The ufs filesystem will compact
 2946  * a fragmented directory block to make space for a new entry. When this
 2947  * occurs, the offsets of previously added entries change. Any "diradd"
 2948  * dependency structures corresponding to these entries must be updated with
 2949  * the new offsets.
 2950  */
 2951 
 2952 /*
 2953  * This routine is called after the in-memory inode's link
 2954  * count has been incremented, but before the directory entry's
 2955  * pointer to the inode has been set.
 2956  */
 2957 int
 2958 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 2959         struct buf *bp;         /* buffer containing directory block */
 2960         struct inode *dp;       /* inode for directory */
 2961         off_t diroffset;        /* offset of new entry in directory */
 2962         ino_t newinum;          /* inode referenced by new directory entry */
 2963         struct buf *newdirbp;   /* non-NULL => contents of new mkdir */
 2964         int isnewblk;           /* entry is in a newly allocated block */
 2965 {
 2966         int offset;             /* offset of new entry within directory block */
 2967         ufs_lbn_t lbn;          /* block in directory containing new entry */
 2968         struct fs *fs;
 2969         struct diradd *dap;
 2970         struct allocdirect *adp;
 2971         struct pagedep *pagedep;
 2972         struct inodedep *inodedep;
 2973         struct newdirblk *newdirblk = NULL;
 2974         struct mkdir *mkdir1, *mkdir2;
 2975         struct mount *mp;
 2976 
 2977         /*
 2978          * Whiteouts have no dependencies.
 2979          */
 2980         if (newinum == WINO) {
 2981                 if (newdirbp != NULL)
 2982                         bdwrite(newdirbp);
 2983                 return (0);
 2984         }
 2985         mp = UFSTOVFS(dp->i_ump);
 2986         fs = dp->i_fs;
 2987         lbn = lblkno(fs, diroffset);
 2988         offset = blkoff(fs, diroffset);
 2989         MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
 2990                 M_SOFTDEP_FLAGS|M_ZERO);
 2991         workitem_alloc(&dap->da_list, D_DIRADD, mp);
 2992         dap->da_offset = offset;
 2993         dap->da_newinum = newinum;
 2994         dap->da_state = ATTACHED;
 2995         if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
 2996                 MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
 2997                     M_NEWDIRBLK, M_SOFTDEP_FLAGS);
 2998                 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
 2999         }
 3000         if (newdirbp == NULL) {
 3001                 dap->da_state |= DEPCOMPLETE;
 3002                 ACQUIRE_LOCK(&lk);
 3003         } else {
 3004                 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
 3005                 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
 3006                     M_SOFTDEP_FLAGS);
 3007                 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
 3008                 mkdir1->md_state = MKDIR_BODY;
 3009                 mkdir1->md_diradd = dap;
 3010                 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
 3011                     M_SOFTDEP_FLAGS);
 3012                 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
 3013                 mkdir2->md_state = MKDIR_PARENT;
 3014                 mkdir2->md_diradd = dap;
 3015                 /*
 3016                  * Dependency on "." and ".." being written to disk.
 3017                  */
 3018                 mkdir1->md_buf = newdirbp;
 3019                 ACQUIRE_LOCK(&lk);
 3020                 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
 3021                 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
 3022                 FREE_LOCK(&lk);
 3023                 bdwrite(newdirbp);
 3024                 /*
 3025                  * Dependency on link count increase for parent directory.
 3026                  */
 3027                 ACQUIRE_LOCK(&lk);
 3028                 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
 3029                     || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 3030                         dap->da_state &= ~MKDIR_PARENT;
 3031                         WORKITEM_FREE(mkdir2, D_MKDIR);
 3032                 } else {
 3033                         LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
 3034                         WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
 3035                 }
 3036         }
 3037         /*
 3038          * Link into parent directory pagedep to await its being written.
 3039          */
 3040         if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
 3041                 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 3042         dap->da_pagedep = pagedep;
 3043         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
 3044             da_pdlist);
 3045         /*
 3046          * Link into its inodedep. Put it on the id_bufwait list if the inode
 3047          * is not yet written. If it is written, do the post-inode write
 3048          * processing to put it on the id_pendinghd list.
 3049          */
 3050         (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
 3051         if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
 3052                 diradd_inode_written(dap, inodedep);
 3053         else
 3054                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 3055         if (isnewblk) {
 3056                 /*
 3057                  * Directories growing into indirect blocks are rare
 3058                  * enough, and new block allocation in those cases
 3059                  * rarer still, that we choose not to bother tracking
 3060                  * them. Rather we simply force the new directory
 3061                  * entry to disk.
 3062                  */
 3063                 if (lbn >= NDADDR) {
 3064                         FREE_LOCK(&lk);
 3065                         /*
 3066                          * We only have a new allocation when at the
 3067                          * beginning of a new block, not when we are
 3068                          * expanding into an existing block.
 3069                          */
 3070                         if (blkoff(fs, diroffset) == 0)
 3071                                 return (1);
 3072                         return (0);
 3073                 }
 3074                 /*
 3075                  * We only have a new allocation when at the beginning
 3076                  * of a new fragment, not when we are expanding into an
 3077                  * existing fragment. Also, there is nothing to do if we
 3078                  * are already tracking this block.
 3079                  */
 3080                 if (fragoff(fs, diroffset) != 0) {
 3081                         FREE_LOCK(&lk);
 3082                         return (0);
 3083                 }
 3084                 if ((pagedep->pd_state & NEWBLOCK) != 0) {
 3085                         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 3086                         FREE_LOCK(&lk);
 3087                         return (0);
 3088                 }
 3089                 /*
 3090                  * Find our associated allocdirect and have it track us.
 3091                  */
 3092                 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
 3093                         panic("softdep_setup_directory_add: lost inodedep");
 3094                 adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
 3095                 if (adp == NULL || adp->ad_lbn != lbn)
 3096                         panic("softdep_setup_directory_add: lost entry");
 3097                 pagedep->pd_state |= NEWBLOCK;
 3098                 newdirblk->db_pagedep = pagedep;
 3099                 WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
 3100         }
 3101         FREE_LOCK(&lk);
 3102         return (0);
 3103 }
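
/*
 * Editor's sketch of a hypothetical caller (the real one is
 * ufs_direnter() in ufs_lookup.c and differs in detail): a nonzero
 * return means the entry begins a new block in the untracked indirect
 * range, so the directory block must be forced to disk synchronously.
 */
#if 0
	if (softdep_setup_directory_add(bp, dp, diroffset, newinum,
	    newdirbp, isnewblk))
		error = bwrite(bp);	/* force the new block out now */
	else
		bdwrite(bp);		/* dependencies will order the write */
#endif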
 3104 
 3105 /*
 3106  * This procedure is called to change the offset of a directory
 3107  * entry when compacting a directory block; the block must be held
 3108  * exclusively locked by the caller. Note that the actual entry movement
 3109  * must be done in this procedure to ensure that no I/O completions
 3110  * occur while the move is in progress.
 3111  */
 3112 void 
 3113 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
 3114         struct inode *dp;       /* inode for directory */
 3115         caddr_t base;           /* address of dp->i_offset */
 3116         caddr_t oldloc;         /* address of old directory location */
 3117         caddr_t newloc;         /* address of new directory location */
 3118         int entrysize;          /* size of directory entry */
 3119 {
 3120         int offset, oldoffset, newoffset;
 3121         struct pagedep *pagedep;
 3122         struct diradd *dap;
 3123         ufs_lbn_t lbn;
 3124 
 3125         ACQUIRE_LOCK(&lk);
 3126         lbn = lblkno(dp->i_fs, dp->i_offset);
 3127         offset = blkoff(dp->i_fs, dp->i_offset);
 3128         if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
 3129                 goto done;
 3130         oldoffset = offset + (oldloc - base);
 3131         newoffset = offset + (newloc - base);
 3132 
 3133         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
 3134                 if (dap->da_offset != oldoffset)
 3135                         continue;
 3136                 dap->da_offset = newoffset;
 3137                 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
 3138                         break;
 3139                 LIST_REMOVE(dap, da_pdlist);
 3140                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
 3141                     dap, da_pdlist);
 3142                 break;
 3143         }
 3144         if (dap == NULL) {
 3146                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
 3147                         if (dap->da_offset == oldoffset) {
 3148                                 dap->da_offset = newoffset;
 3149                                 break;
 3150                         }
 3151                 }
 3152         }
 3153 done:
 3154         bcopy(oldloc, newloc, entrysize);
 3155         FREE_LOCK(&lk);
 3156 }
 3157 
 3158 /*
 3159  * Free a diradd dependency structure. This routine must be called
 3160  * with the soft dependency lock (lk) held.
 3161  */
 3162 static void
 3163 free_diradd(dap)
 3164         struct diradd *dap;
 3165 {
 3166         struct dirrem *dirrem;
 3167         struct pagedep *pagedep;
 3168         struct inodedep *inodedep;
 3169         struct mkdir *mkdir, *nextmd;
 3170 
 3171         mtx_assert(&lk, MA_OWNED);
 3172         WORKLIST_REMOVE(&dap->da_list);
 3173         LIST_REMOVE(dap, da_pdlist);
 3174         if ((dap->da_state & DIRCHG) == 0) {
 3175                 pagedep = dap->da_pagedep;
 3176         } else {
 3177                 dirrem = dap->da_previous;
 3178                 pagedep = dirrem->dm_pagedep;
 3179                 dirrem->dm_dirinum = pagedep->pd_ino;
 3180                 add_to_worklist(&dirrem->dm_list);
 3181         }
 3182         if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
 3183             0, &inodedep) != 0)
 3184                 (void) free_inodedep(inodedep);
 3185         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
 3186                 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
 3187                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
 3188                         if (mkdir->md_diradd != dap)
 3189                                 continue;
 3190                         dap->da_state &= ~mkdir->md_state;
 3191                         WORKLIST_REMOVE(&mkdir->md_list);
 3192                         LIST_REMOVE(mkdir, md_mkdirs);
 3193                         WORKITEM_FREE(mkdir, D_MKDIR);
 3194                 }
 3195                 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
 3196                         panic("free_diradd: unfound ref");
 3197         }
 3198         WORKITEM_FREE(dap, D_DIRADD);
 3199 }
 3200 
 3201 /*
 3202  * Directory entry removal dependencies.
 3203  * 
 3204  * When removing a directory entry, the entry's inode pointer must be
 3205  * zero'ed on disk before the corresponding inode's link count is decremented
 3206  * (possibly freeing the inode for re-use). This dependency is handled by
 3207  * updating the directory entry but delaying the inode count reduction until
 3208  * after the directory block has been written to disk. After this point, the
 3209  * inode count can be decremented whenever it is convenient.
 3210  */
 3211 
 3212 /*
 3213  * This routine should be called immediately after removing
 3214  * a directory entry.  The inode's link count should not be
 3215  * decremented by the calling procedure -- the soft updates
 3216  * code will do this task when it is safe.
 3217  */
 3218 void 
 3219 softdep_setup_remove(bp, dp, ip, isrmdir)
 3220         struct buf *bp;         /* buffer containing directory block */
 3221         struct inode *dp;       /* inode for the directory being modified */
 3222         struct inode *ip;       /* inode for directory entry being removed */
 3223         int isrmdir;            /* indicates if doing RMDIR */
 3224 {
 3225         struct dirrem *dirrem, *prevdirrem;
 3226 
 3227         /*
 3228          * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
 3229          */
 3230         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 3231 
 3232         /*
 3233          * If the COMPLETE flag is clear, then there were no active
 3234          * entries and we want to roll back to a zeroed entry until
 3235          * the new inode is committed to disk. If the COMPLETE flag is
 3236          * set then we have deleted an entry that never made it to
 3237          * disk. If the entry we deleted resulted from a name change,
 3238          * then the old name still resides on disk. We cannot delete
 3239          * its inode (returned to us in prevdirrem) until the zeroed
 3240          * directory entry gets to disk. The new inode has never been
 3241          * referenced on the disk, so can be deleted immediately.
 3242          */
 3243         if ((dirrem->dm_state & COMPLETE) == 0) {
 3244                 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
 3245                     dm_next);
 3246                 FREE_LOCK(&lk);
 3247         } else {
 3248                 if (prevdirrem != NULL)
 3249                         LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
 3250                             prevdirrem, dm_next);
 3251                 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
 3252                 FREE_LOCK(&lk);
 3253                 handle_workitem_remove(dirrem, NULL);
 3254         }
 3255 }
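
/*
 * Editor's sketch of a hypothetical caller (cf. ufs_dirremove() in
 * ufs_lookup.c; details differ): with soft updates the block holding
 * the cleared entry may be written lazily, because the rollback code
 * keeps the on-disk state safe; without soft updates the block must
 * reach disk before the inode's link count is decremented.
 */
#if 0
	if (DOINGSOFTDEP(dvp)) {
		softdep_setup_remove(bp, dp, ip, isrmdir);
		bdwrite(bp);
		error = 0;
	} else {
		error = bwrite(bp);
	}
#endif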
 3256 
 3257 /*
 3258  * Allocate a new dirrem if appropriate and return it along with
 3259  * its associated pagedep. Called without lk held; returns with lk held.
 3260  */
 3261 static long num_dirrem;         /* number of dirrem allocated */
 3262 static struct dirrem *
 3263 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
 3264         struct buf *bp;         /* buffer containing directory block */
 3265         struct inode *dp;       /* inode for the directory being modified */
 3266         struct inode *ip;       /* inode for directory entry being removed */
 3267         int isrmdir;            /* indicates if doing RMDIR */
 3268         struct dirrem **prevdirremp; /* previously referenced inode, if any */
 3269 {
 3270         int offset;
 3271         ufs_lbn_t lbn;
 3272         struct diradd *dap;
 3273         struct dirrem *dirrem;
 3274         struct pagedep *pagedep;
 3275 
 3276         /*
 3277          * Whiteouts have no deletion dependencies.
 3278          */
 3279         if (ip == NULL)
 3280                 panic("newdirrem: whiteout");
 3281         /*
 3282          * If we are over our limit, try to improve the situation.
 3283          * Limiting the number of dirrem structures will also limit
 3284          * the number of freefile and freeblks structures.
 3285          */
 3286         ACQUIRE_LOCK(&lk);
 3287         if (num_dirrem > max_softdeps / 2)
 3288                 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
 3289         num_dirrem += 1;
 3290         FREE_LOCK(&lk);
 3291         MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
 3292                 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
 3293         workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
 3294         dirrem->dm_state = isrmdir ? RMDIR : 0;
 3295         dirrem->dm_oldinum = ip->i_number;
 3296         *prevdirremp = NULL;
 3297 
 3298         ACQUIRE_LOCK(&lk);
 3299         lbn = lblkno(dp->i_fs, dp->i_offset);
 3300         offset = blkoff(dp->i_fs, dp->i_offset);
 3301         if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
 3302                 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 3303         dirrem->dm_pagedep = pagedep;
 3304         /*
 3305          * Check for a diradd dependency for the same directory entry.
 3306          * If present, then both dependencies become obsolete and can
 3307  * be de-allocated. Check for an entry on both the pd_diraddhd
 3308          * list and the pd_pendinghd list.
 3309          */
 3310 
 3311         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
 3312                 if (dap->da_offset == offset)
 3313                         break;
 3314         if (dap == NULL) {
 3316                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
 3317                         if (dap->da_offset == offset)
 3318                                 break;
 3319                 if (dap == NULL)
 3320                         return (dirrem);
 3321         }
 3322         /*
 3323          * Must be ATTACHED at this point.
 3324          */
 3325         if ((dap->da_state & ATTACHED) == 0)
 3326                 panic("newdirrem: not ATTACHED");
 3327         if (dap->da_newinum != ip->i_number)
 3328                 panic("newdirrem: inum %ju should be %ju",
 3329                     (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
 3330         /*
 3331          * If we are deleting a changed name that never made it to disk,
 3332          * then return the dirrem describing the previous inode (which
 3333          * represents the inode currently referenced from this entry on disk).
 3334          */
 3335         if ((dap->da_state & DIRCHG) != 0) {
 3336                 *prevdirremp = dap->da_previous;
 3337                 dap->da_state &= ~DIRCHG;
 3338                 dap->da_pagedep = pagedep;
 3339         }
 3340         /*
 3341          * We are deleting an entry that never made it to disk.
 3342          * Mark it COMPLETE so we can delete its inode immediately.
 3343          */
 3344         dirrem->dm_state |= COMPLETE;
 3345         free_diradd(dap);
 3346         return (dirrem);
 3347 }
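
/*
 * Editor's note: the cancellation above covers the create-then-delete
 * case.  For example, "touch f; rm f" before any disk write leaves a
 * diradd that never reached disk; free_diradd() discards it, the dirrem
 * comes back COMPLETE, and the inode can be freed immediately.
 */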
 3348 
 3349 /*
 3350  * Directory entry change dependencies.
 3351  * 
 3352  * Changing an existing directory entry requires that an add operation
 3353  * be completed first followed by a deletion. The semantics for the addition
 3354  * are identical to the description of adding a new entry above except
 3355  * that the rollback is to the old inode number rather than zero. Once
 3356  * the addition dependency is completed, the removal is done as described
 3357  * in the removal routine above.
 3358  */
 3359 
 3360 /*
 3361  * This routine should be called immediately after changing
 3362  * a directory entry.  The inode's link count should not be
 3363  * decremented by the calling procedure -- the soft updates
 3364  * code will perform this task when it is safe.
 3365  */
 3366 void 
 3367 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 3368         struct buf *bp;         /* buffer containing directory block */
 3369         struct inode *dp;       /* inode for the directory being modified */
 3370         struct inode *ip;       /* inode for directory entry being removed */
 3371         ino_t newinum;          /* new inode number for changed entry */
 3372         int isrmdir;            /* indicates if doing RMDIR */
 3373 {
 3374         int offset;
 3375         struct diradd *dap = NULL;
 3376         struct dirrem *dirrem, *prevdirrem;
 3377         struct pagedep *pagedep;
 3378         struct inodedep *inodedep;
 3379         struct mount *mp;
 3380 
 3381         offset = blkoff(dp->i_fs, dp->i_offset);
 3382         mp = UFSTOVFS(dp->i_ump);
 3383 
 3384         /*
 3385          * Whiteouts do not need diradd dependencies.
 3386          */
 3387         if (newinum != WINO) {
 3388                 MALLOC(dap, struct diradd *, sizeof(struct diradd),
 3389                     M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
 3390                 workitem_alloc(&dap->da_list, D_DIRADD, mp);
 3391                 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
 3392                 dap->da_offset = offset;
 3393                 dap->da_newinum = newinum;
 3394         }
 3395 
 3396         /*
 3397          * Allocate a new dirrem and ACQUIRE_LOCK.
 3398          */
 3399         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 3400         pagedep = dirrem->dm_pagedep;
 3401         /*
 3402          * The possible values for isrmdir:
 3403          *      0 - non-directory file rename
 3404          *      1 - directory rename within same directory
 3405          *   inum - directory rename to new directory of given inode number
 3406          * When renaming to a new directory, we are both deleting and
 3407          * creating a new directory entry, so the link count on the new
 3408          * directory should not change. Thus we do not need the followup
 3409          * dirrem which is usually done in handle_workitem_remove. We set
 3410          * the DIRCHG flag to tell handle_workitem_remove to skip the 
 3411          * followup dirrem.
 3412          */
 3413         if (isrmdir > 1)
 3414                 dirrem->dm_state |= DIRCHG;
 3415 
 3416         /*
 3417          * Whiteouts have no additional dependencies,
 3418          * so just put the dirrem on the correct list.
 3419          */
 3420         if (newinum == WINO) {
 3421                 if ((dirrem->dm_state & COMPLETE) == 0) {
 3422                         LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
 3423                             dm_next);
 3424                 } else {
 3425                         dirrem->dm_dirinum = pagedep->pd_ino;
 3426                         add_to_worklist(&dirrem->dm_list);
 3427                 }
 3428                 FREE_LOCK(&lk);
 3429                 return;
 3430         }
 3431 
 3432         /*
 3433          * If the COMPLETE flag is clear, then there were no active
 3434          * entries and we want to roll back to the previous inode until
 3435          * the new inode is committed to disk. If the COMPLETE flag is
 3436          * set, then we have deleted an entry that never made it to disk.
 3437          * If the entry we deleted resulted from a name change, then the old
 3438          * inode reference still resides on disk. Any rollback that we do
 3439          * needs to be to that old inode (returned to us in prevdirrem). If
 3440          * the entry we deleted resulted from a create, then there is
 3441          * no entry on the disk, so we want to roll back to zero rather
 3442          * than the uncommitted inode. In either of the COMPLETE cases we
 3443          * want to immediately free the unwritten and unreferenced inode.
 3444          */
 3445         if ((dirrem->dm_state & COMPLETE) == 0) {
 3446                 dap->da_previous = dirrem;
 3447         } else {
 3448                 if (prevdirrem != NULL) {
 3449                         dap->da_previous = prevdirrem;
 3450                 } else {
 3451                         dap->da_state &= ~DIRCHG;
 3452                         dap->da_pagedep = pagedep;
 3453                 }
 3454                 dirrem->dm_dirinum = pagedep->pd_ino;
 3455                 add_to_worklist(&dirrem->dm_list);
 3456         }
 3457         /*
 3458          * Link into its inodedep. Put it on the id_bufwait list if the inode
 3459          * is not yet written. If it is written, do the post-inode write
 3460          * processing to put it on the id_pendinghd list.
 3461          */
 3462         if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
 3463             (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 3464                 dap->da_state |= COMPLETE;
 3465                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 3466                 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 3467         } else {
 3468                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
 3469                     dap, da_pdlist);
 3470                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 3471         }
 3472         FREE_LOCK(&lk);
 3473 }
 3474 
 3475 /*
 3476  * Called whenever the link count on an inode is changed.
 3477  * It creates an inode dependency so that the new reference(s)
 3478  * to the inode cannot be committed to disk until the updated
 3479  * inode has been written.
 3480  */
 3481 void
 3482 softdep_change_linkcnt(ip)
 3483         struct inode *ip;       /* the inode with the increased link count */
 3484 {
 3485         struct inodedep *inodedep;
 3486 
 3487         ACQUIRE_LOCK(&lk);
 3488         (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
 3489             DEPALLOC, &inodedep);
 3490         if (ip->i_nlink < ip->i_effnlink)
 3491                 panic("softdep_change_linkcnt: bad delta");
 3492         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 3493         FREE_LOCK(&lk);
 3494 }
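
/*
 * Editor's example (hypothetical numbers): after unlinking one of a
 * file's two names, i_effnlink drops to 1 at once while i_nlink stays
 * at 2 until the directory write completes, so the call above records
 * id_nlinkdelta == 2 - 1 == 1.  The delta records how many link count
 * decrements are not yet safe to commit to disk.
 */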
 3495 
 3496 /*
 3497  * Called when the effective link count and the reference count
 3498  * on an inode drops to zero. At this point there are no names
 3499  * referencing the file in the filesystem and no active file
 3500  * references. The space associated with the file will be freed
 3501  * as soon as the necessary soft dependencies are cleared.
 3502  */
 3503 void
 3504 softdep_releasefile(ip)
 3505         struct inode *ip;       /* inode with the zero effective link count */
 3506 {
 3507         struct inodedep *inodedep;
 3508         struct fs *fs;
 3509         int extblocks;
 3510 
 3511         if (ip->i_effnlink > 0)
 3512                 panic("softdep_releasefile: file still referenced");
 3513         /*
 3514          * We may be called several times as the on-disk link count
 3515          * drops to zero. We only want to account for the space once.
 3516          */
 3517         if (ip->i_flag & IN_SPACECOUNTED)
 3518                 return;
 3519         /*
 3520          * We have to deactivate a snapshot, otherwise copy-on-write may
 3521          * add blocks and the cleanup may remove blocks after we have
 3522          * tried to account for them.
 3523          */
 3524         if ((ip->i_flags & SF_SNAPSHOT) != 0)
 3525                 ffs_snapremove(ITOV(ip));
 3526         /*
 3527          * If we are tracking an nlinkdelta, we have to also remember
 3528          * whether we accounted for the freed space yet.
 3529          */
 3530         ACQUIRE_LOCK(&lk);
 3531         if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep)))
 3532                 inodedep->id_state |= SPACECOUNTED;
 3533         FREE_LOCK(&lk);
 3534         fs = ip->i_fs;
 3535         extblocks = 0;
 3536         if (fs->fs_magic == FS_UFS2_MAGIC)
 3537                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 3538         UFS_LOCK(ip->i_ump);
 3539         ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
 3540         ip->i_fs->fs_pendinginodes += 1;
 3541         UFS_UNLOCK(ip->i_ump);
 3542         ip->i_flag |= IN_SPACECOUNTED;
 3543 }
 3544 
 3545 /*
 3546  * This workitem decrements the inode's link count.
 3547  * If the link count reaches zero, the file is removed.
 3548  */
 3549 static void 
 3550 handle_workitem_remove(dirrem, xp)
 3551         struct dirrem *dirrem;
 3552         struct vnode *xp;
 3553 {
 3554         struct thread *td = curthread;
 3555         struct inodedep *inodedep;
 3556         struct vnode *vp;
 3557         struct inode *ip;
 3558         ino_t oldinum;
 3559         int error;
 3560 
 3561         if ((vp = xp) == NULL &&
 3562             (error = ffs_vget(dirrem->dm_list.wk_mp,
 3563             dirrem->dm_oldinum, LK_EXCLUSIVE, &vp)) != 0) {
 3564                 softdep_error("handle_workitem_remove: vget", error);
 3565                 return;
 3566         }
 3567         ip = VTOI(vp);
 3568         ACQUIRE_LOCK(&lk);
 3569         if ((inodedep_lookup(dirrem->dm_list.wk_mp,
 3570             dirrem->dm_oldinum, 0, &inodedep)) == 0)
 3571                 panic("handle_workitem_remove: lost inodedep");
 3572         /*
 3573          * Normal file deletion.
 3574          */
 3575         if ((dirrem->dm_state & RMDIR) == 0) {
 3576                 ip->i_nlink--;
 3577                 DIP_SET(ip, i_nlink, ip->i_nlink);
 3578                 ip->i_flag |= IN_CHANGE;
 3579                 if (ip->i_nlink < ip->i_effnlink)
 3580                         panic("handle_workitem_remove: bad file delta");
 3581                 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 3582                 num_dirrem -= 1;
 3583                 WORKITEM_FREE(dirrem, D_DIRREM);
 3584                 FREE_LOCK(&lk);
 3585                 vput(vp);
 3586                 return;
 3587         }
 3588         /*
 3589          * Directory deletion. Decrement reference count for both the
 3590          * just deleted parent directory entry and the reference for ".".
 3591          * Next truncate the directory to length zero. When the
 3592          * truncation completes, arrange to have the reference count on
 3593          * the parent decremented to account for the loss of "..".
 3594          */
 3595         ip->i_nlink -= 2;
 3596         DIP_SET(ip, i_nlink, ip->i_nlink);
 3597         ip->i_flag |= IN_CHANGE;
 3598         if (ip->i_nlink < ip->i_effnlink)
 3599                 panic("handle_workitem_remove: bad dir delta");
 3600         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 3601         FREE_LOCK(&lk);
 3602         if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
 3603                 softdep_error("handle_workitem_remove: truncate", error);
 3604         ACQUIRE_LOCK(&lk);
 3605         /*
 3606          * Rename of a directory to a new parent. Since we are both deleting
 3607          * and creating a new directory entry, the link count on the new
 3608          * directory should not change. Thus we skip the followup dirrem.
 3609          */
 3610         if (dirrem->dm_state & DIRCHG) {
 3611                 num_dirrem -= 1;
 3612                 WORKITEM_FREE(dirrem, D_DIRREM);
 3613                 FREE_LOCK(&lk);
 3614                 vput(vp);
 3615                 return;
 3616         }
 3617         /*
 3618          * If the inodedep does not exist, then the zero'ed inode has
 3619          * been written to disk. If the allocated inode has never been
 3620          * written to disk, then the on-disk inode is zero'ed. In either
 3621          * case we can remove the file immediately.
 3622          */
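	/*
	 * Editor's note: the dirrem is reused here to account for the
	 * parent's lost ".." reference; dm_oldinum is redirected to the
	 * parent directory (saved in dm_dirinum) before the item is
	 * processed again or queued on the inodedep.
	 */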
 3623         dirrem->dm_state = 0;
 3624         oldinum = dirrem->dm_oldinum;
 3625         dirrem->dm_oldinum = dirrem->dm_dirinum;
 3626         if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
 3627             0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
 3628                 if (xp != NULL)
 3629                         add_to_worklist(&dirrem->dm_list);
 3630                 FREE_LOCK(&lk);
 3631                 vput(vp);
 3632                 if (xp == NULL)
 3633                         handle_workitem_remove(dirrem, NULL);
 3634                 return;
 3635         }
 3636         WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
 3637         FREE_LOCK(&lk);
 3638         ip->i_flag |= IN_CHANGE;
 3639         ffs_update(vp, 0);
 3640         vput(vp);
 3641 }
 3642 
 3643 /*
 3644  * Inode de-allocation dependencies.
 3645  * 
 3646  * When an inode's link count is reduced to zero, it can be de-allocated. We
 3647  * found it convenient to postpone de-allocation until after the inode is
 3648  * written to disk with its new link count (zero).  At this point, all of the
 3649  * on-disk inode's block pointers are nullified and, with careful dependency
 3650  * list ordering, all dependencies related to the inode will be satisfied and
 3651  * the corresponding dependency structures de-allocated.  So, if/when the
 3652  * inode is reused, there will be no mixing of old dependencies with new
 3653  * ones.  This artificial dependency is set up by the block de-allocation
 3654  * procedure above (softdep_setup_freeblocks) and completed by the
 3655  * following procedure.
 3656  */
 3657 static void 
 3658 handle_workitem_freefile(freefile)
 3659         struct freefile *freefile;
 3660 {
 3661         struct fs *fs;
 3662         struct inodedep *idp;
 3663         struct ufsmount *ump;
 3664         int error;
 3665 
 3666         ump = VFSTOUFS(freefile->fx_list.wk_mp);
 3667         fs = ump->um_fs;
 3668 #ifdef DEBUG
 3669         ACQUIRE_LOCK(&lk);
 3670         error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
 3671         FREE_LOCK(&lk);
 3672         if (error)
 3673                 panic("handle_workitem_freefile: inodedep survived");
 3674 #endif
 3675         UFS_LOCK(ump);
 3676         fs->fs_pendinginodes -= 1;
 3677         UFS_UNLOCK(ump);
 3678         if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
 3679             freefile->fx_oldinum, freefile->fx_mode)) != 0)
 3680                 softdep_error("handle_workitem_freefile", error);
 3681         ACQUIRE_LOCK(&lk);
 3682         WORKITEM_FREE(freefile, D_FREEFILE);
 3683         FREE_LOCK(&lk);
 3684 }
 3685 
 3686 
 3687 /*
 3688  * Helper function that unlinks the marker element from its work list and
 3689  * returns the next element on the list.
 3690  */
 3691 static __inline struct worklist *
 3692 markernext(struct worklist *marker)
 3693 {
 3694         struct worklist *next;
 3695         
 3696         next = LIST_NEXT(marker, wk_list);
 3697         LIST_REMOVE(marker, wk_list);
 3698         return (next);
 3699 }
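
/*
 * Editor's note: the marker is inserted after the element being
 * processed so the walk in softdep_disk_io_initiation() below can
 * safely drop and re-take lk; list neighbors may be freed in that
 * window, but no handler touches the marker (its wk_type is outside
 * the normal range), so markernext() always yields a valid
 * continuation point.
 */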
 3700 
 3701 /*
 3702  * Disk writes.
 3703  * 
 3704  * The dependency structures constructed above are most actively used when file
 3705  * system blocks are written to disk.  No constraints are placed on when a
 3706  * block can be written, but unsatisfied update dependencies are made safe by
 3707  * modifying (or replacing) the source memory for the duration of the disk
 3708  * write.  When the disk write completes, the memory block is again brought
 3709  * up-to-date.
 3710  *
 3711  * In-core inode structure reclamation.
 3712  * 
 3713  * Because there are a finite number of "in-core" inode structures, they are
 3714  * reused regularly.  By transferring all inode-related dependencies to the
 3715  * in-memory inode block and indexing them separately (via "inodedep"s), we
 3716  * can allow "in-core" inode structures to be reused at any time and avoid
 3717  * any increase in contention.
 3718  *
 3719  * Called just before entering the device driver to initiate a new disk I/O.
 3720  * The buffer must be locked, thus, no I/O completion operations can occur
 3721  * while we are manipulating its associated dependencies.
 3722  */
 3723 static void 
 3724 softdep_disk_io_initiation(bp)
 3725         struct buf *bp;         /* structure describing disk write to occur */
 3726 {
 3727         struct worklist *wk;
 3728         struct worklist marker;
 3729         struct indirdep *indirdep;
 3730         struct inodedep *inodedep;
 3731 
 3732         /*
 3733          * We only care about write operations. There should never
 3734          * be dependencies for reads.
 3735          */
 3736         if (bp->b_iocmd != BIO_WRITE)
 3737                 panic("softdep_disk_io_initiation: not write");
 3738 
 3739         marker.wk_type = D_LAST + 1;    /* Not a normal workitem */
 3740         PHOLD(curproc);                 /* Don't swap out kernel stack */
 3741 
 3742         ACQUIRE_LOCK(&lk);
 3743         /*
 3744          * Do any necessary pre-I/O processing.
 3745          */
 3746         for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
 3747              wk = markernext(&marker)) {
 3748                 LIST_INSERT_AFTER(wk, &marker, wk_list);
 3749                 switch (wk->wk_type) {
 3750 
 3751                 case D_PAGEDEP:
 3752                         initiate_write_filepage(WK_PAGEDEP(wk), bp);
 3753                         continue;
 3754 
 3755                 case D_INODEDEP:
 3756                         inodedep = WK_INODEDEP(wk);
 3757                         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
 3758                                 initiate_write_inodeblock_ufs1(inodedep, bp);
 3759                         else
 3760                                 initiate_write_inodeblock_ufs2(inodedep, bp);
 3761                         continue;
 3762 
 3763                 case D_INDIRDEP:
 3764                         indirdep = WK_INDIRDEP(wk);
 3765                         if (indirdep->ir_state & GOINGAWAY)
 3766                                 panic("softdep_disk_io_initiation: indirdep gone");
 3767                         /*
 3768                          * If there are no remaining dependencies, this
 3769                          * will be writing the real pointers, so the
 3770                          * dependency can be freed.
 3771                          */
 3772                         if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
 3773                                 struct buf *sbp;
 3774 
 3775                                 sbp = indirdep->ir_savebp;
 3776                                 sbp->b_flags |= B_INVAL | B_NOCACHE;
 3777                                 /* inline expand WORKLIST_REMOVE(wk); */
 3778                                 wk->wk_state &= ~ONWORKLIST;
 3779                                 LIST_REMOVE(wk, wk_list);
 3780                                 WORKITEM_FREE(indirdep, D_INDIRDEP);
 3781                                 FREE_LOCK(&lk);
 3782                                 brelse(sbp);
 3783                                 ACQUIRE_LOCK(&lk);
 3784                                 continue;
 3785                         }
 3786                         /*
 3787                          * Replace up-to-date version with safe version.
 3788                          */
 3789                         FREE_LOCK(&lk);
 3790                         MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
 3791                             M_INDIRDEP, M_SOFTDEP_FLAGS);
 3792                         ACQUIRE_LOCK(&lk);
 3793                         indirdep->ir_state &= ~ATTACHED;
 3794                         indirdep->ir_state |= UNDONE;
 3795                         bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
 3796                         bcopy(indirdep->ir_savebp->b_data, bp->b_data,
 3797                             bp->b_bcount);
 3798                         continue;
 3799 
 3800                 case D_MKDIR:
 3801                 case D_BMSAFEMAP:
 3802                 case D_ALLOCDIRECT:
 3803                 case D_ALLOCINDIR:
 3804                         continue;
 3805 
 3806                 default:
 3807                         panic("softdep_disk_io_initiation: Unexpected type %s",
 3808                             TYPENAME(wk->wk_type));
 3809                         /* NOTREACHED */
 3810                 }
 3811         }
 3812         FREE_LOCK(&lk);
 3813         PRELE(curproc);                 /* Allow swapout of kernel stack */
 3814 }
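
/*
 * Editor's note on the D_INDIRDEP case above: the buffer's up-to-date
 * pointers are saved in ir_saveddata and the disk write is given the
 * safe copy from ir_savebp (state goes ATTACHED -> UNDONE).  The I/O
 * completion side, softdep_disk_write_complete() later in this file,
 * restores the saved data and returns the state to ATTACHED,
 * completing the undo/redo cycle.
 */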
 3815 
 3816 /*
 3817  * Called from within the procedure above to deal with unsatisfied
 3818  * allocation dependencies in a directory. The buffer must be locked,
 3819  * thus, no I/O completion operations can occur while we are
 3820  * manipulating its associated dependencies.
 3821  */
 3822 static void
 3823 initiate_write_filepage(pagedep, bp)
 3824         struct pagedep *pagedep;
 3825         struct buf *bp;
 3826 {
 3827         struct diradd *dap;
 3828         struct direct *ep;
 3829         int i;
 3830 
 3831         if (pagedep->pd_state & IOSTARTED) {
 3832                 /*
 3833                  * This can only happen if there is a driver that does not
 3834                  * understand chaining. Here biodone will reissue the call
 3835                  * to strategy for the incomplete buffers.
 3836                  */
 3837                 printf("initiate_write_filepage: already started\n");
 3838                 return;
 3839         }
 3840         pagedep->pd_state |= IOSTARTED;
 3841         for (i = 0; i < DAHASHSZ; i++) {
 3842                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
 3843                         ep = (struct direct *)
 3844                             ((char *)bp->b_data + dap->da_offset);
 3845                         if (ep->d_ino != dap->da_newinum)
 3846                                 panic("%s: dir inum %ju != new %ju",
 3847                                     "initiate_write_filepage",
 3848                                     (uintmax_t)ep->d_ino, (uintmax_t)dap->da_newinum);
 3849                         if (dap->da_state & DIRCHG)
 3850                                 ep->d_ino = dap->da_previous->dm_oldinum;
 3851                         else
 3852                                 ep->d_ino = 0;
 3853                         dap->da_state &= ~ATTACHED;
 3854                         dap->da_state |= UNDONE;
 3855                 }
 3856         }
 3857 }
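
/*
 * Editor's example of the rollback above (hypothetical inode number):
 * a new entry for inode 37, written before inode 37's bitmap and inode
 * writes complete, goes to disk with d_ino == 0 (or, for a DIRCHG
 * entry, the previous inode number), so an untimely crash never
 * exposes a name pointing at an unsafe inode.  The completion side,
 * handle_written_filepage() later in this file, re-installs da_newinum
 * and marks the diradd ATTACHED again.
 */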
 3858 
 3859 /*
 3860  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
 3861  * Note that any bug fixes made to this routine must be done in the
 3862  * version found below.
 3863  *
 3864  * Called from within the procedure above to deal with unsatisfied
 3865  * allocation dependencies in an inodeblock. The buffer must be
 3866  * locked; thus, no I/O completion operations can occur while we
 3867  * are manipulating its associated dependencies.
 3868  */
 3869 static void 
 3870 initiate_write_inodeblock_ufs1(inodedep, bp)
 3871         struct inodedep *inodedep;
 3872         struct buf *bp;                 /* The inode block */
 3873 {
 3874         struct allocdirect *adp, *lastadp;
 3875         struct ufs1_dinode *dp;
 3876         struct ufs1_dinode *sip;
 3877         struct fs *fs;
 3878         ufs_lbn_t i;
 3879 #ifdef INVARIANTS
 3880         ufs_lbn_t prevlbn = 0;
 3881 #endif
 3882         int deplist;
 3883 
 3884         if (inodedep->id_state & IOSTARTED)
 3885                 panic("initiate_write_inodeblock_ufs1: already started");
 3886         inodedep->id_state |= IOSTARTED;
 3887         fs = inodedep->id_fs;
 3888         dp = (struct ufs1_dinode *)bp->b_data +
 3889             ino_to_fsbo(fs, inodedep->id_ino);
 3890         /*
 3891          * If the bitmap is not yet written, then the allocated
 3892          * inode cannot be written to disk.
 3893          */
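              /*
               * The rollback parks the up-to-date dinode in id_savedino1 and
               * writes an all-zero dinode in its place; only di_gen is carried
               * over, so the generation number is preserved even if the zeroed
               * copy is the one that reaches the disk.  The softdep lock is
               * dropped around the allocation, which may sleep.  The saved
               * copy is restored by handle_written_inodeblock() when the
               * write completes.
               */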
 3894         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 3895                 if (inodedep->id_savedino1 != NULL)
 3896                         panic("initiate_write_inodeblock_ufs1: I/O underway");
 3897                 FREE_LOCK(&lk);
 3898                 MALLOC(sip, struct ufs1_dinode *,
 3899                     sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
 3900                 ACQUIRE_LOCK(&lk);
 3901                 inodedep->id_savedino1 = sip;
 3902                 *inodedep->id_savedino1 = *dp;
 3903                 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
 3904                 dp->di_gen = inodedep->id_savedino1->di_gen;
 3905                 return;
 3906         }
 3907         /*
 3908          * If no dependencies, then there is nothing to roll back.
 3909          */
 3910         inodedep->id_savedsize = dp->di_size;
 3911         inodedep->id_savedextsize = 0;
 3912         if (TAILQ_EMPTY(&inodedep->id_inoupdt))
 3913                 return;
 3914         /*
 3915          * Set the dependencies to busy.
 3916          */
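              /*
               * Each allocdirect is marked UNDONE below.  Under INVARIANTS,
               * deplist records which logical blocks have dependencies: bit
               * ad_lbn for the NDADDR direct blocks and bit (NDADDR + i) for
               * the i-th indirect pointer, matching the (1 << NDADDR) << i
               * test further down.  With UFS's 12 direct and 3 indirect
               * pointers, these bits fit comfortably in an int.
               */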
 3917         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 3918              adp = TAILQ_NEXT(adp, ad_next)) {
 3919 #ifdef INVARIANTS
 3920                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
 3921                         panic("softdep_write_inodeblock: lbn order");
 3922                 prevlbn = adp->ad_lbn;
 3923                 if (adp->ad_lbn < NDADDR &&
 3924                     dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
 3925                         panic("%s: direct pointer #%jd mismatch %d != %jd",
 3926                             "softdep_write_inodeblock",
 3927                             (intmax_t)adp->ad_lbn,
 3928                             dp->di_db[adp->ad_lbn],
 3929                             (intmax_t)adp->ad_newblkno);
 3930                 if (adp->ad_lbn >= NDADDR &&
 3931                     dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
 3932                         panic("%s: indirect pointer #%jd mismatch %d != %jd",
 3933                             "softdep_write_inodeblock",
 3934                             (intmax_t)adp->ad_lbn - NDADDR,
 3935                             dp->di_ib[adp->ad_lbn - NDADDR],
 3936                             (intmax_t)adp->ad_newblkno);
 3937                 deplist |= 1 << adp->ad_lbn;
 3938                 if ((adp->ad_state & ATTACHED) == 0)
 3939                         panic("softdep_write_inodeblock: Unknown state 0x%x",
 3940                             adp->ad_state);
 3941 #endif /* INVARIANTS */
 3942                 adp->ad_state &= ~ATTACHED;
 3943                 adp->ad_state |= UNDONE;
 3944         }
 3945         /*
 3946          * The on-disk inode cannot claim to be any larger than the last
 3947          * fragment that has been written. Otherwise, the on-disk inode
 3948  * might have fragments that were not the last block in the file,
 3949          * which would corrupt the filesystem.
 3950          */
 3951         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 3952              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 3953                 if (adp->ad_lbn >= NDADDR)
 3954                         break;
 3955                 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
 3956                 /* keep going until hitting a rollback to a frag */
 3957                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 3958                         continue;
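                      /*
                       * Trim the size so that this fragment becomes the last
                       * block claimed: e.g., with an 8K block size, a 4K
                       * fragment at lbn 5 gives 8192 * 5 + 4096 = 45056.
                       */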
 3959                 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 3960                 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
 3961 #ifdef INVARIANTS
 3962                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
 3963                                 panic("softdep_write_inodeblock: lost dep1");
 3964 #endif /* INVARIANTS */
 3965                         dp->di_db[i] = 0;
 3966                 }
 3967                 for (i = 0; i < NIADDR; i++) {
 3968 #ifdef INVARIANTS
 3969                         if (dp->di_ib[i] != 0 &&
 3970                             (deplist & ((1 << NDADDR) << i)) == 0)
 3971                                 panic("softdep_write_inodeblock: lost dep2");
 3972 #endif /* INVARIANTS */
 3973                         dp->di_ib[i] = 0;
 3974                 }
 3975                 return;
 3976         }
 3977         /*
 3978          * If we have zeroed out the last allocated block of the file,
 3979          * roll back the size to the last currently allocated block.
 3980          * We know that this last allocated block is full-sized, as
 3981          * we already checked for fragments in the loop above.
 3982          */
 3983         if (lastadp != NULL &&
 3984             dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 3985                 for (i = lastadp->ad_lbn; i >= 0; i--)
 3986                         if (dp->di_db[i] != 0)
 3987                                 break;
 3988                 dp->di_size = (i + 1) * fs->fs_bsize;
 3989         }
 3990         /*
 3991          * The only dependencies are for indirect blocks.
 3992          *
 3993          * The file size for indirect block additions is not guaranteed.
 3994          * Such a guarantee would be non-trivial to achieve. The conventional
 3995          * synchronous write implementation also does not make this guarantee.
 3996          * Fsck should catch and fix discrepancies. Arguably, the file size
 3997          * can be over-estimated without destroying integrity when the file
 3998          * moves into the indirect blocks (i.e., is large). If we want to
 3999          * postpone fsck, we are stuck with this argument.
 4000          */
 4001         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
 4002                 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
 4003 }
 4004                 
 4005 /*
 4006  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
 4007  * Note that any bug fixes made to this routine must be done in the
 4008  * version found above.
 4009  *
 4010  * Called from within the procedure above to deal with unsatisfied
 4011  * allocation dependencies in an inodeblock. The buffer must be
 4012  * locked; thus, no I/O completion operations can occur while we
 4013  * are manipulating its associated dependencies.
 4014  */
 4015 static void 
 4016 initiate_write_inodeblock_ufs2(inodedep, bp)
 4017         struct inodedep *inodedep;
 4018         struct buf *bp;                 /* The inode block */
 4019 {
 4020         struct allocdirect *adp, *lastadp;
 4021         struct ufs2_dinode *dp;
 4022         struct ufs2_dinode *sip;
 4023         struct fs *fs;
 4024         ufs_lbn_t i;
 4025 #ifdef INVARIANTS
 4026         ufs_lbn_t prevlbn = 0;
 4027 #endif
 4028         int deplist;
 4029 
 4030         if (inodedep->id_state & IOSTARTED)
 4031                 panic("initiate_write_inodeblock_ufs2: already started");
 4032         inodedep->id_state |= IOSTARTED;
 4033         fs = inodedep->id_fs;
 4034         dp = (struct ufs2_dinode *)bp->b_data +
 4035             ino_to_fsbo(fs, inodedep->id_ino);
 4036         /*
 4037          * If the bitmap is not yet written, then the allocated
 4038          * inode cannot be written to disk.
 4039          */
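              /*
               * Same save/zero/restore protocol as the UFS1 version above,
               * using id_savedino2 for the saved copy.
               */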
 4040         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 4041                 if (inodedep->id_savedino2 != NULL)
 4042                         panic("initiate_write_inodeblock_ufs2: I/O underway");
 4043                 FREE_LOCK(&lk);
 4044                 MALLOC(sip, struct ufs2_dinode *,
 4045                     sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
 4046                 ACQUIRE_LOCK(&lk);
 4047                 inodedep->id_savedino2 = sip;
 4048                 *inodedep->id_savedino2 = *dp;
 4049                 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
 4050                 dp->di_gen = inodedep->id_savedino2->di_gen;
 4051                 return;
 4052         }
 4053         /*
 4054          * If no dependencies, then there is nothing to roll back.
 4055          */
 4056         inodedep->id_savedsize = dp->di_size;
 4057         inodedep->id_savedextsize = dp->di_extsize;
 4058         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
 4059             TAILQ_EMPTY(&inodedep->id_extupdt))
 4060                 return;
 4061         /*
 4062          * Set the ext data dependencies to busy.
 4063          */
 4064         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
 4065              adp = TAILQ_NEXT(adp, ad_next)) {
 4066 #ifdef INVARIANTS
 4067                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
 4068                         panic("softdep_write_inodeblock: lbn order");
 4069                 prevlbn = adp->ad_lbn;
 4070                 if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
 4071                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
 4072                             "softdep_write_inodeblock",
 4073                             (intmax_t)adp->ad_lbn,
 4074                             (intmax_t)dp->di_extb[adp->ad_lbn],
 4075                             (intmax_t)adp->ad_newblkno);
 4076                 deplist |= 1 << adp->ad_lbn;
 4077                 if ((adp->ad_state & ATTACHED) == 0)
 4078                         panic("softdep_write_inodeblock: Unknown state 0x%x",
 4079                             adp->ad_state);
 4080 #endif /* INVARIANTS */
 4081                 adp->ad_state &= ~ATTACHED;
 4082                 adp->ad_state |= UNDONE;
 4083         }
 4084         /*
 4085          * The on-disk inode cannot claim to be any larger than the last
 4086          * fragment that has been written. Otherwise, the on-disk inode
 4087          * might have fragments that were not the last block in the ext
 4088          * data, which would corrupt the filesystem.
 4089          */
 4090         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
 4091              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 4092                 dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
 4093                 /* keep going until hitting a rollback to a frag */
 4094                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 4095                         continue;
 4096                 dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 4097                 for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
 4098 #ifdef INVARIANTS
 4099                         if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
 4100                                 panic("softdep_write_inodeblock: lost dep1");
 4101 #endif /* INVARIANTS */
 4102                         dp->di_extb[i] = 0;
 4103                 }
 4104                 lastadp = NULL;
 4105                 break;
 4106         }
 4107         /*
 4108          * If we have zeroed out the last allocated block of the ext
 4109          * data, roll back the size to the last currently allocated block.
 4110          * We know that this last allocated block is full-sized, as
 4111          * we already checked for fragments in the loop above.
 4112          */
 4113         if (lastadp != NULL &&
 4114             dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 4115                 for (i = lastadp->ad_lbn; i >= 0; i--)
 4116                         if (dp->di_extb[i] != 0)
 4117                                 break;
 4118                 dp->di_extsize = (i + 1) * fs->fs_bsize;
 4119         }
 4120         /*
 4121          * Set the file data dependencies to busy.
 4122          */
 4123         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 4124              adp = TAILQ_NEXT(adp, ad_next)) {
 4125 #ifdef INVARIANTS
 4126                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
 4127                         panic("softdep_write_inodeblock: lbn order");
 4128                 prevlbn = adp->ad_lbn;
 4129                 if (adp->ad_lbn < NDADDR &&
 4130                     dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
 4131                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
 4132                             "softdep_write_inodeblock",
 4133                             (intmax_t)adp->ad_lbn,
 4134                             (intmax_t)dp->di_db[adp->ad_lbn],
 4135                             (intmax_t)adp->ad_newblkno);
 4136                 if (adp->ad_lbn >= NDADDR &&
 4137                     dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
 4138                         panic("%s indirect pointer #%jd mismatch %jd != %jd",
 4139                             "softdep_write_inodeblock:",
 4140                             (intmax_t)adp->ad_lbn - NDADDR,
 4141                             (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
 4142                             (intmax_t)adp->ad_newblkno);
 4143                 deplist |= 1 << adp->ad_lbn;
 4144                 if ((adp->ad_state & ATTACHED) == 0)
 4145                         panic("softdep_write_inodeblock: Unknown state 0x%x",
 4146                             adp->ad_state);
 4147 #endif /* INVARIANTS */
 4148                 adp->ad_state &= ~ATTACHED;
 4149                 adp->ad_state |= UNDONE;
 4150         }
 4151         /*
 4152          * The on-disk inode cannot claim to be any larger than the last
 4153          * fragment that has been written. Otherwise, the on-disk inode
 4154          * might have fragments that were not the last block in the file,
 4155          * which would corrupt the filesystem.
 4156          */
 4157         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 4158              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 4159                 if (adp->ad_lbn >= NDADDR)
 4160                         break;
 4161                 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
 4162                 /* keep going until hitting a rollback to a frag */
 4163                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 4164                         continue;
 4165                 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 4166                 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
 4167 #ifdef INVARIANTS
 4168                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
 4169                                 panic("softdep_write_inodeblock: lost dep2");
 4170 #endif /* INVARIANTS */
 4171                         dp->di_db[i] = 0;
 4172                 }
 4173                 for (i = 0; i < NIADDR; i++) {
 4174 #ifdef INVARIANTS
 4175                         if (dp->di_ib[i] != 0 &&
 4176                             (deplist & ((1 << NDADDR) << i)) == 0)
 4177                                 panic("softdep_write_inodeblock: lost dep3");
 4178 #endif /* INVARIANTS */
 4179                         dp->di_ib[i] = 0;
 4180                 }
 4181                 return;
 4182         }
 4183         /*
 4184          * If we have zeroed out the last allocated block of the file,
 4185          * roll back the size to the last currently allocated block.
 4186          * We know that this last allocated block is full-sized, as
 4187          * we already checked for fragments in the loop above.
 4188          */
 4189         if (lastadp != NULL &&
 4190             dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 4191                 for (i = lastadp->ad_lbn; i >= 0; i--)
 4192                         if (dp->di_db[i] != 0)
 4193                                 break;
 4194                 dp->di_size = (i + 1) * fs->fs_bsize;
 4195         }
 4196         /*
 4197          * The only dependencies are for indirect blocks.
 4198          *
 4199          * The file size for indirect block additions is not guaranteed.
 4200          * Such a guarantee would be non-trivial to achieve. The conventional
 4201          * synchronous write implementation also does not make this guarantee.
 4202          * Fsck should catch and fix discrepancies. Arguably, the file size
 4203          * can be over-estimated without destroying integrity when the file
 4204          * moves into the indirect blocks (i.e., is large). If we want to
 4205          * postpone fsck, we are stuck with this argument.
 4206          */
 4207         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
 4208                 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
 4209 }
 4210 
 4211 /*
 4212  * This routine is called during the completion interrupt
 4213  * service routine for a disk write (from the procedure called
 4214  * by the device driver to inform the filesystem caches of
 4215  * a request completion).  It should be called early in this
 4216  * procedure, before the block is made available to other
 4217  * processes or other routines are called.
 4218  */
 4219 static void 
 4220 softdep_disk_write_complete(bp)
 4221         struct buf *bp;         /* describes the completed disk write */
 4222 {
 4223         struct worklist *wk;
 4224         struct worklist *owk;
 4225         struct workhead reattach;
 4226         struct newblk *newblk;
 4227         struct allocindir *aip;
 4228         struct allocdirect *adp;
 4229         struct indirdep *indirdep;
 4230         struct inodedep *inodedep;
 4231         struct bmsafemap *bmsafemap;
 4232 
 4233         /*
 4234          * If an error occurred while doing the write, then the data
 4235          * has not hit the disk and the dependencies cannot be unrolled.
 4236          */
 4237         if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
 4238                 return;
 4239         LIST_INIT(&reattach);
 4240         /*
 4241          * This lock must not be released anywhere in this code segment.
 4242          */
 4243         ACQUIRE_LOCK(&lk);
 4244         owk = NULL;
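              /*
               * Dispatch each work item by type.  A handler returns nonzero
               * (or inserts the item on reattach directly) when the buffer
               * still holds rolled-back state and must be written again; such
               * items are put back on bp->b_dep once the scan completes.
               */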
 4245         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 4246                 WORKLIST_REMOVE(wk);
 4247                 if (wk == owk)
 4248                         panic("duplicate worklist: %p\n", wk);
 4249                 owk = wk;
 4250                 switch (wk->wk_type) {
 4251 
 4252                 case D_PAGEDEP:
 4253                         if (handle_written_filepage(WK_PAGEDEP(wk), bp))
 4254                                 WORKLIST_INSERT(&reattach, wk);
 4255                         continue;
 4256 
 4257                 case D_INODEDEP:
 4258                         if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
 4259                                 WORKLIST_INSERT(&reattach, wk);
 4260                         continue;
 4261 
 4262                 case D_BMSAFEMAP:
 4263                         bmsafemap = WK_BMSAFEMAP(wk);
 4264                         while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
 4265                                 newblk->nb_state |= DEPCOMPLETE;
 4266                                 newblk->nb_bmsafemap = NULL;
 4267                                 LIST_REMOVE(newblk, nb_deps);
 4268                         }
 4269                         while ((adp =
 4270                            LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
 4271                                 adp->ad_state |= DEPCOMPLETE;
 4272                                 adp->ad_buf = NULL;
 4273                                 LIST_REMOVE(adp, ad_deps);
 4274                                 handle_allocdirect_partdone(adp);
 4275                         }
 4276                         while ((aip =
 4277                             LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
 4278                                 aip->ai_state |= DEPCOMPLETE;
 4279                                 aip->ai_buf = NULL;
 4280                                 LIST_REMOVE(aip, ai_deps);
 4281                                 handle_allocindir_partdone(aip);
 4282                         }
 4283                         while ((inodedep =
 4284                              LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
 4285                                 inodedep->id_state |= DEPCOMPLETE;
 4286                                 LIST_REMOVE(inodedep, id_deps);
 4287                                 inodedep->id_buf = NULL;
 4288                         }
 4289                         WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
 4290                         continue;
 4291 
 4292                 case D_MKDIR:
 4293                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
 4294                         continue;
 4295 
 4296                 case D_ALLOCDIRECT:
 4297                         adp = WK_ALLOCDIRECT(wk);
 4298                         adp->ad_state |= COMPLETE;
 4299                         handle_allocdirect_partdone(adp);
 4300                         continue;
 4301 
 4302                 case D_ALLOCINDIR:
 4303                         aip = WK_ALLOCINDIR(wk);
 4304                         aip->ai_state |= COMPLETE;
 4305                         handle_allocindir_partdone(aip);
 4306                         continue;
 4307 
 4308                 case D_INDIRDEP:
 4309                         indirdep = WK_INDIRDEP(wk);
 4310                         if (indirdep->ir_state & GOINGAWAY)
 4311                                 panic("disk_write_complete: indirdep gone");
 4312                         bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
 4313                         FREE(indirdep->ir_saveddata, M_INDIRDEP);
 4314                         indirdep->ir_saveddata = NULL;
 4315                         indirdep->ir_state &= ~UNDONE;
 4316                         indirdep->ir_state |= ATTACHED;
 4317                         while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
 4318                                 handle_allocindir_partdone(aip);
 4319                                 if (aip == LIST_FIRST(&indirdep->ir_donehd))
 4320                                         panic("disk_write_complete: not gone");
 4321                         }
 4322                         WORKLIST_INSERT(&reattach, wk);
 4323                         if ((bp->b_flags & B_DELWRI) == 0)
 4324                                 stat_indir_blk_ptrs++;
 4325                         bdirty(bp);
 4326                         continue;
 4327 
 4328                 default:
 4329                         panic("handle_disk_write_complete: Unknown type %s",
 4330                             TYPENAME(wk->wk_type));
 4331                         /* NOTREACHED */
 4332                 }
 4333         }
 4334         /*
 4335          * Reattach any requests that must be redone.
 4336          */
 4337         while ((wk = LIST_FIRST(&reattach)) != NULL) {
 4338                 WORKLIST_REMOVE(wk);
 4339                 WORKLIST_INSERT(&bp->b_dep, wk);
 4340         }
 4341         FREE_LOCK(&lk);
 4342 }
 4343 
 4344 /*
 4345  * Called from within softdep_disk_write_complete above. Note that
 4346  * this routine is always called from interrupt level with further
 4347  * splbio interrupts blocked.
 4348  */
 4349 static void 
 4350 handle_allocdirect_partdone(adp)
 4351         struct allocdirect *adp;        /* the completed allocdirect */
 4352 {
 4353         struct allocdirectlst *listhead;
 4354         struct allocdirect *listadp;
 4355         struct inodedep *inodedep;
 4356         long bsize, delay;
 4357 
 4358         if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 4359                 return;
 4360         if (adp->ad_buf != NULL)
 4361                 panic("handle_allocdirect_partdone: dangling dep");
 4362         /*
 4363          * The on-disk inode cannot claim to be any larger than the last
 4364          * fragment that has been written. Otherwise, the on-disk inode
 4365          * might have fragments that were not the last block in the file,
 4366          * which would corrupt the filesystem. Thus, we cannot free any
 4367          * allocdirects after one whose ad_oldblkno claims a fragment, as
 4368          * these blocks must be rolled back to zero before writing the inode.
 4369          * We check the currently active set of allocdirects in id_inoupdt
 4370          * or id_extupdt as appropriate.
 4371          */
 4372         inodedep = adp->ad_inodedep;
 4373         bsize = inodedep->id_fs->fs_bsize;
 4374         if (adp->ad_state & EXTDATA)
 4375                 listhead = &inodedep->id_extupdt;
 4376         else
 4377                 listhead = &inodedep->id_inoupdt;
 4378         TAILQ_FOREACH(listadp, listhead, ad_next) {
 4379                 /* found our block */
 4380                 if (listadp == adp)
 4381                         break;
 4382                 /* continue if ad_oldlbn is not a fragment */
 4383                 if (listadp->ad_oldsize == 0 ||
 4384                     listadp->ad_oldsize == bsize)
 4385                         continue;
 4386                 /* hit a fragment */
 4387                 return;
 4388         }
 4389         /*
 4390          * If we have reached the end of the current list without
 4391          * finding the just finished dependency, then it must be
 4392          * on the future dependency list. Future dependencies cannot
 4393          * be freed until they are moved to the current list.
 4394          */
 4395         if (listadp == NULL) {
 4396 #ifdef DEBUG
 4397                 if (adp->ad_state & EXTDATA)
 4398                         listhead = &inodedep->id_newextupdt;
 4399                 else
 4400                         listhead = &inodedep->id_newinoupdt;
 4401                 TAILQ_FOREACH(listadp, listhead, ad_next)
 4402                         /* found our block */
 4403                         if (listadp == adp)
 4404                                 break;
 4405                 if (listadp == NULL)
 4406                         panic("handle_allocdirect_partdone: lost dep");
 4407 #endif /* DEBUG */
 4408                 return;
 4409         }
 4410         /*
 4411          * If we have found the just finished dependency, then free
 4412          * it along with anything that follows it that is complete.
 4413          * If the inode still has a bitmap dependency, then it has
 4414          * never been written to disk, hence the on-disk inode cannot
 4415          * reference the old fragment so we can free it without delay.
 4416          */
 4417         delay = (inodedep->id_state & DEPCOMPLETE);
 4418         for (; adp; adp = listadp) {
 4419                 listadp = TAILQ_NEXT(adp, ad_next);
 4420                 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 4421                         return;
 4422                 free_allocdirect(listhead, adp, delay);
 4423         }
 4424 }
 4425 
 4426 /*
 4427  * Called from within softdep_disk_write_complete above. Note that
 4428  * this routine is always called from interrupt level with further
 4429  * splbio interrupts blocked.
 4430  */
 4431 static void
 4432 handle_allocindir_partdone(aip)
 4433         struct allocindir *aip;         /* the completed allocindir */
 4434 {
 4435         struct indirdep *indirdep;
 4436 
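              /*
               * If the indirect block is currently rolled back (UNDONE), the
               * new pointer cannot be committed yet, so the allocindir is
               * parked on ir_donehd until the write finishes.  Otherwise the
               * new block number is committed to the safe copy in ir_savebp.
               */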
 4437         if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
 4438                 return;
 4439         if (aip->ai_buf != NULL)
 4440                 panic("handle_allocindir_partdone: dangling dependency");
 4441         indirdep = aip->ai_indirdep;
 4442         if (indirdep->ir_state & UNDONE) {
 4443                 LIST_REMOVE(aip, ai_next);
 4444                 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
 4445                 return;
 4446         }
 4447         if (indirdep->ir_state & UFS1FMT)
 4448                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 4449                     aip->ai_newblkno;
 4450         else
 4451                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 4452                     aip->ai_newblkno;
 4453         LIST_REMOVE(aip, ai_next);
 4454         if (aip->ai_freefrag != NULL)
 4455                 add_to_worklist(&aip->ai_freefrag->ff_list);
 4456         WORKITEM_FREE(aip, D_ALLOCINDIR);
 4457 }
 4458 
 4459 /*
 4460  * Called from within softdep_disk_write_complete above to restore
 4461  * in-memory inode block contents to their most up-to-date state. Note
 4462  * that this routine is always called from interrupt level with further
 4463  * splbio interrupts blocked.
 4464  */
 4465 static int 
 4466 handle_written_inodeblock(inodedep, bp)
 4467         struct inodedep *inodedep;
 4468         struct buf *bp;         /* buffer containing the inode block */
 4469 {
 4470         struct worklist *wk, *filefree;
 4471         struct allocdirect *adp, *nextadp;
 4472         struct ufs1_dinode *dp1 = NULL;
 4473         struct ufs2_dinode *dp2 = NULL;
 4474         int hadchanges, fstype;
 4475 
 4476         if ((inodedep->id_state & IOSTARTED) == 0)
 4477                 panic("handle_written_inodeblock: not started");
 4478         inodedep->id_state &= ~IOSTARTED;
 4479         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
 4480                 fstype = UFS1;
 4481                 dp1 = (struct ufs1_dinode *)bp->b_data +
 4482                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
 4483         } else {
 4484                 fstype = UFS2;
 4485                 dp2 = (struct ufs2_dinode *)bp->b_data +
 4486                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
 4487         }
 4488         /*
 4489          * If we had to roll back the inode allocation because of
 4490          * bitmaps being incomplete, then simply restore it.
 4491          * Keep the block dirty so that it will not be reclaimed until
 4492          * all associated dependencies have been cleared and the
 4493          * corresponding updates written to disk.
 4494          */
 4495         if (inodedep->id_savedino1 != NULL) {
 4496                 if (fstype == UFS1)
 4497                         *dp1 = *inodedep->id_savedino1;
 4498                 else
 4499                         *dp2 = *inodedep->id_savedino2;
 4500                 FREE(inodedep->id_savedino1, M_SAVEDINO);
 4501                 inodedep->id_savedino1 = NULL;
 4502                 if ((bp->b_flags & B_DELWRI) == 0)
 4503                         stat_inode_bitmap++;
 4504                 bdirty(bp);
 4505                 return (1);
 4506         }
 4507         inodedep->id_state |= COMPLETE;
 4508         /*
 4509          * Roll forward anything that had to be rolled back before 
 4510          * the inode could be updated.
 4511          */
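              /*
               * Each UNDONE allocdirect has its new block number reinstated in
               * the buffer's copy of the dinode; direct-pointer slots are
               * verified to still hold the rolled-back old value and indirect
               * slots to have been zeroed.  hadchanges then forces the buffer
               * dirty so the true contents reach the disk on a later write.
               */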
 4512         hadchanges = 0;
 4513         for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
 4514                 nextadp = TAILQ_NEXT(adp, ad_next);
 4515                 if (adp->ad_state & ATTACHED)
 4516                         panic("handle_written_inodeblock: new entry");
 4517                 if (fstype == UFS1) {
 4518                         if (adp->ad_lbn < NDADDR) {
 4519                                 if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
 4520                                         panic("%s %s #%jd mismatch %d != %jd",
 4521                                             "handle_written_inodeblock:",
 4522                                             "direct pointer",
 4523                                             (intmax_t)adp->ad_lbn,
 4524                                             dp1->di_db[adp->ad_lbn],
 4525                                             (intmax_t)adp->ad_oldblkno);
 4526                                 dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
 4527                         } else {
 4528                                 if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
 4529                                         panic("%s: %s #%jd allocated as %d",
 4530                                             "handle_written_inodeblock",
 4531                                             "indirect pointer",
 4532                                             (intmax_t)adp->ad_lbn - NDADDR,
 4533                                             dp1->di_ib[adp->ad_lbn - NDADDR]);
 4534                                 dp1->di_ib[adp->ad_lbn - NDADDR] =
 4535                                     adp->ad_newblkno;
 4536                         }
 4537                 } else {
 4538                         if (adp->ad_lbn < NDADDR) {
 4539                                 if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
 4540                                         panic("%s: %s #%jd %s %jd != %jd",
 4541                                             "handle_written_inodeblock",
 4542                                             "direct pointer",
 4543                                             (intmax_t)adp->ad_lbn, "mismatch",
 4544                                             (intmax_t)dp2->di_db[adp->ad_lbn],
 4545                                             (intmax_t)adp->ad_oldblkno);
 4546                                 dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
 4547                         } else {
 4548                                 if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
 4549                                         panic("%s: %s #%jd allocated as %jd",
 4550                                             "handle_written_inodeblock",
 4551                                             "indirect pointer",
 4552                                             (intmax_t)adp->ad_lbn - NDADDR,
 4553                                             (intmax_t)
 4554                                             dp2->di_ib[adp->ad_lbn - NDADDR]);
 4555                                 dp2->di_ib[adp->ad_lbn - NDADDR] =
 4556                                     adp->ad_newblkno;
 4557                         }
 4558                 }
 4559                 adp->ad_state &= ~UNDONE;
 4560                 adp->ad_state |= ATTACHED;
 4561                 hadchanges = 1;
 4562         }
 4563         for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
 4564                 nextadp = TAILQ_NEXT(adp, ad_next);
 4565                 if (adp->ad_state & ATTACHED)
 4566                         panic("handle_written_inodeblock: new entry");
 4567                 if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
 4568                         panic("%s: direct pointers #%jd %s %jd != %jd",
 4569                             "handle_written_inodeblock",
 4570                             (intmax_t)adp->ad_lbn, "mismatch",
 4571                             (intmax_t)dp2->di_extb[adp->ad_lbn],
 4572                             (intmax_t)adp->ad_oldblkno);
 4573                 dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
 4574                 adp->ad_state &= ~UNDONE;
 4575                 adp->ad_state |= ATTACHED;
 4576                 hadchanges = 1;
 4577         }
 4578         if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
 4579                 stat_direct_blk_ptrs++;
 4580         /*
 4581          * Reset the file size to its most up-to-date value.
 4582          */
 4583         if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
 4584                 panic("handle_written_inodeblock: bad size");
 4585         if (fstype == UFS1) {
 4586                 if (dp1->di_size != inodedep->id_savedsize) {
 4587                         dp1->di_size = inodedep->id_savedsize;
 4588                         hadchanges = 1;
 4589                 }
 4590         } else {
 4591                 if (dp2->di_size != inodedep->id_savedsize) {
 4592                         dp2->di_size = inodedep->id_savedsize;
 4593                         hadchanges = 1;
 4594                 }
 4595                 if (dp2->di_extsize != inodedep->id_savedextsize) {
 4596                         dp2->di_extsize = inodedep->id_savedextsize;
 4597                         hadchanges = 1;
 4598                 }
 4599         }
 4600         inodedep->id_savedsize = -1;
 4601         inodedep->id_savedextsize = -1;
 4602         /*
 4603          * If there were any rollbacks in the inode block, then it must be
 4604          * marked dirty so that it will eventually get written back in
 4605          * its correct form.
 4606          */
 4607         if (hadchanges)
 4608                 bdirty(bp);
 4609         /*
 4610          * Process any allocdirects that completed during the update.
 4611          */
 4612         if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
 4613                 handle_allocdirect_partdone(adp);
 4614         if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
 4615                 handle_allocdirect_partdone(adp);
 4616         /*
 4617          * Process deallocations that were held pending until the
 4618          * inode had been written to disk. Freeing of the inode
 4619          * is delayed until after all blocks have been freed to
 4620          * avoid creation of new <vfsid, inum, lbn> triples
 4621          * before the old ones have been deleted.
 4622          */
 4623         filefree = NULL;
 4624         while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
 4625                 WORKLIST_REMOVE(wk);
 4626                 switch (wk->wk_type) {
 4627 
 4628                 case D_FREEFILE:
 4629                         /*
 4630                          * We defer adding filefree to the worklist until
 4631                          * all other additions have been made to ensure
 4632                          * that it will be done after all the old blocks
 4633                          * have been freed.
 4634                          */
 4635                         if (filefree != NULL)
 4636                                 panic("handle_written_inodeblock: filefree");
 4637                         filefree = wk;
 4638                         continue;
 4639 
 4640                 case D_MKDIR:
 4641                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
 4642                         continue;
 4643 
 4644                 case D_DIRADD:
 4645                         diradd_inode_written(WK_DIRADD(wk), inodedep);
 4646                         continue;
 4647 
 4648                 case D_FREEBLKS:
 4649                         wk->wk_state |= COMPLETE;
 4650                         if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
 4651                                 continue;
 4652                          /* -- fall through -- */
 4653                 case D_FREEFRAG:
 4654                 case D_DIRREM:
 4655                         add_to_worklist(wk);
 4656                         continue;
 4657 
 4658                 case D_NEWDIRBLK:
 4659                         free_newdirblk(WK_NEWDIRBLK(wk));
 4660                         continue;
 4661 
 4662                 default:
 4663                         panic("handle_written_inodeblock: Unknown type %s",
 4664                             TYPENAME(wk->wk_type));
 4665                         /* NOTREACHED */
 4666                 }
 4667         }
 4668         if (filefree != NULL) {
 4669                 if (free_inodedep(inodedep) == 0)
 4670                         panic("handle_written_inodeblock: live inodedep");
 4671                 add_to_worklist(filefree);
 4672                 return (0);
 4673         }
 4674 
 4675         /*
 4676          * If no outstanding dependencies, free it.
 4677          */
 4678         if (free_inodedep(inodedep) ||
 4679             (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
 4680              TAILQ_EMPTY(&inodedep->id_extupdt)))
 4681                 return (0);
 4682         return (hadchanges);
 4683 }
 4684 
 4685 /*
 4686  * Process a diradd entry after its dependent inode has been written.
 4687  * This routine must be called with splbio interrupts blocked.
 4688  */
 4689 static void
 4690 diradd_inode_written(dap, inodedep)
 4691         struct diradd *dap;
 4692         struct inodedep *inodedep;
 4693 {
 4694         struct pagedep *pagedep;
 4695 
 4696         dap->da_state |= COMPLETE;
 4697         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 4698                 if (dap->da_state & DIRCHG)
 4699                         pagedep = dap->da_previous->dm_pagedep;
 4700                 else
 4701                         pagedep = dap->da_pagedep;
 4702                 LIST_REMOVE(dap, da_pdlist);
 4703                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 4704         }
 4705         WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 4706 }
 4707 
 4708 /*
 4709  * Handle the completion of a mkdir dependency.
 4710  */
 4711 static void
 4712 handle_written_mkdir(mkdir, type)
 4713         struct mkdir *mkdir;
 4714         int type;
 4715 {
 4716         struct diradd *dap;
 4717         struct pagedep *pagedep;
 4718 
 4719         if (mkdir->md_state != type)
 4720                 panic("handle_written_mkdir: bad type");
 4721         dap = mkdir->md_diradd;
 4722         dap->da_state &= ~type;
 4723         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
 4724                 dap->da_state |= DEPCOMPLETE;
 4725         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 4726                 if (dap->da_state & DIRCHG)
 4727                         pagedep = dap->da_previous->dm_pagedep;
 4728                 else
 4729                         pagedep = dap->da_pagedep;
 4730                 LIST_REMOVE(dap, da_pdlist);
 4731                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 4732         }
 4733         LIST_REMOVE(mkdir, md_mkdirs);
 4734         WORKITEM_FREE(mkdir, D_MKDIR);
 4735 }
 4736 
 4737 /*
 4738  * Called from within softdep_disk_write_complete above.
 4739  * A write operation was just completed. Removed inodes can
 4740  * now be freed and associated block pointers may be committed.
 4741  * Note that this routine is always called from interrupt level
 4742  * with further splbio interrupts blocked.
 4743  */
 4744 static int 
 4745 handle_written_filepage(pagedep, bp)
 4746         struct pagedep *pagedep;
 4747         struct buf *bp;         /* buffer containing the written page */
 4748 {
 4749         struct dirrem *dirrem;
 4750         struct diradd *dap, *nextdap;
 4751         struct direct *ep;
 4752         int i, chgs;
 4753 
 4754         if ((pagedep->pd_state & IOSTARTED) == 0)
 4755                 panic("handle_written_filepage: not started");
 4756         pagedep->pd_state &= ~IOSTARTED;
 4757         /*
 4758          * Process any directory removals that have been committed.
 4759          */
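              /*
               * Recording pd_ino in each dirrem and queueing it lets the
               * worklist processing do the actual removal work (e.g.,
               * releasing the removed inode's link count) now that the
               * cleared directory entry is on disk.
               */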
 4760         while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
 4761                 LIST_REMOVE(dirrem, dm_next);
 4762                 dirrem->dm_dirinum = pagedep->pd_ino;
 4763                 add_to_worklist(&dirrem->dm_list);
 4764         }
 4765         /*
 4766          * Free any directory additions that have been committed.
 4767          * If it is a newly allocated block, we have to wait until
 4768          * the on-disk directory inode claims the new block.
 4769          */
 4770         if ((pagedep->pd_state & NEWBLOCK) == 0)
 4771                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 4772                         free_diradd(dap);
 4773         /*
 4774          * Uncommitted directory entries must be restored.
 4775          */
 4776         for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
 4777                 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
 4778                      dap = nextdap) {
 4779                         nextdap = LIST_NEXT(dap, da_pdlist);
 4780                         if (dap->da_state & ATTACHED)
 4781                                 panic("handle_written_filepage: attached");
 4782                         ep = (struct direct *)
 4783                             ((char *)bp->b_data + dap->da_offset);
 4784                         ep->d_ino = dap->da_newinum;
 4785                         dap->da_state &= ~UNDONE;
 4786                         dap->da_state |= ATTACHED;
 4787                         chgs = 1;
 4788                         /*
 4789                          * If the inode referenced by the directory has
 4790                          * been written out, then the dependency can be
 4791                          * moved to the pending list.
 4792                          */
 4793                         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 4794                                 LIST_REMOVE(dap, da_pdlist);
 4795                                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
 4796                                     da_pdlist);
 4797                         }
 4798                 }
 4799         }
 4800         /*
 4801          * If there were any rollbacks in the directory, then it must be
 4802          * marked dirty so that it will eventually get written back in
 4803          * its correct form.
 4804          */
 4805         if (chgs) {
 4806                 if ((bp->b_flags & B_DELWRI) == 0)
 4807                         stat_dir_entry++;
 4808                 bdirty(bp);
 4809                 return (1);
 4810         }
 4811         /*
 4812          * If we are not waiting for a new directory block to be
 4813          * claimed by its inode, then the pagedep will be freed.
 4814          * Otherwise it will remain to track any new entries on
 4815          * the page in case they are fsync'ed.
 4816          */
 4817         if ((pagedep->pd_state & NEWBLOCK) == 0) {
 4818                 LIST_REMOVE(pagedep, pd_hash);
 4819                 WORKITEM_FREE(pagedep, D_PAGEDEP);
 4820         }
 4821         return (0);
 4822 }
 4823 
 4824 /*
 4825  * Writing back in-core inode structures.
 4826  * 
 4827  * The filesystem only accesses an inode's contents when it occupies an
 4828  * "in-core" inode structure.  These "in-core" structures are separate from
 4829  * the page frames used to cache inode blocks.  Only the latter are
 4830  * transferred to/from the disk.  So, when the updated contents of the
 4831  * "in-core" inode structure are copied to the corresponding in-memory inode
 4832  * block, the dependencies are also transferred.  The following procedure is
 4833  * called when copying a dirty "in-core" inode to a cached inode block.
 4834  */
 4835 
 4836 /*
 4837  * Called when an inode is loaded from disk. If the effective link count
 4838  * differed from the actual link count when it was last flushed, then we
 4839  * need to ensure that the correct effective link count is put back.
 4840  */
 4841 void 
 4842 softdep_load_inodeblock(ip)
 4843         struct inode *ip;       /* the "in_core" copy of the inode */
 4844 {
 4845         struct inodedep *inodedep;
 4846 
 4847         /*
 4848          * Check for alternate nlink count.
 4849          */
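              /*
               * For example: if the removal of a file's last name has not yet
               * reached the disk, id_nlinkdelta is 1, so an inode read back
               * with di_nlink == 1 is presented to the rest of the kernel
               * with i_effnlink == 0.
               */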
 4850         ip->i_effnlink = ip->i_nlink;
 4851         ACQUIRE_LOCK(&lk);
 4852         if (inodedep_lookup(UFSTOVFS(ip->i_ump),
 4853             ip->i_number, 0, &inodedep) == 0) {
 4854                 FREE_LOCK(&lk);
 4855                 return;
 4856         }
 4857         ip->i_effnlink -= inodedep->id_nlinkdelta;
 4858         if (inodedep->id_state & SPACECOUNTED)
 4859                 ip->i_flag |= IN_SPACECOUNTED;
 4860         FREE_LOCK(&lk);
 4861 }
 4862 
 4863 /*
 4864  * This routine is called just before the "in-core" inode
 4865  * information is to be copied to the in-memory inode block.
 4866  * Recall that an inode block contains several inodes. If
 4867  * the force flag is set, then the dependencies will be
 4868  * cleared so that the update can always be made. Note that
 4869  * the buffer is locked when this routine is called, so we
 4870  * will never be in the middle of writing the inode block 
 4871  * to disk.
 4872  */
 4873 void 
 4874 softdep_update_inodeblock(ip, bp, waitfor)
 4875         struct inode *ip;       /* the "in_core" copy of the inode */
 4876         struct buf *bp;         /* the buffer containing the inode block */
 4877         int waitfor;            /* nonzero => update must be allowed */
 4878 {
 4879         struct inodedep *inodedep;
 4880         struct worklist *wk;
 4881         struct mount *mp;
 4882         struct buf *ibp;
 4883         int error;
 4884 
 4885         /*
 4886          * If the effective link count is not equal to the actual link
 4887          * count, then we must track the difference in an inodedep while
 4888          * the inode is (potentially) tossed out of the cache. Otherwise,
 4889          * if there is no existing inodedep, then there are no dependencies
 4890          * to track.
 4891          */
 4892         mp = UFSTOVFS(ip->i_ump);
 4893         ACQUIRE_LOCK(&lk);
 4894         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
 4895                 FREE_LOCK(&lk);
 4896                 if (ip->i_effnlink != ip->i_nlink)
 4897                         panic("softdep_update_inodeblock: bad link count");
 4898                 return;
 4899         }
 4900         if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
 4901                 panic("softdep_update_inodeblock: bad delta");
 4902         /*
 4903          * Changes have been initiated. Anything depending on these
 4904          * changes cannot occur until this inode has been written.
 4905          */
 4906         inodedep->id_state &= ~COMPLETE;
 4907         if ((inodedep->id_state & ONWORKLIST) == 0)
 4908                 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
 4909         /*
 4910          * Any new dependencies associated with the in-core inode must
 4911          * now be moved to the list associated with the buffer holding
 4912          * the in-memory copy of the inode. Once merged, process any
 4913          * allocdirects that are completed by the merger.
 4914          */
 4915         merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
 4916         if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
 4917                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
 4918         merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
 4919         if (!TAILQ_EMPTY(&inodedep->id_extupdt))
 4920                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
 4921         /*
 4922          * Now that the inode has been pushed into the buffer, the
 4923          * operations dependent on the inode being written to disk
 4924          * can be moved to the id_bufwait so that they will be
 4925          * processed when the buffer I/O completes.
 4926          */
 4927         while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
 4928                 WORKLIST_REMOVE(wk);
 4929                 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
 4930         }
 4931         /*
 4932          * Newly allocated inodes cannot be written until the bitmap
 4933          * that allocates them has been written (indicated by
 4934          * DEPCOMPLETE being set in id_state). If we are doing a
 4935          * forced sync (e.g., an fsync on a file), we force the bitmap
 4936          * to be written so that the update can be done.
 4937          */
 4938         if (waitfor == 0) {
 4939                 FREE_LOCK(&lk);
 4940                 return;
 4941         }
 4942 retry:
 4943         if ((inodedep->id_state & DEPCOMPLETE) != 0) {
 4944                 FREE_LOCK(&lk);
 4945                 return;
 4946         }
 4947         ibp = inodedep->id_buf;
 4948         ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
 4949         if (ibp == NULL) {
 4950                 /*
 4951                  * If ibp came back as NULL, the dependency could have been
 4952                  * freed while we slept.  Look it up again, and check to see
 4953                  * that it has completed.
 4954                  */
 4955                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 4956                         goto retry;
 4957                 FREE_LOCK(&lk);
 4958                 return;
 4959         }
 4960         FREE_LOCK(&lk);
 4961         if ((error = bwrite(ibp)) != 0)
 4962                 softdep_error("softdep_update_inodeblock: bwrite", error);
 4963 }
 4964 
 4965 /*
 4966  * Merge a new inode dependency list (such as id_newinoupdt) into an
 4967  * old inode dependency list (such as id_inoupdt). This routine must be
 4968  * called with splbio interrupts blocked.
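       * Both lists are sorted by ad_lbn.  Each new entry is spliced in front
       * of the first old entry with an equal or greater lbn; when the lbns
       * are equal, allocdirect_merge() collapses the two dependencies.
       * Anything left on the new list is appended at the tail.  For example,
       * merging a new list {1, 3} into an old list {2, 3, 7} yields
       * {1, 2, 3, 7}, with the two lbn-3 entries merged.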
 4969  */
 4970 static void
 4971 merge_inode_lists(newlisthead, oldlisthead)
 4972         struct allocdirectlst *newlisthead;
 4973         struct allocdirectlst *oldlisthead;
 4974 {
 4975         struct allocdirect *listadp, *newadp;
 4976 
 4977         newadp = TAILQ_FIRST(newlisthead);
 4978         for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
 4979                 if (listadp->ad_lbn < newadp->ad_lbn) {
 4980                         listadp = TAILQ_NEXT(listadp, ad_next);
 4981                         continue;
 4982                 }
 4983                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
 4984                 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
 4985                 if (listadp->ad_lbn == newadp->ad_lbn) {
 4986                         allocdirect_merge(oldlisthead, newadp,
 4987                             listadp);
 4988                         listadp = newadp;
 4989                 }
 4990                 newadp = TAILQ_FIRST(newlisthead);
 4991         }
 4992         while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
 4993                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
 4994                 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
 4995         }
 4996 }
 4997 
 4998 /*
 4999  * If we are doing an fsync, then we must ensure that any directory
 5000  * entries for the inode have been written after the inode gets to disk.
 5001  */
 5002 int
 5003 softdep_fsync(vp)
 5004         struct vnode *vp;       /* the "in_core" copy of the inode */
 5005 {
 5006         struct inodedep *inodedep;
 5007         struct pagedep *pagedep;
 5008         struct worklist *wk;
 5009         struct diradd *dap;
 5010         struct mount *mp;
 5011         struct vnode *pvp;
 5012         struct inode *ip;
 5013         struct buf *bp;
 5014         struct fs *fs;
 5015         struct thread *td = curthread;
 5016         int error, flushparent, pagedep_new_block;
 5017         ino_t parentino;
 5018         ufs_lbn_t lbn;
 5019 
 5020         ip = VTOI(vp);
 5021         fs = ip->i_fs;
 5022         mp = vp->v_mount;
 5023         ACQUIRE_LOCK(&lk);
 5024         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
 5025                 FREE_LOCK(&lk);
 5026                 return (0);
 5027         }
 5028         if (!LIST_EMPTY(&inodedep->id_inowait) ||
 5029             !LIST_EMPTY(&inodedep->id_bufwait) ||
 5030             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 5031             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 5032             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 5033             !TAILQ_EMPTY(&inodedep->id_newinoupdt))
 5034                 panic("softdep_fsync: pending ops");
 5035         for (error = 0, flushparent = 0; ; ) {
 5036                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
 5037                         break;
 5038                 if (wk->wk_type != D_DIRADD)
 5039                         panic("softdep_fsync: Unexpected type %s",
 5040                             TYPENAME(wk->wk_type));
 5041                 dap = WK_DIRADD(wk);
 5042                 /*
 5043                  * Flush our parent if this directory entry has a MKDIR_PARENT
 5044                  * dependency or is contained in a newly allocated block.
 5045                  */
 5046                 if (dap->da_state & DIRCHG)
 5047                         pagedep = dap->da_previous->dm_pagedep;
 5048                 else
 5049                         pagedep = dap->da_pagedep;
 5050                 parentino = pagedep->pd_ino;
 5051                 lbn = pagedep->pd_lbn;
 5052                 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
 5053                         panic("softdep_fsync: dirty");
 5054                 if ((dap->da_state & MKDIR_PARENT) ||
 5055                     (pagedep->pd_state & NEWBLOCK))
 5056                         flushparent = 1;
 5057                 else
 5058                         flushparent = 0;
 5059                 /*
 5060                  * If we are being fsync'ed as part of vgone'ing this vnode,
 5061                  * then we will not be able to release and recover the
 5062                  * vnode below, so we just have to give up on writing its
 5063                  * directory entry out. It will eventually be written, just
 5064                  * not now, but then the user was not asking to have it
 5065                  * written, so we are not breaking any promises.
 5066                  */
 5067                 if (vp->v_iflag & VI_DOOMED)
 5068                         break;
 5069                 /*
 5070                  * We prevent deadlock by always fetching inodes from the
 5071                  * root, moving down the directory tree. Thus, when fetching
 5072                  * our parent directory, we first try to get the lock. If
 5073                  * that fails, we must unlock ourselves before requesting
 5074                  * the lock on our parent. See the comment in ufs_lookup
 5075                  * for details on possible races.
 5076                  */
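                      /*
                       * Sketch of the dance below, with the child's vnode
                       * lock held on entry: try ffs_vget(parent, LK_NOWAIT)
                       * first; if that would block, VOP_UNLOCK() the child,
                       * ffs_vget() the parent for real, then vn_lock() the
                       * child again. The parent-before-child order is thus
                       * never violated while both locks are being taken.
                       */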
 5077                 FREE_LOCK(&lk);
 5078                 if (ffs_vget(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
 5079                         VOP_UNLOCK(vp, 0, td);
 5080                         error = ffs_vget(mp, parentino, LK_EXCLUSIVE, &pvp);
 5081                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 5082                         if (error != 0)
 5083                                 return (error);
 5084                 }
 5085                 /*
 5086                  * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
 5087                  * that are contained in direct blocks will be resolved by
 5088                  * doing an ffs_update. Pagedeps contained in indirect blocks
 5089                  * may require a complete sync'ing of the directory. So, we
 5090                  * try the cheap and fast ffs_update first, and if that fails,
 5091                  * then we do the slower ffs_syncvnode of the directory.
 5092                  */
 5093                 if (flushparent) {
 5094                         int locked;
 5095 
 5096                         if ((error = ffs_update(pvp, 1)) != 0) {
 5097                                 vput(pvp);
 5098                                 return (error);
 5099                         }
 5100                         ACQUIRE_LOCK(&lk);
 5101                         locked = 1;
 5102                         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
 5103                                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
 5104                                         if (wk->wk_type != D_DIRADD)
 5105                                                 panic("softdep_fsync: Unexpected type %s",
 5106                                                       TYPENAME(wk->wk_type));
 5107                                         dap = WK_DIRADD(wk);
 5108                                         if (dap->da_state & DIRCHG)
 5109                                                 pagedep = dap->da_previous->dm_pagedep;
 5110                                         else
 5111                                                 pagedep = dap->da_pagedep;
 5112                                         pagedep_new_block = pagedep->pd_state & NEWBLOCK;
 5113                                         FREE_LOCK(&lk);
 5114                                         locked = 0;
 5115                                         if (pagedep_new_block &&
 5116                                             (error = ffs_syncvnode(pvp, MNT_WAIT))) {
 5117                                                 vput(pvp);
 5118                                                 return (error);
 5119                                         }
 5120                                 }
 5121                         }
 5122                         if (locked)
 5123                                 FREE_LOCK(&lk);
 5124                 }
 5125                 /*
 5126                  * Flush directory page containing the inode's name.
 5127                  */
 5128                 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
 5129                     &bp);
 5130                 if (error == 0)
 5131                         error = bwrite(bp);
 5132                 else
 5133                         brelse(bp);
 5134                 vput(pvp);
 5135                 if (error != 0)
 5136                         return (error);
 5137                 ACQUIRE_LOCK(&lk);
 5138                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
 5139                         break;
 5140         }
 5141         FREE_LOCK(&lk);
 5142         return (0);
 5143 }
 5144 
 5145 /*
 5146  * Flush all the dirty bitmaps associated with the block device
 5147  * before flushing the rest of the dirty blocks so as to reduce
 5148  * the number of dependencies that will have to be rolled back.
 5149  */
 5150 void
 5151 softdep_fsync_mountdev(vp)
 5152         struct vnode *vp;
 5153 {
 5154         struct buf *bp, *nbp;
 5155         struct worklist *wk;
 5156 
 5157         if (!vn_isdisk(vp, NULL))
 5158                 panic("softdep_fsync_mountdev: vnode not a disk");
 5159 restart:
 5160         ACQUIRE_LOCK(&lk);
 5161         VI_LOCK(vp);
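              /*
               * Each bawrite() below is issued with both locks dropped,
               * so rather than trusting a saved nbp we go back to
               * restart and rescan the dirty list from its head.
               */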
 5162         TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 5163                 /* 
 5164                  * If it is already scheduled, skip to the next buffer.
 5165                  */
 5166                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
 5167                         continue;
 5168 
 5169                 if ((bp->b_flags & B_DELWRI) == 0)
 5170                         panic("softdep_fsync_mountdev: not dirty");
 5171                 /*
 5172                  * We are only interested in bitmaps with outstanding
 5173                  * dependencies.
 5174                  */
 5175                 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
 5176                     wk->wk_type != D_BMSAFEMAP ||
 5177                     (bp->b_vflags & BV_BKGRDINPROG)) {
 5178                         BUF_UNLOCK(bp);
 5179                         continue;
 5180                 }
 5181                 VI_UNLOCK(vp);
 5182                 FREE_LOCK(&lk);
 5183                 bremfree(bp);
 5184                 (void) bawrite(bp);
 5185                 goto restart;
 5186         }
 5187         FREE_LOCK(&lk);
 5188         drain_output(vp);
 5189         VI_UNLOCK(vp);
 5190 }
 5191 
 5192 /*
 5193  * This routine is called when we are trying to synchronously flush a
 5194  * file. This routine must eliminate any filesystem metadata dependencies
 5195  * so that the syncing routine can succeed by pushing the dirty blocks
 5196  * associated with the file. If any I/O errors occur, they are returned.
 5197  */
 5198 int
 5199 softdep_sync_metadata(struct vnode *vp)
 5200 {
 5201         struct pagedep *pagedep;
 5202         struct allocdirect *adp;
 5203         struct allocindir *aip;
 5204         struct buf *bp, *nbp;
 5205         struct worklist *wk;
 5206         int i, error, waitfor;
 5207 
 5208         if (!DOINGSOFTDEP(vp))
 5209                 return (0);
 5210         /*
 5211          * Ensure that any direct block dependencies have been cleared.
 5212          */
 5213         ACQUIRE_LOCK(&lk);
 5214         if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
 5215                 FREE_LOCK(&lk);
 5216                 return (error);
 5217         }
 5218         FREE_LOCK(&lk);
 5219         /*
 5220          * For most files, the only metadata dependencies are the
 5221          * cylinder group maps that allocate their inode or blocks.
 5222          * The block allocation dependencies can be found by traversing
 5223          * the dependency lists for any buffers that remain on their
 5224          * dirty buffer list. The inode allocation dependency will
 5225          * be resolved when the inode is updated with MNT_WAIT.
 5226          * This work is done in two passes. The first pass grabs most
 5227          * of the buffers and begins asynchronously writing them. The
 5228          * only way to wait for these asynchronous writes is to sleep
 5229          * on the filesystem vnode which may stay busy for a long time
 5230          * if the filesystem is active. So, instead, we make a second
 5231          * pass over the dependencies blocking on each write. In the
 5232          * usual case we will be blocking against a write that we
 5233          * initiated, so when it is done the dependency will have been
 5234          * resolved. Thus the second pass is expected to end quickly.
 5235          */
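              /*
               * In outline, the control flow below is:
               *   pass 1: waitfor == MNT_NOWAIT, dependency buffers are
               *           started asynchronously with bawrite();
               *   pass 2: waitfor == MNT_WAIT, remaining buffers are
               *           written with bwrite(), blocking until done.
               * Pass 2 normally just waits out writes pass 1 started.
               */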
 5236         waitfor = MNT_NOWAIT;
 5237 
 5238 top:
 5239         /*
 5240          * We must wait for any I/O in progress to finish so that
 5241          * all potential buffers on the dirty list will be visible.
 5242          */
 5243         VI_LOCK(vp);
 5244         drain_output(vp);
 5245         while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) {
 5246                 bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT);
 5247                 if (bp)
 5248                         break;
 5249         }
 5250         VI_UNLOCK(vp);
 5251         if (bp == NULL)
 5252                 return (0);
 5253 loop:
 5254         /* While syncing snapshots, we must allow recursive lookups */
 5255         bp->b_lock.lk_flags |= LK_CANRECURSE;
 5256         ACQUIRE_LOCK(&lk);
 5257         /*
 5258          * As we hold the buffer locked, none of its dependencies
 5259          * will disappear.
 5260          */
 5261         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 5262                 switch (wk->wk_type) {
 5263 
 5264                 case D_ALLOCDIRECT:
 5265                         adp = WK_ALLOCDIRECT(wk);
 5266                         if (adp->ad_state & DEPCOMPLETE)
 5267                                 continue;
 5268                         nbp = adp->ad_buf;
 5269                         nbp = getdirtybuf(nbp, &lk, waitfor);
 5270                         if (nbp == NULL)
 5271                                 continue;
 5272                         FREE_LOCK(&lk);
 5273                         if (waitfor == MNT_NOWAIT) {
 5274                                 bawrite(nbp);
 5275                         } else if ((error = bwrite(nbp)) != 0) {
 5276                                 break;
 5277                         }
 5278                         ACQUIRE_LOCK(&lk);
 5279                         continue;
 5280 
 5281                 case D_ALLOCINDIR:
 5282                         aip = WK_ALLOCINDIR(wk);
 5283                         if (aip->ai_state & DEPCOMPLETE)
 5284                                 continue;
 5285                         nbp = aip->ai_buf;
 5286                         nbp = getdirtybuf(nbp, &lk, waitfor);
 5287                         if (nbp == NULL)
 5288                                 continue;
 5289                         FREE_LOCK(&lk);
 5290                         if (waitfor == MNT_NOWAIT) {
 5291                                 bawrite(nbp);
 5292                         } else if ((error = bwrite(nbp)) != 0) {
 5293                                 break;
 5294                         }
 5295                         ACQUIRE_LOCK(&lk);
 5296                         continue;
 5297 
 5298                 case D_INDIRDEP:
 5299                 restart:
 5300 
 5301                         LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
 5302                                 if (aip->ai_state & DEPCOMPLETE)
 5303                                         continue;
 5304                                 nbp = aip->ai_buf;
 5305                                 nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
 5306                                 if (nbp == NULL)
 5307                                         goto restart;
 5308                                 FREE_LOCK(&lk);
 5309                                 if ((error = bwrite(nbp)) != 0) {
 5310                                         goto loop_end;
 5311                                 }
 5312                                 ACQUIRE_LOCK(&lk);
 5313                                 goto restart;
 5314                         }
 5315                         continue;
 5316 
 5317                 case D_INODEDEP:
 5318                         if ((error = flush_inodedep_deps(wk->wk_mp,
 5319                             WK_INODEDEP(wk)->id_ino)) != 0) {
 5320                                 FREE_LOCK(&lk);
 5321                                 break;
 5322                         }
 5323                         continue;
 5324 
 5325                 case D_PAGEDEP:
 5326                         /*
 5327                          * We are trying to sync a directory that may
 5328                          * have dependencies on its own metadata and/or
 5329                          * on the inodes of any recently allocated
 5330                          * files. We walk its diradd lists, pushing out
 5331                          * the associated inodes.
 5332                          */
 5333                         pagedep = WK_PAGEDEP(wk);
 5334                         for (i = 0; i < DAHASHSZ; i++) {
 5335                                 if (LIST_EMPTY(&pagedep->pd_diraddhd[i]))
 5336                                         continue;
 5337                                 if ((error =
 5338                                     flush_pagedep_deps(vp, wk->wk_mp,
 5339                                                 &pagedep->pd_diraddhd[i]))) {
 5340                                         FREE_LOCK(&lk);
 5341                                         goto loop_end;
 5342                                 }
 5343                         }
 5344                         continue;
 5345 
 5346                 case D_MKDIR:
 5347                         /*
 5348                          * This case should never happen if the vnode has
 5349                          * been properly sync'ed. However, if this function
 5350                          * is used at a place where the vnode has not yet
 5351                          * been sync'ed, this dependency can show up. So,
 5352                          * rather than panic, just flush it.
 5353                          */
 5354                         nbp = WK_MKDIR(wk)->md_buf;
 5355                         nbp = getdirtybuf(nbp, &lk, waitfor);
 5356                         if (nbp == NULL)
 5357                                 continue;
 5358                         FREE_LOCK(&lk);
 5359                         if (waitfor == MNT_NOWAIT) {
 5360                                 bawrite(nbp);
 5361                         } else if ((error = bwrite(nbp)) != 0) {
 5362                                 break;
 5363                         }
 5364                         ACQUIRE_LOCK(&lk);
 5365                         continue;
 5366 
 5367                 case D_BMSAFEMAP:
 5368                         /*
 5369                          * This case should never happen if the vnode has
 5370                          * been properly sync'ed. However, if this function
 5371                          * is used at a place where the vnode has not yet
 5372                          * been sync'ed, this dependency can show up. So,
 5373                          * rather than panic, just flush it.
 5374                          */
 5375                         nbp = WK_BMSAFEMAP(wk)->sm_buf;
 5376                         nbp = getdirtybuf(nbp, &lk, waitfor);
 5377                         if (nbp == NULL)
 5378                                 continue;
 5379                         FREE_LOCK(&lk);
 5380                         if (waitfor == MNT_NOWAIT) {
 5381                                 bawrite(nbp);
 5382                         } else if ((error = bwrite(nbp)) != 0) {
 5383                                 break;
 5384                         }
 5385                         ACQUIRE_LOCK(&lk);
 5386                         continue;
 5387 
 5388                 default:
 5389                         panic("softdep_sync_metadata: Unknown type %s",
 5390                             TYPENAME(wk->wk_type));
 5391                         /* NOTREACHED */
 5392                 }
 5393         loop_end:
 5394         /* We reach here only on error, with lk already released. */
 5395                 if (error == 0)
 5396                         panic("softdep_sync_metadata: zero error");
 5397                 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
 5398                 bawrite(bp);
 5399                 return (error);
 5400         }
 5401         FREE_LOCK(&lk);
 5402         VI_LOCK(vp);
 5403         while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
 5404                 nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT);
 5405                 if (nbp)
 5406                         break;
 5407         }
 5408         VI_UNLOCK(vp);
 5409         bp->b_lock.lk_flags &= ~LK_CANRECURSE;
 5410         bawrite(bp);
 5411         if (nbp != NULL) {
 5412                 bp = nbp;
 5413                 goto loop;
 5414         }
 5415         /*
 5416          * The brief unlock is to allow any pent up dependency
 5417          * processing to be done. Then proceed with the second pass.
 5418          */
 5419         if (waitfor == MNT_NOWAIT) {
 5420                 waitfor = MNT_WAIT;
 5421                 goto top;
 5422         }
 5423 
 5424         /*
 5425          * If we have managed to get rid of all the dirty buffers,
 5426          * then we are done. For certain directories and block
 5427          * devices, we may need to do further work.
 5428          *
 5429          * We must wait for any I/O in progress to finish so that
 5430          * all potential buffers on the dirty list will be visible.
 5431          */
 5432         VI_LOCK(vp);
 5433         drain_output(vp);
 5434         VI_UNLOCK(vp);
 5435         return (0);
 5436 }
 5437 
 5438 /*
 5439  * Flush the dependencies associated with an inodedep.
 5440  * Called with splbio blocked.
 5441  */
 5442 static int
 5443 flush_inodedep_deps(mp, ino)
 5444         struct mount *mp;
 5445         ino_t ino;
 5446 {
 5447         struct inodedep *inodedep;
 5448         int error, waitfor;
 5449 
 5450         /*
 5451          * This work is done in two passes. The first pass grabs most
 5452          * of the buffers and begins asynchronously writing them. The
 5453          * only way to wait for these asynchronous writes is to sleep
 5454          * on the filesystem vnode which may stay busy for a long time
 5455          * if the filesystem is active. So, instead, we make a second
 5456          * pass over the dependencies blocking on each write. In the
 5457          * usual case we will be blocking against a write that we
 5458          * initiated, so when it is done the dependency will have been
 5459          * resolved. Thus the second pass is expected to end quickly.
 5460          * We give a brief window at the top of the loop to allow
 5461          * any pending I/O to complete.
 5462          */
 5463         for (error = 0, waitfor = MNT_NOWAIT; ; ) {
 5464                 if (error)
 5465                         return (error);
 5466                 FREE_LOCK(&lk);
 5467                 ACQUIRE_LOCK(&lk);
 5468                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 5469                         return (0);
 5470                 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
 5471                     flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
 5472                     flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
 5473                     flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
 5474                         continue;
 5475                 /*
 5476                  * If this was pass 2, we are done; otherwise begin pass 2.
 5477                  */
 5478                 if (waitfor == MNT_WAIT)
 5479                         break;
 5480                 waitfor = MNT_WAIT;
 5481         }
 5482         /*
 5483          * Try freeing inodedep in case all dependencies have been removed.
 5484          */
 5485         if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
 5486                 (void) free_inodedep(inodedep);
 5487         return (0);
 5488 }
 5489 
 5490 /*
 5491  * Flush an inode dependency list.
 5492  * Called with splbio blocked.
 5493  */
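      /*
       * A nonzero return means the lock was (or may have been) dropped
       * while a buffer was written or waited for, so the caller must
       * rescan its lists; zero means the scan completed without
       * initiating any write.
       */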
 5494 static int
 5495 flush_deplist(listhead, waitfor, errorp)
 5496         struct allocdirectlst *listhead;
 5497         int waitfor;
 5498         int *errorp;
 5499 {
 5500         struct allocdirect *adp;
 5501         struct buf *bp;
 5502 
 5503         mtx_assert(&lk, MA_OWNED);
 5504         TAILQ_FOREACH(adp, listhead, ad_next) {
 5505                 if (adp->ad_state & DEPCOMPLETE)
 5506                         continue;
 5507                 bp = adp->ad_buf;
 5508                 bp = getdirtybuf(bp, &lk, waitfor);
 5509                 if (bp == NULL) {
 5510                         if (waitfor == MNT_NOWAIT)
 5511                                 continue;
 5512                         return (1);
 5513                 }
 5514                 FREE_LOCK(&lk);
 5515                 if (waitfor == MNT_NOWAIT) {
 5516                         bawrite(bp);
 5517                 } else if ((*errorp = bwrite(bp)) != 0) {
 5518                         ACQUIRE_LOCK(&lk);
 5519                         return (1);
 5520                 }
 5521                 ACQUIRE_LOCK(&lk);
 5522                 return (1);
 5523         }
 5524         return (0);
 5525 }
 5526 
 5527 /*
 5528  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
 5529  * Called with splbio blocked.
 5530  */
 5531 static int
 5532 flush_pagedep_deps(pvp, mp, diraddhdp)
 5533         struct vnode *pvp;
 5534         struct mount *mp;
 5535         struct diraddhd *diraddhdp;
 5536 {
 5537         struct inodedep *inodedep;
 5538         struct ufsmount *ump;
 5539         struct diradd *dap;
 5540         struct vnode *vp;
 5541         int error = 0;
 5542         struct buf *bp;
 5543         ino_t inum;
 5544         struct worklist *wk;
 5545 
 5546         ump = VFSTOUFS(mp);
 5547         while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
 5548                 /*
 5549                  * Flush ourselves if this directory entry
 5550                  * has a MKDIR_PARENT dependency.
 5551                  */
 5552                 if (dap->da_state & MKDIR_PARENT) {
 5553                         FREE_LOCK(&lk);
 5554                         if ((error = ffs_update(pvp, 1)) != 0)
 5555                                 break;
 5556                         ACQUIRE_LOCK(&lk);
 5557                         /*
 5558                          * If that cleared dependencies, go on to next.
 5559                          */
 5560                         if (dap != LIST_FIRST(diraddhdp))
 5561                                 continue;
 5562                         if (dap->da_state & MKDIR_PARENT)
 5563                                 panic("flush_pagedep_deps: MKDIR_PARENT");
 5564                 }
 5565                 /*
 5566                  * A newly allocated directory must have its "." and
 5567                  * ".." entries written out before its name can be
 5568                  * committed in its parent. We do not want or need
 5569                  * the full semantics of a synchronous ffs_syncvnode as
 5570                  * that may end up here again, once for each directory
 5571                  * level in the filesystem. Instead, we push the blocks
 5572                  * and wait for them to clear. We have to fsync twice
 5573                  * because the first call may choose to defer blocks
 5574                  * that still have dependencies, but deferral will
 5575                  * happen at most once.
 5576                  */
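                      /*
                       * Concretely: the first ffs_syncvnode() pass may
                       * leave the directory's first block dirty because it
                       * still carries a D_MKDIR dependency; the second
                       * pass retires what the first one started. If the
                       * block is dirty with a D_MKDIR dependency even so,
                       * the gbincore() loop below force-writes it.
                       */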
 5577                 inum = dap->da_newinum;
 5578                 if (dap->da_state & MKDIR_BODY) {
 5579                         FREE_LOCK(&lk);
 5580                         if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp)))
 5581                                 break;
 5582                         if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) ||
 5583                             (error=ffs_syncvnode(vp, MNT_NOWAIT))) {
 5584                                 vput(vp);
 5585                                 break;
 5586                         }
 5587                         VI_LOCK(vp);
 5588                         drain_output(vp);
 5589                         /*
 5590                          * If first block is still dirty with a D_MKDIR
 5591                          * dependency then it needs to be written now.
 5592                          */
 5593                         for (;;) {
 5594                                 error = 0;
 5595                                 bp = gbincore(&vp->v_bufobj, 0);
 5596                                 if (bp == NULL)
 5597                                         break;  /* First block not present */
 5598                                 error = BUF_LOCK(bp,
 5599                                                  LK_EXCLUSIVE |
 5600                                                  LK_SLEEPFAIL |
 5601                                                  LK_INTERLOCK,
 5602                                                  VI_MTX(vp));
 5603                                 VI_LOCK(vp);
 5604                                 if (error == ENOLCK)
 5605                                         continue;       /* Slept, retry */
 5606                                 if (error != 0)
 5607                                         break;          /* Failed */
 5608                                 if ((bp->b_flags & B_DELWRI) == 0) {
 5609                                         BUF_UNLOCK(bp);
 5610                                         break;  /* Buffer not dirty */
 5611                                 }
 5612                                 for (wk = LIST_FIRST(&bp->b_dep);
 5613                                      wk != NULL;
 5614                                      wk = LIST_NEXT(wk, wk_list))
 5615                                         if (wk->wk_type == D_MKDIR)
 5616                                                 break;
 5617                                 if (wk == NULL)
 5618                                         BUF_UNLOCK(bp); /* Dependency gone */
 5619                                 else {
 5620                                         /*
 5621                                          * D_MKDIR dependency remains,
 5622                                          * must write buffer to stable
 5623                                          * storage.
 5624                                          */
 5625                                         VI_UNLOCK(vp);
 5626                                         bremfree(bp);
 5627                                         error = bwrite(bp);
 5628                                         VI_LOCK(vp);
 5629                                 }
 5630                                 break;
 5631                         }
 5632                         VI_UNLOCK(vp);
 5633                         vput(vp);
 5634                         if (error != 0)
 5635                                 break;  /* Flushing of first block failed */
 5636                         ACQUIRE_LOCK(&lk);
 5637                         /*
 5638                          * If that cleared dependencies, go on to next.
 5639                          */
 5640                         if (dap != LIST_FIRST(diraddhdp))
 5641                                 continue;
 5642                         if (dap->da_state & MKDIR_BODY)
 5643                                 panic("flush_pagedep_deps: MKDIR_BODY");
 5644                 }
 5645                 /*
 5646                  * Flush the inode on which the directory entry depends.
 5647                  * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
 5648                  * the only remaining dependency is that the updated inode
 5649                  * count must get pushed to disk. The inode has already
 5650                  * been pushed into its inode buffer (via VOP_UPDATE) at
 5651                  * the time of the reference count change. So we need only
 5652                  * locate that buffer, ensure that there will be no rollback
 5653                  * caused by a bitmap dependency, then write the inode buffer.
 5654                  */
 5655 retry:
 5656                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
 5657                         panic("flush_pagedep_deps: lost inode");
 5658                 /*
 5659                  * If the inode still has bitmap dependencies,
 5660                  * push them to disk.
 5661                  */
 5662                 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 5663                         bp = inodedep->id_buf;
 5664                         bp = getdirtybuf(bp, &lk, MNT_WAIT);
 5665                         if (bp == NULL)
 5666                                 goto retry;
 5667                         FREE_LOCK(&lk);
 5668                         if ((error = bwrite(bp)) != 0)
 5669                                 break;
 5670                         ACQUIRE_LOCK(&lk);
 5671                         if (dap != LIST_FIRST(diraddhdp))
 5672                                 continue;
 5673                 }
 5674                 /*
 5675                  * If the inode is still sitting in a buffer waiting
 5676                  * to be written, push it to disk.
 5677                  */
 5678                 FREE_LOCK(&lk);
 5679                 if ((error = bread(ump->um_devvp,
 5680                     fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
 5681                     (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
 5682                         brelse(bp);
 5683                         break;
 5684                 }
 5685                 if ((error = bwrite(bp)) != 0)
 5686                         break;
 5687                 ACQUIRE_LOCK(&lk);
 5688                 /*
 5689                  * If we have failed to get rid of all the dependencies
 5690                  * then something is seriously wrong.
 5691                  */
 5692                 if (dap == LIST_FIRST(diraddhdp))
 5693                         panic("flush_pagedep_deps: flush failed");
 5694         }
 5695         if (error)
 5696                 ACQUIRE_LOCK(&lk);
 5697         return (error);
 5698 }
 5699 
 5700 /*
 5701  * A large burst of file addition or deletion activity can drive the
 5702  * memory load excessively high. First attempt to slow things down
 5703  * using the techniques below. If that fails, this routine requests
 5704  * the offending operations to fall back to running synchronously
 5705  * until the memory load returns to a reasonable level.
 5706  */
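      /*
       * The hard limit below is max_softdeps plus a 10% cushion, and
       * dirrem structures are capped at half of that; e.g. with
       * max_softdeps == 1000, max_softdeps_hard is 1100 and the dirrem
       * ceiling is 550.
       */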
 5707 int
 5708 softdep_slowdown(vp)
 5709         struct vnode *vp;
 5710 {
 5711         int max_softdeps_hard;
 5712 
 5713         ACQUIRE_LOCK(&lk);
 5714         max_softdeps_hard = max_softdeps * 11 / 10;
 5715         if (num_dirrem < max_softdeps_hard / 2 &&
 5716             num_inodedep < max_softdeps_hard &&
 5717             VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
 5718             num_freeblkdep < max_softdeps_hard) {
 5719                 FREE_LOCK(&lk);
 5720                 return (0);
 5721         }
 5722         if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
 5723                 softdep_speedup();
 5724         stat_sync_limit_hit += 1;
 5725         FREE_LOCK(&lk);
 5726         return (1);
 5727 }
 5728 
 5729 /*
 5730  * Called by the allocation routines when they are about to fail
 5731  * in the hope that we can free up some disk space.
 5732  * 
 5733  * First check to see if the work list has anything on it. If it has,
 5734  * clean up entries until we successfully free some space. Because this
 5735  * process holds inodes locked, we cannot handle any remove requests
 5736  * that might block on a locked inode as that could lead to deadlock.
 5737  * If the worklist yields no free space, encourage the syncer daemon
 5738  * to help us. In no event will we try for longer than tickdelay seconds.
 5739  */
 5740 int
 5741 softdep_request_cleanup(fs, vp)
 5742         struct fs *fs;
 5743         struct vnode *vp;
 5744 {
 5745         struct ufsmount *ump;
 5746         long starttime;
 5747         ufs2_daddr_t needed;
 5748         int error;
 5749 
 5750         ump = VTOI(vp)->i_ump;
 5751         mtx_assert(UFS_MTX(ump), MA_OWNED);
 5752         needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
 5753         starttime = time_second + tickdelay;
 5754         /*
 5755          * If we are being called because of a process doing a
 5756          * copy-on-write, then it is not safe to update the vnode
 5757          * as we may recurse into the copy-on-write routine.
 5758          */
 5759         if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
 5760                 UFS_UNLOCK(ump);
 5761                 error = ffs_update(vp, 1);
 5762                 UFS_LOCK(ump);
 5763                 if (error != 0)
 5764                         return (0);
 5765         }
 5766         while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
 5767                 if (time_second > starttime)
 5768                         return (0);
 5769                 UFS_UNLOCK(ump);
 5770                 ACQUIRE_LOCK(&lk);
 5771                 if (ump->softdep_on_worklist > 0 &&
 5772                     process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
 5773                         stat_worklist_push += 1;
 5774                         FREE_LOCK(&lk);
 5775                         UFS_LOCK(ump);
 5776                         continue;
 5777                 }
 5778                 request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
 5779                 FREE_LOCK(&lk);
 5780                 UFS_LOCK(ump);
 5781         }
 5782         return (1);
 5783 }
 5784 
 5785 /*
 5786  * If memory utilization has gotten too high, deliberately slow things
 5787  * down and speed up the I/O processing.
 5788  */
 5789 extern struct thread *syncertd;
 5790 static int
 5791 request_cleanup(mp, resource)
 5792         struct mount *mp;
 5793         int resource;
 5794 {
 5795         struct thread *td = curthread;
 5796         struct ufsmount *ump;
 5797 
 5798         mtx_assert(&lk, MA_OWNED);
 5799         /*
 5800          * We never hold up the filesystem syncer or buf daemon.
 5801          */
 5802         if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
 5803                 return (0);
 5804         ump = VFSTOUFS(mp);
 5805         /*
 5806          * First check to see if the work list has gotten backlogged.
 5807          * If it has, co-opt this process to help clean up two entries.
 5808          * Because this process may hold inodes locked, we cannot
 5809          * handle any remove requests that might block on a locked
 5810          * inode as that could lead to deadlock.  We set TDP_SOFTDEP
 5811          * to avoid recursively processing the worklist.
 5812          */
 5813         if (ump->softdep_on_worklist > max_softdeps / 10) {
 5814                 td->td_pflags |= TDP_SOFTDEP;
 5815                 process_worklist_item(mp, LK_NOWAIT);
 5816                 process_worklist_item(mp, LK_NOWAIT);
 5817                 td->td_pflags &= ~TDP_SOFTDEP;
 5818                 stat_worklist_push += 2;
 5819                 return(1);
 5820         }
 5821         /*
 5822          * Next, we attempt to speed up the syncer process. If that
 5823          * is successful, then we allow the process to continue.
 5824          */
 5825         if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
 5826                 return(0);
 5827         /*
 5828          * If we are resource constrained on inode dependencies, try
 5829          * flushing some dirty inodes. Otherwise, we are constrained
 5830          * by file deletions, so try accelerating flushes of directories
 5831          * with removal dependencies. We would like to do the cleanup
 5832          * here, but we probably hold an inode locked at this point and 
 5833          * that might deadlock against one that we try to clean. So,
 5834          * the best that we can do is request the syncer daemon to do
 5835          * the cleanup for us.
 5836          */
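              /*
               * The req_clear_* counters below act as a request mailbox:
               * the worklist processing code notices them, runs
               * clear_inodedeps() or clear_remove() on our behalf, and
               * wakes sleepers on proc_waiting; pause_timer() bounds the
               * wait to roughly tickdelay ticks if no wakeup arrives.
               */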
 5837         switch (resource) {
 5838 
 5839         case FLUSH_INODES:
 5840                 stat_ino_limit_push += 1;
 5841                 req_clear_inodedeps += 1;
 5842                 stat_countp = &stat_ino_limit_hit;
 5843                 break;
 5844 
 5845         case FLUSH_REMOVE:
 5846         case FLUSH_REMOVE_WAIT:
 5847                 stat_blk_limit_push += 1;
 5848                 req_clear_remove += 1;
 5849                 stat_countp = &stat_blk_limit_hit;
 5850                 break;
 5851 
 5852         default:
 5853                 panic("request_cleanup: unknown type");
 5854         }
 5855         /*
 5856          * Hopefully the syncer daemon will catch up and awaken us.
 5857          * We wait at most tickdelay before proceeding in any case.
 5858          */
 5859         proc_waiting += 1;
 5860         if (handle.callout == NULL)
 5861                 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
 5862         msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
 5863         proc_waiting -= 1;
 5864         return (1);
 5865 }
 5866 
 5867 /*
 5868  * Awaken processes pausing in request_cleanup and clear proc_waiting
 5869  * to indicate that there is no longer a timer running.
 5870  */
 5871 static void
 5872 pause_timer(arg)
 5873         void *arg;
 5874 {
 5875 
 5876         ACQUIRE_LOCK(&lk);
 5877         *stat_countp += 1;
 5878         wakeup_one(&proc_waiting);
 5879         if (proc_waiting > 0)
 5880                 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
 5881         else
 5882                 handle.callout = NULL;
 5883         FREE_LOCK(&lk);
 5884 }
 5885 
 5886 /*
 5887  * Flush out a directory with at least one removal dependency in an effort to
 5888  * reduce the number of dirrem, freefile, and freeblks dependency structures.
 5889  */
 5890 static void
 5891 clear_remove(td)
 5892         struct thread *td;
 5893 {
 5894         struct pagedep_hashhead *pagedephd;
 5895         struct pagedep *pagedep;
 5896         static int next = 0;
 5897         struct mount *mp;
 5898         struct vnode *vp;
 5899         int error, cnt;
 5900         ino_t ino;
 5901 
 5902         mtx_assert(&lk, MA_OWNED);
 5903 
 5904         for (cnt = 0; cnt < pagedep_hash; cnt++) {
 5905                 pagedephd = &pagedep_hashtbl[next++];
 5906                 if (next >= pagedep_hash)
 5907                         next = 0;
 5908                 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
 5909                         if (LIST_EMPTY(&pagedep->pd_dirremhd))
 5910                                 continue;
 5911                         mp = pagedep->pd_list.wk_mp;
 5912                         ino = pagedep->pd_ino;
 5913                         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
 5914                                 continue;
 5915                         FREE_LOCK(&lk);
 5916                         if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) {
 5917                                 softdep_error("clear_remove: vget", error);
 5918                                 vn_finished_write(mp);
 5919                                 ACQUIRE_LOCK(&lk);
 5920                                 return;
 5921                         }
 5922                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
 5923                                 softdep_error("clear_remove: fsync", error);
 5924                         VI_LOCK(vp);
 5925                         drain_output(vp);
 5926                         VI_UNLOCK(vp);
 5927                         vput(vp);
 5928                         vn_finished_write(mp);
 5929                         ACQUIRE_LOCK(&lk);
 5930                         return;
 5931                 }
 5932         }
 5933 }
 5934 
 5935 /*
 5936  * Clear out a block of dirty inodes in an effort to reduce
 5937  * the number of inodedep dependency structures.
 5938  */
 5939 static void
 5940 clear_inodedeps(td)
 5941         struct thread *td;
 5942 {
 5943         struct inodedep_hashhead *inodedephd;
 5944         struct inodedep *inodedep;
 5945         static int next = 0;
 5946         struct mount *mp;
 5947         struct vnode *vp;
 5948         struct fs *fs;
 5949         int error, cnt;
 5950         ino_t firstino, lastino, ino;
 5951 
 5952         mtx_assert(&lk, MA_OWNED);
 5953         /*
 5954          * Pick an inode dependency to be cleared, cycling round-robin
 5955          * through the hash buckets. We will then gather up all the
 5956          * inodes in its block that have dependencies and flush them out.
 5957          */
 5958         for (cnt = 0; cnt < inodedep_hash; cnt++) {
 5959                 inodedephd = &inodedep_hashtbl[next++];
 5960                 if (next >= inodedep_hash)
 5961                         next = 0;
 5962                 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
 5963                         break;
 5964         }
 5965         if (inodedep == NULL)
 5966                 return;
 5967         fs = inodedep->id_fs;
 5968         mp = inodedep->id_list.wk_mp;
 5969         /*
 5970          * Find the last inode in the block with dependencies.
 5971          */
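              /*
               * INOPB(fs) is a power of two, so the mask below rounds
               * id_ino down to the first inode of its inode block; e.g.
               * with INOPB(fs) == 64, inode 200 gives firstino 192 and
               * lastino at most 255.
               */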
 5972         firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
 5973         for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
 5974                 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
 5975                         break;
 5976         /*
 5977          * Asynchronously push all but the last inode with dependencies.
 5978          * Synchronously push the last inode with dependencies to ensure
 5979          * that the inode block gets written to free up the inodedeps.
 5980          */
 5981         for (ino = firstino; ino <= lastino; ino++) {
 5982                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 5983                         continue;
 5984                 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
 5985                         continue;
 5986                 FREE_LOCK(&lk);
 5987                 if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
 5988                         softdep_error("clear_inodedeps: vget", error);
 5989                         vn_finished_write(mp);
 5990                         ACQUIRE_LOCK(&lk);
 5991                         return;
 5992                 }
 5993                 if (ino == lastino) {
 5994                         if ((error = ffs_syncvnode(vp, MNT_WAIT)))
 5995                                 softdep_error("clear_inodedeps: fsync1", error);
 5996                 } else {
 5997                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
 5998                                 softdep_error("clear_inodedeps: fsync2", error);
 5999                         VI_LOCK(vp);
 6000                         drain_output(vp);
 6001                         VI_UNLOCK(vp);
 6002                 }
 6003                 vput(vp);
 6004                 vn_finished_write(mp);
 6005                 ACQUIRE_LOCK(&lk);
 6006         }
 6007 }
 6008 
 6009 /*
 6010  * Function to determine if the buffer has outstanding dependencies
 6011  * that will cause a roll-back if the buffer is written. If wantcount
 6012  * is set, return number of dependencies, otherwise just yes or no.
 6013  */
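      /*
       * This routine is reached through the buf_countdeps() hook, which
       * the buffer-flushing code uses to prefer writing buffers whose
       * contents will not be rolled back at I/O time.
       */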
 6014 static int
 6015 softdep_count_dependencies(bp, wantcount)
 6016         struct buf *bp;
 6017         int wantcount;
 6018 {
 6019         struct worklist *wk;
 6020         struct inodedep *inodedep;
 6021         struct indirdep *indirdep;
 6022         struct allocindir *aip;
 6023         struct pagedep *pagedep;
 6024         struct diradd *dap;
 6025         int i, retval;
 6026 
 6027         retval = 0;
 6028         ACQUIRE_LOCK(&lk);
 6029         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 6030                 switch (wk->wk_type) {
 6031 
 6032                 case D_INODEDEP:
 6033                         inodedep = WK_INODEDEP(wk);
 6034                         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 6035                                 /* bitmap allocation dependency */
 6036                                 retval += 1;
 6037                                 if (!wantcount)
 6038                                         goto out;
 6039                         }
 6040                         if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
 6041                                 /* direct block pointer dependency */
 6042                                 retval += 1;
 6043                                 if (!wantcount)
 6044                                         goto out;
 6045                         }
 6046                         if (TAILQ_FIRST(&inodedep->id_extupdt)) {
 6047                                 /* direct block pointer dependency */
 6048                                 retval += 1;
 6049                                 if (!wantcount)
 6050                                         goto out;
 6051                         }
 6052                         continue;
 6053 
 6054                 case D_INDIRDEP:
 6055                         indirdep = WK_INDIRDEP(wk);
 6056 
 6057                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
 6058                                 /* indirect block pointer dependency */
 6059                                 retval += 1;
 6060                                 if (!wantcount)
 6061                                         goto out;
 6062                         }
 6063                         continue;
 6064 
 6065                 case D_PAGEDEP:
 6066                         pagedep = WK_PAGEDEP(wk);
 6067                         for (i = 0; i < DAHASHSZ; i++) {
 6068 
 6069                                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
 6070                                         /* directory entry dependency */
 6071                                         retval += 1;
 6072                                         if (!wantcount)
 6073                                                 goto out;
 6074                                 }
 6075                         }
 6076                         continue;
 6077 
 6078                 case D_BMSAFEMAP:
 6079                 case D_ALLOCDIRECT:
 6080                 case D_ALLOCINDIR:
 6081                 case D_MKDIR:
 6082                         /* never a dependency on these blocks */
 6083                         continue;
 6084 
 6085                 default:
 6086                         panic("softdep_count_dependencies: Unexpected type %s",
 6087                             TYPENAME(wk->wk_type));
 6088                         /* NOTREACHED */
 6089                 }
 6090         }
 6091 out:
 6092         FREE_LOCK(&lk);
 6093         return (retval);
 6094 }
 6095 
 6096 /*
 6097  * Acquire exclusive access to a buffer.
 6098  * Must be called with a locked mtx parameter.
 6099  * Return acquired buffer or NULL on failure.
 6100  */
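      /*
       * A NULL return is deliberately ambiguous: the buffer may be
       * clean, undergoing a background write, or lost to a lock race,
       * and in the MNT_WAIT cases the mtx argument may have been
       * dropped and retaken along the way. Callers treat NULL as
       * "re-evaluate and rescan" rather than as a hard failure.
       */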
 6101 static struct buf *
 6102 getdirtybuf(bp, mtx, waitfor)
 6103         struct buf *bp;
 6104         struct mtx *mtx;
 6105         int waitfor;
 6106 {
 6107         int error;
 6108 
 6109         mtx_assert(mtx, MA_OWNED);
 6110         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
 6111                 if (waitfor != MNT_WAIT)
 6112                         return (NULL);
 6113                 error = BUF_LOCK(bp,
 6114                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
 6115                 /*
 6116                  * Even if we successfully acquire bp here, we have dropped
 6117                  * mtx, which may violate our guarantee.
 6118                  */
 6119                 if (error == 0)
 6120                         BUF_UNLOCK(bp);
 6121                 else if (error != ENOLCK)
 6122                         panic("getdirtybuf: inconsistent lock: %d", error);
 6123                 mtx_lock(mtx);
 6124                 return (NULL);
 6125         }
 6126         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 6127                 if (mtx == &lk && waitfor == MNT_WAIT) {
 6128                         mtx_unlock(mtx);
 6129                         BO_LOCK(bp->b_bufobj);
 6130                         BUF_UNLOCK(bp);
 6131                         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 6132                                 bp->b_vflags |= BV_BKGRDWAIT;
 6133                                 msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
 6134                                        PRIBIO | PDROP, "getbuf", 0);
 6135                         } else
 6136                                 BO_UNLOCK(bp->b_bufobj);
 6137                         mtx_lock(mtx);
 6138                         return (NULL);
 6139                 }
 6140                 BUF_UNLOCK(bp);
 6141                 if (waitfor != MNT_WAIT)
 6142                         return (NULL);
 6143                 /*
 6144                  * The mtx argument must be bp->b_vp's mutex in
 6145                  * this case.
 6146                  */
 6147 #ifdef  DEBUG_VFS_LOCKS
 6148                 if (bp->b_vp->v_type != VCHR)
 6149                         ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
 6150 #endif
 6151                 bp->b_vflags |= BV_BKGRDWAIT;
 6152                 msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
 6153                 return (NULL);
 6154         }
 6155         if ((bp->b_flags & B_DELWRI) == 0) {
 6156                 BUF_UNLOCK(bp);
 6157                 return (NULL);
 6158         }
 6159         bremfree(bp);
 6160         return (bp);
 6161 }
 6162 
 6163 
 6164 /*
 6165  * Check if it is safe to suspend the file system now.  On entry,
 6166  * the vnode interlock for devvp should be held.  Return 0 with
 6167  * the mount interlock held if the file system can be suspended now,
 6168  * otherwise return EAGAIN with the mount interlock held.
 6169  */
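      /*
       * The loop below takes lk and then the mount interlock, keeping
       * devvp's interlock held across the function; each trylock
       * failure backs everything out, blocks once on the contended
       * lock so its holder can finish, and then retries from scratch.
       */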
 6170 int
 6171 softdep_check_suspend(struct mount *mp,
 6172                       struct vnode *devvp,
 6173                       int softdep_deps,
 6174                       int softdep_accdeps,
 6175                       int secondary_writes,
 6176                       int secondary_accwrites)
 6177 {
 6178         struct bufobj *bo;
 6179         struct ufsmount *ump;
 6180         int error;
 6181 
 6182         ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
 6183         ump = VFSTOUFS(mp);
 6184         bo = &devvp->v_bufobj;
 6185 
 6186         for (;;) {
 6187                 if (!TRY_ACQUIRE_LOCK(&lk)) {
 6188                         VI_UNLOCK(devvp);
 6189                         ACQUIRE_LOCK(&lk);
 6190                         FREE_LOCK(&lk);
 6191                         VI_LOCK(devvp);
 6192                         continue;
 6193                 }
 6194                 if (!MNT_ITRYLOCK(mp)) {
 6195                         FREE_LOCK(&lk);
 6196                         VI_UNLOCK(devvp);
 6197                         MNT_ILOCK(mp);
 6198                         MNT_IUNLOCK(mp);
 6199                         VI_LOCK(devvp);
 6200                         continue;
 6201                 }
 6202                 if (mp->mnt_secondary_writes != 0) {
 6203                         FREE_LOCK(&lk);
 6204                         VI_UNLOCK(devvp);
 6205                         msleep(&mp->mnt_secondary_writes,
 6206                                MNT_MTX(mp),
 6207                                (PUSER - 1) | PDROP, "secwr", 0);
 6208                         VI_LOCK(devvp);
 6209                         continue;
 6210                 }
 6211                 break;
 6212         }
 6213 
 6214         /*
 6215          * Reasons for needing more work before suspend:
 6216          * - Dirty buffers on devvp.
 6217          * - Softdep activity occurred after start of vnode sync loop
 6218          * - Secondary writes occurred after start of vnode sync loop
 6219          */
 6220         error = 0;
 6221         if (bo->bo_numoutput > 0 ||
 6222             bo->bo_dirty.bv_cnt > 0 ||
 6223             softdep_deps != 0 ||
 6224             ump->softdep_deps != 0 ||
 6225             softdep_accdeps != ump->softdep_accdeps ||
 6226             secondary_writes != 0 ||
 6227             mp->mnt_secondary_writes != 0 ||
 6228             secondary_accwrites != mp->mnt_secondary_accwrites)
 6229                 error = EAGAIN;
 6230         FREE_LOCK(&lk);
 6231         VI_UNLOCK(devvp);
 6232         return (error);
 6233 }
 6234 
 6235 
 6236 /*
 6237  * Get the number of dependency structures for the file system, both
 6238  * the current number and the total number allocated.  These will
 6239  * later be used to detect that softdep processing has occurred.
 6240  */
 6241 void
 6242 softdep_get_depcounts(struct mount *mp,
 6243                       int *softdep_depsp,
 6244                       int *softdep_accdepsp)
 6245 {
 6246         struct ufsmount *ump;
 6247 
 6248         ump = VFSTOUFS(mp);
 6249         ACQUIRE_LOCK(&lk);
 6250         *softdep_depsp = ump->softdep_deps;
 6251         *softdep_accdepsp = ump->softdep_accdeps;
 6252         FREE_LOCK(&lk);
 6253 }
 6254 
 6255 /*
 6256  * Wait for pending output on a vnode to complete.
 6257  * Must be called with vnode lock and interlock locked.
 6258  *
 6259  * XXX: Should just be a call to bufobj_wwait().
 6260  */
 6261 static void
 6262 drain_output(vp)
 6263         struct vnode *vp;
 6264 {
 6265         ASSERT_VOP_LOCKED(vp, "drain_output");
 6266         ASSERT_VI_LOCKED(vp, "drain_output");
 6267 
 6268         while (vp->v_bufobj.bo_numoutput) {
 6269                 vp->v_bufobj.bo_flag |= BO_WWAIT;
 6270                 msleep((caddr_t)&vp->v_bufobj.bo_numoutput,
 6271                     VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
 6272         }
 6273 }
 6274 
 6275 /*
 6276  * Called whenever a buffer that is being invalidated or reallocated
 6277  * contains dependencies. This should only happen if an I/O error has
 6278  * occurred. The routine is called with the buffer locked.
 6279  */ 
 6280 static void
 6281 softdep_deallocate_dependencies(bp)
 6282         struct buf *bp;
 6283 {
 6284 
 6285         if ((bp->b_ioflags & BIO_ERROR) == 0)
 6286                 panic("softdep_deallocate_dependencies: dangling deps");
 6287         softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
 6288         panic("softdep_deallocate_dependencies: unrecovered I/O error");
 6289 }
 6290 
 6291 /*
 6292  * Function to handle asynchronous write errors in the filesystem.
 6293  */
 6294 static void
 6295 softdep_error(func, error)
 6296         char *func;
 6297         int error;
 6298 {
 6299 
 6300         /* XXX should do something better! */
 6301         printf("%s: got error %d while accessing filesystem\n", func, error);
 6302 }
 6303 
 6304 #endif /* SOFTUPDATES */
