FreeBSD/Linux Kernel Cross Reference
sys/ufs/ffs/ffs_softdep.c

    1 /*-
    2  * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
    3  *
    4  * The soft updates code is derived from the appendix of a University
    5  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
    6  * "Soft Updates: A Solution to the Metadata Update Problem in File
    7  * Systems", CSE-TR-254-95, August 1995).
    8  *
    9  * Further information about soft updates can be obtained from:
   10  *
   11  *      Marshall Kirk McKusick          http://www.mckusick.com/softdep/
   12  *      1614 Oxford Street              mckusick@mckusick.com
   13  *      Berkeley, CA 94709-1608         +1-510-843-9542
   14  *      USA
   15  *
   16  * Redistribution and use in source and binary forms, with or without
   17  * modification, are permitted provided that the following conditions
   18  * are met:
   19  *
   20  * 1. Redistributions of source code must retain the above copyright
   21  *    notice, this list of conditions and the following disclaimer.
   22  * 2. Redistributions in binary form must reproduce the above copyright
   23  *    notice, this list of conditions and the following disclaimer in the
   24  *    documentation and/or other materials provided with the distribution.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
   27  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   28  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   29  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
   30  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD$");
   43 
   44 /*
   45  * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
   46  */
   47 #ifndef DIAGNOSTIC
   48 #define DIAGNOSTIC
   49 #endif
   50 #ifndef DEBUG
   51 #define DEBUG
   52 #endif
   53 
   54 #include <sys/param.h>
   55 #include <sys/kernel.h>
   56 #include <sys/systm.h>
   57 #include <sys/bio.h>
   58 #include <sys/buf.h>
   59 #include <sys/kdb.h>
   60 #include <sys/kthread.h>
   61 #include <sys/lock.h>
   62 #include <sys/malloc.h>
   63 #include <sys/mount.h>
   64 #include <sys/mutex.h>
   65 #include <sys/proc.h>
   66 #include <sys/stat.h>
   67 #include <sys/sysctl.h>
   68 #include <sys/syslog.h>
   69 #include <sys/vnode.h>
   70 #include <sys/conf.h>
   71 #include <ufs/ufs/dir.h>
   72 #include <ufs/ufs/extattr.h>
   73 #include <ufs/ufs/quota.h>
   74 #include <ufs/ufs/inode.h>
   75 #include <ufs/ufs/ufsmount.h>
   76 #include <ufs/ffs/fs.h>
   77 #include <ufs/ffs/softdep.h>
   78 #include <ufs/ffs/ffs_extern.h>
   79 #include <ufs/ufs/ufs_extern.h>
   80 
   81 #include <vm/vm.h>
   82 
   83 #include "opt_ffs.h"
   84 #include "opt_quota.h"
   85 
   86 #ifndef SOFTUPDATES
   87 
   88 int
   89 softdep_flushfiles(oldmnt, flags, td)
   90         struct mount *oldmnt;
   91         int flags;
   92         struct thread *td;
   93 {
   94 
   95         panic("softdep_flushfiles called");
   96 }
   97 
   98 int
   99 softdep_mount(devvp, mp, fs, cred)
  100         struct vnode *devvp;
  101         struct mount *mp;
  102         struct fs *fs;
  103         struct ucred *cred;
  104 {
  105 
  106         return (0);
  107 }
  108 
  109 void 
  110 softdep_initialize()
  111 {
  112 
  113         return;
  114 }
  115 
  116 void
  117 softdep_uninitialize()
  118 {
  119 
  120         return;
  121 }
  122 
  123 void
  124 softdep_setup_inomapdep(bp, ip, newinum)
  125         struct buf *bp;
  126         struct inode *ip;
  127         ino_t newinum;
  128 {
  129 
  130         panic("softdep_setup_inomapdep called");
  131 }
  132 
  133 void
  134 softdep_setup_blkmapdep(bp, mp, newblkno)
  135         struct buf *bp;
  136         struct mount *mp;
  137         ufs2_daddr_t newblkno;
  138 {
  139 
  140         panic("softdep_setup_blkmapdep called");
  141 }
  142 
  143 void 
  144 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
  145         struct inode *ip;
  146         ufs_lbn_t lbn;
  147         ufs2_daddr_t newblkno;
  148         ufs2_daddr_t oldblkno;
  149         long newsize;
  150         long oldsize;
  151         struct buf *bp;
  152 {
  153         
  154         panic("softdep_setup_allocdirect called");
  155 }
  156 
  157 void 
  158 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
  159         struct inode *ip;
  160         ufs_lbn_t lbn;
  161         ufs2_daddr_t newblkno;
  162         ufs2_daddr_t oldblkno;
  163         long newsize;
  164         long oldsize;
  165         struct buf *bp;
  166 {
  167         
  168         panic("softdep_setup_allocext called");
  169 }
  170 
  171 void
  172 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
  173         struct inode *ip;
  174         ufs_lbn_t lbn;
  175         struct buf *bp;
  176         int ptrno;
  177         ufs2_daddr_t newblkno;
  178         ufs2_daddr_t oldblkno;
  179         struct buf *nbp;
  180 {
  181 
  182         panic("softdep_setup_allocindir_page called");
  183 }
  184 
  185 void
  186 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
  187         struct buf *nbp;
  188         struct inode *ip;
  189         struct buf *bp;
  190         int ptrno;
  191         ufs2_daddr_t newblkno;
  192 {
  193 
  194         panic("softdep_setup_allocindir_meta called");
  195 }
  196 
  197 void
  198 softdep_setup_freeblocks(ip, length, flags)
  199         struct inode *ip;
  200         off_t length;
  201         int flags;
  202 {
  203         
  204         panic("softdep_setup_freeblocks called");
  205 }
  206 
  207 void
  208 softdep_freefile(pvp, ino, mode)
  209                 struct vnode *pvp;
  210                 ino_t ino;
  211                 int mode;
  212 {
  213 
  214         panic("softdep_freefile called");
  215 }
  216 
  217 int 
  218 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
  219         struct buf *bp;
  220         struct inode *dp;
  221         off_t diroffset;
  222         ino_t newinum;
  223         struct buf *newdirbp;
  224         int isnewblk;
  225 {
  226 
  227         panic("softdep_setup_directory_add called");
  228 }
  229 
  230 void 
  231 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
  232         struct inode *dp;
  233         caddr_t base;
  234         caddr_t oldloc;
  235         caddr_t newloc;
  236         int entrysize;
  237 {
  238 
  239         panic("softdep_change_directoryentry_offset called");
  240 }
  241 
  242 void 
  243 softdep_setup_remove(bp, dp, ip, isrmdir)
  244         struct buf *bp;
  245         struct inode *dp;
  246         struct inode *ip;
  247         int isrmdir;
  248 {
  249         
  250         panic("softdep_setup_remove called");
  251 }
  252 
  253 void 
  254 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
  255         struct buf *bp;
  256         struct inode *dp;
  257         struct inode *ip;
  258         ino_t newinum;
  259         int isrmdir;
  260 {
  261 
  262         panic("softdep_setup_directory_change called");
  263 }
  264 
  265 void
  266 softdep_change_linkcnt(ip)
  267         struct inode *ip;
  268 {
  269 
  270         panic("softdep_change_linkcnt called");
  271 }
  272 
  273 void 
  274 softdep_load_inodeblock(ip)
  275         struct inode *ip;
  276 {
  277 
  278         panic("softdep_load_inodeblock called");
  279 }
  280 
  281 void 
  282 softdep_update_inodeblock(ip, bp, waitfor)
  283         struct inode *ip;
  284         struct buf *bp;
  285         int waitfor;
  286 {
  287 
  288         panic("softdep_update_inodeblock called");
  289 }
  290 
  291 int
  292 softdep_fsync(vp)
  293         struct vnode *vp;       /* the "in_core" copy of the inode */
  294 {
  295 
  296         return (0);
  297 }
  298 
  299 void
  300 softdep_fsync_mountdev(vp)
  301         struct vnode *vp;
  302 {
  303 
  304         return;
  305 }
  306 
  307 int
  308 softdep_flushworklist(oldmnt, countp, td)
  309         struct mount *oldmnt;
  310         int *countp;
  311         struct thread *td;
  312 {
  313 
  314         *countp = 0;
  315         return (0);
  316 }
  317 
  318 int
  319 softdep_sync_metadata(struct vnode *vp)
  320 {
  321 
  322         return (0);
  323 }
  324 
  325 int
  326 softdep_slowdown(vp)
  327         struct vnode *vp;
  328 {
  329 
  330         panic("softdep_slowdown called");
  331 }
  332 
  333 void
  334 softdep_releasefile(ip)
  335         struct inode *ip;       /* inode with the zero effective link count */
  336 {
  337 
  338         panic("softdep_releasefile called");
  339 }
  340 
  341 int
  342 softdep_request_cleanup(fs, vp)
  343         struct fs *fs;
  344         struct vnode *vp;
  345 {
  346 
  347         return (0);
  348 }
  349 
  350 int
  351 softdep_check_suspend(struct mount *mp,
  352                       struct vnode *devvp,
  353                       int softdep_deps,
  354                       int softdep_accdeps,
  355                       int secondary_writes,
  356                       int secondary_accwrites)
  357 {
  358         struct bufobj *bo;
  359         int error;
  360         
  361         (void) softdep_deps;
  362         (void) softdep_accdeps;
  363 
  364         ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
  365         bo = &devvp->v_bufobj;
  366 
  367         for (;;) {
  368                 if (!MNT_ITRYLOCK(mp)) {
  369                         VI_UNLOCK(devvp);
  370                         MNT_ILOCK(mp);
  371                         MNT_IUNLOCK(mp);
  372                         VI_LOCK(devvp);
  373                         continue;
  374                 }
  375                 if (mp->mnt_secondary_writes != 0) {
  376                         VI_UNLOCK(devvp);
  377                         msleep(&mp->mnt_secondary_writes,
  378                                MNT_MTX(mp),
  379                                (PUSER - 1) | PDROP, "secwr", 0);
  380                         VI_LOCK(devvp);
  381                         continue;
  382                 }
  383                 break;
  384         }
  385 
  386         /*
  387          * Reasons for needing more work before suspend:
  388          * - Dirty buffers on devvp.
  389          * - Secondary writes occurred after start of vnode sync loop
  390          */
  391         error = 0;
  392         if (bo->bo_numoutput > 0 ||
  393             bo->bo_dirty.bv_cnt > 0 ||
  394             secondary_writes != 0 ||
  395             mp->mnt_secondary_writes != 0 ||
  396             secondary_accwrites != mp->mnt_secondary_accwrites)
  397                 error = EAGAIN;
  398         VI_UNLOCK(devvp);
  399         return (error);
  400 }
  401 
  402 void
  403 softdep_get_depcounts(struct mount *mp,
  404                       int *softdepactivep,
  405                       int *softdepactiveaccp)
  406 {
  407         (void) mp;
  408         *softdepactivep = 0;
  409         *softdepactiveaccp = 0;
  410 }
  411 
  412 #else
  413 /*
  414  * These definitions need to be adapted to the system to which
  415  * this file is being ported.
  416  */
  417 /*
  418  * malloc types defined for the softdep system.
  419  */
  420 static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
  421 static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
  422 static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
  423 static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
  424 static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
  425 static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
  426 static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
  427 static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
  428 static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
  429 static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
  430 static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
  431 static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
  432 static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
  433 static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
  434 static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
  435 
  436 #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
  437 
  438 #define D_PAGEDEP       0
  439 #define D_INODEDEP      1
  440 #define D_NEWBLK        2
  441 #define D_BMSAFEMAP     3
  442 #define D_ALLOCDIRECT   4
  443 #define D_INDIRDEP      5
  444 #define D_ALLOCINDIR    6
  445 #define D_FREEFRAG      7
  446 #define D_FREEBLKS      8
  447 #define D_FREEFILE      9
  448 #define D_DIRADD        10
  449 #define D_MKDIR         11
  450 #define D_DIRREM        12
  451 #define D_NEWDIRBLK     13
  452 #define D_LAST          D_NEWDIRBLK
  453 
  454 /* 
  455  * translate from workitem type to memory type
  456  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
  457  */
  458 static struct malloc_type *memtype[] = {
  459         M_PAGEDEP,
  460         M_INODEDEP,
  461         M_NEWBLK,
  462         M_BMSAFEMAP,
  463         M_ALLOCDIRECT,
  464         M_INDIRDEP,
  465         M_ALLOCINDIR,
  466         M_FREEFRAG,
  467         M_FREEBLKS,
  468         M_FREEFILE,
  469         M_DIRADD,
  470         M_MKDIR,
  471         M_DIRREM,
  472         M_NEWDIRBLK
  473 };
  474 
  475 #define DtoM(type) (memtype[type])
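/*
 * Illustrative check (compiled out): memtype[] above must track the D_*
 * constants so that DtoM(D_XXX) == M_XXX. A minimal sketch of asserting
 * the table-size half of that invariant at compile time, assuming the
 * CTASSERT macro from <sys/systm.h>:
 */
#if 0
CTASSERT(sizeof(memtype) / sizeof(memtype[0]) == D_LAST + 1);
#endif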
  476 
  477 /*
  478  * Names of malloc types.
  479  */
  480 #define TYPENAME(type)  \
  481         ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
  482 /*
  483  * End system adaptation definitions.
  484  */
  485 
  486 /*
  487  * Forward declarations.
  488  */
  489 struct inodedep_hashhead;
  490 struct newblk_hashhead;
  491 struct pagedep_hashhead;
  492 
  493 /*
  494  * Internal function prototypes.
  495  */
  496 static  void softdep_error(char *, int);
  497 static  void drain_output(struct vnode *);
  498 static  struct buf *getdirtybuf(struct buf *, struct mtx *, int);
  499 static  void clear_remove(struct thread *);
  500 static  void clear_inodedeps(struct thread *);
  501 static  int flush_pagedep_deps(struct vnode *, struct mount *,
  502             struct diraddhd *);
  503 static  int flush_inodedep_deps(struct mount *, ino_t);
  504 static  int flush_deplist(struct allocdirectlst *, int, int *);
  505 static  int handle_written_filepage(struct pagedep *, struct buf *);
  506 static  void diradd_inode_written(struct diradd *, struct inodedep *);
  507 static  int handle_written_inodeblock(struct inodedep *, struct buf *);
  508 static  void handle_allocdirect_partdone(struct allocdirect *);
  509 static  void handle_allocindir_partdone(struct allocindir *);
  510 static  void initiate_write_filepage(struct pagedep *, struct buf *);
  511 static  void handle_written_mkdir(struct mkdir *, int);
  512 static  void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
  513 static  void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
  514 static  void handle_workitem_freefile(struct freefile *);
  515 static  void handle_workitem_remove(struct dirrem *, struct vnode *);
  516 static  struct dirrem *newdirrem(struct buf *, struct inode *,
  517             struct inode *, int, struct dirrem **);
  518 static  void free_diradd(struct diradd *);
  519 static  void free_allocindir(struct allocindir *, struct inodedep *);
  520 static  void free_newdirblk(struct newdirblk *);
  521 static  int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
  522             ufs2_daddr_t *);
  523 static  void deallocate_dependencies(struct buf *, struct inodedep *);
  524 static  void free_allocdirect(struct allocdirectlst *,
  525             struct allocdirect *, int);
  526 static  int check_inode_unwritten(struct inodedep *);
  527 static  int free_inodedep(struct inodedep *);
  528 static  void handle_workitem_freeblocks(struct freeblks *, int);
  529 static  void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
  530 static  void setup_allocindir_phase2(struct buf *, struct inode *,
  531             struct allocindir *);
  532 static  struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
  533             ufs2_daddr_t);
  534 static  void handle_workitem_freefrag(struct freefrag *);
  535 static  struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
  536 static  void allocdirect_merge(struct allocdirectlst *,
  537             struct allocdirect *, struct allocdirect *);
  538 static  struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
  539 static  int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
  540             struct newblk **);
  541 static  int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
  542 static  int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
  543             struct inodedep **);
  544 static  int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
  545 static  int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
  546 static  int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
  547             struct mount *, int, struct pagedep **);
  548 static  void pause_timer(void *);
  549 static  int request_cleanup(struct mount *, int);
  550 static  int process_worklist_item(struct mount *, int);
  551 static  void add_to_worklist(struct worklist *);
  552 static  void softdep_flush(void);
  553 static  int softdep_speedup(void);
  554 
  555 /*
  556  * Exported softdep operations.
  557  */
  558 static  void softdep_disk_io_initiation(struct buf *);
  559 static  void softdep_disk_write_complete(struct buf *);
  560 static  void softdep_deallocate_dependencies(struct buf *);
  561 static  int softdep_count_dependencies(struct buf *bp, int);
  562 
  563 static struct mtx lk;
  564 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
  565 
  566 #define TRY_ACQUIRE_LOCK(lk)            mtx_trylock(lk)
  567 #define ACQUIRE_LOCK(lk)                mtx_lock(lk)
  568 #define FREE_LOCK(lk)                   mtx_unlock(lk)
  569 
  570 /*
  571  * Worklist queue management.
  572  * These routines require that the lock be held.
  573  */
  574 #ifndef /* NOT */ DEBUG
  575 #define WORKLIST_INSERT(head, item) do {        \
  576         (item)->wk_state |= ONWORKLIST;         \
  577         LIST_INSERT_HEAD(head, item, wk_list);  \
  578 } while (0)
  579 #define WORKLIST_REMOVE(item) do {              \
  580         (item)->wk_state &= ~ONWORKLIST;        \
  581         LIST_REMOVE(item, wk_list);             \
  582 } while (0)
  583 #else /* DEBUG */
  584 static  void worklist_insert(struct workhead *, struct worklist *);
  585 static  void worklist_remove(struct worklist *);
  586 
  587 #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
  588 #define WORKLIST_REMOVE(item) worklist_remove(item)
  589 
  590 static void
  591 worklist_insert(head, item)
  592         struct workhead *head;
  593         struct worklist *item;
  594 {
  595 
  596         mtx_assert(&lk, MA_OWNED);
  597         if (item->wk_state & ONWORKLIST)
  598                 panic("worklist_insert: already on list");
  599         item->wk_state |= ONWORKLIST;
  600         LIST_INSERT_HEAD(head, item, wk_list);
  601 }
  602 
  603 static void
  604 worklist_remove(item)
  605         struct worklist *item;
  606 {
  607 
  608         mtx_assert(&lk, MA_OWNED);
  609         if ((item->wk_state & ONWORKLIST) == 0)
  610                 panic("worklist_remove: not on list");
  611         item->wk_state &= ~ONWORKLIST;
  612         LIST_REMOVE(item, wk_list);
  613 }
  614 #endif /* DEBUG */
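/*
 * Usage sketch (compiled out): both variants of the macros above assume
 * the softdep lock is held, so a typical caller brackets them as below
 * (hypothetical names, for exposition only):
 */
#if 0
static void
worklist_usage_example(struct workhead *head, struct worklist *item)
{

        ACQUIRE_LOCK(&lk);
        WORKLIST_INSERT(head, item);
        /* ... the item is consumed by a later worklist scan ... */
        WORKLIST_REMOVE(item);
        FREE_LOCK(&lk);
}
#endif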
  615 
  616 /*
  617  * Routines for tracking and managing workitems.
  618  */
  619 static  void workitem_free(struct worklist *, int);
  620 static  void workitem_alloc(struct worklist *, int, struct mount *);
  621 
  622 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
  623 
  624 static void
  625 workitem_free(item, type)
  626         struct worklist *item;
  627         int type;
  628 {
  629         struct ufsmount *ump;
  630         mtx_assert(&lk, MA_OWNED);
  631 
  632 #ifdef DEBUG
  633         if (item->wk_state & ONWORKLIST)
  634                 panic("workitem_free: still on list");
  635         if (item->wk_type != type)
  636                 panic("workitem_free: type mismatch");
  637 #endif
  638         ump = VFSTOUFS(item->wk_mp);
  639         if (--ump->softdep_deps == 0 && ump->softdep_req)
  640                 wakeup(&ump->softdep_deps);
  641         FREE(item, DtoM(type));
  642 }
  643 
  644 static void
  645 workitem_alloc(item, type, mp)
  646         struct worklist *item;
  647         int type;
  648         struct mount *mp;
  649 {
  650         item->wk_type = type;
  651         item->wk_mp = mp;
  652         item->wk_state = 0;
  653         ACQUIRE_LOCK(&lk);
  654         VFSTOUFS(mp)->softdep_deps++;
  655         VFSTOUFS(mp)->softdep_accdeps++;
  656         FREE_LOCK(&lk);
  657 }
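/*
 * Lifecycle sketch (compiled out): each dependency structure embeds a
 * worklist, is charged against its mount by workitem_alloc(), and is
 * released by WORKITEM_FREE() with the softdep lock held. A hypothetical
 * helper showing the shape that the real lookup routines below follow:
 */
#if 0
static void
workitem_lifecycle_example(struct mount *mp)
{
        struct pagedep *pagedep;

        MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
            M_PAGEDEP, M_SOFTDEP_FLAGS | M_ZERO);
        workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
        /* ... hashed, linked onto buffers, and processed ... */
        ACQUIRE_LOCK(&lk);
        WORKITEM_FREE(pagedep, D_PAGEDEP);
        FREE_LOCK(&lk);
}
#endif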
  658 
  659 /*
  660  * Workitem queue management
  661  */
  662 static int max_softdeps;        /* maximum number of structs before slowdown */
  663 static int maxindirdeps = 50;   /* max number of indirdeps before slowdown */
  664 static int tickdelay = 2;       /* number of ticks to pause during slowdown */
  665 static int proc_waiting;        /* tracks whether we have a timeout posted */
  666 static int *stat_countp;        /* statistic to count in proc_waiting timeout */
  667 static struct callout_handle handle; /* handle on posted proc_waiting timeout */
  668 static int req_pending;
  669 static int req_clear_inodedeps; /* syncer process flushes some inodedeps */
  670 #define FLUSH_INODES            1
  671 static int req_clear_remove;    /* syncer process flushes some freeblks */
  672 #define FLUSH_REMOVE            2
  673 #define FLUSH_REMOVE_WAIT       3
  674 /*
  675  * runtime statistics
  676  */
  677 static int stat_worklist_push;  /* number of worklist cleanups */
  678 static int stat_blk_limit_push; /* number of times block limit neared */
  679 static int stat_ino_limit_push; /* number of times inode limit neared */
  680 static int stat_blk_limit_hit;  /* number of times block slowdown imposed */
  681 static int stat_ino_limit_hit;  /* number of times inode slowdown imposed */
  682 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
  683 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
  684 static int stat_inode_bitmap;   /* bufs redirtied as inode bitmap not written */
  685 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
  686 static int stat_dir_entry;      /* bufs redirtied as dir entry cannot be written */
  687 
  688 SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
  689 SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
  690 SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
  691 SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
  692 SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
  693 SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
  694 SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
  695 SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
  696 SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
  697 SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
  698 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
  699 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
  700 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
  701 /* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */
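/*
 * Tuning note: the knobs above appear under the "debug" sysctl tree, so
 * the slowdown thresholds can be inspected or adjusted from userland,
 * for example (the value shown is illustrative only):
 *
 *      # sysctl debug.max_softdeps
 *      debug.max_softdeps: 475520
 *      # sysctl debug.tickdelay=2
 */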
  702 
  703 SYSCTL_DECL(_vfs_ffs);
  704 
  705 static int compute_summary_at_mount = 0;        /* Whether to recompute the summary at mount time */
  706 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
  707            &compute_summary_at_mount, 0, "Recompute summary at mount");
  708 
  709 static struct proc *softdepproc;
  710 static struct kproc_desc softdep_kp = {
  711         "softdepflush",
  712         softdep_flush,
  713         &softdepproc
  714 };
  715 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, &softdep_kp)
  716 
  717 static void
  718 softdep_flush(void)
  719 {
  720         struct mount *nmp;
  721         struct mount *mp;
  722         struct ufsmount *ump;
  723         struct thread *td;
  724         int remaining;
  725         int vfslocked;
  726 
  727         td = curthread;
  728         td->td_pflags |= TDP_NORUNNINGBUF;
  729 
  730         for (;;) {      
  731                 kthread_suspend_check(softdepproc);
  732                 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
  733                 ACQUIRE_LOCK(&lk);
  734                 /*
  735                  * If requested, try removing inode or removal dependencies.
  736                  */
  737                 if (req_clear_inodedeps) {
  738                         clear_inodedeps(td);
  739                         req_clear_inodedeps -= 1;
  740                         wakeup_one(&proc_waiting);
  741                 }
  742                 if (req_clear_remove) {
  743                         clear_remove(td);
  744                         req_clear_remove -= 1;
  745                         wakeup_one(&proc_waiting);
  746                 }
  747                 FREE_LOCK(&lk);
  748                 VFS_UNLOCK_GIANT(vfslocked);
  749                 remaining = 0;
  750                 mtx_lock(&mountlist_mtx);
  751                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
  752                         nmp = TAILQ_NEXT(mp, mnt_list);
  753                         if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
  754                                 continue;
  755                         if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
  756                                 continue;
  757                         vfslocked = VFS_LOCK_GIANT(mp);
  758                         softdep_process_worklist(mp, 0);
  759                         ump = VFSTOUFS(mp);
  760                         remaining += ump->softdep_on_worklist -
  761                                 ump->softdep_on_worklist_inprogress;
  762                         VFS_UNLOCK_GIANT(vfslocked);
  763                         mtx_lock(&mountlist_mtx);
  764                         nmp = TAILQ_NEXT(mp, mnt_list);
  765                         vfs_unbusy(mp, td);
  766                 }
  767                 mtx_unlock(&mountlist_mtx);
  768                 if (remaining)
  769                         continue;
  770                 ACQUIRE_LOCK(&lk);
  771                 if (!req_pending)
  772                         msleep(&req_pending, &lk, PVM, "sdflush", hz);
  773                 req_pending = 0;
  774                 FREE_LOCK(&lk);
  775         }
  776 }
  777 
  778 static int
  779 softdep_speedup(void)
  780 {
  781 
  782         mtx_assert(&lk, MA_OWNED);
  783         if (req_pending == 0) {
  784                 req_pending = 1;
  785                 wakeup(&req_pending);
  786         }
  787 
  788         return (speedup_syncer());
  789 }
  790 
  791 /*
  792  * Add an item to the end of the work queue.
  793  * This routine requires that the lock be held.
  794  * This is the only routine that adds items to the list.
  795  * The following routine is the only one that removes items
  796  * and does so in order from first to last.
  797  */
  798 static void
  799 add_to_worklist(wk)
  800         struct worklist *wk;
  801 {
  802         struct ufsmount *ump;
  803 
  804         mtx_assert(&lk, MA_OWNED);
  805         ump = VFSTOUFS(wk->wk_mp);
  806         if (wk->wk_state & ONWORKLIST)
  807                 panic("add_to_worklist: already on list");
  808         wk->wk_state |= ONWORKLIST;
  809         if (LIST_EMPTY(&ump->softdep_workitem_pending))
  810                 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
  811         else
  812                 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
  813         ump->softdep_worklist_tail = wk;
  814         ump->softdep_on_worklist += 1;
  815 }
  816 
  817 /*
  818  * Process that runs once per second to handle items in the background queue.
  819  *
  820  * Note that we ensure that items are processed in the order in which they
  821  * appear in the queue. The code below depends on this property to ensure
  822  * that blocks of a file are freed before the inode itself is freed. This
  823  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
  824  * until all the old ones have been purged from the dependency lists.
  825  */
  826 int 
  827 softdep_process_worklist(mp, full)
  828         struct mount *mp;
  829         int full;
  830 {
  831         struct thread *td = curthread;
  832         int cnt, matchcnt, loopcount;
  833         struct ufsmount *ump;
  834         long starttime;
  835 
  836         KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
  837         /*
  838          * Record the process identifier of our caller so that we can give
  839          * this process preferential treatment in request_cleanup below.
  840          */
  841         matchcnt = 0;
  842         ump = VFSTOUFS(mp);
  843         ACQUIRE_LOCK(&lk);
  844         loopcount = 1;
  845         starttime = time_second;
  846         while (ump->softdep_on_worklist > 0) {
  847                 if ((cnt = process_worklist_item(mp, 0)) == -1)
  848                         break;
  849                 else
  850                         matchcnt += cnt;
  851                 /*
  852                  * If requested, try removing inode or removal dependencies.
  853                  */
  854                 if (req_clear_inodedeps) {
  855                         clear_inodedeps(td);
  856                         req_clear_inodedeps -= 1;
  857                         wakeup_one(&proc_waiting);
  858                 }
  859                 if (req_clear_remove) {
  860                         clear_remove(td);
  861                         req_clear_remove -= 1;
  862                         wakeup_one(&proc_waiting);
  863                 }
  864                 /*
  865                  * We do not generally want to stop for buffer space, but if
  866                  * we are really being a buffer hog, we will stop and wait.
  867                  */
  868                 if (loopcount++ % 128 == 0) {
  869                         FREE_LOCK(&lk);
  870                         bwillwrite();
  871                         ACQUIRE_LOCK(&lk);
  872                 }
  873                 /*
  874                  * Never allow processing to run for more than one
  875                  * second. Otherwise the other mountpoints may get
  876                  * excessively backlogged.
  877                  */
  878                 if (!full && starttime != time_second) {
  879                         matchcnt = -1;
  880                         break;
  881                 }
  882         }
  883         FREE_LOCK(&lk);
  884         return (matchcnt);
  885 }
  886 
  887 /*
  888  * Process one item on the worklist.
  889  */
  890 static int
  891 process_worklist_item(mp, flags)
  892         struct mount *mp;
  893         int flags;
  894 {
  895         struct worklist *wk, *wkend;
  896         struct ufsmount *ump;
  897         struct vnode *vp;
  898         int matchcnt = 0;
  899 
  900         mtx_assert(&lk, MA_OWNED);
  901         KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
  902         /*
  903          * If we are being called because of a process doing a
  904          * copy-on-write, then it is not safe to write as we may
  905          * recurse into the copy-on-write routine.
  906          */
  907         if (curthread->td_pflags & TDP_COWINPROGRESS)
  908                 return (-1);
  909         /*
  910          * Normally we just process each item on the worklist in order.
  911          * However, if we are in a situation where we cannot lock any
  912          * inodes, we have to skip over any dirrem requests whose
  913          * vnodes are resident and locked.
  914          */
  915         ump = VFSTOUFS(mp);
  916         vp = NULL;
  917         LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
  918                 if (wk->wk_state & INPROGRESS)
  919                         continue;
  920                 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
  921                         break;
  922                 wk->wk_state |= INPROGRESS;
  923                 ump->softdep_on_worklist_inprogress++;
  924                 FREE_LOCK(&lk);
  925                 ffs_vget(mp, WK_DIRREM(wk)->dm_oldinum,
  926                     LK_NOWAIT | LK_EXCLUSIVE, &vp);
  927                 ACQUIRE_LOCK(&lk);
  928                 wk->wk_state &= ~INPROGRESS;
  929                 ump->softdep_on_worklist_inprogress--;
  930                 if (vp != NULL)
  931                         break;
  932         }
  933         if (wk == NULL)
  934                 return (-1);
  935         /*
  936          * Remove the item to be processed. If we are removing the last
  937          * item on the list, we need to recalculate the tail pointer.
  938          * As this happens rarely and usually when the list is short,
  939          * we just run down the list to find it rather than tracking it
  940          * in the above loop.
  941          */
  942         WORKLIST_REMOVE(wk);
  943         if (wk == ump->softdep_worklist_tail) {
  944                 LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
  945                         if (LIST_NEXT(wkend, wk_list) == NULL)
  946                                 break;
  947                 ump->softdep_worklist_tail = wkend;
  948         }
  949         ump->softdep_on_worklist -= 1;
  950         FREE_LOCK(&lk);
  951         if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
  952                 panic("process_worklist_item: suspended filesystem");
  953         matchcnt++;
  954         switch (wk->wk_type) {
  955 
  956         case D_DIRREM:
  957                 /* removal of a directory entry */
  958                 handle_workitem_remove(WK_DIRREM(wk), vp);
  959                 break;
  960 
  961         case D_FREEBLKS:
  962                 /* releasing blocks and/or fragments from a file */
  963                 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
  964                 break;
  965 
  966         case D_FREEFRAG:
  967                 /* releasing a fragment when replaced as a file grows */
  968                 handle_workitem_freefrag(WK_FREEFRAG(wk));
  969                 break;
  970 
  971         case D_FREEFILE:
  972                 /* releasing an inode when its link count drops to 0 */
  973                 handle_workitem_freefile(WK_FREEFILE(wk));
  974                 break;
  975 
  976         default:
  977                 panic("%s_process_worklist: Unknown type %s",
  978                     "softdep", TYPENAME(wk->wk_type));
  979                 /* NOTREACHED */
  980         }
  981         vn_finished_secondary_write(mp);
  982         ACQUIRE_LOCK(&lk);
  983         return (matchcnt);
  984 }
  985 
  986 /*
  987  * Move dependencies from one buffer to another.
  988  */
  989 void
  990 softdep_move_dependencies(oldbp, newbp)
  991         struct buf *oldbp;
  992         struct buf *newbp;
  993 {
  994         struct worklist *wk, *wktail;
  995 
  996         if (!LIST_EMPTY(&newbp->b_dep))
  997                 panic("softdep_move_dependencies: need merge code");
  998         wktail = NULL;
  999         ACQUIRE_LOCK(&lk);
 1000         while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
 1001                 LIST_REMOVE(wk, wk_list);
 1002                 if (wktail == NULL)
 1003                         LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
 1004                 else
 1005                         LIST_INSERT_AFTER(wktail, wk, wk_list);
 1006                 wktail = wk;
 1007         }
 1008         FREE_LOCK(&lk);
 1009 }
 1010 
 1011 /*
 1012  * Purge the work list of all items associated with a particular mount point.
 1013  */
 1014 int
 1015 softdep_flushworklist(oldmnt, countp, td)
 1016         struct mount *oldmnt;
 1017         int *countp;
 1018         struct thread *td;
 1019 {
 1020         struct vnode *devvp;
 1021         int count, error = 0;
 1022         struct ufsmount *ump;
 1023 
 1024         /*
 1025          * Alternately flush the block device associated with the mount
 1026          * point and process any dependencies that the flushing
 1027          * creates. We continue until no more worklist dependencies
 1028          * are found.
 1029          */
 1030         *countp = 0;
 1031         ump = VFSTOUFS(oldmnt);
 1032         devvp = ump->um_devvp;
 1033         while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
 1034                 *countp += count;
 1035                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 1036                 error = VOP_FSYNC(devvp, MNT_WAIT, td);
 1037                 VOP_UNLOCK(devvp, 0, td);
 1038                 if (error)
 1039                         break;
 1040         }
 1041         return (error);
 1042 }
 1043 
 1044 int
 1045 softdep_waitidle(struct mount *mp)
 1046 {
 1047         struct ufsmount *ump;
 1048         int error;
 1049         int i;
 1050 
 1051         ump = VFSTOUFS(mp);
 1052         ACQUIRE_LOCK(&lk);
 1053         for (i = 0; i < 10 && ump->softdep_deps; i++) {
 1054                 ump->softdep_req = 1;
 1055                 if (ump->softdep_on_worklist)
 1056                         panic("softdep_waitidle: work added after flush.");
 1057                 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
 1058         }
 1059         ump->softdep_req = 0;
 1060         FREE_LOCK(&lk);
 1061         error = 0;
 1062         if (i == 10) {
 1063                 error = EBUSY;
 1064                 printf("softdep_waitidle: Failed to flush worklist for %p\n",
 1065                     mp);
 1066         }
 1067 
 1068         return (error);
 1069 }
 1070 
 1071 /*
 1072  * Flush all vnodes and worklist items associated with a specified mount point.
 1073  */
 1074 int
 1075 softdep_flushfiles(oldmnt, flags, td)
 1076         struct mount *oldmnt;
 1077         int flags;
 1078         struct thread *td;
 1079 {
 1080         int error, count, loopcnt;
 1081 
 1082         error = 0;
 1083 
 1084         /*
 1085          * Alternately flush the vnodes associated with the mount
 1086          * point and process any dependencies that the flushing
 1087          * creates. In theory, this loop should iterate at most twice,
 1088          * but we give it a few extra just to be sure.
 1089          */
 1090         for (loopcnt = 10; loopcnt > 0; loopcnt--) {
 1091                 /*
 1092                  * Do another flush in case any vnodes were brought in
 1093                  * as part of the cleanup operations.
 1094                  */
 1095                 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
 1096                         break;
 1097                 if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
 1098                     count == 0)
 1099                         break;
 1100         }
 1101         /*
 1102          * If we are unmounting then it is an error to fail. If we
 1103          * are simply trying to downgrade to read-only, then filesystem
 1104          * activity can keep us busy forever, so we just fail with EBUSY.
 1105          */
 1106         if (loopcnt == 0) {
 1107                 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
 1108                         panic("softdep_flushfiles: looping");
 1109                 error = EBUSY;
 1110         }
 1111         if (!error)
 1112                 error = softdep_waitidle(oldmnt);
 1113         return (error);
 1114 }
 1115 
 1116 /*
 1117  * Structure hashing.
 1118  * 
 1119  * There are three types of structures that can be looked up:
 1120  *      1) pagedep structures identified by mount point, inode number,
 1121  *         and logical block.
 1122  *      2) inodedep structures identified by mount point and inode number.
 1123  *      3) newblk structures identified by mount point and
 1124  *         physical block number.
 1125  *
 1126  * The "pagedep" and "inodedep" dependency structures are hashed
 1127  * separately from the file blocks and inodes to which they correspond.
 1128  * This separation helps when the in-memory copy of an inode or
 1129  * file block must be replaced. It also obviates the need to access
 1130  * an inode or file page when simply updating (or de-allocating)
 1131  * dependency structures. Lookup of newblk structures is needed to
 1132  * find newly allocated blocks when trying to associate them with
 1133  * their allocdirect or allocindir structure.
 1134  *
 1135  * The lookup routines optionally create and hash a new instance when
 1136  * an existing entry is not found.
 1137  */
 1138 #define DEPALLOC        0x0001  /* allocate structure if lookup fails */
 1139 #define NODELAY         0x0002  /* cannot do background work */
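/*
 * Lookup sketch (compiled out): callers hold the softdep lock and pass
 * DEPALLOC when a missing entry should be created, as in this
 * hypothetical caller of inodedep_lookup() (defined below):
 */
#if 0
static void
dep_lookup_example(struct mount *mp, ino_t inum)
{
        struct inodedep *inodedep;

        ACQUIRE_LOCK(&lk);
        if (inodedep_lookup(mp, inum, DEPALLOC, &inodedep) == 0) {
                /* The entry was newly allocated and hashed. */
        }
        FREE_LOCK(&lk);
}
#endif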
 1140 
 1141 /*
 1142  * Structures and routines associated with pagedep caching.
 1143  */
 1144 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
 1145 u_long  pagedep_hash;           /* size of hash table - 1 */
 1146 #define PAGEDEP_HASH(mp, inum, lbn) \
 1147         (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
 1148             pagedep_hash])
 1149 
 1150 static int
 1151 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
 1152         struct pagedep_hashhead *pagedephd;
 1153         ino_t ino;
 1154         ufs_lbn_t lbn;
 1155         struct mount *mp;
 1156         int flags;
 1157         struct pagedep **pagedeppp;
 1158 {
 1159         struct pagedep *pagedep;
 1160 
 1161         LIST_FOREACH(pagedep, pagedephd, pd_hash)
 1162                 if (ino == pagedep->pd_ino &&
 1163                     lbn == pagedep->pd_lbn &&
 1164                     mp == pagedep->pd_list.wk_mp)
 1165                         break;
 1166         if (pagedep) {
 1167                 *pagedeppp = pagedep;
 1168                 if ((flags & DEPALLOC) != 0 &&
 1169                     (pagedep->pd_state & ONWORKLIST) == 0)
 1170                         return (0);
 1171                 return (1);
 1172         }
 1173         *pagedeppp = NULL;
 1174         return (0);
 1175 }
 1176 /*
 1177  * Look up a pagedep. Return 1 if found. Return 0 if not found, or if
 1178  * found while asked to allocate but not yet linked onto a worklist.
 1179  * If not found, allocate if the DEPALLOC flag is passed.
 1180  * The found or allocated entry is returned in pagedeppp.
 1181  * This routine must be called with the softdep lock held.
 1182  */
 1183 static int
 1184 pagedep_lookup(ip, lbn, flags, pagedeppp)
 1185         struct inode *ip;
 1186         ufs_lbn_t lbn;
 1187         int flags;
 1188         struct pagedep **pagedeppp;
 1189 {
 1190         struct pagedep *pagedep;
 1191         struct pagedep_hashhead *pagedephd;
 1192         struct mount *mp;
 1193         int ret;
 1194         int i;
 1195 
 1196         mtx_assert(&lk, MA_OWNED);
 1197         mp = ITOV(ip)->v_mount;
 1198         pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
 1199 
 1200         ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
 1201         if (*pagedeppp || (flags & DEPALLOC) == 0)
 1202                 return (ret);
 1203         FREE_LOCK(&lk);
 1204         MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
 1205             M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
 1206         workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
 1207         ACQUIRE_LOCK(&lk);
 1208         ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
 1209         if (*pagedeppp) {
 1210                 WORKITEM_FREE(pagedep, D_PAGEDEP);
 1211                 return (ret);
 1212         }
 1213         pagedep->pd_ino = ip->i_number;
 1214         pagedep->pd_lbn = lbn;
 1215         LIST_INIT(&pagedep->pd_dirremhd);
 1216         LIST_INIT(&pagedep->pd_pendinghd);
 1217         for (i = 0; i < DAHASHSZ; i++)
 1218                 LIST_INIT(&pagedep->pd_diraddhd[i]);
 1219         LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
 1220         *pagedeppp = pagedep;
 1221         return (0);
 1222 }
 1223 
 1224 /*
 1225  * Structures and routines associated with inodedep caching.
 1226  */
 1227 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
 1228 static u_long   inodedep_hash;  /* size of hash table - 1 */
 1229 static long     num_inodedep;   /* number of inodedep allocated */
 1230 #define INODEDEP_HASH(fs, inum) \
 1231       (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
 1232 
 1233 static int
 1234 inodedep_find(inodedephd, fs, inum, inodedeppp)
 1235         struct inodedep_hashhead *inodedephd;
 1236         struct fs *fs;
 1237         ino_t inum;
 1238         struct inodedep **inodedeppp;
 1239 {
 1240         struct inodedep *inodedep;
 1241 
 1242         LIST_FOREACH(inodedep, inodedephd, id_hash)
 1243                 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
 1244                         break;
 1245         if (inodedep) {
 1246                 *inodedeppp = inodedep;
 1247                 return (1);
 1248         }
 1249         *inodedeppp = NULL;
 1250 
 1251         return (0);
 1252 }
 1253 /*
 1254  * Look up an inodedep. Return 1 if found, 0 if not found.
 1255  * If not found, allocate if DEPALLOC flag is passed.
 1256  * Found or allocated entry is returned in inodedeppp.
 1257  * This routine must be called with the softdep lock held.
 1258  */
 1259 static int
 1260 inodedep_lookup(mp, inum, flags, inodedeppp)
 1261         struct mount *mp;
 1262         ino_t inum;
 1263         int flags;
 1264         struct inodedep **inodedeppp;
 1265 {
 1266         struct inodedep *inodedep;
 1267         struct inodedep_hashhead *inodedephd;
 1268         struct fs *fs;
 1269 
 1270         mtx_assert(&lk, MA_OWNED);
 1271         fs = VFSTOUFS(mp)->um_fs;
 1272         inodedephd = INODEDEP_HASH(fs, inum);
 1273 
 1274         if (inodedep_find(inodedephd, fs, inum, inodedeppp))
 1275                 return (1);
 1276         if ((flags & DEPALLOC) == 0)
 1277                 return (0);
 1278         /*
 1279          * If we are over our limit, try to improve the situation.
 1280          */
 1281         if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
 1282                 request_cleanup(mp, FLUSH_INODES);
 1283         FREE_LOCK(&lk);
 1284         MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
 1285                 M_INODEDEP, M_SOFTDEP_FLAGS);
 1286         workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
 1287         ACQUIRE_LOCK(&lk);
 1288         if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
 1289                 WORKITEM_FREE(inodedep, D_INODEDEP);
 1290                 return (1);
 1291         }
 1292         num_inodedep += 1;
 1293         inodedep->id_fs = fs;
 1294         inodedep->id_ino = inum;
 1295         inodedep->id_state = ALLCOMPLETE;
 1296         inodedep->id_nlinkdelta = 0;
 1297         inodedep->id_savedino1 = NULL;
 1298         inodedep->id_savedsize = -1;
 1299         inodedep->id_savedextsize = -1;
 1300         inodedep->id_buf = NULL;
 1301         LIST_INIT(&inodedep->id_pendinghd);
 1302         LIST_INIT(&inodedep->id_inowait);
 1303         LIST_INIT(&inodedep->id_bufwait);
 1304         TAILQ_INIT(&inodedep->id_inoupdt);
 1305         TAILQ_INIT(&inodedep->id_newinoupdt);
 1306         TAILQ_INIT(&inodedep->id_extupdt);
 1307         TAILQ_INIT(&inodedep->id_newextupdt);
 1308         LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
 1309         *inodedeppp = inodedep;
 1310         return (0);
 1311 }
 1312 
 1313 /*
 1314  * Structures and routines associated with newblk caching.
 1315  */
 1316 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
 1317 u_long  newblk_hash;            /* size of hash table - 1 */
 1318 #define NEWBLK_HASH(fs, inum) \
 1319         (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
 1320 
 1321 static int
 1322 newblk_find(newblkhd, fs, newblkno, newblkpp)
 1323         struct newblk_hashhead *newblkhd;
 1324         struct fs *fs;
 1325         ufs2_daddr_t newblkno;
 1326         struct newblk **newblkpp;
 1327 {
 1328         struct newblk *newblk;
 1329 
 1330         LIST_FOREACH(newblk, newblkhd, nb_hash)
 1331                 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
 1332                         break;
 1333         if (newblk) {
 1334                 *newblkpp = newblk;
 1335                 return (1);
 1336         }
 1337         *newblkpp = NULL;
 1338         return (0);
 1339 }
 1340 
 1341 /*
 1342  * Look up a newblk. Return 1 if found, 0 if not found.
 1343  * If not found, allocate if DEPALLOC flag is passed.
 1344  * Found or allocated entry is returned in newblkpp.
 1345  */
 1346 static int
 1347 newblk_lookup(fs, newblkno, flags, newblkpp)
 1348         struct fs *fs;
 1349         ufs2_daddr_t newblkno;
 1350         int flags;
 1351         struct newblk **newblkpp;
 1352 {
 1353         struct newblk *newblk;
 1354         struct newblk_hashhead *newblkhd;
 1355 
 1356         newblkhd = NEWBLK_HASH(fs, newblkno);
 1357         if (newblk_find(newblkhd, fs, newblkno, newblkpp))
 1358                 return (1);
 1359         if ((flags & DEPALLOC) == 0)
 1360                 return (0);
 1361         FREE_LOCK(&lk);
 1362         MALLOC(newblk, struct newblk *, sizeof(struct newblk),
 1363                 M_NEWBLK, M_SOFTDEP_FLAGS);
 1364         ACQUIRE_LOCK(&lk);
 1365         if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
 1366                 FREE(newblk, M_NEWBLK);
 1367                 return (1);
 1368         }
 1369         newblk->nb_state = 0;
 1370         newblk->nb_fs = fs;
 1371         newblk->nb_newblkno = newblkno;
 1372         LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
 1373         *newblkpp = newblk;
 1374         return (0);
 1375 }
 1376 
 1377 /*
 1378  * Executed during filesystem system initialization before
 1379  * mounting any filesystems.
 1380  */
 1381 void 
 1382 softdep_initialize()
 1383 {
 1384 
 1385         LIST_INIT(&mkdirlisthd);
 1386         max_softdeps = desiredvnodes * 4;
 1387         pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
 1388             &pagedep_hash);
 1389         inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
 1390         newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
 1391 
 1392         /* initialise bioops hack */
 1393         bioops.io_start = softdep_disk_io_initiation;
 1394         bioops.io_complete = softdep_disk_write_complete;
 1395         bioops.io_deallocate = softdep_deallocate_dependencies;
 1396         bioops.io_countdeps = softdep_count_dependencies;
 1397 }
 1398 
 1399 /*
 1400  * Executed after all filesystems have been unmounted during
 1401  * filesystem module unload.
 1402  */
 1403 void
 1404 softdep_uninitialize()
 1405 {
 1406 
 1407         hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
 1408         hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
 1409         hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
 1410 }
 1411 
 1412 /*
 1413  * Called at mount time to notify the dependency code that a
 1414  * filesystem wishes to use it.
 1415  */
 1416 int
 1417 softdep_mount(devvp, mp, fs, cred)
 1418         struct vnode *devvp;
 1419         struct mount *mp;
 1420         struct fs *fs;
 1421         struct ucred *cred;
 1422 {
 1423         struct csum_total cstotal;
 1424         struct ufsmount *ump;
 1425         struct cg *cgp;
 1426         struct buf *bp;
 1427         int error, cyl;
 1428 
 1429         MNT_ILOCK(mp);
 1430         mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
 1431         if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
 1432                 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 
 1433                         MNTK_SOFTDEP;
 1434                 mp->mnt_noasync++;
 1435         }
 1436         MNT_IUNLOCK(mp);
 1437         ump = VFSTOUFS(mp);
 1438         LIST_INIT(&ump->softdep_workitem_pending);
 1439         ump->softdep_worklist_tail = NULL;
 1440         ump->softdep_on_worklist = 0;
 1441         ump->softdep_deps = 0;
 1442         /*
 1443          * When doing soft updates, the counters in the
 1444          * superblock may have gotten out of sync. Recomputation
 1445          * can take a long time and can be deferred for background
 1446          * fsck.  However, the old behavior of scanning the cylinder
 1447          * groups and recalculating them at mount time is available
 1448          * by setting vfs.ffs.compute_summary_at_mount to one.
 1449          */
 1450         if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
 1451                 return (0);
 1452         bzero(&cstotal, sizeof cstotal);
 1453         for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
 1454                 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
 1455                     fs->fs_cgsize, cred, &bp)) != 0) {
 1456                         brelse(bp);
 1457                         return (error);
 1458                 }
 1459                 cgp = (struct cg *)bp->b_data;
 1460                 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
 1461                 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
 1462                 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
 1463                 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
 1464                 fs->fs_cs(fs, cyl) = cgp->cg_cs;
 1465                 brelse(bp);
 1466         }
 1467 #ifdef DEBUG
 1468         if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
 1469                 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
 1470 #endif
 1471         bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
 1472         return (0);
 1473 }
 1474 
 1475 /*
 1476  * Protecting the freemaps (or bitmaps).
 1477  * 
 1478  * To eliminate the need to execute fsck before mounting a filesystem
 1479  * after a power failure, one must (conservatively) guarantee that the
 1480  * on-disk copy of the bitmaps never indicates that a live inode or block is
 1481  * free.  So, when a block or inode is allocated, the bitmap should be
 1482  * updated (on disk) before any new pointers.  When a block or inode is
 1483  * freed, the bitmap should not be updated until all pointers have been
 1484  * reset.  The latter dependency is handled by the delayed de-allocation
 1485  * approach described below for block and inode de-allocation.  The former
 1486  * dependency is handled by calling the following procedure when a block or
 1487  * inode is allocated. When an inode is allocated, an "inodedep" is created
 1488  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
 1489  * Each "inodedep" is also inserted into the hash indexing structure so
 1490  * that any additional link additions can be made dependent on the inode
 1491  * allocation.
 1492  * 
 1493  * The ufs filesystem maintains a number of free block counts (e.g., per
 1494  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
 1495  * in addition to the bitmaps.  These counts are used to improve efficiency
 1496  * during allocation and therefore must be consistent with the bitmaps.
 1497  * There is no convenient way to guarantee post-crash consistency of these
 1498  * counts with simple update ordering, for two main reasons: (1) The counts
 1499  * and bitmaps for a single cylinder group block are not in the same disk
 1500  * sector.  If a disk write is interrupted (e.g., by power failure), one may
 1501  * be written and the other not.  (2) Some of the counts are located in the
 1502  * superblock rather than the cylinder group block. So, we focus our soft
 1503  * updates implementation on protecting the bitmaps. When mounting a
 1504  * filesystem, we recompute the auxiliary counts from the bitmaps.
 1505  */
 1506 
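      /*
       * The ordering rule above, reduced to a sketch (simplified types
       * and hypothetical helpers, not the kernel interfaces): on
       * allocation the bitmap write precedes any pointer write; on free
       * the pointer writes precede the bitmap write.
       *
       *	// allocation: bitmap first, then the referencing pointer
       *	write_block(cg_bitmap_buf);	// marked allocated on disk
       *	write_block(referencing_buf);	// pointer may now appear
       *
       *	// free: pointer first, then the bitmap
       *	write_block(referencing_buf);	// pointer has been reset
       *	write_block(cg_bitmap_buf);	// only now marked free
       */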
 1507 /*
 1508  * Called just after updating the cylinder group block to allocate an inode.
 1509  */
 1510 void
 1511 softdep_setup_inomapdep(bp, ip, newinum)
 1512         struct buf *bp;         /* buffer for cylgroup block with inode map */
 1513         struct inode *ip;       /* inode related to allocation */
 1514         ino_t newinum;          /* new inode number being allocated */
 1515 {
 1516         struct inodedep *inodedep;
 1517         struct bmsafemap *bmsafemap;
 1518 
 1519         /*
 1520          * Create a dependency for the newly allocated inode.
 1521          * Panic if it already exists, as something is seriously wrong.
 1522          * Otherwise add it to the dependency list for the buffer holding
 1523          * the cylinder group map from which it was allocated.
 1524          */
 1525         ACQUIRE_LOCK(&lk);
 1526         if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
 1527             &inodedep)))
 1528                 panic("softdep_setup_inomapdep: dependency for new inode "
 1529                     "already exists");
 1530         inodedep->id_buf = bp;
 1531         inodedep->id_state &= ~DEPCOMPLETE;
 1532         bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
 1533         LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
 1534         FREE_LOCK(&lk);
 1535 }
 1536 
 1537 /*
 1538  * Called just after updating the cylinder group block to
 1539  * allocate block or fragment.
 1540  */
 1541 void
 1542 softdep_setup_blkmapdep(bp, mp, newblkno)
 1543         struct buf *bp;         /* buffer for cylgroup block with block map */
 1544         struct mount *mp;       /* filesystem doing allocation */
 1545         ufs2_daddr_t newblkno;  /* number of newly allocated block */
 1546 {
 1547         struct newblk *newblk;
 1548         struct bmsafemap *bmsafemap;
 1549         struct fs *fs;
 1550 
 1551         fs = VFSTOUFS(mp)->um_fs;
 1552         /*
 1553          * Create a dependency for the newly allocated block.
 1554          * Add it to the dependency list for the buffer holding
 1555          * the cylinder group map from which it was allocated.
 1556          */
 1557         ACQUIRE_LOCK(&lk);
 1558         if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
 1559                 panic("softdep_setup_blkmapdep: found block");
 1560         newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
 1561         LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
 1562         FREE_LOCK(&lk);
 1563 }
 1564 
 1565 /*
 1566  * Find the bmsafemap associated with a cylinder group buffer.
 1567  * If none exists, create one. The buffer must be locked when
 1568  * this routine is called, and it must be called with splbio
 1569  * interrupts blocked.
 1570  */
 1571 static struct bmsafemap *
 1572 bmsafemap_lookup(mp, bp)
 1573         struct mount *mp;
 1574         struct buf *bp;
 1575 {
 1576         struct bmsafemap *bmsafemap;
 1577         struct worklist *wk;
 1578 
 1579         mtx_assert(&lk, MA_OWNED);
 1580         LIST_FOREACH(wk, &bp->b_dep, wk_list)
 1581                 if (wk->wk_type == D_BMSAFEMAP)
 1582                         return (WK_BMSAFEMAP(wk));
 1583         FREE_LOCK(&lk);
 1584         MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
 1585                 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
 1586         workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
 1587         bmsafemap->sm_buf = bp;
 1588         LIST_INIT(&bmsafemap->sm_allocdirecthd);
 1589         LIST_INIT(&bmsafemap->sm_allocindirhd);
 1590         LIST_INIT(&bmsafemap->sm_inodedephd);
 1591         LIST_INIT(&bmsafemap->sm_newblkhd);
 1592         ACQUIRE_LOCK(&lk);
 1593         WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
 1594         return (bmsafemap);
 1595 }
 1596 
 1597 /*
 1598  * Direct block allocation dependencies.
 1599  * 
 1600  * When a new block is allocated, the corresponding disk locations must be
 1601  * initialized (with zeros or new data) before the on-disk inode points to
 1602  * them.  Also, the freemap from which the block was allocated must be
 1603  * updated (on disk) before the inode's pointer. These two dependencies are
 1604  * independent of each other and are needed for all file blocks and indirect
 1605  * blocks that are pointed to directly by the inode.  Just before the
 1606  * "in-core" version of the inode is updated with a newly allocated block
 1607  * number, a procedure (below) is called to setup allocation dependency
 1608  * structures.  These structures are removed when the corresponding
 1609  * dependencies are satisfied or when the block allocation becomes obsolete
 1610  * (i.e., the file is deleted, the block is de-allocated, or the block is a
 1611  * fragment that gets upgraded).  All of these cases are handled in
 1612  * procedures described later.
 1613  * 
 1614  * When a file extension causes a fragment to be upgraded, either to a larger
 1615  * fragment or to a full block, the on-disk location may change (if the
 1616  * previous fragment could not simply be extended). In this case, the old
 1617  * fragment must be de-allocated, but not until after the inode's pointer has
 1618  * been updated. In most cases, this is handled by later procedures, which
 1619  * will construct a "freefrag" structure to be added to the workitem queue
 1620  * when the inode update is complete (or obsolete).  The main exception to
 1621  * this is when an allocation occurs while a pending allocation dependency
 1622  * (for the same block pointer) remains.  This case is handled in the main
 1623  * allocation dependency setup procedure by immediately freeing the
 1624  * unreferenced fragments.
 1625  */ 
 1626 void 
 1627 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 1628         struct inode *ip;       /* inode to which block is being added */
 1629         ufs_lbn_t lbn;          /* block pointer within inode */
 1630         ufs2_daddr_t newblkno;  /* disk block number being added */
 1631         ufs2_daddr_t oldblkno;  /* previous block number, 0 unless frag */
 1632         long newsize;           /* size of new block */
 1633         long oldsize;           /* size of old block */
 1634         struct buf *bp;         /* bp for allocated block */
 1635 {
 1636         struct allocdirect *adp, *oldadp;
 1637         struct allocdirectlst *adphead;
 1638         struct bmsafemap *bmsafemap;
 1639         struct inodedep *inodedep;
 1640         struct pagedep *pagedep;
 1641         struct newblk *newblk;
 1642         struct mount *mp;
 1643 
 1644         mp = UFSTOVFS(ip->i_ump);
 1645         MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
 1646                 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
 1647         workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
 1648         adp->ad_lbn = lbn;
 1649         adp->ad_newblkno = newblkno;
 1650         adp->ad_oldblkno = oldblkno;
 1651         adp->ad_newsize = newsize;
 1652         adp->ad_oldsize = oldsize;
 1653         adp->ad_state = ATTACHED;
 1654         LIST_INIT(&adp->ad_newdirblk);
 1655         if (newblkno == oldblkno)
 1656                 adp->ad_freefrag = NULL;
 1657         else
 1658                 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
 1659 
 1660         ACQUIRE_LOCK(&lk);
 1661         if (lbn >= NDADDR) {
 1662                 /* allocating an indirect block */
 1663                 if (oldblkno != 0)
 1664                         panic("softdep_setup_allocdirect: non-zero indir");
 1665         } else {
 1666                 /*
 1667                  * Allocating a direct block.
 1668                  *
 1669                  * If we are allocating a directory block, then we must
 1670                  * allocate an associated pagedep to track additions and
 1671                  * deletions.
 1672                  */
 1673                 if ((ip->i_mode & IFMT) == IFDIR &&
 1674                     pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
 1675                         WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 1676         }
 1677         if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
 1678                 panic("softdep_setup_allocdirect: lost block");
 1679         if (newblk->nb_state == DEPCOMPLETE) {
 1680                 adp->ad_state |= DEPCOMPLETE;
 1681                 adp->ad_buf = NULL;
 1682         } else {
 1683                 bmsafemap = newblk->nb_bmsafemap;
 1684                 adp->ad_buf = bmsafemap->sm_buf;
 1685                 LIST_REMOVE(newblk, nb_deps);
 1686                 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
 1687         }
 1688         LIST_REMOVE(newblk, nb_hash);
 1689         FREE(newblk, M_NEWBLK);
 1690 
 1691         inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
 1692         adp->ad_inodedep = inodedep;
 1693         WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 1694         /*
 1695          * The list of allocdirects must be kept sorted in ascending
 1696          * order so that the rollback routines can quickly determine the
 1697          * first uncommitted block (the size of the file stored on disk
 1698          * ends at the end of the lowest committed fragment, or if there
 1699          * are no fragments, at the end of the highest committed block).
 1700          * Since files generally grow, the typical case is that the new
 1701          * block is to be added at the end of the list. We speed this
 1702          * special case by checking against the last allocdirect in the
 1703          * list before laboriously traversing the list looking for the
 1704          * insertion point.
 1705          */
 1706         adphead = &inodedep->id_newinoupdt;
 1707         oldadp = TAILQ_LAST(adphead, allocdirectlst);
 1708         if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
 1709                 /* insert at end of list */
 1710                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 1711                 if (oldadp != NULL && oldadp->ad_lbn == lbn)
 1712                         allocdirect_merge(adphead, adp, oldadp);
 1713                 FREE_LOCK(&lk);
 1714                 return;
 1715         }
 1716         TAILQ_FOREACH(oldadp, adphead, ad_next) {
 1717                 if (oldadp->ad_lbn >= lbn)
 1718                         break;
 1719         }
 1720         if (oldadp == NULL)
 1721                 panic("softdep_setup_allocdirect: lost entry");
 1722         /* insert in middle of list */
 1723         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 1724         if (oldadp->ad_lbn == lbn)
 1725                 allocdirect_merge(adphead, adp, oldadp);
 1726         FREE_LOCK(&lk);
 1727 }
 1728 
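      /*
       * Why the allocdirect list above is kept sorted, in miniature (a
       * sketch of the idea, not the actual rollback code): scanning
       * from the head, the first entry that has not committed bounds
       * the file size that may safely appear on disk.
       *
       *	TAILQ_FOREACH(adp, adphead, ad_next)
       *		if ((adp->ad_state & COMPLETE) == 0)
       *			break;		// first uncommitted block
       *	// roll the on-disk size back to end below adp->ad_lbn
       */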
 1729 /*
 1730  * Replace an old allocdirect dependency with a newer one.
 1731  * This routine must be called with splbio interrupts blocked.
 1732  */
 1733 static void
 1734 allocdirect_merge(adphead, newadp, oldadp)
 1735         struct allocdirectlst *adphead; /* head of list holding allocdirects */
 1736         struct allocdirect *newadp;     /* allocdirect being added */
 1737         struct allocdirect *oldadp;     /* existing allocdirect being checked */
 1738 {
 1739         struct worklist *wk;
 1740         struct freefrag *freefrag;
 1741         struct newdirblk *newdirblk;
 1742 
 1743         mtx_assert(&lk, MA_OWNED);
 1744         if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
 1745             newadp->ad_oldsize != oldadp->ad_newsize ||
 1746             newadp->ad_lbn >= NDADDR)
 1747                 panic("%s %jd != new %jd || old size %ld != new %ld",
 1748                     "allocdirect_merge: old blkno",
 1749                     (intmax_t)newadp->ad_oldblkno,
 1750                     (intmax_t)oldadp->ad_newblkno,
 1751                     newadp->ad_oldsize, oldadp->ad_newsize);
 1752         newadp->ad_oldblkno = oldadp->ad_oldblkno;
 1753         newadp->ad_oldsize = oldadp->ad_oldsize;
 1754         /*
 1755          * If the old dependency had a fragment to free or had never
 1756          * previously had a block allocated, then the new dependency
 1757          * can immediately post its freefrag and adopt the old freefrag.
 1758          * This action is done by swapping the freefrag dependencies.
 1759          * The new dependency gains the old one's freefrag, and the
 1760          * old one gets the new one and then immediately puts it on
 1761          * the worklist when it is freed by free_allocdirect. It is
 1762          * not possible to do this swap when the old dependency had a
 1763          * non-zero size but no previous fragment to free. This condition
 1764          * arises when the new block is an extension of the old block.
 1765          * Here, the first part of the fragment allocated to the new
 1766          * dependency is part of the block currently claimed on disk by
 1767          * the old dependency, so cannot legitimately be freed until the
 1768          * conditions for the new dependency are fulfilled.
 1769          */
 1770         if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
 1771                 freefrag = newadp->ad_freefrag;
 1772                 newadp->ad_freefrag = oldadp->ad_freefrag;
 1773                 oldadp->ad_freefrag = freefrag;
 1774         }
 1775         /*
 1776          * If we are tracking a new directory-block allocation,
 1777          * move it from the old allocdirect to the new allocdirect.
 1778          */
 1779         if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
 1780                 newdirblk = WK_NEWDIRBLK(wk);
 1781                 WORKLIST_REMOVE(&newdirblk->db_list);
 1782                 if (!LIST_EMPTY(&oldadp->ad_newdirblk))
 1783                         panic("allocdirect_merge: extra newdirblk");
 1784                 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
 1785         }
 1786         free_allocdirect(adphead, oldadp, 0);
 1787 }
 1788                 
 1789 /*
 1790  * Allocate a new freefrag structure if needed.
 1791  */
 1792 static struct freefrag *
 1793 newfreefrag(ip, blkno, size)
 1794         struct inode *ip;
 1795         ufs2_daddr_t blkno;
 1796         long size;
 1797 {
 1798         struct freefrag *freefrag;
 1799         struct fs *fs;
 1800 
 1801         if (blkno == 0)
 1802                 return (NULL);
 1803         fs = ip->i_fs;
 1804         if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
 1805                 panic("newfreefrag: frag size");
 1806         MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
 1807                 M_FREEFRAG, M_SOFTDEP_FLAGS);
 1808         workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
 1809         freefrag->ff_inum = ip->i_number;
 1810         freefrag->ff_blkno = blkno;
 1811         freefrag->ff_fragsize = size;
 1812         return (freefrag);
 1813 }
 1814 
 1815 /*
 1816  * This workitem de-allocates fragments that were replaced during
 1817  * file block allocation.
 1818  */
 1819 static void 
 1820 handle_workitem_freefrag(freefrag)
 1821         struct freefrag *freefrag;
 1822 {
 1823         struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
 1824 
 1825         ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
 1826             freefrag->ff_fragsize, freefrag->ff_inum);
 1827         ACQUIRE_LOCK(&lk);
 1828         WORKITEM_FREE(freefrag, D_FREEFRAG);
 1829         FREE_LOCK(&lk);
 1830 }
 1831 
 1832 /*
 1833  * Set up a dependency structure for an external attributes data block.
 1834  * This routine follows much of the structure of softdep_setup_allocdirect.
 1835  * See the description of softdep_setup_allocdirect above for details.
 1836  */
 1837 void 
 1838 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 1839         struct inode *ip;
 1840         ufs_lbn_t lbn;
 1841         ufs2_daddr_t newblkno;
 1842         ufs2_daddr_t oldblkno;
 1843         long newsize;
 1844         long oldsize;
 1845         struct buf *bp;
 1846 {
 1847         struct allocdirect *adp, *oldadp;
 1848         struct allocdirectlst *adphead;
 1849         struct bmsafemap *bmsafemap;
 1850         struct inodedep *inodedep;
 1851         struct newblk *newblk;
 1852         struct mount *mp;
 1853 
 1854         mp = UFSTOVFS(ip->i_ump);
 1855         MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
 1856                 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
 1857         workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
 1858         adp->ad_lbn = lbn;
 1859         adp->ad_newblkno = newblkno;
 1860         adp->ad_oldblkno = oldblkno;
 1861         adp->ad_newsize = newsize;
 1862         adp->ad_oldsize = oldsize;
 1863         adp->ad_state = ATTACHED | EXTDATA;
 1864         LIST_INIT(&adp->ad_newdirblk);
 1865         if (newblkno == oldblkno)
 1866                 adp->ad_freefrag = NULL;
 1867         else
 1868                 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
 1869 
 1870         ACQUIRE_LOCK(&lk);
 1871         if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
 1872                 panic("softdep_setup_allocext: lost block");
 1873 
 1874         inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
 1875         adp->ad_inodedep = inodedep;
 1876 
 1877         if (newblk->nb_state == DEPCOMPLETE) {
 1878                 adp->ad_state |= DEPCOMPLETE;
 1879                 adp->ad_buf = NULL;
 1880         } else {
 1881                 bmsafemap = newblk->nb_bmsafemap;
 1882                 adp->ad_buf = bmsafemap->sm_buf;
 1883                 LIST_REMOVE(newblk, nb_deps);
 1884                 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
 1885         }
 1886         LIST_REMOVE(newblk, nb_hash);
 1887         FREE(newblk, M_NEWBLK);
 1888 
 1889         WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 1890         if (lbn >= NXADDR)
 1891                 panic("softdep_setup_allocext: lbn %lld >= NXADDR",
 1892                     (long long)lbn);
 1893         /*
 1894          * The list of allocdirects must be kept sorted in ascending
 1895          * order so that the rollback routines can quickly determine the
 1896          * first uncommitted block (the size of the file stored on disk
 1897          * ends at the end of the lowest committed fragment, or if there
 1898          * are no fragments, at the end of the highest committed block).
 1899          * Since files generally grow, the typical case is that the new
 1900          * block is to be added at the end of the list. We speed this
 1901          * special case by checking against the last allocdirect in the
 1902          * list before laboriously traversing the list looking for the
 1903          * insertion point.
 1904          */
 1905         adphead = &inodedep->id_newextupdt;
 1906         oldadp = TAILQ_LAST(adphead, allocdirectlst);
 1907         if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
 1908                 /* insert at end of list */
 1909                 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 1910                 if (oldadp != NULL && oldadp->ad_lbn == lbn)
 1911                         allocdirect_merge(adphead, adp, oldadp);
 1912                 FREE_LOCK(&lk);
 1913                 return;
 1914         }
 1915         TAILQ_FOREACH(oldadp, adphead, ad_next) {
 1916                 if (oldadp->ad_lbn >= lbn)
 1917                         break;
 1918         }
 1919         if (oldadp == NULL)
 1920                 panic("softdep_setup_allocext: lost entry");
 1921         /* insert in middle of list */
 1922         TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 1923         if (oldadp->ad_lbn == lbn)
 1924                 allocdirect_merge(adphead, adp, oldadp);
 1925         FREE_LOCK(&lk);
 1926 }
 1927 
 1928 /*
 1929  * Indirect block allocation dependencies.
 1930  * 
 1931  * The same dependencies that exist for a direct block also exist when
 1932  * a new block is allocated and pointed to by an entry in a block of
 1933  * indirect pointers. The undo/redo states described above are also
 1934  * used here. Because an indirect block contains many pointers that
 1935  * may have dependencies, a second copy of the entire in-memory indirect
 1936  * block is kept. The buffer cache copy is always completely up-to-date.
 1937  * The second copy, which is used only as a source for disk writes,
 1938  * contains only the safe pointers (i.e., those that have no remaining
 1939  * update dependencies). The second copy is freed when all pointers
 1940  * are safe. The cache is not allowed to replace indirect blocks with
 1941  * pending update dependencies. If a buffer containing an indirect
 1942  * block with dependencies is written, these routines will mark it
 1943  * dirty again. It can only be successfully written once all the
 1944  * dependencies are removed. The ffs_fsync routine in conjunction with
 1945  * softdep_sync_metadata work together to get all the dependencies
 1946  * removed so that a file can be successfully written to disk. Three
 1947  * procedures are used when setting up indirect block pointer
 1948  * dependencies. The division is necessary because of the organization
 1949  * of the "balloc" routine and because of the distinction between file
 1950  * pages and file metadata blocks.
 1951  */
 1952 
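      /*
       * The two-copy scheme above, reduced to a sketch (the array
       * notation is illustrative; the real code distinguishes UFS1 and
       * UFS2 pointer sizes): the cache copy always holds the new
       * pointer, while the save copy holds only what is safe to let
       * reach the disk.
       *
       *	cache[ptrno] = newblkno;	// buffer cache copy: up to date
       *	save[ptrno] = oldblkno;		// safe copy: old pointer for now
       *	...
       *	// once the new block's dependencies are satisfied:
       *	save[ptrno] = newblkno;		// pointer may now go to disk
       */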
 1953 /*
 1954  * Allocate a new allocindir structure.
 1955  */
 1956 static struct allocindir *
 1957 newallocindir(ip, ptrno, newblkno, oldblkno)
 1958         struct inode *ip;       /* inode for file being extended */
 1959         int ptrno;              /* offset of pointer in indirect block */
 1960         ufs2_daddr_t newblkno;  /* disk block number being added */
 1961         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
 1962 {
 1963         struct allocindir *aip;
 1964 
 1965         MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
 1966                 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
 1967         workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
 1968         aip->ai_state = ATTACHED;
 1969         aip->ai_offset = ptrno;
 1970         aip->ai_newblkno = newblkno;
 1971         aip->ai_oldblkno = oldblkno;
 1972         aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
 1973         return (aip);
 1974 }
 1975 
 1976 /*
 1977  * Called just before setting an indirect block pointer
 1978  * to a newly allocated file page.
 1979  */
 1980 void
 1981 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 1982         struct inode *ip;       /* inode for file being extended */
 1983         ufs_lbn_t lbn;          /* allocated block number within file */
 1984         struct buf *bp;         /* buffer with indirect blk referencing page */
 1985         int ptrno;              /* offset of pointer in indirect block */
 1986         ufs2_daddr_t newblkno;  /* disk block number being added */
 1987         ufs2_daddr_t oldblkno;  /* previous block number, 0 if none */
 1988         struct buf *nbp;        /* buffer holding allocated page */
 1989 {
 1990         struct allocindir *aip;
 1991         struct pagedep *pagedep;
 1992 
 1993         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
 1994         aip = newallocindir(ip, ptrno, newblkno, oldblkno);
 1995         ACQUIRE_LOCK(&lk);
 1996         /*
 1997          * If we are allocating a directory page, then we must
 1998          * allocate an associated pagedep to track additions and
 1999          * deletions.
 2000          */
 2001         if ((ip->i_mode & IFMT) == IFDIR &&
 2002             pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
 2003                 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
 2004         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
 2005         setup_allocindir_phase2(bp, ip, aip);
 2006         FREE_LOCK(&lk);
 2007 }
 2008 
 2009 /*
 2010  * Called just before setting an indirect block pointer to a
 2011  * newly allocated indirect block.
 2012  */
 2013 void
 2014 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 2015         struct buf *nbp;        /* newly allocated indirect block */
 2016         struct inode *ip;       /* inode for file being extended */
 2017         struct buf *bp;         /* indirect block referencing allocated block */
 2018         int ptrno;              /* offset of pointer in indirect block */
 2019         ufs2_daddr_t newblkno;  /* disk block number being added */
 2020 {
 2021         struct allocindir *aip;
 2022 
 2023         ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
 2024         aip = newallocindir(ip, ptrno, newblkno, 0);
 2025         ACQUIRE_LOCK(&lk);
 2026         WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
 2027         setup_allocindir_phase2(bp, ip, aip);
 2028         FREE_LOCK(&lk);
 2029 }
 2030 
 2031 /*
 2032  * Called to finish the setup of the "aip" allocated
 2033  * by one of the two routines above.
 2034  */
 2035 static void 
 2036 setup_allocindir_phase2(bp, ip, aip)
 2037         struct buf *bp;         /* in-memory copy of the indirect block */
 2038         struct inode *ip;       /* inode for file being extended */
 2039         struct allocindir *aip; /* allocindir allocated by the above routines */
 2040 {
 2041         struct worklist *wk;
 2042         struct indirdep *indirdep, *newindirdep;
 2043         struct bmsafemap *bmsafemap;
 2044         struct allocindir *oldaip;
 2045         struct freefrag *freefrag;
 2046         struct newblk *newblk;
 2047         ufs2_daddr_t blkno;
 2048 
 2049         mtx_assert(&lk, MA_OWNED);
 2050         if (bp->b_lblkno >= 0)
 2051                 panic("setup_allocindir_phase2: not indir blk");
 2052         for (indirdep = NULL, newindirdep = NULL; ; ) {
 2053                 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 2054                         if (wk->wk_type != D_INDIRDEP)
 2055                                 continue;
 2056                         indirdep = WK_INDIRDEP(wk);
 2057                         break;
 2058                 }
 2059                 if (indirdep == NULL && newindirdep) {
 2060                         indirdep = newindirdep;
 2061                         WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
 2062                         newindirdep = NULL;
 2063                 }
 2064                 if (indirdep) {
 2065                         if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
 2066                             &newblk) == 0)
 2067                                 panic("setup_allocindir: lost block");
 2068                         if (newblk->nb_state == DEPCOMPLETE) {
 2069                                 aip->ai_state |= DEPCOMPLETE;
 2070                                 aip->ai_buf = NULL;
 2071                         } else {
 2072                                 bmsafemap = newblk->nb_bmsafemap;
 2073                                 aip->ai_buf = bmsafemap->sm_buf;
 2074                                 LIST_REMOVE(newblk, nb_deps);
 2075                                 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
 2076                                     aip, ai_deps);
 2077                         }
 2078                         LIST_REMOVE(newblk, nb_hash);
 2079                         FREE(newblk, M_NEWBLK);
 2080                         aip->ai_indirdep = indirdep;
 2081                         /*
 2082                          * Check to see if there is an existing dependency
 2083                          * for this block. If there is, merge the old
 2084                          * dependency into the new one.
 2085                          */
 2086                         if (aip->ai_oldblkno == 0)
 2087                                 oldaip = NULL;
 2088                         else
 2089                                 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd,
 2090                                     ai_next)
 2091                                         if (oldaip->ai_offset == aip->ai_offset)
 2092                                                 break;
 2093                         freefrag = NULL;
 2094                         if (oldaip != NULL) {
 2095                                 if (oldaip->ai_newblkno != aip->ai_oldblkno)
 2096                                         panic("setup_allocindir_phase2: blkno");
 2097                                 aip->ai_oldblkno = oldaip->ai_oldblkno;
 2098                                 freefrag = aip->ai_freefrag;
 2099                                 aip->ai_freefrag = oldaip->ai_freefrag;
 2100                                 oldaip->ai_freefrag = NULL;
 2101                                 free_allocindir(oldaip, NULL);
 2102                         }
 2103                         LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
 2104                         if (ip->i_ump->um_fstype == UFS1)
 2105                                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
 2106                                     [aip->ai_offset] = aip->ai_oldblkno;
 2107                         else
 2108                                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
 2109                                     [aip->ai_offset] = aip->ai_oldblkno;
 2110                         FREE_LOCK(&lk);
 2111                         if (freefrag != NULL)
 2112                                 handle_workitem_freefrag(freefrag);
 2113                 } else
 2114                         FREE_LOCK(&lk);
 2115                 if (newindirdep) {
 2116                         newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
 2117                         brelse(newindirdep->ir_savebp);
 2118                         ACQUIRE_LOCK(&lk);
 2119                         WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
 2120                         if (indirdep)
 2121                                 break;
 2122                         FREE_LOCK(&lk);
 2123                 }
 2124                 if (indirdep) {
 2125                         ACQUIRE_LOCK(&lk);
 2126                         break;
 2127                 }
 2128                 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
 2129                         M_INDIRDEP, M_SOFTDEP_FLAGS);
 2130                 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
 2131                     UFSTOVFS(ip->i_ump));
 2132                 newindirdep->ir_state = ATTACHED;
 2133                 if (ip->i_ump->um_fstype == UFS1)
 2134                         newindirdep->ir_state |= UFS1FMT;
 2135                 LIST_INIT(&newindirdep->ir_deplisthd);
 2136                 LIST_INIT(&newindirdep->ir_donehd);
 2137                 if (bp->b_blkno == bp->b_lblkno) {
 2138                         ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
 2139                             NULL, NULL);
 2140                         bp->b_blkno = blkno;
 2141                 }
 2142                 newindirdep->ir_savebp =
 2143                     getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
 2144                 BUF_KERNPROC(newindirdep->ir_savebp);
 2145                 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
 2146                 ACQUIRE_LOCK(&lk);
 2147         }
 2148 }
 2149 
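      /*
       * The loop in setup_allocindir_phase2 follows a pattern worth
       * calling out (hypothetical names; a sketch of the shape only):
       * search under the lock, drop the lock to allocate, then search
       * again before installing, since another thread may have attached
       * an indirdep to the buffer in the meantime.
       *
       *	for (found = NULL, spare = NULL;;) {
       *		found = search(&bp->b_dep);
       *		if (found == NULL && spare != NULL) {
       *			install(&bp->b_dep, spare);
       *			found = spare;
       *			spare = NULL;
       *		}
       *		if (found != NULL) {
       *			if (spare != NULL)
       *				release(spare);	// lost the race
       *			break;
       *		}
       *		unlock();
       *		spare = alloc();	// may sleep; cannot hold lock
       *		lock();
       *	}
       */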
 2150 /*
 2151  * Block de-allocation dependencies.
 2152  * 
 2153  * When blocks are de-allocated, the on-disk pointers must be nullified before
 2154  * the blocks are made available for use by other files.  (The true
 2155  * requirement is that old pointers must be nullified before new on-disk
 2156  * pointers are set.  We chose this slightly more stringent requirement to
 2157  * reduce complexity.) Our implementation handles this dependency by updating
 2158  * the inode (or indirect block) appropriately but delaying the actual block
 2159  * de-allocation (i.e., freemap and free space count manipulation) until
 2160  * after the updated versions reach stable storage.  After the disk is
 2161  * updated, the blocks can be safely de-allocated whenever it is convenient.
 2162  * This implementation handles only the common case of reducing a file's
 2163  * length to zero. Other cases are handled by the conventional synchronous
 2164  * write approach.
 2165  *
 2166  * The ffs implementation with which we worked double-checks
 2167  * the state of the block pointers and file size as it reduces
 2168  * a file's length.  Some of this code is replicated here in our
 2169  * soft updates implementation.  The freeblks->fb_chkcnt field is
 2170  * used to transfer a part of this information to the procedure
 2171  * that eventually de-allocates the blocks.
 2172  *
 2173  * This routine should be called from the routine that shortens
 2174  * a file's length, before the inode's size or block pointers
 2175  * are modified. It will save the block pointer information for
 2176  * later release and zero the inode so that the calling routine
 2177  * can release it.
 2178  */
 2179 void
 2180 softdep_setup_freeblocks(ip, length, flags)
 2181         struct inode *ip;       /* The inode whose length is to be reduced */
 2182         off_t length;           /* The new length for the file */
 2183         int flags;              /* IO_EXT and/or IO_NORMAL */
 2184 {
 2185         struct freeblks *freeblks;
 2186         struct inodedep *inodedep;
 2187         struct allocdirect *adp;
 2188         struct vnode *vp;
 2189         struct buf *bp;
 2190         struct fs *fs;
 2191         ufs2_daddr_t extblocks, datablocks;
 2192         struct mount *mp;
 2193         int i, delay, error;
 2194 
 2195         fs = ip->i_fs;
 2196         mp = UFSTOVFS(ip->i_ump);
 2197         if (length != 0)
 2198                 panic("softdep_setup_freeblocks: non-zero length");
 2199         MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
 2200                 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
 2201         workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
 2202         freeblks->fb_state = ATTACHED;
 2203         freeblks->fb_uid = ip->i_uid;
 2204         freeblks->fb_previousinum = ip->i_number;
 2205         freeblks->fb_devvp = ip->i_devvp;
 2206         extblocks = 0;
 2207         if (fs->fs_magic == FS_UFS2_MAGIC)
 2208                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 2209         datablocks = DIP(ip, i_blocks) - extblocks;
 2210         if ((flags & IO_NORMAL) == 0) {
 2211                 freeblks->fb_oldsize = 0;
 2212                 freeblks->fb_chkcnt = 0;
 2213         } else {
 2214                 freeblks->fb_oldsize = ip->i_size;
 2215                 ip->i_size = 0;
 2216                 DIP_SET(ip, i_size, 0);
 2217                 freeblks->fb_chkcnt = datablocks;
 2218                 for (i = 0; i < NDADDR; i++) {
 2219                         freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
 2220                         DIP_SET(ip, i_db[i], 0);
 2221                 }
 2222                 for (i = 0; i < NIADDR; i++) {
 2223                         freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
 2224                         DIP_SET(ip, i_ib[i], 0);
 2225                 }
 2226                 /*
 2227                  * If the file was removed, then the space being freed was
 2228                  * accounted for then (see softdep_releasefile()). If the
 2229                  * file is merely being truncated, then we account for it now.
 2230                  */
 2231                 if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
 2232                         UFS_LOCK(ip->i_ump);
 2233                         fs->fs_pendingblocks += datablocks;
 2234                         UFS_UNLOCK(ip->i_ump);
 2235                 }
 2236         }
 2237         if ((flags & IO_EXT) == 0) {
 2238                 freeblks->fb_oldextsize = 0;
 2239         } else {
 2240                 freeblks->fb_oldextsize = ip->i_din2->di_extsize;
 2241                 ip->i_din2->di_extsize = 0;
 2242                 freeblks->fb_chkcnt += extblocks;
 2243                 for (i = 0; i < NXADDR; i++) {
 2244                         freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
 2245                         ip->i_din2->di_extb[i] = 0;
 2246                 }
 2247         }
 2248         DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
 2249         /*
 2250          * Push the zero'ed inode to its disk buffer so that we are free
 2251          * to delete its dependencies below. Once the dependencies are gone
 2252          * the buffer can be safely released.
 2253          */
 2254         if ((error = bread(ip->i_devvp,
 2255             fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 2256             (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
 2257                 brelse(bp);
 2258                 softdep_error("softdep_setup_freeblocks", error);
 2259         }
 2260         if (ip->i_ump->um_fstype == UFS1)
 2261                 *((struct ufs1_dinode *)bp->b_data +
 2262                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
 2263         else
 2264                 *((struct ufs2_dinode *)bp->b_data +
 2265                     ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
 2266         /*
 2267          * Find and eliminate any inode dependencies.
 2268          */
 2269         ACQUIRE_LOCK(&lk);
 2270         (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 2271         if ((inodedep->id_state & IOSTARTED) != 0)
 2272                 panic("softdep_setup_freeblocks: inode busy");
 2273         /*
 2274          * Add the freeblks structure to the list of operations that
 2275          * must await the zero'ed inode being written to disk. If we
 2276          * still have a bitmap dependency (delay == 0), then the inode
 2277          * has never been written to disk, so we can process the
 2278          * freeblks below once we have deleted the dependencies.
 2279          */
 2280         delay = (inodedep->id_state & DEPCOMPLETE);
 2281         if (delay)
 2282                 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
 2283         /*
 2284          * Because the file length has been truncated to zero, any
 2285          * pending block allocation dependency structures associated
 2286          * with this inode are obsolete and can simply be de-allocated.
 2287          * We must first merge the two dependency lists to get rid of
 2288          * any duplicate freefrag structures, then purge the merged list.
 2289          * If we still have a bitmap dependency, then the inode has never
 2290          * been written to disk, so we can free any fragments without delay.
 2291          */
 2292         if (flags & IO_NORMAL) {
 2293                 merge_inode_lists(&inodedep->id_newinoupdt,
 2294                     &inodedep->id_inoupdt);
 2295                 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
 2296                         free_allocdirect(&inodedep->id_inoupdt, adp, delay);
 2297         }
 2298         if (flags & IO_EXT) {
 2299                 merge_inode_lists(&inodedep->id_newextupdt,
 2300                     &inodedep->id_extupdt);
 2301                 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
 2302                         free_allocdirect(&inodedep->id_extupdt, adp, delay);
 2303         }
 2304         FREE_LOCK(&lk);
 2305         bdwrite(bp);
 2306         /*
 2307          * We must wait for any I/O in progress to finish so that
 2308          * all potential buffers on the dirty list will be visible.
 2309          * Once they are all there, walk the list and get rid of
 2310          * any dependencies.
 2311          */
 2312         vp = ITOV(ip);
 2313         VI_LOCK(vp);
 2314         drain_output(vp);
 2315 restart:
 2316         TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
 2317                 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
 2318                     ((flags & IO_NORMAL) == 0 &&
 2319                       (bp->b_xflags & BX_ALTDATA) == 0))
 2320                         continue;
 2321                 if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL)
 2322                         goto restart;
 2323                 VI_UNLOCK(vp);
 2324                 ACQUIRE_LOCK(&lk);
 2325                 (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
 2326                 deallocate_dependencies(bp, inodedep);
 2327                 FREE_LOCK(&lk);
 2328                 bp->b_flags |= B_INVAL | B_NOCACHE;
 2329                 brelse(bp);
 2330                 VI_LOCK(vp);
 2331                 goto restart;
 2332         }
 2333         VI_UNLOCK(vp);
 2334         ACQUIRE_LOCK(&lk);
 2335         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 2336                 (void) free_inodedep(inodedep);
 2337 
 2338         if (delay) {
 2339                 freeblks->fb_state |= DEPCOMPLETE;
 2340                 /*
 2341                  * If the inode with zeroed block pointers is now on disk,
 2342                  * we can start freeing blocks. Add freeblks to the worklist
 2343                  * instead of calling handle_workitem_freeblocks directly, as
 2344                  * it is more likely that additional I/O is needed to complete
 2345                  * the request here than in the !delay case.
 2346                  */
 2347                 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
 2348                         add_to_worklist(&freeblks->fb_list);
 2349         }
 2350 
 2351         FREE_LOCK(&lk);
 2352         /*
 2353          * If the inode has never been written to disk (delay == 0),
 2354          * then we can process the freeblks now that we have deleted
 2355          * the dependencies.
 2356          */
 2357         if (!delay)
 2358                 handle_workitem_freeblocks(freeblks, 0);
 2359 }
 2360 
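      /*
       * The de-allocation ordering above, in miniature (simplified;
       * the helpers here are hypothetical): pointers are nullified and
       * written before any block is returned to the freemap.
       *
       *	for (i = 0; i < NDADDR; i++) {
       *		saved[i] = DIP(ip, i_db[i]);	// remember the block
       *		DIP_SET(ip, i_db[i], 0);	// nullify the pointer
       *	}
       *	write_inode_block(ip);		// zeroed pointers reach disk
       *	for (i = 0; i < NDADDR; i++)
       *		if (saved[i] != 0)
       *			free_block(saved[i]);	// now safe to free
       */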
 2361 /*
 2362  * Reclaim any dependency structures from a buffer that is about to
 2363  * be reallocated to a new vnode. The buffer must be locked, thus,
 2364  * no I/O completion operations can occur while we are manipulating
 2365  * its associated dependencies. The mutex is held so that other I/O's
 2366  * associated with related dependencies do not occur.
 2367  */
 2368 static void
 2369 deallocate_dependencies(bp, inodedep)
 2370         struct buf *bp;
 2371         struct inodedep *inodedep;
 2372 {
 2373         struct worklist *wk;
 2374         struct indirdep *indirdep;
 2375         struct allocindir *aip;
 2376         struct pagedep *pagedep;
 2377         struct dirrem *dirrem;
 2378         struct diradd *dap;
 2379         int i;
 2380 
 2381         mtx_assert(&lk, MA_OWNED);
 2382         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 2383                 switch (wk->wk_type) {
 2384 
 2385                 case D_INDIRDEP:
 2386                         indirdep = WK_INDIRDEP(wk);
 2387                         /*
 2388                          * None of the indirect pointers will ever be visible,
 2389                          * so they can simply be tossed. GOINGAWAY ensures
 2390                          * that allocated pointers will be saved in the buffer
 2391                          * cache until they are freed. Note that they will
 2392                          * only be able to be found by their physical address
 2393                          * since the inode mapping the logical address will
 2394                          * be gone. The save buffer used for the safe copy
 2395                          * was allocated in setup_allocindir_phase2 using
 2396                          * the physical address so it could be used for this
 2397                          * purpose. Hence we swap the safe copy with the real
 2398                          * copy, allowing the safe copy to be freed and holding
 2399                          * on to the real copy for later use in indir_trunc.
 2400                          */
 2401                         if (indirdep->ir_state & GOINGAWAY)
 2402                                 panic("deallocate_dependencies: already gone");
 2403                         indirdep->ir_state |= GOINGAWAY;
 2404                         VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
 2405                         while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
 2406                                 free_allocindir(aip, inodedep);
 2407                         if (bp->b_lblkno >= 0 ||
 2408                             bp->b_blkno != indirdep->ir_savebp->b_lblkno)
 2409                                 panic("deallocate_dependencies: not indir");
 2410                         bcopy(bp->b_data, indirdep->ir_savebp->b_data,
 2411                             bp->b_bcount);
 2412                         WORKLIST_REMOVE(wk);
 2413                         WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
 2414                         continue;
 2415 
 2416                 case D_PAGEDEP:
 2417                         pagedep = WK_PAGEDEP(wk);
 2418                         /*
 2419                          * None of the directory additions will ever be
 2420                          * visible, so they can simply be tossed.
 2421                          */
 2422                         for (i = 0; i < DAHASHSZ; i++)
 2423                                 while ((dap =
 2424                                     LIST_FIRST(&pagedep->pd_diraddhd[i])))
 2425                                         free_diradd(dap);
 2426                         while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
 2427                                 free_diradd(dap);
 2428                         /*
 2429                          * Copy any directory remove dependencies to the list
 2430                          * to be processed after the zero'ed inode is written.
 2431                          * If the inode has already been written, then they 
 2432                          * can be dumped directly onto the work list.
 2433                          */
 2434                         LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
 2435                                 LIST_REMOVE(dirrem, dm_next);
 2436                                 dirrem->dm_dirinum = pagedep->pd_ino;
 2437                                 if (inodedep == NULL ||
 2438                                     (inodedep->id_state & ALLCOMPLETE) ==
 2439                                      ALLCOMPLETE)
 2440                                         add_to_worklist(&dirrem->dm_list);
 2441                                 else
 2442                                         WORKLIST_INSERT(&inodedep->id_bufwait,
 2443                                             &dirrem->dm_list);
 2444                         }
 2445                         if ((pagedep->pd_state & NEWBLOCK) != 0) {
 2446                                 LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
 2447                                         if (wk->wk_type == D_NEWDIRBLK &&
 2448                                             WK_NEWDIRBLK(wk)->db_pagedep ==
 2449                                               pagedep)
 2450                                                 break;
 2451                                 if (wk != NULL) {
 2452                                         WORKLIST_REMOVE(wk);
 2453                                         free_newdirblk(WK_NEWDIRBLK(wk));
 2454                                 } else
 2455                                         panic("deallocate_dependencies: "
 2456                                               "lost pagedep");
 2457                         }
 2458                         WORKLIST_REMOVE(&pagedep->pd_list);
 2459                         LIST_REMOVE(pagedep, pd_hash);
 2460                         WORKITEM_FREE(pagedep, D_PAGEDEP);
 2461                         continue;
 2462 
 2463                 case D_ALLOCINDIR:
 2464                         free_allocindir(WK_ALLOCINDIR(wk), inodedep);
 2465                         continue;
 2466 
 2467                 case D_ALLOCDIRECT:
 2468                 case D_INODEDEP:
 2469                         panic("deallocate_dependencies: Unexpected type %s",
 2470                             TYPENAME(wk->wk_type));
 2471                         /* NOTREACHED */
 2472 
 2473                 default:
 2474                         panic("deallocate_dependencies: Unknown type %s",
 2475                             TYPENAME(wk->wk_type));
 2476                         /* NOTREACHED */
 2477                 }
 2478         }
 2479 }
 2480 
 2481 /*
 2482  * Free an allocdirect. Generate a new freefrag work request if appropriate.
 2483  * This routine must be called with splbio interrupts blocked.
 2484  */
 2485 static void
 2486 free_allocdirect(adphead, adp, delay)
 2487         struct allocdirectlst *adphead;
 2488         struct allocdirect *adp;
 2489         int delay;
 2490 {
 2491         struct newdirblk *newdirblk;
 2492         struct worklist *wk;
 2493 
 2494         mtx_assert(&lk, MA_OWNED);
 2495         if ((adp->ad_state & DEPCOMPLETE) == 0)
 2496                 LIST_REMOVE(adp, ad_deps);
 2497         TAILQ_REMOVE(adphead, adp, ad_next);
 2498         if ((adp->ad_state & COMPLETE) == 0)
 2499                 WORKLIST_REMOVE(&adp->ad_list);
 2500         if (adp->ad_freefrag != NULL) {
 2501                 if (delay)
 2502                         WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
 2503                             &adp->ad_freefrag->ff_list);
 2504                 else
 2505                         add_to_worklist(&adp->ad_freefrag->ff_list);
 2506         }
 2507         if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
 2508                 newdirblk = WK_NEWDIRBLK(wk);
 2509                 WORKLIST_REMOVE(&newdirblk->db_list);
 2510                 if (!LIST_EMPTY(&adp->ad_newdirblk))
 2511                         panic("free_allocdirect: extra newdirblk");
 2512                 if (delay)
 2513                         WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
 2514                             &newdirblk->db_list);
 2515                 else
 2516                         free_newdirblk(newdirblk);
 2517         }
 2518         WORKITEM_FREE(adp, D_ALLOCDIRECT);
 2519 }
 2520 
 2521 /*
 2522  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
 2523  * This routine must be called with splbio interrupts blocked.
 2524  */
 2525 static void
 2526 free_newdirblk(newdirblk)
 2527         struct newdirblk *newdirblk;
 2528 {
 2529         struct pagedep *pagedep;
 2530         struct diradd *dap;
 2531         int i;
 2532 
 2533         mtx_assert(&lk, MA_OWNED);
 2534         /*
 2535          * If the pagedep is still linked onto the directory buffer
 2536          * dependency chain, then some of the entries on the
 2537          * pd_pendinghd list may not be committed to disk yet. In
 2538          * this case, we will simply clear the NEWBLOCK flag and
 2539          * let the pd_pendinghd list be processed when the pagedep
 2540          * is next written. If the pagedep is no longer on the buffer
 2541          * dependency chain, then all the entries on the pd_pendinghd
 2542          * list are committed to disk and we can free them here.
 2543          */
 2544         pagedep = newdirblk->db_pagedep;
 2545         pagedep->pd_state &= ~NEWBLOCK;
 2546         if ((pagedep->pd_state & ONWORKLIST) == 0)
 2547                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 2548                         free_diradd(dap);
 2549         /*
 2550          * If no dependencies remain, the pagedep will be freed.
 2551          */
 2552         for (i = 0; i < DAHASHSZ; i++)
 2553                 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
 2554                         break;
 2555         if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
 2556                 LIST_REMOVE(pagedep, pd_hash);
 2557                 WORKITEM_FREE(pagedep, D_PAGEDEP);
 2558         }
 2559         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 2560 }
 2561 
 2562 /*
 2563  * Prepare an inode to be freed. The actual free operation is not
 2564  * done until the zero'ed inode has been written to disk.
 2565  */
 2566 void
 2567 softdep_freefile(pvp, ino, mode)
 2568         struct vnode *pvp;
 2569         ino_t ino;
 2570         int mode;
 2571 {
 2572         struct inode *ip = VTOI(pvp);
 2573         struct inodedep *inodedep;
 2574         struct freefile *freefile;
 2575 
 2576         /*
 2577          * This sets up the inode de-allocation dependency.
 2578          */
 2579         MALLOC(freefile, struct freefile *, sizeof(struct freefile),
 2580                 M_FREEFILE, M_SOFTDEP_FLAGS);
 2581         workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
 2582         freefile->fx_mode = mode;
 2583         freefile->fx_oldinum = ino;
 2584         freefile->fx_devvp = ip->i_devvp;
 2585         if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
 2586                 UFS_LOCK(ip->i_ump);
 2587                 ip->i_fs->fs_pendinginodes += 1;
 2588                 UFS_UNLOCK(ip->i_ump);
 2589         }
 2590 
 2591         /*
 2592          * If the inodedep does not exist, then the zero'ed inode has
 2593          * been written to disk. If the allocated inode has never been
 2594          * written to disk, then the on-disk inode is zero'ed. In either
 2595          * case we can free the file immediately.
 2596          */
 2597         ACQUIRE_LOCK(&lk);
 2598         if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
 2599             check_inode_unwritten(inodedep)) {
 2600                 FREE_LOCK(&lk);
 2601                 handle_workitem_freefile(freefile);
 2602                 return;
 2603         }
 2604         WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
 2605         FREE_LOCK(&lk);
 2606         ip->i_flag |= IN_MODIFIED;
 2607 }
 2608 
 2609 /*
 2610  * Check to see if an inode has never been written to disk. If
 2611  * so free the inodedep and return success, otherwise return failure.
 2612  * This routine must be called with splbio interrupts blocked.
 2613  *
 2614  * If we still have a bitmap dependency, then the inode has never
 2615  * been written to disk. Drop the dependency as it is no longer
 2616  * necessary since the inode is being deallocated. We set the
 2617  * ALLCOMPLETE flags since the bitmap now properly shows that the
 2618  * inode is not allocated. Even if the inode is actively being
 2619  * written, it has been rolled back to its zero'ed state, so we
 2620          * are assured that a zero inode is what is on the disk. For
 2621          * short-lived files, this change will usually remove all the
 2622  * dependencies from the inode so that it can be freed immediately.
 2623  */
 2624 static int
 2625 check_inode_unwritten(inodedep)
 2626         struct inodedep *inodedep;
 2627 {
 2628 
 2629         mtx_assert(&lk, MA_OWNED);
 2630         if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
 2631             !LIST_EMPTY(&inodedep->id_pendinghd) ||
 2632             !LIST_EMPTY(&inodedep->id_bufwait) ||
 2633             !LIST_EMPTY(&inodedep->id_inowait) ||
 2634             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 2635             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
 2636             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 2637             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 2638             inodedep->id_nlinkdelta != 0)
 2639                 return (0);
 2640 
 2641         /*
 2642          * Another process might be in initiate_write_inodeblock_ufs[12]
 2643          * trying to allocate memory without holding "Softdep Lock".
 2644          */
 2645         if ((inodedep->id_state & IOSTARTED) != 0 &&
 2646             inodedep->id_savedino1 == NULL)
 2647                 return (0);
 2648 
 2649         inodedep->id_state |= ALLCOMPLETE;
 2650         LIST_REMOVE(inodedep, id_deps);
 2651         inodedep->id_buf = NULL;
 2652         if (inodedep->id_state & ONWORKLIST)
 2653                 WORKLIST_REMOVE(&inodedep->id_list);
 2654         if (inodedep->id_savedino1 != NULL) {
 2655                 FREE(inodedep->id_savedino1, M_SAVEDINO);
 2656                 inodedep->id_savedino1 = NULL;
 2657         }
 2658         if (free_inodedep(inodedep) == 0)
 2659                 panic("check_inode_unwritten: busy inode");
 2660         return (1);
 2661 }
 2662 
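      /*
       * Editor's note (annotation): both call sites in this file follow the
       * same pattern -- softdep_freefile() above and handle_workitem_remove()
       * below treat a return of 1 as "the inode never reached the disk, so
       * the free may proceed immediately".
       */
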
 2663 /*
 2664  * Try to free an inodedep structure. Return 1 if it could be freed.
 2665  */
 2666 static int
 2667 free_inodedep(inodedep)
 2668         struct inodedep *inodedep;
 2669 {
 2670 
 2671         mtx_assert(&lk, MA_OWNED);
 2672         if ((inodedep->id_state & ONWORKLIST) != 0 ||
 2673             (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
 2674             !LIST_EMPTY(&inodedep->id_pendinghd) ||
 2675             !LIST_EMPTY(&inodedep->id_bufwait) ||
 2676             !LIST_EMPTY(&inodedep->id_inowait) ||
 2677             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 2678             !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
 2679             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 2680             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 2681             inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
 2682                 return (0);
 2683         LIST_REMOVE(inodedep, id_hash);
 2684         WORKITEM_FREE(inodedep, D_INODEDEP);
 2685         num_inodedep -= 1;
 2686         return (1);
 2687 }
 2688 
 2689 /*
 2690  * This workitem routine performs the block de-allocation.
 2691  * The workitem is added to the pending list after the updated
 2692  * inode block has been written to disk.  As mentioned above,
 2693  * checks regarding the number of blocks de-allocated (compared
 2694  * to the number of blocks allocated for the file) are also
 2695  * performed in this function.
 2696  */
 2697 static void
 2698 handle_workitem_freeblocks(freeblks, flags)
 2699         struct freeblks *freeblks;
 2700         int flags;
 2701 {
 2702         struct inode *ip;
 2703         struct vnode *vp;
 2704         struct fs *fs;
 2705         struct ufsmount *ump;
 2706         int i, nblocks, level, bsize;
 2707         ufs2_daddr_t bn, blocksreleased = 0;
 2708         int error, allerror = 0;
 2709         ufs_lbn_t baselbns[NIADDR], tmpval;
 2710         int fs_pendingblocks;
 2711 
 2712         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 2713         fs = ump->um_fs;
 2714         fs_pendingblocks = 0;
 2715         tmpval = 1;
 2716         baselbns[0] = NDADDR;
 2717         for (i = 1; i < NIADDR; i++) {
 2718                 tmpval *= NINDIR(fs);
 2719                 baselbns[i] = baselbns[i - 1] + tmpval;
 2720         }
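              /*
               * Editor's worked example (annotation): baselbns[i] is the
               * first logical block reached through the level-i indirect
               * pointer.  For example, with NDADDR == 12 direct blocks and
               * NINDIR(fs) == 2048 pointers per indirect block:
               *
               *        baselbns[0] = 12
               *        baselbns[1] = 12 + 2048
               *        baselbns[2] = 12 + 2048 + 2048*2048
               */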
 2721         nblocks = btodb(fs->fs_bsize);
 2722         blocksreleased = 0;
 2723         /*
 2724          * Release all extended attribute blocks or frags.
 2725          */
 2726         if (freeblks->fb_oldextsize > 0) {
 2727                 for (i = (NXADDR - 1); i >= 0; i--) {
 2728                         if ((bn = freeblks->fb_eblks[i]) == 0)
 2729                                 continue;
 2730                         bsize = sblksize(fs, freeblks->fb_oldextsize, i);
 2731                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
 2732                             freeblks->fb_previousinum);
 2733                         blocksreleased += btodb(bsize);
 2734                 }
 2735         }
 2736         /*
 2737          * Release all data blocks or frags.
 2738          */
 2739         if (freeblks->fb_oldsize > 0) {
 2740                 /*
 2741                  * Indirect blocks first.
 2742                  */
 2743                 for (level = (NIADDR - 1); level >= 0; level--) {
 2744                         if ((bn = freeblks->fb_iblks[level]) == 0)
 2745                                 continue;
 2746                         if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
 2747                             level, baselbns[level], &blocksreleased)) != 0)
 2748                                 allerror = error;
 2749                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
 2750                             fs->fs_bsize, freeblks->fb_previousinum);
 2751                         fs_pendingblocks += nblocks;
 2752                         blocksreleased += nblocks;
 2753                 }
 2754                 /*
 2755                  * All direct blocks or frags.
 2756                  */
 2757                 for (i = (NDADDR - 1); i >= 0; i--) {
 2758                         if ((bn = freeblks->fb_dblks[i]) == 0)
 2759                                 continue;
 2760                         bsize = sblksize(fs, freeblks->fb_oldsize, i);
 2761                         ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
 2762                             freeblks->fb_previousinum);
 2763                         fs_pendingblocks += btodb(bsize);
 2764                         blocksreleased += btodb(bsize);
 2765                 }
 2766         }
 2767         UFS_LOCK(ump);
 2768         fs->fs_pendingblocks -= fs_pendingblocks;
 2769         UFS_UNLOCK(ump);
 2770         /*
 2771          * If we still have not finished background cleanup, then check
 2772          * to see if the block count needs to be adjusted.
 2773          */
 2774         if (freeblks->fb_chkcnt != blocksreleased &&
 2775             (fs->fs_flags & FS_UNCLEAN) != 0 &&
 2776             ffs_vget(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
 2777             (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
 2778                 ip = VTOI(vp);
 2779                 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) +
 2780                     freeblks->fb_chkcnt - blocksreleased);
 2781                 ip->i_flag |= IN_CHANGE;
 2782                 vput(vp);
 2783         }
 2784 
 2785 #ifdef DIAGNOSTIC
 2786         if (freeblks->fb_chkcnt != blocksreleased &&
 2787             ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
 2788                 printf("handle_workitem_freeblocks: block count\n");
 2789         if (allerror)
 2790                 softdep_error("handle_workitem_freeblocks", allerror);
 2791 #endif /* DIAGNOSTIC */
 2792 
 2793         ACQUIRE_LOCK(&lk);
 2794         WORKITEM_FREE(freeblks, D_FREEBLKS);
 2795         FREE_LOCK(&lk);
 2796 }
 2797 
 2798 /*
 2799  * Release blocks associated with the inode ip and stored in the indirect
 2800  * block dbn. If level is greater than SINGLE, the block is an indirect block
 2801  * and recursive calls to indir_trunc must be used to cleanse other indirect
 2802  * blocks.
 2803  */
 2804 static int
 2805 indir_trunc(freeblks, dbn, level, lbn, countp)
 2806         struct freeblks *freeblks;
 2807         ufs2_daddr_t dbn;
 2808         int level;
 2809         ufs_lbn_t lbn;
 2810         ufs2_daddr_t *countp;
 2811 {
 2812         struct buf *bp;
 2813         struct fs *fs;
 2814         struct worklist *wk;
 2815         struct indirdep *indirdep;
 2816         struct ufsmount *ump;
 2817         ufs1_daddr_t *bap1 = NULL;
 2818         ufs2_daddr_t nb, *bap2 = NULL;
 2819         ufs_lbn_t lbnadd;
 2820         int i, nblocks, ufs1fmt;
 2821         int error, allerror = 0;
 2822         int fs_pendingblocks;
 2823 
 2824         ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 2825         fs = ump->um_fs;
 2826         fs_pendingblocks = 0;
 2827         lbnadd = 1;
 2828         for (i = level; i > 0; i--)
 2829                 lbnadd *= NINDIR(fs);
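              /*
               * Editor's note (annotation): lbnadd is NINDIR(fs)^level, the
               * number of logical blocks spanned by each pointer at this
               * level, so slot i of this indirect block maps logical block
               * lbn + i * lbnadd in the recursive calls below.
               */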
 2830         /*
 2831          * Get buffer of block pointers to be freed. This routine is not
 2832          * called until the zero'ed inode has been written, so it is safe
 2833          * to free blocks as they are encountered. Because the inode has
 2834          * been zero'ed, calls to bmap on these blocks will fail. So, we
 2835          * have to use the on-disk address and the block device for the
 2836          * filesystem to look them up. If the file was deleted before its
 2837          * indirect blocks were all written to disk, the routine that set
 2838          * us up (deallocate_dependencies) will have arranged to leave
 2839          * a complete copy of the indirect block in memory for our use.
 2840          * Otherwise we have to read the blocks in from the disk.
 2841          */
 2842 #ifdef notyet
 2843         bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
 2844             GB_NOCREAT);
 2845 #else
 2846         bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
 2847 #endif
 2848         ACQUIRE_LOCK(&lk);
 2849         if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 2850                 if (wk->wk_type != D_INDIRDEP ||
 2851                     (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
 2852                     (indirdep->ir_state & GOINGAWAY) == 0)
 2853                         panic("indir_trunc: lost indirdep");
 2854                 WORKLIST_REMOVE(wk);
 2855                 WORKITEM_FREE(indirdep, D_INDIRDEP);
 2856                 if (!LIST_EMPTY(&bp->b_dep))
 2857                         panic("indir_trunc: dangling dep");
 2858                 ump->um_numindirdeps -= 1;
 2859                 FREE_LOCK(&lk);
 2860         } else {
 2861 #ifdef notyet
 2862                 if (bp)
 2863                         brelse(bp);
 2864 #endif
 2865                 FREE_LOCK(&lk);
 2866                 error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
 2867                     NOCRED, &bp);
 2868                 if (error) {
 2869                         brelse(bp);
 2870                         return (error);
 2871                 }
 2872         }
 2873         /*
 2874          * Recursively free indirect blocks.
 2875          */
 2876         if (ump->um_fstype == UFS1) {
 2877                 ufs1fmt = 1;
 2878                 bap1 = (ufs1_daddr_t *)bp->b_data;
 2879         } else {
 2880                 ufs1fmt = 0;
 2881                 bap2 = (ufs2_daddr_t *)bp->b_data;
 2882         }
 2883         nblocks = btodb(fs->fs_bsize);
 2884         for (i = NINDIR(fs) - 1; i >= 0; i--) {
 2885                 if (ufs1fmt)
 2886                         nb = bap1[i];
 2887                 else
 2888                         nb = bap2[i];
 2889                 if (nb == 0)
 2890                         continue;
 2891                 if (level != 0) {
 2892                         if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
 2893                              level - 1, lbn + (i * lbnadd), countp)) != 0)
 2894                                 allerror = error;
 2895                 }
 2896                 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
 2897                     freeblks->fb_previousinum);
 2898                 fs_pendingblocks += nblocks;
 2899                 *countp += nblocks;
 2900         }
 2901         UFS_LOCK(ump);
 2902         fs->fs_pendingblocks -= fs_pendingblocks;
 2903         UFS_UNLOCK(ump);
 2904         bp->b_flags |= B_INVAL | B_NOCACHE;
 2905         brelse(bp);
 2906         return (allerror);
 2907 }
 2908 
 2909 /*
 2910  * Free an allocindir.
 2911  * This routine must be called with splbio interrupts blocked.
 2912  */
 2913 static void
 2914 free_allocindir(aip, inodedep)
 2915         struct allocindir *aip;
 2916         struct inodedep *inodedep;
 2917 {
 2918         struct freefrag *freefrag;
 2919 
 2920         mtx_assert(&lk, MA_OWNED);
 2921         if ((aip->ai_state & DEPCOMPLETE) == 0)
 2922                 LIST_REMOVE(aip, ai_deps);
 2923         if (aip->ai_state & ONWORKLIST)
 2924                 WORKLIST_REMOVE(&aip->ai_list);
 2925         LIST_REMOVE(aip, ai_next);
 2926         if ((freefrag = aip->ai_freefrag) != NULL) {
 2927                 if (inodedep == NULL)
 2928                         add_to_worklist(&freefrag->ff_list);
 2929                 else
 2930                         WORKLIST_INSERT(&inodedep->id_bufwait,
 2931                             &freefrag->ff_list);
 2932         }
 2933         WORKITEM_FREE(aip, D_ALLOCINDIR);
 2934 }
 2935 
 2936 /*
 2937  * Directory entry addition dependencies.
 2938  * 
 2939  * When adding a new directory entry, the inode (with its incremented link
 2940  * count) must be written to disk before the directory entry's pointer to it.
 2941  * Also, if the inode is newly allocated, the corresponding freemap must be
 2942  * updated (on disk) before the directory entry's pointer. These requirements
 2943  * are met via undo/redo on the directory entry's pointer, which consists
 2944  * simply of the inode number.
 2945  * 
 2946  * As directory entries are added and deleted, the free space within a
 2947  * directory block can become fragmented.  The ufs filesystem will compact
 2948  * a fragmented directory block to make space for a new entry. When this
 2949  * occurs, the offsets of previously added entries change. Any "diradd"
 2950  * dependency structures corresponding to these entries must be updated with
 2951  * the new offsets.
 2952  */
 2953 
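      /*
       * Editor's sketch (annotation): the undo/redo protocol described
       * above, in timeline form.  The undo appears in
       * initiate_write_filepage() below; the redo is done by the matching
       * I/O completion routine (handle_written_filepage(), later in this
       * file):
       *
       *        ep->d_ino = 0;                        undo: entry unallocated on disk
       *        ... directory block write ...
       *        ep->d_ino = dap->da_newinum;          redo: in-core copy restored
       *
       * The entry is only allowed to reach the disk with the real inode
       * number once the inode itself (with its bumped link count) is
       * safely written.
       */
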
 2954 /*
 2955  * This routine is called after the in-memory inode's link
 2956  * count has been incremented, but before the directory entry's
 2957  * pointer to the inode has been set.
 2958  */
 2959 int
 2960 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 2961         struct buf *bp;         /* buffer containing directory block */
 2962         struct inode *dp;       /* inode for directory */
 2963         off_t diroffset;        /* offset of new entry in directory */
 2964         ino_t newinum;          /* inode referenced by new directory entry */
 2965         struct buf *newdirbp;   /* non-NULL => contents of new mkdir */
 2966         int isnewblk;           /* entry is in a newly allocated block */
 2967 {
 2968         int offset;             /* offset of new entry within directory block */
 2969         ufs_lbn_t lbn;          /* block in directory containing new entry */
 2970         struct fs *fs;
 2971         struct diradd *dap;
 2972         struct allocdirect *adp;
 2973         struct pagedep *pagedep;
 2974         struct inodedep *inodedep;
 2975         struct newdirblk *newdirblk = NULL;
 2976         struct mkdir *mkdir1, *mkdir2;
 2977         struct mount *mp;
 2978 
 2979         /*
 2980          * Whiteouts have no dependencies.
 2981          */
 2982         if (newinum == WINO) {
 2983                 if (newdirbp != NULL)
 2984                         bdwrite(newdirbp);
 2985                 return (0);
 2986         }
 2987         mp = UFSTOVFS(dp->i_ump);
 2988         fs = dp->i_fs;
 2989         lbn = lblkno(fs, diroffset);
 2990         offset = blkoff(fs, diroffset);
 2991         MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
 2992                 M_SOFTDEP_FLAGS|M_ZERO);
 2993         workitem_alloc(&dap->da_list, D_DIRADD, mp);
 2994         dap->da_offset = offset;
 2995         dap->da_newinum = newinum;
 2996         dap->da_state = ATTACHED;
 2997         if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
 2998                 MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
 2999                     M_NEWDIRBLK, M_SOFTDEP_FLAGS);
 3000                 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
 3001         }
 3002         if (newdirbp == NULL) {
 3003                 dap->da_state |= DEPCOMPLETE;
 3004                 ACQUIRE_LOCK(&lk);
 3005         } else {
 3006                 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
 3007                 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
 3008                     M_SOFTDEP_FLAGS);
 3009                 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
 3010                 mkdir1->md_state = MKDIR_BODY;
 3011                 mkdir1->md_diradd = dap;
 3012                 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
 3013                     M_SOFTDEP_FLAGS);
 3014                 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
 3015                 mkdir2->md_state = MKDIR_PARENT;
 3016                 mkdir2->md_diradd = dap;
 3017                 /*
 3018                  * Dependency on "." and ".." being written to disk.
 3019                  */
 3020                 mkdir1->md_buf = newdirbp;
 3021                 ACQUIRE_LOCK(&lk);
 3022                 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
 3023                 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
 3024                 FREE_LOCK(&lk);
 3025                 bdwrite(newdirbp);
 3026                 /*
 3027                  * Dependency on link count increase for parent directory
 3028                  */
 3029                 ACQUIRE_LOCK(&lk);
 3030                 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
 3031                     || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 3032                         dap->da_state &= ~MKDIR_PARENT;
 3033                         WORKITEM_FREE(mkdir2, D_MKDIR);
 3034                 } else {
 3035                         LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
 3036                         WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
 3037                 }
 3038         }
 3039         /*
 3040          * Link into parent directory pagedep to await its being written.
 3041          */
 3042         if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
 3043                 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 3044         dap->da_pagedep = pagedep;
 3045         LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
 3046             da_pdlist);
 3047         /*
 3048          * Link into its inodedep. Put it on the id_bufwait list if the inode
 3049          * is not yet written. If it is written, do the post-inode write
 3050          * processing to put it on the id_pendinghd list.
 3051          */
 3052         (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
 3053         if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
 3054                 diradd_inode_written(dap, inodedep);
 3055         else
 3056                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 3057         if (isnewblk) {
 3058                 /*
 3059                  * Directories growing into indirect blocks are rare
 3060                  * enough, and new block allocation in those cases
 3061                  * rarer still, that we choose not to bother tracking
 3062                  * them. Rather we simply force the new directory
 3063                  * entry to disk.
 3064                  */
 3065                 if (lbn >= NDADDR) {
 3066                         FREE_LOCK(&lk);
 3067                         /*
 3068                          * We only have a new allocation when at the
 3069                          * beginning of a new block, not when we are
 3070                          * expanding into an existing block.
 3071                          */
 3072                         if (blkoff(fs, diroffset) == 0)
 3073                                 return (1);
 3074                         return (0);
 3075                 }
 3076                 /*
 3077                  * We only have a new allocation when at the beginning
 3078                  * of a new fragment, not when we are expanding into an
 3079                  * existing fragment. Also, there is nothing to do if we
 3080                  * are already tracking this block.
 3081                  */
 3082                 if (fragoff(fs, diroffset) != 0) {
 3083                         FREE_LOCK(&lk);
 3084                         return (0);
 3085                 }
 3086                 if ((pagedep->pd_state & NEWBLOCK) != 0) {
 3087                         WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 3088                         FREE_LOCK(&lk);
 3089                         return (0);
 3090                 }
 3091                 /*
 3092                  * Find our associated allocdirect and have it track us.
 3093                  */
 3094                 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
 3095                         panic("softdep_setup_directory_add: lost inodedep");
 3096                 adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
 3097                 if (adp == NULL || adp->ad_lbn != lbn)
 3098                         panic("softdep_setup_directory_add: lost entry");
 3099                 pagedep->pd_state |= NEWBLOCK;
 3100                 newdirblk->db_pagedep = pagedep;
 3101                 WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
 3102         }
 3103         FREE_LOCK(&lk);
 3104         return (0);
 3105 }
 3106 
 3107 /*
 3108  * This procedure is called to change the offset of a directory
 3109  * entry when compacting a directory block; the block must be owned
 3110  * exclusively by the caller. Note that the actual entry movement
 3111  * must be done in this procedure to ensure that no I/O completions
 3112  * occur while the move is in progress.
 3113  */
 3114 void 
 3115 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
 3116         struct inode *dp;       /* inode for directory */
 3117         caddr_t base;           /* address of dp->i_offset */
 3118         caddr_t oldloc;         /* address of old directory location */
 3119         caddr_t newloc;         /* address of new directory location */
 3120         int entrysize;          /* size of directory entry */
 3121 {
 3122         int offset, oldoffset, newoffset;
 3123         struct pagedep *pagedep;
 3124         struct diradd *dap;
 3125         ufs_lbn_t lbn;
 3126 
 3127         ACQUIRE_LOCK(&lk);
 3128         lbn = lblkno(dp->i_fs, dp->i_offset);
 3129         offset = blkoff(dp->i_fs, dp->i_offset);
 3130         if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
 3131                 goto done;
 3132         oldoffset = offset + (oldloc - base);
 3133         newoffset = offset + (newloc - base);
 3134 
 3135         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
 3136                 if (dap->da_offset != oldoffset)
 3137                         continue;
 3138                 dap->da_offset = newoffset;
 3139                 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
 3140                         break;
 3141                 LIST_REMOVE(dap, da_pdlist);
 3142                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
 3143                     dap, da_pdlist);
 3144                 break;
 3145         }
 3146         if (dap == NULL) {
 3147 
 3148                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
 3149                         if (dap->da_offset == oldoffset) {
 3150                                 dap->da_offset = newoffset;
 3151                                 break;
 3152                         }
 3153                 }
 3154         }
 3155 done:
 3156         bcopy(oldloc, newloc, entrysize);
 3157         FREE_LOCK(&lk);
 3158 }
 3159 
 3160 /*
 3161  * Free a diradd dependency structure. This routine must be called
 3162  * with splbio interrupts blocked.
 3163  */
 3164 static void
 3165 free_diradd(dap)
 3166         struct diradd *dap;
 3167 {
 3168         struct dirrem *dirrem;
 3169         struct pagedep *pagedep;
 3170         struct inodedep *inodedep;
 3171         struct mkdir *mkdir, *nextmd;
 3172 
 3173         mtx_assert(&lk, MA_OWNED);
 3174         WORKLIST_REMOVE(&dap->da_list);
 3175         LIST_REMOVE(dap, da_pdlist);
 3176         if ((dap->da_state & DIRCHG) == 0) {
 3177                 pagedep = dap->da_pagedep;
 3178         } else {
 3179                 dirrem = dap->da_previous;
 3180                 pagedep = dirrem->dm_pagedep;
 3181                 dirrem->dm_dirinum = pagedep->pd_ino;
 3182                 add_to_worklist(&dirrem->dm_list);
 3183         }
 3184         if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
 3185             0, &inodedep) != 0)
 3186                 (void) free_inodedep(inodedep);
 3187         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
 3188                 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
 3189                         nextmd = LIST_NEXT(mkdir, md_mkdirs);
 3190                         if (mkdir->md_diradd != dap)
 3191                                 continue;
 3192                         dap->da_state &= ~mkdir->md_state;
 3193                         WORKLIST_REMOVE(&mkdir->md_list);
 3194                         LIST_REMOVE(mkdir, md_mkdirs);
 3195                         WORKITEM_FREE(mkdir, D_MKDIR);
 3196                 }
 3197                 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
 3198                         panic("free_diradd: unfound ref");
 3199         }
 3200         WORKITEM_FREE(dap, D_DIRADD);
 3201 }
 3202 
 3203 /*
 3204  * Directory entry removal dependencies.
 3205  * 
 3206  * When removing a directory entry, the entry's inode pointer must be
 3207  * zero'ed on disk before the corresponding inode's link count is decremented
 3208  * (possibly freeing the inode for re-use). This dependency is handled by
 3209  * updating the directory entry but delaying the inode count reduction until
 3210  * after the directory block has been written to disk. After this point, the
 3211  * inode count can be decremented whenever it is convenient.
 3212  */
 3213 
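      /*
       * Editor's sketch (annotation): the ordering described above as a
       * timeline:
       *
       *        1. in memory: the entry is removed and i_effnlink is
       *           dropped by the caller
       *        2. the directory block is written to disk
       *        3. only then is the on-disk link count lowered and, if it
       *           reaches zero, the inode and its blocks freed
       *
       * Step 3 is performed by handle_workitem_remove() below.
       */
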
 3214 /*
 3215  * This routine should be called immediately after removing
 3216  * a directory entry.  The inode's link count should not be
 3217  * decremented by the calling procedure -- the soft updates
 3218  * code will do this task when it is safe.
 3219  */
 3220 void 
 3221 softdep_setup_remove(bp, dp, ip, isrmdir)
 3222         struct buf *bp;         /* buffer containing directory block */
 3223         struct inode *dp;       /* inode for the directory being modified */
 3224         struct inode *ip;       /* inode for directory entry being removed */
 3225         int isrmdir;            /* indicates if doing RMDIR */
 3226 {
 3227         struct dirrem *dirrem, *prevdirrem;
 3228 
 3229         /*
 3230          * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
 3231          */
 3232         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 3233 
 3234         /*
 3235          * If the COMPLETE flag is clear, then there were no active
 3236          * entries and we want to roll back to a zeroed entry until
 3237          * the new inode is committed to disk. If the COMPLETE flag is
 3238          * set then we have deleted an entry that never made it to
 3239          * disk. If the entry we deleted resulted from a name change,
 3240          * then the old name still resides on disk. We cannot delete
 3241          * its inode (returned to us in prevdirrem) until the zeroed
 3242          * directory entry gets to disk. The new inode has never been
 3243          * referenced on the disk, so can be deleted immediately.
 3244          */
 3245         if ((dirrem->dm_state & COMPLETE) == 0) {
 3246                 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
 3247                     dm_next);
 3248                 FREE_LOCK(&lk);
 3249         } else {
 3250                 if (prevdirrem != NULL)
 3251                         LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
 3252                             prevdirrem, dm_next);
 3253                 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
 3254                 FREE_LOCK(&lk);
 3255                 handle_workitem_remove(dirrem, NULL);
 3256         }
 3257 }
 3258 
 3259 /*
 3260  * Allocate a new dirrem if appropriate and return it along with
 3261  * its associated pagedep. Called without a lock, returns with lock.
 3262  */
 3263 static long num_dirrem;         /* number of dirrem allocated */
 3264 static struct dirrem *
 3265 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
 3266         struct buf *bp;         /* buffer containing directory block */
 3267         struct inode *dp;       /* inode for the directory being modified */
 3268         struct inode *ip;       /* inode for directory entry being removed */
 3269         int isrmdir;            /* indicates if doing RMDIR */
 3270         struct dirrem **prevdirremp; /* previously referenced inode, if any */
 3271 {
 3272         int offset;
 3273         ufs_lbn_t lbn;
 3274         struct diradd *dap;
 3275         struct dirrem *dirrem;
 3276         struct pagedep *pagedep;
 3277 
 3278         /*
 3279          * Whiteouts have no deletion dependencies.
 3280          */
 3281         if (ip == NULL)
 3282                 panic("newdirrem: whiteout");
 3283         /*
 3284          * If we are over our limit, try to improve the situation.
 3285          * Limiting the number of dirrem structures will also limit
 3286          * the number of freefile and freeblks structures.
 3287          */
 3288         ACQUIRE_LOCK(&lk);
 3289         if (num_dirrem > max_softdeps / 2)
 3290                 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
 3291         num_dirrem += 1;
 3292         FREE_LOCK(&lk);
 3293         MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
 3294                 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
 3295         workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
 3296         dirrem->dm_state = isrmdir ? RMDIR : 0;
 3297         dirrem->dm_oldinum = ip->i_number;
 3298         *prevdirremp = NULL;
 3299 
 3300         ACQUIRE_LOCK(&lk);
 3301         lbn = lblkno(dp->i_fs, dp->i_offset);
 3302         offset = blkoff(dp->i_fs, dp->i_offset);
 3303         if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
 3304                 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 3305         dirrem->dm_pagedep = pagedep;
 3306         /*
 3307          * Check for a diradd dependency for the same directory entry.
 3308          * If present, then both dependencies become obsolete and can
 3309          * be de-allocated. Check for an entry on both the pd_diraddhd
 3310          * list and the pd_pendinghd list.
 3311          */
 3312 
 3313         LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
 3314                 if (dap->da_offset == offset)
 3315                         break;
 3316         if (dap == NULL) {
 3317 
 3318                 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
 3319                         if (dap->da_offset == offset)
 3320                                 break;
 3321                 if (dap == NULL)
 3322                         return (dirrem);
 3323         }
 3324         /*
 3325          * Must be ATTACHED at this point.
 3326          */
 3327         if ((dap->da_state & ATTACHED) == 0)
 3328                 panic("newdirrem: not ATTACHED");
 3329         if (dap->da_newinum != ip->i_number)
 3330                 panic("newdirrem: inum %ju should be %ju",
 3331                     (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
 3332         /*
 3333          * If we are deleting a changed name that never made it to disk,
 3334          * then return the dirrem describing the previous inode (which
 3335          * represents the inode currently referenced from this entry on disk).
 3336          */
 3337         if ((dap->da_state & DIRCHG) != 0) {
 3338                 *prevdirremp = dap->da_previous;
 3339                 dap->da_state &= ~DIRCHG;
 3340                 dap->da_pagedep = pagedep;
 3341         }
 3342         /*
 3343          * We are deleting an entry that never made it to disk.
 3344          * Mark it COMPLETE so we can delete its inode immediately.
 3345          */
 3346         dirrem->dm_state |= COMPLETE;
 3347         free_diradd(dap);
 3348         return (dirrem);
 3349 }
 3350 
 3351 /*
 3352  * Directory entry change dependencies.
 3353  * 
 3354  * Changing an existing directory entry requires that an add operation
 3355  * be completed first followed by a deletion. The semantics for the addition
 3356  * are identical to the description of adding a new entry above except
 3357  * that the rollback is to the old inode number rather than zero. Once
 3358  * the addition dependency is completed, the removal is done as described
 3359  * in the removal routine above.
 3360  */
 3361 
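      /*
       * Editor's sketch (annotation): for a changed entry the rollback
       * value is the old inode number rather than zero; the undo in
       * initiate_write_filepage() below distinguishes the two cases:
       *
       *        if (dap->da_state & DIRCHG)
       *                ep->d_ino = dap->da_previous->dm_oldinum;
       *        else
       *                ep->d_ino = 0;
       *
       * so an unsynced rename leaves the old, still-valid name on disk.
       */
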
 3362 /*
 3363  * This routine should be called immediately after changing
 3364  * a directory entry.  The inode's link count should not be
 3365  * decremented by the calling procedure -- the soft updates
 3366  * code will perform this task when it is safe.
 3367  */
 3368 void 
 3369 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 3370         struct buf *bp;         /* buffer containing directory block */
 3371         struct inode *dp;       /* inode for the directory being modified */
 3372         struct inode *ip;       /* inode for directory entry being removed */
 3373         ino_t newinum;          /* new inode number for changed entry */
 3374         int isrmdir;            /* indicates if doing RMDIR */
 3375 {
 3376         int offset;
 3377         struct diradd *dap = NULL;
 3378         struct dirrem *dirrem, *prevdirrem;
 3379         struct pagedep *pagedep;
 3380         struct inodedep *inodedep;
 3381         struct mount *mp;
 3382 
 3383         offset = blkoff(dp->i_fs, dp->i_offset);
 3384         mp = UFSTOVFS(dp->i_ump);
 3385 
 3386         /*
 3387          * Whiteouts do not need diradd dependencies.
 3388          */
 3389         if (newinum != WINO) {
 3390                 MALLOC(dap, struct diradd *, sizeof(struct diradd),
 3391                     M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
 3392                 workitem_alloc(&dap->da_list, D_DIRADD, mp);
 3393                 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
 3394                 dap->da_offset = offset;
 3395                 dap->da_newinum = newinum;
 3396         }
 3397 
 3398         /*
 3399          * Allocate a new dirrem and ACQUIRE_LOCK.
 3400          */
 3401         dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 3402         pagedep = dirrem->dm_pagedep;
 3403         /*
 3404          * The possible values for isrmdir:
 3405          *      0 - non-directory file rename
 3406          *      1 - directory rename within same directory
 3407          *   inum - directory rename to new directory of given inode number
 3408          * When renaming to a new directory, we are both deleting and
 3409          * creating a new directory entry, so the link count on the new
 3410          * directory should not change. Thus we do not need the followup
 3411          * dirrem which is usually done in handle_workitem_remove. We set
 3412          * the DIRCHG flag to tell handle_workitem_remove to skip the 
 3413          * followup dirrem.
 3414          */
 3415         if (isrmdir > 1)
 3416                 dirrem->dm_state |= DIRCHG;
 3417 
 3418         /*
 3419          * Whiteouts have no additional dependencies,
 3420          * so just put the dirrem on the correct list.
 3421          */
 3422         if (newinum == WINO) {
 3423                 if ((dirrem->dm_state & COMPLETE) == 0) {
 3424                         LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
 3425                             dm_next);
 3426                 } else {
 3427                         dirrem->dm_dirinum = pagedep->pd_ino;
 3428                         add_to_worklist(&dirrem->dm_list);
 3429                 }
 3430                 FREE_LOCK(&lk);
 3431                 return;
 3432         }
 3433 
 3434         /*
 3435          * If the COMPLETE flag is clear, then there were no active
 3436          * entries and we want to roll back to the previous inode until
 3437          * the new inode is committed to disk. If the COMPLETE flag is
 3438          * set, then we have deleted an entry that never made it to disk.
 3439          * If the entry we deleted resulted from a name change, then the old
 3440          * inode reference still resides on disk. Any rollback that we do
 3441          * needs to be to that old inode (returned to us in prevdirrem). If
 3442          * the entry we deleted resulted from a create, then there is
 3443          * no entry on the disk, so we want to roll back to zero rather
 3444          * than the uncommitted inode. In either of the COMPLETE cases we
 3445          * want to immediately free the unwritten and unreferenced inode.
 3446          */
 3447         if ((dirrem->dm_state & COMPLETE) == 0) {
 3448                 dap->da_previous = dirrem;
 3449         } else {
 3450                 if (prevdirrem != NULL) {
 3451                         dap->da_previous = prevdirrem;
 3452                 } else {
 3453                         dap->da_state &= ~DIRCHG;
 3454                         dap->da_pagedep = pagedep;
 3455                 }
 3456                 dirrem->dm_dirinum = pagedep->pd_ino;
 3457                 add_to_worklist(&dirrem->dm_list);
 3458         }
 3459         /*
 3460          * Link into its inodedep. Put it on the id_bufwait list if the inode
 3461          * is not yet written. If it is written, do the post-inode write
 3462          * processing to put it on the id_pendinghd list.
 3463          */
 3464         if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
 3465             (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 3466                 dap->da_state |= COMPLETE;
 3467                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 3468                 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 3469         } else {
 3470                 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
 3471                     dap, da_pdlist);
 3472                 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 3473         }
 3474         FREE_LOCK(&lk);
 3475 }
 3476 
 3477 /*
 3478  * Called whenever the link count on an inode is changed.
 3479  * It creates an inode dependency so that the new reference(s)
 3480  * to the inode cannot be committed to disk until the updated
 3481  * inode has been written.
 3482  */
 3483 void
 3484 softdep_change_linkcnt(ip)
 3485         struct inode *ip;       /* the inode with the increased link count */
 3486 {
 3487         struct inodedep *inodedep;
 3488 
 3489         ACQUIRE_LOCK(&lk);
 3490         (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
 3491             DEPALLOC, &inodedep);
 3492         if (ip->i_nlink < ip->i_effnlink)
 3493                 panic("softdep_change_linkcnt: bad delta");
 3494         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 3495         FREE_LOCK(&lk);
 3496 }
 3497 
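      /*
       * Editor's worked example (annotation): unlinking one of a file's two
       * names drops i_effnlink from 2 to 1 at once, while i_nlink remains 2
       * until handle_workitem_remove() may safely lower it, so
       * id_nlinkdelta == 2 - 1 == 1 here.  softdep_load_inodeblock() later
       * subtracts this delta to recover i_effnlink when the inode is read
       * back from disk.
       */
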
 3498 /*
 3499  * Called when the effective link count and the reference count
 3500  * on an inode drops to zero. At this point there are no names
 3501  * referencing the file in the filesystem and no active file
 3502  * references. The space associated with the file will be freed
 3503  * as soon as the necessary soft dependencies are cleared.
 3504  */
 3505 void
 3506 softdep_releasefile(ip)
 3507         struct inode *ip;       /* inode with the zero effective link count */
 3508 {
 3509         struct inodedep *inodedep;
 3510         struct fs *fs;
 3511         int extblocks;
 3512 
 3513         if (ip->i_effnlink > 0)
 3514                 panic("softdep_releasefile: file still referenced");
 3515         /*
 3516          * We may be called several times as the on-disk link count
 3517          * drops to zero. We only want to account for the space once.
 3518          */
 3519         if (ip->i_flag & IN_SPACECOUNTED)
 3520                 return;
 3521         /*
 3522          * We have to deactivate a snapshot otherwise copyonwrites may
 3523          * add blocks and the cleanup may remove blocks after we have
 3524          * tried to account for them.
 3525          */
 3526         if ((ip->i_flags & SF_SNAPSHOT) != 0)
 3527                 ffs_snapremove(ITOV(ip));
 3528         /*
 3529          * If we are tracking an nlinkdelta, we have to also remember
 3530          * whether we accounted for the freed space yet.
 3531          */
 3532         ACQUIRE_LOCK(&lk);
 3533         if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep)))
 3534                 inodedep->id_state |= SPACECOUNTED;
 3535         FREE_LOCK(&lk);
 3536         fs = ip->i_fs;
 3537         extblocks = 0;
 3538         if (fs->fs_magic == FS_UFS2_MAGIC)
 3539                 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 3540         UFS_LOCK(ip->i_ump);
 3541         ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
 3542         ip->i_fs->fs_pendinginodes += 1;
 3543         UFS_UNLOCK(ip->i_ump);
 3544         ip->i_flag |= IN_SPACECOUNTED;
 3545 }
 3546 
 3547 /*
 3548  * This workitem decrements the inode's link count.
 3549  * If the link count reaches zero, the file is removed.
 3550  */
 3551 static void 
 3552 handle_workitem_remove(dirrem, xp)
 3553         struct dirrem *dirrem;
 3554         struct vnode *xp;
 3555 {
 3556         struct thread *td = curthread;
 3557         struct inodedep *inodedep;
 3558         struct vnode *vp;
 3559         struct inode *ip;
 3560         ino_t oldinum;
 3561         int error;
 3562 
 3563         if ((vp = xp) == NULL &&
 3564             (error = ffs_vget(dirrem->dm_list.wk_mp,
 3565             dirrem->dm_oldinum, LK_EXCLUSIVE, &vp)) != 0) {
 3566                 softdep_error("handle_workitem_remove: vget", error);
 3567                 return;
 3568         }
 3569         ip = VTOI(vp);
 3570         ACQUIRE_LOCK(&lk);
 3571         if ((inodedep_lookup(dirrem->dm_list.wk_mp,
 3572             dirrem->dm_oldinum, 0, &inodedep)) == 0)
 3573                 panic("handle_workitem_remove: lost inodedep");
 3574         /*
 3575          * Normal file deletion.
 3576          */
 3577         if ((dirrem->dm_state & RMDIR) == 0) {
 3578                 ip->i_nlink--;
 3579                 DIP_SET(ip, i_nlink, ip->i_nlink);
 3580                 ip->i_flag |= IN_CHANGE;
 3581                 if (ip->i_nlink < ip->i_effnlink)
 3582                         panic("handle_workitem_remove: bad file delta");
 3583                 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 3584                 num_dirrem -= 1;
 3585                 WORKITEM_FREE(dirrem, D_DIRREM);
 3586                 FREE_LOCK(&lk);
 3587                 vput(vp);
 3588                 return;
 3589         }
 3590         /*
 3591          * Directory deletion. Decrement reference count for both the
 3592          * just deleted parent directory entry and the reference for ".".
 3593          * Next truncate the directory to length zero. When the
 3594          * truncation completes, arrange to have the reference count on
 3595          * the parent decremented to account for the loss of "..".
 3596          */
 3597         ip->i_nlink -= 2;
 3598         DIP_SET(ip, i_nlink, ip->i_nlink);
 3599         ip->i_flag |= IN_CHANGE;
 3600         if (ip->i_nlink < ip->i_effnlink)
 3601                 panic("handle_workitem_remove: bad dir delta");
 3602         inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 3603         FREE_LOCK(&lk);
 3604         if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
 3605                 softdep_error("handle_workitem_remove: truncate", error);
 3606         ACQUIRE_LOCK(&lk);
 3607         /*
 3608          * Rename a directory to a new parent. Since we are both deleting
 3609          * and creating a new directory entry, the link count on the new
 3610          * directory should not change. Thus we skip the followup dirrem.
 3611          */
 3612         if (dirrem->dm_state & DIRCHG) {
 3613                 num_dirrem -= 1;
 3614                 WORKITEM_FREE(dirrem, D_DIRREM);
 3615                 FREE_LOCK(&lk);
 3616                 vput(vp);
 3617                 return;
 3618         }
 3619         /*
 3620          * If the inodedep does not exist, then the zero'ed inode has
 3621          * been written to disk. If the allocated inode has never been
 3622          * written to disk, then the on-disk inode is zero'ed. In either
 3623          * case we can remove the file immediately.
 3624          */
 3625         dirrem->dm_state = 0;
 3626         oldinum = dirrem->dm_oldinum;
 3627         dirrem->dm_oldinum = dirrem->dm_dirinum;
 3628         if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
 3629             0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
 3630                 if (xp != NULL)
 3631                         add_to_worklist(&dirrem->dm_list);
 3632                 FREE_LOCK(&lk);
 3633                 vput(vp);
 3634                 if (xp == NULL)
 3635                         handle_workitem_remove(dirrem, NULL);
 3636                 return;
 3637         }
 3638         WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
 3639         FREE_LOCK(&lk);
 3640         ip->i_flag |= IN_CHANGE;
 3641         ffs_update(vp, 0);
 3642         vput(vp);
 3643 }
 3644 
 3645 /*
 3646  * Inode de-allocation dependencies.
 3647  * 
 3648  * When an inode's link count is reduced to zero, it can be de-allocated. We
 3649  * found it convenient to postpone de-allocation until after the inode is
 3650  * written to disk with its new link count (zero).  At this point, all of the
 3651  * on-disk inode's block pointers are nullified and, with careful dependency
 3652  * list ordering, all dependencies related to the inode will be satisfied and
 3653  * the corresponding dependency structures de-allocated.  So, if/when the
 3654  * inode is reused, there will be no mixing of old dependencies with new
 3655  * ones.  This artificial dependency is set up by the block de-allocation
 3656  * procedure above (softdep_setup_freeblocks) and completed by the
 3657  * following procedure.
 3658  */
 3659 static void 
 3660 handle_workitem_freefile(freefile)
 3661         struct freefile *freefile;
 3662 {
 3663         struct fs *fs;
 3664         struct inodedep *idp;
 3665         struct ufsmount *ump;
 3666         int error;
 3667 
 3668         ump = VFSTOUFS(freefile->fx_list.wk_mp);
 3669         fs = ump->um_fs;
 3670 #ifdef DEBUG
 3671         ACQUIRE_LOCK(&lk);
 3672         error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
 3673         FREE_LOCK(&lk);
 3674         if (error)
 3675                 panic("handle_workitem_freefile: inodedep survived");
 3676 #endif
 3677         UFS_LOCK(ump);
 3678         fs->fs_pendinginodes -= 1;
 3679         UFS_UNLOCK(ump);
 3680         if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
 3681             freefile->fx_oldinum, freefile->fx_mode)) != 0)
 3682                 softdep_error("handle_workitem_freefile", error);
 3683         ACQUIRE_LOCK(&lk);
 3684         WORKITEM_FREE(freefile, D_FREEFILE);
 3685         FREE_LOCK(&lk);
 3686 }
 3687 
 3688 
 3689 /*
 3690  * Helper function which unlinks marker element from work list and returns
 3691  * the next element on the list.
 3692  */
 3693 static __inline struct worklist *
 3694 markernext(struct worklist *marker)
 3695 {
 3696         struct worklist *next;
 3697         
 3698         next = LIST_NEXT(marker, wk_list);
 3699         LIST_REMOVE(marker, wk_list);
 3700         return next;
 3701 }
 3702 
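      /*
       * Editor's note (annotation): markernext() supports the marker-walk
       * idiom used by softdep_disk_io_initiation() below.  A dummy worklist
       * element is inserted after the current item so that the list
       * position survives any lock drops while the item is processed:
       *
       *        for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
       *             wk = markernext(&marker)) {
       *                LIST_INSERT_AFTER(wk, &marker, wk_list);
       *                ... process wk, possibly dropping the lock ...
       *        }
       */
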
 3703 /*
 3704  * Disk writes.
 3705  * 
 3706  * The dependency structures constructed above are most actively used when file
 3707  * system blocks are written to disk.  No constraints are placed on when a
 3708  * block can be written, but unsatisfied update dependencies are made safe by
 3709  * modifying (or replacing) the source memory for the duration of the disk
 3710  * write.  When the disk write completes, the memory block is again brought
 3711  * up-to-date.
 3712  *
 3713  * In-core inode structure reclamation.
 3714  * 
 3715  * Because there are a finite number of "in-core" inode structures, they are
 3716  * reused regularly.  By transferring all inode-related dependencies to the
 3717  * in-memory inode block and indexing them separately (via "inodedep"s), we
 3718  * can allow "in-core" inode structures to be reused at any time and avoid
 3719  * any increase in contention.
 3720  *
 3721  * Called just before entering the device driver to initiate a new disk I/O.
 3722  * The buffer must be locked, thus, no I/O completion operations can occur
 3723  * while we are manipulating its associated dependencies.
 3724  */
 3725 static void 
 3726 softdep_disk_io_initiation(bp)
 3727         struct buf *bp;         /* structure describing disk write to occur */
 3728 {
 3729         struct worklist *wk;
 3730         struct worklist marker;
 3731         struct indirdep *indirdep;
 3732         struct inodedep *inodedep;
 3733 
 3734         /*
 3735          * We only care about write operations. There should never
 3736          * be dependencies for reads.
 3737          */
 3738         if (bp->b_iocmd != BIO_WRITE)
 3739                 panic("softdep_disk_io_initiation: not write");
 3740 
 3741         marker.wk_type = D_LAST + 1;    /* Not a normal workitem */
 3742         PHOLD(curproc);                 /* Don't swap out kernel stack */
 3743 
 3744         ACQUIRE_LOCK(&lk);
 3745         /*
 3746          * Do any necessary pre-I/O processing.
 3747          */
 3748         for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
 3749              wk = markernext(&marker)) {
 3750                 LIST_INSERT_AFTER(wk, &marker, wk_list);
 3751                 switch (wk->wk_type) {
 3752 
 3753                 case D_PAGEDEP:
 3754                         initiate_write_filepage(WK_PAGEDEP(wk), bp);
 3755                         continue;
 3756 
 3757                 case D_INODEDEP:
 3758                         inodedep = WK_INODEDEP(wk);
 3759                         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
 3760                                 initiate_write_inodeblock_ufs1(inodedep, bp);
 3761                         else
 3762                                 initiate_write_inodeblock_ufs2(inodedep, bp);
 3763                         continue;
 3764 
 3765                 case D_INDIRDEP:
 3766                         indirdep = WK_INDIRDEP(wk);
 3767                         if (indirdep->ir_state & GOINGAWAY)
 3768                                 panic("softdep_disk_io_initiation: indirdep gone");
 3769                         /*
 3770                          * If there are no remaining dependencies, this
 3771                          * will be writing the real pointers, so the
 3772                          * dependency can be freed.
 3773                          */
 3774                         if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
 3775                                 struct buf *sbp; /* don't shadow bp */
 3776 
 3777                                 sbp = indirdep->ir_savebp;
 3778                                 sbp->b_flags |= B_INVAL | B_NOCACHE;
 3779                                 /* inline expand WORKLIST_REMOVE(wk); */
 3780                                 wk->wk_state &= ~ONWORKLIST;
 3781                                 LIST_REMOVE(wk, wk_list);
 3782                                 WORKITEM_FREE(indirdep, D_INDIRDEP);
 3783                                 FREE_LOCK(&lk);
 3784                                 brelse(sbp);
 3785                                 ACQUIRE_LOCK(&lk);
 3786                                 continue;
 3787                         }
 3788                         /*
 3789                          * Replace up-to-date version with safe version.
 3790                          */
 3791                         FREE_LOCK(&lk);
 3792                         MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
 3793                             M_INDIRDEP, M_SOFTDEP_FLAGS);
 3794                         ACQUIRE_LOCK(&lk);
 3795                         indirdep->ir_state &= ~ATTACHED;
 3796                         indirdep->ir_state |= UNDONE;
 3797                         bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
 3798                         bcopy(indirdep->ir_savebp->b_data, bp->b_data,
 3799                             bp->b_bcount);
 3800                         continue;
 3801 
 3802                 case D_MKDIR:
 3803                 case D_BMSAFEMAP:
 3804                 case D_ALLOCDIRECT:
 3805                 case D_ALLOCINDIR:
 3806                         continue;
 3807 
 3808                 default:
 3809                         panic("softdep_disk_io_initiation: Unexpected type %s",
 3810                             TYPENAME(wk->wk_type));
 3811                         /* NOTREACHED */
 3812                 }
 3813         }
 3814         FREE_LOCK(&lk);
 3815         PRELE(curproc);                 /* Allow swapout of kernel stack */
 3816 }
 3817 
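      /*
       * Editor's note (annotation): the D_INDIRDEP arm above is the
       * "replace the source memory" technique from the Disk writes comment:
       * the up-to-date pointers are parked in ir_saveddata, the safe copy
       * held in ir_savebp is written in their place, and the I/O completion
       * code copies the up-to-date version back once the write finishes.
       */
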
 3818 /*
 3819  * Called from within the procedure above to deal with unsatisfied
 3820  * allocation dependencies in a directory. The buffer must be locked,
 3821  * thus, no I/O completion operations can occur while we are
 3822  * manipulating its associated dependencies.
 3823  */
 3824 static void
 3825 initiate_write_filepage(pagedep, bp)
 3826         struct pagedep *pagedep;
 3827         struct buf *bp;
 3828 {
 3829         struct diradd *dap;
 3830         struct direct *ep;
 3831         int i;
 3832 
 3833         if (pagedep->pd_state & IOSTARTED) {
 3834                 /*
 3835                  * This can only happen if there is a driver that does not
 3836                  * understand chaining. Here biodone will reissue the call
 3837                  * to strategy for the incomplete buffers.
 3838                  */
 3839                 printf("initiate_write_filepage: already started\n");
 3840                 return;
 3841         }
 3842         pagedep->pd_state |= IOSTARTED;
 3843         for (i = 0; i < DAHASHSZ; i++) {
 3844                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
 3845                         ep = (struct direct *)
 3846                             ((char *)bp->b_data + dap->da_offset);
 3847                         if (ep->d_ino != dap->da_newinum)
 3848                                 panic("%s: dir inum %ju != new %ju",
 3849                                     "initiate_write_filepage",
 3850                                     (uintmax_t)ep->d_ino, (uintmax_t)dap->da_newinum);
 3851                         if (dap->da_state & DIRCHG)
 3852                                 ep->d_ino = dap->da_previous->dm_oldinum;
 3853                         else
 3854                                 ep->d_ino = 0;
 3855                         dap->da_state &= ~ATTACHED;
 3856                         dap->da_state |= UNDONE;
 3857                 }
 3858         }
 3859 }
 3860 
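      /*
       * Editor's note (annotation): the rollback above is reversed by the
       * matching completion routine (handle_written_filepage(), later in
       * this file), which restores ep->d_ino in the in-core block and moves
       * each diradd from UNDONE back to ATTACHED.
       */
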
 3861 /*
 3862  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
 3863  * Note that any bug fixes made to this routine must be done in the
 3864  * version found below.
 3865  *
 3866  * Called from within the procedure above to deal with unsatisfied
 3867  * allocation dependencies in an inodeblock. The buffer must be
 3868  * locked; thus, no I/O completion operations can occur while we
 3869  * are manipulating its associated dependencies.
 3870  */
 3871 static void 
 3872 initiate_write_inodeblock_ufs1(inodedep, bp)
 3873         struct inodedep *inodedep;
 3874         struct buf *bp;                 /* The inode block */
 3875 {
 3876         struct allocdirect *adp, *lastadp;
 3877         struct ufs1_dinode *dp;
 3878         struct ufs1_dinode *sip;
 3879         struct fs *fs;
 3880         ufs_lbn_t i, prevlbn = 0;
 3881         int deplist;
 3882 
 3883         if (inodedep->id_state & IOSTARTED)
 3884                 panic("initiate_write_inodeblock_ufs1: already started");
 3885         inodedep->id_state |= IOSTARTED;
 3886         fs = inodedep->id_fs;
 3887         dp = (struct ufs1_dinode *)bp->b_data +
 3888             ino_to_fsbo(fs, inodedep->id_ino);
 3889         /*
 3890          * If the bitmap is not yet written, then the allocated
 3891          * inode cannot be written to disk.
 3892          */
 3893         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 3894                 if (inodedep->id_savedino1 != NULL)
 3895                         panic("initiate_write_inodeblock_ufs1: I/O underway");
 3896                 FREE_LOCK(&lk);
 3897                 MALLOC(sip, struct ufs1_dinode *,
 3898                     sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
 3899                 ACQUIRE_LOCK(&lk);
 3900                 inodedep->id_savedino1 = sip;
 3901                 *inodedep->id_savedino1 = *dp;
 3902                 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
 3903                 dp->di_gen = inodedep->id_savedino1->di_gen;
 3904                 return;
 3905         }
 3906         /*
 3907          * If no dependencies, then there is nothing to roll back.
 3908          */
 3909         inodedep->id_savedsize = dp->di_size;
 3910         inodedep->id_savedextsize = 0;
 3911         if (TAILQ_EMPTY(&inodedep->id_inoupdt))
 3912                 return;
 3913         /*
 3914          * Set the dependencies to busy.
 3915          */
 3916         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 3917              adp = TAILQ_NEXT(adp, ad_next)) {
 3918 #ifdef DIAGNOSTIC
 3919                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
 3920                         panic("softdep_write_inodeblock: lbn order");
 3921                 prevlbn = adp->ad_lbn;
 3922                 if (adp->ad_lbn < NDADDR &&
 3923                     dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
 3924                         panic("%s: direct pointer #%jd mismatch %d != %jd",
 3925                             "softdep_write_inodeblock",
 3926                             (intmax_t)adp->ad_lbn,
 3927                             dp->di_db[adp->ad_lbn],
 3928                             (intmax_t)adp->ad_newblkno);
 3929                 if (adp->ad_lbn >= NDADDR &&
 3930                     dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
 3931                         panic("%s: indirect pointer #%jd mismatch %d != %jd",
 3932                             "softdep_write_inodeblock",
 3933                             (intmax_t)adp->ad_lbn - NDADDR,
 3934                             dp->di_ib[adp->ad_lbn - NDADDR],
 3935                             (intmax_t)adp->ad_newblkno);
 3936                 deplist |= 1 << adp->ad_lbn;
 3937                 if ((adp->ad_state & ATTACHED) == 0)
 3938                         panic("softdep_write_inodeblock: Unknown state 0x%x",
 3939                             adp->ad_state);
 3940 #endif /* DIAGNOSTIC */
 3941                 adp->ad_state &= ~ATTACHED;
 3942                 adp->ad_state |= UNDONE;
 3943         }
 3944         /*
 3945          * The on-disk inode cannot claim to be any larger than the last
 3946          * fragment that has been written. Otherwise, the on-disk inode
 3947          * might have fragments that were not the last block in the file,
 3948          * which would corrupt the filesystem.
 3949          */
 3950         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 3951              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 3952                 if (adp->ad_lbn >= NDADDR)
 3953                         break;
 3954                 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
 3955                 /* keep going until hitting a rollback to a frag */
 3956                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 3957                         continue;
 3958                 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 3959                 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
 3960 #ifdef DIAGNOSTIC
 3961                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
 3962                                 panic("softdep_write_inodeblock: lost dep1");
 3963 #endif /* DIAGNOSTIC */
 3964                         dp->di_db[i] = 0;
 3965                 }
 3966                 for (i = 0; i < NIADDR; i++) {
 3967 #ifdef DIAGNOSTIC
 3968                         if (dp->di_ib[i] != 0 &&
 3969                             (deplist & ((1 << NDADDR) << i)) == 0)
 3970                                 panic("softdep_write_inodeblock: lost dep2");
 3971 #endif /* DIAGNOSTIC */
 3972                         dp->di_ib[i] = 0;
 3973                 }
 3974                 return;
 3975         }
 3976         /*
 3977          * If we have zeroed out the last allocated block of the file,
 3978          * roll back the size to the last currently allocated block.
 3979          * We know that this last allocated block is full-sized, as
 3980          * we already checked for fragments in the loop above.
 3981          */
 3982         if (lastadp != NULL &&
 3983             dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 3984                 for (i = lastadp->ad_lbn; i >= 0; i--)
 3985                         if (dp->di_db[i] != 0)
 3986                                 break;
 3987                 dp->di_size = (i + 1) * fs->fs_bsize;
 3988         }
 3989         /*
 3990          * The only dependencies are for indirect blocks.
 3991          *
 3992          * The file size for indirect block additions is not guaranteed.
 3993          * Such a guarantee would be non-trivial to achieve. The conventional
 3994          * synchronous write implementation also does not make this guarantee.
 3995          * Fsck should catch and fix discrepancies. Arguably, the file size
 3996          * can be over-estimated without destroying integrity when the file
 3997          * moves into the indirect blocks (i.e., is large). If we want to
 3998          * postpone fsck, we are stuck with this argument.
 3999          */
 4000         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
 4001                 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
 4002 }
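
/*
 * A small sketch of the size rollback rule applied above (and in the
 * UFS2 variant below): if the buffer rolls the block at logical block
 * number lbn back to an old fragment of oldsize bytes, the on-disk
 * size must shrink to end exactly at that fragment.  The 16K block
 * size in the example is an assumption for illustration only.
 */
#include <assert.h>
#include <stdint.h>

static int64_t
sk_rollback_size(int64_t fs_bsize, int64_t lbn, int64_t oldsize)
{
        /* Full blocks before lbn, plus the surviving fragment. */
        return (fs_bsize * lbn + oldsize);
}

static void
sk_rollback_size_example(void)
{
        /* Three full 16K blocks plus a 4K fragment: 53248 bytes. */
        assert(sk_rollback_size(16384, 3, 4096) == 53248);
}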
 4003                 
 4004 /*
 4005  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
 4006  * Note that any bug fixes made to this routine must be done in the
 4007  * version found above.
 4008  *
 4009  * Called from within the procedure above to deal with unsatisfied
 4010  * allocation dependencies in an inodeblock. The buffer must be
 4011  * locked; thus, no I/O completion operations can occur while we
 4012  * are manipulating its associated dependencies.
 4013  */
 4014 static void 
 4015 initiate_write_inodeblock_ufs2(inodedep, bp)
 4016         struct inodedep *inodedep;
 4017         struct buf *bp;                 /* The inode block */
 4018 {
 4019         struct allocdirect *adp, *lastadp;
 4020         struct ufs2_dinode *dp;
 4021         struct ufs2_dinode *sip;
 4022         struct fs *fs;
 4023         ufs_lbn_t i, prevlbn = 0;
 4024         int deplist;
 4025 
 4026         if (inodedep->id_state & IOSTARTED)
 4027                 panic("initiate_write_inodeblock_ufs2: already started");
 4028         inodedep->id_state |= IOSTARTED;
 4029         fs = inodedep->id_fs;
 4030         dp = (struct ufs2_dinode *)bp->b_data +
 4031             ino_to_fsbo(fs, inodedep->id_ino);
 4032         /*
 4033          * If the bitmap is not yet written, then the allocated
 4034          * inode cannot be written to disk.
 4035          */
 4036         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 4037                 if (inodedep->id_savedino2 != NULL)
 4038                         panic("initiate_write_inodeblock_ufs2: I/O underway");
 4039                 FREE_LOCK(&lk);
 4040                 MALLOC(sip, struct ufs2_dinode *,
 4041                     sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
 4042                 ACQUIRE_LOCK(&lk);
 4043                 inodedep->id_savedino2 = sip;
 4044                 *inodedep->id_savedino2 = *dp;
 4045                 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
 4046                 dp->di_gen = inodedep->id_savedino2->di_gen;
 4047                 return;
 4048         }
 4049         /*
 4050          * If no dependencies, then there is nothing to roll back.
 4051          */
 4052         inodedep->id_savedsize = dp->di_size;
 4053         inodedep->id_savedextsize = dp->di_extsize;
 4054         if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
 4055             TAILQ_EMPTY(&inodedep->id_extupdt))
 4056                 return;
 4057         /*
 4058          * Set the ext data dependencies to busy.
 4059          */
 4060         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
 4061              adp = TAILQ_NEXT(adp, ad_next)) {
 4062 #ifdef DIAGNOSTIC
 4063                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
 4064                         panic("softdep_write_inodeblock: lbn order");
 4065                 prevlbn = adp->ad_lbn;
 4066                 if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
 4067                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
 4068                             "softdep_write_inodeblock",
 4069                             (intmax_t)adp->ad_lbn,
 4070                             (intmax_t)dp->di_extb[adp->ad_lbn],
 4071                             (intmax_t)adp->ad_newblkno);
 4072                 deplist |= 1 << adp->ad_lbn;
 4073                 if ((adp->ad_state & ATTACHED) == 0)
 4074                         panic("softdep_write_inodeblock: Unknown state 0x%x",
 4075                             adp->ad_state);
 4076 #endif /* DIAGNOSTIC */
 4077                 adp->ad_state &= ~ATTACHED;
 4078                 adp->ad_state |= UNDONE;
 4079         }
 4080         /*
 4081          * The on-disk inode cannot claim to be any larger than the last
 4082          * fragment that has been written. Otherwise, the on-disk inode
 4083          * might have fragments that were not the last block in the ext
 4084          * data, which would corrupt the filesystem.
 4085          */
 4086         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
 4087              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 4088                 dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
 4089                 /* keep going until hitting a rollback to a frag */
 4090                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 4091                         continue;
 4092                 dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 4093                 for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
 4094 #ifdef DIAGNOSTIC
 4095                         if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
 4096                                 panic("softdep_write_inodeblock: lost dep1");
 4097 #endif /* DIAGNOSTIC */
 4098                         dp->di_extb[i] = 0;
 4099                 }
 4100                 lastadp = NULL;
 4101                 break;
 4102         }
 4103         /*
 4104          * If we have zeroed out the last allocated block of the ext
 4105          * data, roll back the size to the last currently allocated block.
 4106          * We know that this last allocated block is full-sized, as
 4107          * we already checked for fragments in the loop above.
 4108          */
 4109         if (lastadp != NULL &&
 4110             dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 4111                 for (i = lastadp->ad_lbn; i >= 0; i--)
 4112                         if (dp->di_extb[i] != 0)
 4113                                 break;
 4114                 dp->di_extsize = (i + 1) * fs->fs_bsize;
 4115         }
 4116         /*
 4117          * Set the file data dependencies to busy.
 4118          */
 4119         for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 4120              adp = TAILQ_NEXT(adp, ad_next)) {
 4121 #ifdef DIAGNOSTIC
 4122                 if (deplist != 0 && prevlbn >= adp->ad_lbn)
 4123                         panic("softdep_write_inodeblock: lbn order");
 4124                 prevlbn = adp->ad_lbn;
 4125                 if (adp->ad_lbn < NDADDR &&
 4126                     dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
 4127                         panic("%s: direct pointer #%jd mismatch %jd != %jd",
 4128                             "softdep_write_inodeblock",
 4129                             (intmax_t)adp->ad_lbn,
 4130                             (intmax_t)dp->di_db[adp->ad_lbn],
 4131                             (intmax_t)adp->ad_newblkno);
 4132                 if (adp->ad_lbn >= NDADDR &&
 4133                     dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
 4134                         panic("%s: indirect pointer #%jd mismatch %jd != %jd",
 4135                             "softdep_write_inodeblock",
 4136                             (intmax_t)adp->ad_lbn - NDADDR,
 4137                             (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
 4138                             (intmax_t)adp->ad_newblkno);
 4139                 deplist |= 1 << adp->ad_lbn;
 4140                 if ((adp->ad_state & ATTACHED) == 0)
 4141                         panic("softdep_write_inodeblock: Unknown state 0x%x",
 4142                             adp->ad_state);
 4143 #endif /* DIAGNOSTIC */
 4144                 adp->ad_state &= ~ATTACHED;
 4145                 adp->ad_state |= UNDONE;
 4146         }
 4147         /*
 4148          * The on-disk inode cannot claim to be any larger than the last
 4149          * fragment that has been written. Otherwise, the on-disk inode
 4150          * might have fragments that were not the last block in the file,
 4151          * which would corrupt the filesystem.
 4152          */
 4153         for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 4154              lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 4155                 if (adp->ad_lbn >= NDADDR)
 4156                         break;
 4157                 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
 4158                 /* keep going until hitting a rollback to a frag */
 4159                 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 4160                         continue;
 4161                 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 4162                 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
 4163 #ifdef DIAGNOSTIC
 4164                         if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
 4165                                 panic("softdep_write_inodeblock: lost dep2");
 4166 #endif /* DIAGNOSTIC */
 4167                         dp->di_db[i] = 0;
 4168                 }
 4169                 for (i = 0; i < NIADDR; i++) {
 4170 #ifdef DIAGNOSTIC
 4171                         if (dp->di_ib[i] != 0 &&
 4172                             (deplist & ((1 << NDADDR) << i)) == 0)
 4173                                 panic("softdep_write_inodeblock: lost dep3");
 4174 #endif /* DIAGNOSTIC */
 4175                         dp->di_ib[i] = 0;
 4176                 }
 4177                 return;
 4178         }
 4179         /*
 4180          * If we have zeroed out the last allocated block of the file,
 4181          * roll back the size to the last currently allocated block.
 4182          * We know that this last allocated block is full-sized, as
 4183          * we already checked for fragments in the loop above.
 4184          */
 4185         if (lastadp != NULL &&
 4186             dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 4187                 for (i = lastadp->ad_lbn; i >= 0; i--)
 4188                         if (dp->di_db[i] != 0)
 4189                                 break;
 4190                 dp->di_size = (i + 1) * fs->fs_bsize;
 4191         }
 4192         /*
 4193          * The only dependencies are for indirect blocks.
 4194          *
 4195          * The file size for indirect block additions is not guaranteed.
 4196          * Such a guarantee would be non-trivial to achieve. The conventional
 4197          * synchronous write implementation also does not make this guarantee.
 4198          * Fsck should catch and fix discrepancies. Arguably, the file size
 4199          * can be over-estimated without destroying integrity when the file
 4200          * moves into the indirect blocks (i.e., is large). If we want to
 4201          * postpone fsck, we are stuck with this argument.
 4202          */
 4203         for (; adp; adp = TAILQ_NEXT(adp, ad_next))
 4204                 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
 4205 }
 4206 
 4207 /*
 4208  * This routine is called during the completion interrupt
 4209  * service routine for a disk write (from the procedure called
 4210  * by the device driver to inform the filesystem caches of
 4211  * a request completion).  It should be called early in this
 4212  * procedure, before the block is made available to other
 4213  * processes or other routines are called.
 4214  */
 4215 static void 
 4216 softdep_disk_write_complete(bp)
 4217         struct buf *bp;         /* describes the completed disk write */
 4218 {
 4219         struct worklist *wk;
 4220         struct worklist *owk;
 4221         struct workhead reattach;
 4222         struct newblk *newblk;
 4223         struct allocindir *aip;
 4224         struct allocdirect *adp;
 4225         struct indirdep *indirdep;
 4226         struct inodedep *inodedep;
 4227         struct bmsafemap *bmsafemap;
 4228 
 4229         /*
 4230          * If an error occurred while doing the write, then the data
 4231          * has not hit the disk and the dependencies cannot be unrolled.
 4232          */
 4233         if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
 4234                 return;
 4235         LIST_INIT(&reattach);
 4236         /*
 4237          * This lock must not be released anywhere in this code segment.
 4238          */
 4239         ACQUIRE_LOCK(&lk);
 4240         owk = NULL;
 4241         while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 4242                 WORKLIST_REMOVE(wk);
 4243                 if (wk == owk)
 4244                         panic("duplicate worklist: %p\n", wk);
 4245                 owk = wk;
 4246                 switch (wk->wk_type) {
 4247 
 4248                 case D_PAGEDEP:
 4249                         if (handle_written_filepage(WK_PAGEDEP(wk), bp))
 4250                                 WORKLIST_INSERT(&reattach, wk);
 4251                         continue;
 4252 
 4253                 case D_INODEDEP:
 4254                         if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
 4255                                 WORKLIST_INSERT(&reattach, wk);
 4256                         continue;
 4257 
 4258                 case D_BMSAFEMAP:
 4259                         bmsafemap = WK_BMSAFEMAP(wk);
 4260                         while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
 4261                                 newblk->nb_state |= DEPCOMPLETE;
 4262                                 newblk->nb_bmsafemap = NULL;
 4263                                 LIST_REMOVE(newblk, nb_deps);
 4264                         }
 4265                         while ((adp =
 4266                            LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
 4267                                 adp->ad_state |= DEPCOMPLETE;
 4268                                 adp->ad_buf = NULL;
 4269                                 LIST_REMOVE(adp, ad_deps);
 4270                                 handle_allocdirect_partdone(adp);
 4271                         }
 4272                         while ((aip =
 4273                             LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
 4274                                 aip->ai_state |= DEPCOMPLETE;
 4275                                 aip->ai_buf = NULL;
 4276                                 LIST_REMOVE(aip, ai_deps);
 4277                                 handle_allocindir_partdone(aip);
 4278                         }
 4279                         while ((inodedep =
 4280                              LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
 4281                                 inodedep->id_state |= DEPCOMPLETE;
 4282                                 LIST_REMOVE(inodedep, id_deps);
 4283                                 inodedep->id_buf = NULL;
 4284                         }
 4285                         WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
 4286                         continue;
 4287 
 4288                 case D_MKDIR:
 4289                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
 4290                         continue;
 4291 
 4292                 case D_ALLOCDIRECT:
 4293                         adp = WK_ALLOCDIRECT(wk);
 4294                         adp->ad_state |= COMPLETE;
 4295                         handle_allocdirect_partdone(adp);
 4296                         continue;
 4297 
 4298                 case D_ALLOCINDIR:
 4299                         aip = WK_ALLOCINDIR(wk);
 4300                         aip->ai_state |= COMPLETE;
 4301                         handle_allocindir_partdone(aip);
 4302                         continue;
 4303 
 4304                 case D_INDIRDEP:
 4305                         indirdep = WK_INDIRDEP(wk);
 4306                         if (indirdep->ir_state & GOINGAWAY)
 4307                                 panic("disk_write_complete: indirdep gone");
 4308                         bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
 4309                         FREE(indirdep->ir_saveddata, M_INDIRDEP);
 4310                         indirdep->ir_saveddata = 0;
 4311                         indirdep->ir_state &= ~UNDONE;
 4312                         indirdep->ir_state |= ATTACHED;
 4313                         while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
 4314                                 handle_allocindir_partdone(aip);
 4315                                 if (aip == LIST_FIRST(&indirdep->ir_donehd))
 4316                                         panic("disk_write_complete: not gone");
 4317                         }
 4318                         WORKLIST_INSERT(&reattach, wk);
 4319                         if ((bp->b_flags & B_DELWRI) == 0)
 4320                                 stat_indir_blk_ptrs++;
 4321                         bdirty(bp);
 4322                         continue;
 4323 
 4324                 default:
 4325                         panic("handle_disk_write_complete: Unknown type %s",
 4326                             TYPENAME(wk->wk_type));
 4327                         /* NOTREACHED */
 4328                 }
 4329         }
 4330         /*
 4331          * Reattach any requests that must be redone.
 4332          */
 4333         while ((wk = LIST_FIRST(&reattach)) != NULL) {
 4334                 WORKLIST_REMOVE(wk);
 4335                 WORKLIST_INSERT(&bp->b_dep, wk);
 4336         }
 4337         FREE_LOCK(&lk);
 4338 }
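
/*
 * The completion handler above follows a recurring soft-updates shape:
 * drain the buffer's dependency list, dispatch on the work-item type,
 * and collect anything that must be reprocessed on a later write onto
 * a local "reattach" list that is spliced back at the end.  A compact,
 * stand-alone sketch of that control flow (sk_* names are stand-ins):
 */
#include <sys/queue.h>

struct sk_work {
        LIST_ENTRY(sk_work)     wk_list;
        int                     wk_type;
};
LIST_HEAD(sk_workhead, sk_work);

/* Stand-in handler: nonzero means the item is not yet complete. */
static int
sk_handle(struct sk_work *wk)
{
        return (wk->wk_type != 0);
}

static void
sk_write_complete(struct sk_workhead *dephd)
{
        struct sk_workhead reattach;
        struct sk_work *wk;

        LIST_INIT(&reattach);
        while ((wk = LIST_FIRST(dephd)) != NULL) {
                LIST_REMOVE(wk, wk_list);
                if (sk_handle(wk))
                        LIST_INSERT_HEAD(&reattach, wk, wk_list);
        }
        /* Incomplete items ride along with the next write. */
        while ((wk = LIST_FIRST(&reattach)) != NULL) {
                LIST_REMOVE(wk, wk_list);
                LIST_INSERT_HEAD(dephd, wk, wk_list);
        }
}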
 4339 
 4340 /*
 4341  * Called from within softdep_disk_write_complete above. Note that
 4342  * this routine is always called from interrupt level with further
 4343  * splbio interrupts blocked.
 4344  */
 4345 static void 
 4346 handle_allocdirect_partdone(adp)
 4347         struct allocdirect *adp;        /* the completed allocdirect */
 4348 {
 4349         struct allocdirectlst *listhead;
 4350         struct allocdirect *listadp;
 4351         struct inodedep *inodedep;
 4352         long bsize, delay;
 4353 
 4354         if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 4355                 return;
 4356         if (adp->ad_buf != NULL)
 4357                 panic("handle_allocdirect_partdone: dangling dep");
 4358         /*
 4359          * The on-disk inode cannot claim to be any larger than the last
 4360          * fragment that has been written. Otherwise, the on-disk inode
 4361          * might have fragments that were not the last block in the file,
 4362          * which would corrupt the filesystem. Thus, we cannot free any
 4363          * allocdirects after one whose ad_oldblkno claims a fragment as
 4364          * these blocks must be rolled back to zero before writing the inode.
 4365          * We check the currently active set of allocdirects in id_inoupdt
 4366          * or id_extupdt as appropriate.
 4367          */
 4368         inodedep = adp->ad_inodedep;
 4369         bsize = inodedep->id_fs->fs_bsize;
 4370         if (adp->ad_state & EXTDATA)
 4371                 listhead = &inodedep->id_extupdt;
 4372         else
 4373                 listhead = &inodedep->id_inoupdt;
 4374         TAILQ_FOREACH(listadp, listhead, ad_next) {
 4375                 /* found our block */
 4376                 if (listadp == adp)
 4377                         break;
 4378                 /* continue if the old block is not a fragment */
 4379                 if (listadp->ad_oldsize == 0 ||
 4380                     listadp->ad_oldsize == bsize)
 4381                         continue;
 4382                 /* hit a fragment */
 4383                 return;
 4384         }
 4385         /*
 4386          * If we have reached the end of the current list without
 4387          * finding the just finished dependency, then it must be
 4388          * on the future dependency list. Future dependencies cannot
 4389          * be freed until they are moved to the current list.
 4390          */
 4391         if (listadp == NULL) {
 4392 #ifdef DEBUG
 4393                 if (adp->ad_state & EXTDATA)
 4394                         listhead = &inodedep->id_newextupdt;
 4395                 else
 4396                         listhead = &inodedep->id_newinoupdt;
 4397                 TAILQ_FOREACH(listadp, listhead, ad_next)
 4398                         /* found our block */
 4399                         if (listadp == adp)
 4400                                 break;
 4401                 if (listadp == NULL)
 4402                         panic("handle_allocdirect_partdone: lost dep");
 4403 #endif /* DEBUG */
 4404                 return;
 4405         }
 4406         /*
 4407          * If we have found the just finished dependency, then free
 4408          * it along with anything that follows it that is complete.
 4409          * If the inode still has a bitmap dependency, then it has
 4410          * never been written to disk, hence the on-disk inode cannot
 4411          * reference the old fragment so we can free it without delay.
 4412          */
 4413         delay = (inodedep->id_state & DEPCOMPLETE);
 4414         for (; adp; adp = listadp) {
 4415                 listadp = TAILQ_NEXT(adp, ad_next);
 4416                 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 4417                         return;
 4418                 free_allocdirect(listhead, adp, delay);
 4419         }
 4420 }
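
/*
 * A sketch of the "fragment barrier" scan above: completed
 * allocdirects may only be freed up to the first entry whose old
 * block was a fragment, because that fragment must be rolled back to
 * zero before the inode can be written.  Stand-in types; bsize plays
 * the role of fs_bsize.
 */
#include <stddef.h>
#include <sys/queue.h>

struct sk_allocdirect {
        TAILQ_ENTRY(sk_allocdirect)     ad_next;
        long                            ad_oldsize;
};
TAILQ_HEAD(sk_adlist, sk_allocdirect);

/* Return target if it is reachable without crossing a fragment. */
static struct sk_allocdirect *
sk_frag_barrier(struct sk_adlist *head, struct sk_allocdirect *target,
    long bsize)
{
        struct sk_allocdirect *adp;

        TAILQ_FOREACH(adp, head, ad_next) {
                if (adp == target)
                        return (target);        /* safe to free */
                if (adp->ad_oldsize != 0 && adp->ad_oldsize != bsize)
                        return (NULL);          /* fragment in the way */
        }
        return (NULL);                          /* not on this list */
}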
 4421 
 4422 /*
 4423  * Called from within softdep_disk_write_complete above. Note that
 4424  * this routine is always called from interrupt level with further
 4425  * splbio interrupts blocked.
 4426  */
 4427 static void
 4428 handle_allocindir_partdone(aip)
 4429         struct allocindir *aip;         /* the completed allocindir */
 4430 {
 4431         struct indirdep *indirdep;
 4432 
 4433         if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
 4434                 return;
 4435         if (aip->ai_buf != NULL)
 4436                 panic("handle_allocindir_partdone: dangling dependency");
 4437         indirdep = aip->ai_indirdep;
 4438         if (indirdep->ir_state & UNDONE) {
 4439                 LIST_REMOVE(aip, ai_next);
 4440                 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
 4441                 return;
 4442         }
 4443         if (indirdep->ir_state & UFS1FMT)
 4444                 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 4445                     aip->ai_newblkno;
 4446         else
 4447                 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 4448                     aip->ai_newblkno;
 4449         LIST_REMOVE(aip, ai_next);
 4450         if (aip->ai_freefrag != NULL)
 4451                 add_to_worklist(&aip->ai_freefrag->ff_list);
 4452         WORKITEM_FREE(aip, D_ALLOCINDIR);
 4453 }
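
/*
 * The pointer commit above stores the new block number into the saved
 * copy of the indirect block using whichever pointer width matches the
 * filesystem format.  A stand-alone sketch of that two-format store,
 * with int32_t/int64_t standing in for ufs1_daddr_t/ufs2_daddr_t:
 */
#include <stdint.h>

static void
sk_commit_indir_ptr(void *savedblk, int is_ufs1, int offset,
    int64_t newblkno)
{
        if (is_ufs1)
                ((int32_t *)savedblk)[offset] = (int32_t)newblkno;
        else
                ((int64_t *)savedblk)[offset] = newblkno;
}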
 4454 
 4455 /*
 4456  * Called from within softdep_disk_write_complete above to restore
 4457  * in-memory inode block contents to their most up-to-date state. Note
 4458  * that this routine is always called from interrupt level with further
 4459  * splbio interrupts blocked.
 4460  */
 4461 static int 
 4462 handle_written_inodeblock(inodedep, bp)
 4463         struct inodedep *inodedep;
 4464         struct buf *bp;         /* buffer containing the inode block */
 4465 {
 4466         struct worklist *wk, *filefree;
 4467         struct allocdirect *adp, *nextadp;
 4468         struct ufs1_dinode *dp1 = NULL;
 4469         struct ufs2_dinode *dp2 = NULL;
 4470         int hadchanges, fstype;
 4471 
 4472         if ((inodedep->id_state & IOSTARTED) == 0)
 4473                 panic("handle_written_inodeblock: not started");
 4474         inodedep->id_state &= ~IOSTARTED;
 4475         if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
 4476                 fstype = UFS1;
 4477                 dp1 = (struct ufs1_dinode *)bp->b_data +
 4478                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
 4479         } else {
 4480                 fstype = UFS2;
 4481                 dp2 = (struct ufs2_dinode *)bp->b_data +
 4482                     ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
 4483         }
 4484         /*
 4485          * If we had to roll back the inode allocation because of
 4486          * bitmaps being incomplete, then simply restore it.
 4487          * Keep the block dirty so that it will not be reclaimed until
 4488          * all associated dependencies have been cleared and the
 4489          * corresponding updates written to disk.
 4490          */
 4491         if (inodedep->id_savedino1 != NULL) {
 4492                 if (fstype == UFS1)
 4493                         *dp1 = *inodedep->id_savedino1;
 4494                 else
 4495                         *dp2 = *inodedep->id_savedino2;
 4496                 FREE(inodedep->id_savedino1, M_SAVEDINO);
 4497                 inodedep->id_savedino1 = NULL;
 4498                 if ((bp->b_flags & B_DELWRI) == 0)
 4499                         stat_inode_bitmap++;
 4500                 bdirty(bp);
 4501                 return (1);
 4502         }
 4503         inodedep->id_state |= COMPLETE;
 4504         /*
 4505          * Roll forward anything that had to be rolled back before 
 4506          * the inode could be updated.
 4507          */
 4508         hadchanges = 0;
 4509         for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
 4510                 nextadp = TAILQ_NEXT(adp, ad_next);
 4511                 if (adp->ad_state & ATTACHED)
 4512                         panic("handle_written_inodeblock: new entry");
 4513                 if (fstype == UFS1) {
 4514                         if (adp->ad_lbn < NDADDR) {
 4515                                 if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
 4516                                         panic("%s %s #%jd mismatch %d != %jd",
 4517                                             "handle_written_inodeblock:",
 4518                                             "direct pointer",
 4519                                             (intmax_t)adp->ad_lbn,
 4520                                             dp1->di_db[adp->ad_lbn],
 4521                                             (intmax_t)adp->ad_oldblkno);
 4522                                 dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
 4523                         } else {
 4524                                 if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
 4525                                         panic("%s: %s #%jd allocated as %d",
 4526                                             "handle_written_inodeblock",
 4527                                             "indirect pointer",
 4528                                             (intmax_t)adp->ad_lbn - NDADDR,
 4529                                             dp1->di_ib[adp->ad_lbn - NDADDR]);
 4530                                 dp1->di_ib[adp->ad_lbn - NDADDR] =
 4531                                     adp->ad_newblkno;
 4532                         }
 4533                 } else {
 4534                         if (adp->ad_lbn < NDADDR) {
 4535                                 if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
 4536                                         panic("%s: %s #%jd %s %jd != %jd",
 4537                                             "handle_written_inodeblock",
 4538                                             "direct pointer",
 4539                                             (intmax_t)adp->ad_lbn, "mismatch",
 4540                                             (intmax_t)dp2->di_db[adp->ad_lbn],
 4541                                             (intmax_t)adp->ad_oldblkno);
 4542                                 dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
 4543                         } else {
 4544                                 if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
 4545                                         panic("%s: %s #%jd allocated as %jd",
 4546                                             "handle_written_inodeblock",
 4547                                             "indirect pointer",
 4548                                             (intmax_t)adp->ad_lbn - NDADDR,
 4549                                             (intmax_t)
 4550                                             dp2->di_ib[adp->ad_lbn - NDADDR]);
 4551                                 dp2->di_ib[adp->ad_lbn - NDADDR] =
 4552                                     adp->ad_newblkno;
 4553                         }
 4554                 }
 4555                 adp->ad_state &= ~UNDONE;
 4556                 adp->ad_state |= ATTACHED;
 4557                 hadchanges = 1;
 4558         }
 4559         for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
 4560                 nextadp = TAILQ_NEXT(adp, ad_next);
 4561                 if (adp->ad_state & ATTACHED)
 4562                         panic("handle_written_inodeblock: new entry");
 4563                 if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
 4564                         panic("%s: direct pointers #%jd %s %jd != %jd",
 4565                             "handle_written_inodeblock",
 4566                             (intmax_t)adp->ad_lbn, "mismatch",
 4567                             (intmax_t)dp2->di_extb[adp->ad_lbn],
 4568                             (intmax_t)adp->ad_oldblkno);
 4569                 dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
 4570                 adp->ad_state &= ~UNDONE;
 4571                 adp->ad_state |= ATTACHED;
 4572                 hadchanges = 1;
 4573         }
 4574         if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
 4575                 stat_direct_blk_ptrs++;
 4576         /*
 4577          * Reset the file size to its most up-to-date value.
 4578          */
 4579         if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
 4580                 panic("handle_written_inodeblock: bad size");
 4581         if (fstype == UFS1) {
 4582                 if (dp1->di_size != inodedep->id_savedsize) {
 4583                         dp1->di_size = inodedep->id_savedsize;
 4584                         hadchanges = 1;
 4585                 }
 4586         } else {
 4587                 if (dp2->di_size != inodedep->id_savedsize) {
 4588                         dp2->di_size = inodedep->id_savedsize;
 4589                         hadchanges = 1;
 4590                 }
 4591                 if (dp2->di_extsize != inodedep->id_savedextsize) {
 4592                         dp2->di_extsize = inodedep->id_savedextsize;
 4593                         hadchanges = 1;
 4594                 }
 4595         }
 4596         inodedep->id_savedsize = -1;
 4597         inodedep->id_savedextsize = -1;
 4598         /*
 4599          * If there were any rollbacks in the inode block, then it must be
 4600          * marked dirty so that it will eventually get written back in
 4601          * its correct form.
 4602          */
 4603         if (hadchanges)
 4604                 bdirty(bp);
 4605         /*
 4606          * Process any allocdirects that completed during the update.
 4607          */
 4608         if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
 4609                 handle_allocdirect_partdone(adp);
 4610         if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
 4611                 handle_allocdirect_partdone(adp);
 4612         /*
 4613          * Process deallocations that were held pending until the
 4614          * inode had been written to disk. Freeing of the inode
 4615          * is delayed until after all blocks have been freed to
 4616          * avoid creation of new <vfsid, inum, lbn> triples
 4617          * before the old ones have been deleted.
 4618          */
 4619         filefree = NULL;
 4620         while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
 4621                 WORKLIST_REMOVE(wk);
 4622                 switch (wk->wk_type) {
 4623 
 4624                 case D_FREEFILE:
 4625                         /*
 4626                          * We defer adding filefree to the worklist until
 4627                          * all other additions have been made to ensure
 4628                          * that it will be done after all the old blocks
 4629                          * have been freed.
 4630                          */
 4631                         if (filefree != NULL)
 4632                                 panic("handle_written_inodeblock: filefree");
 4633                         filefree = wk;
 4634                         continue;
 4635 
 4636                 case D_MKDIR:
 4637                         handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
 4638                         continue;
 4639 
 4640                 case D_DIRADD:
 4641                         diradd_inode_written(WK_DIRADD(wk), inodedep);
 4642                         continue;
 4643 
 4644                 case D_FREEBLKS:
 4645                         wk->wk_state |= COMPLETE;
 4646                         if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
 4647                                 continue;
 4648                          /* -- fall through -- */
 4649                 case D_FREEFRAG:
 4650                 case D_DIRREM:
 4651                         add_to_worklist(wk);
 4652                         continue;
 4653 
 4654                 case D_NEWDIRBLK:
 4655                         free_newdirblk(WK_NEWDIRBLK(wk));
 4656                         continue;
 4657 
 4658                 default:
 4659                         panic("handle_written_inodeblock: Unknown type %s",
 4660                             TYPENAME(wk->wk_type));
 4661                         /* NOTREACHED */
 4662                 }
 4663         }
 4664         if (filefree != NULL) {
 4665                 if (free_inodedep(inodedep) == 0)
 4666                         panic("handle_written_inodeblock: live inodedep");
 4667                 add_to_worklist(filefree);
 4668                 return (0);
 4669         }
 4670 
 4671         /*
 4672          * If no outstanding dependencies, free it.
 4673          */
 4674         if (free_inodedep(inodedep) ||
 4675             (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
 4676              TAILQ_FIRST(&inodedep->id_extupdt) == 0))
 4677                 return (0);
 4678         return (hadchanges);
 4679 }
 4680 
 4681 /*
 4682  * Process a diradd entry after its dependent inode has been written.
 4683  * This routine must be called with splbio interrupts blocked.
 4684  */
 4685 static void
 4686 diradd_inode_written(dap, inodedep)
 4687         struct diradd *dap;
 4688         struct inodedep *inodedep;
 4689 {
 4690         struct pagedep *pagedep;
 4691 
 4692         dap->da_state |= COMPLETE;
 4693         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 4694                 if (dap->da_state & DIRCHG)
 4695                         pagedep = dap->da_previous->dm_pagedep;
 4696                 else
 4697                         pagedep = dap->da_pagedep;
 4698                 LIST_REMOVE(dap, da_pdlist);
 4699                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 4700         }
 4701         WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 4702 }
 4703 
 4704 /*
 4705  * Handle the completion of a mkdir dependency.
 4706  */
 4707 static void
 4708 handle_written_mkdir(mkdir, type)
 4709         struct mkdir *mkdir;
 4710         int type;
 4711 {
 4712         struct diradd *dap;
 4713         struct pagedep *pagedep;
 4714 
 4715         if (mkdir->md_state != type)
 4716                 panic("handle_written_mkdir: bad type");
 4717         dap = mkdir->md_diradd;
 4718         dap->da_state &= ~type;
 4719         if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
 4720                 dap->da_state |= DEPCOMPLETE;
 4721         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 4722                 if (dap->da_state & DIRCHG)
 4723                         pagedep = dap->da_previous->dm_pagedep;
 4724                 else
 4725                         pagedep = dap->da_pagedep;
 4726                 LIST_REMOVE(dap, da_pdlist);
 4727                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 4728         }
 4729         LIST_REMOVE(mkdir, md_mkdirs);
 4730         WORKITEM_FREE(mkdir, D_MKDIR);
 4731 }
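
/*
 * A mkdir diradd starts with both MKDIR_PARENT (the parent's updated
 * link count must reach disk) and MKDIR_BODY (the new "." and ".."
 * block must reach disk) outstanding; the routine above clears one bit
 * per completed write and the dependency completes only when both are
 * gone.  A minimal sketch of that bookkeeping with stand-in flags:
 */
#include <assert.h>

#define SK_MKDIR_PARENT 0x01
#define SK_MKDIR_BODY   0x02
#define SK_DEPCOMPLETE  0x04

static int
sk_mkdir_written(int da_state, int type)
{
        da_state &= ~type;
        if ((da_state & (SK_MKDIR_PARENT | SK_MKDIR_BODY)) == 0)
                da_state |= SK_DEPCOMPLETE;
        return (da_state);
}

static void
sk_mkdir_example(void)
{
        int st = SK_MKDIR_PARENT | SK_MKDIR_BODY;

        st = sk_mkdir_written(st, SK_MKDIR_BODY);
        assert((st & SK_DEPCOMPLETE) == 0);     /* parent still pending */
        st = sk_mkdir_written(st, SK_MKDIR_PARENT);
        assert(st & SK_DEPCOMPLETE);            /* both writes done */
}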
 4732 
 4733 /*
 4734  * Called from within softdep_disk_write_complete above.
 4735  * A write operation was just completed. Removed inodes can
 4736  * now be freed and associated block pointers may be committed.
 4737  * Note that this routine is always called from interrupt level
 4738  * with further splbio interrupts blocked.
 4739  */
 4740 static int 
 4741 handle_written_filepage(pagedep, bp)
 4742         struct pagedep *pagedep;
 4743         struct buf *bp;         /* buffer containing the written page */
 4744 {
 4745         struct dirrem *dirrem;
 4746         struct diradd *dap, *nextdap;
 4747         struct direct *ep;
 4748         int i, chgs;
 4749 
 4750         if ((pagedep->pd_state & IOSTARTED) == 0)
 4751                 panic("handle_written_filepage: not started");
 4752         pagedep->pd_state &= ~IOSTARTED;
 4753         /*
 4754          * Process any directory removals that have been committed.
 4755          */
 4756         while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
 4757                 LIST_REMOVE(dirrem, dm_next);
 4758                 dirrem->dm_dirinum = pagedep->pd_ino;
 4759                 add_to_worklist(&dirrem->dm_list);
 4760         }
 4761         /*
 4762          * Free any directory additions that have been committed.
 4763          * If it is a newly allocated block, we have to wait until
 4764          * the on-disk directory inode claims the new block.
 4765          */
 4766         if ((pagedep->pd_state & NEWBLOCK) == 0)
 4767                 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 4768                         free_diradd(dap);
 4769         /*
 4770          * Uncommitted directory entries must be restored.
 4771          */
 4772         for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
 4773                 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
 4774                      dap = nextdap) {
 4775                         nextdap = LIST_NEXT(dap, da_pdlist);
 4776                         if (dap->da_state & ATTACHED)
 4777                                 panic("handle_written_filepage: attached");
 4778                         ep = (struct direct *)
 4779                             ((char *)bp->b_data + dap->da_offset);
 4780                         ep->d_ino = dap->da_newinum;
 4781                         dap->da_state &= ~UNDONE;
 4782                         dap->da_state |= ATTACHED;
 4783                         chgs = 1;
 4784                         /*
 4785                          * If the inode referenced by the directory has
 4786                          * been written out, then the dependency can be
 4787                          * moved to the pending list.
 4788                          */
 4789                         if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 4790                                 LIST_REMOVE(dap, da_pdlist);
 4791                                 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
 4792                                     da_pdlist);
 4793                         }
 4794                 }
 4795         }
 4796         /*
 4797          * If there were any rollbacks in the directory, then it must be
 4798          * marked dirty so that it will eventually get written back in
 4799          * its correct form.
 4800          */
 4801         if (chgs) {
 4802                 if ((bp->b_flags & B_DELWRI) == 0)
 4803                         stat_dir_entry++;
 4804                 bdirty(bp);
 4805                 return (1);
 4806         }
 4807         /*
 4808          * If we are not waiting for a new directory block to be
 4809          * claimed by its inode, then the pagedep will be freed.
 4810          * Otherwise it will remain to track any new entries on
 4811          * the page in case they are fsync'ed.
 4812          */
 4813         if ((pagedep->pd_state & NEWBLOCK) == 0) {
 4814                 LIST_REMOVE(pagedep, pd_hash);
 4815                 WORKITEM_FREE(pagedep, D_PAGEDEP);
 4816         }
 4817         return (0);
 4818 }
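
/*
 * handle_written_filepage() above is the roll-forward counterpart of
 * the initiate_write_filepage() sketch earlier: once the page is on
 * disk, each still-pending entry is restored to its new inode number
 * and flipped back from UNDONE to ATTACHED.  Reusing the illustrative
 * sk_* types from that sketch:
 */
static void
sk_redo_entry(struct sk_direct *ep, struct sk_diradd *dap)
{
        ep->d_ino = dap->da_newinum;
        dap->da_state &= ~SK_UNDONE;
        dap->da_state |= SK_ATTACHED;
}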
 4819 
 4820 /*
 4821  * Writing back in-core inode structures.
 4822  * 
 4823  * The filesystem only accesses an inode's contents when it occupies an
 4824  * "in-core" inode structure.  These "in-core" structures are separate from
 4825  * the page frames used to cache inode blocks.  Only the latter are
 4826  * transferred to/from the disk.  So, when the updated contents of the
 4827  * "in-core" inode structure are copied to the corresponding in-memory inode
 4828  * block, the dependencies are also transferred.  The following procedure is
 4829  * called when copying a dirty "in-core" inode to a cached inode block.
 4830  */
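
/*
 * An inode block caches several on-disk inodes, so the copy described
 * above must index into the block buffer to find this inode's slot.
 * A sketch of that addressing (the analogue of ino_to_fsbo(), which
 * selects the slot by inode number modulo inodes-per-block); the
 * 256-byte dinode here is an assumption for illustration:
 */
#include <stdint.h>

struct sk_dinode { uint8_t bytes[256]; };

static struct sk_dinode *
sk_inode_slot(void *blkdata, uint32_t inum, uint32_t inopb)
{
        /* Offset of this inode within its containing inode block. */
        return ((struct sk_dinode *)blkdata + inum % inopb);
}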
 4831 
 4832 /*
 4833  * Called when an inode is loaded from disk. If the effective link count
 4834  * differed from the actual link count when it was last flushed, then we
 4835  * need to ensure that the correct effective link count is put back.
 4836  */
 4837 void 
 4838 softdep_load_inodeblock(ip)
 4839         struct inode *ip;       /* the "in_core" copy of the inode */
 4840 {
 4841         struct inodedep *inodedep;
 4842 
 4843         /*
 4844          * Check for alternate nlink count.
 4845          */
 4846         ip->i_effnlink = ip->i_nlink;
 4847         ACQUIRE_LOCK(&lk);
 4848         if (inodedep_lookup(UFSTOVFS(ip->i_ump),
 4849             ip->i_number, 0, &inodedep) == 0) {
 4850                 FREE_LOCK(&lk);
 4851                 return;
 4852         }
 4853         ip->i_effnlink -= inodedep->id_nlinkdelta;
 4854         if (inodedep->id_state & SPACECOUNTED)
 4855                 ip->i_flag |= IN_SPACECOUNTED;
 4856         FREE_LOCK(&lk);
 4857 }
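
/*
 * The effective link count seen by the rest of the kernel is the
 * on-disk count minus the delta still pending in the inodedep, as
 * applied above.  A worked example with illustrative numbers: an
 * inode whose on-disk nlink is 3 with two uncommitted unlinks has an
 * effective count of 1.
 */
#include <assert.h>

static int
sk_effnlink(int nlink_ondisk, int nlinkdelta)
{
        return (nlink_ondisk - nlinkdelta);
}

static void
sk_effnlink_example(void)
{
        assert(sk_effnlink(3, 2) == 1);
}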
 4858 
 4859 /*
 4860  * This routine is called just before the "in-core" inode
 4861  * information is to be copied to the in-memory inode block.
 4862  * Recall that an inode block contains several inodes. If
 4863  * the force flag is set, then the dependencies will be
 4864  * cleared so that the update can always be made. Note that
 4865  * the buffer is locked when this routine is called, so we
 4866  * will never be in the middle of writing the inode block 
 4867  * to disk.
 4868  */
 4869 void 
 4870 softdep_update_inodeblock(ip, bp, waitfor)
 4871         struct inode *ip;       /* the "in_core" copy of the inode */
 4872         struct buf *bp;         /* the buffer containing the inode block */
 4873         int waitfor;            /* nonzero => update must be allowed */
 4874 {
 4875         struct inodedep *inodedep;
 4876         struct worklist *wk;
 4877         struct mount *mp;
 4878         struct buf *ibp;
 4879         int error;
 4880 
 4881         /*
 4882          * If the effective link count is not equal to the actual link
 4883          * count, then we must track the difference in an inodedep while
 4884          * the inode is (potentially) tossed out of the cache. Otherwise,
 4885          * if there is no existing inodedep, then there are no dependencies
 4886          * to track.
 4887          */
 4888         mp = UFSTOVFS(ip->i_ump);
 4889         ACQUIRE_LOCK(&lk);
 4890         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
 4891                 FREE_LOCK(&lk);
 4892                 if (ip->i_effnlink != ip->i_nlink)
 4893                         panic("softdep_update_inodeblock: bad link count");
 4894                 return;
 4895         }
 4896         if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
 4897                 panic("softdep_update_inodeblock: bad delta");
 4898         /*
 4899          * Changes have been initiated. Anything depending on these
 4900          * changes cannot occur until this inode has been written.
 4901          */
 4902         inodedep->id_state &= ~COMPLETE;
 4903         if ((inodedep->id_state & ONWORKLIST) == 0)
 4904                 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
 4905         /*
 4906          * Any new dependencies associated with the incore inode must 
 4907          * now be moved to the list associated with the buffer holding
 4908          * the in-memory copy of the inode. Once merged, process any
 4909          * allocdirects that are completed by the merger.
 4910          */
 4911         merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
 4912         if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
 4913                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
 4914         merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
 4915         if (!TAILQ_EMPTY(&inodedep->id_extupdt))
 4916                 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
 4917         /*
 4918          * Now that the inode has been pushed into the buffer, the
 4919          * operations dependent on the inode being written to disk
 4920          * can be moved to the id_bufwait so that they will be
 4921          * processed when the buffer I/O completes.
 4922          */
 4923         while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
 4924                 WORKLIST_REMOVE(wk);
 4925                 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
 4926         }
 4927         /*
 4928          * Newly allocated inodes cannot be written until the bitmap
 4929          * that allocates them has been written (indicated by
 4930          * DEPCOMPLETE being set in id_state). If we are doing a
 4931          * forced sync (e.g., an fsync on a file), we force the bitmap
 4932          * to be written so that the update can be done.
 4933          */
 4934         if (waitfor == 0) {
 4935                 FREE_LOCK(&lk);
 4936                 return;
 4937         }
 4938 retry:
 4939         if ((inodedep->id_state & DEPCOMPLETE) != 0) {
 4940                 FREE_LOCK(&lk);
 4941                 return;
 4942         }
 4943         ibp = inodedep->id_buf;
 4944         ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
 4945         if (ibp == NULL) {
 4946                 /*
 4947                  * If ibp came back as NULL, the dependency could have been
 4948                  * freed while we slept.  Look it up again, and check to see
 4949                  * that it has completed.
 4950                  */
 4951                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 4952                         goto retry;
 4953                 FREE_LOCK(&lk);
 4954                 return;
 4955         }
 4956         FREE_LOCK(&lk);
 4957         if ((error = bwrite(ibp)) != 0)
 4958                 softdep_error("softdep_update_inodeblock: bwrite", error);
 4959 }
 4960 
 4961 /*
 4962  * Merge a new inode dependency list (such as id_newinoupdt) into an
 4963  * old inode dependency list (such as id_inoupdt). This routine must be
 4964  * called with splbio interrupts blocked.
 4965  */
 4966 static void
 4967 merge_inode_lists(newlisthead, oldlisthead)
 4968         struct allocdirectlst *newlisthead;
 4969         struct allocdirectlst *oldlisthead;
 4970 {
 4971         struct allocdirect *listadp, *newadp;
 4972 
 4973         newadp = TAILQ_FIRST(newlisthead);
 4974         for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
 4975                 if (listadp->ad_lbn < newadp->ad_lbn) {
 4976                         listadp = TAILQ_NEXT(listadp, ad_next);
 4977                         continue;
 4978                 }
 4979                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
 4980                 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
 4981                 if (listadp->ad_lbn == newadp->ad_lbn) {
 4982                         allocdirect_merge(oldlisthead, newadp,
 4983                             listadp);
 4984                         listadp = newadp;
 4985                 }
 4986                 newadp = TAILQ_FIRST(newlisthead);
 4987         }
 4988         while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
 4989                 TAILQ_REMOVE(newlisthead, newadp, ad_next);
 4990                 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
 4991         }
 4992 }
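
/*
 * merge_inode_lists() above is a stable merge of two lbn-sorted
 * dependency lists in which same-lbn entries are coalesced (the newer
 * entry wins via allocdirect_merge()).  A sketch of the same shape on
 * plain sorted int arrays, where coalescing simply keeps the newer
 * value; sk_* names are stand-ins:
 */
#include <stddef.h>

static size_t
sk_merge_sorted(const int *olda, size_t no, const int *newa, size_t nn,
    int *out)
{
        size_t i = 0, j = 0, k = 0;

        while (i < no && j < nn) {
                if (olda[i] < newa[j])
                        out[k++] = olda[i++];
                else if (newa[j] < olda[i])
                        out[k++] = newa[j++];
                else {
                        out[k++] = newa[j++];   /* same lbn: newer wins */
                        i++;
                }
        }
        while (i < no)
                out[k++] = olda[i++];
        while (j < nn)
                out[k++] = newa[j++];
        return (k);
}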
 4993 
 4994 /*
 4995  * If we are doing an fsync, then we must ensure that any directory
 4996  * entries for the inode have been written after the inode gets to disk.
 4997  */
 4998 int
 4999 softdep_fsync(vp)
 5000         struct vnode *vp;       /* the "in_core" copy of the inode */
 5001 {
 5002         struct inodedep *inodedep;
 5003         struct pagedep *pagedep;
 5004         struct worklist *wk;
 5005         struct diradd *dap;
 5006         struct mount *mp;
 5007         struct vnode *pvp;
 5008         struct inode *ip;
 5009         struct buf *bp;
 5010         struct fs *fs;
 5011         struct thread *td = curthread;
 5012         int error, flushparent, pagedep_new_block;
 5013         ino_t parentino;
 5014         ufs_lbn_t lbn;
 5015 
 5016         ip = VTOI(vp);
 5017         fs = ip->i_fs;
 5018         mp = vp->v_mount;
 5019         ACQUIRE_LOCK(&lk);
 5020         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
 5021                 FREE_LOCK(&lk);
 5022                 return (0);
 5023         }
 5024         if (!LIST_EMPTY(&inodedep->id_inowait) ||
 5025             !LIST_EMPTY(&inodedep->id_bufwait) ||
 5026             !TAILQ_EMPTY(&inodedep->id_extupdt) ||
 5027             !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
 5028             !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
 5029             !TAILQ_EMPTY(&inodedep->id_newinoupdt))
 5030                 panic("softdep_fsync: pending ops");
 5031         for (error = 0, flushparent = 0; ; ) {
 5032                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
 5033                         break;
 5034                 if (wk->wk_type != D_DIRADD)
 5035                         panic("softdep_fsync: Unexpected type %s",
 5036                             TYPENAME(wk->wk_type));
 5037                 dap = WK_DIRADD(wk);
 5038                 /*
 5039                  * Flush our parent if this directory entry has a MKDIR_PARENT
 5040                  * dependency or is contained in a newly allocated block.
 5041                  */
 5042                 if (dap->da_state & DIRCHG)
 5043                         pagedep = dap->da_previous->dm_pagedep;
 5044                 else
 5045                         pagedep = dap->da_pagedep;
 5046                 parentino = pagedep->pd_ino;
 5047                 lbn = pagedep->pd_lbn;
 5048                 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
 5049                         panic("softdep_fsync: dirty");
 5050                 if ((dap->da_state & MKDIR_PARENT) ||
 5051                     (pagedep->pd_state & NEWBLOCK))
 5052                         flushparent = 1;
 5053                 else
 5054                         flushparent = 0;
 5055                 /*
 5056                  * If we are being fsync'ed as part of vgone'ing this vnode,
 5057                  * then we will not be able to release and recover the
 5058                  * vnode below, so we just have to give up on writing its
 5059                  * directory entry out. It will eventually be written, just
 5060                  * not now, but then the user was not asking to have it
 5061                  * written, so we are not breaking any promises.
 5062                  */
 5063                 if (vp->v_iflag & VI_DOOMED)
 5064                         break;
 5065                 /*
 5066                  * We prevent deadlock by always fetching inodes from the
 5067                  * root, moving down the directory tree. Thus, when fetching
 5068                  * our parent directory, we first try to get the lock. If
 5069                  * that fails, we must unlock ourselves before requesting
 5070                  * the lock on our parent. See the comment in ufs_lookup
 5071                  * for details on possible races.
 5072                  */
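                        /*
                         * Concretely: try the parent with LK_NOWAIT
                         * first; if that fails, drop our own lock, take
                         * the parent's lock blocking, then re-lock
                         * ourselves with LK_RETRY before continuing.
                         */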
 5073                 FREE_LOCK(&lk);
 5074                 if (ffs_vget(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
 5075                         VOP_UNLOCK(vp, 0, td);
 5076                         error = ffs_vget(mp, parentino, LK_EXCLUSIVE, &pvp);
 5077                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 5078                         if (error != 0)
 5079                                 return (error);
 5080                 }
 5081                 /*
 5082                  * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
 5083                  * that are contained in direct blocks will be resolved by 
 5084                  * doing a ffs_update. Pagedeps contained in indirect blocks
 5085                  * may require a complete sync'ing of the directory. So, we
 5086                  * try the cheap and fast ffs_update first, and if that fails,
 5087                  * then we do the slower ffs_syncvnode of the directory.
 5088                  */
 5089                 if (flushparent) {
 5090                         int locked;
 5091 
 5092                         if ((error = ffs_update(pvp, 1)) != 0) {
 5093                                 vput(pvp);
 5094                                 return (error);
 5095                         }
 5096                         ACQUIRE_LOCK(&lk);
 5097                         locked = 1;
 5098                         if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
 5099                                 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
 5100                                         if (wk->wk_type != D_DIRADD)
 5101                                                 panic("softdep_fsync: Unexpected type %s",
 5102                                                       TYPENAME(wk->wk_type));
 5103                                         dap = WK_DIRADD(wk);
 5104                                         if (dap->da_state & DIRCHG)
 5105                                                 pagedep = dap->da_previous->dm_pagedep;
 5106                                         else
 5107                                                 pagedep = dap->da_pagedep;
 5108                                         pagedep_new_block = pagedep->pd_state & NEWBLOCK;
 5109                                         FREE_LOCK(&lk);
 5110                                         locked = 0;
 5111                                         if (pagedep_new_block &&
 5112                                             (error = ffs_syncvnode(pvp, MNT_WAIT))) {
 5113                                                 vput(pvp);
 5114                                                 return (error);
 5115                                         }
 5116                                 }
 5117                         }
 5118                         if (locked)
 5119                                 FREE_LOCK(&lk);
 5120                 }
 5121                 /*
 5122                  * Flush directory page containing the inode's name.
 5123                  */
 5124                 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
 5125                     &bp);
 5126                 if (error == 0)
 5127                         error = bwrite(bp);
 5128                 else
 5129                         brelse(bp);
 5130                 vput(pvp);
 5131                 if (error != 0)
 5132                         return (error);
 5133                 ACQUIRE_LOCK(&lk);
 5134                 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
 5135                         break;
 5136         }
 5137         FREE_LOCK(&lk);
 5138         return (0);
 5139 }
 5140 
 5141 /*
 5142  * Flush all the dirty bitmaps associated with the block device
 5143  * before flushing the rest of the dirty blocks so as to reduce
 5144  * the number of dependencies that will have to be rolled back.
 5145  */
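       /*
        * The scan below restarts from the head of the dirty list after
        * each write, because bawrite() is issued with the list locks
        * dropped and the list may have changed in the meantime.
        */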
 5146 void
 5147 softdep_fsync_mountdev(vp)
 5148         struct vnode *vp;
 5149 {
 5150         struct buf *bp, *nbp;
 5151         struct worklist *wk;
 5152 
 5153         if (!vn_isdisk(vp, NULL))
 5154                 panic("softdep_fsync_mountdev: vnode not a disk");
 5155 restart:
 5156         ACQUIRE_LOCK(&lk);
 5157         VI_LOCK(vp);
 5158         TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 5159                 /* 
 5160                  * If it is already scheduled, skip to the next buffer.
 5161                  */
 5162                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
 5163                         continue;
 5164 
 5165                 if ((bp->b_flags & B_DELWRI) == 0)
 5166                         panic("softdep_fsync_mountdev: not dirty");
 5167                 /*
 5168                  * We are only interested in bitmaps with outstanding
 5169                  * dependencies.
 5170                  */
 5171                 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
 5172                     wk->wk_type != D_BMSAFEMAP ||
 5173                     (bp->b_vflags & BV_BKGRDINPROG)) {
 5174                         BUF_UNLOCK(bp);
 5175                         continue;
 5176                 }
 5177                 VI_UNLOCK(vp);
 5178                 FREE_LOCK(&lk);
 5179                 bremfree(bp);
 5180                 (void) bawrite(bp);
 5181                 goto restart;
 5182         }
 5183         FREE_LOCK(&lk);
 5184         drain_output(vp);
 5185         VI_UNLOCK(vp);
 5186 }
 5187 
 5188 /*
 5189  * This routine is called when we are trying to synchronously flush a
 5190  * file. This routine must eliminate any filesystem metadata dependencies
 5191  * so that the syncing routine can succeed by pushing the dirty blocks
 5192  * associated with the file. If any I/O errors occur, they are returned.
 5193  */
 5194 int
 5195 softdep_sync_metadata(struct vnode *vp)
 5196 {
 5197         struct pagedep *pagedep;
 5198         struct allocdirect *adp;
 5199         struct allocindir *aip;
 5200         struct buf *bp, *nbp;
 5201         struct worklist *wk;
 5202         int i, error, waitfor;
 5203 
 5204         if (!DOINGSOFTDEP(vp))
 5205                 return (0);
 5206         /*
 5207          * Ensure that any direct block dependencies have been cleared.
 5208          */
 5209         ACQUIRE_LOCK(&lk);
 5210         if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
 5211                 FREE_LOCK(&lk);
 5212                 return (error);
 5213         }
 5214         FREE_LOCK(&lk);
 5215         /*
 5216          * For most files, the only metadata dependencies are the
 5217          * cylinder group maps that allocate their inode or blocks.
 5218          * The block allocation dependencies can be found by traversing
 5219          * the dependency lists for any buffers that remain on their
 5220          * dirty buffer list. The inode allocation dependency will
 5221          * be resolved when the inode is updated with MNT_WAIT.
 5222          * This work is done in two passes. The first pass grabs most
 5223          * of the buffers and begins asynchronously writing them. The
 5224          * only way to wait for these asynchronous writes is to sleep
 5225          * on the filesystem vnode which may stay busy for a long time
 5226          * if the filesystem is active. So, instead, we make a second
 5227          * pass over the dependencies blocking on each write. In the
 5228          * usual case we will be blocking against a write that we
 5229          * initiated, so when it is done the dependency will have been
 5230          * resolved. Thus the second pass is expected to end quickly.
 5231          */
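                /*
                 * In outline: pass 1 runs with waitfor == MNT_NOWAIT and
                 * starts asynchronous writes with bawrite(); pass 2 reruns
                 * the same scan with waitfor == MNT_WAIT, blocking in
                 * bwrite() until each dependency buffer is on disk.
                 */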
 5232         waitfor = MNT_NOWAIT;
 5233 
 5234 top:
 5235         /*
 5236          * We must wait for any I/O in progress to finish so that
 5237          * all potential buffers on the dirty list will be visible.
 5238          */
 5239         VI_LOCK(vp);
 5240         drain_output(vp);
 5241         while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) {
 5242                 bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT);
 5243                 if (bp)
 5244                         break;
 5245         }
 5246         VI_UNLOCK(vp);
 5247         if (bp == NULL)
 5248                 return (0);
 5249 loop:
 5250         /* While syncing snapshots, we must allow recursive lookups */
 5251         bp->b_lock.lk_flags |= LK_CANRECURSE;
 5252         ACQUIRE_LOCK(&lk);
 5253         /*
 5254          * As we hold the buffer locked, none of its dependencies
 5255          * will disappear.
 5256          */
 5257         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 5258                 switch (wk->wk_type) {
 5259 
 5260                 case D_ALLOCDIRECT:
 5261                         adp = WK_ALLOCDIRECT(wk);
 5262                         if (adp->ad_state & DEPCOMPLETE)
 5263                                 continue;
 5264                         nbp = adp->ad_buf;
 5265                         nbp = getdirtybuf(nbp, &lk, waitfor);
 5266                         if (nbp == NULL)
 5267                                 continue;
 5268                         FREE_LOCK(&lk);
 5269                         if (waitfor == MNT_NOWAIT) {
 5270                                 bawrite(nbp);
 5271                         } else if ((error = bwrite(nbp)) != 0) {
 5272                                 break;
 5273                         }
 5274                         ACQUIRE_LOCK(&lk);
 5275                         continue;
 5276 
 5277                 case D_ALLOCINDIR:
 5278                         aip = WK_ALLOCINDIR(wk);
 5279                         if (aip->ai_state & DEPCOMPLETE)
 5280                                 continue;
 5281                         nbp = aip->ai_buf;
 5282                         nbp = getdirtybuf(nbp, &lk, waitfor);
 5283                         if (nbp == NULL)
 5284                                 continue;
 5285                         FREE_LOCK(&lk);
 5286                         if (waitfor == MNT_NOWAIT) {
 5287                                 bawrite(nbp);
 5288                         } else if ((error = bwrite(nbp)) != 0) {
 5289                                 break;
 5290                         }
 5291                         ACQUIRE_LOCK(&lk);
 5292                         continue;
 5293 
 5294                 case D_INDIRDEP:
 5295                 restart:
 5296 
 5297                         LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
 5298                                 if (aip->ai_state & DEPCOMPLETE)
 5299                                         continue;
 5300                                 nbp = aip->ai_buf;
 5301                                 nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
 5302                                 if (nbp == NULL)
 5303                                         goto restart;
 5304                                 FREE_LOCK(&lk);
 5305                                 if ((error = bwrite(nbp)) != 0) {
 5306                                         goto loop_end;
 5307                                 }
 5308                                 ACQUIRE_LOCK(&lk);
 5309                                 goto restart;
 5310                         }
 5311                         continue;
 5312 
 5313                 case D_INODEDEP:
 5314                         if ((error = flush_inodedep_deps(wk->wk_mp,
 5315                             WK_INODEDEP(wk)->id_ino)) != 0) {
 5316                                 FREE_LOCK(&lk);
 5317                                 break;
 5318                         }
 5319                         continue;
 5320 
 5321                 case D_PAGEDEP:
 5322                         /*
 5323                          * We are trying to sync a directory that may
 5324                          * have dependencies on both its own metadata
 5325                          * and/or dependencies on the inodes of any
 5326                          * recently allocated files. We walk its diradd
 5327                          * lists pushing out the associated inode.
 5328                          */
 5329                         pagedep = WK_PAGEDEP(wk);
 5330                         for (i = 0; i < DAHASHSZ; i++) {
  5331                         if (LIST_EMPTY(&pagedep->pd_diraddhd[i]))
 5332                                         continue;
 5333                                 if ((error =
 5334                                     flush_pagedep_deps(vp, wk->wk_mp,
 5335                                                 &pagedep->pd_diraddhd[i]))) {
 5336                                         FREE_LOCK(&lk);
 5337                                         goto loop_end;
 5338                                 }
 5339                         }
 5340                         continue;
 5341 
 5342                 case D_MKDIR:
 5343                         /*
 5344                          * This case should never happen if the vnode has
 5345                          * been properly sync'ed. However, if this function
 5346                          * is used at a place where the vnode has not yet
 5347                          * been sync'ed, this dependency can show up. So,
 5348                          * rather than panic, just flush it.
 5349                          */
 5350                         nbp = WK_MKDIR(wk)->md_buf;
 5351                         nbp = getdirtybuf(nbp, &lk, waitfor);
 5352                         if (nbp == NULL)
 5353                                 continue;
 5354                         FREE_LOCK(&lk);
 5355                         if (waitfor == MNT_NOWAIT) {
 5356                                 bawrite(nbp);
 5357                         } else if ((error = bwrite(nbp)) != 0) {
 5358                                 break;
 5359                         }
 5360                         ACQUIRE_LOCK(&lk);
 5361                         continue;
 5362 
 5363                 case D_BMSAFEMAP:
 5364                         /*
 5365                          * This case should never happen if the vnode has
 5366                          * been properly sync'ed. However, if this function
 5367                          * is used at a place where the vnode has not yet
 5368                          * been sync'ed, this dependency can show up. So,
 5369                          * rather than panic, just flush it.
 5370                          */
 5371                         nbp = WK_BMSAFEMAP(wk)->sm_buf;
 5372                         nbp = getdirtybuf(nbp, &lk, waitfor);
 5373                         if (nbp == NULL)
 5374                                 continue;
 5375                         FREE_LOCK(&lk);
 5376                         if (waitfor == MNT_NOWAIT) {
 5377                                 bawrite(nbp);
 5378                         } else if ((error = bwrite(nbp)) != 0) {
 5379                                 break;
 5380                         }
 5381                         ACQUIRE_LOCK(&lk);
 5382                         continue;
 5383 
 5384                 default:
 5385                         panic("softdep_sync_metadata: Unknown type %s",
 5386                             TYPENAME(wk->wk_type));
 5387                         /* NOTREACHED */
 5388                 }
 5389         loop_end:
  5390         /* We reach here only on error, with the lock released */
 5391                 if (error == 0)
 5392                         panic("softdep_sync_metadata: zero error");
 5393                 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
 5394                 bawrite(bp);
 5395                 return (error);
 5396         }
 5397         FREE_LOCK(&lk);
 5398         VI_LOCK(vp);
 5399         while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
 5400                 nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT);
 5401                 if (nbp)
 5402                         break;
 5403         }
 5404         VI_UNLOCK(vp);
 5405         bp->b_lock.lk_flags &= ~LK_CANRECURSE;
 5406         bawrite(bp);
 5407         if (nbp != NULL) {
 5408                 bp = nbp;
 5409                 goto loop;
 5410         }
 5411         /*
 5412          * The brief unlock is to allow any pent up dependency
 5413          * processing to be done. Then proceed with the second pass.
 5414          */
 5415         if (waitfor == MNT_NOWAIT) {
 5416                 waitfor = MNT_WAIT;
 5417                 goto top;
 5418         }
 5419 
 5420         /*
 5421          * If we have managed to get rid of all the dirty buffers,
 5422          * then we are done. For certain directories and block
 5423          * devices, we may need to do further work.
 5424          *
 5425          * We must wait for any I/O in progress to finish so that
 5426          * all potential buffers on the dirty list will be visible.
 5427          */
 5428         VI_LOCK(vp);
 5429         drain_output(vp);
 5430         VI_UNLOCK(vp);
 5431         return (0);
 5432 }
 5433 
 5434 /*
 5435  * Flush the dependencies associated with an inodedep.
 5436  * Called with splbio blocked.
 5437  */
 5438 static int
 5439 flush_inodedep_deps(mp, ino)
 5440         struct mount *mp;
 5441         ino_t ino;
 5442 {
 5443         struct inodedep *inodedep;
 5444         int error, waitfor;
 5445 
 5446         /*
 5447          * This work is done in two passes. The first pass grabs most
 5448          * of the buffers and begins asynchronously writing them. The
 5449          * only way to wait for these asynchronous writes is to sleep
 5450          * on the filesystem vnode which may stay busy for a long time
 5451          * if the filesystem is active. So, instead, we make a second
 5452          * pass over the dependencies blocking on each write. In the
 5453          * usual case we will be blocking against a write that we
 5454          * initiated, so when it is done the dependency will have been
 5455          * resolved. Thus the second pass is expected to end quickly.
 5456          * We give a brief window at the top of the loop to allow
 5457          * any pending I/O to complete.
 5458          */
 5459         for (error = 0, waitfor = MNT_NOWAIT; ; ) {
 5460                 if (error)
 5461                         return (error);
 5462                 FREE_LOCK(&lk);
 5463                 ACQUIRE_LOCK(&lk);
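                        /*
                         * This drop/reacquire pair is the "brief window"
                         * noted above that lets pending I/O completions
                         * and their dependency processing run.
                         */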
 5464                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 5465                         return (0);
 5466                 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
 5467                     flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
 5468                     flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
 5469                     flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
 5470                         continue;
 5471                 /*
  5472                  * If this was the MNT_WAIT pass, we are done; otherwise do pass 2.
 5473                  */
 5474                 if (waitfor == MNT_WAIT)
 5475                         break;
 5476                 waitfor = MNT_WAIT;
 5477         }
 5478         /*
 5479          * Try freeing inodedep in case all dependencies have been removed.
 5480          */
 5481         if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
 5482                 (void) free_inodedep(inodedep);
 5483         return (0);
 5484 }
 5485 
 5486 /*
 5487  * Flush an inode dependency list.
 5488  * Called with splbio blocked.
 5489  */
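       /*
        * A return of 1 means the scan stopped early, usually because
        * the lock was dropped to write or wait on a buffer (*errorp is
        * set if a bwrite failed), so the caller must rescan its lists;
        * 0 means a full pass completed.
        */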
 5490 static int
 5491 flush_deplist(listhead, waitfor, errorp)
 5492         struct allocdirectlst *listhead;
 5493         int waitfor;
 5494         int *errorp;
 5495 {
 5496         struct allocdirect *adp;
 5497         struct buf *bp;
 5498 
 5499         mtx_assert(&lk, MA_OWNED);
 5500         TAILQ_FOREACH(adp, listhead, ad_next) {
 5501                 if (adp->ad_state & DEPCOMPLETE)
 5502                         continue;
 5503                 bp = adp->ad_buf;
 5504                 bp = getdirtybuf(bp, &lk, waitfor);
 5505                 if (bp == NULL) {
 5506                         if (waitfor == MNT_NOWAIT)
 5507                                 continue;
 5508                         return (1);
 5509                 }
 5510                 FREE_LOCK(&lk);
 5511                 if (waitfor == MNT_NOWAIT) {
 5512                         bawrite(bp);
 5513                 } else if ((*errorp = bwrite(bp)) != 0) {
 5514                         ACQUIRE_LOCK(&lk);
 5515                         return (1);
 5516                 }
 5517                 ACQUIRE_LOCK(&lk);
 5518                 return (1);
 5519         }
 5520         return (0);
 5521 }
 5522 
 5523 /*
 5524  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
 5525  * Called with splbio blocked.
 5526  */
 5527 static int
 5528 flush_pagedep_deps(pvp, mp, diraddhdp)
 5529         struct vnode *pvp;
 5530         struct mount *mp;
 5531         struct diraddhd *diraddhdp;
 5532 {
 5533         struct inodedep *inodedep;
 5534         struct ufsmount *ump;
 5535         struct diradd *dap;
 5536         struct vnode *vp;
 5537         int error = 0;
 5538         struct buf *bp;
 5539         ino_t inum;
 5540         struct worklist *wk;
 5541 
 5542         ump = VFSTOUFS(mp);
 5543         while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
 5544                 /*
 5545                  * Flush ourselves if this directory entry
 5546                  * has a MKDIR_PARENT dependency.
 5547                  */
 5548                 if (dap->da_state & MKDIR_PARENT) {
 5549                         FREE_LOCK(&lk);
 5550                         if ((error = ffs_update(pvp, 1)) != 0)
 5551                                 break;
 5552                         ACQUIRE_LOCK(&lk);
 5553                         /*
 5554                          * If that cleared dependencies, go on to next.
 5555                          */
 5556                         if (dap != LIST_FIRST(diraddhdp))
 5557                                 continue;
 5558                         if (dap->da_state & MKDIR_PARENT)
 5559                                 panic("flush_pagedep_deps: MKDIR_PARENT");
 5560                 }
 5561                 /*
 5562                  * A newly allocated directory must have its "." and
 5563                  * ".." entries written out before its name can be
 5564                  * committed in its parent. We do not want or need
 5565                  * the full semantics of a synchronous ffs_syncvnode as
 5566                  * that may end up here again, once for each directory
 5567                  * level in the filesystem. Instead, we push the blocks
 5568                  * and wait for them to clear. We have to fsync twice
 5569                  * because the first call may choose to defer blocks
 5570                  * that still have dependencies, but deferral will
 5571                  * happen at most once.
 5572                  */
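                        /*
                         * In outline: two MNT_NOWAIT fsyncs push the "."
                         * and ".." block, drain_output() waits for those
                         * writes, and the loop below force-writes block 0
                         * only if it is still dirty with a D_MKDIR
                         * dependency attached.
                         */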
 5573                 inum = dap->da_newinum;
 5574                 if (dap->da_state & MKDIR_BODY) {
 5575                         FREE_LOCK(&lk);
 5576                         if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp)))
 5577                                 break;
  5578                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT)) ||
  5579                             (error = ffs_syncvnode(vp, MNT_NOWAIT))) {
 5580                                 vput(vp);
 5581                                 break;
 5582                         }
 5583                         VI_LOCK(vp);
 5584                         drain_output(vp);
 5585                         /*
 5586                          * If first block is still dirty with a D_MKDIR
 5587                          * dependency then it needs to be written now.
 5588                          */
 5589                         for (;;) {
 5590                                 error = 0;
 5591                                 bp = gbincore(&vp->v_bufobj, 0);
 5592                                 if (bp == NULL)
 5593                                         break;  /* First block not present */
 5594                                 error = BUF_LOCK(bp,
 5595                                                  LK_EXCLUSIVE |
 5596                                                  LK_SLEEPFAIL |
 5597                                                  LK_INTERLOCK,
 5598                                                  VI_MTX(vp));
 5599                                 VI_LOCK(vp);
 5600                                 if (error == ENOLCK)
 5601                                         continue;       /* Slept, retry */
 5602                                 if (error != 0)
 5603                                         break;          /* Failed */
 5604                                 if ((bp->b_flags & B_DELWRI) == 0) {
 5605                                         BUF_UNLOCK(bp);
 5606                                         break;  /* Buffer not dirty */
 5607                                 }
 5608                                 for (wk = LIST_FIRST(&bp->b_dep);
 5609                                      wk != NULL;
 5610                                      wk = LIST_NEXT(wk, wk_list))
 5611                                         if (wk->wk_type == D_MKDIR)
 5612                                                 break;
 5613                                 if (wk == NULL)
 5614                                         BUF_UNLOCK(bp); /* Dependency gone */
 5615                                 else {
 5616                                         /*
 5617                                          * D_MKDIR dependency remains,
 5618                                          * must write buffer to stable
 5619                                          * storage.
 5620                                          */
 5621                                         VI_UNLOCK(vp);
 5622                                         bremfree(bp);
 5623                                         error = bwrite(bp);
 5624                                         VI_LOCK(vp);
 5625                                 }
 5626                                 break;
 5627                         }
 5628                         VI_UNLOCK(vp);
 5629                         vput(vp);
 5630                         if (error != 0)
 5631                                 break;  /* Flushing of first block failed */
 5632                         ACQUIRE_LOCK(&lk);
 5633                         /*
 5634                          * If that cleared dependencies, go on to next.
 5635                          */
 5636                         if (dap != LIST_FIRST(diraddhdp))
 5637                                 continue;
 5638                         if (dap->da_state & MKDIR_BODY)
 5639                                 panic("flush_pagedep_deps: MKDIR_BODY");
 5640                 }
 5641                 /*
 5642                  * Flush the inode on which the directory entry depends.
 5643                  * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
 5644                  * the only remaining dependency is that the updated inode
 5645                  * count must get pushed to disk. The inode has already
 5646                  * been pushed into its inode buffer (via VOP_UPDATE) at
 5647                  * the time of the reference count change. So we need only
 5648                  * locate that buffer, ensure that there will be no rollback
 5649                  * caused by a bitmap dependency, then write the inode buffer.
 5650                  */
 5651 retry:
 5652                 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
 5653                         panic("flush_pagedep_deps: lost inode");
 5654                 /*
 5655                  * If the inode still has bitmap dependencies,
 5656                  * push them to disk.
 5657                  */
 5658                 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 5659                         bp = inodedep->id_buf;
 5660                         bp = getdirtybuf(bp, &lk, MNT_WAIT);
 5661                         if (bp == NULL)
 5662                                 goto retry;
 5663                         FREE_LOCK(&lk);
 5664                         if ((error = bwrite(bp)) != 0)
 5665                                 break;
 5666                         ACQUIRE_LOCK(&lk);
 5667                         if (dap != LIST_FIRST(diraddhdp))
 5668                                 continue;
 5669                 }
 5670                 /*
 5671                  * If the inode is still sitting in a buffer waiting
 5672                  * to be written, push it to disk.
 5673                  */
 5674                 FREE_LOCK(&lk);
 5675                 if ((error = bread(ump->um_devvp,
 5676                     fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
 5677                     (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
 5678                         brelse(bp);
 5679                         break;
 5680                 }
 5681                 if ((error = bwrite(bp)) != 0)
 5682                         break;
 5683                 ACQUIRE_LOCK(&lk);
 5684                 /*
 5685                  * If we have failed to get rid of all the dependencies
 5686                  * then something is seriously wrong.
 5687                  */
 5688                 if (dap == LIST_FIRST(diraddhdp))
 5689                         panic("flush_pagedep_deps: flush failed");
 5690         }
 5691         if (error)
 5692                 ACQUIRE_LOCK(&lk);
 5693         return (error);
 5694 }
 5695 
 5696 /*
 5697  * A large burst of file addition or deletion activity can drive the
 5698  * memory load excessively high. First attempt to slow things down
 5699  * using the techniques below. If that fails, this routine requests
 5700  * the offending operations to fall back to running synchronously
 5701  * until the memory load returns to a reasonable level.
 5702  */
 5703 int
 5704 softdep_slowdown(vp)
 5705         struct vnode *vp;
 5706 {
 5707         int max_softdeps_hard;
 5708 
 5709         ACQUIRE_LOCK(&lk);
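                /* The hard limit allows 10% headroom over max_softdeps. */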
 5710         max_softdeps_hard = max_softdeps * 11 / 10;
 5711         if (num_dirrem < max_softdeps_hard / 2 &&
 5712             num_inodedep < max_softdeps_hard &&
 5713             VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps) {
 5714                 FREE_LOCK(&lk);
 5715                 return (0);
 5716         }
 5717         if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
 5718                 softdep_speedup();
 5719         stat_sync_limit_hit += 1;
 5720         FREE_LOCK(&lk);
 5721         return (1);
 5722 }
 5723 
 5724 /*
 5725  * Called by the allocation routines when they are about to fail
 5726  * in the hope that we can free up some disk space.
 5727  * 
 5728  * First check to see if the work list has anything on it. If it has,
 5729  * clean up entries until we successfully free some space. Because this
 5730  * process holds inodes locked, we cannot handle any remove requests
 5731  * that might block on a locked inode as that could lead to deadlock.
 5732  * If the worklist yields no free space, encourage the syncer daemon
 5733  * to help us. In no event will we try for longer than tickdelay seconds.
 5734  */
 5735 int
 5736 softdep_request_cleanup(fs, vp)
 5737         struct fs *fs;
 5738         struct vnode *vp;
 5739 {
 5740         struct ufsmount *ump;
 5741         long starttime;
 5742         ufs2_daddr_t needed;
 5743         int error;
 5744 
 5745         ump = VTOI(vp)->i_ump;
 5746         mtx_assert(UFS_MTX(ump), MA_OWNED);
 5747         needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
 5748         starttime = time_second + tickdelay;
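                /* Despite its name, starttime is the deadline: now plus tickdelay. */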
 5749         /*
 5750          * If we are being called because of a process doing a
 5751          * copy-on-write, then it is not safe to update the vnode
 5752          * as we may recurse into the copy-on-write routine.
 5753          */
 5754         if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
 5755                 UFS_UNLOCK(ump);
 5756                 error = ffs_update(vp, 1);
 5757                 UFS_LOCK(ump);
 5758                 if (error != 0)
 5759                         return (0);
 5760         }
 5761         while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
 5762                 if (time_second > starttime)
 5763                         return (0);
 5764                 UFS_UNLOCK(ump);
 5765                 ACQUIRE_LOCK(&lk);
 5766                 if (ump->softdep_on_worklist > 0 &&
 5767                     process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
 5768                         stat_worklist_push += 1;
 5769                         FREE_LOCK(&lk);
 5770                         UFS_LOCK(ump);
 5771                         continue;
 5772                 }
 5773                 request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
 5774                 FREE_LOCK(&lk);
 5775                 UFS_LOCK(ump);
 5776         }
 5777         return (1);
 5778 }
 5779 
 5780 /*
 5781  * If memory utilization has gotten too high, deliberately slow things
 5782  * down and speed up the I/O processing.
 5783  */
 5784 extern struct thread *syncertd;
 5785 static int
 5786 request_cleanup(mp, resource)
 5787         struct mount *mp;
 5788         int resource;
 5789 {
 5790         struct thread *td = curthread;
 5791         struct ufsmount *ump;
 5792 
 5793         mtx_assert(&lk, MA_OWNED);
 5794         /*
 5795          * We never hold up the filesystem syncer or buf daemon.
 5796          */
 5797         if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
 5798                 return (0);
 5799         ump = VFSTOUFS(mp);
 5800         /*
 5801          * First check to see if the work list has gotten backlogged.
 5802          * If it has, co-opt this process to help clean up two entries.
 5803          * Because this process may hold inodes locked, we cannot
 5804          * handle any remove requests that might block on a locked
 5805          * inode as that could lead to deadlock.  We set TDP_SOFTDEP
 5806          * to avoid recursively processing the worklist.
 5807          */
 5808         if (ump->softdep_on_worklist > max_softdeps / 10) {
 5809                 td->td_pflags |= TDP_SOFTDEP;
 5810                 process_worklist_item(mp, LK_NOWAIT);
 5811                 process_worklist_item(mp, LK_NOWAIT);
 5812                 td->td_pflags &= ~TDP_SOFTDEP;
 5813                 stat_worklist_push += 2;
  5814                 return (1);
 5815         }
 5816         /*
 5817          * Next, we attempt to speed up the syncer process. If that
 5818          * is successful, then we allow the process to continue.
 5819          */
 5820         if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
  5821                 return (0);
 5822         /*
 5823          * If we are resource constrained on inode dependencies, try
 5824          * flushing some dirty inodes. Otherwise, we are constrained
 5825          * by file deletions, so try accelerating flushes of directories
 5826          * with removal dependencies. We would like to do the cleanup
 5827          * here, but we probably hold an inode locked at this point and 
 5828          * that might deadlock against one that we try to clean. So,
 5829          * the best that we can do is request the syncer daemon to do
 5830          * the cleanup for us.
 5831          */
 5832         switch (resource) {
 5833 
 5834         case FLUSH_INODES:
 5835                 stat_ino_limit_push += 1;
 5836                 req_clear_inodedeps += 1;
 5837                 stat_countp = &stat_ino_limit_hit;
 5838                 break;
 5839 
 5840         case FLUSH_REMOVE:
 5841         case FLUSH_REMOVE_WAIT:
 5842                 stat_blk_limit_push += 1;
 5843                 req_clear_remove += 1;
 5844                 stat_countp = &stat_blk_limit_hit;
 5845                 break;
 5846 
 5847         default:
 5848                 panic("request_cleanup: unknown type");
 5849         }
 5850         /*
 5851          * Hopefully the syncer daemon will catch up and awaken us.
 5852          * We wait at most tickdelay before proceeding in any case.
 5853          */
 5854         proc_waiting += 1;
 5855         if (handle.callout == NULL)
 5856                 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
 5857         msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
 5858         proc_waiting -= 1;
 5859         return (1);
 5860 }
 5861 
 5862 /*
 5863  * Awaken processes pausing in request_cleanup and clear proc_waiting
 5864  * to indicate that there is no longer a timer running.
 5865  */
 5866 static void
 5867 pause_timer(arg)
 5868         void *arg;
 5869 {
 5870 
 5871         ACQUIRE_LOCK(&lk);
 5872         *stat_countp += 1;
 5873         wakeup_one(&proc_waiting);
 5874         if (proc_waiting > 0)
 5875                 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
 5876         else
 5877                 handle.callout = NULL;
 5878         FREE_LOCK(&lk);
 5879 }
 5880 
 5881 /*
 5882  * Flush out a directory with at least one removal dependency in an effort to
 5883  * reduce the number of dirrem, freefile, and freeblks dependency structures.
 5884  */
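       /*
        * The static rotor "next" below resumes the hash-table scan where
        * the previous call left off, spreading the flushing work across
        * all pagedep buckets over successive calls.
        */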
 5885 static void
 5886 clear_remove(td)
 5887         struct thread *td;
 5888 {
 5889         struct pagedep_hashhead *pagedephd;
 5890         struct pagedep *pagedep;
 5891         static int next = 0;
 5892         struct mount *mp;
 5893         struct vnode *vp;
 5894         int error, cnt;
 5895         ino_t ino;
 5896 
 5897         mtx_assert(&lk, MA_OWNED);
 5898 
 5899         for (cnt = 0; cnt < pagedep_hash; cnt++) {
 5900                 pagedephd = &pagedep_hashtbl[next++];
 5901                 if (next >= pagedep_hash)
 5902                         next = 0;
 5903                 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
 5904                         if (LIST_EMPTY(&pagedep->pd_dirremhd))
 5905                                 continue;
 5906                         mp = pagedep->pd_list.wk_mp;
 5907                         ino = pagedep->pd_ino;
 5908                         if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
 5909                                 continue;
 5910                         FREE_LOCK(&lk);
 5911                         if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) {
 5912                                 softdep_error("clear_remove: vget", error);
 5913                                 vn_finished_write(mp);
 5914                                 ACQUIRE_LOCK(&lk);
 5915                                 return;
 5916                         }
 5917                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
 5918                                 softdep_error("clear_remove: fsync", error);
 5919                         VI_LOCK(vp);
 5920                         drain_output(vp);
 5921                         VI_UNLOCK(vp);
 5922                         vput(vp);
 5923                         vn_finished_write(mp);
 5924                         ACQUIRE_LOCK(&lk);
 5925                         return;
 5926                 }
 5927         }
 5928 }
 5929 
 5930 /*
 5931  * Clear out a block of dirty inodes in an effort to reduce
 5932  * the number of inodedep dependency structures.
 5933  */
 5934 static void
 5935 clear_inodedeps(td)
 5936         struct thread *td;
 5937 {
 5938         struct inodedep_hashhead *inodedephd;
 5939         struct inodedep *inodedep;
 5940         static int next = 0;
 5941         struct mount *mp;
 5942         struct vnode *vp;
 5943         struct fs *fs;
 5944         int error, cnt;
 5945         ino_t firstino, lastino, ino;
 5946 
 5947         mtx_assert(&lk, MA_OWNED);
 5948         /*
  5949  * Pick an inode dependency to be cleared, resuming the hash-table
  5950  * scan where the last call left off. We will then gather up all the
  5951  * inodes in its block that have dependencies and flush them out.
 5952          */
 5953         for (cnt = 0; cnt < inodedep_hash; cnt++) {
 5954                 inodedephd = &inodedep_hashtbl[next++];
 5955                 if (next >= inodedep_hash)
 5956                         next = 0;
 5957                 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
 5958                         break;
 5959         }
 5960         if (inodedep == NULL)
 5961                 return;
 5962         fs = inodedep->id_fs;
 5963         mp = inodedep->id_list.wk_mp;
 5964         /*
 5965          * Find the last inode in the block with dependencies.
 5966          */
 5967         firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
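               /*
                * INOPB(fs) is a power of two, so the mask above rounds
                * id_ino down to the first inode of its inode block.
                */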
 5968         for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
 5969                 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
 5970                         break;
 5971         /*
 5972          * Asynchronously push all but the last inode with dependencies.
 5973          * Synchronously push the last inode with dependencies to ensure
 5974          * that the inode block gets written to free up the inodedeps.
 5975          */
 5976         for (ino = firstino; ino <= lastino; ino++) {
 5977                 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 5978                         continue;
 5979                 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
 5980                         continue;
 5981                 FREE_LOCK(&lk);
 5982                 if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
 5983                         softdep_error("clear_inodedeps: vget", error);
 5984                         vn_finished_write(mp);
 5985                         ACQUIRE_LOCK(&lk);
 5986                         return;
 5987                 }
 5988                 if (ino == lastino) {
 5989                         if ((error = ffs_syncvnode(vp, MNT_WAIT)))
 5990                                 softdep_error("clear_inodedeps: fsync1", error);
 5991                 } else {
 5992                         if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
 5993                                 softdep_error("clear_inodedeps: fsync2", error);
 5994                         VI_LOCK(vp);
 5995                         drain_output(vp);
 5996                         VI_UNLOCK(vp);
 5997                 }
 5998                 vput(vp);
 5999                 vn_finished_write(mp);
 6000                 ACQUIRE_LOCK(&lk);
 6001         }
 6002 }
 6003 
 6004 /*
 6005  * Function to determine if the buffer has outstanding dependencies
 6006  * that will cause a roll-back if the buffer is written. If wantcount
 6007  * is set, return number of dependencies, otherwise just yes or no.
 6008  */
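       /*
        * With wantcount == 0 the walk below returns at the first
        * dependency found, so boolean callers pay for at most one hit.
        */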
 6009 static int
 6010 softdep_count_dependencies(bp, wantcount)
 6011         struct buf *bp;
 6012         int wantcount;
 6013 {
 6014         struct worklist *wk;
 6015         struct inodedep *inodedep;
 6016         struct indirdep *indirdep;
 6017         struct allocindir *aip;
 6018         struct pagedep *pagedep;
 6019         struct diradd *dap;
 6020         int i, retval;
 6021 
 6022         retval = 0;
 6023         ACQUIRE_LOCK(&lk);
 6024         LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 6025                 switch (wk->wk_type) {
 6026 
 6027                 case D_INODEDEP:
 6028                         inodedep = WK_INODEDEP(wk);
 6029                         if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 6030                                 /* bitmap allocation dependency */
 6031                                 retval += 1;
 6032                                 if (!wantcount)
 6033                                         goto out;
 6034                         }
 6035                         if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
 6036                                 /* direct block pointer dependency */
 6037                                 retval += 1;
 6038                                 if (!wantcount)
 6039                                         goto out;
 6040                         }
 6041                         if (TAILQ_FIRST(&inodedep->id_extupdt)) {
 6042                                 /* direct block pointer dependency */
 6043                                 retval += 1;
 6044                                 if (!wantcount)
 6045                                         goto out;
 6046                         }
 6047                         continue;
 6048 
 6049                 case D_INDIRDEP:
 6050                         indirdep = WK_INDIRDEP(wk);
 6051 
 6052                         LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
 6053                                 /* indirect block pointer dependency */
 6054                                 retval += 1;
 6055                                 if (!wantcount)
 6056                                         goto out;
 6057                         }
 6058                         continue;
 6059 
 6060                 case D_PAGEDEP:
 6061                         pagedep = WK_PAGEDEP(wk);
 6062                         for (i = 0; i < DAHASHSZ; i++) {
 6063 
 6064                                 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
 6065                                         /* directory entry dependency */
 6066                                         retval += 1;
 6067                                         if (!wantcount)
 6068                                                 goto out;
 6069                                 }
 6070                         }
 6071                         continue;
 6072 
 6073                 case D_BMSAFEMAP:
 6074                 case D_ALLOCDIRECT:
 6075                 case D_ALLOCINDIR:
 6076                 case D_MKDIR:
 6077                         /* never a dependency on these blocks */
 6078                         continue;
 6079 
 6080                 default:
  6081                 panic("softdep_count_dependencies: Unexpected type %s",
 6082                             TYPENAME(wk->wk_type));
 6083                         /* NOTREACHED */
 6084                 }
 6085         }
 6086 out:
 6087         FREE_LOCK(&lk);
  6088         return (retval);
 6089 }
 6090 
 6091 /*
 6092  * Acquire exclusive access to a buffer.
 6093  * Must be called with a locked mtx parameter.
 6094  * Return acquired buffer or NULL on failure.
 6095  */
 6096 static struct buf *
 6097 getdirtybuf(bp, mtx, waitfor)
 6098         struct buf *bp;
 6099         struct mtx *mtx;
 6100         int waitfor;
 6101 {
 6102         int error;
 6103 
 6104         mtx_assert(mtx, MA_OWNED);
 6105         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
 6106                 if (waitfor != MNT_WAIT)
 6107                         return (NULL);
 6108                 error = BUF_LOCK(bp,
 6109                     LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
 6110                 /*
  6111          * Even if we successfully acquire bp here, we have dropped
  6112          * mtx, which may violate our guarantee.
 6113                  */
 6114                 if (error == 0)
 6115                         BUF_UNLOCK(bp);
 6116                 else if (error != ENOLCK)
 6117                         panic("getdirtybuf: inconsistent lock: %d", error);
 6118                 mtx_lock(mtx);
 6119                 return (NULL);
 6120         }
 6121         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 6122                 if (mtx == &lk && waitfor == MNT_WAIT) {
 6123                         mtx_unlock(mtx);
 6124                         BO_LOCK(bp->b_bufobj);
 6125                         BUF_UNLOCK(bp);
 6126                         if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 6127                                 bp->b_vflags |= BV_BKGRDWAIT;
 6128                                 msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
 6129                                        PRIBIO | PDROP, "getbuf", 0);
 6130                         } else
 6131                                 BO_UNLOCK(bp->b_bufobj);
 6132                         mtx_lock(mtx);
 6133                         return (NULL);
 6134                 }
 6135                 BUF_UNLOCK(bp);
 6136                 if (waitfor != MNT_WAIT)
 6137                         return (NULL);
 6138                 /*
 6139                  * The mtx argument must be bp->b_vp's mutex in
 6140                  * this case.
 6141                  */
 6142 #ifdef  DEBUG_VFS_LOCKS
 6143                 if (bp->b_vp->v_type != VCHR)
 6144                         ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
 6145 #endif
 6146                 bp->b_vflags |= BV_BKGRDWAIT;
 6147                 msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
 6148                 return (NULL);
 6149         }
 6150         if ((bp->b_flags & B_DELWRI) == 0) {
 6151                 BUF_UNLOCK(bp);
 6152                 return (NULL);
 6153         }
 6154         bremfree(bp);
 6155         return (bp);
 6156 }
 6157 
 6158 
 6159 /*
 6160  * Check if it is safe to suspend the file system now.  On entry,
 6161  * the vnode interlock for devvp should be held.  Return 0 with
 6162  * the mount interlock held if the file system can be suspended now,
 6163  * otherwise return EAGAIN with the mount interlock held.
 6164  */
 6165 int
 6166 softdep_check_suspend(struct mount *mp,
 6167                       struct vnode *devvp,
 6168                       int softdep_deps,
 6169                       int softdep_accdeps,
 6170                       int secondary_writes,
 6171                       int secondary_accwrites)
 6172 {
 6173         struct bufobj *bo;
 6174         struct ufsmount *ump;
 6175         int error;
 6176 
 6177         ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
 6178         ump = VFSTOUFS(mp);
 6179         bo = &devvp->v_bufobj;
 6180 
 6181         for (;;) {
 6182                 if (!TRY_ACQUIRE_LOCK(&lk)) {
 6183                         VI_UNLOCK(devvp);
 6184                         ACQUIRE_LOCK(&lk);
 6185                         FREE_LOCK(&lk);
 6186                         VI_LOCK(devvp);
 6187                         continue;
 6188                 }
 6189                 if (!MNT_ITRYLOCK(mp)) {
 6190                         FREE_LOCK(&lk);
 6191                         VI_UNLOCK(devvp);
 6192                         MNT_ILOCK(mp);
 6193                         MNT_IUNLOCK(mp);
 6194                         VI_LOCK(devvp);
 6195                         continue;
 6196                 }
 6197                 if (mp->mnt_secondary_writes != 0) {
 6198                         FREE_LOCK(&lk);
 6199                         VI_UNLOCK(devvp);
 6200                         msleep(&mp->mnt_secondary_writes,
 6201                                MNT_MTX(mp),
 6202                                (PUSER - 1) | PDROP, "secwr", 0);
 6203                         VI_LOCK(devvp);
 6204                         continue;
 6205                 }
 6206                 break;
 6207         }
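               /*
                * On exit from the loop above we hold the softdep lock,
                * the mount interlock, and the devvp interlock, with no
                * secondary writes in progress; the lock set is
                * re-acquired from scratch after every sleep to avoid
                * deadlock.
                */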
 6208 
 6209         /*
 6210          * Reasons for needing more work before suspend:
 6211          * - Dirty buffers on devvp.
 6212          * - Softdep activity occurred after start of vnode sync loop
 6213          * - Secondary writes occurred after start of vnode sync loop
 6214          */
 6215         error = 0;
 6216         if (bo->bo_numoutput > 0 ||
 6217             bo->bo_dirty.bv_cnt > 0 ||
 6218             softdep_deps != 0 ||
 6219             ump->softdep_deps != 0 ||
 6220             softdep_accdeps != ump->softdep_accdeps ||
 6221             secondary_writes != 0 ||
 6222             mp->mnt_secondary_writes != 0 ||
 6223             secondary_accwrites != mp->mnt_secondary_accwrites)
 6224                 error = EAGAIN;
 6225         FREE_LOCK(&lk);
 6226         VI_UNLOCK(devvp);
 6227         return (error);
 6228 }
 6229 
 6230 
 6231 /*
 6232  * Get the number of dependency structures for the file system, both
 6233  * the current number and the total number allocated.  These will
 6234  * later be used to detect that softdep processing has occurred.
 6235  */
 6236 void
 6237 softdep_get_depcounts(struct mount *mp,
 6238                       int *softdep_depsp,
 6239                       int *softdep_accdepsp)
 6240 {
 6241         struct ufsmount *ump;
 6242 
 6243         ump = VFSTOUFS(mp);
 6244         ACQUIRE_LOCK(&lk);
 6245         *softdep_depsp = ump->softdep_deps;
 6246         *softdep_accdepsp = ump->softdep_accdeps;
 6247         FREE_LOCK(&lk);
 6248 }
 6249 
 6250 /*
 6251  * Wait for pending output on a vnode to complete.
 6252  * Must be called with vnode lock and interlock locked.
 6253  *
 6254  * XXX: Should just be a call to bufobj_wwait().
 6255  */
 6256 static void
 6257 drain_output(vp)
 6258         struct vnode *vp;
 6259 {
 6260         ASSERT_VOP_LOCKED(vp, "drain_output");
 6261         ASSERT_VI_LOCKED(vp, "drain_output");
 6262 
 6263         while (vp->v_bufobj.bo_numoutput) {
 6264                 vp->v_bufobj.bo_flag |= BO_WWAIT;
 6265                 msleep((caddr_t)&vp->v_bufobj.bo_numoutput,
 6266                     VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
 6267         }
 6268 }
 6269 
 6270 /*
 6271  * Called whenever a buffer that is being invalidated or reallocated
 6272  * contains dependencies. This should only happen if an I/O error has
 6273  * occurred. The routine is called with the buffer locked.
 6274  */ 
 6275 static void
 6276 softdep_deallocate_dependencies(bp)
 6277         struct buf *bp;
 6278 {
 6279 
 6280         if ((bp->b_ioflags & BIO_ERROR) == 0)
 6281                 panic("softdep_deallocate_dependencies: dangling deps");
 6282         softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
 6283         panic("softdep_deallocate_dependencies: unrecovered I/O error");
 6284 }
 6285 
 6286 /*
 6287  * Function to handle asynchronous write errors in the filesystem.
 6288  */
 6289 static void
 6290 softdep_error(func, error)
 6291         char *func;
 6292         int error;
 6293 {
 6294 
 6295         /* XXX should do something better! */
 6296         printf("%s: got error %d while accessing filesystem\n", func, error);
 6297 }
 6298 
 6299 #endif /* SOFTUPDATES */
