FreeBSD/Linux Kernel Cross Reference
sys/vfs/hammer/hammer_inode.c


    1 /*
    2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
    3  * 
    4  * This code is derived from software contributed to The DragonFly Project
    5  * by Matthew Dillon <dillon@backplane.com>
    6  * 
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in
   15  *    the documentation and/or other materials provided with the
   16  *    distribution.
   17  * 3. Neither the name of The DragonFly Project nor the names of its
   18  *    contributors may be used to endorse or promote products derived
   19  *    from this software without specific, prior written permission.
   20  * 
   21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
   25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  */
   34 
   35 #include "hammer.h"
   36 #include <vm/vm_extern.h>
   37 
   38 static int      hammer_unload_inode(struct hammer_inode *ip);
   39 static void     hammer_free_inode(hammer_inode_t ip);
   40 static void     hammer_flush_inode_core(hammer_inode_t ip,
   41                                         hammer_flush_group_t flg, int flags);
   42 static int      hammer_setup_child_callback(hammer_record_t rec, void *data);
   43 #if 0
   44 static int      hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
   45 #endif
   46 static int      hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
   47                                         hammer_flush_group_t flg);
   48 static int      hammer_setup_parent_inodes_helper(hammer_record_t record,
   49                                         int depth, hammer_flush_group_t flg);
   50 static void     hammer_inode_wakereclaims(hammer_inode_t ip);
   51 static struct hammer_inostats *hammer_inode_inostats(hammer_mount_t hmp,
   52                                         pid_t pid);
   53 
   54 #ifdef DEBUG_TRUNCATE
   55 extern struct hammer_inode *HammerTruncIp;
   56 #endif
   57 
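       /*
        * Rate limiter used with krateprintf() for occasional HAMMER
        * diagnostics; the initializer allows roughly one message per
        * second.
        */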
   58 struct krate hammer_gen_krate = { 1 };
   59 
   60 /*
   61  * RB-Tree support for inode structures
   62  */
   63 int
   64 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
   65 {
   66         if (ip1->obj_localization < ip2->obj_localization)
   67                 return(-1);
   68         if (ip1->obj_localization > ip2->obj_localization)
   69                 return(1);
   70         if (ip1->obj_id < ip2->obj_id)
   71                 return(-1);
   72         if (ip1->obj_id > ip2->obj_id)
   73                 return(1);
   74         if (ip1->obj_asof < ip2->obj_asof)
   75                 return(-1);
   76         if (ip1->obj_asof > ip2->obj_asof)
   77                 return(1);
   78         return(0);
   79 }
   80 
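       /*
        * RB-Tree support for inode structures ordered by REDO FIFO start
        * offset (used to track inodes with in-flight REDO records).
        */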
   81 int
   82 hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
   83 {
   84         if (ip1->redo_fifo_start < ip2->redo_fifo_start)
   85                 return(-1);
   86         if (ip1->redo_fifo_start > ip2->redo_fifo_start)
   87                 return(1);
   88         return(0);
   89 }
   90 
   91 /*
   92  * RB-Tree support for inode structures / special LOOKUP_INFO
   93  */
   94 static int
   95 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
   96 {
   97         if (info->obj_localization < ip->obj_localization)
   98                 return(-1);
   99         if (info->obj_localization > ip->obj_localization)
  100                 return(1);
  101         if (info->obj_id < ip->obj_id)
  102                 return(-1);
  103         if (info->obj_id > ip->obj_id)
  104                 return(1);
  105         if (info->obj_asof < ip->obj_asof)
  106                 return(-1);
  107         if (info->obj_asof > ip->obj_asof)
  108                 return(1);
  109         return(0);
  110 }
  111 
  112 /*
  113  * Used by hammer_scan_inode_snapshots() to locate all of an object's
  114  * snapshots.  Note that the asof field is not tested, which we can get
  115  * away with because it is the lowest-priority field.
  116  */
  117 static int
  118 hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
  119 {
  120         hammer_inode_info_t info = data;
  121 
  122         if (ip->obj_localization > info->obj_localization)
  123                 return(1);
  124         if (ip->obj_localization < info->obj_localization)
  125                 return(-1);
  126         if (ip->obj_id > info->obj_id)
  127                 return(1);
  128         if (ip->obj_id < info->obj_id)
  129                 return(-1);
  130         return(0);
  131 }
  132 
  133 /*
  134  * Used by hammer_unload_pseudofs() to locate all inodes associated with
  135  * a particular PFS.
  136  */
  137 static int
  138 hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
  139 {
  140         u_int32_t localization = *(u_int32_t *)data;
  141         if (ip->obj_localization > localization)
  142                 return(1);
  143         if (ip->obj_localization < localization)
  144                 return(-1);
  145         return(0);
  146 }
  147 
  148 /*
  149  * RB-Tree support for pseudofs structures
  150  */
  151 static int
  152 hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
  153 {
  154         if (p1->localization < p2->localization)
  155                 return(-1);
  156         if (p1->localization > p2->localization)
  157                 return(1);
  158         return(0);
  159 }
  160 
  161 
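       /*
        * Generate the red-black tree support functions (insert, remove,
        * lookup and scan) from the comparators defined above.
        */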
  162 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
  163 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
  164                 hammer_inode_info_cmp, hammer_inode_info_t);
  165 RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
  166              hammer_pfs_rb_compare, u_int32_t, localization);
  167 
  168 /*
  169  * The kernel is not actively referencing this vnode but is still holding
  170  * it cached.
  171  *
  172  * This is called from the frontend.
  173  *
  174  * MPALMOSTSAFE
  175  */
  176 int
  177 hammer_vop_inactive(struct vop_inactive_args *ap)
  178 {
  179         struct hammer_inode *ip = VTOI(ap->a_vp);
  180         hammer_mount_t hmp;
  181 
  182         /*
  183          * Degenerate case
  184          */
  185         if (ip == NULL) {
  186                 vrecycle(ap->a_vp);
  187                 return(0);
  188         }
  189 
  190         /*
  191          * If the inode no longer has visibility in the filesystem try to
  192          * recycle it immediately, even if the inode is dirty.  Recycling
  193          * it quickly allows the system to reclaim buffer cache and VM
  194          * resources which can matter a lot in a heavily loaded system.
  195          *
  196          * This can deadlock in vfsync() if we aren't careful.
  197          * 
  198          * Do not queue the inode to the flusher if we still have visibility,
  199          * otherwise namespace calls such as chmod will unnecessarily generate
  200          * multiple inode updates.
  201          */
  202         if (ip->ino_data.nlinks == 0) {
  203                 hmp = ip->hmp;
  204                 lwkt_gettoken(&hmp->fs_token);
  205                 hammer_inode_unloadable_check(ip, 0);
  206                 if (ip->flags & HAMMER_INODE_MODMASK)
  207                         hammer_flush_inode(ip, 0);
  208                 lwkt_reltoken(&hmp->fs_token);
  209                 vrecycle(ap->a_vp);
  210         }
  211         return(0);
  212 }
  213 
  214 /*
  215  * Release the vnode association.  This is typically (but not always)
  216  * the last reference on the inode.
  217  *
  218  * Once the association is lost we are on our own with regards to
  219  * flushing the inode.
  220  *
  221  * We must interlock ip->vp so hammer_get_vnode() can avoid races.
  222  */
  223 int
  224 hammer_vop_reclaim(struct vop_reclaim_args *ap)
  225 {
  226         struct hammer_inode *ip;
  227         hammer_mount_t hmp;
  228         struct vnode *vp;
  229 
  230         vp = ap->a_vp;
  231 
  232         if ((ip = vp->v_data) != NULL) {
  233                 hmp = ip->hmp;
  234                 lwkt_gettoken(&hmp->fs_token);
  235                 hammer_lock_ex(&ip->lock);
  236                 vp->v_data = NULL;
  237                 ip->vp = NULL;
  238 
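                       /*
                        * Account for the pending reclaim so inode
                        * instantiation can be throttled while too many
                        * reclaims are in progress (see
                        * hammer_inode_waitreclaims()).
                        */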
  239                 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
  240                         ++hammer_count_reclaims;
  241                         ++hmp->count_reclaims;
  242                         ip->flags |= HAMMER_INODE_RECLAIM;
  243                 }
  244                 hammer_unlock(&ip->lock);
  245                 vclrisdirty(vp);
  246                 hammer_rel_inode(ip, 1);
  247                 lwkt_reltoken(&hmp->fs_token);
  248         }
  249         return(0);
  250 }
  251 
  252 /*
  253  * Inform the kernel that the inode is dirty.  This will be checked
  254  * by vn_unlock().
  255  */
  256 void
  257 hammer_inode_dirty(struct hammer_inode *ip)
  258 {
  259         struct vnode *vp;
  260 
  261         if ((ip->flags & HAMMER_INODE_MODMASK) &&
  262             (vp = ip->vp) != NULL) {
  263                 vsetisdirty(vp);
  264         }
  265 }
  266 
  267 /*
  268  * Return a locked vnode for the specified inode.  The inode must be
  269  * referenced but NOT LOCKED on entry and will remain referenced on
  270  * return.
  271  *
  272  * Called from the frontend.
  273  */
  274 int
  275 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
  276 {
  277         hammer_mount_t hmp;
  278         struct vnode *vp;
  279         int error = 0;
  280         u_int8_t obj_type;
  281 
  282         hmp = ip->hmp;
  283 
  284         for (;;) {
  285                 if ((vp = ip->vp) == NULL) {
  286                         error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
  287                         if (error)
  288                                 break;
  289                         hammer_lock_ex(&ip->lock);
  290                         if (ip->vp != NULL) {
  291                                 hammer_unlock(&ip->lock);
  292                                 vp = *vpp;
  293                                 vp->v_type = VBAD;
  294                                 vx_put(vp);
  295                                 continue;
  296                         }
  297                         hammer_ref(&ip->lock);
  298                         vp = *vpp;
  299                         ip->vp = vp;
  300 
  301                         obj_type = ip->ino_data.obj_type;
  302                         vp->v_type = hammer_get_vnode_type(obj_type);
  303 
  304                         hammer_inode_wakereclaims(ip);
  305 
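                               /*
                                * Install type-specific vnode ops; device
                                * nodes also register their device alias.
                                */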
  306                         switch(ip->ino_data.obj_type) {
  307                         case HAMMER_OBJTYPE_CDEV:
  308                         case HAMMER_OBJTYPE_BDEV:
  309                                 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
  310                                 addaliasu(vp, ip->ino_data.rmajor,
  311                                           ip->ino_data.rminor);
  312                                 break;
  313                         case HAMMER_OBJTYPE_FIFO:
  314                                 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
  315                                 break;
  316                         case HAMMER_OBJTYPE_REGFILE:
  317                                 break;
  318                         default:
  319                                 break;
  320                         }
  321 
  322                         /*
  323                          * Only mark as the root vnode if the ip is not
  324                          * historical, otherwise the VFS cache will get
  325                          * confused.  The other half of the special handling
  326                          * is in hammer_vop_nlookupdotdot().
  327                          *
  328                          * Pseudo-filesystem roots can be accessed via
  329                          * non-root filesystem paths and setting VROOT may
  330                          * confuse the namecache.  Set VPFSROOT instead.
  331                          */
  332                         if (ip->obj_id == HAMMER_OBJID_ROOT &&
  333                             ip->obj_asof == hmp->asof) {
  334                                 if (ip->obj_localization == 0)
  335                                         vsetflags(vp, VROOT);
  336                                 else
  337                                         vsetflags(vp, VPFSROOT);
  338                         }
  339 
  340                         vp->v_data = (void *)ip;
  341                         /* vnode locked by getnewvnode() */
  342                         /* make related vnode dirty if inode dirty? */
  343                         hammer_unlock(&ip->lock);
  344                         if (vp->v_type == VREG) {
  345                                 vinitvmio(vp, ip->ino_data.size,
  346                                           hammer_blocksize(ip->ino_data.size),
  347                                           hammer_blockoff(ip->ino_data.size));
  348                         }
  349                         break;
  350                 }
  351 
  352                 /*
  353                  * Interlock vnode clearing.  This does not prevent the
  354                  * vnode from going into a reclaimed state but it does
  355                  * prevent it from being destroyed or reused so the vget()
  356                  * will properly fail.
  357                  */
  358                 hammer_lock_ex(&ip->lock);
  359                 if ((vp = ip->vp) == NULL) {
  360                         hammer_unlock(&ip->lock);
  361                         continue;
  362                 }
  363                 vhold(vp);
  364                 hammer_unlock(&ip->lock);
  365 
  366                 /*
  367                  * loop if the vget fails (aka races), or if the vp
  368                  * no longer matches ip->vp.
  369                  */
  370                 if (vget(vp, LK_EXCLUSIVE) == 0) {
  371                         if (vp == ip->vp) {
  372                                 vdrop(vp);
  373                                 break;
  374                         }
  375                         vput(vp);
  376                 }
  377                 vdrop(vp);
  378         }
  379         *vpp = vp;
  380         return(error);
  381 }
  382 
  383 /*
  384  * Locate all copies of the inode for obj_id compatible with the specified
   385  * asof, reference each one, and issue the related call-back.  This routine
   386  * is used for direct-io invalidation and does not create any new inodes.
  387  */
  388 void
  389 hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
  390                             int (*callback)(hammer_inode_t ip, void *data),
  391                             void *data)
  392 {
  393         hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
  394                                    hammer_inode_info_cmp_all_history,
  395                                    callback, iinfo);
  396 }
  397 
  398 /*
  399  * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
  400  * do not attach or detach the related vnode (use hammer_get_vnode() for
  401  * that).
  402  *
  403  * The flags argument is only applied for newly created inodes, and only
  404  * certain flags are inherited.
  405  *
  406  * Called from the frontend.
  407  */
  408 struct hammer_inode *
  409 hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
  410                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
  411                  int flags, int *errorp)
  412 {
  413         hammer_mount_t hmp = trans->hmp;
  414         struct hammer_node_cache *cachep;
  415         struct hammer_inode_info iinfo;
  416         struct hammer_cursor cursor;
  417         struct hammer_inode *ip;
  418 
  419 
  420         /*
  421          * Determine if we already have an inode cached.  If we do then
  422          * we are golden.
  423          *
  424          * If we find an inode with no vnode we have to mark the
  425          * transaction such that hammer_inode_waitreclaims() is
  426          * called later on to avoid building up an infinite number
   427          * of inodes.  Otherwise we can continue to add new inodes
   428          * faster than they can be disposed of, even with the tsleep
  429          * delay.
  430          *
  431          * If we find a dummy inode we return a failure so dounlink
  432          * (which does another lookup) doesn't try to mess with the
  433          * link count.  hammer_vop_nresolve() uses hammer_get_dummy_inode()
  434          * to ref dummy inodes.
  435          */
  436         iinfo.obj_id = obj_id;
  437         iinfo.obj_asof = asof;
  438         iinfo.obj_localization = localization;
  439 loop:
  440         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
  441         if (ip) {
  442                 if (ip->flags & HAMMER_INODE_DUMMY) {
  443                         *errorp = ENOENT;
  444                         return(NULL);
  445                 }
  446                 hammer_ref(&ip->lock);
  447                 *errorp = 0;
  448                 return(ip);
  449         }
  450 
  451         /*
  452          * Allocate a new inode structure and deal with races later.
  453          */
  454         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
  455         ++hammer_count_inodes;
  456         ++hmp->count_inodes;
  457         ip->obj_id = obj_id;
  458         ip->obj_asof = iinfo.obj_asof;
  459         ip->obj_localization = localization;
  460         ip->hmp = hmp;
  461         ip->flags = flags & HAMMER_INODE_RO;
  462         ip->cache[0].ip = ip;
  463         ip->cache[1].ip = ip;
  464         ip->cache[2].ip = ip;
  465         ip->cache[3].ip = ip;
  466         if (hmp->ronly)
  467                 ip->flags |= HAMMER_INODE_RO;
  468         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
  469                 0x7FFFFFFFFFFFFFFFLL;
  470         RB_INIT(&ip->rec_tree);
  471         TAILQ_INIT(&ip->target_list);
  472         hammer_ref(&ip->lock);
  473 
  474         /*
  475          * Locate the on-disk inode.  If this is a PFS root we always
  476          * access the current version of the root inode and (if it is not
  477          * a master) always access information under it with a snapshot
  478          * TID.
  479          *
  480          * We cache recent inode lookups in this directory in dip->cache[2].
  481          * If we can't find it we assume the inode we are looking for is
  482          * close to the directory inode.
  483          */
  484 retry:
  485         cachep = NULL;
  486         if (dip) {
  487                 if (dip->cache[2].node)
  488                         cachep = &dip->cache[2];
  489                 else
  490                         cachep = &dip->cache[0];
  491         }
  492         hammer_init_cursor(trans, &cursor, cachep, NULL);
  493         cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
  494         cursor.key_beg.obj_id = ip->obj_id;
  495         cursor.key_beg.key = 0;
  496         cursor.key_beg.create_tid = 0;
  497         cursor.key_beg.delete_tid = 0;
  498         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
  499         cursor.key_beg.obj_type = 0;
  500 
  501         cursor.asof = iinfo.obj_asof;
  502         cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
  503                        HAMMER_CURSOR_ASOF;
  504 
  505         *errorp = hammer_btree_lookup(&cursor);
  506         if (*errorp == EDEADLK) {
  507                 hammer_done_cursor(&cursor);
  508                 goto retry;
  509         }
  510 
  511         /*
  512          * On success the B-Tree lookup will hold the appropriate
  513          * buffer cache buffers and provide a pointer to the requested
  514          * information.  Copy the information to the in-memory inode
  515          * and cache the B-Tree node to improve future operations.
  516          */
  517         if (*errorp == 0) {
  518                 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
  519                 ip->ino_data = cursor.data->inode;
  520 
  521                 /*
  522                  * cache[0] tries to cache the location of the object inode.
  523                  * The assumption is that it is near the directory inode.
  524                  *
  525                  * cache[1] tries to cache the location of the object data.
  526                  * We might have something in the governing directory from
  527                  * scan optimizations (see the strategy code in
  528                  * hammer_vnops.c).
  529                  *
  530                  * We update dip->cache[2], if possible, with the location
  531                  * of the object inode for future directory shortcuts.
  532                  */
  533                 hammer_cache_node(&ip->cache[0], cursor.node);
  534                 if (dip) {
  535                         if (dip->cache[3].node) {
  536                                 hammer_cache_node(&ip->cache[1],
  537                                                   dip->cache[3].node);
  538                         }
  539                         hammer_cache_node(&dip->cache[2], cursor.node);
  540                 }
  541 
  542                 /*
  543                  * The file should not contain any data past the file size
  544                  * stored in the inode.  Setting save_trunc_off to the
  545                  * file size instead of max reduces B-Tree lookup overheads
  546                  * on append by allowing the flusher to avoid checking for
  547                  * record overwrites.
  548                  */
  549                 ip->save_trunc_off = ip->ino_data.size;
  550 
  551                 /*
  552                  * Locate and assign the pseudofs management structure to
  553                  * the inode.
  554                  */
  555                 if (dip && dip->obj_localization == ip->obj_localization) {
  556                         ip->pfsm = dip->pfsm;
  557                         hammer_ref(&ip->pfsm->lock);
  558                 } else {
  559                         ip->pfsm = hammer_load_pseudofs(trans,
  560                                                         ip->obj_localization,
  561                                                         errorp);
  562                         *errorp = 0;    /* ignore ENOENT */
  563                 }
  564         }
  565 
  566         /*
  567          * The inode is placed on the red-black tree and will be synced to
  568          * the media when flushed or by the filesystem sync.  If this races
  569          * another instantiation/lookup the insertion will fail.
  570          */
  571         if (*errorp == 0) {
  572                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
  573                         hammer_free_inode(ip);
  574                         hammer_done_cursor(&cursor);
  575                         goto loop;
  576                 }
  577                 ip->flags |= HAMMER_INODE_ONDISK;
  578         } else {
  579                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
  580                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
  581                         --hmp->rsv_inodes;
  582                 }
  583 
  584                 hammer_free_inode(ip);
  585                 ip = NULL;
  586         }
  587         hammer_done_cursor(&cursor);
  588 
  589         /*
   590          * NEWINODE is only set if the inode becomes dirty later;
  591          * setting it here just leads to unnecessary stalls.
  592          *
  593          * trans->flags |= HAMMER_TRANSF_NEWINODE;
  594          */
  595         return (ip);
  596 }
  597 
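       /*
        * Illustrative sketch (not part of the original source): a typical
        * frontend caller acquires a referenced inode and then attaches a
        * vnode to it, e.g.
        *
        *      ip = hammer_get_inode(&trans, dip, obj_id, asof,
        *                            localization, flags, &error);
        *      if (ip) {
        *              error = hammer_get_vnode(ip, &vp);
        *              hammer_rel_inode(ip, 0);
        *      }
        */
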
  598 /*
  599  * Get a dummy inode to placemark a broken directory entry.
  600  */
  601 struct hammer_inode *
  602 hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
  603                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
  604                  int flags, int *errorp)
  605 {
  606         hammer_mount_t hmp = trans->hmp;
  607         struct hammer_inode_info iinfo;
  608         struct hammer_inode *ip;
  609 
  610         /*
  611          * Determine if we already have an inode cached.  If we do then
  612          * we are golden.
  613          *
  614          * If we find an inode with no vnode we have to mark the
  615          * transaction such that hammer_inode_waitreclaims() is
  616          * called later on to avoid building up an infinite number
   617          * of inodes.  Otherwise we can continue to add new inodes
   618          * faster than they can be disposed of, even with the tsleep
  619          * delay.
  620          *
  621          * If we find a non-fake inode we return an error.  Only fake
  622          * inodes can be returned by this routine.
  623          */
  624         iinfo.obj_id = obj_id;
  625         iinfo.obj_asof = asof;
  626         iinfo.obj_localization = localization;
  627 loop:
  628         *errorp = 0;
  629         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
  630         if (ip) {
  631                 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
  632                         *errorp = ENOENT;
  633                         return(NULL);
  634                 }
  635                 hammer_ref(&ip->lock);
  636                 return(ip);
  637         }
  638 
  639         /*
  640          * Allocate a new inode structure and deal with races later.
  641          */
  642         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
  643         ++hammer_count_inodes;
  644         ++hmp->count_inodes;
  645         ip->obj_id = obj_id;
  646         ip->obj_asof = iinfo.obj_asof;
  647         ip->obj_localization = localization;
  648         ip->hmp = hmp;
  649         ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
  650         ip->cache[0].ip = ip;
  651         ip->cache[1].ip = ip;
  652         ip->cache[2].ip = ip;
  653         ip->cache[3].ip = ip;
  654         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
  655                 0x7FFFFFFFFFFFFFFFLL;
  656         RB_INIT(&ip->rec_tree);
  657         TAILQ_INIT(&ip->target_list);
  658         hammer_ref(&ip->lock);
  659 
  660         /*
  661          * Populate the dummy inode.  Leave everything zero'd out.
  662          *
  663          * (ip->ino_leaf and ip->ino_data)
  664          *
  665          * Make the dummy inode a FIFO object which most copy programs
  666          * will properly ignore.
  667          */
  668         ip->save_trunc_off = ip->ino_data.size;
  669         ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
  670 
  671         /*
  672          * Locate and assign the pseudofs management structure to
  673          * the inode.
  674          */
  675         if (dip && dip->obj_localization == ip->obj_localization) {
  676                 ip->pfsm = dip->pfsm;
  677                 hammer_ref(&ip->pfsm->lock);
  678         } else {
  679                 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
  680                                                 errorp);
  681                 *errorp = 0;    /* ignore ENOENT */
  682         }
  683 
  684         /*
  685          * The inode is placed on the red-black tree and will be synced to
  686          * the media when flushed or by the filesystem sync.  If this races
  687          * another instantiation/lookup the insertion will fail.
  688          *
  689          * NOTE: Do not set HAMMER_INODE_ONDISK.  The inode is a fake.
  690          */
  691         if (*errorp == 0) {
  692                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
  693                         hammer_free_inode(ip);
  694                         goto loop;
  695                 }
  696         } else {
  697                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
  698                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
  699                         --hmp->rsv_inodes;
  700                 }
  701                 hammer_free_inode(ip);
  702                 ip = NULL;
  703         }
  704         trans->flags |= HAMMER_TRANSF_NEWINODE;
  705         return (ip);
  706 }
  707 
  708 /*
  709  * Return a referenced inode only if it is in our inode cache.
  710  *
  711  * Dummy inodes do not count.
  712  */
  713 struct hammer_inode *
  714 hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
  715                   hammer_tid_t asof, u_int32_t localization)
  716 {
  717         hammer_mount_t hmp = trans->hmp;
  718         struct hammer_inode_info iinfo;
  719         struct hammer_inode *ip;
  720 
  721         iinfo.obj_id = obj_id;
  722         iinfo.obj_asof = asof;
  723         iinfo.obj_localization = localization;
  724 
  725         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
  726         if (ip) {
  727                 if (ip->flags & HAMMER_INODE_DUMMY)
  728                         ip = NULL;
  729                 else
  730                         hammer_ref(&ip->lock);
  731         }
  732         return(ip);
  733 }
  734 
  735 /*
  736  * Create a new filesystem object, returning the inode in *ipp.  The
  737  * returned inode will be referenced.  The inode is created in-memory.
  738  *
  739  * If pfsm is non-NULL the caller wishes to create the root inode for
  740  * a master PFS.
  741  */
  742 int
  743 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
  744                     struct ucred *cred,
  745                     hammer_inode_t dip, const char *name, int namelen,
  746                     hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
  747 {
  748         hammer_mount_t hmp;
  749         hammer_inode_t ip;
  750         uid_t xuid;
  751         int error;
  752         int64_t namekey;
  753         u_int32_t dummy;
  754 
  755         hmp = trans->hmp;
  756 
  757         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
  758         ++hammer_count_inodes;
  759         ++hmp->count_inodes;
  760         trans->flags |= HAMMER_TRANSF_NEWINODE;
  761 
  762         if (pfsm) {
  763                 KKASSERT(pfsm->localization != 0);
  764                 ip->obj_id = HAMMER_OBJID_ROOT;
  765                 ip->obj_localization = pfsm->localization;
  766         } else {
  767                 KKASSERT(dip != NULL);
  768                 namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
  769                 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
  770                 ip->obj_localization = dip->obj_localization;
  771         }
  772 
  773         KKASSERT(ip->obj_id != 0);
  774         ip->obj_asof = hmp->asof;
  775         ip->hmp = hmp;
  776         ip->flush_state = HAMMER_FST_IDLE;
  777         ip->flags = HAMMER_INODE_DDIRTY |
  778                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
  779         ip->cache[0].ip = ip;
  780         ip->cache[1].ip = ip;
  781         ip->cache[2].ip = ip;
  782         ip->cache[3].ip = ip;
  783 
  784         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
  785         /* ip->save_trunc_off = 0; (already zero) */
  786         RB_INIT(&ip->rec_tree);
  787         TAILQ_INIT(&ip->target_list);
  788 
  789         ip->ino_data.atime = trans->time;
  790         ip->ino_data.mtime = trans->time;
  791         ip->ino_data.size = 0;
  792         ip->ino_data.nlinks = 0;
  793 
  794         /*
  795          * A nohistory designator on the parent directory is inherited by
  796          * the child.  We will do this even for pseudo-fs creation... the
  797          * sysad can turn it off.
  798          */
  799         if (dip) {
  800                 ip->ino_data.uflags = dip->ino_data.uflags &
  801                                       (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
  802         }
  803 
  804         ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
  805         ip->ino_leaf.base.localization = ip->obj_localization +
  806                                          HAMMER_LOCALIZE_INODE;
  807         ip->ino_leaf.base.obj_id = ip->obj_id;
  808         ip->ino_leaf.base.key = 0;
  809         ip->ino_leaf.base.create_tid = 0;
  810         ip->ino_leaf.base.delete_tid = 0;
  811         ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
  812         ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
  813 
  814         ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
  815         ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
  816         ip->ino_data.mode = vap->va_mode;
  817         ip->ino_data.ctime = trans->time;
  818 
  819         /*
  820          * If we are running version 2 or greater directory entries are
  821          * inode-localized instead of data-localized.
  822          */
  823         if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
  824                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
  825                         ip->ino_data.cap_flags |=
  826                                 HAMMER_INODE_CAP_DIR_LOCAL_INO;
  827                 }
  828         }
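               /*
                * Version 6 and later volumes use the newer directory hash
                * algorithm (ALG1) for new directories.
                */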
  829         if (trans->hmp->version >= HAMMER_VOL_VERSION_SIX) {
  830                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
  831                         ip->ino_data.cap_flags |=
  832                                 HAMMER_INODE_CAP_DIRHASH_ALG1;
  833                 }
  834         }
  835 
  836         /*
  837          * Setup the ".." pointer.  This only needs to be done for directories
  838          * but we do it for all objects as a recovery aid.
  839          */
  840         if (dip)
  841                 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
  842 #if 0
  843         /*
  844          * The parent_obj_localization field only applies to pseudo-fs roots.
  845          * XXX this is no longer applicable, PFSs are no longer directly
  846          * tied into the parent's directory structure.
  847          */
  848         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
  849             ip->obj_id == HAMMER_OBJID_ROOT) {
  850                 ip->ino_data.ext.obj.parent_obj_localization = 
  851                                                 dip->obj_localization;
  852         }
  853 #endif
  854 
  855         switch(ip->ino_leaf.base.obj_type) {
  856         case HAMMER_OBJTYPE_CDEV:
  857         case HAMMER_OBJTYPE_BDEV:
  858                 ip->ino_data.rmajor = vap->va_rmajor;
  859                 ip->ino_data.rminor = vap->va_rminor;
  860                 break;
  861         default:
  862                 break;
  863         }
  864 
  865         /*
  866          * Calculate default uid/gid and overwrite with information from
  867          * the vap.
  868          */
  869         if (dip) {
  870                 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
  871                 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
  872                                              xuid, cred, &vap->va_mode);
  873         } else {
  874                 xuid = 0;
  875         }
  876         ip->ino_data.mode = vap->va_mode;
  877 
  878         if (vap->va_vaflags & VA_UID_UUID_VALID)
  879                 ip->ino_data.uid = vap->va_uid_uuid;
  880         else if (vap->va_uid != (uid_t)VNOVAL)
  881                 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
  882         else
  883                 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
  884 
  885         if (vap->va_vaflags & VA_GID_UUID_VALID)
  886                 ip->ino_data.gid = vap->va_gid_uuid;
  887         else if (vap->va_gid != (gid_t)VNOVAL)
  888                 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
  889         else if (dip)
  890                 ip->ino_data.gid = dip->ino_data.gid;
  891 
  892         hammer_ref(&ip->lock);
  893 
  894         if (pfsm) {
  895                 ip->pfsm = pfsm;
  896                 hammer_ref(&pfsm->lock);
  897                 error = 0;
  898         } else if (dip->obj_localization == ip->obj_localization) {
  899                 ip->pfsm = dip->pfsm;
  900                 hammer_ref(&ip->pfsm->lock);
  901                 error = 0;
  902         } else {
  903                 ip->pfsm = hammer_load_pseudofs(trans,
  904                                                 ip->obj_localization,
  905                                                 &error);
  906                 error = 0;      /* ignore ENOENT */
  907         }
  908 
  909         if (error) {
  910                 hammer_free_inode(ip);
  911                 ip = NULL;
  912         } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
  913                 panic("hammer_create_inode: duplicate obj_id %llx",
  914                       (long long)ip->obj_id);
  915                 /* not reached */
  916                 hammer_free_inode(ip);
  917         }
  918         *ipp = ip;
  919         return(error);
  920 }
  921 
  922 /*
  923  * Final cleanup / freeing of an inode structure
  924  */
  925 static void
  926 hammer_free_inode(hammer_inode_t ip)
  927 {
  928         struct hammer_mount *hmp;
  929 
  930         hmp = ip->hmp;
  931         KKASSERT(hammer_oneref(&ip->lock));
  932         hammer_uncache_node(&ip->cache[0]);
  933         hammer_uncache_node(&ip->cache[1]);
  934         hammer_uncache_node(&ip->cache[2]);
  935         hammer_uncache_node(&ip->cache[3]);
  936         hammer_inode_wakereclaims(ip);
  937         if (ip->objid_cache)
  938                 hammer_clear_objid(ip);
  939         --hammer_count_inodes;
  940         --hmp->count_inodes;
  941         if (ip->pfsm) {
  942                 hammer_rel_pseudofs(hmp, ip->pfsm);
  943                 ip->pfsm = NULL;
  944         }
  945         kfree(ip, hmp->m_inodes);
  946         ip = NULL;
  947 }
  948 
  949 /*
  950  * Retrieve pseudo-fs data.  NULL will never be returned.
  951  *
  952  * If an error occurs *errorp will be set and a default template is returned,
  953  * otherwise *errorp is set to 0.  Typically when an error occurs it will
  954  * be ENOENT.
  955  */
  956 hammer_pseudofs_inmem_t
  957 hammer_load_pseudofs(hammer_transaction_t trans,
  958                      u_int32_t localization, int *errorp)
  959 {
  960         hammer_mount_t hmp = trans->hmp;
  961         hammer_inode_t ip;
  962         hammer_pseudofs_inmem_t pfsm;
  963         struct hammer_cursor cursor;
  964         int bytes;
  965 
  966 retry:
  967         pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
  968         if (pfsm) {
  969                 hammer_ref(&pfsm->lock);
  970                 *errorp = 0;
  971                 return(pfsm);
  972         }
  973 
  974         /*
  975          * PFS records are stored in the root inode (not the PFS root inode,
  976          * but the real root).  Avoid an infinite recursion if loading
  977          * the PFS for the real root.
  978          */
  979         if (localization) {
  980                 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
  981                                       HAMMER_MAX_TID,
  982                                       HAMMER_DEF_LOCALIZATION, 0, errorp);
  983         } else {
  984                 ip = NULL;
  985         }
  986 
  987         pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
  988         pfsm->localization = localization;
  989         pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
  990         pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
  991 
  992         hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
  993         cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
  994                                       HAMMER_LOCALIZE_MISC;
  995         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
  996         cursor.key_beg.create_tid = 0;
  997         cursor.key_beg.delete_tid = 0;
  998         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
  999         cursor.key_beg.obj_type = 0;
 1000         cursor.key_beg.key = localization;
 1001         cursor.asof = HAMMER_MAX_TID;
 1002         cursor.flags |= HAMMER_CURSOR_ASOF;
 1003 
 1004         if (ip)
 1005                 *errorp = hammer_ip_lookup(&cursor);
 1006         else
 1007                 *errorp = hammer_btree_lookup(&cursor);
 1008         if (*errorp == 0) {
 1009                 *errorp = hammer_ip_resolve_data(&cursor);
 1010                 if (*errorp == 0) {
 1011                         if (cursor.data->pfsd.mirror_flags &
 1012                             HAMMER_PFSD_DELETED) {
 1013                                 *errorp = ENOENT;
 1014                         } else {
 1015                                 bytes = cursor.leaf->data_len;
 1016                                 if (bytes > sizeof(pfsm->pfsd))
 1017                                         bytes = sizeof(pfsm->pfsd);
 1018                                 bcopy(cursor.data, &pfsm->pfsd, bytes);
 1019                         }
 1020                 }
 1021         }
 1022         hammer_done_cursor(&cursor);
 1023 
 1024         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
 1025         hammer_ref(&pfsm->lock);
 1026         if (ip)
 1027                 hammer_rel_inode(ip, 0);
 1028         if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
 1029                 kfree(pfsm, hmp->m_misc);
 1030                 goto retry;
 1031         }
 1032         return(pfsm);
 1033 }
 1034 
 1035 /*
 1036  * Store pseudo-fs data.  The backend will automatically delete any prior
 1037  * on-disk pseudo-fs data but we have to delete in-memory versions.
 1038  */
 1039 int
 1040 hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
 1041 {
 1042         struct hammer_cursor cursor;
 1043         hammer_record_t record;
 1044         hammer_inode_t ip;
 1045         int error;
 1046 
 1047         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
 1048                               HAMMER_DEF_LOCALIZATION, 0, &error);
 1049 retry:
 1050         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
 1051         hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
 1052         cursor.key_beg.localization = ip->obj_localization +
 1053                                       HAMMER_LOCALIZE_MISC;
 1054         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
 1055         cursor.key_beg.create_tid = 0;
 1056         cursor.key_beg.delete_tid = 0;
 1057         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
 1058         cursor.key_beg.obj_type = 0;
 1059         cursor.key_beg.key = pfsm->localization;
 1060         cursor.asof = HAMMER_MAX_TID;
 1061         cursor.flags |= HAMMER_CURSOR_ASOF;
 1062 
 1063         /*
 1064          * Replace any in-memory version of the record.
 1065          */
 1066         error = hammer_ip_lookup(&cursor);
 1067         if (error == 0 && hammer_cursor_inmem(&cursor)) {
 1068                 record = cursor.iprec;
 1069                 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
 1070                         KKASSERT(cursor.deadlk_rec == NULL);
 1071                         hammer_ref(&record->lock);
 1072                         cursor.deadlk_rec = record;
 1073                         error = EDEADLK;
 1074                 } else {
 1075                         record->flags |= HAMMER_RECF_DELETED_FE;
 1076                         error = 0;
 1077                 }
 1078         }
 1079 
 1080         /*
 1081          * Allocate replacement general record.  The backend flush will
 1082          * delete any on-disk version of the record.
 1083          */
 1084         if (error == 0 || error == ENOENT) {
 1085                 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
 1086                 record->type = HAMMER_MEM_RECORD_GENERAL;
 1087 
 1088                 record->leaf.base.localization = ip->obj_localization +
 1089                                                  HAMMER_LOCALIZE_MISC;
 1090                 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
 1091                 record->leaf.base.key = pfsm->localization;
 1092                 record->leaf.data_len = sizeof(pfsm->pfsd);
 1093                 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
 1094                 error = hammer_ip_add_record(trans, record);
 1095         }
 1096         hammer_done_cursor(&cursor);
 1097         if (error == EDEADLK)
 1098                 goto retry;
 1099         hammer_rel_inode(ip, 0);
 1100         return(error);
 1101 }
 1102 
 1103 /*
  1104  * Create a root directory for a PFS if one does not already exist.
 1105  *
 1106  * The PFS root stands alone so we must also bump the nlinks count
 1107  * to prevent it from being destroyed on release.
 1108  */
 1109 int
 1110 hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
 1111                        hammer_pseudofs_inmem_t pfsm)
 1112 {
 1113         hammer_inode_t ip;
 1114         struct vattr vap;
 1115         int error;
 1116 
 1117         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
 1118                               pfsm->localization, 0, &error);
 1119         if (ip == NULL) {
 1120                 vattr_null(&vap);
 1121                 vap.va_mode = 0755;
 1122                 vap.va_type = VDIR;
 1123                 error = hammer_create_inode(trans, &vap, cred,
 1124                                             NULL, NULL, 0,
 1125                                             pfsm, &ip);
 1126                 if (error == 0) {
 1127                         ++ip->ino_data.nlinks;
 1128                         hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
 1129                 }
 1130         }
 1131         if (ip)
 1132                 hammer_rel_inode(ip, 0);
 1133         return(error);
 1134 }
 1135 
 1136 /*
 1137  * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
 1138  * if we are unable to disassociate all the inodes.
 1139  */
 1140 static
 1141 int
 1142 hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
 1143 {
 1144         int res;
 1145 
 1146         hammer_ref(&ip->lock);
 1147         if (hammer_isactive(&ip->lock) == 2 && ip->vp)
 1148                 vclean_unlocked(ip->vp);
 1149         if (hammer_isactive(&ip->lock) == 1 && ip->vp == NULL)
 1150                 res = 0;
 1151         else
 1152                 res = -1;       /* stop, someone is using the inode */
 1153         hammer_rel_inode(ip, 0);
 1154         return(res);
 1155 }
 1156 
 1157 int
 1158 hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
 1159 {
 1160         int res;
 1161         int try;
 1162 
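               /*
                * Retry a few times, syncing the flusher between passes so
                * that inodes released by the flush can be disassociated.
                */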
 1163         for (try = res = 0; try < 4; ++try) {
 1164                 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
 1165                                            hammer_inode_pfs_cmp,
 1166                                            hammer_unload_pseudofs_callback,
 1167                                            &localization);
 1168                 if (res == 0 && try > 1)
 1169                         break;
 1170                 hammer_flusher_sync(trans->hmp);
 1171         }
 1172         if (res != 0)
 1173                 res = ENOTEMPTY;
 1174         return(res);
 1175 }
 1176 
 1177 
 1178 /*
 1179  * Release a reference on a PFS
 1180  */
 1181 void
 1182 hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
 1183 {
 1184         hammer_rel(&pfsm->lock);
 1185         if (hammer_norefs(&pfsm->lock)) {
 1186                 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
 1187                 kfree(pfsm, hmp->m_misc);
 1188         }
 1189 }
 1190 
 1191 /*
 1192  * Called by hammer_sync_inode().
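        *
        * Syncs the inode's meta-data to the media: any existing on-disk
        * inode record is deleted (setting DELONDISK) and, unless the
        * inode has been destroyed, a replacement record reflecting
        * sync_ino_data is written out.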
 1193  */
 1194 static int
 1195 hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
 1196 {
 1197         hammer_transaction_t trans = cursor->trans;
 1198         hammer_record_t record;
 1199         int error;
 1200         int redirty;
 1201 
 1202 retry:
 1203         error = 0;
 1204 
 1205         /*
 1206          * If the inode has a presence on-disk then locate it and mark
 1207          * it deleted, setting DELONDISK.
 1208          *
 1209          * The record may or may not be physically deleted, depending on
 1210          * the retention policy.
 1211          */
 1212         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
 1213             HAMMER_INODE_ONDISK) {
 1214                 hammer_normalize_cursor(cursor);
 1215                 cursor->key_beg.localization = ip->obj_localization + 
 1216                                                HAMMER_LOCALIZE_INODE;
 1217                 cursor->key_beg.obj_id = ip->obj_id;
 1218                 cursor->key_beg.key = 0;
 1219                 cursor->key_beg.create_tid = 0;
 1220                 cursor->key_beg.delete_tid = 0;
 1221                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
 1222                 cursor->key_beg.obj_type = 0;
 1223                 cursor->asof = ip->obj_asof;
 1224                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
 1225                 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
 1226                 cursor->flags |= HAMMER_CURSOR_BACKEND;
 1227 
 1228                 error = hammer_btree_lookup(cursor);
 1229                 if (hammer_debug_inode)
 1230                         kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
 1231 
 1232                 if (error == 0) {
 1233                         error = hammer_ip_delete_record(cursor, ip, trans->tid);
 1234                         if (hammer_debug_inode)
 1235                                 kprintf(" error %d\n", error);
 1236                         if (error == 0) {
 1237                                 ip->flags |= HAMMER_INODE_DELONDISK;
 1238                         }
 1239                         if (cursor->node)
 1240                                 hammer_cache_node(&ip->cache[0], cursor->node);
 1241                 }
 1242                 if (error == EDEADLK) {
 1243                         hammer_done_cursor(cursor);
 1244                         error = hammer_init_cursor(trans, cursor,
 1245                                                    &ip->cache[0], ip);
 1246                         if (hammer_debug_inode)
 1247                                 kprintf("IPDED %p %d\n", ip, error);
 1248                         if (error == 0)
 1249                                 goto retry;
 1250                 }
 1251         }
 1252 
 1253         /*
 1254          * Ok, write out the initial record or a new record (after deleting
 1255          * the old one), unless the DELETED flag is set.  This routine will
 1256          * clear DELONDISK if it writes out a record.
 1257          *
 1258          * Update our inode statistics if this is the first application of
 1259          * the inode on-disk.
 1260          */
 1261         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
 1262                 /*
 1263                  * Generate a record and write it to the media.  We clean-up
 1264                  * the state before releasing so we do not have to set-up
 1265                  * a flush_group.
 1266                  */
 1267                 record = hammer_alloc_mem_record(ip, 0);
 1268                 record->type = HAMMER_MEM_RECORD_INODE;
 1269                 record->flush_state = HAMMER_FST_FLUSH;
 1270                 record->leaf = ip->sync_ino_leaf;
 1271                 record->leaf.base.create_tid = trans->tid;
 1272                 record->leaf.data_len = sizeof(ip->sync_ino_data);
 1273                 record->leaf.create_ts = trans->time32;
 1274                 record->data = (void *)&ip->sync_ino_data;
 1275                 record->flags |= HAMMER_RECF_INTERLOCK_BE;
 1276 
 1277                 /*
 1278                  * If this flag is set we cannot sync the new file size
 1279                  * because we haven't finished related truncations.  The
 1280                  * inode will be flushed in another flush group to finish
 1281                  * the job.
 1282                  */
 1283                 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
 1284                     ip->sync_ino_data.size != ip->ino_data.size) {
 1285                         redirty = 1;
 1286                         ip->sync_ino_data.size = ip->ino_data.size;
 1287                 } else {
 1288                         redirty = 0;
 1289                 }
 1290 
 1291                 for (;;) {
 1292                         error = hammer_ip_sync_record_cursor(cursor, record);
 1293                         if (hammer_debug_inode)
 1294                                 kprintf("GENREC %p rec %08x %d\n",      
 1295                                         ip, record->flags, error);
 1296                         if (error != EDEADLK)
 1297                                 break;
 1298                         hammer_done_cursor(cursor);
 1299                         error = hammer_init_cursor(trans, cursor,
 1300                                                    &ip->cache[0], ip);
 1301                         if (hammer_debug_inode)
 1302                                 kprintf("GENREC reinit %d\n", error);
 1303                         if (error)
 1304                                 break;
 1305                 }
 1306 
 1307                 /*
 1308                  * Note:  The record was never on the inode's record tree
 1309                  * so just wave our hands importantly and destroy it.
 1310                  */
 1311                 record->flags |= HAMMER_RECF_COMMITTED;
 1312                 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
 1313                 record->flush_state = HAMMER_FST_IDLE;
 1314                 ++ip->rec_generation;
 1315                 hammer_rel_mem_record(record);
 1316 
 1317                 /*
 1318                  * Finish up.
 1319                  */
 1320                 if (error == 0) {
 1321                         if (hammer_debug_inode)
 1322                                 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
 1323                         ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
 1324                                             HAMMER_INODE_SDIRTY |
 1325                                             HAMMER_INODE_ATIME |
 1326                                             HAMMER_INODE_MTIME);
 1327                         ip->flags &= ~HAMMER_INODE_DELONDISK;
 1328                         if (redirty)
 1329                                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
 1330 
 1331                         /*
 1332                          * Root volume count of inodes
 1333                          */
 1334                         hammer_sync_lock_sh(trans);
 1335                         if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
 1336                                 hammer_modify_volume_field(trans,
 1337                                                            trans->rootvol,
 1338                                                            vol0_stat_inodes);
 1339                                 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
 1340                                 hammer_modify_volume_done(trans->rootvol);
 1341                                 ip->flags |= HAMMER_INODE_ONDISK;
 1342                                 if (hammer_debug_inode)
 1343                                         kprintf("NOWONDISK %p\n", ip);
 1344                         }
 1345                         hammer_sync_unlock(trans);
 1346                 }
 1347         }
 1348 
 1349         /*
 1350          * If the inode has been destroyed, clean out any left-over flags
 1351          * that may have been set by the frontend.
 1352          */
 1353         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 
 1354                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
 1355                                     HAMMER_INODE_SDIRTY |
 1356                                     HAMMER_INODE_ATIME |
 1357                                     HAMMER_INODE_MTIME);
 1358         }
 1359         return(error);
 1360 }
 1361 
 1362 /*
 1363  * Update only the itimes fields.
 1364  *
 1365  * ATIME can be updated without generating any UNDO.  MTIME is updated
 1366  * with UNDO so it is guaranteed to be synchronized properly in case of
 1367  * a crash.
 1368  *
 1369  * Neither field is included in the B-Tree leaf element's CRC, which is how
 1370  * we can get away with updating ATIME the way we do.
 1371  */
 1372 static int
 1373 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
 1374 {
 1375         hammer_transaction_t trans = cursor->trans;
 1376         int error;
 1377 
 1378 retry:
 1379         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
 1380             HAMMER_INODE_ONDISK) {
 1381                 return(0);
 1382         }
 1383 
 1384         hammer_normalize_cursor(cursor);
 1385         cursor->key_beg.localization = ip->obj_localization + 
 1386                                        HAMMER_LOCALIZE_INODE;
 1387         cursor->key_beg.obj_id = ip->obj_id;
 1388         cursor->key_beg.key = 0;
 1389         cursor->key_beg.create_tid = 0;
 1390         cursor->key_beg.delete_tid = 0;
 1391         cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
 1392         cursor->key_beg.obj_type = 0;
 1393         cursor->asof = ip->obj_asof;
 1394         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
 1395         cursor->flags |= HAMMER_CURSOR_ASOF;
 1396         cursor->flags |= HAMMER_CURSOR_GET_LEAF;
 1397         cursor->flags |= HAMMER_CURSOR_GET_DATA;
 1398         cursor->flags |= HAMMER_CURSOR_BACKEND;
 1399 
 1400         error = hammer_btree_lookup(cursor);
 1401         if (error == 0) {
 1402                 hammer_cache_node(&ip->cache[0], cursor->node);
 1403                 if (ip->sync_flags & HAMMER_INODE_MTIME) {
 1404                         /*
 1405                          * Updating MTIME requires an UNDO.  Just cover
 1406                          * both atime and mtime.
 1407                          */
 1408                         hammer_sync_lock_sh(trans);
 1409                         hammer_modify_buffer(trans, cursor->data_buffer,
 1410                                      HAMMER_ITIMES_BASE(&cursor->data->inode),
 1411                                      HAMMER_ITIMES_BYTES);
 1412                         cursor->data->inode.atime = ip->sync_ino_data.atime;
 1413                         cursor->data->inode.mtime = ip->sync_ino_data.mtime;
 1414                         hammer_modify_buffer_done(cursor->data_buffer);
 1415                         hammer_sync_unlock(trans);
 1416                 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
 1417                         /*
 1418                          * Updating atime only can be done in-place with
 1419                          * no UNDO.
 1420                          */
 1421                         hammer_sync_lock_sh(trans);
 1422                         hammer_modify_buffer(trans, cursor->data_buffer,
 1423                                              NULL, 0);
 1424                         cursor->data->inode.atime = ip->sync_ino_data.atime;
 1425                         hammer_modify_buffer_done(cursor->data_buffer);
 1426                         hammer_sync_unlock(trans);
 1427                 }
 1428                 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
 1429         }
 1430         if (error == EDEADLK) {
 1431                 hammer_done_cursor(cursor);
 1432                 error = hammer_init_cursor(trans, cursor,
 1433                                            &ip->cache[0], ip);
 1434                 if (error == 0)
 1435                         goto retry;
 1436         }
 1437         return(error);
 1438 }
 1439 
 1440 /*
 1441  * Release a reference on an inode, flush as requested.
 1442  *
 1443  * On the last reference we queue the inode to the flusher for its final
 1444  * disposition.
 1445  */
 1446 void
 1447 hammer_rel_inode(struct hammer_inode *ip, int flush)
 1448 {
 1449         /*hammer_mount_t hmp = ip->hmp;*/
 1450 
 1451         /*
 1452          * Handle disposition when dropping the last ref.
 1453          */
 1454         for (;;) {
 1455                 if (hammer_oneref(&ip->lock)) {
 1456                         /*
 1457                          * Determine whether on-disk action is needed for
 1458                          * the inode's final disposition.
 1459                          */
 1460                         KKASSERT(ip->vp == NULL);
 1461                         hammer_inode_unloadable_check(ip, 0);
 1462                         if (ip->flags & HAMMER_INODE_MODMASK) {
 1463                                 hammer_flush_inode(ip, 0);
 1464                         } else if (hammer_oneref(&ip->lock)) {
 1465                                 hammer_unload_inode(ip);
 1466                                 break;
 1467                         }
 1468                 } else {
 1469                         if (flush)
 1470                                 hammer_flush_inode(ip, 0);
 1471 
 1472                         /*
 1473                          * The inode still has multiple refs, try to drop
 1474                          * one ref.
 1475                          */
 1476                         KKASSERT(hammer_isactive(&ip->lock) >= 1);
 1477                         if (hammer_isactive(&ip->lock) > 1) {
 1478                                 hammer_rel(&ip->lock);
 1479                                 break;
 1480                         }
 1481                 }
 1482         }
 1483 }
 1484 
 1485 /*
 1486  * Unload and destroy the specified inode.  Must be called with one remaining
 1487  * reference.  The reference is disposed of.
 1488  *
 1489  * The inode must be completely clean.
 1490  */
 1491 static int
 1492 hammer_unload_inode(struct hammer_inode *ip)
 1493 {
 1494         hammer_mount_t hmp = ip->hmp;
 1495 
 1496         KASSERT(hammer_oneref(&ip->lock),
 1497                 ("hammer_unload_inode: %d refs", hammer_isactive(&ip->lock)));
 1498         KKASSERT(ip->vp == NULL);
 1499         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
 1500         KKASSERT(ip->cursor_ip_refs == 0);
 1501         KKASSERT(hammer_notlocked(&ip->lock));
 1502         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
 1503 
 1504         KKASSERT(RB_EMPTY(&ip->rec_tree));
 1505         KKASSERT(TAILQ_EMPTY(&ip->target_list));
 1506 
 1507         if (ip->flags & HAMMER_INODE_RDIRTY) {
 1508                 RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
 1509                 ip->flags &= ~HAMMER_INODE_RDIRTY;
 1510         }
 1511         RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
 1512 
 1513         hammer_free_inode(ip);
 1514         return(0);
 1515 }
 1516 
 1517 /*
 1518  * Called during unmounting if a critical error occurred.  The in-memory
 1519  * inode and all related structures are destroyed.
 1520  *
 1521  * If a critical error did not occur the unmount code calls the standard
 1522  * release and asserts that the inode is gone.
 1523  */
 1524 int
 1525 hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
 1526 {
 1527         hammer_record_t rec;
 1528 
 1529         /*
 1530          * Get rid of the inode's in-memory records, regardless of their
 1531          * state, and clear the mod-mask.
 1532          */
 1533         while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
 1534                 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
 1535                 rec->target_ip = NULL;
 1536                 if (rec->flush_state == HAMMER_FST_SETUP)
 1537                         rec->flush_state = HAMMER_FST_IDLE;
 1538         }
 1539         while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
 1540                 if (rec->flush_state == HAMMER_FST_FLUSH)
 1541                         --rec->flush_group->refs;
 1542                 else
 1543                         hammer_ref(&rec->lock);
 1544                 KKASSERT(hammer_oneref(&rec->lock));
 1545                 rec->flush_state = HAMMER_FST_IDLE;
 1546                 rec->flush_group = NULL;
 1547                 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
 1548                 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
 1549                 ++ip->rec_generation;
 1550                 hammer_rel_mem_record(rec);
 1551         }
 1552         ip->flags &= ~HAMMER_INODE_MODMASK;
 1553         ip->sync_flags &= ~HAMMER_INODE_MODMASK;
 1554         KKASSERT(ip->vp == NULL);
 1555 
 1556         /*
 1557          * Remove the inode from any flush group, force it idle.  FLUSH
 1558          * and SETUP states have an inode ref.
 1559          */
 1560         switch(ip->flush_state) {
 1561         case HAMMER_FST_FLUSH:
 1562                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
 1563                 --ip->flush_group->refs;
 1564                 ip->flush_group = NULL;
 1565                 /* fall through */
 1566         case HAMMER_FST_SETUP:
 1567                 hammer_rel(&ip->lock);
 1568                 ip->flush_state = HAMMER_FST_IDLE;
 1569                 /* fall through */
 1570         case HAMMER_FST_IDLE:
 1571                 break;
 1572         }
 1573 
 1574         /*
 1575          * There shouldn't be any associated vnode.  The unload needs at
 1576          * least one ref, if we do have a vp steal its ip ref.
 1577          */
 1578         if (ip->vp) {
 1579                 kprintf("hammer_destroy_inode_callback: Unexpected "
 1580                         "vnode association ip %p vp %p\n", ip, ip->vp);
 1581                 ip->vp->v_data = NULL;
 1582                 ip->vp = NULL;
 1583         } else {
 1584                 hammer_ref(&ip->lock);
 1585         }
 1586         hammer_unload_inode(ip);
 1587         return(0);
 1588 }
 1589 
 1590 /*
 1591  * Called on mount -u when switching from RW to RO or vice versa.  Adjust
 1592  * the read-only flag for cached inodes.
 1593  *
 1594  * This routine is called from a RB_SCAN().
 1595  */
 1596 int
 1597 hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
 1598 {
 1599         hammer_mount_t hmp = ip->hmp;
 1600 
 1601         if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
 1602                 ip->flags |= HAMMER_INODE_RO;
 1603         else
 1604                 ip->flags &= ~HAMMER_INODE_RO;
 1605         return(0);
 1606 }
 1607 
 1608 /*
 1609  * A transaction has modified an inode, requiring updates as specified by
 1610  * the passed flags.
 1611  *
 1612  * HAMMER_INODE_DDIRTY: Inode data has been updated, not including
 1613  *                      mtime/atime and not including size changes due to
 1614  *                      write-append (but other size changes are included).
 1615  * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to
 1616  *                      write-append.
 1617  * HAMMER_INODE_XDIRTY: Dirty in-memory records
 1618  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
 1619  * HAMMER_INODE_DELETED: Inode record/data must be deleted
 1620  * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
 1621  */
 1622 void
 1623 hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
 1624 {
 1625         /* 
 1626          * ronly of 0 or 2 does not trigger assertion.
 1627          * 2 is a special error state 
 1628          */
 1629         KKASSERT(ip->hmp->ronly != 1 ||
 1630                   (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 
 1631                             HAMMER_INODE_SDIRTY |
 1632                             HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
 1633                             HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
 1634         if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
 1635                 ip->flags |= HAMMER_INODE_RSV_INODES;
 1636                 ++ip->hmp->rsv_inodes;
 1637         }
 1638 
 1639         /*
 1640          * Set the NEWINODE flag in the transaction if the inode
 1641          * transitions to a dirty state.  This is used to track
 1642          * the load on the inode cache.
 1643          */
 1644         if (trans &&
 1645             (ip->flags & HAMMER_INODE_MODMASK) == 0 &&
 1646             (flags & HAMMER_INODE_MODMASK)) {
 1647                 trans->flags |= HAMMER_TRANSF_NEWINODE;
 1648         }
 1649         if (flags & HAMMER_INODE_MODMASK)
 1650                 hammer_inode_dirty(ip);
 1651         ip->flags |= flags;
 1652 }
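
/*
 * A minimal sketch of how a frontend-side attribute change might use the
 * flag convention documented above.  example_set_mode() is a hypothetical
 * helper shown only for illustration; it assumes the caller already holds
 * a reference on the inode and has an open transaction.
 */
#if 0
static void
example_set_mode(hammer_transaction_t trans, hammer_inode_t ip, uint16_t mode)
{
        /*
         * Update the live (frontend) copy of the inode data, then record
         * that the inode meta-data is dirty so the flusher will sync it.
         */
        ip->ino_data.mode = mode;
        hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
}
#endif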
 1653 
 1654 /*
 1655  * Attempt to quickly update the atime for a hammer inode.  Return 0 on
 1656  * success, -1 on failure.
 1657  *
 1658  * We attempt to update the atime with only the ip lock and not the
 1659  * whole filesystem lock in order to improve concurrency.  We can only
 1660  * do this safely if the ATIME flag is already pending on the inode.
 1661  *
 1662  * This function is called via a vnops path (ip pointer is stable) without
 1663  * fs_token held.
 1664  */
 1665 int
 1666 hammer_update_atime_quick(hammer_inode_t ip)
 1667 {
 1668         struct timeval tv;
 1669         int res = -1;
 1670 
 1671         if ((ip->flags & HAMMER_INODE_RO) ||
 1672             (ip->hmp->mp->mnt_flag & MNT_NOATIME)) {
 1673                 /*
 1674                  * Silently indicate success on read-only mount/snap
 1675                  */
 1676                 res = 0;
 1677         } else if (ip->flags & HAMMER_INODE_ATIME) {
 1678                 /*
 1679                  * Double check with inode lock held against backend.  This
 1680                  * is only safe if all we need to do is update
 1681                  * ino_data.atime.
 1682                  */
 1683                 getmicrotime(&tv);
 1684                 hammer_lock_ex(&ip->lock);
 1685                 if (ip->flags & HAMMER_INODE_ATIME) {
 1686                         ip->ino_data.atime =
 1687                             (unsigned long)tv.tv_sec * 1000000ULL + tv.tv_usec;
 1688                         res = 0;
 1689                 }
 1690                 hammer_unlock(&ip->lock);
 1691         }
 1692         return res;
 1693 }
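
/*
 * A minimal sketch of the timestamp encoding used above: atime/mtime are
 * stored as a single 64-bit count of microseconds since the Unix epoch.
 * example_encode_time() is a hypothetical helper for illustration only.
 */
#if 0
static uint64_t
example_encode_time(const struct timeval *tv)
{
        /*
         * Scale seconds to microseconds and add the sub-second remainder,
         * e.g. tv_sec = 10, tv_usec = 500000 encodes as 10500000.
         */
        return ((uint64_t)tv->tv_sec * 1000000ULL + tv->tv_usec);
}
#endif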
 1694 
 1695 /*
 1696  * Request that an inode be flushed.  This whole mess cannot block and may
 1697  * recurse (if not synchronous).  Once requested HAMMER will attempt to
 1698  * actively flush the inode until the flush can be done.
 1699  *
 1700  * The inode may already be flushing, or may be in a setup state.  We can
 1701  * place the inode in a flushing state if it is currently idle and flag it
 1702  * to reflush if it is currently flushing.
 1703  *
 1704  * Upon return, if the inode could not be flushed due to a setup
 1705  * dependency, it will be automatically flushed when the dependency
 1706  * is satisfied.
 1707  */
 1708 void
 1709 hammer_flush_inode(hammer_inode_t ip, int flags)
 1710 {
 1711         hammer_mount_t hmp;
 1712         hammer_flush_group_t flg;
 1713         int good;
 1714 
 1715         /*
 1716          * fill_flush_group is the first flush group we may be able to
 1717          * continue filling; it may be open or closed, but it will always
 1718          * be past the currently flushing (running) flg.
 1719          *
 1720          * next_flush_group is the next open flush group.
 1721          */
 1722         hmp = ip->hmp;
 1723         while ((flg = hmp->fill_flush_group) != NULL) {
 1724                 KKASSERT(flg->running == 0);
 1725                 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit &&
 1726                     flg->total_count <= hammer_autoflush) {
 1727                         break;
 1728                 }
 1729                 hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
 1730                 hammer_flusher_async(ip->hmp, flg);
 1731         }
 1732         if (flg == NULL) {
 1733                 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
 1734                 flg->seq = hmp->flusher.next++;
 1735                 if (hmp->next_flush_group == NULL)
 1736                         hmp->next_flush_group = flg;
 1737                 if (hmp->fill_flush_group == NULL)
 1738                         hmp->fill_flush_group = flg;
 1739                 RB_INIT(&flg->flush_tree);
 1740                 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
 1741         }
 1742 
 1743         /*
 1744          * Trivial 'nothing to flush' case.  If the inode is in a SETUP
 1745          * state we have to put it back into an IDLE state so we can
 1746          * drop the extra ref.
 1747          *
 1748          * If we have a parent dependency we must still fall through
 1749          * so we can run it.
 1750          */
 1751         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
 1752                 if (ip->flush_state == HAMMER_FST_SETUP &&
 1753                     TAILQ_EMPTY(&ip->target_list)) {
 1754                         ip->flush_state = HAMMER_FST_IDLE;
 1755                         hammer_rel_inode(ip, 0);
 1756                 }
 1757                 if (ip->flush_state == HAMMER_FST_IDLE)
 1758                         return;
 1759         }
 1760 
 1761         /*
 1762          * Our flush action will depend on the current state.
 1763          */
 1764         switch(ip->flush_state) {
 1765         case HAMMER_FST_IDLE:
 1766                 /*
 1767                  * We have no dependencies and can flush immediately.  Some
 1768                  * of our children may not be flushable so we have to re-test
 1769                  * with that additional knowledge.
 1770                  */
 1771                 hammer_flush_inode_core(ip, flg, flags);
 1772                 break;
 1773         case HAMMER_FST_SETUP:
 1774                 /*
 1775                  * Recurse upwards through dependencies via target_list
 1776                  * and start their flusher actions going if possible.
 1777                  *
 1778                  * 'good' is our connectivity.  -1 means we have none and
 1779                  * can't flush, 0 means there weren't any dependencies, and
 1780                  * 1 means we have good connectivity.
 1781                  */
 1782                 good = hammer_setup_parent_inodes(ip, 0, flg);
 1783 
 1784                 if (good >= 0) {
 1785                         /*
 1786                          * We can continue if good >= 0.  Determine how 
 1787                          * many records under our inode can be flushed (and
 1788                          * mark them).
 1789                          */
 1790                         hammer_flush_inode_core(ip, flg, flags);
 1791                 } else {
 1792                         /*
 1793                          * Parent has no connectivity, tell it to flush
 1794                          * us as soon as it does.
 1795                          *
 1796                          * The REFLUSH flag is also needed to trigger
 1797                          * dependency wakeups.
 1798                          */
 1799                         ip->flags |= HAMMER_INODE_CONN_DOWN |
 1800                                      HAMMER_INODE_REFLUSH;
 1801                         if (flags & HAMMER_FLUSH_SIGNAL) {
 1802                                 ip->flags |= HAMMER_INODE_RESIGNAL;
 1803                                 hammer_flusher_async(ip->hmp, flg);
 1804                         }
 1805                 }
 1806                 break;
 1807         case HAMMER_FST_FLUSH:
 1808                 /*
 1809                  * We are already flushing, flag the inode to reflush
 1810                  * if needed after it completes its current flush.
 1811                  *
 1812                  * The REFLUSH flag is also needed to trigger
 1813                  * dependency wakeups.
 1814                  */
 1815                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
 1816                         ip->flags |= HAMMER_INODE_REFLUSH;
 1817                 if (flags & HAMMER_FLUSH_SIGNAL) {
 1818                         ip->flags |= HAMMER_INODE_RESIGNAL;
 1819                         hammer_flusher_async(ip->hmp, flg);
 1820                 }
 1821                 break;
 1822         }
 1823 }
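
/*
 * A minimal sketch of the two ways a caller might request a flush, per the
 * comment above: a passive request queues the inode and lets the flusher
 * get to it, while HAMMER_FLUSH_SIGNAL also kicks the flusher.  The
 * example_*() helpers are hypothetical and for illustration only.
 */
#if 0
static void
example_flush_passive(hammer_inode_t ip)
{
        /* queue the inode; the flusher will pick it up on its own */
        hammer_flush_inode(ip, 0);
}

static void
example_flush_signaled(hammer_inode_t ip)
{
        /* queue the inode and signal the flusher to run soon */
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
}
#endif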
 1824 
 1825 /*
 1826  * Scan ip->target_list, which is a list of records owned by PARENT
 1827  * inodes which reference our ip.
 1828  *
 1829  * XXX This is a huge mess of recursive code, but not one bit of it blocks
 1830  *     so for now do not ref/deref the structures.  Note that if we use the
 1831  *     ref/rel code later, the rel CAN block.
 1832  */
 1833 static int
 1834 hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
 1835                            hammer_flush_group_t flg)
 1836 {
 1837         hammer_record_t depend;
 1838         int good;
 1839         int r;
 1840 
 1841         /*
 1842          * If we hit our recursion limit and we have parent dependencies,
 1843          * we cannot continue.  Returning < 0 will cause us to be flagged
 1844          * for reflush.  Returning -2 cuts off additional dependency checks
 1845          * because they are likely to also hit the depth limit.
 1846          *
 1847          * We cannot return < 0 if there are no dependencies, or there might
 1848          * not be anything to wake up (ip).
 1849          */
 1850         if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
 1851                 krateprintf(&hammer_gen_krate,
 1852                             "HAMMER Warning: depth limit reached on "
 1853                             "setup recursion, inode %p %016llx\n",
 1854                             ip, (long long)ip->obj_id);
 1855                 return(-2);
 1856         }
 1857 
 1858         /*
 1859          * Scan dependencies
 1860          */
 1861         good = 0;
 1862         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
 1863                 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
 1864                 KKASSERT(depend->target_ip == ip);
 1865                 if (r < 0 && good == 0)
 1866                         good = -1;
 1867                 if (r > 0)
 1868                         good = 1;
 1869 
 1870                 /*
 1871                  * If we failed due to the recursion depth limit then stop
 1872                  * now.
 1873                  */
 1874                 if (r == -2)
 1875                         break;
 1876         }
 1877         return(good);
 1878 }
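
/*
 * A minimal sketch of the connectivity convention used above, reduced to a
 * standalone aggregator: each dependency reports 1 (gives connectivity),
 * 0 (not relevant), -1 (blocks us), or -2 (recursion depth hit).  Any 1
 * wins over -1, and -2 aborts the scan early.  example_aggregate() is a
 * hypothetical helper for illustration only.
 */
#if 0
static int
example_aggregate(const int *results, int count)
{
        int good = 0;
        int i;

        for (i = 0; i < count; ++i) {
                if (results[i] < 0 && good == 0)
                        good = -1;
                if (results[i] > 0)
                        good = 1;
                if (results[i] == -2)
                        break;
        }
        return (good);
}
#endif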
 1879 
 1880 /*
 1881  * This helper function takes a record representing the dependency between
 1882  * the parent inode and child inode.
 1883  *
 1884  * record->ip           = parent inode
 1885  * record->target_ip    = child inode
 1886  * 
 1887  * We are asked to recurse upwards and convert the record from SETUP
 1888  * to FLUSH if possible.
 1889  *
 1890  * Return 1 if the record gives us connectivity
 1891  *
 1892  * Return 0 if the record is not relevant 
 1893  *
 1894  * Return -1 if we can't resolve the dependency and there is no connectivity.
 1895  */
 1896 static int
 1897 hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
 1898                                   hammer_flush_group_t flg)
 1899 {
 1900         hammer_inode_t pip;
 1901         int good;
 1902 
 1903         KKASSERT(record->flush_state != HAMMER_FST_IDLE);
 1904         pip = record->ip;
 1905 
 1906         /*
 1907          * If the record is already flushing, is it in our flush group?
 1908          *
 1909          * If it is in our flush group but it is a general record or a 
 1910          * delete-on-disk, it does not improve our connectivity (return 0),
 1911          * and if the target inode is not trying to destroy itself we can't
 1912          * allow the operation yet anyway (the second return -1).
 1913          */
 1914         if (record->flush_state == HAMMER_FST_FLUSH) {
 1915                 /*
 1916                  * If not in our flush group ask the parent to reflush
 1917                  * us as soon as possible.
 1918                  */
 1919                 if (record->flush_group != flg) {
 1920                         pip->flags |= HAMMER_INODE_REFLUSH;
 1921                         record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
 1922                         return(-1);
 1923                 }
 1924 
 1925                 /*
 1926                  * If in our flush group everything is already set up,
 1927                  * just return whether the record will improve our
 1928                  * visibility or not.
 1929                  */
 1930                 if (record->type == HAMMER_MEM_RECORD_ADD)
 1931                         return(1);
 1932                 return(0);
 1933         }
 1934 
 1935         /*
 1936          * It must be a setup record.  Try to resolve the setup dependencies
 1937          * by recursing upwards so we can place ip on the flush list.
 1938          *
 1939          * Limit ourselves to 20 levels of recursion to avoid blowing out
 1940          * the kernel stack.  If we hit the recursion limit we can't flush
 1941          * until the parent flushes.  The parent will flush independently
 1942          * on its own and ultimately a deep recursion will be resolved.
 1943          */
 1944         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
 1945 
 1946         good = hammer_setup_parent_inodes(pip, depth + 1, flg);
 1947 
 1948         /*
 1949          * If good < 0 the parent has no connectivity and we cannot safely
 1950          * flush the directory entry, which also means we can't flush our
 1951          * ip.  Flag us for downward recursion once the parent's
 1952          * connectivity is resolved.  Flag the parent for [re]flush or it
 1953          * may not check for downward recursions.
 1954          */
 1955         if (good < 0) {
 1956                 pip->flags |= HAMMER_INODE_REFLUSH;
 1957                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
 1958                 return(good);
 1959         }
 1960 
 1961         /*
 1962          * We are go, place the parent inode in a flushing state so we can
 1963          * place its record in a flushing state.  Note that the parent
 1964          * may already be flushing.  The record must be in the same flush
 1965          * group as the parent.
 1966          */
 1967         if (pip->flush_state != HAMMER_FST_FLUSH)
 1968                 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
 1969         KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
 1970 
 1971         /*
 1972          * It is possible for a rename to create a loop in the recursion
 1973          * and revisit a record.  This will result in the record being
 1974          * placed in a flush state unexpectedly.  This check deals with
 1975          * the case.
 1976          */
 1977         if (record->flush_state == HAMMER_FST_FLUSH) {
 1978                 if (record->type == HAMMER_MEM_RECORD_ADD)
 1979                         return(1);
 1980                 return(0);
 1981         }
 1982 
 1983         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
 1984 
 1985 #if 0
 1986         if (record->type == HAMMER_MEM_RECORD_DEL &&
 1987             (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
 1988                 /*
 1989                  * Regardless of flushing state we cannot sync this path if the
 1990                  * record represents a delete-on-disk but the target inode
 1991                  * is not ready to sync its own deletion.
 1992                  *
 1993                  * XXX need to count effective nlinks to determine whether
 1994                  * the flush is ok, otherwise removing a hardlink will
 1995                  * just leave the DEL record to rot.
 1996                  */
 1997                 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
 1998                 return(-1);
 1999         } else
 2000 #endif
 2001         if (pip->flush_group == flg) {
 2002                 /*
 2003                  * Because we have not calculated nlinks yet we can just
 2004                  * set records to the flush state if the parent is in
 2005                  * the same flush group as we are.
 2006                  */
 2007                 record->flush_state = HAMMER_FST_FLUSH;
 2008                 record->flush_group = flg;
 2009                 ++record->flush_group->refs;
 2010                 hammer_ref(&record->lock);
 2011 
 2012                 /*
 2013                  * A general directory-add contributes to our visibility.
 2014                  *
 2015                  * Otherwise it is probably a directory-delete or 
 2016                  * delete-on-disk record and does not contribute to our
 2017                  * visibility (but we can still flush it).
 2018                  */
 2019                 if (record->type == HAMMER_MEM_RECORD_ADD)
 2020                         return(1);
 2021                 return(0);
 2022         } else {
 2023                 /*
 2024                  * If the parent is not in our flush group we cannot
 2025                  * flush this record yet, there is no visibility.
 2026                  * We tell the parent to reflush and mark ourselves
 2027                  * so the parent knows it should flush us too.
 2028                  */
 2029                 pip->flags |= HAMMER_INODE_REFLUSH;
 2030                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
 2031                 return(-1);
 2032         }
 2033 }
 2034 
 2035 /*
 2036  * This is the core routine placing an inode into the FST_FLUSH state.
 2037  */
 2038 static void
 2039 hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
 2040 {
 2041         hammer_mount_t hmp = ip->hmp;
 2042         int go_count;
 2043 
 2044         /*
 2045          * Set flush state and prevent the flusher from cycling into
 2046          * the next flush group.  Do not place the ip on the list yet.
 2047          * Inodes not in the idle state get an extra reference.
 2048          */
 2049         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
 2050         if (ip->flush_state == HAMMER_FST_IDLE)
 2051                 hammer_ref(&ip->lock);
 2052         ip->flush_state = HAMMER_FST_FLUSH;
 2053         ip->flush_group = flg;
 2054         ++hmp->flusher.group_lock;
 2055         ++hmp->count_iqueued;
 2056         ++hammer_count_iqueued;
 2057         ++flg->total_count;
 2058         hammer_redo_fifo_start_flush(ip);
 2059 
 2060 #if 0
 2061         /*
 2062          * We need to be able to vfsync/truncate from the backend.
 2063          *
 2064          * XXX Any truncation from the backend will acquire the vnode
 2065          *     independently.
 2066          */
 2067         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
 2068         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
 2069                 ip->flags |= HAMMER_INODE_VHELD;
 2070                 vref(ip->vp);
 2071         }
 2072 #endif
 2073 
 2074         /*
 2075          * Figure out how many in-memory records we can actually flush
 2076          * (not including inode meta-data, buffers, etc).
 2077          */
 2078         KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
 2079         if (flags & HAMMER_FLUSH_RECURSION) {
 2080                 /*
 2081                  * If this is an upwards recursion we do not want to
 2082                  * recurse down again!
 2083                  */
 2084                 go_count = 1;
 2085 #if 0
 2086         } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
 2087                 /*
 2088                  * No new records are added if we must complete a flush
 2089                  * from a previous cycle, but we do have to move the records
 2090                  * from the previous cycle to the current one.
 2091                  */
 2092 #if 0
 2093                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
 2094                                    hammer_syncgrp_child_callback, NULL);
 2095 #endif
 2096                 go_count = 1;
 2097 #endif
 2098         } else {
 2099                 /*
 2100                  * Normal flush, scan records and bring them into the flush.
 2101                  * Directory adds and deletes are usually skipped (they are
 2102                  * grouped with the related inode rather than with the
 2103                  * directory).
 2104                  *
 2105                  * go_count can be negative, which means the scan aborted
 2106                  * due to the flush group being over-full and we should
 2107                  * flush what we have.
 2108                  */
 2109                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
 2110                                    hammer_setup_child_callback, NULL);
 2111         }
 2112 
 2113         /*
 2114          * This is a more involved test that includes go_count.  If we
 2115          * can't flush, flag the inode and return.  If go_count is 0 we
 2116          * were unable to flush any records in our rec_tree and
 2117          * must ignore the XDIRTY flag.
 2118          */
 2119         if (go_count == 0) {
 2120                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
 2121                         --hmp->count_iqueued;
 2122                         --hammer_count_iqueued;
 2123 
 2124                         --flg->total_count;
 2125                         ip->flush_state = HAMMER_FST_SETUP;
 2126                         ip->flush_group = NULL;
 2127                         if (flags & HAMMER_FLUSH_SIGNAL) {
 2128                                 ip->flags |= HAMMER_INODE_REFLUSH |
 2129                                              HAMMER_INODE_RESIGNAL;
 2130                         } else {
 2131                                 ip->flags |= HAMMER_INODE_REFLUSH;
 2132                         }
 2133 #if 0
 2134                         if (ip->flags & HAMMER_INODE_VHELD) {
 2135                                 ip->flags &= ~HAMMER_INODE_VHELD;
 2136                                 vrele(ip->vp);
 2137                         }
 2138 #endif
 2139 
 2140                         /*
 2141                          * REFLUSH is needed to trigger dependency wakeups
 2142                          * when an inode is in SETUP.
 2143                          */
 2144                         ip->flags |= HAMMER_INODE_REFLUSH;
 2145                         if (--hmp->flusher.group_lock == 0)
 2146                                 wakeup(&hmp->flusher.group_lock);
 2147                         return;
 2148                 }
 2149         }
 2150 
 2151         /*
 2152          * Snapshot the state of the inode for the backend flusher.
 2153          *
 2154          * We continue to retain save_trunc_off even when all truncations
 2155          * have been resolved as an optimization to determine if we can
 2156          * skip the B-Tree lookup for overwrite deletions.
 2157          *
 2158          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
 2159          * and stays in ip->flags.  Once set, it stays set until the
 2160          * inode is destroyed.
 2161          */
 2162         if (ip->flags & HAMMER_INODE_TRUNCATED) {
 2163                 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
 2164                 ip->sync_trunc_off = ip->trunc_off;
 2165                 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
 2166                 ip->flags &= ~HAMMER_INODE_TRUNCATED;
 2167                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
 2168 
 2169                 /*
 2170                  * The save_trunc_off used to cache whether the B-Tree
 2171                  * holds any records past that point is not used until
 2172                  * after the truncation has succeeded, so we can safely
 2173                  * set it now.
 2174                  */
 2175                 if (ip->save_trunc_off > ip->sync_trunc_off)
 2176                         ip->save_trunc_off = ip->sync_trunc_off;
 2177         }
 2178         ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
 2179                            ~HAMMER_INODE_TRUNCATED);
 2180         ip->sync_ino_leaf = ip->ino_leaf;
 2181         ip->sync_ino_data = ip->ino_data;
 2182         ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
 2183 #ifdef DEBUG_TRUNCATE
 2184         if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
 2185                 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
 2186 #endif
 2187 
 2188         /*
 2189          * The flusher list inherits our inode and reference.
 2190          */
 2191         KKASSERT(flg->running == 0);
 2192         RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
 2193         if (--hmp->flusher.group_lock == 0)
 2194                 wakeup(&hmp->flusher.group_lock);
 2195 
 2196         /*
 2197          * Auto-flush the group if it grows too large.  Make sure the
 2198          * inode reclaim wait pipeline continues to work.
 2199          */
 2200         if (flg->total_count >= hammer_autoflush ||
 2201             flg->total_count >= hammer_limit_reclaims / 4) {
 2202                 if (hmp->fill_flush_group == flg)
 2203                         hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
 2204                 hammer_flusher_async(hmp, flg);
 2205         }
 2206 }
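
/*
 * A minimal sketch of the frontend/backend snapshot done above: the live
 * ino_data and the MODMASK flag bits are copied into their sync_ copies so
 * the backend flusher works from a stable image while the frontend keeps
 * modifying the live fields.  The example_* names are hypothetical and for
 * illustration only (the TRUNCATED special case is left out).
 */
#if 0
struct example_snap {
        int     flags;          /* live flags, owned by the frontend */
        int     sync_flags;     /* snapshot consumed by the backend */
        int     data;           /* live data, owned by the frontend */
        int     sync_data;      /* snapshot consumed by the backend */
};

static void
example_snapshot(struct example_snap *sn, int modmask)
{
        sn->sync_flags |= sn->flags & modmask;  /* hand dirty bits to backend */
        sn->sync_data = sn->data;               /* stable copy of the data */
        sn->flags &= ~modmask;                  /* frontend starts clean again */
}
#endif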
 2207 
 2208 /*
 2209  * Callback for scan of ip->rec_tree.  Try to include each record in our
 2210  * flush.  ip->flush_group has been set but the inode has not yet been
 2211  * moved into a flushing state.
 2212  *
 2213  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
 2214  * both inodes.
 2215  *
 2216  * We return 1 for any record placed or found in FST_FLUSH, which prevents
 2217  * the caller from shortcutting the flush.
 2218  */
 2219 static int
 2220 hammer_setup_child_callback(hammer_record_t rec, void *data)
 2221 {
 2222         hammer_flush_group_t flg;
 2223         hammer_inode_t target_ip;
 2224         hammer_inode_t ip;
 2225         int r;
 2226 
 2227         /*
 2228          * Records deleted or committed by the backend are ignored.
 2229          * Note that the flush detects deleted frontend records at
 2230          * multiple points to deal with races.  This is just the first
 2231          * line of defense.  The only time HAMMER_RECF_DELETED_FE cannot
 2232          * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
 2233          * messes up link-count calculations.
 2234          *
 2235          * NOTE: Don't get confused between record deletion and, say,
 2236          * directory entry deletion.  The deletion of a directory entry
 2237          * which is on-media has nothing to do with the record deletion
 2238          * flags.
 2239          */
 2240         if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
 2241                           HAMMER_RECF_COMMITTED)) {
 2242                 if (rec->flush_state == HAMMER_FST_FLUSH) {
 2243                         KKASSERT(rec->flush_group == rec->ip->flush_group);
 2244                         r = 1;
 2245                 } else {
 2246                         r = 0;
 2247                 }
 2248                 return(r);
 2249         }
 2250 
 2251         /*
 2252          * If the record is in an idle state it has no dependencies and
 2253          * can be flushed.
 2254          */
 2255         ip = rec->ip;
 2256         flg = ip->flush_group;
 2257         r = 0;
 2258 
 2259         switch(rec->flush_state) {
 2260         case HAMMER_FST_IDLE:
 2261                 /*
 2262                  * The record has no setup dependency; we can flush it.
 2263                  */
 2264                 KKASSERT(rec->target_ip == NULL);
 2265                 rec->flush_state = HAMMER_FST_FLUSH;
 2266                 rec->flush_group = flg;
 2267                 ++flg->refs;
 2268                 hammer_ref(&rec->lock);
 2269                 r = 1;
 2270                 break;
 2271         case HAMMER_FST_SETUP:
 2272                 /*
 2273                  * The record has a setup dependency.  These are typically
 2274                  * directory entry adds and deletes.  Such entries will be
 2275                  * flushed when their inodes are flushed so we do not
 2276                  * usually have to add them to the flush here.  However,
 2277                  * if the target_ip has set HAMMER_INODE_CONN_DOWN then
 2278                  * it is asking us to flush this record (and it).
 2279                  */
 2280                 target_ip = rec->target_ip;
 2281                 KKASSERT(target_ip != NULL);
 2282                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
 2283 
 2284                 /*
 2285                  * If the target IP is already flushing in our group
 2286                  * we could associate the record, but target_ip has
 2287                  * already synced ino_data to sync_ino_data and we
 2288                  * would also have to adjust nlinks.   Plus there are
 2289                  * ordering issues for adds and deletes.
 2290                  *
 2291                  * Reflush downward if this is an ADD, and upward if
 2292                  * this is a DEL.
 2293                  */
 2294                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
 2295                         if (rec->type == HAMMER_MEM_RECORD_ADD)
 2296                                 ip->flags |= HAMMER_INODE_REFLUSH;
 2297                         else
 2298                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
 2299                         break;
 2300                 } 
 2301 
 2302                 /*
 2303                  * Target IP is not yet flushing.  This can get complex
 2304                  * because we have to be careful about the recursion.
 2305                  *
 2306                  * Directories create an issue for us in that if a flush
 2307                  * of a directory is requested the expectation is to flush
 2308                  * any pending directory entries, but this will cause the
 2309                  * related inodes to recursively flush as well.  We can't
 2310                  * really defer the operation, so just get as many as we
 2311                  * can.
 2312                  */
 2313 #if 0
 2314                 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
 2315                     (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
 2316                         /*
 2317                          * We aren't reclaiming and the target ip was not
 2318                          * previously prevented from flushing due to this
 2319                          * record dependency.  Do not flush this record.
 2320                          */
 2321                         /*r = 0;*/
 2322                 } else
 2323 #endif
 2324                 if (flg->total_count + flg->refs >
 2325                            ip->hmp->undo_rec_limit) {
 2326                         /*
 2327                          * Our flush group is over-full and we risk blowing
 2328                          * out the UNDO FIFO.  Stop the scan, flush what we
 2329                          * have, then reflush the directory.
 2330                          *
 2331                          * The directory may be forced through multiple
 2332                          * flush groups before it can be completely
 2333                          * flushed.
 2334                          */
 2335                         ip->flags |= HAMMER_INODE_RESIGNAL |
 2336                                      HAMMER_INODE_REFLUSH;
 2337                         r = -1;
 2338                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
 2339                         /*
 2340                          * If the target IP is not flushing we can force
 2341                          * it to flush, even if it is unable to write out
 2342                          * any of its own records, we have at least one in
 2343                          * hand that we CAN deal with.
 2344                          */
 2345                         rec->flush_state = HAMMER_FST_FLUSH;
 2346                         rec->flush_group = flg;
 2347                         ++flg->refs;
 2348                         hammer_ref(&rec->lock);
 2349                         hammer_flush_inode_core(target_ip, flg,
 2350                                                 HAMMER_FLUSH_RECURSION);
 2351                         r = 1;
 2352                 } else {
 2353                         /*
 2354                          * General or delete-on-disk record.
 2355                          *
 2356                          * XXX this needs help.  If a delete-on-disk we could
 2357                          * disconnect the target.  If the target has its own
 2358                          * dependencies they really need to be flushed.
 2359                          *
 2360                          * XXX
 2361                          */
 2362                         rec->flush_state = HAMMER_FST_FLUSH;
 2363                         rec->flush_group = flg;
 2364                         ++flg->refs;
 2365                         hammer_ref(&rec->lock);
 2366                         hammer_flush_inode_core(target_ip, flg,
 2367                                                 HAMMER_FLUSH_RECURSION);
 2368                         r = 1;
 2369                 }
 2370                 break;
 2371         case HAMMER_FST_FLUSH:
 2372                 /* 
 2373                  * The record could be part of a previous flush group if the
 2374                  * inode is a directory (the record being a directory entry).
 2375                  * Once the flush group was closed a hammer_test_inode()
 2376                  * function can cause a new flush group to be setup, placing
 2377                  * the directory inode itself in a new flush group.
 2378                  *
 2379                  * When associated with a previous flush group we count it
 2380                  * as if it were in our current flush group, since it will
 2381                  * effectively be flushed by the time we flush our current
 2382                  * flush group.
 2383                  */
 2384                 KKASSERT(
 2385                     rec->ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY ||
 2386                     rec->flush_group == flg);
 2387                 r = 1;
 2388                 break;
 2389         }
 2390         return(r);
 2391 }
 2392 
 2393 #if 0
 2394 /*
 2395  * This version just moves records already in a flush state to the new
 2396  * flush group and that is it.
 2397  */
 2398 static int
 2399 hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
 2400 {
 2401         hammer_inode_t ip = rec->ip;
 2402 
 2403         switch(rec->flush_state) {
 2404         case HAMMER_FST_FLUSH:
 2405                 KKASSERT(rec->flush_group == ip->flush_group);
 2406                 break;
 2407         default:
 2408                 break;
 2409         }
 2410         return(0);
 2411 }
 2412 #endif
 2413 
 2414 /*
 2415  * Wait for a previously queued flush to complete.
 2416  *
 2417  * If a critical error occurred we don't try to wait.
 2418  */
 2419 void
 2420 hammer_wait_inode(hammer_inode_t ip)
 2421 {
 2422         /*
 2423          * The inode can be in a SETUP state in which case RESIGNAL
 2424          * should be set.  If RESIGNAL is not set then the previous
 2425          * flush completed and a later operation placed the inode
 2426          * in a passive setup state again, so we're done.
 2427          *
 2428          * The inode can be in a FLUSH state in which case we
 2429          * can just wait for completion.
 2430          */
 2431         while (ip->flush_state == HAMMER_FST_FLUSH ||
 2432             (ip->flush_state == HAMMER_FST_SETUP &&
 2433              (ip->flags & HAMMER_INODE_RESIGNAL))) {
 2434                 /*
 2435                  * Don't try to flush on a critical error
 2436                  */
 2437                 if (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
 2438                         break;
 2439 
 2440                 /*
 2441                  * If the inode was already being flushed its flg
 2442                  * may not have been queued to the backend.  We
 2443                  * have to make sure it gets queued or we can wind
 2444                  * up blocked or deadlocked (particularly if we are
 2445                  * the vnlru thread).
 2446                  */
 2447                 if (ip->flush_state == HAMMER_FST_FLUSH) {
 2448                         KKASSERT(ip->flush_group);
 2449                         if (ip->flush_group->closed == 0) {
 2450                                 if (hammer_debug_inode) {
 2451                                         kprintf("hammer: debug: forcing "
 2452                                                 "async flush ip %016jx\n",
 2453                                                 (intmax_t)ip->obj_id);
 2454                                 }
 2455                                 hammer_flusher_async(ip->hmp,
 2456                                                      ip->flush_group);
 2457                                 continue; /* retest */
 2458                         }
 2459                 }
 2460 
 2461                 /*
 2462                  * In a flush state with the flg queued to the backend
 2463                  * or in a setup state with RESIGNAL set, we can safely
 2464                  * wait.
 2465                  */
 2466                 ip->flags |= HAMMER_INODE_FLUSHW;
 2467                 tsleep(&ip->flags, 0, "hmrwin", 0);
 2468         }
 2469 
 2470 #if 0
 2471         /*
 2472          * The inode may have been in a passive setup state,
 2473          * call flush to make sure we get signaled.
 2474          */
 2475         if (ip->flush_state == HAMMER_FST_SETUP)
 2476                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
 2477 #endif
 2478 
 2479 }
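
/*
 * A minimal sketch of a synchronous flush as an fsync-style caller might
 * drive it: signal a flush and then wait for the inode to settle.
 * example_sync_inode() is a hypothetical helper for illustration only and
 * assumes the caller holds a reference on the inode.
 */
#if 0
static void
example_sync_inode(hammer_inode_t ip)
{
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        hammer_wait_inode(ip);
}
#endif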
 2480 
 2481 /*
 2482  * Called by the backend code when a flush has been completed.
 2483  * The inode has already been removed from the flush list.
 2484  *
 2485  * A pipelined flush can occur, in which case we must re-enter the
 2486  * inode on the list and re-copy its fields.
 2487  */
 2488 void
 2489 hammer_flush_inode_done(hammer_inode_t ip, int error)
 2490 {
 2491         hammer_mount_t hmp;
 2492         int dorel;
 2493 
 2494         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
 2495 
 2496         hmp = ip->hmp;
 2497 
 2498         /*
 2499          * Auto-reflush if the backend could not completely flush
 2500          * the inode.  This fixes a case where a deferred buffer flush
 2501          * could cause fsync to return early.
 2502          */
 2503         if (ip->sync_flags & HAMMER_INODE_MODMASK)
 2504                 ip->flags |= HAMMER_INODE_REFLUSH;
 2505 
 2506         /*
 2507          * Merge left-over flags back into the frontend and fix the state.
 2508          * Incomplete truncations are retained by the backend.
 2509          */
 2510         ip->error = error;
 2511         ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
 2512         ip->sync_flags &= HAMMER_INODE_TRUNCATED;
 2513 
 2514         /*
 2515          * The backend may have adjusted nlinks, so if the adjusted nlinks
 2516          * does not match the frontend, set the frontend's DDIRTY flag again.
 2517          */
 2518         if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
 2519                 ip->flags |= HAMMER_INODE_DDIRTY;
 2520 
 2521         /*
 2522          * Fix up the dirty buffer status.
 2523          */
 2524         if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
 2525                 ip->flags |= HAMMER_INODE_BUFS;
 2526         }
 2527         hammer_redo_fifo_end_flush(ip);
 2528 
 2529         /*
 2530          * Re-set the XDIRTY flag if some of the inode's in-memory records
 2531          * could not be flushed.
 2532          */
 2533         KKASSERT((RB_EMPTY(&ip->rec_tree) &&
 2534                   (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
 2535                  (!RB_EMPTY(&ip->rec_tree) &&
 2536                   (ip->flags & HAMMER_INODE_XDIRTY) != 0));
 2537 
 2538         /*
 2539          * Do not lose track of inodes which no longer have vnode
 2540          * associations, otherwise they may never get flushed again.
 2541          *
 2542          * The reflush flag can be set superfluously, causing extra pain
 2543          * for no reason.  If the inode is no longer modified it no longer
 2544          * needs to be flushed.
 2545          */
 2546         if (ip->flags & HAMMER_INODE_MODMASK) {
 2547                 if (ip->vp == NULL)
 2548                         ip->flags |= HAMMER_INODE_REFLUSH;
 2549         } else {
 2550                 ip->flags &= ~HAMMER_INODE_REFLUSH;
 2551         }
 2552         if (ip->flags & HAMMER_INODE_MODMASK)
 2553                 hammer_inode_dirty(ip);
 2554 
 2555         /*
 2556          * Adjust the flush state.
 2557          */
 2558         if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
 2559                 /*
 2560                  * We were unable to flush out all our records, leave the
 2561                  * inode in a flush state and in the current flush group.
 2562                  * The flush group will be re-run.
 2563                  *
 2564                  * This occurs if the UNDO block gets too full or there is
 2565                  * too much dirty meta-data and allows the flusher to
 2566                  * finalize the UNDO block and then re-flush.
 2567                  */
 2568                 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
 2569                 dorel = 0;
 2570         } else {
 2571                 /*
 2572                  * Remove from the flush_group
 2573                  */
 2574                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
 2575                 ip->flush_group = NULL;
 2576 
 2577 #if 0
 2578                 /*
 2579                  * Clean up the vnode ref and tracking counts.
 2580                  */
 2581                 if (ip->flags & HAMMER_INODE_VHELD) {
 2582                         ip->flags &= ~HAMMER_INODE_VHELD;
 2583                         vrele(ip->vp);
 2584                 }
 2585 #endif
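                      /*
                       * The inode is leaving the flusher's queue; drop the
                       * per-mount and global queued-inode counters to match.
                       */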
 2586                 --hmp->count_iqueued;
 2587                 --hammer_count_iqueued;
 2588 
 2589                 /*
 2590                  * And adjust the state.
 2591                  */
 2592                 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
 2593                         ip->flush_state = HAMMER_FST_IDLE;
 2594                         dorel = 1;
 2595                 } else {
 2596                         ip->flush_state = HAMMER_FST_SETUP;
 2597                         dorel = 0;
 2598                 }
 2599 
 2600                 /*
 2601                  * If the frontend is waiting for a flush to complete,
 2602                  * wake it up.
 2603                  */
 2604                 if (ip->flags & HAMMER_INODE_FLUSHW) {
 2605                         ip->flags &= ~HAMMER_INODE_FLUSHW;
 2606                         wakeup(&ip->flags);
 2607                 }
 2608 
 2609                 /*
 2610                  * If the frontend made more changes and requested another
 2611                  * flush, then try to get it running.
 2612                  *
 2613                  * Reflushes are aborted when the inode is errored out.
 2614                  */
 2615                 if (ip->flags & HAMMER_INODE_REFLUSH) {
 2616                         ip->flags &= ~HAMMER_INODE_REFLUSH;
 2617                         if (ip->flags & HAMMER_INODE_RESIGNAL) {
 2618                                 ip->flags &= ~HAMMER_INODE_RESIGNAL;
 2619                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
 2620                         } else {
 2621                                 hammer_flush_inode(ip, 0);
 2622                         }
 2623                 }
 2624         }
 2625 
 2626         /*
 2627          * If we have no parent dependencies we can clear CONN_DOWN
 2628          */
 2629         if (TAILQ_EMPTY(&ip->target_list))
 2630                 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
 2631 
 2632         /*
 2633          * If the inode is now clean drop the space reservation.
 2634          */
 2635         if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
 2636             (ip->flags & HAMMER_INODE_RSV_INODES)) {
 2637                 ip->flags &= ~HAMMER_INODE_RSV_INODES;
 2638                 --hmp->rsv_inodes;
 2639         }
 2640 
 2641         ip->flags &= ~HAMMER_INODE_SLAVEFLUSH;
 2642 
 2643         if (dorel)
 2644                 hammer_rel_inode(ip, 0);
 2645 }
 2646 
 2647 /*
 2648  * Called from hammer_sync_inode() to synchronize in-memory records
 2649  * to the media.
 2650  */
 2651 static int
 2652 hammer_sync_record_callback(hammer_record_t record, void *data)
 2653 {
 2654         hammer_cursor_t cursor = data;
 2655         hammer_transaction_t trans = cursor->trans;
 2656         hammer_mount_t hmp = trans->hmp;
 2657         int error;
 2658 
 2659         /*
 2660          * Skip records that do not belong to the current flush.
 2661          */
 2662         ++hammer_stats_record_iterations;
 2663         if (record->flush_state != HAMMER_FST_FLUSH)
 2664                 return(0);
 2665 
 2666 #if 1
 2667         if (record->flush_group != record->ip->flush_group) {
 2668                 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group, record->ip->flush_group);
 2669                 if (hammer_debug_critical)
 2670                         Debugger("blah2");
 2671                 return(0);
 2672         }
 2673 #endif
 2674         KKASSERT(record->flush_group == record->ip->flush_group);
 2675 
 2676         /*
 2677          * Interlock the record using the BE flag.  Once BE is set the
 2678          * frontend cannot change the state of FE.
 2679          *
 2680          * NOTE: If FE is set prior to us setting BE we still sync the
 2681          * record out, but the flush completion code converts it to 
 2682          * a delete-on-disk record instead of destroying it.
 2683          */
 2684         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
 2685         record->flags |= HAMMER_RECF_INTERLOCK_BE;
 2686 
 2687         /*
 2688          * The backend has already disposed of the record.
 2689          */
 2690         if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
 2691                 error = 0;
 2692                 goto done;
 2693         }
 2694 
 2695         /*
 2696          * If the whole inode is being deleted and all on-disk records will
 2697          * be deleted very soon, we can't sync any new records to disk
 2698          * because they will be deleted in the same transaction they were
 2699          * created in (delete_tid == create_tid), which will assert.
 2700          *
 2701          * XXX There may be a case with RECORD_ADD with DELETED_FE set
 2702          * that we currently panic on.
 2703          */
 2704         if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
 2705                 switch(record->type) {
 2706                 case HAMMER_MEM_RECORD_DATA:
 2707                         /*
 2708                          * We don't have to do anything; if the record was
 2709                          * committed the space will have been accounted for
 2710                          * in the blockmap.
 2711                          */
 2712                         /* fall through */
 2713                 case HAMMER_MEM_RECORD_GENERAL:
 2714                         /*
 2715                          * Set deleted-by-backend flag.  Do not set the
 2716                          * backend committed flag, because we are throwing
 2717                          * the record away.
 2718                          */
 2719                         record->flags |= HAMMER_RECF_DELETED_BE;
 2720                         ++record->ip->rec_generation;
 2721                         error = 0;
 2722                         goto done;
 2723                 case HAMMER_MEM_RECORD_ADD:
 2724                         panic("hammer_sync_record_callback: illegal add "
 2725                               "during inode deletion record %p", record);
 2726                         break; /* NOT REACHED */
 2727                 case HAMMER_MEM_RECORD_INODE:
 2728                         panic("hammer_sync_record_callback: attempt to "
 2729                               "sync inode record %p?", record);
 2730                         break; /* NOT REACHED */
 2731                 case HAMMER_MEM_RECORD_DEL:
 2732                         /* 
 2733                          * Follow through and issue the on-disk deletion
 2734                          */
 2735                         break;
 2736                 }
 2737         }
 2738 
 2739         /*
 2740          * If DELETED_FE is set special handling is needed for directory
 2741          * entries.  Dependent pieces related to the directory entry may
 2742          * have already been synced to disk.  If this occurs we have to
 2743          * sync the directory entry and then change the in-memory record
 2744          * from an ADD to a DELETE to cover the fact that it's been
 2745          * deleted by the frontend.
 2746          *
 2747          * A directory delete covering record (MEM_RECORD_DEL) can never
 2748          * be deleted by the frontend.
 2749          *
 2750          * Any other record type (aka DATA) can be deleted by the frontend.
 2751          * XXX At the moment the flusher must skip it because there may
 2752          * be another data record in the flush group for the same block,
 2753          * meaning that some frontend data changes can leak into the backend's
 2754          * synchronization point.
 2755          */
 2756         if (record->flags & HAMMER_RECF_DELETED_FE) {
 2757                 if (record->type == HAMMER_MEM_RECORD_ADD) {
 2758                         /*
 2759                          * Convert a front-end deleted directory-add to
 2760                          * a directory-delete entry later.
 2761                          */
 2762                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
 2763                 } else {
 2764                         /*
 2765                          * Dispose of the record (race case).  Mark as
 2766                          * deleted by backend (and not committed).
 2767                          */
 2768                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
 2769                         record->flags |= HAMMER_RECF_DELETED_BE;
 2770                         ++record->ip->rec_generation;
 2771                         error = 0;
 2772                         goto done;
 2773                 }
 2774         }
 2775 
 2776         /*
 2777          * Assign the create_tid for new records.  Deletions already
 2778          * have the record's entire key properly set up.
 2779          */
 2780         if (record->type != HAMMER_MEM_RECORD_DEL) {
 2781                 record->leaf.base.create_tid = trans->tid;
 2782                 record->leaf.create_ts = trans->time32;
 2783         }
 2784 
 2785         /*
 2786          * This actually moves the record to the on-media B-Tree.  We
 2787          * must also generate REDO_TERM entries in the UNDO/REDO FIFO
 2788          * indicating that the related REDO_WRITE(s) have been committed.
 2789          *
 2790          * During recovery any REDO_TERM's within the nominal recovery span
 2791          * are ignored since the related meta-data is being undone, causing
 2792          * any matching REDO_WRITEs to execute.  The REDO_TERMs outside
 2793          * the nominal recovery span will match against REDO_WRITEs and
 2794          * prevent them from being executed (because the meta-data has
 2795          * already been synchronized).
 2796          */
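              /*
               * (For a bulk DATA record, leaf.base.key is normally the ending
               *  offset of the write, so key - data_len recovers the starting
               *  file offset that the REDO_TERM_WRITE entry refers to.)
               */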
 2797         if (record->flags & HAMMER_RECF_REDO) {
 2798                 KKASSERT(record->type == HAMMER_MEM_RECORD_DATA);
 2799                 hammer_generate_redo(trans, record->ip,
 2800                                      record->leaf.base.key -
 2801                                          record->leaf.data_len,
 2802                                      HAMMER_REDO_TERM_WRITE,
 2803                                      NULL,
 2804                                      record->leaf.data_len);
 2805         }
 2806 
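              /*
               * Push the record out to the media B-Tree.  An EDEADLK return
               * from the cursor code means the operation must be retried:
               * tear the cursor down and re-initialize it before looping.
               */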
 2807         for (;;) {
 2808                 error = hammer_ip_sync_record_cursor(cursor, record);
 2809                 if (error != EDEADLK)
 2810                         break;
 2811                 hammer_done_cursor(cursor);
 2812                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
 2813                                            record->ip);
 2814                 if (error)
 2815                         break;
 2816         }
 2817         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
 2818 
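              /*
               * Negative error values abort the RB_SCAN in hammer_sync_inode();
               * the caller converts the result back to a positive errno.
               */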
 2819         if (error)
 2820                 error = -error;
 2821 done:
 2822         hammer_flush_record_done(record, error);
 2823 
 2824         /*
 2825          * Do partial finalization if we have built up too many dirty
 2826          * buffers.  Otherwise a buffer cache deadlock can occur when
 2827          * doing things like creating tens of thousands of tiny files.
 2828          *
 2829          * We must release our cursor lock to avoid a 3-way deadlock
 2830          * due to the exclusive sync lock the finalizer must get.
 2831          *
 2832          * WARNING: See warnings in hammer_unlock_cursor() function.
 2833          */
 2834         if (hammer_flusher_meta_limit(hmp) ||
 2835             vm_page_count_severe()) {
 2836                 hammer_unlock_cursor(cursor);
 2837                 hammer_flusher_finalize(trans, 0);
 2838                 hammer_lock_cursor(cursor);
 2839         }
 2840         return(error);
 2841 }
 2842 
 2843 /*
 2844  * Backend function called by the flusher to sync an inode to media.
 2845  */
 2846 int
 2847 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
 2848 {
 2849         struct hammer_cursor cursor;
 2850         hammer_node_t tmp_node;
 2851         hammer_record_t depend;
 2852         hammer_record_t next;
 2853         int error, tmp_error;
 2854         u_int64_t nlinks;
 2855 
 2856         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
 2857                 return(0);
 2858 
 2859         error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
 2860         if (error)
 2861                 goto done;
 2862 
 2863         /*
 2864          * Any directory records referencing this inode which are not in
 2865          * our current flush group require an adjustment to the nlink
 2866          * count we synchronize to disk.
 2867          *
 2868          * Records which are in our flush group can be unlinked from our
 2869          * inode now, potentially allowing the inode to be physically
 2870          * deleted.
 2871          *
 2872          * This cannot block.
 2873          */
 2874         nlinks = ip->ino_data.nlinks;
 2875         next = TAILQ_FIRST(&ip->target_list);
 2876         while ((depend = next) != NULL) {
 2877                 next = TAILQ_NEXT(depend, target_entry);
 2878                 if (depend->flush_state == HAMMER_FST_FLUSH &&
 2879                     depend->flush_group == ip->flush_group) {
 2880                         /*
 2881                          * If this is an ADD that was deleted by the frontend
 2882                          * the frontend nlinks count will have already been
 2883                          * decremented, but the backend is going to sync its
 2884                          * directory entry and must account for it.  The
 2885                          * record will be converted to a delete-on-disk when
 2886                          * it gets synced.
 2887                          *
 2888                          * If the ADD was not deleted by the frontend we
 2889                          * can remove the dependency from our target_list.
 2890                          */
 2891                         if (depend->flags & HAMMER_RECF_DELETED_FE) {
 2892                                 ++nlinks;
 2893                         } else {
 2894                                 TAILQ_REMOVE(&ip->target_list, depend,
 2895                                              target_entry);
 2896                                 depend->target_ip = NULL;
 2897                         }
 2898                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
 2899                         /*
 2900                          * Not part of our flush group and not deleted by
 2901                          * the front-end, adjust the link count synced to
 2902                          * the media (undo what the frontend did when it
 2903                          * queued the record).
 2904                          */
 2905                         KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
 2906                         switch(depend->type) {
 2907                         case HAMMER_MEM_RECORD_ADD:
 2908                                 --nlinks;
 2909                                 break;
 2910                         case HAMMER_MEM_RECORD_DEL:
 2911                                 ++nlinks;
 2912                                 break;
 2913                         default:
 2914                                 break;
 2915                         }
 2916                 }
 2917         }
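              /*
               * (Example: a hard link whose directory ADD record is queued in
               *  a later flush group is not yet visible on-media, so nlinks
               *  is decremented above to keep the synchronized count
               *  consistent with the media.)
               */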
 2918 
 2919         /*
 2920          * Set dirty if we had to modify the link count.
 2921          */
 2922         if (ip->sync_ino_data.nlinks != nlinks) {
 2923                 KKASSERT((int64_t)nlinks >= 0);
 2924                 ip->sync_ino_data.nlinks = nlinks;
 2925                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
 2926         }
 2927 
 2928         /*
 2929          * If there is a truncation queued, destroy any data past the (aligned)
 2930          * truncation point.  Userland will have dealt with the buffer
 2931          * containing the truncation point for us.
 2932          *
 2933          * We don't flush pending frontend data buffers until after we've
 2934          * dealt with the truncation.
 2935          */
 2936         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
 2937                 /*
 2938                  * Interlock trunc_off.  The VOP front-end may continue to
 2939                  * make adjustments to it while we are blocked.
 2940                  */
 2941                 off_t trunc_off;
 2942                 off_t aligned_trunc_off;
 2943                 int blkmask;
 2944 
 2945                 trunc_off = ip->sync_trunc_off;
 2946                 blkmask = hammer_blocksize(trunc_off) - 1;
 2947                 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
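                      /*
                       * (e.g. with a 16KB block size blkmask is 0x3FFF, so a
                       *  trunc_off of 0x4001 rounds up to an aligned_trunc_off
                       *  of 0x8000, the next block boundary at or above the
                       *  truncation point.)
                       */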
 2948 
 2949                 /*
 2950                  * Delete any whole blocks on-media.  The front-end has
 2951                  * already cleaned out any partial block and made it
 2952                  * pending.  The front-end may have updated trunc_off
 2953                  * while we were blocked so we only use sync_trunc_off.
 2954                  *
 2955                  * This operation can blow out the buffer cache; EWOULDBLOCK
 2956                  * means we were unable to complete the deletion.  The
 2957                  * deletion will update sync_trunc_off in that case.
 2958                  */
 2959                 error = hammer_ip_delete_range(&cursor, ip,
 2960                                                 aligned_trunc_off,
 2961                                                 0x7FFFFFFFFFFFFFFFLL, 2);
 2962                 if (error == EWOULDBLOCK) {
 2963                         ip->flags |= HAMMER_INODE_WOULDBLOCK;
 2964                         error = 0;
 2965                         goto defer_buffer_flush;
 2966                 }
 2967 
 2968                 if (error)
 2969                         goto done;
 2970 
 2971                 /*
 2972                  * Generate a REDO_TERM_TRUNC entry in the UNDO/REDO FIFO.
 2973                  *
 2974                  * XXX we do this even if we did not previously generate
 2975                  * a REDO_TRUNC record.  This operation may enclose the
 2976                  * range for multiple prior truncation entries in the REDO
 2977                  * log.
 2978                  */
 2979                 if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR &&
 2980                     (ip->flags & HAMMER_INODE_RDIRTY)) {
 2981                         hammer_generate_redo(trans, ip, aligned_trunc_off,
 2982                                              HAMMER_REDO_TERM_TRUNC,
 2983                                              NULL, 0);
 2984                 }
 2985 
 2986                 /*
 2987                  * Clear the truncation flag on the backend after we have
 2988                  * completed the deletions.  Backend data is now good again
 2989                  * (including new records we are about to sync, below).
 2990                  *
 2991                  * Leave sync_trunc_off intact.  As we write additional
 2992                  * records the backend will update sync_trunc_off.  This
 2993                  * tells the backend whether it can skip the overwrite
 2994                  * test.  This should work properly even when the backend
 2995                  * writes full blocks where the truncation point straddles
 2996                  * the block because the comparison is against the base
 2997                  * offset of the record.
 2998                  */
 2999                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
 3000                 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
 3001         } else {
 3002                 error = 0;
 3003         }
 3004 
 3005         /*
 3006          * Now sync related records.  These will typically be directory
 3007          * entries, records tracking direct-writes, or delete-on-disk records.
 3008          */
 3009         if (error == 0) {
 3010                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
 3011                                     hammer_sync_record_callback, &cursor);
 3012                 if (tmp_error < 0)
 3013                         tmp_error = -tmp_error;
 3014                 if (tmp_error)
 3015                         error = tmp_error;
 3016         }
 3017         hammer_cache_node(&ip->cache[1], cursor.node);
 3018 
 3019         /*
 3020          * Re-seek for inode update, assuming our cache hasn't been ripped
 3021          * out from under us.
 3022          */
 3023         if (error == 0) {
 3024                 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
 3025                 if (tmp_node) {
 3026                         hammer_cursor_downgrade(&cursor);
 3027                         hammer_lock_sh(&tmp_node->lock);
 3028                         if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
 3029                                 hammer_cursor_seek(&cursor, tmp_node, 0);
 3030                         hammer_unlock(&tmp_node->lock);
 3031                         hammer_rel_node(tmp_node);
 3032                 }
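                      /*
                       * The re-seek is best-effort; any error from it is
                       * discarded below and the inode update can re-position
                       * the cursor on its own.
                       */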
 3033                 error = 0;
 3034         }
 3035 
 3036         /*
 3037          * If we are deleting the inode the frontend had better not have
 3038          * any active references on elements making up the inode.
 3039          *
 3040          * The call to hammer_ip_delete_clean() cleans up auxiliary records
 3041          * but not DB or DATA records.  Those must have already been deleted
 3042          * by the normal truncation mechanic.
 3043          */
 3044         if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
 3045             RB_EMPTY(&ip->rec_tree) &&
 3046             (ip->sync_flags & HAMMER_INODE_DELETING) &&
 3047             (ip->flags & HAMMER_INODE_DELETED) == 0) {
 3048                 int count1 = 0;
 3049 
 3050                 error = hammer_ip_delete_clean(&cursor, ip, &count1);
 3051                 if (error == 0) {
 3052                         ip->flags |= HAMMER_INODE_DELETED;
 3053                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
 3054                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
 3055                         KKASSERT(RB_EMPTY(&ip->rec_tree));
 3056 
 3057                         /*
 3058                          * Set delete_tid in both the frontend and backend
 3059                          * copy of the inode record.  The DELETED flag handles
 3060                          * this, do not set DDIRTY.
 3061                          */
 3062                         ip->ino_leaf.base.delete_tid = trans->tid;
 3063                         ip->sync_ino_leaf.base.delete_tid = trans->tid;
 3064                         ip->ino_leaf.delete_ts = trans->time32;
 3065                         ip->sync_ino_leaf.delete_ts = trans->time32;
 3066 
 3067 
 3068                         /*
 3069                          * Adjust the inode count in the volume header
 3070                          */
 3071                         hammer_sync_lock_sh(trans);
 3072                         if (ip->flags & HAMMER_INODE_ONDISK) {
 3073                                 hammer_modify_volume_field(trans,
 3074                                                            trans->rootvol,
 3075                                                            vol0_stat_inodes);
 3076                                 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
 3077                                 hammer_modify_volume_done(trans->rootvol);
 3078                         }
 3079                         hammer_sync_unlock(trans);
 3080                 }
 3081         }
 3082 
 3083         if (error)
 3084                 goto done;
 3085         ip->sync_flags &= ~HAMMER_INODE_BUFS;
 3086 
 3087 defer_buffer_flush:
 3088         /*
 3089          * Now update the inode's on-disk inode-data and/or on-disk record.
 3090          * DELETED and ONDISK are managed only in ip->flags.
 3091          *
 3092          * In the case of a deferred buffer flush we still update the on-disk
 3093          * inode to satisfy visibility requirements if there happen to be
 3094          * directory dependencies.
 3095          */
 3096         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
 3097         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
 3098                 /*
 3099                  * If deleted and on-disk, don't set any additional flags.
 3100                  * The delete flag takes care of things.
 3101                  *
 3102                  * Clear flags which may have been set by the frontend.
 3103                  */
 3104                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
 3105                                     HAMMER_INODE_SDIRTY |
 3106                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
 3107                                     HAMMER_INODE_DELETING);
 3108                 break;
 3109         case HAMMER_INODE_DELETED:
 3110                 /*
 3111                  * Take care of the case where a deleted inode was never
 3112                  * flushed to the disk in the first place.
 3113                  *
 3114                  * Clear flags which may have been set by the frontend.
 3115                  */
 3116                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
 3117                                     HAMMER_INODE_SDIRTY |
 3118                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
 3119                                     HAMMER_INODE_DELETING);
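                      /*
                       * Destroy any remaining in-memory records.  Marking each
                       * record deleted-by-backend and dropping the temporary
                       * reference lets hammer_rel_mem_record() dispose of it
                       * and remove it from rec_tree, so the loop terminates.
                       */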
 3120                 while (RB_ROOT(&ip->rec_tree)) {
 3121                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
 3122                         hammer_ref(&record->lock);
 3123                         KKASSERT(hammer_oneref(&record->lock));
 3124                         record->flags |= HAMMER_RECF_DELETED_BE;
 3125                         ++record->ip->rec_generation;
 3126                         hammer_rel_mem_record(record);
 3127                 }
 3128                 break;
 3129         case HAMMER_INODE_ONDISK:
 3130                 /*
 3131                  * If already on-disk, do not set any additional flags.
 3132                  */
 3133                 break;
 3134         default:
 3135                 /*
 3136                  * If not on-disk and not deleted, set DDIRTY to force
 3137                  * an initial record to be written.
 3138                  *
 3139                  * Also set the create_tid in both the frontend and backend
 3140                  * copy of the inode record.
 3141                  */
 3142                 ip->ino_leaf.base.create_tid = trans->tid;
 3143                 ip->ino_leaf.create_ts = trans->time32;
 3144                 ip->sync_ino_leaf.base.create_tid = trans->tid;
 3145                 ip->sync_ino_leaf.create_ts = trans->time32;
 3146                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
 3147                 break;
 3148         }
 3149 
 3150         /*
 3151          * If DDIRTY or SDIRTY is set, write out a new record.
 3152          * If the inode is already on-disk the old record is marked as
 3153          * deleted.
 3154          *
 3155          * If DELETED is set hammer_update_inode() will delete the existing
 3156          * record without writing out a new one.
 3157          *
 3158          * If *ONLY* the ITIMES flag is set we can update the record in-place.
 3159          */
 3160         if (ip->flags & HAMMER_INODE_DELETED) {
 3161                 error = hammer_update_inode(&cursor, ip);
 3162         } else 
 3163         if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) &&
 3164             (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
 3165                 error = hammer_update_itimes(&cursor, ip);
 3166         } else
 3167         if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY |
 3168                               HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
 3169                 error = hammer_update_inode(&cursor, ip);
 3170         }
 3171 done:
 3172         if (ip->flags & HAMMER_INODE_MODMASK)
 3173                 hammer_inode_dirty(ip);
 3174         if (error) {
 3175                 hammer_critical_error(ip->hmp, ip, error,
 3176                                       "while syncing inode");
 3177         }
 3178         hammer_done_cursor(&cursor);
 3179         return(error);
 3180 }
 3181 
 3182 /*
 3183  * This routine is called when the OS is no longer actively referencing
 3184  * the inode (but might still be keeping it cached), or when releasing
 3185  * the last reference to an inode.
 3186  *
 3187  * At this point if the inode's nlinks count is zero we want to destroy
 3188  * it, which may mean destroying it on-media too.
 3189  */
 3190 void
 3191 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
 3192 {
 3193         struct vnode *vp;
 3194 
 3195         /*
 3196          * Set the DELETING flag when the link count drops to 0 and the
 3197          * OS no longer has any opens on the inode.
 3198          *
 3199          * The backend will clear DELETING (a mod flag) and set DELETED
 3200          * (a state flag) when it is actually able to perform the
 3201          * operation.
 3202          *
 3203          * Don't reflag the deletion if the flusher is currently syncing
 3204          * one that was already flagged.  A previously set DELETING flag
 3205          * may bounce around flags and sync_flags until the operation is
 3206          * completely done.
 3207          *
 3208          * Do not attempt to modify a snapshot inode (one set to read-only).
 3209          */
 3210         if (ip->ino_data.nlinks == 0 &&
 3211             ((ip->flags | ip->sync_flags) & (HAMMER_INODE_RO|HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
 3212                 ip->flags |= HAMMER_INODE_DELETING;
 3213                 ip->flags |= HAMMER_INODE_TRUNCATED;
 3214                 ip->trunc_off = 0;
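                      /*
                       * (Flagging TRUNCATED with trunc_off = 0 queues the
                       *  destruction of all of the file's data records when
                       *  the backend eventually syncs the inode.)
                       */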
 3215                 vp = NULL;
 3216                 if (getvp) {
 3217                         if (hammer_get_vnode(ip, &vp) != 0)
 3218                                 return;
 3219                 }
 3220 
 3221                 /*
 3222                  * Final cleanup
 3223                  */
 3224                 if (ip->vp)
 3225                         nvtruncbuf(ip->vp, 0, HAMMER_BUFSIZE, 0, 0);
 3226                 if (ip->flags & HAMMER_INODE_MODMASK)
 3227                         hammer_inode_dirty(ip);
 3228                 if (getvp)
 3229                         vput(vp);
 3230         }
 3231 }
 3232 
 3233 /*
 3234  * After potentially resolving a dependency the inode is tested
 3235  * to determine whether it needs to be reflushed.
 3236  */
 3237 void
 3238 hammer_test_inode(hammer_inode_t ip)
 3239 {
 3240         if (ip->flags & HAMMER_INODE_REFLUSH) {
 3241                 ip->flags &= ~HAMMER_INODE_REFLUSH;
 3242                 hammer_ref(&ip->lock);
 3243                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
 3244                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
 3245                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
 3246                 } else {
 3247                         hammer_flush_inode(ip, 0);
 3248                 }
 3249                 hammer_rel_inode(ip, 0);
 3250         }
 3251 }
 3252 
 3253 /*
 3254  * Clear the RECLAIM flag on an inode.  This occurs when the inode is
 3255  * reassociated with a vp or just before it gets freed.
 3256  *
 3257  * Pipeline wakeups to threads blocked due to an excessive number of
 3258  * detached inodes.  This typically occurs when atime updates accumulate
 3259  * while scanning a directory tree.
 3260  */
 3261 static void
 3262 hammer_inode_wakereclaims(hammer_inode_t ip)
 3263 {
 3264         struct hammer_reclaim *reclaim;
 3265         hammer_mount_t hmp = ip->hmp;
 3266 
 3267         if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
 3268                 return;
 3269 
 3270         --hammer_count_reclaims;
 3271         --hmp->count_reclaims;
 3272         ip->flags &= ~HAMMER_INODE_RECLAIM;
 3273 
 3274         if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
 3275                 KKASSERT(reclaim->count > 0);
 3276                 if (--reclaim->count == 0) {
 3277                         TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
 3278                         wakeup(reclaim);
 3279                 }
 3280         }
 3281 }
 3282 
 3283 /*
 3284  * Set up our reclaim pipeline.  We only let so many detached (and dirty)
 3285  * inodes build up before we start blocking.  This routine is called
 3286  * if a new inode is created or an inode is loaded from media.
 3287  *
 3288  * When we block we don't care *which* inode has finished reclaiming,
 3289  * as long as one does.
 3290  *
 3291  * The reclaim pipeline is primarily governed by the auto-flush which is
 3292  * 1/4 hammer_limit_reclaims.  We don't want to block if the count is
 3293  * less than 1/2 hammer_limit_reclaims.  From 1/2 to full count is
 3294  * dynamically governed.
 3295  */
 3296 void
 3297 hammer_inode_waitreclaims(hammer_transaction_t trans)
 3298 {
 3299         hammer_mount_t hmp = trans->hmp;
 3300         struct hammer_reclaim reclaim;
 3301         int lower_limit;
 3302 
 3303         /*
 3304          * Track inode load, delay if the number of reclaiming inodes is
 3305          * between 2/4 and 4/4 of hammer_limit_reclaims, depending on per-process load.
 3306          */
 3307         if (curthread->td_proc) {
 3308                 struct hammer_inostats *stats;
 3309 
 3310                 stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
 3311                 ++stats->count;
 3312 
 3313                 if (stats->count > hammer_limit_reclaims / 2)
 3314                         stats->count = hammer_limit_reclaims / 2;
 3315                 lower_limit = hammer_limit_reclaims - stats->count;
 3316                 if (hammer_debug_general & 0x10000) {
 3317                         kprintf("pid %5d limit %d\n",
 3318                                 (int)curthread->td_proc->p_pid, lower_limit);
 3319                 }
 3320         } else {
 3321                 lower_limit = hammer_limit_reclaims * 3 / 4;
 3322         }
 3323         if (hmp->count_reclaims >= lower_limit) {
 3324                 reclaim.count = 1;
 3325                 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
 3326                 tsleep(&reclaim, 0, "hmrrcm", hz);
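                      /*
                       * If the tsleep timed out (hz ticks, about one second)
                       * before enough inodes were reclaimed our entry is still
                       * queued and must be removed here; otherwise
                       * hammer_inode_wakereclaims() already dequeued it and
                       * zeroed the count.
                       */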
 3327                 if (reclaim.count > 0)
 3328                         TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
 3329         }
 3330 }
 3331 
 3332 /*
 3333  * Keep track of reclaim statistics on a per-pid basis using a loose
 3334  * 4-way set associative hash table.  Collisions inherit the count of
 3335  * the previous entry.
 3336  *
 3337  * NOTE: We want to be careful here to limit the chain size.  If the chain
 3338  *       size is too large a pid will spread its stats out over too many
 3339  *       entries under certain types of heavy filesystem activity and
 3340  *       wind up not delaying long enough.
 3341  */
 3342 static
 3343 struct hammer_inostats *
 3344 hammer_inode_inostats(hammer_mount_t hmp, pid_t pid)
 3345 {
 3346         struct hammer_inostats *stats;
 3347         int delta;
 3348         int chain;
 3349         static volatile int iterator;   /* we don't care about MP races */
 3350 
 3351         /*
 3352          * Chain up to 4 times to find our entry.
 3353          */
 3354         for (chain = 0; chain < 4; ++chain) {
 3355                 stats = &hmp->inostats[(pid + chain) & HAMMER_INOSTATS_HMASK];
 3356                 if (stats->pid == pid)
 3357                         break;
 3358         }
 3359 
 3360         /*
 3361          * Replace one of the four chaining entries with our new entry.
 3362          */
 3363         if (chain == 4) {
 3364                 stats = &hmp->inostats[(pid + (iterator++ & 3)) &
 3365                                        HAMMER_INOSTATS_HMASK];
 3366                 stats->pid = pid;
 3367         }
 3368 
 3369         /*
 3370          * Decay the entry
 3371          */
 3372         if (stats->count && stats->ltick != ticks) {
 3373                 delta = ticks - stats->ltick;
 3374                 stats->ltick = ticks;
 3375                 if (delta <= 0 || delta > hz * 60)
 3376                         stats->count = 0;
 3377                 else
 3378                         stats->count = stats->count * hz / (hz + delta);
 3379         }
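              /*
               * (Decay example: with hz = 100, a 100-tick (one second) gap
               *  halves the count: count * 100 / (100 + 100).)
               */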
 3380         if (hammer_debug_general & 0x10000)
 3381                 kprintf("pid %5d stats %d\n", (int)pid, stats->count);
 3382         return (stats);
 3383 }
 3384 
 3385 #if 0
 3386 
 3387 /*
 3388  * XXX not used, doesn't work very well due to the large batching nature
 3389  * of flushes.
 3390  *
 3391  * A larger than normal backlog of inodes is sitting in the flusher,
 3392  * enforce a general slowdown to let it catch up.  This routine is only
 3393  * called on completion of a non-flusher-related transaction which
 3394  * performed B-Tree node I/O.
 3395  *
 3396  * It is possible for the flusher to stall in a continuous load.
 3397  * blogbench -i1000 -o seems to do a good job generating this sort of load.
 3398  * If the flusher is unable to catch up the inode count can bloat until
 3399  * we run out of kvm.
 3400  *
 3401  * This is a bit of a hack.
 3402  */
 3403 void
 3404 hammer_inode_waithard(hammer_mount_t hmp)
 3405 {
 3406         /*
 3407          * Hysteresis.
 3408          */
 3409         if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
 3410                 if (hmp->count_reclaims < hammer_limit_reclaims / 2 &&
 3411                     hmp->count_iqueued < hmp->count_inodes / 20) {
 3412                         hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
 3413                         return;
 3414                 }
 3415         } else {
 3416                 if (hmp->count_reclaims < hammer_limit_reclaims ||
 3417                     hmp->count_iqueued < hmp->count_inodes / 10) {
 3418                         return;
 3419                 }
 3420                 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
 3421         }
 3422 
 3423         /*
 3424          * Block for one flush cycle.
 3425          */
 3426         hammer_flusher_wait_next(hmp);
 3427 }
 3428 
 3429 #endif
