hammer_cursor.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*
    2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
    3  * 
    4  * This code is derived from software contributed to The DragonFly Project
    5  * by Matthew Dillon <dillon@backplane.com>
    6  * 
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in
   15  *    the documentation and/or other materials provided with the
   16  *    distribution.
   17  * 3. Neither the name of The DragonFly Project nor the names of its
   18  *    contributors may be used to endorse or promote products derived
   19  *    from this software without specific, prior written permission.
   20  * 
   21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
   25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  */
   34 
   35 /*
   36  * HAMMER B-Tree index - cursor support routines
   37  */
   38 #include "hammer.h"
   39 
   40 static int hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive);
   41 
   42 /*
   43  * Initialize a fresh cursor using the B-Tree node cache.  If the cache
   44  * is not available initialize a fresh cursor at the root of the filesystem.
   45  */
   46 int
   47 hammer_init_cursor(hammer_transaction_t trans, hammer_cursor_t cursor,
   48                    hammer_node_cache_t cache, hammer_inode_t ip)
   49 {
   50         hammer_volume_t volume;
   51         hammer_node_t node;
   52         hammer_mount_t hmp;
   53         u_int tticks;
   54         int error;
   55 
   56         bzero(cursor, sizeof(*cursor));
   57 
   58         cursor->trans = trans;
   59         hmp = trans->hmp;
   60 
   61         /*
   62          * As the number of inodes queued to the flusher increases we use
   63          * time-domain multiplexing to control read vs flush performance.
   64          * We have to do it here, before acquiring any ip or node locks,
   65          * to avoid deadlocking or excessively delaying the flusher.
   66          *
   67          * The full time period is hammer_tdmux_ticks, typically 1/5 of
   68          * a second.
   69          *
   70          * inode allocation begins to get restrained at 2/4 the limit
   71          * via the "hmrrcm" mechanism in hammer_inode.  We want to begin
   72          * limiting read activity before that to try to avoid processes
   73          * stalling out in "hmrrcm".
   74          */
   75         tticks = hammer_tdmux_ticks;
   76         if (trans->type != HAMMER_TRANS_FLS && tticks &&
   77             hmp->count_reclaims > hammer_limit_reclaims / tticks &&
   78             hmp->count_reclaims > hammer_autoflush * 2 &&
   79             hammer_flusher_running(hmp)) {
   80                 u_int rticks;
   81                 u_int xticks;
   82                 u_int dummy;
   83 
   84                 /*
   85                  * 0 ... xticks ... tticks
   86                  *
   87                  * rticks is the calculated position, xticks is the demarc
   88                  * where values below xticks are reserved for the flusher
   89                  * and values >= to xticks may be used by the frontend.
   90                  *
   91                  * At least one tick is always made available for the
   92                  * frontend.
   93                  */
   94                 rticks = (u_int)ticks % tticks;
   95                 xticks = hmp->count_reclaims * tticks / hammer_limit_reclaims;
   96 
   97                 /*
   98                  * Ensure rticks and xticks are stable
   99                  */
  100                 cpu_ccfence();
  101                 if (rticks < xticks) {
  102                         if (hammer_debug_general & 0x0004)
  103                                 kprintf("rt %3u, xt %3u, tt %3u\n",
  104                                         rticks, xticks, tticks);
  105                         tsleep(&dummy, 0, "htdmux", xticks - rticks);
  106                 }
  107         }
  108 
  109         /*
  110          * If the cursor operation is on behalf of an inode, lock
  111          * the inode.
  112          *
  113          * When acquiring a shared lock on an inode on which the backend
  114          * flusher deadlocked, wait up to hammer_tdmux_ticks (1 second)
  115          * for the deadlock to clear.
  116          */
  117         if ((cursor->ip = ip) != NULL) {
  118                 ++ip->cursor_ip_refs;
  119                 if (trans->type == HAMMER_TRANS_FLS) {
  120                         hammer_lock_ex(&ip->lock);
  121                 } else {
  122 #if 0
  123                         if (ip->cursor_exclreq_count) {
  124                                 tsleep(&ip->cursor_exclreq_count, 0,
  125                                        "hstag1", hammer_tdmux_ticks);
  126                         }
  127 #endif
  128                         hammer_lock_sh(&ip->lock);
  129                 }
  130         }
  131 
  132         /*
  133          * Step 1 - acquire a locked node from the cache if possible
  134          */
  135         if (cache && cache->node) {
  136                 node = hammer_ref_node_safe(trans, cache, &error);
  137                 if (error == 0) {
  138                         hammer_lock_sh(&node->lock);
  139                         if (node->flags & HAMMER_NODE_DELETED) {
  140                                 hammer_unlock(&node->lock);
  141                                 hammer_rel_node(node);
  142                                 node = NULL;
  143                         }
  144                 }
  145                 if (node == NULL)
  146                         ++hammer_stats_btree_root_iterations;
  147         } else {
  148                 node = NULL;
  149                 ++hammer_stats_btree_root_iterations;
  150         }
  151 
  152         /*
  153          * Step 2 - If we couldn't get a node from the cache, get
  154          * the one from the root of the filesystem.
  155          */
  156         while (node == NULL) {
  157                 volume = hammer_get_root_volume(hmp, &error);
  158                 if (error)
  159                         break;
  160                 node = hammer_get_node(trans, volume->ondisk->vol0_btree_root,
  161                                        0, &error);
  162                 hammer_rel_volume(volume, 0);
  163                 if (error)
  164                         break;
  165                 /*
  166                  * When the frontend acquires the root b-tree node while the
  167                  * backend is deadlocked on it, wait up to hammer_tdmux_ticks
  168                  * (1 second) for the deadlock to clear.
  169                  */
  170 #if 0
  171                 if (node->cursor_exclreq_count &&
  172                     cursor->trans->type != HAMMER_TRANS_FLS) {
  173                         tsleep(&node->cursor_exclreq_count, 0,
  174                                "hstag3", hammer_tdmux_ticks);
  175                 }
  176 #endif
  177                 hammer_lock_sh(&node->lock);
  178 
  179                 /*
  180                  * If someone got in before we could lock the node, retry.
  181                  */
  182                 if (node->flags & HAMMER_NODE_DELETED) {
  183                         hammer_unlock(&node->lock);
  184                         hammer_rel_node(node);
  185                         node = NULL;
  186                         continue;
  187                 }
  188                 if (volume->ondisk->vol0_btree_root != node->node_offset) {
  189                         hammer_unlock(&node->lock);
  190                         hammer_rel_node(node);
  191                         node = NULL;
  192                         continue;
  193                 }
  194         }
  195 
  196         /*
  197          * Step 3 - finish initializing the cursor by acquiring the parent
  198          */
  199         cursor->node = node;
  200         if (error == 0)
  201                 error = hammer_load_cursor_parent(cursor, 0);
  202         KKASSERT(error == 0);
  203         /* if (error) hammer_done_cursor(cursor); */
  204         return(error);
  205 }
  206 
  207 /*
  208  * Normalize a cursor.  Sometimes cursors can be left in a state
  209  * where node is NULL.  If the cursor is in this state, cursor up.
  210  */
  211 void
  212 hammer_normalize_cursor(hammer_cursor_t cursor)
  213 {
  214         if (cursor->node == NULL) {
  215                 KKASSERT(cursor->parent != NULL);
  216                 hammer_cursor_up(cursor);
  217         }
  218 }
  219 
  220 
  221 /*
  222  * We are finished with a cursor.  We NULL out various fields as sanity
  223  * check, in case the structure is inappropriately used afterwords.
  224  */
  225 void
  226 hammer_done_cursor(hammer_cursor_t cursor)
  227 {
  228         hammer_inode_t ip;
  229 
  230         KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
  231         if (cursor->parent) {
  232                 hammer_unlock(&cursor->parent->lock);
  233                 hammer_rel_node(cursor->parent);
  234                 cursor->parent = NULL;
  235         }
  236         if (cursor->node) {
  237                 hammer_unlock(&cursor->node->lock);
  238                 hammer_rel_node(cursor->node);
  239                 cursor->node = NULL;
  240         }
  241         if (cursor->data_buffer) {
  242                 hammer_rel_buffer(cursor->data_buffer, 0);
  243                 cursor->data_buffer = NULL;
  244         }
  245         if ((ip = cursor->ip) != NULL) {
  246                 KKASSERT(ip->cursor_ip_refs > 0);
  247                 --ip->cursor_ip_refs;
  248                 hammer_unlock(&ip->lock);
  249                 cursor->ip = NULL;
  250         }
  251         if (cursor->iprec) {
  252                 hammer_rel_mem_record(cursor->iprec);
  253                 cursor->iprec = NULL;
  254         }
  255 
  256         /*
  257          * If we deadlocked this node will be referenced.  Do a quick
  258          * lock/unlock to wait for the deadlock condition to clear.
  259          *
  260          * Maintain exclreq_count / wakeup as necessary to notify new
  261          * entrants into ip.  We continue to hold the fs_token so our
  262          * EDEADLK retry loop should get its chance before another thread
  263          * steals the lock.
  264          */
  265         if (cursor->deadlk_node) {
  266 #if 0
  267                 if (ip && cursor->trans->type == HAMMER_TRANS_FLS)
  268                         ++ip->cursor_exclreq_count;
  269                 ++cursor->deadlk_node->cursor_exclreq_count;
  270 #endif
  271                 hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
  272                 hammer_unlock(&cursor->deadlk_node->lock);
  273 #if 0
  274                 if (--cursor->deadlk_node->cursor_exclreq_count == 0)
  275                         wakeup(&cursor->deadlk_node->cursor_exclreq_count);
  276                 if (ip && cursor->trans->type == HAMMER_TRANS_FLS) {
  277                         if (--ip->cursor_exclreq_count == 0)
  278                                 wakeup(&ip->cursor_exclreq_count);
  279                 }
  280 #endif
  281                 hammer_rel_node(cursor->deadlk_node);
  282                 cursor->deadlk_node = NULL;
  283         }
  284         if (cursor->deadlk_rec) {
  285                 hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
  286                 hammer_rel_mem_record(cursor->deadlk_rec);
  287                 cursor->deadlk_rec = NULL;
  288         }
  289 
  290         cursor->data = NULL;
  291         cursor->leaf = NULL;
  292         cursor->left_bound = NULL;
  293         cursor->right_bound = NULL;
  294         cursor->trans = NULL;
  295 }
  296 
  297 /*
  298  * Upgrade cursor->node and cursor->parent to exclusive locks.  This
  299  * function can return EDEADLK.
  300  *
  301  * The lock must already be either held shared or already held exclusively
  302  * by us.
  303  *
  304  * We upgrade the parent first as it is the most likely to collide first
  305  * with the downward traversal that the frontend typically does.
  306  *
  307  * If we fail to upgrade the lock and cursor->deadlk_node is NULL, 
  308  * we add another reference to the node that failed and set
  309  * cursor->deadlk_node so hammer_done_cursor() can block on it.
  310  */
  311 int
  312 hammer_cursor_upgrade(hammer_cursor_t cursor)
  313 {
  314         int error;
  315 
  316         if (cursor->parent) {
  317                 error = hammer_lock_upgrade(&cursor->parent->lock, 1);
  318                 if (error && cursor->deadlk_node == NULL) {
  319                         cursor->deadlk_node = cursor->parent;
  320                         hammer_ref_node(cursor->deadlk_node);
  321                 }
  322         } else {
  323                 error = 0;
  324         }
  325         if (error == 0) {
  326                 error = hammer_lock_upgrade(&cursor->node->lock, 1);
  327                 if (error && cursor->deadlk_node == NULL) {
  328                         cursor->deadlk_node = cursor->node;
  329                         hammer_ref_node(cursor->deadlk_node);
  330                 }
  331         }
  332 #if 0
  333         error = hammer_lock_upgrade(&cursor->node->lock, 1);
  334         if (error && cursor->deadlk_node == NULL) {
  335                 cursor->deadlk_node = cursor->node;
  336                 hammer_ref_node(cursor->deadlk_node);
  337         } else if (error == 0 && cursor->parent) {
  338                 error = hammer_lock_upgrade(&cursor->parent->lock, 1);
  339                 if (error && cursor->deadlk_node == NULL) {
  340                         cursor->deadlk_node = cursor->parent;
  341                         hammer_ref_node(cursor->deadlk_node);
  342                 }
  343         }
  344 #endif
  345         return(error);
  346 }
  347 
  348 int
  349 hammer_cursor_upgrade_node(hammer_cursor_t cursor)
  350 {
  351         int error;
  352 
  353         error = hammer_lock_upgrade(&cursor->node->lock, 1);
  354         if (error && cursor->deadlk_node == NULL) {
  355                 cursor->deadlk_node = cursor->node;
  356                 hammer_ref_node(cursor->deadlk_node);
  357         }
  358         return(error);
  359 }
  360 
  361 /*
  362  * Downgrade cursor->node and cursor->parent to shared locks.  This
  363  * function can return EDEADLK.
  364  */
  365 void
  366 hammer_cursor_downgrade(hammer_cursor_t cursor)
  367 {
  368         if (hammer_lock_excl_owned(&cursor->node->lock, curthread))
  369                 hammer_lock_downgrade(&cursor->node->lock, 1);
  370         if (cursor->parent &&
  371             hammer_lock_excl_owned(&cursor->parent->lock, curthread)) {
  372                 hammer_lock_downgrade(&cursor->parent->lock, 1);
  373         }
  374 }
  375 
  376 /*
  377  * Upgrade and downgrade pairs of cursors.  This is used by the dedup
  378  * code which must deal with two cursors.  A special function is needed
  379  * because some of the nodes may be shared between the two cursors,
  380  * resulting in share counts > 1 which will normally cause an upgrade
  381  * to fail.
  382  */
  383 static __noinline
  384 int
  385 collect_node(hammer_node_t *array, int *counts, int n, hammer_node_t node)
  386 {
  387         int i;
  388 
  389         for (i = 0; i < n; ++i) {
  390                 if (array[i] == node)
  391                         break;
  392         }
  393         if (i == n) {
  394                 array[i] = node;
  395                 counts[i] = 1;
  396                 ++i;
  397         } else {
  398                 ++counts[i];
  399         }
  400         return(i);
  401 }
  402 
  403 int
  404 hammer_cursor_upgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
  405 {
  406         hammer_node_t nodes[4];
  407         int counts[4];
  408         int error;
  409         int i;
  410         int n;
  411 
  412         n = collect_node(nodes, counts, 0, cursor1->node);
  413         if (cursor1->parent)
  414                 n = collect_node(nodes, counts, n, cursor1->parent);
  415         n = collect_node(nodes, counts, n, cursor2->node);
  416         if (cursor2->parent)
  417                 n = collect_node(nodes, counts, n, cursor2->parent);
  418 
  419         error = 0;
  420         for (i = 0; i < n; ++i) {
  421                 error = hammer_lock_upgrade(&nodes[i]->lock, counts[i]);
  422                 if (error)
  423                         break;
  424         }
  425         if (error) {
  426                 while (--i >= 0)
  427                         hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
  428         }
  429         return (error);
  430 }
  431 
  432 void
  433 hammer_cursor_downgrade2(hammer_cursor_t cursor1, hammer_cursor_t cursor2)
  434 {
  435         hammer_node_t nodes[4];
  436         int counts[4];
  437         int i;
  438         int n;
  439 
  440         n = collect_node(nodes, counts, 0, cursor1->node);
  441         if (cursor1->parent)
  442                 n = collect_node(nodes, counts, n, cursor1->parent);
  443         n = collect_node(nodes, counts, n, cursor2->node);
  444         if (cursor2->parent)
  445                 n = collect_node(nodes, counts, n, cursor2->parent);
  446 
  447         for (i = 0; i < n; ++i)
  448                 hammer_lock_downgrade(&nodes[i]->lock, counts[i]);
  449 }
  450 
  451 /*
  452  * Seek the cursor to the specified node and index.
  453  *
  454  * The caller must ref the node prior to calling this routine and release
  455  * it after it returns.  If the seek succeeds the cursor will gain its own
  456  * ref on the node.
  457  */
  458 int
  459 hammer_cursor_seek(hammer_cursor_t cursor, hammer_node_t node, int index)
  460 {
  461         int error;
  462 
  463         hammer_cursor_downgrade(cursor);
  464         error = 0;
  465 
  466         if (cursor->node != node) {
  467                 hammer_unlock(&cursor->node->lock);
  468                 hammer_rel_node(cursor->node);
  469                 cursor->node = node;
  470                 hammer_ref_node(node);
  471                 hammer_lock_sh(&node->lock);
  472                 KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);
  473 
  474                 if (cursor->parent) {
  475                         hammer_unlock(&cursor->parent->lock);
  476                         hammer_rel_node(cursor->parent);
  477                         cursor->parent = NULL;
  478                         cursor->parent_index = 0;
  479                 }
  480                 error = hammer_load_cursor_parent(cursor, 0);
  481         }
  482         cursor->index = index;
  483         return (error);
  484 }
  485 
  486 /*
  487  * Load the parent of cursor->node into cursor->parent.
  488  */
  489 static
  490 int
  491 hammer_load_cursor_parent(hammer_cursor_t cursor, int try_exclusive)
  492 {
  493         hammer_mount_t hmp;
  494         hammer_node_t parent;
  495         hammer_node_t node;
  496         hammer_btree_elm_t elm;
  497         int error;
  498         int parent_index;
  499 
  500         hmp = cursor->trans->hmp;
  501 
  502         if (cursor->node->ondisk->parent) {
  503                 node = cursor->node;
  504                 parent = hammer_btree_get_parent(cursor->trans, node,
  505                                                  &parent_index,
  506                                                  &error, try_exclusive);
  507                 if (error == 0) {
  508                         elm = &parent->ondisk->elms[parent_index];
  509                         cursor->parent = parent;
  510                         cursor->parent_index = parent_index;
  511                         cursor->left_bound = &elm[0].internal.base;
  512                         cursor->right_bound = &elm[1].internal.base;
  513                 }
  514         } else {
  515                 cursor->parent = NULL;
  516                 cursor->parent_index = 0;
  517                 cursor->left_bound = &hmp->root_btree_beg;
  518                 cursor->right_bound = &hmp->root_btree_end;
  519                 error = 0;
  520         }
  521         return(error);
  522 }
  523 
  524 /*
  525  * Cursor up to our parent node.  Return ENOENT if we are at the root of
  526  * the filesystem.
  527  */
  528 int
  529 hammer_cursor_up(hammer_cursor_t cursor)
  530 {
  531         int error;
  532 
  533         hammer_cursor_downgrade(cursor);
  534 
  535         /*
  536          * If the parent is NULL we are at the root of the B-Tree and
  537          * return ENOENT.
  538          */
  539         if (cursor->parent == NULL)
  540                 return (ENOENT);
  541 
  542         /*
  543          * Set the node to its parent. 
  544          */
  545         hammer_unlock(&cursor->node->lock);
  546         hammer_rel_node(cursor->node);
  547         cursor->node = cursor->parent;
  548         cursor->index = cursor->parent_index;
  549         cursor->parent = NULL;
  550         cursor->parent_index = 0;
  551 
  552         error = hammer_load_cursor_parent(cursor, 0);
  553         return(error);
  554 }
  555 
  556 /*
  557  * Special cursor up given a locked cursor.  The orignal node is not
  558  * unlocked or released and the cursor is not downgraded.
  559  *
  560  * This function can fail with EDEADLK.
  561  *
  562  * This function is only run when recursively deleting parent nodes
  563  * to get rid of an empty leaf.
  564  */
  565 int
  566 hammer_cursor_up_locked(hammer_cursor_t cursor)
  567 {
  568         hammer_node_t save;
  569         int error;
  570         int save_index;
  571 
  572         /*
  573          * If the parent is NULL we are at the root of the B-Tree and
  574          * return ENOENT.
  575          */
  576         if (cursor->parent == NULL)
  577                 return (ENOENT);
  578 
  579         save = cursor->node;
  580         save_index = cursor->index;
  581 
  582         /*
  583          * Set the node to its parent. 
  584          */
  585         cursor->node = cursor->parent;
  586         cursor->index = cursor->parent_index;
  587         cursor->parent = NULL;
  588         cursor->parent_index = 0;
  589 
  590         /*
  591          * load the new parent, attempt to exclusively lock it.  Note that
  592          * we are still holding the old parent (now cursor->node) exclusively
  593          * locked.
  594          *
  595          * This can return EDEADLK.  Undo the operation on any error.  These
  596          * up sequences can occur during iterations so be sure to restore
  597          * the index.
  598          */
  599         error = hammer_load_cursor_parent(cursor, 1);
  600         if (error) {
  601                 cursor->parent = cursor->node;
  602                 cursor->parent_index = cursor->index;
  603                 cursor->node = save;
  604                 cursor->index = save_index;
  605         }
  606         return(error);
  607 }
  608 
  609 
  610 /*
  611  * Cursor down through the current node, which must be an internal node.
  612  *
  613  * This routine adjusts the cursor and sets index to 0.
  614  */
  615 int
  616 hammer_cursor_down(hammer_cursor_t cursor)
  617 {
  618         hammer_node_t node;
  619         hammer_btree_elm_t elm;
  620         int error;
  621 
  622         /*
  623          * The current node becomes the current parent
  624          */
  625         hammer_cursor_downgrade(cursor);
  626         node = cursor->node;
  627         KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count);
  628         if (cursor->parent) {
  629                 hammer_unlock(&cursor->parent->lock);
  630                 hammer_rel_node(cursor->parent);
  631         }
  632         cursor->parent = node;
  633         cursor->parent_index = cursor->index;
  634         cursor->node = NULL;
  635         cursor->index = 0;
  636 
  637         /*
  638          * Extract element to push into at (node,index), set bounds.
  639          */
  640         elm = &node->ondisk->elms[cursor->parent_index];
  641 
  642         /*
  643          * Ok, push down into elm.  If elm specifies an internal or leaf
  644          * node the current node must be an internal node.  If elm specifies
  645          * a spike then the current node must be a leaf node.
  646          */
  647         switch(elm->base.btype) {
  648         case HAMMER_BTREE_TYPE_INTERNAL:
  649         case HAMMER_BTREE_TYPE_LEAF:
  650                 KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
  651                 KKASSERT(elm->internal.subtree_offset != 0);
  652                 cursor->left_bound = &elm[0].internal.base;
  653                 cursor->right_bound = &elm[1].internal.base;
  654                 node = hammer_get_node(cursor->trans,
  655                                        elm->internal.subtree_offset, 0, &error);
  656                 if (error == 0) {
  657                         KASSERT(elm->base.btype == node->ondisk->type, ("BTYPE MISMATCH %c %c NODE %p", elm->base.btype, node->ondisk->type, node));
  658                         if (node->ondisk->parent != cursor->parent->node_offset)
  659                                 panic("node %p %016llx vs %016llx", node, (long long)node->ondisk->parent, (long long)cursor->parent->node_offset);
  660                         KKASSERT(node->ondisk->parent == cursor->parent->node_offset);
  661                 }
  662                 break;
  663         default:
  664                 panic("hammer_cursor_down: illegal btype %02x (%c)",
  665                       elm->base.btype,
  666                       (elm->base.btype ? elm->base.btype : '?'));
  667                 break;
  668         }
  669 
  670         /*
  671          * If no error occured we can lock the new child node.  If the
  672          * node is deadlock flagged wait up to hammer_tdmux_ticks (1 second)
  673          * for the deadlock to clear.  Otherwise a large number of concurrent
  674          * readers can continuously stall the flusher.
  675          *
  676          * We specifically do this in the cursor_down() code in order to
  677          * deal with frontend top-down searches smashing against bottom-up
  678          * flusher-based mirror updates.  These collisions typically occur
  679          * above the inode in the B-Tree and are not covered by the
  680          * ip->cursor_exclreq_count logic.
  681          */
  682         if (error == 0) {
  683 #if 0
  684                 if (node->cursor_exclreq_count &&
  685                     cursor->trans->type != HAMMER_TRANS_FLS) {
  686                         tsleep(&node->cursor_exclreq_count, 0,
  687                                "hstag2", hammer_tdmux_ticks);
  688                 }
  689 #endif
  690                 hammer_lock_sh(&node->lock);
  691                 KKASSERT ((node->flags & HAMMER_NODE_DELETED) == 0);
  692                 cursor->node = node;
  693                 cursor->index = 0;
  694         }
  695         return(error);
  696 }
  697 
  698 /************************************************************************
  699  *                              DEADLOCK RECOVERY                       *
  700  ************************************************************************
  701  *
  702  * These are the new deadlock recovery functions.  Currently they are only
  703  * used for the mirror propagation and physical node removal cases but
  704  * ultimately the intention is to use them for all deadlock recovery
  705  * operations.
  706  *
  707  * WARNING!  The contents of the cursor may be modified while unlocked.
  708  *           passive modifications including adjusting the node, parent,
  709  *           indexes, and leaf pointer.
  710  *
  711  *           An outright removal of the element the cursor was pointing at
  712  *           will cause the HAMMER_CURSOR_TRACKED_RIPOUT flag to be set,
  713  *           which chains to causing the HAMMER_CURSOR_RETEST to be set
  714  *           when the cursor is locked again.
  715  */
  716 void
  717 hammer_unlock_cursor(hammer_cursor_t cursor)
  718 {
  719         hammer_node_t node;
  720 
  721         KKASSERT((cursor->flags & HAMMER_CURSOR_TRACKED) == 0);
  722         KKASSERT(cursor->node);
  723 
  724         /*
  725          * Release the cursor's locks and track B-Tree operations on node.
  726          * While being tracked our cursor can be modified by other threads
  727          * and the node may be replaced.
  728          */
  729         if (cursor->parent) {
  730                 hammer_unlock(&cursor->parent->lock);
  731                 hammer_rel_node(cursor->parent);
  732                 cursor->parent = NULL;
  733         }
  734         node = cursor->node;
  735         cursor->flags |= HAMMER_CURSOR_TRACKED;
  736         TAILQ_INSERT_TAIL(&node->cursor_list, cursor, deadlk_entry);
  737         hammer_unlock(&node->lock);
  738 }
  739 
  740 /*
  741  * Get the cursor heated up again.  The cursor's node may have
  742  * changed and we might have to locate the new parent.
  743  *
  744  * If the exact element we were on got deleted RIPOUT will be
  745  * set and we must clear ATEDISK so an iteration does not skip
  746  * the element after it.
  747  */
  748 int
  749 hammer_lock_cursor(hammer_cursor_t cursor)
  750 {
  751         hammer_node_t node;
  752         int error;
  753 
  754         KKASSERT(cursor->flags & HAMMER_CURSOR_TRACKED);
  755 
  756         /*
  757          * Relock the node
  758          */
  759         for (;;) {
  760                 node = cursor->node;
  761                 hammer_ref_node(node);
  762                 hammer_lock_sh(&node->lock);
  763                 if (cursor->node == node) {
  764                         hammer_rel_node(node);
  765                         break;
  766                 }
  767                 hammer_unlock(&node->lock);
  768                 hammer_rel_node(node);
  769         }
  770 
  771         /*
  772          * Untrack the cursor, clean up, and re-establish the parent node.
  773          */
  774         TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
  775         cursor->flags &= ~HAMMER_CURSOR_TRACKED;
  776 
  777         /*
  778          * If a ripout has occured iterations must re-test the (new)
  779          * current element.  Clearing ATEDISK prevents the element from
  780          * being skipped and RETEST causes it to be re-tested.
  781          */
  782         if (cursor->flags & HAMMER_CURSOR_TRACKED_RIPOUT) {
  783                 cursor->flags &= ~HAMMER_CURSOR_TRACKED_RIPOUT;
  784                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
  785                 cursor->flags |= HAMMER_CURSOR_RETEST;
  786         }
  787         error = hammer_load_cursor_parent(cursor, 0);
  788         return(error);
  789 }
  790 
  791 /*
  792  * Recover from a deadlocked cursor, tracking any node removals or
  793  * replacements.  If the cursor's current node is removed by another
  794  * thread (via btree_remove()) the cursor will be seeked upwards.
  795  *
  796  * The caller is working a modifying operation and must be holding the
  797  * sync lock (shared).  We do not release the sync lock because this
  798  * would break atomicy.
  799  */
  800 int
  801 hammer_recover_cursor(hammer_cursor_t cursor)
  802 {
  803         hammer_transaction_t trans __debugvar;
  804 #if 0
  805         hammer_inode_t ip;
  806 #endif
  807         int error;
  808 
  809         hammer_unlock_cursor(cursor);
  810 
  811 #if 0
  812         ip = cursor->ip;
  813 #endif
  814         trans = cursor->trans;
  815         KKASSERT(trans->sync_lock_refs > 0);
  816 
  817         /*
  818          * Wait for the deadlock to clear.
  819          *
  820          * Maintain exclreq_count / wakeup as necessary to notify new
  821          * entrants into ip.  We continue to hold the fs_token so our
  822          * EDEADLK retry loop should get its chance before another thread
  823          * steals the lock.
  824          */
  825         if (cursor->deadlk_node) {
  826 #if 0
  827                 if (ip && trans->type == HAMMER_TRANS_FLS)
  828                         ++ip->cursor_exclreq_count;
  829                 ++cursor->deadlk_node->cursor_exclreq_count;
  830 #endif
  831                 hammer_lock_ex_ident(&cursor->deadlk_node->lock, "hmrdlk");
  832                 hammer_unlock(&cursor->deadlk_node->lock);
  833 #if 0
  834                 if (--cursor->deadlk_node->cursor_exclreq_count == 0)
  835                         wakeup(&cursor->deadlk_node->cursor_exclreq_count);
  836                 if (ip && trans->type == HAMMER_TRANS_FLS) {
  837                         if (--ip->cursor_exclreq_count == 0)
  838                                 wakeup(&ip->cursor_exclreq_count);
  839                 }
  840 #endif
  841                 hammer_rel_node(cursor->deadlk_node);
  842                 cursor->deadlk_node = NULL;
  843         }
  844         if (cursor->deadlk_rec) {
  845                 hammer_wait_mem_record_ident(cursor->deadlk_rec, "hmmdlr");
  846                 hammer_rel_mem_record(cursor->deadlk_rec);
  847                 cursor->deadlk_rec = NULL;
  848         }
  849         error = hammer_lock_cursor(cursor);
  850         return(error);
  851 }
  852 
  853 /*
  854  * Dup ocursor to ncursor.  ncursor inherits ocursor's locks and ocursor
  855  * is effectively unlocked and becomes tracked.  If ocursor was not locked
  856  * then ncursor also inherits the tracking.
  857  *
  858  * After the caller finishes working with ncursor it must be cleaned up
  859  * with hammer_done_cursor(), and the caller must re-lock ocursor.
  860  */
  861 hammer_cursor_t
  862 hammer_push_cursor(hammer_cursor_t ocursor)
  863 {
  864         hammer_cursor_t ncursor;
  865         hammer_inode_t ip;
  866         hammer_node_t node;
  867         hammer_mount_t hmp;
  868 
  869         hmp = ocursor->trans->hmp;
  870         ncursor = kmalloc(sizeof(*ncursor), hmp->m_misc, M_WAITOK | M_ZERO);
  871         bcopy(ocursor, ncursor, sizeof(*ocursor));
  872 
  873         node = ocursor->node;
  874         hammer_ref_node(node);
  875         if ((ocursor->flags & HAMMER_CURSOR_TRACKED) == 0) {
  876                 ocursor->flags |= HAMMER_CURSOR_TRACKED;
  877                 TAILQ_INSERT_TAIL(&node->cursor_list, ocursor, deadlk_entry);
  878         }
  879         if (ncursor->parent)
  880                 ocursor->parent = NULL;
  881         ocursor->data_buffer = NULL;
  882         ocursor->leaf = NULL;
  883         ocursor->data = NULL;
  884         if (ncursor->flags & HAMMER_CURSOR_TRACKED)
  885                 TAILQ_INSERT_TAIL(&node->cursor_list, ncursor, deadlk_entry);
  886         if ((ip = ncursor->ip) != NULL) {
  887                 ++ip->cursor_ip_refs;
  888         }
  889         if (ncursor->iprec)
  890                 hammer_ref(&ncursor->iprec->lock);
  891         return(ncursor);
  892 }
  893 
  894 /*
  895  * Destroy ncursor and restore ocursor
  896  *
  897  * This is a temporary hack for the release.  We can't afford to lose
  898  * the IP lock until the IP object scan code is able to deal with it,
  899  * so have ocursor inherit it back.
  900  */
  901 void
  902 hammer_pop_cursor(hammer_cursor_t ocursor, hammer_cursor_t ncursor)
  903 {
  904         hammer_mount_t hmp;
  905         hammer_inode_t ip;
  906 
  907         hmp = ncursor->trans->hmp;
  908         ip = ncursor->ip;
  909         ncursor->ip = NULL;
  910         if (ip)
  911                 --ip->cursor_ip_refs;
  912         hammer_done_cursor(ncursor);
  913         kfree(ncursor, hmp->m_misc);
  914         KKASSERT(ocursor->ip == ip);
  915         hammer_lock_cursor(ocursor);
  916 }
  917 
  918 /*
  919  * onode is being replaced by nnode by the reblocking code.
  920  */
  921 void
  922 hammer_cursor_replaced_node(hammer_node_t onode, hammer_node_t nnode)
  923 {
  924         hammer_cursor_t cursor;
  925         hammer_node_ondisk_t ondisk;
  926         hammer_node_ondisk_t nndisk;
  927 
  928         ondisk = onode->ondisk;
  929         nndisk = nnode->ondisk;
  930 
  931         while ((cursor = TAILQ_FIRST(&onode->cursor_list)) != NULL) {
  932                 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
  933                 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
  934                 KKASSERT(cursor->node == onode);
  935                 if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
  936                         cursor->leaf = &nndisk->elms[cursor->index].leaf;
  937                 cursor->node = nnode;
  938                 hammer_ref_node(nnode);
  939                 hammer_rel_node(onode);
  940         }
  941 }
  942 
  943 /*
  944  * We have removed <node> from the parent and collapsed the parent.
  945  *
  946  * Cursors in deadlock recovery are seeked upward to the parent so the
  947  * btree_remove() recursion works properly even though we have marked
  948  * the cursor as requiring a reseek.
  949  *
  950  * This is the only cursor function which sets HAMMER_CURSOR_ITERATE_CHECK,
  951  * meaning the cursor is no longer definitively pointing at an element
  952  * within its iteration (if the cursor is being used to iterate).  The
  953  * iteration code will take this into account instead of asserting if the
  954  * cursor is outside the iteration range.
  955  */
  956 void
  957 hammer_cursor_removed_node(hammer_node_t node, hammer_node_t parent, int index)
  958 {
  959         hammer_cursor_t cursor;
  960         hammer_node_ondisk_t ondisk;
  961 
  962         KKASSERT(parent != NULL);
  963         ondisk = node->ondisk;
  964 
  965         while ((cursor = TAILQ_FIRST(&node->cursor_list)) != NULL) {
  966                 KKASSERT(cursor->node == node);
  967                 KKASSERT(cursor->index == 0);
  968                 TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
  969                 TAILQ_INSERT_TAIL(&parent->cursor_list, cursor, deadlk_entry);
  970                 if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
  971                         cursor->leaf = NULL;
  972                 cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
  973                 cursor->flags |= HAMMER_CURSOR_ITERATE_CHECK;
  974                 cursor->node = parent;
  975                 cursor->index = index;
  976                 hammer_ref_node(parent);
  977                 hammer_rel_node(node);
  978         }
  979 }
  980 
  981 /*
  982  * node was split at (onode, index) with elements >= index moved to nnode.
  983  */
  984 void
  985 hammer_cursor_split_node(hammer_node_t onode, hammer_node_t nnode, int index)
  986 {
  987         hammer_cursor_t cursor;
  988         hammer_node_ondisk_t ondisk;
  989         hammer_node_ondisk_t nndisk;
  990 
  991         ondisk = onode->ondisk;
  992         nndisk = nnode->ondisk;
  993 
  994 again:
  995         TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
  996                 KKASSERT(cursor->node == onode);
  997                 if (cursor->index < index)
  998                         continue;
  999                 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
 1000                 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
 1001                 if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
 1002                         cursor->leaf = &nndisk->elms[cursor->index - index].leaf;
 1003                 cursor->node = nnode;
 1004                 cursor->index -= index;
 1005                 hammer_ref_node(nnode);
 1006                 hammer_rel_node(onode);
 1007                 goto again;
 1008         }
 1009 }
 1010 
 1011 /*
 1012  * An element was moved from one node to another or within a node.  The
 1013  * index may also represent the end of the node (index == numelements).
 1014  *
 1015  * {oparent,pindex} is the parent node's pointer to onode/oindex.
 1016  *
 1017  * This is used by the rebalancing code.  This is not an insertion or
 1018  * deletion and any additional elements, including the degenerate case at
 1019  * the end of the node, will be dealt with by additional distinct calls.
 1020  */
 1021 void
 1022 hammer_cursor_moved_element(hammer_node_t oparent, int pindex,
 1023                             hammer_node_t onode, int oindex,
 1024                             hammer_node_t nnode, int nindex)
 1025 {
 1026         hammer_cursor_t cursor;
 1027         hammer_node_ondisk_t ondisk;
 1028         hammer_node_ondisk_t nndisk;
 1029 
 1030         /*
 1031          * Adjust any cursors pointing at the element
 1032          */
 1033         ondisk = onode->ondisk;
 1034         nndisk = nnode->ondisk;
 1035 again1:
 1036         TAILQ_FOREACH(cursor, &onode->cursor_list, deadlk_entry) {
 1037                 KKASSERT(cursor->node == onode);
 1038                 if (cursor->index != oindex)
 1039                         continue;
 1040                 TAILQ_REMOVE(&onode->cursor_list, cursor, deadlk_entry);
 1041                 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
 1042                 if (cursor->leaf == &ondisk->elms[oindex].leaf)
 1043                         cursor->leaf = &nndisk->elms[nindex].leaf;
 1044                 cursor->node = nnode;
 1045                 cursor->index = nindex;
 1046                 hammer_ref_node(nnode);
 1047                 hammer_rel_node(onode);
 1048                 goto again1;
 1049         }
 1050 
 1051         /*
 1052          * When moving the first element of onode to a different node any
 1053          * cursor which is pointing at (oparent,pindex) must be repointed
 1054          * to nnode and ATEDISK must be cleared.
 1055          *
 1056          * This prevents cursors from losing track due to insertions.
 1057          * Insertions temporarily release the cursor in order to update
 1058          * the mirror_tids.  It primarily effects the mirror_write code.
 1059          * The other code paths generally only do a single insertion and
 1060          * then relookup or drop the cursor.
 1061          */
 1062         if (onode == nnode || oindex)
 1063                 return;
 1064         ondisk = oparent->ondisk;
 1065 again2:
 1066         TAILQ_FOREACH(cursor, &oparent->cursor_list, deadlk_entry) {
 1067                 KKASSERT(cursor->node == oparent);
 1068                 if (cursor->index != pindex)
 1069                         continue;
 1070                 kprintf("HAMMER debug: shifted cursor pointing at parent\n"
 1071                         "parent %016jx:%d onode %016jx:%d nnode %016jx:%d\n",
 1072                         (intmax_t)oparent->node_offset, pindex,
 1073                         (intmax_t)onode->node_offset, oindex,
 1074                         (intmax_t)nnode->node_offset, nindex);
 1075                 TAILQ_REMOVE(&oparent->cursor_list, cursor, deadlk_entry);
 1076                 TAILQ_INSERT_TAIL(&nnode->cursor_list, cursor, deadlk_entry);
 1077                 if (cursor->leaf == &ondisk->elms[oindex].leaf)
 1078                         cursor->leaf = &nndisk->elms[nindex].leaf;
 1079                 cursor->node = nnode;
 1080                 cursor->index = nindex;
 1081                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
 1082                 hammer_ref_node(nnode);
 1083                 hammer_rel_node(oparent);
 1084                 goto again2;
 1085         }
 1086 }
 1087 
 1088 /*
 1089  * The B-Tree element pointing to the specified node was moved from (oparent)
 1090  * to (nparent, nindex).  We must locate any tracked cursors pointing at
 1091  * node and adjust their parent accordingly.
 1092  *
 1093  * This is used by the rebalancing code when packing elements causes an
 1094  * element to shift from one node to another.
 1095  */
 1096 void
 1097 hammer_cursor_parent_changed(hammer_node_t node, hammer_node_t oparent,
 1098                              hammer_node_t nparent, int nindex)
 1099 {
 1100         hammer_cursor_t cursor;
 1101 
 1102 again:
 1103         TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
 1104                 KKASSERT(cursor->node == node);
 1105                 if (cursor->parent == oparent) {
 1106                         cursor->parent = nparent;
 1107                         cursor->parent_index = nindex;
 1108                         hammer_ref_node(nparent);
 1109                         hammer_rel_node(oparent);
 1110                         goto again;
 1111                 }
 1112         }
 1113 }
 1114 
 1115 /*
 1116  * Deleted element at (node, index)
 1117  *
 1118  * Shift indexes >= index
 1119  */
 1120 void
 1121 hammer_cursor_deleted_element(hammer_node_t node, int index)
 1122 {
 1123         hammer_cursor_t cursor;
 1124         hammer_node_ondisk_t ondisk;
 1125 
 1126         ondisk = node->ondisk;
 1127 
 1128         TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
 1129                 KKASSERT(cursor->node == node);
 1130                 if (cursor->index == index) {
 1131                         cursor->flags |= HAMMER_CURSOR_TRACKED_RIPOUT;
 1132                         if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
 1133                                 cursor->leaf = NULL;
 1134                 } else if (cursor->index > index) {
 1135                         if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
 1136                                 cursor->leaf = &ondisk->elms[cursor->index - 1].leaf;
 1137                         --cursor->index;
 1138                 }
 1139         }
 1140 }
 1141 
 1142 /*
 1143  * Inserted element at (node, index)
 1144  *
 1145  * Shift indexes >= index
 1146  */
 1147 void
 1148 hammer_cursor_inserted_element(hammer_node_t node, int index)
 1149 {
 1150         hammer_cursor_t cursor;
 1151         hammer_node_ondisk_t ondisk;
 1152 
 1153         ondisk = node->ondisk;
 1154 
 1155         TAILQ_FOREACH(cursor, &node->cursor_list, deadlk_entry) {
 1156                 KKASSERT(cursor->node == node);
 1157                 if (cursor->index >= index) {
 1158                         if (cursor->leaf == &ondisk->elms[cursor->index].leaf)
 1159                                 cursor->leaf = &ondisk->elms[cursor->index + 1].leaf;
 1160                         ++cursor->index;
 1161                 }
 1162         }
 1163 }
 1164 
 1165 /*
 1166  * Invalidate the cached data buffer associated with a cursor.
 1167  *
 1168  * This needs to be done when the underlying block is being freed or
 1169  * the referenced buffer can prevent the related buffer cache buffer
 1170  * from being properly invalidated.
 1171  */
 1172 void
 1173 hammer_cursor_invalidate_cache(hammer_cursor_t cursor)
 1174 {
 1175         if (cursor->data_buffer) {
 1176                 hammer_rel_buffer(cursor->data_buffer, 0);
 1177                 cursor->data_buffer = NULL;
 1178                 cursor->data = NULL;
 1179         }
 1180 }
 1181
Cache object: 17fa8ced949985cd17a1aaf6c32aaba1
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/vfs/hammer/hammer_cursor.c

FreeBSD/Linux Kernel Cross Reference
sys/vfs/hammer/hammer_cursor.c