FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/dsl_deadlist.c


/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>

/*
 * Deadlist concurrency:
 *
 * Deadlists can only be modified from the syncing thread.
 *
 * Except for dsl_deadlist_insert(), it can only be modified with the
 * dp_config_rwlock held with RW_WRITER.
 *
 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
 * be called concurrently, from open context, with the dp_config_rwlock held
 * with RW_READER.
 *
 * Therefore, we only need to provide locking between dsl_deadlist_insert() and
 * the accessors, protecting:
 *     dl_phys->dl_used,comp,uncomp
 *     and protecting the dl_tree from being loaded.
 * The locking is provided by dl_lock.  Note that the bpobj_t
 * provides its own locking, and dl_oldfmt is immutable.
 */
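
/*
 * For illustration, a minimal sketch of the open-context accessor
 * pattern described above (dp and dl are assumed to name an
 * already-open dsl_pool_t and dsl_deadlist_t):
 *
 *        uint64_t used, comp, uncomp;
 *        dsl_pool_config_enter(dp, FTAG);
 *        dsl_deadlist_space(dl, &used, &comp, &uncomp);
 *        dsl_pool_config_exit(dp, FTAG);
 */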

/*
 * Livelist Overview
 * ================
 *
 * Livelists use the same 'deadlist_t' struct as deadlists and are also used
 * to track blkptrs over the lifetime of a dataset. Livelists, however, belong
 * to clones and track the blkptrs that are clone-specific (were born after
 * the clone's creation). The exception is embedded block pointers, which are
 * not included in livelists because they do not need to be freed.
 *
 * When it comes time to delete the clone, the livelist provides a quick
 * reference as to what needs to be freed. For this reason, livelists also track
 * when clone-specific blkptrs are freed before deletion to prevent double
 * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the
 * deletion algorithm iterates backwards over the livelist, matching
 * FREE/ALLOC pairs and then freeing those ALLOCs which remain. Livelists
 * are also updated when blkptrs are remapped: the old version
 * of the blkptr is cancelled out with a FREE and the new version is tracked
 * with an ALLOC.
 *
 * To bound the amount of memory required for deletion, livelists over a
 * certain size are spread over multiple entries. Entries are grouped by
 * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
 * be in the same entry. This allows us to delete livelists incrementally
 * over multiple syncs, one entry at a time.
 *
 * During the lifetime of the clone, livelists can get extremely large.
 * Their size is managed by periodic condensing (preemptively cancelling out
 * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
 * the shared space between the clone and its origin is so small that it
 * doesn't make sense to use livelists anymore.
 */
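
/*
 * A minimal worked example of the matching scheme above: if a livelist
 * records, in birth order, ALLOC(A), ALLOC(B), FREE(A), then iterating
 * backwards matches FREE(A) against ALLOC(A) and cancels both, leaving
 * ALLOC(B) as the only blkptr that must actually be freed when the
 * clone is deleted.
 */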

/*
 * The threshold sublist size at which we create a new sub-livelist for the
 * next txg. However, since blkptrs of the same transaction group must be in
 * the same sub-list, the actual sublist size may exceed this. When picking the
 * size we had to balance the fact that larger sublists mean fewer sublists
 * (decreasing the cost of insertion) against the consideration that sublists
 * will be loaded into memory and shouldn't take up an inordinate amount of
 * space. We settled on ~500000 entries, corresponding to roughly 128M.
 */
uint64_t zfs_livelist_max_entries = 500000;
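
/*
 * (For scale: the "roughly 128M" above is consistent with each tracked
 * entry costing on the order of 256 bytes of memory, since
 * 500000 entries * ~256 bytes ~= 128M; the per-entry cost here is an
 * illustrative assumption, not a measured figure.)
 */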

/*
 * We can approximate how much of a performance gain a livelist will give us
 * based on the percentage of blocks shared between the clone and its origin.
 * 0 percent shared means that the clone has completely diverged and that the
 * old method is maximally effective: every read from the block tree will
 * result in lots of frees. Livelists give us gains when they track blocks
 * scattered across the tree, when one read in the old method might only
 * result in a few frees. Once the clone has been overwritten enough,
 * writes are no longer sparse and we'll no longer get much of a benefit from
 * tracking them with a livelist. We chose a lower limit of 75 percent shared
 * (25 percent overwritten). This means that 1/4 of all block pointers will be
 * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists
 * to make deletion 4x faster. Once the amount of shared space drops below this
 * threshold, the clone will revert to the old deletion method.
 */
int zfs_livelist_min_percent_shared = 75;

static int
dsl_deadlist_compare(const void *arg1, const void *arg2)
{
        const dsl_deadlist_entry_t *dle1 = arg1;
        const dsl_deadlist_entry_t *dle2 = arg2;

        return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
}

static int
dsl_deadlist_cache_compare(const void *arg1, const void *arg2)
{
        const dsl_deadlist_cache_entry_t *dlce1 = arg1;
        const dsl_deadlist_cache_entry_t *dlce2 = arg2;

        return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
}

static void
dsl_deadlist_load_tree(dsl_deadlist_t *dl)
{
        zap_cursor_t zc;
        zap_attribute_t za;
        int error;

        ASSERT(MUTEX_HELD(&dl->dl_lock));

        ASSERT(!dl->dl_oldfmt);
        if (dl->dl_havecache) {
                /*
                 * After loading the tree, the caller may modify the tree,
                 * e.g. to add or remove nodes, or to make a node no longer
                 * refer to the empty_bpobj.  These changes would make the
                 * dl_cache incorrect.  Therefore we discard the cache here,
                 * so that it can't become incorrect.
                 */
                dsl_deadlist_cache_entry_t *dlce;
                void *cookie = NULL;
                while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
                    != NULL) {
                        kmem_free(dlce, sizeof (*dlce));
                }
                avl_destroy(&dl->dl_cache);
                dl->dl_havecache = B_FALSE;
        }
        if (dl->dl_havetree)
                return;

        avl_create(&dl->dl_tree, dsl_deadlist_compare,
            sizeof (dsl_deadlist_entry_t),
            offsetof(dsl_deadlist_entry_t, dle_node));
        for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
            (error = zap_cursor_retrieve(&zc, &za)) == 0;
            zap_cursor_advance(&zc)) {
                dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
                dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);

                /*
                 * Prefetch all the bpobj's so that we do that i/o
                 * in parallel.  Then open them all in a second pass.
                 */
                dle->dle_bpobj.bpo_object = za.za_first_integer;
                dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object,
                    0, 0, 0, ZIO_PRIORITY_SYNC_READ);

                avl_add(&dl->dl_tree, dle);
        }
        VERIFY3U(error, ==, ENOENT);
        zap_cursor_fini(&zc);

        for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);
            dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) {
                VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,
                    dle->dle_bpobj.bpo_object));
        }
        dl->dl_havetree = B_TRUE;
}

/*
 * Load only the non-empty bpobj's into the dl_cache.  The cache is an analog
 * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It
 * is used only for gathering space statistics.  The dl_cache has two
 * advantages over the dl_tree:
 *
 * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's
 * mostly empty_bpobj's), because we avoid the CPU overhead of opening the
 * empty_bpobj many times and of querying its (zero) space stats many times.
 *
 * 2. The dl_cache uses less memory than the dl_tree.  We only need to load
 * the dl_tree of snapshots when deleting a snapshot, after which we free the
 * dl_tree with dsl_deadlist_discard_tree().
 */
static void
dsl_deadlist_load_cache(dsl_deadlist_t *dl)
{
        zap_cursor_t zc;
        zap_attribute_t za;
        int error;

        ASSERT(MUTEX_HELD(&dl->dl_lock));

        ASSERT(!dl->dl_oldfmt);
        if (dl->dl_havecache)
                return;

        uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;

        avl_create(&dl->dl_cache, dsl_deadlist_cache_compare,
            sizeof (dsl_deadlist_cache_entry_t),
            offsetof(dsl_deadlist_cache_entry_t, dlce_node));
        for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
            (error = zap_cursor_retrieve(&zc, &za)) == 0;
            zap_cursor_advance(&zc)) {
                if (za.za_first_integer == empty_bpobj)
                        continue;
                dsl_deadlist_cache_entry_t *dlce =
                    kmem_zalloc(sizeof (*dlce), KM_SLEEP);
                dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL);

                /*
                 * Prefetch all the bpobj's so that we do that i/o
                 * in parallel.  Then open them all in a second pass.
                 */
                dlce->dlce_bpobj = za.za_first_integer;
                dmu_prefetch(dl->dl_os, dlce->dlce_bpobj,
                    0, 0, 0, ZIO_PRIORITY_SYNC_READ);
                avl_add(&dl->dl_cache, dlce);
        }
        VERIFY3U(error, ==, ENOENT);
        zap_cursor_fini(&zc);

        for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache);
            dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
                bpobj_t bpo;
                VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj));

                VERIFY0(bpobj_space(&bpo,
                    &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp));
                bpobj_close(&bpo);
        }
        dl->dl_havecache = B_TRUE;
}

/*
 * Discard the tree to save memory.
 */
void
dsl_deadlist_discard_tree(dsl_deadlist_t *dl)
{
        mutex_enter(&dl->dl_lock);

        if (!dl->dl_havetree) {
                mutex_exit(&dl->dl_lock);
                return;
        }
        dsl_deadlist_entry_t *dle;
        void *cookie = NULL;
        while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) {
                bpobj_close(&dle->dle_bpobj);
                kmem_free(dle, sizeof (*dle));
        }
        avl_destroy(&dl->dl_tree);

        dl->dl_havetree = B_FALSE;
        mutex_exit(&dl->dl_lock);
}

void
dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
{
        dsl_deadlist_entry_t *dle;

        ASSERT(dsl_deadlist_is_open(dl));

        mutex_enter(&dl->dl_lock);
        dsl_deadlist_load_tree(dl);
        mutex_exit(&dl->dl_lock);
        for (dle = avl_first(&dl->dl_tree); dle != NULL;
            dle = AVL_NEXT(&dl->dl_tree, dle)) {
                if (func(args, dle) != 0)
                        break;
        }
}
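
/*
 * For illustration, a hypothetical caller could count entries with a
 * callback like this (sketch; a nonzero return stops the walk):
 *
 *        static int
 *        count_dle_cb(void *arg, dsl_deadlist_entry_t *dle)
 *        {
 *                (*(uint64_t *)arg)++;
 *                return (0);
 *        }
 *
 *        uint64_t count = 0;
 *        dsl_deadlist_iterate(dl, count_dle_cb, &count);
 */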

void
dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
{
        dmu_object_info_t doi;

        ASSERT(!dsl_deadlist_is_open(dl));

        mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
        dl->dl_os = os;
        dl->dl_object = object;
        VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
        dmu_object_info_from_db(dl->dl_dbuf, &doi);
        if (doi.doi_type == DMU_OT_BPOBJ) {
                dmu_buf_rele(dl->dl_dbuf, dl);
                dl->dl_dbuf = NULL;
                dl->dl_oldfmt = B_TRUE;
                VERIFY0(bpobj_open(&dl->dl_bpobj, os, object));
                return;
        }

        dl->dl_oldfmt = B_FALSE;
        dl->dl_phys = dl->dl_dbuf->db_data;
        dl->dl_havetree = B_FALSE;
        dl->dl_havecache = B_FALSE;
}

boolean_t
dsl_deadlist_is_open(dsl_deadlist_t *dl)
{
        return (dl->dl_os != NULL);
}

void
dsl_deadlist_close(dsl_deadlist_t *dl)
{
        ASSERT(dsl_deadlist_is_open(dl));
        mutex_destroy(&dl->dl_lock);

        if (dl->dl_oldfmt) {
                dl->dl_oldfmt = B_FALSE;
                bpobj_close(&dl->dl_bpobj);
                dl->dl_os = NULL;
                dl->dl_object = 0;
                return;
        }

        if (dl->dl_havetree) {
                dsl_deadlist_entry_t *dle;
                void *cookie = NULL;
                while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
                    != NULL) {
                        bpobj_close(&dle->dle_bpobj);
                        kmem_free(dle, sizeof (*dle));
                }
                avl_destroy(&dl->dl_tree);
        }
        if (dl->dl_havecache) {
                dsl_deadlist_cache_entry_t *dlce;
                void *cookie = NULL;
                while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
                    != NULL) {
                        kmem_free(dlce, sizeof (*dlce));
                }
                avl_destroy(&dl->dl_cache);
        }
        dmu_buf_rele(dl->dl_dbuf, dl);
        dl->dl_dbuf = NULL;
        dl->dl_phys = NULL;
        dl->dl_os = NULL;
        dl->dl_object = 0;
}

uint64_t
dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
{
        if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
                return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
        return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
            sizeof (dsl_deadlist_phys_t), tx));
}

void
dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
{
        dmu_object_info_t doi;
        zap_cursor_t zc;
        zap_attribute_t za;
        int error;

        VERIFY0(dmu_object_info(os, dlobj, &doi));
        if (doi.doi_type == DMU_OT_BPOBJ) {
                bpobj_free(os, dlobj, tx);
                return;
        }

        for (zap_cursor_init(&zc, os, dlobj);
            (error = zap_cursor_retrieve(&zc, &za)) == 0;
            zap_cursor_advance(&zc)) {
                uint64_t obj = za.za_first_integer;
                if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
                        bpobj_decr_empty(os, tx);
                else
                        bpobj_free(os, obj, tx);
        }
        VERIFY3U(error, ==, ENOENT);
        zap_cursor_fini(&zc);
        VERIFY0(dmu_object_free(os, dlobj, tx));
}

static void
dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
        ASSERT(MUTEX_HELD(&dl->dl_lock));
        if (dle->dle_bpobj.bpo_object ==
            dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
                uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
                bpobj_close(&dle->dle_bpobj);
                bpobj_decr_empty(dl->dl_os, tx);
                VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
                VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
                    dle->dle_mintxg, obj, tx));
        }
        bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
}

static void
dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    uint64_t obj, dmu_tx_t *tx)
{
        ASSERT(MUTEX_HELD(&dl->dl_lock));
        if (dle->dle_bpobj.bpo_object !=
            dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
                bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
        } else {
                bpobj_close(&dle->dle_bpobj);
                bpobj_decr_empty(dl->dl_os, tx);
                VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
                VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
                    dle->dle_mintxg, obj, tx));
        }
}

void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
        dsl_deadlist_entry_t dle_tofind;
        dsl_deadlist_entry_t *dle;
        avl_index_t where;

        if (dl->dl_oldfmt) {
                bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
                return;
        }

        mutex_enter(&dl->dl_lock);
        dsl_deadlist_load_tree(dl);

        dmu_buf_will_dirty(dl->dl_dbuf, tx);

        int sign = bp_freed ? -1 : +1;
        dl->dl_phys->dl_used +=
            sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
        dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
        dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);

        dle_tofind.dle_mintxg = bp->blk_birth;
        dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
        if (dle == NULL)
                dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
        else
                dle = AVL_PREV(&dl->dl_tree, dle);

        if (dle == NULL) {
                zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
                    bp, (longlong_t)bp->blk_birth);
                dle = avl_first(&dl->dl_tree);
        }

        ASSERT3P(dle, !=, NULL);
        dle_enqueue(dl, dle, bp, bp_freed, tx);
        mutex_exit(&dl->dl_lock);
}

int
dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
        dsl_deadlist_t *dl = arg;
        dsl_deadlist_insert(dl, bp, B_FALSE, tx);
        return (0);
}

int
dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
        dsl_deadlist_t *dl = arg;
        dsl_deadlist_insert(dl, bp, B_TRUE, tx);
        return (0);
}

/*
 * Insert a new key into the deadlist; it must be > all current entries.
 * mintxg is not inclusive.
 */
void
dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
        uint64_t obj;
        dsl_deadlist_entry_t *dle;

        if (dl->dl_oldfmt)
                return;

        dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
        dle->dle_mintxg = mintxg;

        mutex_enter(&dl->dl_lock);
        dsl_deadlist_load_tree(dl);

        obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
        VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
        avl_add(&dl->dl_tree, dle);

        VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object,
            mintxg, obj, tx));
        mutex_exit(&dl->dl_lock);
}

/*
 * Remove this key, merging its entries into the previous key.
 */
void
dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
        dsl_deadlist_entry_t dle_tofind;
        dsl_deadlist_entry_t *dle, *dle_prev;

        if (dl->dl_oldfmt)
                return;
        mutex_enter(&dl->dl_lock);
        dsl_deadlist_load_tree(dl);

        dle_tofind.dle_mintxg = mintxg;
        dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
        ASSERT3P(dle, !=, NULL);
        dle_prev = AVL_PREV(&dl->dl_tree, dle);
        ASSERT3P(dle_prev, !=, NULL);

        dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);

        avl_remove(&dl->dl_tree, dle);
        bpobj_close(&dle->dle_bpobj);
        kmem_free(dle, sizeof (*dle));

        VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
        mutex_exit(&dl->dl_lock);
}

/*
 * Remove a deadlist entry and all of its contents by removing the entry from
 * the deadlist's avl tree, freeing the entry's bpobj and adjusting the
 * deadlist's space accounting accordingly.
 */
void
dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
        uint64_t used, comp, uncomp;
        dsl_deadlist_entry_t dle_tofind;
        dsl_deadlist_entry_t *dle;
        objset_t *os = dl->dl_os;

        if (dl->dl_oldfmt)
                return;

        mutex_enter(&dl->dl_lock);
        dsl_deadlist_load_tree(dl);

        dle_tofind.dle_mintxg = mintxg;
        dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
        VERIFY3P(dle, !=, NULL);

        avl_remove(&dl->dl_tree, dle);
        VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
        VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
        dmu_buf_will_dirty(dl->dl_dbuf, tx);
        dl->dl_phys->dl_used -= used;
        dl->dl_phys->dl_comp -= comp;
        dl->dl_phys->dl_uncomp -= uncomp;
        if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
                bpobj_decr_empty(os, tx);
        } else {
                bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
        }
        bpobj_close(&dle->dle_bpobj);
        kmem_free(dle, sizeof (*dle));
        mutex_exit(&dl->dl_lock);
}

/*
 * Clear out the contents of a deadlist_entry by freeing its bpobj,
 * replacing it with an empty bpobj, and adjusting the deadlist's
 * space accounting.
 */
void
dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
    dmu_tx_t *tx)
{
        uint64_t new_obj, used, comp, uncomp;
        objset_t *os = dl->dl_os;

        mutex_enter(&dl->dl_lock);
        VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
        VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
        dmu_buf_will_dirty(dl->dl_dbuf, tx);
        dl->dl_phys->dl_used -= used;
        dl->dl_phys->dl_comp -= comp;
        dl->dl_phys->dl_uncomp -= uncomp;
        if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
                bpobj_decr_empty(os, tx);
        else
                bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
        bpobj_close(&dle->dle_bpobj);
        new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
        VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
        VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
            new_obj, tx));
        ASSERT(bpobj_is_empty(&dle->dle_bpobj));
        mutex_exit(&dl->dl_lock);
}

/*
 * Return the first entry in the deadlist's avl tree.
 */
dsl_deadlist_entry_t *
dsl_deadlist_first(dsl_deadlist_t *dl)
{
        dsl_deadlist_entry_t *dle;

        mutex_enter(&dl->dl_lock);
        dsl_deadlist_load_tree(dl);
        dle = avl_first(&dl->dl_tree);
        mutex_exit(&dl->dl_lock);

        return (dle);
}

/*
 * Return the last entry in the deadlist's avl tree.
 */
dsl_deadlist_entry_t *
dsl_deadlist_last(dsl_deadlist_t *dl)
{
        dsl_deadlist_entry_t *dle;

        mutex_enter(&dl->dl_lock);
        dsl_deadlist_load_tree(dl);
        dle = avl_last(&dl->dl_tree);
        mutex_exit(&dl->dl_lock);

        return (dle);
}

/*
 * Walk ds's snapshots to regenerate the ZAP & AVL.
 */
static void
dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
    uint64_t mrs_obj, dmu_tx_t *tx)
{
        dsl_deadlist_t dl = { 0 };
        dsl_pool_t *dp = dmu_objset_pool(os);

        dsl_deadlist_open(&dl, os, dlobj);
        if (dl.dl_oldfmt) {
                dsl_deadlist_close(&dl);
                return;
        }

        while (mrs_obj != 0) {
                dsl_dataset_t *ds;
                VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
                dsl_deadlist_add_key(&dl,
                    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
                mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
                dsl_dataset_rele(ds, FTAG);
        }
        dsl_deadlist_close(&dl);
}

uint64_t
dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
    uint64_t mrs_obj, dmu_tx_t *tx)
{
        dsl_deadlist_entry_t *dle;
        uint64_t newobj;

        newobj = dsl_deadlist_alloc(dl->dl_os, tx);

        if (dl->dl_oldfmt) {
                dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
                return (newobj);
        }

        mutex_enter(&dl->dl_lock);
        dsl_deadlist_load_tree(dl);

        for (dle = avl_first(&dl->dl_tree); dle;
            dle = AVL_NEXT(&dl->dl_tree, dle)) {
                uint64_t obj;

                if (dle->dle_mintxg >= maxtxg)
                        break;

                obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
                VERIFY0(zap_add_int_key(dl->dl_os, newobj,
                    dle->dle_mintxg, obj, tx));
        }
        mutex_exit(&dl->dl_lock);
        return (newobj);
}

void
dsl_deadlist_space(dsl_deadlist_t *dl,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
        ASSERT(dsl_deadlist_is_open(dl));
        if (dl->dl_oldfmt) {
                VERIFY0(bpobj_space(&dl->dl_bpobj,
                    usedp, compp, uncompp));
                return;
        }

        mutex_enter(&dl->dl_lock);
        *usedp = dl->dl_phys->dl_used;
        *compp = dl->dl_phys->dl_comp;
        *uncompp = dl->dl_phys->dl_uncomp;
        mutex_exit(&dl->dl_lock);
}

/*
 * Return space used in the range (mintxg, maxtxg].
 * Includes maxtxg, does not include mintxg.
 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
 * UINT64_MAX).
 */
void
dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
        dsl_deadlist_cache_entry_t *dlce;
        dsl_deadlist_cache_entry_t dlce_tofind;
        avl_index_t where;

        if (dl->dl_oldfmt) {
                VERIFY0(bpobj_space_range(&dl->dl_bpobj,
                    mintxg, maxtxg, usedp, compp, uncompp));
                return;
        }

        *usedp = *compp = *uncompp = 0;

        mutex_enter(&dl->dl_lock);
        dsl_deadlist_load_cache(dl);
        dlce_tofind.dlce_mintxg = mintxg;
        dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);

        /*
         * If this mintxg doesn't exist, it may be an empty_bpobj which
         * is omitted from the sparse tree.  Start at the next non-empty
         * entry.
         */
        if (dlce == NULL)
                dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);

        for (; dlce && dlce->dlce_mintxg < maxtxg;
            dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
                *usedp += dlce->dlce_bytes;
                *compp += dlce->dlce_comp;
                *uncompp += dlce->dlce_uncomp;
        }

        mutex_exit(&dl->dl_lock);
}
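
/*
 * Illustrative call (the snapshot txgs are hypothetical deadlist keys):
 *
 *        uint64_t used, comp, uncomp;
 *        dsl_deadlist_space_range(dl, first_snap_txg, second_snap_txg,
 *            &used, &comp, &uncomp);
 *
 * This sums the stats of every cache entry with a key in
 * [first_snap_txg, second_snap_txg), i.e. the space used in the txg
 * range (first_snap_txg, second_snap_txg].
 */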

static void
dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
    dmu_tx_t *tx)
{
        dsl_deadlist_entry_t dle_tofind;
        dsl_deadlist_entry_t *dle;
        avl_index_t where;
        uint64_t used, comp, uncomp;
        bpobj_t bpo;

        ASSERT(MUTEX_HELD(&dl->dl_lock));

        VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
        VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));
        bpobj_close(&bpo);

        dsl_deadlist_load_tree(dl);

        dmu_buf_will_dirty(dl->dl_dbuf, tx);
        dl->dl_phys->dl_used += used;
        dl->dl_phys->dl_comp += comp;
        dl->dl_phys->dl_uncomp += uncomp;

        dle_tofind.dle_mintxg = birth;
        dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
        if (dle == NULL)
                dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
        dle_enqueue_subobj(dl, dle, obj, tx);
}

static int
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
        dsl_deadlist_t *dl = arg;
        dsl_deadlist_insert(dl, bp, bp_freed, tx);
        return (0);
}

/*
 * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
 * an empty deadlist.
 */
void
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
{
        zap_cursor_t zc;
        zap_attribute_t za;
        dmu_buf_t *bonus;
        dsl_deadlist_phys_t *dlp;
        dmu_object_info_t doi;
        int error;

        VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
        if (doi.doi_type == DMU_OT_BPOBJ) {
                bpobj_t bpo;
                VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
                VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx));
                bpobj_close(&bpo);
                return;
        }

        mutex_enter(&dl->dl_lock);
        for (zap_cursor_init(&zc, dl->dl_os, obj);
            (error = zap_cursor_retrieve(&zc, &za)) == 0;
            zap_cursor_advance(&zc)) {
                uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
                dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
                VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
        }
        VERIFY3U(error, ==, ENOENT);
        zap_cursor_fini(&zc);

        VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
        dlp = bonus->db_data;
        dmu_buf_will_dirty(bonus, tx);
        memset(dlp, 0, sizeof (*dlp));
        dmu_buf_rele(bonus, FTAG);
        mutex_exit(&dl->dl_lock);
}

/*
 * Remove entries on dl that are born > mintxg, and put them on the bpobj.
 */
void
dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
    dmu_tx_t *tx)
{
        dsl_deadlist_entry_t dle_tofind;
        dsl_deadlist_entry_t *dle;
        avl_index_t where;

        ASSERT(!dl->dl_oldfmt);

        mutex_enter(&dl->dl_lock);
        dmu_buf_will_dirty(dl->dl_dbuf, tx);
        dsl_deadlist_load_tree(dl);

        dle_tofind.dle_mintxg = mintxg;
        dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
        if (dle == NULL)
                dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
        while (dle) {
                uint64_t used, comp, uncomp;
                dsl_deadlist_entry_t *dle_next;

                bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);

                VERIFY0(bpobj_space(&dle->dle_bpobj,
                    &used, &comp, &uncomp));
                ASSERT3U(dl->dl_phys->dl_used, >=, used);
                ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
                ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
                dl->dl_phys->dl_used -= used;
                dl->dl_phys->dl_comp -= comp;
                dl->dl_phys->dl_uncomp -= uncomp;

                VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object,
                    dle->dle_mintxg, tx));

                dle_next = AVL_NEXT(&dl->dl_tree, dle);
                avl_remove(&dl->dl_tree, dle);
                bpobj_close(&dle->dle_bpobj);
                kmem_free(dle, sizeof (*dle));
                dle = dle_next;
        }
        mutex_exit(&dl->dl_lock);
}

typedef struct livelist_entry {
        blkptr_t le_bp;
        uint32_t le_refcnt;
        avl_node_t le_node;
} livelist_entry_t;

static int
livelist_compare(const void *larg, const void *rarg)
{
        const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;
        const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;

        /* Sort them according to dva[0] */
        uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
        uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);

        if (l_dva0_vdev != r_dva0_vdev)
                return (TREE_CMP(l_dva0_vdev, r_dva0_vdev));

        /* if vdevs are equal, sort by offsets. */
        uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
        uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
        if (l_dva0_offset == r_dva0_offset)
                ASSERT3U(l->blk_birth, ==, r->blk_birth);
        return (TREE_CMP(l_dva0_offset, r_dva0_offset));
}

struct livelist_iter_arg {
        avl_tree_t *avl;
        bplist_t *to_free;
        zthr_t *t;
};

/*
 * Expects an AVL tree which is incrementally filled with FREE blkptrs
 * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
 * corresponding FREE are stored in the supplied bplist.
 *
 * Note that multiple FREE and ALLOC entries for the same blkptr may
 * be encountered when dedup is involved. For this reason we keep a
 * refcount for all the FREE entries of each blkptr and ensure that
 * each of those FREE entries has a corresponding ALLOC preceding it.
 */
static int
dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
        struct livelist_iter_arg *lia = arg;
        avl_tree_t *avl = lia->avl;
        bplist_t *to_free = lia->to_free;
        zthr_t *t = lia->t;
        ASSERT(tx == NULL);

        if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
                return (SET_ERROR(EINTR));

        livelist_entry_t node;
        node.le_bp = *bp;
        livelist_entry_t *found = avl_find(avl, &node, NULL);
        if (bp_freed) {
                if (found == NULL) {
                        /* first free entry for this blkptr */
                        livelist_entry_t *e =
                            kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);
                        e->le_bp = *bp;
                        e->le_refcnt = 1;
                        avl_add(avl, e);
                } else {
                        /* dedup block free */
                        ASSERT(BP_GET_DEDUP(bp));
                        ASSERT3U(BP_GET_CHECKSUM(bp), ==,
                            BP_GET_CHECKSUM(&found->le_bp));
                        ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
                        found->le_refcnt++;
                }
        } else {
                if (found == NULL) {
                        /* block is currently marked as allocated */
                        bplist_append(to_free, bp);
                } else {
                        /* alloc matches a free entry */
                        ASSERT3U(found->le_refcnt, !=, 0);
                        found->le_refcnt--;
                        if (found->le_refcnt == 0) {
                                /* all tracked free pairs have been matched */
                                avl_remove(avl, found);
                                kmem_free(found, sizeof (livelist_entry_t));
                        } else {
                                /*
                                 * This is definitely a deduped blkptr so
                                 * let's validate it.
                                 */
                                ASSERT(BP_GET_DEDUP(bp));
                                ASSERT3U(BP_GET_CHECKSUM(bp), ==,
                                    BP_GET_CHECKSUM(&found->le_bp));
                        }
                }
        }
        return (0);
}
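
/*
 * Worked dedup example: if the iterator encounters, in order, FREE(X),
 * FREE(X), ALLOC(X), ALLOC(X) for the same blkptr X, the first FREE
 * inserts X with le_refcnt = 1, the second bumps it to 2, and the two
 * ALLOCs decrement it back to 0, at which point the node is removed.
 * Nothing is appended to the to_free bplist, since every ALLOC was
 * matched by a FREE.
 */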

/*
 * Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs
 * which have an ALLOC entry but no matching FREE.
 */
int
dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
    uint64_t *size)
{
        avl_tree_t avl;
        avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
            offsetof(livelist_entry_t, le_node));

        /* process the sublist */
        struct livelist_iter_arg arg = {
            .avl = &avl,
            .to_free = to_free,
            .t = t
        };
        int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
        VERIFY(err != 0 || avl_numnodes(&avl) == 0);

        void *cookie = NULL;
        livelist_entry_t *le = NULL;
        while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) {
                kmem_free(le, sizeof (livelist_entry_t));
        }
        avl_destroy(&avl);
        return (err);
}
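
/*
 * A hypothetical caller pattern, sketched under the assumption that the
 * collected blkptrs are consumed afterwards in syncing context (the
 * variable names are illustrative):
 *
 *        bplist_t to_free;
 *        uint64_t size;
 *
 *        bplist_create(&to_free);
 *        VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
 *            NULL, &size));
 *        (free or otherwise consume the blkptrs via bplist_iterate())
 *        bplist_destroy(&to_free);
 */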

ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW,
        "Size to start the next sub-livelist in a livelist");

ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW,
        "Threshold at which livelist is disabled");
