FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/bpobj.c


/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
        spa_t *spa = dmu_objset_spa(os);
        dsl_pool_t *dp = dmu_objset_pool(os);

        if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                        ASSERT0(dp->dp_empty_bpobj);
                        dp->dp_empty_bpobj =
                            bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
                        VERIFY(zap_add(os,
                            DMU_POOL_DIRECTORY_OBJECT,
                            DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
                            &dp->dp_empty_bpobj, tx) == 0);
                }
                spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
                ASSERT(dp->dp_empty_bpobj != 0);
                return (dp->dp_empty_bpobj);
        } else {
                return (bpobj_alloc(os, blocksize, tx));
        }
}

void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
        dsl_pool_t *dp = dmu_objset_pool(os);

        spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
        if (!spa_feature_is_active(dmu_objset_spa(os),
            SPA_FEATURE_EMPTY_BPOBJ)) {
                VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
                    DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_EMPTY_BPOBJ, tx));
                VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
                dp->dp_empty_bpobj = 0;
        }
}

uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
        int size;

        if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
                size = BPOBJ_SIZE_V0;
        else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
                size = BPOBJ_SIZE_V1;
        else if (!spa_feature_is_active(dmu_objset_spa(os),
            SPA_FEATURE_LIVELIST))
                size = BPOBJ_SIZE_V2;
        else
                size = sizeof (bpobj_phys_t);

        return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
            DMU_OT_BPOBJ_HDR, size, tx));
}

void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
        int64_t i;
        bpobj_t bpo;
        dmu_object_info_t doi;
        int epb;
        dmu_buf_t *dbuf = NULL;

        ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
        VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

        mutex_enter(&bpo.bpo_lock);

        if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
                goto out;

        VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
        epb = doi.doi_data_block_size / sizeof (uint64_t);

        for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
                uint64_t *objarray;
                uint64_t offset, blkoff;

                offset = i * sizeof (uint64_t);
                blkoff = P2PHASE(i, epb);

                if (dbuf == NULL || dbuf->db_offset > offset) {
                        if (dbuf)
                                dmu_buf_rele(dbuf, FTAG);
                        VERIFY3U(0, ==, dmu_buf_hold(os,
                            bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
                }

                ASSERT3U(offset, >=, dbuf->db_offset);
                ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

                objarray = dbuf->db_data;
                bpobj_free(os, objarray[blkoff], tx);
        }
        if (dbuf) {
                dmu_buf_rele(dbuf, FTAG);
                dbuf = NULL;
        }
        VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
        mutex_exit(&bpo.bpo_lock);
        bpobj_close(&bpo);

        VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}

int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
        dmu_object_info_t doi;
        int err;

        err = dmu_object_info(os, object, &doi);
        if (err)
                return (err);

        memset(bpo, 0, sizeof (*bpo));
        mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

        ASSERT(bpo->bpo_dbuf == NULL);
        ASSERT(bpo->bpo_phys == NULL);
        ASSERT(object != 0);
        ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
        ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

        err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
        if (err)
                return (err);

        bpo->bpo_os = os;
        bpo->bpo_object = object;
        bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
        bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
        bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
        bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
        bpo->bpo_phys = bpo->bpo_dbuf->db_data;
        return (0);
}

boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
        return (bpo->bpo_object != 0);
}

void
bpobj_close(bpobj_t *bpo)
{
        /* Lame workaround for closing a bpobj that was never opened. */
        if (bpo->bpo_object == 0)
                return;

        dmu_buf_rele(bpo->bpo_dbuf, bpo);
        if (bpo->bpo_cached_dbuf != NULL)
                dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
        bpo->bpo_dbuf = NULL;
        bpo->bpo_phys = NULL;
        bpo->bpo_cached_dbuf = NULL;
        bpo->bpo_object = 0;

        mutex_destroy(&bpo->bpo_lock);
}
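
/*
 * Illustrative sketch (not part of the upstream file): the typical
 * open/use/close lifecycle of a bpobj_t as implied by the functions
 * above. Compiled out with #if 0; "example_bpobj_lifecycle" and "my_obj"
 * are hypothetical names, and the sketch assumes the caller already
 * holds a valid bpobj object number in the objset.
 */
#if 0
static int
example_bpobj_lifecycle(objset_t *os, uint64_t my_obj)
{
        bpobj_t bpo;
        uint64_t used, comp, uncomp;
        int err;

        /* bpobj_open() zeroes the struct and holds the bonus buffer. */
        err = bpobj_open(&bpo, os, my_obj);
        if (err != 0)
                return (err);
        ASSERT(bpobj_is_open(&bpo));

        /* Query cumulative space accounting from the on-disk header. */
        err = bpobj_space(&bpo, &used, &comp, &uncomp);

        /* bpobj_close() releases the dbufs and destroys the lock. */
        bpobj_close(&bpo);
        return (err);
}
#endif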

static boolean_t
bpobj_is_empty_impl(bpobj_t *bpo)
{
        ASSERT(MUTEX_HELD(&bpo->bpo_lock));
        return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
            (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}

boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
        mutex_enter(&bpo->bpo_lock);
        boolean_t is_empty = bpobj_is_empty_impl(bpo);
        mutex_exit(&bpo->bpo_lock);
        return (is_empty);
}

/*
 * A recursive iteration of the bpobjs would be nice here but we run the risk
 * of overflowing function stack space.  Instead, find each subobj and add it
 * to the head of our list so it can be scanned for subobjs.  Like a
 * recursive implementation, the "deepest" subobjs will be freed first.
 * When a subobj is found to have no additional subobjs, free it.
 */
typedef struct bpobj_info {
        bpobj_t *bpi_bpo;
        /*
         * This object is a subobj of bpi_parent,
         * at bpi_index in its subobj array.
         */
        struct bpobj_info *bpi_parent;
        uint64_t bpi_index;
        /* How many of our subobj's are left to process. */
        uint64_t bpi_unprocessed_subobjs;
        /* True after having visited this bpo's directly referenced BPs. */
        boolean_t bpi_visited;
        list_node_t bpi_node;
} bpobj_info_t;

static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
        bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
        bpi->bpi_bpo = bpo;
        bpi->bpi_parent = parent;
        bpi->bpi_index = index;
        if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
                bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
        }
        return (bpi);
}

/*
 * Update bpobj and all of its parents with new space accounting.
 */
static void
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{
        for (; bpi != NULL; bpi = bpi->bpi_parent) {
                bpobj_t *p = bpi->bpi_bpo;
                ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
                p->bpo_phys->bpo_bytes -= freed;
                ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
                if (p->bpo_havecomp) {
                        p->bpo_phys->bpo_comp -= comp_freed;
                        p->bpo_phys->bpo_uncomp -= uncomp_freed;
                }
        }
}

static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
    int64_t start, dmu_tx_t *tx, boolean_t free)
{
        int err = 0;
        int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
        dmu_buf_t *dbuf = NULL;
        bpobj_t *bpo = bpi->bpi_bpo;

        for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
                uint64_t offset = i * sizeof (blkptr_t);
                uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

                if (dbuf == NULL || dbuf->db_offset > offset) {
                        if (dbuf)
                                dmu_buf_rele(dbuf, FTAG);
                        err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
                            offset, FTAG, &dbuf, 0);
                        if (err)
                                break;
                }

                ASSERT3U(offset, >=, dbuf->db_offset);
                ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

                blkptr_t *bparray = dbuf->db_data;
                blkptr_t *bp = &bparray[blkoff];

                boolean_t bp_freed = BP_GET_FREE(bp);
                err = func(arg, bp, bp_freed, tx);
                if (err)
                        break;

                if (free) {
                        int sign = bp_freed ? -1 : +1;
                        spa_t *spa = dmu_objset_spa(bpo->bpo_os);
                        freed += sign * bp_get_dsize_sync(spa, bp);
                        comp_freed += sign * BP_GET_PSIZE(bp);
                        uncomp_freed += sign * BP_GET_UCSIZE(bp);
                        ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
                        bpo->bpo_phys->bpo_num_blkptrs--;
                        ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
                        if (bp_freed) {
                                ASSERT(bpo->bpo_havefreed);
                                bpo->bpo_phys->bpo_num_freed--;
                                ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
                        }
                }
        }
        if (free) {
                propagate_space_reduction(bpi, freed, comp_freed,
                    uncomp_freed, tx);
                VERIFY0(dmu_free_range(bpo->bpo_os,
                    bpo->bpo_object,
                    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
                    DMU_OBJECT_END, tx));
        }
        if (dbuf) {
                dmu_buf_rele(dbuf, FTAG);
                dbuf = NULL;
        }
        return (err);
}

/*
 * Given an initial bpo, start by freeing the BPs that are directly referenced
 * by that bpo. If the bpo has subobjs, read in its last subobj and push the
 * subobj to our stack. By popping items off our stack, eventually we will
 * encounter a bpo that has no subobjs.  We can free its bpobj_info_t, and if
 * requested also free the now-empty bpo from disk and decrement
 * its parent's subobj count. We continue popping each subobj from our stack,
 * visiting its last subobj until they too have no more subobjs, and so on.
 */
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
        list_t stack;
        bpobj_info_t *bpi;
        int err = 0;

        /*
         * Create a "stack" for us to work with without worrying about
         * stack overflows. Initialize it with the initial_bpo.
         */
        list_create(&stack, sizeof (bpobj_info_t),
            offsetof(bpobj_info_t, bpi_node));
        mutex_enter(&initial_bpo->bpo_lock);

        if (bpobj_size != NULL)
                *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;

        list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

        while ((bpi = list_head(&stack)) != NULL) {
                bpobj_t *bpo = bpi->bpi_bpo;

                ASSERT3P(bpo, !=, NULL);
                ASSERT(MUTEX_HELD(&bpo->bpo_lock));
                ASSERT(bpobj_is_open(bpo));

                if (free)
                        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

                if (bpi->bpi_visited == B_FALSE) {
                        err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
                            free);
                        bpi->bpi_visited = B_TRUE;
                        if (err != 0)
                                break;
                }
                /*
                 * We've finished with this bpo's directly-referenced BP's and
                 * it has no more unprocessed subobjs. We can free its
                 * bpobj_info_t (unless it is the topmost, initial_bpo).
                 * If we are freeing from disk, we can also do that.
                 */
                if (bpi->bpi_unprocessed_subobjs == 0) {
                        /*
                         * If there are no entries, there should
                         * be no bytes.
                         */
                        if (bpobj_is_empty_impl(bpo)) {
                                ASSERT0(bpo->bpo_phys->bpo_bytes);
                                ASSERT0(bpo->bpo_phys->bpo_comp);
                                ASSERT0(bpo->bpo_phys->bpo_uncomp);
                        }

                        /* The initial_bpo has no parent and is not closed. */
                        if (bpi->bpi_parent != NULL) {
                                if (free) {
                                        bpobj_t *p = bpi->bpi_parent->bpi_bpo;

                                        ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
                                        ASSERT3U(p->bpo_phys->bpo_num_subobjs,
                                            >, 0);
                                        ASSERT3U(bpi->bpi_index, ==,
                                            p->bpo_phys->bpo_num_subobjs - 1);
                                        ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
                                            tx));

                                        p->bpo_phys->bpo_num_subobjs--;

                                        VERIFY0(dmu_free_range(p->bpo_os,
                                            p->bpo_phys->bpo_subobjs,
                                            bpi->bpi_index * sizeof (uint64_t),
                                            sizeof (uint64_t), tx));

                                        /* eliminate the empty subobj list */
                                        if (bpo->bpo_havesubobj &&
                                            bpo->bpo_phys->bpo_subobjs != 0) {
                                                ASSERT0(bpo->bpo_phys->
                                                    bpo_num_subobjs);
                                                err = dmu_object_free(
                                                    bpo->bpo_os,
                                                    bpo->bpo_phys->bpo_subobjs,
                                                    tx);
                                                if (err)
                                                        break;
                                                bpo->bpo_phys->bpo_subobjs = 0;
                                        }
                                        err = dmu_object_free(p->bpo_os,
                                            bpo->bpo_object, tx);
                                        if (err)
                                                break;
                                }

                                mutex_exit(&bpo->bpo_lock);
                                bpobj_close(bpo);
                                kmem_free(bpo, sizeof (bpobj_t));
                        } else {
                                mutex_exit(&bpo->bpo_lock);
                        }

                        /*
                         * Finished processing this bpo. Unlock, and free
                         * our "stack" info.
                         */
                        list_remove_head(&stack);
                        kmem_free(bpi, sizeof (bpobj_info_t));
                } else {
                        /*
                         * We have unprocessed subobjs. Process the next one.
                         */
                        ASSERT(bpo->bpo_havecomp);
                        ASSERT3P(bpobj_size, ==, NULL);

                        /* Add the last subobj to stack. */
                        int64_t i = bpi->bpi_unprocessed_subobjs - 1;
                        uint64_t offset = i * sizeof (uint64_t);

                        uint64_t obj_from_sublist;
                        err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                            offset, sizeof (uint64_t), &obj_from_sublist,
                            DMU_READ_PREFETCH);
                        if (err)
                                break;
                        bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
                            KM_SLEEP);

                        err = bpobj_open(sublist, bpo->bpo_os,
                            obj_from_sublist);
                        if (err)
                                break;

                        list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
                        mutex_enter(&sublist->bpo_lock);
                        bpi->bpi_unprocessed_subobjs--;
                }
        }
        /*
         * Cleanup anything left on the "stack" after we left the loop.
         * Every bpo on the stack is locked so we must remember to undo
         * that now (in LIFO order).
         */
        while ((bpi = list_remove_head(&stack)) != NULL) {
                bpobj_t *bpo = bpi->bpi_bpo;
                ASSERT(err != 0);
                ASSERT3P(bpo, !=, NULL);

                mutex_exit(&bpo->bpo_lock);

                /* do not free the initial_bpo */
                if (bpi->bpi_parent != NULL) {
                        bpobj_close(bpi->bpi_bpo);
                        kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
                }
                kmem_free(bpi, sizeof (bpobj_info_t));
        }

        list_destroy(&stack);

        return (err);
}

/*
 * Iterate and remove the entries.  If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
        return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}

/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 *
 * If there are no subobjs:
 *
 * *bpobj_size can be used to return the number of block pointers in the
 * bpobj.  Note that this may be different from the number of block pointers
 * that are iterated over, if iteration is terminated early (e.g. by the func
 * returning nonzero).
 *
 * If there are concurrent (or subsequent) modifications to the bpobj then the
 * returned *bpobj_size can be passed as "start" to
 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    uint64_t *bpobj_size)
{
        return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}

/*
 * Iterate over the blkptrs in the bpobj beginning at index start. If func
 * returns nonzero, iteration will stop. This is a livelist specific function
 * since it assumes that there are no subobjs present.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
        if (bpo->bpo_havesubobj)
                VERIFY0(bpo->bpo_phys->bpo_subobjs);
        bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
        int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
        kmem_free(bpi, sizeof (bpobj_info_t));
        return (err);
}
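
/*
 * Illustrative sketch (not part of the upstream file): a minimal
 * bpobj_itor_t callback used with bpobj_iterate_nofree(), followed by a
 * resumable second pass via livelist_bpobj_iterate_from_nofree() as the
 * comment above bpobj_iterate_nofree() describes. Compiled out with
 * #if 0; "count_arg_t", "count_cb" and "example_count" are hypothetical
 * names, and the resume step assumes a livelist-style bpobj (no subobjs).
 */
#if 0
typedef struct count_arg {
        uint64_t ca_allocated;
        uint64_t ca_freed;
} count_arg_t;

static int
count_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
        (void) bp, (void) tx;
        count_arg_t *ca = arg;

        if (bp_freed)
                ca->ca_freed++;
        else
                ca->ca_allocated++;
        return (0);     /* nonzero would stop the iteration */
}

static int
example_count(bpobj_t *bpo)
{
        count_arg_t ca = { 0 };
        uint64_t size;

        /* First pass: visit every BP, remember how many were present. */
        int err = bpobj_iterate_nofree(bpo, count_cb, &ca, &size);
        if (err != 0)
                return (err);

        /*
         * If new BPs were appended meanwhile, the iteration can be
         * resumed from the previously returned size.
         */
        return (livelist_bpobj_iterate_from_nofree(bpo, count_cb, &ca,
            (int64_t)size));
}
#endif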

/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+-----+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 * +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                              |
 *       /-----------------------------------------------------/
 *       v
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.  In this case we copy the subobj's
 * contents to the parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                          ^                                ^
 * +--------------+         |              +--------------+  |
 * | sub-bpobj    |---------^------------> | subsubobj    |  ^
 * +----+----+----+         |              +-----+-----+--+  |
 * | BP | BP |-->-->-->-->-/               | OBJ | OBJ |-->-/
 * +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                                                           ^
 * +--------------+                        +--------------+  |
 * | sub-bpobj    |----------------------> | subsubobj    |  ^
 * +----+----+----+----+---------+----+    +-----+-----+--+  |
 * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
 * +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 * +--------------+                     +--------------+
 * | bpobj        |-------------------->| subobj list  |
 * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                       |
 *       /--------------------------------------------------------------/
 *       v
 * +--------------+
 * | sub-bpobj    |
 * +----+----+----+----+---------+----+
 * | bp | bp | bp | bp |   ...   | bp |
 * +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
        bpobj_t subbpo;
        uint64_t used, comp, uncomp, subsubobjs;
        boolean_t copy_subsub = B_TRUE;
        boolean_t copy_bps = B_TRUE;

        ASSERT(bpobj_is_open(bpo));
        ASSERT(subobj != 0);
        ASSERT(bpo->bpo_havesubobj);
        ASSERT(bpo->bpo_havecomp);
        ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

        if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
                bpobj_decr_empty(bpo->bpo_os, tx);
                return;
        }

        VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
        VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

        if (bpobj_is_empty(&subbpo)) {
                /* No point in having an empty subobj. */
                bpobj_close(&subbpo);
                bpobj_free(bpo->bpo_os, subobj, tx);
                return;
        }

        mutex_enter(&bpo->bpo_lock);
        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

        dmu_object_info_t doi;

        if (bpo->bpo_phys->bpo_subobjs != 0) {
                ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    &doi));
                ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
        }

        /*
         * If subobj has only one block of subobjs, then move subobj's
         * subobjs to bpo's subobj list directly.  This reduces recursion in
         * bpobj_iterate due to nested subobjs.
         */
        subsubobjs = subbpo.bpo_phys->bpo_subobjs;
        if (subsubobjs != 0) {
                VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
                if (doi.doi_max_offset > doi.doi_data_block_size) {
                        copy_subsub = B_FALSE;
                }
        }

        /*
         * If, in addition to having only one block of subobj's, subobj has
         * only one block of bp's, then move subobj's bp's to bpo's bp list
         * directly. This reduces recursion in bpobj_iterate due to nested
         * subobjs.
         */
        VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
        if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
                copy_bps = B_FALSE;
        }

        if (copy_subsub && subsubobjs != 0) {
                dmu_buf_t *subdb;
                uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

                VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
                    0, FTAG, &subdb, 0));
                /*
                 * Make sure that we are not asking dmu_write()
                 * to write more data than we have in our buffer.
                 */
                VERIFY3U(subdb->db_size, >=,
                    numsubsub * sizeof (subobj));
                if (bpo->bpo_phys->bpo_subobjs == 0) {
                        bpo->bpo_phys->bpo_subobjs =
                            dmu_object_alloc(bpo->bpo_os,
                            DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
                            DMU_OT_NONE, 0, tx);
                }
                dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
                    numsubsub * sizeof (subobj), subdb->db_data, tx);
                dmu_buf_rele(subdb, FTAG);
                bpo->bpo_phys->bpo_num_subobjs += numsubsub;

                dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
                subbpo.bpo_phys->bpo_subobjs = 0;
                VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
        }

        if (copy_bps) {
                dmu_buf_t *bps;
                uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

                ASSERT(copy_subsub);
                VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
                    0, FTAG, &bps, 0));

                /*
                 * Make sure that we are not asking dmu_write()
                 * to write more data than we have in our buffer.
                 */
                VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
                dmu_write(bpo->bpo_os, bpo->bpo_object,
                    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
                    numbps * sizeof (blkptr_t),
                    bps->db_data, tx);
                dmu_buf_rele(bps, FTAG);
                bpo->bpo_phys->bpo_num_blkptrs += numbps;

                bpobj_close(&subbpo);
                VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
        } else {
                bpobj_close(&subbpo);
                if (bpo->bpo_phys->bpo_subobjs == 0) {
                        bpo->bpo_phys->bpo_subobjs =
                            dmu_object_alloc(bpo->bpo_os,
                            DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
                            DMU_OT_NONE, 0, tx);
                }

                dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
                    sizeof (subobj), &subobj, tx);
                bpo->bpo_phys->bpo_num_subobjs++;
        }

        bpo->bpo_phys->bpo_bytes += used;
        bpo->bpo_phys->bpo_comp += comp;
        bpo->bpo_phys->bpo_uncomp += uncomp;
        mutex_exit(&bpo->bpo_lock);
}

void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
        blkptr_t stored_bp = *bp;
        uint64_t offset;
        int blkoff;
        blkptr_t *bparray;

        ASSERT(bpobj_is_open(bpo));
        ASSERT(!BP_IS_HOLE(bp));
        ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

        if (BP_IS_EMBEDDED(bp)) {
                /*
                 * The bpobj will compress better without the payload.
                 *
                 * Note that we store EMBEDDED bp's because they have an
                 * uncompressed size, which must be accounted for.  An
                 * alternative would be to add their size to bpo_uncomp
                 * without storing the bp, but that would create additional
                 * complications: bpo_uncomp would be inconsistent with the
                 * set of BP's stored, and bpobj_iterate() wouldn't visit
                 * all the space accounted for in the bpobj.
                 */
                memset(&stored_bp, 0, sizeof (stored_bp));
                stored_bp.blk_prop = bp->blk_prop;
                stored_bp.blk_birth = bp->blk_birth;
        } else if (!BP_GET_DEDUP(bp)) {
                /* The bpobj will compress better without the checksum */
                memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
        }

        stored_bp.blk_fill = 0;
        BP_SET_FREE(&stored_bp, bp_freed);

        mutex_enter(&bpo->bpo_lock);

        offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
        blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

        if (bpo->bpo_cached_dbuf == NULL ||
            offset < bpo->bpo_cached_dbuf->db_offset ||
            offset >= bpo->bpo_cached_dbuf->db_offset +
            bpo->bpo_cached_dbuf->db_size) {
                if (bpo->bpo_cached_dbuf)
                        dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
                VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
                    offset, bpo, &bpo->bpo_cached_dbuf, 0));
        }

        dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
        bparray = bpo->bpo_cached_dbuf->db_data;
        bparray[blkoff] = stored_bp;

        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
        bpo->bpo_phys->bpo_num_blkptrs++;
        int sign = bp_freed ? -1 : +1;
        bpo->bpo_phys->bpo_bytes += sign *
            bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
        if (bpo->bpo_havecomp) {
                bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
                bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
        }
        if (bp_freed) {
                ASSERT(bpo->bpo_havefreed);
                bpo->bpo_phys->bpo_num_freed++;
        }
        mutex_exit(&bpo->bpo_lock);
}
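
/*
 * Illustrative sketch (not part of the upstream file): appending a block
 * pointer to a bpobj from syncing context. Compiled out with #if 0;
 * "example_enqueue" is a hypothetical name. A real caller must already be
 * in an open transaction and must not pass the pool's shared
 * dp_empty_bpobj (see the ASSERTs in bpobj_enqueue() above).
 */
#if 0
static void
example_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
{
        /* Record an allocated BP (bp_freed == B_FALSE)... */
        bpobj_enqueue(bpo, bp, B_FALSE, tx);

        /*
         * ...or record a freed BP; this requires the on-disk bpobj to
         * be at least BPOBJ_SIZE_V2 (bpo_havefreed), e.g. a livelist.
         */
        bpobj_enqueue(bpo, bp, B_TRUE, tx);
}
#endif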

struct space_range_arg {
        spa_t *spa;
        uint64_t mintxg;
        uint64_t maxtxg;
        uint64_t used;
        uint64_t comp;
        uint64_t uncomp;
};

static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
        (void) bp_freed, (void) tx;
        struct space_range_arg *sra = arg;

        if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
                if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
                        sra->used += bp_get_dsize_sync(sra->spa, bp);
                else
                        sra->used += bp_get_dsize(sra->spa, bp);
                sra->comp += BP_GET_PSIZE(bp);
                sra->uncomp += BP_GET_UCSIZE(bp);
        }
        return (0);
}

int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
        ASSERT(bpobj_is_open(bpo));
        mutex_enter(&bpo->bpo_lock);

        *usedp = bpo->bpo_phys->bpo_bytes;
        if (bpo->bpo_havecomp) {
                *compp = bpo->bpo_phys->bpo_comp;
                *uncompp = bpo->bpo_phys->bpo_uncomp;
                mutex_exit(&bpo->bpo_lock);
                return (0);
        } else {
                mutex_exit(&bpo->bpo_lock);
                return (bpobj_space_range(bpo, 0, UINT64_MAX,
                    usedp, compp, uncompp));
        }
}

/*
 * Return the amount of space in the bpobj which is:
 * mintxg < blk_birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
        struct space_range_arg sra = { 0 };
        int err;

        ASSERT(bpobj_is_open(bpo));

        /*
         * As an optimization, if they want the whole txg range, just
         * get bpo_bytes rather than iterating over the bps.
         */
        if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
                return (bpobj_space(bpo, usedp, compp, uncompp));

        sra.spa = dmu_objset_spa(bpo->bpo_os);
        sra.mintxg = mintxg;
        sra.maxtxg = maxtxg;

        err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
        *usedp = sra.used;
        *compp = sra.comp;
        *uncompp = sra.uncomp;
        return (err);
}
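
/*
 * Illustrative sketch (not part of the upstream file): querying how much
 * space in a bpobj was born within a txg window. Compiled out with #if 0;
 * "example_space_in_range" is a hypothetical name.
 */
#if 0
static int
example_space_in_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg)
{
        uint64_t used, comp, uncomp;
        int err;

        /*
         * Counts only BPs with mintxg < blk_birth <= maxtxg; passing
         * 0 / UINT64_MAX degenerates to the cheap bpobj_space() path
         * when bpo_havecomp is set.
         */
        err = bpobj_space_range(bpo, mintxg, maxtxg, &used, &comp, &uncomp);
        if (err == 0)
                zfs_dbgmsg("used=%llu comp=%llu uncomp=%llu",
                    (u_longlong_t)used, (u_longlong_t)comp,
                    (u_longlong_t)uncomp);
        return (err);
}
#endif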

/*
 * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
 * bpobj are designated as free or allocated, that information is not
 * preserved in bplists.
 */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
        (void) bp_freed, (void) tx;
        bplist_t *bpl = arg;
        bplist_append(bpl, bp);
        return (0);
}
