


FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/dmu_object.c


    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   23  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
   24  * Copyright 2014 HybridCluster. All rights reserved.
   25  */
   26 
   27 #include <sys/dbuf.h>
   28 #include <sys/dmu.h>
   29 #include <sys/dmu_impl.h>
   30 #include <sys/dmu_objset.h>
   31 #include <sys/dmu_tx.h>
   32 #include <sys/dnode.h>
   33 #include <sys/zap.h>
   34 #include <sys/zfeature.h>
   35 #include <sys/dsl_dataset.h>
   36 
   37 /*
   38  * Each of the concurrent object allocators will grab
   39  * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
   40  * grab 128 slots, which is 4 blocks worth.  This was experimentally
   41  * determined to be the lowest value that eliminates the measurable effect
   42  * of lock contention from this code path.
   43  */
   44 uint_t dmu_object_alloc_chunk_shift = 7;
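
The arithmetic in the comment above is easy to verify: with 512-byte dnodes packed into 16 KiB metadnode blocks (the conventional values, restated here as assumptions rather than taken from this file), a shift of 7 yields 128 slots, or 4 blocks per chunk. A minimal standalone sketch:

/*
 * Standalone sketch: how dmu_object_alloc_chunk_shift maps to slots and blocks.
 * The constants below are assumed locally, not pulled from the ZFS headers.
 */
#include <stdio.h>

#define SKETCH_DNODE_SHIFT        9   /* 512-byte minimum dnode (assumed) */
#define SKETCH_DNODE_BLOCK_SHIFT  14  /* 16 KiB metadnode block (assumed) */
#define SKETCH_DNODES_PER_BLOCK   (1 << (SKETCH_DNODE_BLOCK_SHIFT - SKETCH_DNODE_SHIFT))

int
main(void)
{
        unsigned shift = 7;                 /* default dmu_object_alloc_chunk_shift */
        unsigned slots = 1u << shift;       /* dnode slots grabbed per chunk */
        unsigned blocks = slots / SKETCH_DNODES_PER_BLOCK;

        /* With the assumed constants: 128 slots, 32 slots per block, 4 blocks. */
        printf("%u slots per chunk, %u slots per block, %u blocks per chunk\n",
            slots, SKETCH_DNODES_PER_BLOCK, blocks);
        return (0);
}
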
   45 
   46 static uint64_t
   47 dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
   48     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
   49     int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
   50 {
   51         uint64_t object;
   52         uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
   53             (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
   54         dnode_t *dn = NULL;
   55         int dn_slots = dnodesize >> DNODE_SHIFT;
   56         boolean_t restarted = B_FALSE;
   57         uint64_t *cpuobj = NULL;
   58         uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
   59         int error;
   60 
   61         cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
   62             os->os_obj_next_percpu_len];
   63 
   64         if (dn_slots == 0) {
   65                 dn_slots = DNODE_MIN_SLOTS;
   66         } else {
   67                 ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
   68                 ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
   69         }
   70 
   71         /*
   72          * The "chunk" of dnodes that is assigned to a CPU-specific
   73          * allocator needs to be at least one block's worth, to avoid
   74          * lock contention on the dbuf.  It can be at most one L1 block's
   75  * worth, so that the "rescan after polishing off an L1's worth"
   76          * logic below will be sure to kick in.
   77          */
   78         if (dnodes_per_chunk < DNODES_PER_BLOCK)
   79                 dnodes_per_chunk = DNODES_PER_BLOCK;
   80         if (dnodes_per_chunk > L1_dnode_count)
   81                 dnodes_per_chunk = L1_dnode_count;
   82 
   83         /*
   84          * The caller requested the dnode be returned as a performance
   85          * optimization in order to avoid releasing the hold only to
   86  * immediately reacquire it.  Since the caller is responsible
   87  * for releasing the hold, they must provide the tag.
   88          */
   89         if (allocated_dnode != NULL) {
   90                 ASSERT3P(tag, !=, NULL);
   91         } else {
   92                 ASSERT3P(tag, ==, NULL);
   93                 tag = FTAG;
   94         }
   95 
   96         object = *cpuobj;
   97         for (;;) {
   98                 /*
   99                  * If we finished a chunk of dnodes, get a new one from
  100                  * the global allocator.
  101                  */
  102                 if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
  103                     (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
  104                     dn_slots)) {
  105                         DNODE_STAT_BUMP(dnode_alloc_next_chunk);
  106                         mutex_enter(&os->os_obj_lock);
  107                         ASSERT0(P2PHASE(os->os_obj_next_chunk,
  108                             dnodes_per_chunk));
  109                         object = os->os_obj_next_chunk;
  110 
  111                         /*
  112                          * Each time we polish off an L1 bp worth of dnodes
  113                          * (2^12 objects), move to another L1 bp that's
  114                          * still reasonably sparse (at most 1/4 full). Look
  115                          * from the beginning at most once per txg. If we
  116                          * still can't allocate from that L1 block, search
  117                          * for an empty L0 block, which will quickly skip
  118                          * to the end of the metadnode if no nearby L0
  119                          * blocks are empty. This fallback avoids a
  120                          * pathology where full dnode blocks containing
  121                          * large dnodes appear sparse because they have a
  122                          * low blk_fill, leading to many failed allocation
  123                          * attempts. In the long term a better mechanism to
  124                          * search for sparse metadnode regions, such as
  125                          * spacemaps, could be implemented.
  126                          *
  127                          * os_scan_dnodes is set during txg sync if enough
  128                          * objects have been freed since the previous
  129                          * rescan to justify backfilling again.
  130                          *
  131                          * Note that dmu_traverse depends on the behavior
  132                          * that we use multiple blocks of the dnode object
  133                          * before going back to reuse objects.  Any change
  134                          * to this algorithm should preserve that property
  135                          * or find another solution to the issues described
  136                          * in traverse_visitbp.
  137                          */
  138                         if (P2PHASE(object, L1_dnode_count) == 0) {
  139                                 uint64_t offset;
  140                                 uint64_t blkfill;
  141                                 int minlvl;
  142                                 if (os->os_rescan_dnodes) {
  143                                         offset = 0;
  144                                         os->os_rescan_dnodes = B_FALSE;
  145                                 } else {
  146                                         offset = object << DNODE_SHIFT;
  147                                 }
  148                                 blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
  149                                 minlvl = restarted ? 1 : 2;
  150                                 restarted = B_TRUE;
  151                                 error = dnode_next_offset(DMU_META_DNODE(os),
  152                                     DNODE_FIND_HOLE, &offset, minlvl,
  153                                     blkfill, 0);
  154                                 if (error == 0) {
  155                                         object = offset >> DNODE_SHIFT;
  156                                 }
  157                         }
  158                         /*
  159                          * Note: if "restarted", we may find an L0 that
  160                          * is not suitably aligned.
  161                          */
  162                         os->os_obj_next_chunk =
  163                             P2ALIGN(object, dnodes_per_chunk) +
  164                             dnodes_per_chunk;
  165                         (void) atomic_swap_64(cpuobj, object);
  166                         mutex_exit(&os->os_obj_lock);
  167                 }
  168 
  169                 /*
  170                  * The value of (*cpuobj) before adding dn_slots is the object
  171                  * ID assigned to us.  The value afterwards is the object ID
  172                  * assigned to whoever wants to do an allocation next.
  173                  */
  174                 object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
  175 
  176                 /*
  177                  * XXX We should check for an i/o error here and return
  178                  * up to our caller.  Actually we should pre-read it in
  179                  * dmu_tx_assign(), but there is currently no mechanism
  180                  * to do so.
  181                  */
  182                 error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
  183                     dn_slots, tag, &dn);
  184                 if (error == 0) {
  185                         rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
  186                         /*
  187                          * Another thread could have allocated it; check
  188                          * again now that we have the struct lock.
  189                          */
  190                         if (dn->dn_type == DMU_OT_NONE) {
  191                                 dnode_allocate(dn, ot, blocksize,
  192                                     indirect_blockshift, bonustype,
  193                                     bonuslen, dn_slots, tx);
  194                                 rw_exit(&dn->dn_struct_rwlock);
  195                                 dmu_tx_add_new_object(tx, dn);
  196 
  197                                 /*
  198                                  * Caller requested the allocated dnode be
  199                                  * returned and is responsible for the hold.
  200                                  */
  201                                 if (allocated_dnode != NULL)
  202                                         *allocated_dnode = dn;
  203                                 else
  204                                         dnode_rele(dn, tag);
  205 
  206                                 return (object);
  207                         }
  208                         rw_exit(&dn->dn_struct_rwlock);
  209                         dnode_rele(dn, tag);
  210                         DNODE_STAT_BUMP(dnode_alloc_race);
  211                 }
  212 
  213                 /*
  214                  * Skip to next known valid starting point on error.  This
  215                  * is the start of the next block of dnodes.
  216                  */
  217                 if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
  218                         object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
  219                         DNODE_STAT_BUMP(dnode_alloc_next_block);
  220                 }
  221                 (void) atomic_swap_64(cpuobj, object);
  222         }
  223 }
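
The loop above pairs a per-CPU cursor (os_obj_next_percpu) with a global chunk dispenser (os_obj_next_chunk): object IDs are handed out by atomically bumping the cursor, and os_obj_lock is only taken when the cursor crosses a chunk boundary. The standalone sketch below models just that refill-and-bump arithmetic with C11 atomics; P2PHASE/P2ALIGN are re-implemented locally, and the dnode hold, race detection, and rescan logic are omitted, so treat it as an illustration of the bookkeeping rather than the real allocator.

/* Sketch: per-CPU chunked ID allocation, modeled on dmu_object_alloc_impl(). */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define SK_P2PHASE(x, align)  ((x) & ((align) - 1))   /* phase within a power-of-2 chunk */
#define SK_P2ALIGN(x, align)  ((x) & -(align))        /* round down to a chunk boundary  */

static uint64_t global_next_chunk;         /* analogue of os_obj_next_chunk           */
static _Atomic uint64_t percpu_cursor;     /* analogue of one os_obj_next_percpu slot */

static uint64_t
sketch_alloc(uint64_t slots, uint64_t chunk)
{
        uint64_t obj = atomic_load(&percpu_cursor);

        /*
         * Refill from the "global allocator" when the chunk is used up or
         * when the requested slots would straddle a chunk boundary.
         */
        if (SK_P2PHASE(obj, chunk) == 0 ||
            SK_P2PHASE(obj + slots - 1, chunk) < slots) {
                obj = global_next_chunk;        /* the real code holds os_obj_lock here */
                global_next_chunk = SK_P2ALIGN(obj, chunk) + chunk;
                atomic_store(&percpu_cursor, obj);
        }

        /* The value before the add is ours; the value after is the next caller's. */
        return (atomic_fetch_add(&percpu_cursor, slots));
}

int
main(void)
{
        global_next_chunk = 128;        /* pretend chunk 0 was already handed out */
        for (int i = 0; i < 6; i++)
                printf("allocated object %llu\n",
                    (unsigned long long)sketch_alloc(1, 128));
        return (0);
}
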
  224 
  225 uint64_t
  226 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
  227     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
  228 {
  229         return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
  230             bonuslen, 0, NULL, NULL, tx);
  231 }
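
For context, a typical in-kernel caller allocates an object inside an assigned transaction. The sketch below is a rough illustration only: it assumes the usual DMU transaction calls (dmu_tx_create(), dmu_tx_hold_bonus() with DMU_NEW_OBJECT, dmu_tx_assign(), dmu_tx_commit()) behave as they do elsewhere in the DMU and that the caller already owns the objset; the object type chosen is arbitrary.

/* Sketch: allocating an object inside an assigned transaction (kernel context). */
#include <sys/dmu.h>
#include <sys/dmu_tx.h>

static int
sketch_make_object(objset_t *os, uint64_t *objp)
{
        dmu_tx_t *tx = dmu_tx_create(os);
        int err;

        dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);  /* reserve room for a brand-new dnode */
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err != 0) {
                dmu_tx_abort(tx);
                return (err);
        }

        /* blocksize 0 and a DMU_OT_NONE bonus select the defaults. */
        *objp = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
            DMU_OT_NONE, 0, tx);
        dmu_tx_commit(tx);
        return (0);
}
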
  232 
  233 uint64_t
  234 dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
  235     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
  236     dmu_tx_t *tx)
  237 {
  238         return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
  239             bonustype, bonuslen, 0, NULL, NULL, tx);
  240 }
  241 
  242 uint64_t
  243 dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
  244     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
  245 {
  246         return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
  247             bonuslen, dnodesize, NULL, NULL, tx));
  248 }
  249 
  250 /*
  251  * Allocate a new object and return a pointer to the newly allocated dnode
  252  * via the allocated_dnode argument.  The returned dnode will be held and
  253  * the caller is responsible for releasing the hold by calling dnode_rele().
  254  */
  255 uint64_t
  256 dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
  257     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
  258     int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
  259 {
  260         return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
  261             bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
  262 }
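
The tag rules asserted in dmu_object_alloc_impl() follow from this wrapper: pass a non-NULL tag exactly when you ask for the dnode back, and drop that hold yourself with dnode_rele(). A hedged sketch of the calling convention, assuming kernel context and an already-assigned tx:

/* Sketch: asking for the new dnode back and releasing the hold ourselves. */
#include <sys/dmu.h>
#include <sys/dnode.h>

static uint64_t
sketch_alloc_and_touch(objset_t *os, dmu_tx_t *tx)
{
        dnode_t *dn = NULL;
        uint64_t obj;

        /* A non-NULL tag is required because we ask for the dnode back. */
        obj = dmu_object_alloc_hold(os, DMU_OT_UINT64_OTHER, 0, 0,
            DMU_OT_NONE, 0, 0, &dn, FTAG, tx);

        /* ... work with dn directly, no extra dnode_hold() round trip ... */

        dnode_rele(dn, FTAG);   /* we own the hold, so we drop it */
        return (obj);
}
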
  263 
  264 int
  265 dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
  266     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
  267 {
  268         return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
  269             bonuslen, 0, tx));
  270 }
  271 
  272 int
  273 dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
  274     int blocksize, dmu_object_type_t bonustype, int bonuslen,
  275     int dnodesize, dmu_tx_t *tx)
  276 {
  277         dnode_t *dn;
  278         int dn_slots = dnodesize >> DNODE_SHIFT;
  279         int err;
  280 
  281         if (dn_slots == 0)
  282                 dn_slots = DNODE_MIN_SLOTS;
  283         ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
  284         ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
  285 
  286         if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
  287                 return (SET_ERROR(EBADF));
  288 
  289         err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
  290             FTAG, &dn);
  291         if (err)
  292                 return (err);
  293 
  294         dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
  295         dmu_tx_add_new_object(tx, dn);
  296 
  297         dnode_rele(dn, FTAG);
  298 
  299         return (0);
  300 }
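
Both this function and dmu_object_alloc_impl() convert the caller's dnodesize into dnode slots with a shift by DNODE_SHIFT, treating 0 as "use the minimum". A standalone arithmetic check, with the constants restated locally as assumptions:

/* Standalone sketch: dnodesize -> dnode slot count. */
#include <stdio.h>

#define SKETCH_DNODE_SHIFT      9   /* 512-byte slot size (assumed) */
#define SKETCH_DNODE_MIN_SLOTS  1

static int
sketch_dn_slots(int dnodesize)
{
        int slots = dnodesize >> SKETCH_DNODE_SHIFT;

        return (slots == 0 ? SKETCH_DNODE_MIN_SLOTS : slots);
}

int
main(void)
{
        int sizes[] = { 0, 512, 1024, 4096, 16384 };

        for (unsigned i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++)
                printf("dnodesize %5d -> %2d slot(s)\n",
                    sizes[i], sketch_dn_slots(sizes[i]));
        return (0);
}
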
  301 
  302 int
  303 dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
  304     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
  305 {
  306         return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
  307             bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
  308 }
  309 
  310 int
  311 dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
  312     int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
  313     boolean_t keep_spill, dmu_tx_t *tx)
  314 {
  315         dnode_t *dn;
  316         int dn_slots = dnodesize >> DNODE_SHIFT;
  317         int err;
  318 
  319         if (dn_slots == 0)
  320                 dn_slots = DNODE_MIN_SLOTS;
  321 
  322         if (object == DMU_META_DNODE_OBJECT)
  323                 return (SET_ERROR(EBADF));
  324 
  325         err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
  326             FTAG, &dn);
  327         if (err)
  328                 return (err);
  329 
  330         dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
  331             keep_spill, tx);
  332 
  333         dnode_rele(dn, FTAG);
  334         return (err);
  335 }
  336 
  337 int
  338 dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
  339 {
  340         dnode_t *dn;
  341         int err;
  342 
  343         err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
  344             FTAG, &dn);
  345         if (err)
  346                 return (err);
  347 
  348         rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
  349         if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
  350                 dbuf_rm_spill(dn, tx);
  351                 dnode_rm_spill(dn, tx);
  352         }
  353         rw_exit(&dn->dn_struct_rwlock);
  354 
  355         dnode_rele(dn, FTAG);
  356         return (err);
  357 }
  358 
  359 int
  360 dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
  361 {
  362         dnode_t *dn;
  363         int err;
  364 
  365         ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
  366 
  367         err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
  368             FTAG, &dn);
  369         if (err)
  370                 return (err);
  371 
  372         ASSERT(dn->dn_type != DMU_OT_NONE);
  373         /*
  374          * If we don't create this free range, we'll leak indirect blocks when
  375          * we get to freeing the dnode in syncing context.
  376          */
  377         dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
  378         dnode_free(dn, tx);
  379         dnode_rele(dn, FTAG);
  380 
  381         return (0);
  382 }
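
Freeing runs through the same transaction machinery as allocation; callers conventionally hold the object's full range in the tx before calling dmu_object_free(). A hedged kernel-context sketch, assuming dmu_tx_hold_free() and the other dmu_tx_* calls behave as they do elsewhere in the DMU:

/* Sketch: freeing an object inside an assigned transaction (kernel context). */
#include <sys/dmu.h>
#include <sys/dmu_tx.h>

static int
sketch_destroy_object(objset_t *os, uint64_t obj)
{
        dmu_tx_t *tx = dmu_tx_create(os);
        int err;

        dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);   /* cover the whole object */
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err != 0) {
                dmu_tx_abort(tx);
                return (err);
        }
        err = dmu_object_free(os, obj, tx);
        dmu_tx_commit(tx);
        return (err);
}
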
  383 
  384 /*
  385  * Return (in *objectp) the next object which is allocated (or a hole)
  386  * after *object, taking into account only objects that may have been modified
  387  * after the specified txg.
  388  */
  389 int
  390 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
  391 {
  392         uint64_t offset;
  393         uint64_t start_obj;
  394         struct dsl_dataset *ds = os->os_dsl_dataset;
  395         int error;
  396 
  397         if (*objectp == 0) {
  398                 start_obj = 1;
  399         } else if (ds && dsl_dataset_feature_is_active(ds,
  400             SPA_FEATURE_LARGE_DNODE)) {
  401                 uint64_t i = *objectp + 1;
  402                 uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
  403                 dmu_object_info_t doi;
  404 
  405                 /*
  406                  * Scan through the remaining meta dnode block.  The contents
  407                  * of each slot in the block are known so it can be quickly
  408                  * checked.  If the block is exhausted without a match then
  409                  * hand off to dnode_next_offset() for further scanning.
  410                  */
  411                 while (i <= last_obj) {
  412                         error = dmu_object_info(os, i, &doi);
  413                         if (error == ENOENT) {
  414                                 if (hole) {
  415                                         *objectp = i;
  416                                         return (0);
  417                                 } else {
  418                                         i++;
  419                                 }
  420                         } else if (error == EEXIST) {
  421                                 i++;
  422                         } else if (error == 0) {
  423                                 if (hole) {
  424                                         i += doi.doi_dnodesize >> DNODE_SHIFT;
  425                                 } else {
  426                                         *objectp = i;
  427                                         return (0);
  428                                 }
  429                         } else {
  430                                 return (error);
  431                         }
  432                 }
  433 
  434                 start_obj = i;
  435         } else {
  436                 start_obj = *objectp + 1;
  437         }
  438 
  439         offset = start_obj << DNODE_SHIFT;
  440 
  441         error = dnode_next_offset(DMU_META_DNODE(os),
  442             (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
  443 
  444         *objectp = offset >> DNODE_SHIFT;
  445 
  446         return (error);
  447 }
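
The large-dnode branch above finishes the remaining slots of the current metadnode block (32 with the usual 16 KiB block and 512-byte dnodes) using cheap per-slot lookups before handing off to dnode_next_offset() for the coarser scan. The standalone sketch below models only that hole-finding control flow over an in-memory array; slot_state() and coarse_search() are hypothetical stand-ins, not ZFS functions.

/* Sketch: scan the rest of the current block, then hand off to a coarser search. */
#include <stdint.h>
#include <stdio.h>

#define SK_SLOTS_PER_BLOCK 32   /* analogue of DNODES_PER_BLOCK (assumed) */

/* Hypothetical per-slot state: 0 = free, N > 0 = allocated dnode occupying N slots. */
static int slot_state(const int *slots, uint64_t i) { return (slots[i]); }

/* Hypothetical coarser fallback, standing in for dnode_next_offset(). */
static uint64_t coarse_search(uint64_t start) { return (start); }

static uint64_t
sketch_next_free(const int *slots, uint64_t obj)
{
        uint64_t i = obj + 1;
        uint64_t last = obj | (SK_SLOTS_PER_BLOCK - 1);   /* last slot of this block */

        while (i <= last) {
                int n = slot_state(slots, i);
                if (n == 0)
                        return (i);     /* found a hole inside the block */
                i += n;                 /* skip the whole multi-slot dnode */
        }
        return (coarse_search(i));      /* block exhausted: hand off */
}

int
main(void)
{
        int slots[64] = { 0 };

        slots[1] = 4; slots[5] = 1; slots[6] = 2;   /* a few allocated dnodes */
        printf("next free after 0: %llu\n",
            (unsigned long long)sketch_next_free(slots, 0));
        return (0);
}
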
  448 
  449 /*
  450  * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
  451  * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
  452  *
  453  * Only for use from syncing context, on MOS objects.
  454  */
  455 void
  456 dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
  457     dmu_tx_t *tx)
  458 {
  459         dnode_t *dn;
  460 
  461         ASSERT(dmu_tx_is_syncing(tx));
  462 
  463         VERIFY0(dnode_hold(mos, object, FTAG, &dn));
  464         if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
  465                 dnode_rele(dn, FTAG);
  466                 return;
  467         }
  468         ASSERT3U(dn->dn_type, ==, old_type);
  469         ASSERT0(dn->dn_maxblkid);
  470 
  471         /*
  472          * We must initialize the ZAP data before changing the type,
  473          * so that concurrent calls to *_is_zapified() can determine if
  474          * the object has been completely zapified by checking the type.
  475          */
  476         mzap_create_impl(dn, 0, 0, tx);
  477 
  478         dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
  479             DMU_OTN_ZAP_METADATA;
  480         dnode_setdirty(dn, tx);
  481         dnode_rele(dn, FTAG);
  482 
  483         spa_feature_incr(dmu_objset_spa(mos),
  484             SPA_FEATURE_EXTENSIBLE_DATASET, tx);
  485 }
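
The ordering requirement in the comment above (fully initialize the ZAP data first, change the type last, so that any reader that sees the new type sees a complete object) is the familiar initialize-then-publish pattern. Below is a standalone sketch of that pattern using C11 acquire/release atomics; the struct and helpers are illustrative stand-ins, not the MOS machinery, which relies on its own locking rather than C11 atomics.

/* Sketch: initialize fully, then publish by changing the type field last. */
#include <stdatomic.h>
#include <stdio.h>

enum { SK_OT_PLAIN, SK_OT_ZAP };

struct sk_obj {
        int data_ready;         /* stands in for the ZAP contents */
        _Atomic int type;       /* stands in for dn_type          */
};

static void
sk_zapify(struct sk_obj *o)
{
        o->data_ready = 1;                              /* initialize first ...  */
        atomic_store_explicit(&o->type, SK_OT_ZAP,
            memory_order_release);                      /* ... then publish last */
}

static int
sk_is_zapified(struct sk_obj *o)
{
        /* A reader that observes SK_OT_ZAP is guaranteed to see data_ready. */
        return (atomic_load_explicit(&o->type, memory_order_acquire) ==
            SK_OT_ZAP);
}

int
main(void)
{
        struct sk_obj o = { 0, SK_OT_PLAIN };

        sk_zapify(&o);
        printf("zapified: %d\n", sk_is_zapified(&o));
        return (0);
}
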
  486 
  487 void
  488 dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
  489 {
  490         dnode_t *dn;
  491         dmu_object_type_t t;
  492 
  493         ASSERT(dmu_tx_is_syncing(tx));
  494 
  495         VERIFY0(dnode_hold(mos, object, FTAG, &dn));
  496         t = dn->dn_type;
  497         dnode_rele(dn, FTAG);
  498 
  499         if (t == DMU_OTN_ZAP_METADATA) {
  500                 spa_feature_decr(dmu_objset_spa(mos),
  501                     SPA_FEATURE_EXTENSIBLE_DATASET, tx);
  502         }
  503         VERIFY0(dmu_object_free(mos, object, tx));
  504 }
  505 
  506 EXPORT_SYMBOL(dmu_object_alloc);
  507 EXPORT_SYMBOL(dmu_object_alloc_ibs);
  508 EXPORT_SYMBOL(dmu_object_alloc_dnsize);
  509 EXPORT_SYMBOL(dmu_object_alloc_hold);
  510 EXPORT_SYMBOL(dmu_object_claim);
  511 EXPORT_SYMBOL(dmu_object_claim_dnsize);
  512 EXPORT_SYMBOL(dmu_object_reclaim);
  513 EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
  514 EXPORT_SYMBOL(dmu_object_rm_spill);
  515 EXPORT_SYMBOL(dmu_object_free);
  516 EXPORT_SYMBOL(dmu_object_next);
  517 EXPORT_SYMBOL(dmu_object_zapify);
  518 EXPORT_SYMBOL(dmu_object_free_zapified);
  519 
  520 /* BEGIN CSTYLED */
  521 ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW,
  522         "CPU-specific allocator grabs 2^N objects at once");
  523 /* END CSTYLED */





This page is part of the FreeBSD/Linux Kernel Cross-Reference and was automatically generated using a modified version of the LXR engine.