The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/metaslab.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   23  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
   24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
   25  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
   26  * Copyright (c) 2017, Intel Corporation.
   27  */
   28 
   29 #include <sys/zfs_context.h>
   30 #include <sys/dmu.h>
   31 #include <sys/dmu_tx.h>
   32 #include <sys/space_map.h>
   33 #include <sys/metaslab_impl.h>
   34 #include <sys/vdev_impl.h>
   35 #include <sys/vdev_draid.h>
   36 #include <sys/zio.h>
   37 #include <sys/spa_impl.h>
   38 #include <sys/zfeature.h>
   39 #include <sys/vdev_indirect_mapping.h>
   40 #include <sys/zap.h>
   41 #include <sys/btree.h>
   42 
   43 #define WITH_DF_BLOCK_ALLOCATOR
   44 
   45 #define GANG_ALLOCATION(flags) \
   46         ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
   47 
   48 /*
   49  * Metaslab granularity, in bytes. This is roughly similar to what would be
   50  * referred to as the "stripe size" in traditional RAID arrays. In normal
   51  * operation, we will try to write this amount of data to each disk before
   52  * moving on to the next top-level vdev.
   53  */
   54 static uint64_t metaslab_aliquot = 1024 * 1024;
   55 
   56 /*
   57  * For testing, make some blocks above a certain size be gang blocks.
   58  */
   59 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
   60 
   61 /*
   62  * In pools where the log space map feature is not enabled we touch
   63  * multiple metaslabs (and their respective space maps) with each
   64  * transaction group. Thus, we benefit from having a small space map
   65  * block size since it allows us to issue more I/O operations scattered
   66  * around the disk. So a sane default for the space map block size
   67  * is 8~16K.
   68  */
   69 int zfs_metaslab_sm_blksz_no_log = (1 << 14);
   70 
   71 /*
   72  * When the log space map feature is enabled, we accumulate a lot of
   73  * changes per metaslab that are flushed once in a while so we benefit
   74  * from a bigger block size like 128K for the metaslab space maps.
   75  */
   76 int zfs_metaslab_sm_blksz_with_log = (1 << 17);
   77 
   78 /*
   79  * The in-core space map representation is more compact than its on-disk form.
   80  * The zfs_condense_pct determines how much more compact the in-core
   81  * space map representation must be before we compact it on-disk.
   82  * Values should be greater than or equal to 100.
   83  */
   84 uint_t zfs_condense_pct = 200;
   85 
   86 /*
   87  * Condensing a metaslab is not guaranteed to actually reduce the amount of
   88  * space used on disk. In particular, a space map uses data in increments of
   89  * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
   90  * same number of blocks after condensing. Since the goal of condensing is to
   91  * reduce the number of IOPs required to read the space map, we only want to
   92  * condense when we can be sure we will reduce the number of blocks used by the
   93  * space map. Unfortunately, we cannot precisely compute whether or not this is
   94  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
   95  * we apply the following heuristic: do not condense a spacemap unless the
   96  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
   97  * blocks.
   98  */
   99 static const int zfs_metaslab_condense_block_threshold = 4;
  100 
  101 /*
  102  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  103  * be eligible for allocation. The value is defined as a percentage of
  104  * free space. Metaslab groups that have more free space than
  105  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  106  * a metaslab group's free space is less than or equal to the
  107  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  108  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  109  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  110  * groups are allowed to accept allocations. Gang blocks are always
  111  * eligible to allocate on any metaslab group. The default value of 0 means
  112  * no metaslab group will be excluded based on this criterion.
  113  */
  114 static uint_t zfs_mg_noalloc_threshold = 0;
  115 
  116 /*
  117  * Metaslab groups are considered eligible for allocations if their
  118  * fragmentation metric (measured as a percentage) is less than or
  119  * equal to zfs_mg_fragmentation_threshold. If a metaslab group
  120  * exceeds this threshold then it will be skipped unless all metaslab
  121  * groups within the metaslab class have also crossed this threshold.
  122  *
  123  * This tunable was introduced to avoid edge cases where we continue
  124  * allocating from very fragmented disks in our pool while other, less
  125  * fragmented disks, exists. On the other hand, if all disks in the
  126  * pool are uniformly approaching the threshold, the threshold can
  127  * be a speed bump in performance, where we keep switching the disks
  128  * that we allocate from (e.g. we allocate some segments from disk A
  129  * making it bypassing the threshold while freeing segments from disk
  130  * B getting its fragmentation below the threshold).
  131  *
  132  * Empirically, we've seen that our vdev selection for allocations is
  133  * good enough that fragmentation increases uniformly across all vdevs
  134  * the majority of the time. Thus we set the threshold percentage high
  135  * enough to avoid hitting the speed bump on pools that are being pushed
  136  * to the edge.
  137  */
  138 static uint_t zfs_mg_fragmentation_threshold = 95;
  139 
  140 /*
  141  * Allow metaslabs to keep their active state as long as their fragmentation
  142  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
  143  * active metaslab that exceeds this threshold will no longer keep its active
  144  * status allowing better metaslabs to be selected.
  145  */
  146 static uint_t zfs_metaslab_fragmentation_threshold = 70;
  147 
  148 /*
  149  * When set will load all metaslabs when pool is first opened.
  150  */
  151 int metaslab_debug_load = B_FALSE;
  152 
  153 /*
  154  * When set will prevent metaslabs from being unloaded.
  155  */
  156 static int metaslab_debug_unload = B_FALSE;
  157 
  158 /*
  159  * Minimum size which forces the dynamic allocator to change
  160  * it's allocation strategy.  Once the space map cannot satisfy
  161  * an allocation of this size then it switches to using more
  162  * aggressive strategy (i.e search by size rather than offset).
  163  */
  164 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
  165 
  166 /*
  167  * The minimum free space, in percent, which must be available
  168  * in a space map to continue allocations in a first-fit fashion.
  169  * Once the space map's free space drops below this level we dynamically
  170  * switch to using best-fit allocations.
  171  */
  172 uint_t metaslab_df_free_pct = 4;
  173 
  174 /*
  175  * Maximum distance to search forward from the last offset. Without this
  176  * limit, fragmented pools can see >100,000 iterations and
  177  * metaslab_block_picker() becomes the performance limiting factor on
  178  * high-performance storage.
  179  *
  180  * With the default setting of 16MB, we typically see less than 500
  181  * iterations, even with very fragmented, ashift=9 pools. The maximum number
  182  * of iterations possible is:
  183  *     metaslab_df_max_search / (2 * (1<<ashift))
  184  * With the default setting of 16MB this is 16*1024 (with ashift=9) or
  185  * 2048 (with ashift=12).
  186  */
  187 static uint_t metaslab_df_max_search = 16 * 1024 * 1024;
  188 
  189 /*
  190  * Forces the metaslab_block_picker function to search for at least this many
  191  * segments forwards until giving up on finding a segment that the allocation
  192  * will fit into.
  193  */
  194 static const uint32_t metaslab_min_search_count = 100;
  195 
  196 /*
  197  * If we are not searching forward (due to metaslab_df_max_search,
  198  * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
  199  * controls what segment is used.  If it is set, we will use the largest free
  200  * segment.  If it is not set, we will use a segment of exactly the requested
  201  * size (or larger).
  202  */
  203 static int metaslab_df_use_largest_segment = B_FALSE;
  204 
  205 /*
  206  * Percentage of all cpus that can be used by the metaslab taskq.
  207  */
  208 int metaslab_load_pct = 50;
  209 
  210 /*
  211  * These tunables control how long a metaslab will remain loaded after the
  212  * last allocation from it.  A metaslab can't be unloaded until at least
  213  * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
  214  * have elapsed.  However, zfs_metaslab_mem_limit may cause it to be
  215  * unloaded sooner.  These settings are intended to be generous -- to keep
  216  * metaslabs loaded for a long time, reducing the rate of metaslab loading.
  217  */
  218 static uint_t metaslab_unload_delay = 32;
  219 static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
  220 
  221 /*
  222  * Max number of metaslabs per group to preload.
  223  */
  224 uint_t metaslab_preload_limit = 10;
  225 
  226 /*
  227  * Enable/disable preloading of metaslab.
  228  */
  229 static int metaslab_preload_enabled = B_TRUE;
  230 
  231 /*
  232  * Enable/disable fragmentation weighting on metaslabs.
  233  */
  234 static int metaslab_fragmentation_factor_enabled = B_TRUE;
  235 
  236 /*
  237  * Enable/disable lba weighting (i.e. outer tracks are given preference).
  238  */
  239 static int metaslab_lba_weighting_enabled = B_TRUE;
  240 
  241 /*
  242  * Enable/disable metaslab group biasing.
  243  */
  244 static int metaslab_bias_enabled = B_TRUE;
  245 
  246 /*
  247  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
  248  */
  249 static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
  250 
  251 /*
  252  * Enable/disable segment-based metaslab selection.
  253  */
  254 static int zfs_metaslab_segment_weight_enabled = B_TRUE;
  255 
  256 /*
  257  * When using segment-based metaslab selection, we will continue
  258  * allocating from the active metaslab until we have exhausted
  259  * zfs_metaslab_switch_threshold of its buckets.
  260  */
  261 static int zfs_metaslab_switch_threshold = 2;
  262 
  263 /*
  264  * Internal switch to enable/disable the metaslab allocation tracing
  265  * facility.
  266  */
  267 static const boolean_t metaslab_trace_enabled = B_FALSE;
  268 
  269 /*
  270  * Maximum entries that the metaslab allocation tracing facility will keep
  271  * in a given list when running in non-debug mode. We limit the number
  272  * of entries in non-debug mode to prevent us from using up too much memory.
  273  * The limit should be sufficiently large that we don't expect any allocation
  274  * to every exceed this value. In debug mode, the system will panic if this
  275  * limit is ever reached allowing for further investigation.
  276  */
  277 static const uint64_t metaslab_trace_max_entries = 5000;
  278 
  279 /*
  280  * Maximum number of metaslabs per group that can be disabled
  281  * simultaneously.
  282  */
  283 static const int max_disabled_ms = 3;
  284 
  285 /*
  286  * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
  287  * To avoid 64-bit overflow, don't set above UINT32_MAX.
  288  */
  289 static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */
  290 
  291 /*
  292  * Maximum percentage of memory to use on storing loaded metaslabs. If loading
  293  * a metaslab would take it over this percentage, the oldest selected metaslab
  294  * is automatically unloaded.
  295  */
  296 static uint_t zfs_metaslab_mem_limit = 25;
  297 
  298 /*
  299  * Force the per-metaslab range trees to use 64-bit integers to store
  300  * segments. Used for debugging purposes.
  301  */
  302 static const boolean_t zfs_metaslab_force_large_segs = B_FALSE;
  303 
  304 /*
  305  * By default we only store segments over a certain size in the size-sorted
  306  * metaslab trees (ms_allocatable_by_size and
  307  * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
  308  * improves load and unload times at the cost of causing us to use slightly
  309  * larger segments than we would otherwise in some cases.
  310  */
  311 static const uint32_t metaslab_by_size_min_shift = 14;
  312 
  313 /*
  314  * If not set, we will first try normal allocation.  If that fails then
  315  * we will do a gang allocation.  If that fails then we will do a "try hard"
  316  * gang allocation.  If that fails then we will have a multi-layer gang
  317  * block.
  318  *
  319  * If set, we will first try normal allocation.  If that fails then
  320  * we will do a "try hard" allocation.  If that fails we will do a gang
  321  * allocation.  If that fails we will do a "try hard" gang allocation.  If
  322  * that fails then we will have a multi-layer gang block.
  323  */
  324 static int zfs_metaslab_try_hard_before_gang = B_FALSE;
  325 
  326 /*
  327  * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
  328  * metaslabs.  This improves performance, especially when there are many
  329  * metaslabs per vdev and the allocation can't actually be satisfied (so we
  330  * would otherwise iterate all the metaslabs).  If there is a metaslab with a
  331  * worse weight but it can actually satisfy the allocation, we won't find it
  332  * until trying hard.  This may happen if the worse metaslab is not loaded
  333  * (and the true weight is better than we have calculated), or due to weight
  334  * bucketization.  E.g. we are looking for a 60K segment, and the best
  335  * metaslabs all have free segments in the 32-63K bucket, but the best
  336  * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
  337  * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
  338  * bucket, and therefore a lower weight).
  339  */
  340 static uint_t zfs_metaslab_find_max_tries = 100;
  341 
  342 static uint64_t metaslab_weight(metaslab_t *, boolean_t);
  343 static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
  344 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
  345 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
  346 
  347 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
  348 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
  349 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
  350 static unsigned int metaslab_idx_func(multilist_t *, void *);
  351 static void metaslab_evict(metaslab_t *, uint64_t);
  352 static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
  353 kmem_cache_t *metaslab_alloc_trace_cache;
  354 
  355 typedef struct metaslab_stats {
  356         kstat_named_t metaslabstat_trace_over_limit;
  357         kstat_named_t metaslabstat_reload_tree;
  358         kstat_named_t metaslabstat_too_many_tries;
  359         kstat_named_t metaslabstat_try_hard;
  360 } metaslab_stats_t;
  361 
  362 static metaslab_stats_t metaslab_stats = {
  363         { "trace_over_limit",           KSTAT_DATA_UINT64 },
  364         { "reload_tree",                KSTAT_DATA_UINT64 },
  365         { "too_many_tries",             KSTAT_DATA_UINT64 },
  366         { "try_hard",                   KSTAT_DATA_UINT64 },
  367 };
  368 
  369 #define METASLABSTAT_BUMP(stat) \
  370         atomic_inc_64(&metaslab_stats.stat.value.ui64);
  371 
  372 
  373 static kstat_t *metaslab_ksp;
  374 
  375 void
  376 metaslab_stat_init(void)
  377 {
  378         ASSERT(metaslab_alloc_trace_cache == NULL);
  379         metaslab_alloc_trace_cache = kmem_cache_create(
  380             "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
  381             0, NULL, NULL, NULL, NULL, NULL, 0);
  382         metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
  383             "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
  384             sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
  385         if (metaslab_ksp != NULL) {
  386                 metaslab_ksp->ks_data = &metaslab_stats;
  387                 kstat_install(metaslab_ksp);
  388         }
  389 }
  390 
  391 void
  392 metaslab_stat_fini(void)
  393 {
  394         if (metaslab_ksp != NULL) {
  395                 kstat_delete(metaslab_ksp);
  396                 metaslab_ksp = NULL;
  397         }
  398 
  399         kmem_cache_destroy(metaslab_alloc_trace_cache);
  400         metaslab_alloc_trace_cache = NULL;
  401 }
  402 
  403 /*
  404  * ==========================================================================
  405  * Metaslab classes
  406  * ==========================================================================
  407  */
  408 metaslab_class_t *
  409 metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops)
  410 {
  411         metaslab_class_t *mc;
  412 
  413         mc = kmem_zalloc(offsetof(metaslab_class_t,
  414             mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
  415 
  416         mc->mc_spa = spa;
  417         mc->mc_ops = ops;
  418         mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
  419         multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t),
  420             offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
  421         for (int i = 0; i < spa->spa_alloc_count; i++) {
  422                 metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
  423                 mca->mca_rotor = NULL;
  424                 zfs_refcount_create_tracked(&mca->mca_alloc_slots);
  425         }
  426 
  427         return (mc);
  428 }
  429 
  430 void
  431 metaslab_class_destroy(metaslab_class_t *mc)
  432 {
  433         spa_t *spa = mc->mc_spa;
  434 
  435         ASSERT(mc->mc_alloc == 0);
  436         ASSERT(mc->mc_deferred == 0);
  437         ASSERT(mc->mc_space == 0);
  438         ASSERT(mc->mc_dspace == 0);
  439 
  440         for (int i = 0; i < spa->spa_alloc_count; i++) {
  441                 metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
  442                 ASSERT(mca->mca_rotor == NULL);
  443                 zfs_refcount_destroy(&mca->mca_alloc_slots);
  444         }
  445         mutex_destroy(&mc->mc_lock);
  446         multilist_destroy(&mc->mc_metaslab_txg_list);
  447         kmem_free(mc, offsetof(metaslab_class_t,
  448             mc_allocator[spa->spa_alloc_count]));
  449 }
  450 
  451 int
  452 metaslab_class_validate(metaslab_class_t *mc)
  453 {
  454         metaslab_group_t *mg;
  455         vdev_t *vd;
  456 
  457         /*
  458          * Must hold one of the spa_config locks.
  459          */
  460         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
  461             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
  462 
  463         if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
  464                 return (0);
  465 
  466         do {
  467                 vd = mg->mg_vd;
  468                 ASSERT(vd->vdev_mg != NULL);
  469                 ASSERT3P(vd->vdev_top, ==, vd);
  470                 ASSERT3P(mg->mg_class, ==, mc);
  471                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
  472         } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
  473 
  474         return (0);
  475 }
  476 
  477 static void
  478 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
  479     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
  480 {
  481         atomic_add_64(&mc->mc_alloc, alloc_delta);
  482         atomic_add_64(&mc->mc_deferred, defer_delta);
  483         atomic_add_64(&mc->mc_space, space_delta);
  484         atomic_add_64(&mc->mc_dspace, dspace_delta);
  485 }
  486 
  487 uint64_t
  488 metaslab_class_get_alloc(metaslab_class_t *mc)
  489 {
  490         return (mc->mc_alloc);
  491 }
  492 
  493 uint64_t
  494 metaslab_class_get_deferred(metaslab_class_t *mc)
  495 {
  496         return (mc->mc_deferred);
  497 }
  498 
  499 uint64_t
  500 metaslab_class_get_space(metaslab_class_t *mc)
  501 {
  502         return (mc->mc_space);
  503 }
  504 
  505 uint64_t
  506 metaslab_class_get_dspace(metaslab_class_t *mc)
  507 {
  508         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
  509 }
  510 
  511 void
  512 metaslab_class_histogram_verify(metaslab_class_t *mc)
  513 {
  514         spa_t *spa = mc->mc_spa;
  515         vdev_t *rvd = spa->spa_root_vdev;
  516         uint64_t *mc_hist;
  517         int i;
  518 
  519         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
  520                 return;
  521 
  522         mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
  523             KM_SLEEP);
  524 
  525         mutex_enter(&mc->mc_lock);
  526         for (int c = 0; c < rvd->vdev_children; c++) {
  527                 vdev_t *tvd = rvd->vdev_child[c];
  528                 metaslab_group_t *mg = vdev_get_mg(tvd, mc);
  529 
  530                 /*
  531                  * Skip any holes, uninitialized top-levels, or
  532                  * vdevs that are not in this metalab class.
  533                  */
  534                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
  535                     mg->mg_class != mc) {
  536                         continue;
  537                 }
  538 
  539                 IMPLY(mg == mg->mg_vd->vdev_log_mg,
  540                     mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
  541 
  542                 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
  543                         mc_hist[i] += mg->mg_histogram[i];
  544         }
  545 
  546         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
  547                 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
  548         }
  549 
  550         mutex_exit(&mc->mc_lock);
  551         kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
  552 }
  553 
  554 /*
  555  * Calculate the metaslab class's fragmentation metric. The metric
  556  * is weighted based on the space contribution of each metaslab group.
  557  * The return value will be a number between 0 and 100 (inclusive), or
  558  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
  559  * zfs_frag_table for more information about the metric.
  560  */
  561 uint64_t
  562 metaslab_class_fragmentation(metaslab_class_t *mc)
  563 {
  564         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
  565         uint64_t fragmentation = 0;
  566 
  567         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
  568 
  569         for (int c = 0; c < rvd->vdev_children; c++) {
  570                 vdev_t *tvd = rvd->vdev_child[c];
  571                 metaslab_group_t *mg = tvd->vdev_mg;
  572 
  573                 /*
  574                  * Skip any holes, uninitialized top-levels,
  575                  * or vdevs that are not in this metalab class.
  576                  */
  577                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
  578                     mg->mg_class != mc) {
  579                         continue;
  580                 }
  581 
  582                 /*
  583                  * If a metaslab group does not contain a fragmentation
  584                  * metric then just bail out.
  585                  */
  586                 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
  587                         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
  588                         return (ZFS_FRAG_INVALID);
  589                 }
  590 
  591                 /*
  592                  * Determine how much this metaslab_group is contributing
  593                  * to the overall pool fragmentation metric.
  594                  */
  595                 fragmentation += mg->mg_fragmentation *
  596                     metaslab_group_get_space(mg);
  597         }
  598         fragmentation /= metaslab_class_get_space(mc);
  599 
  600         ASSERT3U(fragmentation, <=, 100);
  601         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
  602         return (fragmentation);
  603 }
  604 
  605 /*
  606  * Calculate the amount of expandable space that is available in
  607  * this metaslab class. If a device is expanded then its expandable
  608  * space will be the amount of allocatable space that is currently not
  609  * part of this metaslab class.
  610  */
  611 uint64_t
  612 metaslab_class_expandable_space(metaslab_class_t *mc)
  613 {
  614         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
  615         uint64_t space = 0;
  616 
  617         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
  618         for (int c = 0; c < rvd->vdev_children; c++) {
  619                 vdev_t *tvd = rvd->vdev_child[c];
  620                 metaslab_group_t *mg = tvd->vdev_mg;
  621 
  622                 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
  623                     mg->mg_class != mc) {
  624                         continue;
  625                 }
  626 
  627                 /*
  628                  * Calculate if we have enough space to add additional
  629                  * metaslabs. We report the expandable space in terms
  630                  * of the metaslab size since that's the unit of expansion.
  631                  */
  632                 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
  633                     1ULL << tvd->vdev_ms_shift);
  634         }
  635         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
  636         return (space);
  637 }
  638 
  639 void
  640 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
  641 {
  642         multilist_t *ml = &mc->mc_metaslab_txg_list;
  643         for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
  644                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
  645                 metaslab_t *msp = multilist_sublist_head(mls);
  646                 multilist_sublist_unlock(mls);
  647                 while (msp != NULL) {
  648                         mutex_enter(&msp->ms_lock);
  649 
  650                         /*
  651                          * If the metaslab has been removed from the list
  652                          * (which could happen if we were at the memory limit
  653                          * and it was evicted during this loop), then we can't
  654                          * proceed and we should restart the sublist.
  655                          */
  656                         if (!multilist_link_active(&msp->ms_class_txg_node)) {
  657                                 mutex_exit(&msp->ms_lock);
  658                                 i--;
  659                                 break;
  660                         }
  661                         mls = multilist_sublist_lock(ml, i);
  662                         metaslab_t *next_msp = multilist_sublist_next(mls, msp);
  663                         multilist_sublist_unlock(mls);
  664                         if (txg >
  665                             msp->ms_selected_txg + metaslab_unload_delay &&
  666                             gethrtime() > msp->ms_selected_time +
  667                             (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
  668                                 metaslab_evict(msp, txg);
  669                         } else {
  670                                 /*
  671                                  * Once we've hit a metaslab selected too
  672                                  * recently to evict, we're done evicting for
  673                                  * now.
  674                                  */
  675                                 mutex_exit(&msp->ms_lock);
  676                                 break;
  677                         }
  678                         mutex_exit(&msp->ms_lock);
  679                         msp = next_msp;
  680                 }
  681         }
  682 }
  683 
  684 static int
  685 metaslab_compare(const void *x1, const void *x2)
  686 {
  687         const metaslab_t *m1 = (const metaslab_t *)x1;
  688         const metaslab_t *m2 = (const metaslab_t *)x2;
  689 
  690         int sort1 = 0;
  691         int sort2 = 0;
  692         if (m1->ms_allocator != -1 && m1->ms_primary)
  693                 sort1 = 1;
  694         else if (m1->ms_allocator != -1 && !m1->ms_primary)
  695                 sort1 = 2;
  696         if (m2->ms_allocator != -1 && m2->ms_primary)
  697                 sort2 = 1;
  698         else if (m2->ms_allocator != -1 && !m2->ms_primary)
  699                 sort2 = 2;
  700 
  701         /*
  702          * Sort inactive metaslabs first, then primaries, then secondaries. When
  703          * selecting a metaslab to allocate from, an allocator first tries its
  704          * primary, then secondary active metaslab. If it doesn't have active
  705          * metaslabs, or can't allocate from them, it searches for an inactive
  706          * metaslab to activate. If it can't find a suitable one, it will steal
  707          * a primary or secondary metaslab from another allocator.
  708          */
  709         if (sort1 < sort2)
  710                 return (-1);
  711         if (sort1 > sort2)
  712                 return (1);
  713 
  714         int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
  715         if (likely(cmp))
  716                 return (cmp);
  717 
  718         IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
  719 
  720         return (TREE_CMP(m1->ms_start, m2->ms_start));
  721 }
  722 
  723 /*
  724  * ==========================================================================
  725  * Metaslab groups
  726  * ==========================================================================
  727  */
  728 /*
  729  * Update the allocatable flag and the metaslab group's capacity.
  730  * The allocatable flag is set to true if the capacity is below
  731  * the zfs_mg_noalloc_threshold or has a fragmentation value that is
  732  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
  733  * transitions from allocatable to non-allocatable or vice versa then the
  734  * metaslab group's class is updated to reflect the transition.
  735  */
  736 static void
  737 metaslab_group_alloc_update(metaslab_group_t *mg)
  738 {
  739         vdev_t *vd = mg->mg_vd;
  740         metaslab_class_t *mc = mg->mg_class;
  741         vdev_stat_t *vs = &vd->vdev_stat;
  742         boolean_t was_allocatable;
  743         boolean_t was_initialized;
  744 
  745         ASSERT(vd == vd->vdev_top);
  746         ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
  747             SCL_ALLOC);
  748 
  749         mutex_enter(&mg->mg_lock);
  750         was_allocatable = mg->mg_allocatable;
  751         was_initialized = mg->mg_initialized;
  752 
  753         mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
  754             (vs->vs_space + 1);
  755 
  756         mutex_enter(&mc->mc_lock);
  757 
  758         /*
  759          * If the metaslab group was just added then it won't
  760          * have any space until we finish syncing out this txg.
  761          * At that point we will consider it initialized and available
  762          * for allocations.  We also don't consider non-activated
  763          * metaslab groups (e.g. vdevs that are in the middle of being removed)
  764          * to be initialized, because they can't be used for allocation.
  765          */
  766         mg->mg_initialized = metaslab_group_initialized(mg);
  767         if (!was_initialized && mg->mg_initialized) {
  768                 mc->mc_groups++;
  769         } else if (was_initialized && !mg->mg_initialized) {
  770                 ASSERT3U(mc->mc_groups, >, 0);
  771                 mc->mc_groups--;
  772         }
  773         if (mg->mg_initialized)
  774                 mg->mg_no_free_space = B_FALSE;
  775 
  776         /*
  777          * A metaslab group is considered allocatable if it has plenty
  778          * of free space or is not heavily fragmented. We only take
  779          * fragmentation into account if the metaslab group has a valid
  780          * fragmentation metric (i.e. a value between 0 and 100).
  781          */
  782         mg->mg_allocatable = (mg->mg_activation_count > 0 &&
  783             mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
  784             (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
  785             mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
  786 
  787         /*
  788          * The mc_alloc_groups maintains a count of the number of
  789          * groups in this metaslab class that are still above the
  790          * zfs_mg_noalloc_threshold. This is used by the allocating
  791          * threads to determine if they should avoid allocations to
  792          * a given group. The allocator will avoid allocations to a group
  793          * if that group has reached or is below the zfs_mg_noalloc_threshold
  794          * and there are still other groups that are above the threshold.
  795          * When a group transitions from allocatable to non-allocatable or
  796          * vice versa we update the metaslab class to reflect that change.
  797          * When the mc_alloc_groups value drops to 0 that means that all
  798          * groups have reached the zfs_mg_noalloc_threshold making all groups
  799          * eligible for allocations. This effectively means that all devices
  800          * are balanced again.
  801          */
  802         if (was_allocatable && !mg->mg_allocatable)
  803                 mc->mc_alloc_groups--;
  804         else if (!was_allocatable && mg->mg_allocatable)
  805                 mc->mc_alloc_groups++;
  806         mutex_exit(&mc->mc_lock);
  807 
  808         mutex_exit(&mg->mg_lock);
  809 }
  810 
  811 int
  812 metaslab_sort_by_flushed(const void *va, const void *vb)
  813 {
  814         const metaslab_t *a = va;
  815         const metaslab_t *b = vb;
  816 
  817         int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
  818         if (likely(cmp))
  819                 return (cmp);
  820 
  821         uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
  822         uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
  823         cmp = TREE_CMP(a_vdev_id, b_vdev_id);
  824         if (cmp)
  825                 return (cmp);
  826 
  827         return (TREE_CMP(a->ms_id, b->ms_id));
  828 }
  829 
  830 metaslab_group_t *
  831 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
  832 {
  833         metaslab_group_t *mg;
  834 
  835         mg = kmem_zalloc(offsetof(metaslab_group_t,
  836             mg_allocator[allocators]), KM_SLEEP);
  837         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
  838         mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
  839         cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
  840         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
  841             sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
  842         mg->mg_vd = vd;
  843         mg->mg_class = mc;
  844         mg->mg_activation_count = 0;
  845         mg->mg_initialized = B_FALSE;
  846         mg->mg_no_free_space = B_TRUE;
  847         mg->mg_allocators = allocators;
  848 
  849         for (int i = 0; i < allocators; i++) {
  850                 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
  851                 zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
  852         }
  853 
  854         mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
  855             maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
  856 
  857         return (mg);
  858 }
  859 
  860 void
  861 metaslab_group_destroy(metaslab_group_t *mg)
  862 {
  863         ASSERT(mg->mg_prev == NULL);
  864         ASSERT(mg->mg_next == NULL);
  865         /*
  866          * We may have gone below zero with the activation count
  867          * either because we never activated in the first place or
  868          * because we're done, and possibly removing the vdev.
  869          */
  870         ASSERT(mg->mg_activation_count <= 0);
  871 
  872         taskq_destroy(mg->mg_taskq);
  873         avl_destroy(&mg->mg_metaslab_tree);
  874         mutex_destroy(&mg->mg_lock);
  875         mutex_destroy(&mg->mg_ms_disabled_lock);
  876         cv_destroy(&mg->mg_ms_disabled_cv);
  877 
  878         for (int i = 0; i < mg->mg_allocators; i++) {
  879                 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
  880                 zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
  881         }
  882         kmem_free(mg, offsetof(metaslab_group_t,
  883             mg_allocator[mg->mg_allocators]));
  884 }
  885 
  886 void
  887 metaslab_group_activate(metaslab_group_t *mg)
  888 {
  889         metaslab_class_t *mc = mg->mg_class;
  890         spa_t *spa = mc->mc_spa;
  891         metaslab_group_t *mgprev, *mgnext;
  892 
  893         ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
  894 
  895         ASSERT(mg->mg_prev == NULL);
  896         ASSERT(mg->mg_next == NULL);
  897         ASSERT(mg->mg_activation_count <= 0);
  898 
  899         if (++mg->mg_activation_count <= 0)
  900                 return;
  901 
  902         mg->mg_aliquot = metaslab_aliquot * MAX(1,
  903             vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
  904         metaslab_group_alloc_update(mg);
  905 
  906         if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
  907                 mg->mg_prev = mg;
  908                 mg->mg_next = mg;
  909         } else {
  910                 mgnext = mgprev->mg_next;
  911                 mg->mg_prev = mgprev;
  912                 mg->mg_next = mgnext;
  913                 mgprev->mg_next = mg;
  914                 mgnext->mg_prev = mg;
  915         }
  916         for (int i = 0; i < spa->spa_alloc_count; i++) {
  917                 mc->mc_allocator[i].mca_rotor = mg;
  918                 mg = mg->mg_next;
  919         }
  920 }
  921 
  922 /*
  923  * Passivate a metaslab group and remove it from the allocation rotor.
  924  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
  925  * a metaslab group. This function will momentarily drop spa_config_locks
  926  * that are lower than the SCL_ALLOC lock (see comment below).
  927  */
  928 void
  929 metaslab_group_passivate(metaslab_group_t *mg)
  930 {
  931         metaslab_class_t *mc = mg->mg_class;
  932         spa_t *spa = mc->mc_spa;
  933         metaslab_group_t *mgprev, *mgnext;
  934         int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
  935 
  936         ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
  937             (SCL_ALLOC | SCL_ZIO));
  938 
  939         if (--mg->mg_activation_count != 0) {
  940                 for (int i = 0; i < spa->spa_alloc_count; i++)
  941                         ASSERT(mc->mc_allocator[i].mca_rotor != mg);
  942                 ASSERT(mg->mg_prev == NULL);
  943                 ASSERT(mg->mg_next == NULL);
  944                 ASSERT(mg->mg_activation_count < 0);
  945                 return;
  946         }
  947 
  948         /*
  949          * The spa_config_lock is an array of rwlocks, ordered as
  950          * follows (from highest to lowest):
  951          *      SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
  952          *      SCL_ZIO > SCL_FREE > SCL_VDEV
  953          * (For more information about the spa_config_lock see spa_misc.c)
  954          * The higher the lock, the broader its coverage. When we passivate
  955          * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
  956          * config locks. However, the metaslab group's taskq might be trying
  957          * to preload metaslabs so we must drop the SCL_ZIO lock and any
  958          * lower locks to allow the I/O to complete. At a minimum,
  959          * we continue to hold the SCL_ALLOC lock, which prevents any future
  960          * allocations from taking place and any changes to the vdev tree.
  961          */
  962         spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
  963         taskq_wait_outstanding(mg->mg_taskq, 0);
  964         spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
  965         metaslab_group_alloc_update(mg);
  966         for (int i = 0; i < mg->mg_allocators; i++) {
  967                 metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
  968                 metaslab_t *msp = mga->mga_primary;
  969                 if (msp != NULL) {
  970                         mutex_enter(&msp->ms_lock);
  971                         metaslab_passivate(msp,
  972                             metaslab_weight_from_range_tree(msp));
  973                         mutex_exit(&msp->ms_lock);
  974                 }
  975                 msp = mga->mga_secondary;
  976                 if (msp != NULL) {
  977                         mutex_enter(&msp->ms_lock);
  978                         metaslab_passivate(msp,
  979                             metaslab_weight_from_range_tree(msp));
  980                         mutex_exit(&msp->ms_lock);
  981                 }
  982         }
  983 
  984         mgprev = mg->mg_prev;
  985         mgnext = mg->mg_next;
  986 
  987         if (mg == mgnext) {
  988                 mgnext = NULL;
  989         } else {
  990                 mgprev->mg_next = mgnext;
  991                 mgnext->mg_prev = mgprev;
  992         }
  993         for (int i = 0; i < spa->spa_alloc_count; i++) {
  994                 if (mc->mc_allocator[i].mca_rotor == mg)
  995                         mc->mc_allocator[i].mca_rotor = mgnext;
  996         }
  997 
  998         mg->mg_prev = NULL;
  999         mg->mg_next = NULL;
 1000 }
 1001 
 1002 boolean_t
 1003 metaslab_group_initialized(metaslab_group_t *mg)
 1004 {
 1005         vdev_t *vd = mg->mg_vd;
 1006         vdev_stat_t *vs = &vd->vdev_stat;
 1007 
 1008         return (vs->vs_space != 0 && mg->mg_activation_count > 0);
 1009 }
 1010 
 1011 uint64_t
 1012 metaslab_group_get_space(metaslab_group_t *mg)
 1013 {
 1014         /*
 1015          * Note that the number of nodes in mg_metaslab_tree may be one less
 1016          * than vdev_ms_count, due to the embedded log metaslab.
 1017          */
 1018         mutex_enter(&mg->mg_lock);
 1019         uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
 1020         mutex_exit(&mg->mg_lock);
 1021         return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
 1022 }
 1023 
 1024 void
 1025 metaslab_group_histogram_verify(metaslab_group_t *mg)
 1026 {
 1027         uint64_t *mg_hist;
 1028         avl_tree_t *t = &mg->mg_metaslab_tree;
 1029         uint64_t ashift = mg->mg_vd->vdev_ashift;
 1030 
 1031         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 1032                 return;
 1033 
 1034         mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 1035             KM_SLEEP);
 1036 
 1037         ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 1038             SPACE_MAP_HISTOGRAM_SIZE + ashift);
 1039 
 1040         mutex_enter(&mg->mg_lock);
 1041         for (metaslab_t *msp = avl_first(t);
 1042             msp != NULL; msp = AVL_NEXT(t, msp)) {
 1043                 VERIFY3P(msp->ms_group, ==, mg);
 1044                 /* skip if not active */
 1045                 if (msp->ms_sm == NULL)
 1046                         continue;
 1047 
 1048                 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 1049                         mg_hist[i + ashift] +=
 1050                             msp->ms_sm->sm_phys->smp_histogram[i];
 1051                 }
 1052         }
 1053 
 1054         for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
 1055                 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 1056 
 1057         mutex_exit(&mg->mg_lock);
 1058 
 1059         kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 1060 }
 1061 
 1062 static void
 1063 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 1064 {
 1065         metaslab_class_t *mc = mg->mg_class;
 1066         uint64_t ashift = mg->mg_vd->vdev_ashift;
 1067 
 1068         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1069         if (msp->ms_sm == NULL)
 1070                 return;
 1071 
 1072         mutex_enter(&mg->mg_lock);
 1073         mutex_enter(&mc->mc_lock);
 1074         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 1075                 IMPLY(mg == mg->mg_vd->vdev_log_mg,
 1076                     mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 1077                 mg->mg_histogram[i + ashift] +=
 1078                     msp->ms_sm->sm_phys->smp_histogram[i];
 1079                 mc->mc_histogram[i + ashift] +=
 1080                     msp->ms_sm->sm_phys->smp_histogram[i];
 1081         }
 1082         mutex_exit(&mc->mc_lock);
 1083         mutex_exit(&mg->mg_lock);
 1084 }
 1085 
 1086 void
 1087 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 1088 {
 1089         metaslab_class_t *mc = mg->mg_class;
 1090         uint64_t ashift = mg->mg_vd->vdev_ashift;
 1091 
 1092         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1093         if (msp->ms_sm == NULL)
 1094                 return;
 1095 
 1096         mutex_enter(&mg->mg_lock);
 1097         mutex_enter(&mc->mc_lock);
 1098         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 1099                 ASSERT3U(mg->mg_histogram[i + ashift], >=,
 1100                     msp->ms_sm->sm_phys->smp_histogram[i]);
 1101                 ASSERT3U(mc->mc_histogram[i + ashift], >=,
 1102                     msp->ms_sm->sm_phys->smp_histogram[i]);
 1103                 IMPLY(mg == mg->mg_vd->vdev_log_mg,
 1104                     mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 1105 
 1106                 mg->mg_histogram[i + ashift] -=
 1107                     msp->ms_sm->sm_phys->smp_histogram[i];
 1108                 mc->mc_histogram[i + ashift] -=
 1109                     msp->ms_sm->sm_phys->smp_histogram[i];
 1110         }
 1111         mutex_exit(&mc->mc_lock);
 1112         mutex_exit(&mg->mg_lock);
 1113 }
 1114 
 1115 static void
 1116 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 1117 {
 1118         ASSERT(msp->ms_group == NULL);
 1119         mutex_enter(&mg->mg_lock);
 1120         msp->ms_group = mg;
 1121         msp->ms_weight = 0;
 1122         avl_add(&mg->mg_metaslab_tree, msp);
 1123         mutex_exit(&mg->mg_lock);
 1124 
 1125         mutex_enter(&msp->ms_lock);
 1126         metaslab_group_histogram_add(mg, msp);
 1127         mutex_exit(&msp->ms_lock);
 1128 }
 1129 
 1130 static void
 1131 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 1132 {
 1133         mutex_enter(&msp->ms_lock);
 1134         metaslab_group_histogram_remove(mg, msp);
 1135         mutex_exit(&msp->ms_lock);
 1136 
 1137         mutex_enter(&mg->mg_lock);
 1138         ASSERT(msp->ms_group == mg);
 1139         avl_remove(&mg->mg_metaslab_tree, msp);
 1140 
 1141         metaslab_class_t *mc = msp->ms_group->mg_class;
 1142         multilist_sublist_t *mls =
 1143             multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 1144         if (multilist_link_active(&msp->ms_class_txg_node))
 1145                 multilist_sublist_remove(mls, msp);
 1146         multilist_sublist_unlock(mls);
 1147 
 1148         msp->ms_group = NULL;
 1149         mutex_exit(&mg->mg_lock);
 1150 }
 1151 
 1152 static void
 1153 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 1154 {
 1155         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1156         ASSERT(MUTEX_HELD(&mg->mg_lock));
 1157         ASSERT(msp->ms_group == mg);
 1158 
 1159         avl_remove(&mg->mg_metaslab_tree, msp);
 1160         msp->ms_weight = weight;
 1161         avl_add(&mg->mg_metaslab_tree, msp);
 1162 
 1163 }
 1164 
 1165 static void
 1166 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 1167 {
 1168         /*
 1169          * Although in principle the weight can be any value, in
 1170          * practice we do not use values in the range [1, 511].
 1171          */
 1172         ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 1173         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1174 
 1175         mutex_enter(&mg->mg_lock);
 1176         metaslab_group_sort_impl(mg, msp, weight);
 1177         mutex_exit(&mg->mg_lock);
 1178 }
 1179 
 1180 /*
 1181  * Calculate the fragmentation for a given metaslab group. We can use
 1182  * a simple average here since all metaslabs within the group must have
 1183  * the same size. The return value will be a value between 0 and 100
 1184  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
 1185  * group have a fragmentation metric.
 1186  */
 1187 uint64_t
 1188 metaslab_group_fragmentation(metaslab_group_t *mg)
 1189 {
 1190         vdev_t *vd = mg->mg_vd;
 1191         uint64_t fragmentation = 0;
 1192         uint64_t valid_ms = 0;
 1193 
 1194         for (int m = 0; m < vd->vdev_ms_count; m++) {
 1195                 metaslab_t *msp = vd->vdev_ms[m];
 1196 
 1197                 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
 1198                         continue;
 1199                 if (msp->ms_group != mg)
 1200                         continue;
 1201 
 1202                 valid_ms++;
 1203                 fragmentation += msp->ms_fragmentation;
 1204         }
 1205 
 1206         if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
 1207                 return (ZFS_FRAG_INVALID);
 1208 
 1209         fragmentation /= valid_ms;
 1210         ASSERT3U(fragmentation, <=, 100);
 1211         return (fragmentation);
 1212 }
 1213 
 1214 /*
 1215  * Determine if a given metaslab group should skip allocations. A metaslab
 1216  * group should avoid allocations if its free capacity is less than the
 1217  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 1218  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 1219  * that can still handle allocations. If the allocation throttle is enabled
 1220  * then we skip allocations to devices that have reached their maximum
 1221  * allocation queue depth unless the selected metaslab group is the only
 1222  * eligible group remaining.
 1223  */
 1224 static boolean_t
 1225 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
 1226     int flags, uint64_t psize, int allocator, int d)
 1227 {
 1228         spa_t *spa = mg->mg_vd->vdev_spa;
 1229         metaslab_class_t *mc = mg->mg_class;
 1230 
 1231         /*
 1232          * We can only consider skipping this metaslab group if it's
 1233          * in the normal metaslab class and there are other metaslab
 1234          * groups to select from. Otherwise, we always consider it eligible
 1235          * for allocations.
 1236          */
 1237         if ((mc != spa_normal_class(spa) &&
 1238             mc != spa_special_class(spa) &&
 1239             mc != spa_dedup_class(spa)) ||
 1240             mc->mc_groups <= 1)
 1241                 return (B_TRUE);
 1242 
 1243         /*
 1244          * If the metaslab group's mg_allocatable flag is set (see comments
 1245          * in metaslab_group_alloc_update() for more information) and
 1246          * the allocation throttle is disabled then allow allocations to this
 1247          * device. However, if the allocation throttle is enabled then
 1248          * check if we have reached our allocation limit (mga_alloc_queue_depth)
 1249          * to determine if we should allow allocations to this metaslab group.
 1250          * If all metaslab groups are no longer considered allocatable
 1251          * (mc_alloc_groups == 0) or we're trying to allocate the smallest
 1252          * gang block size then we allow allocations on this metaslab group
 1253          * regardless of the mg_allocatable or throttle settings.
 1254          */
 1255         if (mg->mg_allocatable) {
 1256                 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 1257                 int64_t qdepth;
 1258                 uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
 1259 
 1260                 if (!mc->mc_alloc_throttle_enabled)
 1261                         return (B_TRUE);
 1262 
 1263                 /*
 1264                  * If this metaslab group does not have any free space, then
 1265                  * there is no point in looking further.
 1266                  */
 1267                 if (mg->mg_no_free_space)
 1268                         return (B_FALSE);
 1269 
 1270                 /*
 1271                  * Some allocations (e.g., those coming from device removal
 1272                  * where the * allocations are not even counted in the
 1273                  * metaslab * allocation queues) are allowed to bypass
 1274                  * the throttle.
 1275                  */
 1276                 if (flags & METASLAB_DONT_THROTTLE)
 1277                         return (B_TRUE);
 1278 
 1279                 /*
 1280                  * Relax allocation throttling for ditto blocks.  Due to
 1281                  * random imbalances in allocation it tends to push copies
 1282                  * to one vdev, that looks a bit better at the moment.
 1283                  */
 1284                 qmax = qmax * (4 + d) / 4;
 1285 
 1286                 qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
 1287 
 1288                 /*
 1289                  * If this metaslab group is below its qmax or it's
 1290                  * the only allocatable metasable group, then attempt
 1291                  * to allocate from it.
 1292                  */
 1293                 if (qdepth < qmax || mc->mc_alloc_groups == 1)
 1294                         return (B_TRUE);
 1295                 ASSERT3U(mc->mc_alloc_groups, >, 1);
 1296 
 1297                 /*
 1298                  * Since this metaslab group is at or over its qmax, we
 1299                  * need to determine if there are metaslab groups after this
 1300                  * one that might be able to handle this allocation. This is
 1301                  * racy since we can't hold the locks for all metaslab
 1302                  * groups at the same time when we make this check.
 1303                  */
 1304                 for (metaslab_group_t *mgp = mg->mg_next;
 1305                     mgp != rotor; mgp = mgp->mg_next) {
 1306                         metaslab_group_allocator_t *mgap =
 1307                             &mgp->mg_allocator[allocator];
 1308                         qmax = mgap->mga_cur_max_alloc_queue_depth;
 1309                         qmax = qmax * (4 + d) / 4;
 1310                         qdepth =
 1311                             zfs_refcount_count(&mgap->mga_alloc_queue_depth);
 1312 
 1313                         /*
 1314                          * If there is another metaslab group that
 1315                          * might be able to handle the allocation, then
 1316                          * we return false so that we skip this group.
 1317                          */
 1318                         if (qdepth < qmax && !mgp->mg_no_free_space)
 1319                                 return (B_FALSE);
 1320                 }
 1321 
 1322                 /*
 1323                  * We didn't find another group to handle the allocation
 1324                  * so we can't skip this metaslab group even though
 1325                  * we are at or over our qmax.
 1326                  */
 1327                 return (B_TRUE);
 1328 
 1329         } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
 1330                 return (B_TRUE);
 1331         }
 1332         return (B_FALSE);
 1333 }
 1334 
 1335 /*
 1336  * ==========================================================================
 1337  * Range tree callbacks
 1338  * ==========================================================================
 1339  */
 1340 
 1341 /*
 1342  * Comparison function for the private size-ordered tree using 32-bit
 1343  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
 1344  */
 1345 static int
 1346 metaslab_rangesize32_compare(const void *x1, const void *x2)
 1347 {
 1348         const range_seg32_t *r1 = x1;
 1349         const range_seg32_t *r2 = x2;
 1350 
 1351         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 1352         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 1353 
 1354         int cmp = TREE_CMP(rs_size1, rs_size2);
 1355         if (likely(cmp))
 1356                 return (cmp);
 1357 
 1358         return (TREE_CMP(r1->rs_start, r2->rs_start));
 1359 }
 1360 
 1361 /*
 1362  * Comparison function for the private size-ordered tree using 64-bit
 1363  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
 1364  */
 1365 static int
 1366 metaslab_rangesize64_compare(const void *x1, const void *x2)
 1367 {
 1368         const range_seg64_t *r1 = x1;
 1369         const range_seg64_t *r2 = x2;
 1370 
 1371         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 1372         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 1373 
 1374         int cmp = TREE_CMP(rs_size1, rs_size2);
 1375         if (likely(cmp))
 1376                 return (cmp);
 1377 
 1378         return (TREE_CMP(r1->rs_start, r2->rs_start));
 1379 }
 1380 typedef struct metaslab_rt_arg {
 1381         zfs_btree_t *mra_bt;
 1382         uint32_t mra_floor_shift;
 1383 } metaslab_rt_arg_t;
 1384 
 1385 struct mssa_arg {
 1386         range_tree_t *rt;
 1387         metaslab_rt_arg_t *mra;
 1388 };
 1389 
 1390 static void
 1391 metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
 1392 {
 1393         struct mssa_arg *mssap = arg;
 1394         range_tree_t *rt = mssap->rt;
 1395         metaslab_rt_arg_t *mrap = mssap->mra;
 1396         range_seg_max_t seg = {0};
 1397         rs_set_start(&seg, rt, start);
 1398         rs_set_end(&seg, rt, start + size);
 1399         metaslab_rt_add(rt, &seg, mrap);
 1400 }
 1401 
 1402 static void
 1403 metaslab_size_tree_full_load(range_tree_t *rt)
 1404 {
 1405         metaslab_rt_arg_t *mrap = rt->rt_arg;
 1406         METASLABSTAT_BUMP(metaslabstat_reload_tree);
 1407         ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
 1408         mrap->mra_floor_shift = 0;
 1409         struct mssa_arg arg = {0};
 1410         arg.rt = rt;
 1411         arg.mra = mrap;
 1412         range_tree_walk(rt, metaslab_size_sorted_add, &arg);
 1413 }
 1414 
 1415 /*
 1416  * Create any block allocator specific components. The current allocators
 1417  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 1418  */
 1419 static void
 1420 metaslab_rt_create(range_tree_t *rt, void *arg)
 1421 {
 1422         metaslab_rt_arg_t *mrap = arg;
 1423         zfs_btree_t *size_tree = mrap->mra_bt;
 1424 
 1425         size_t size;
 1426         int (*compare) (const void *, const void *);
 1427         switch (rt->rt_type) {
 1428         case RANGE_SEG32:
 1429                 size = sizeof (range_seg32_t);
 1430                 compare = metaslab_rangesize32_compare;
 1431                 break;
 1432         case RANGE_SEG64:
 1433                 size = sizeof (range_seg64_t);
 1434                 compare = metaslab_rangesize64_compare;
 1435                 break;
 1436         default:
 1437                 panic("Invalid range seg type %d", rt->rt_type);
 1438         }
 1439         zfs_btree_create(size_tree, compare, size);
 1440         mrap->mra_floor_shift = metaslab_by_size_min_shift;
 1441 }
 1442 
 1443 static void
 1444 metaslab_rt_destroy(range_tree_t *rt, void *arg)
 1445 {
 1446         (void) rt;
 1447         metaslab_rt_arg_t *mrap = arg;
 1448         zfs_btree_t *size_tree = mrap->mra_bt;
 1449 
 1450         zfs_btree_destroy(size_tree);
 1451         kmem_free(mrap, sizeof (*mrap));
 1452 }
 1453 
 1454 static void
 1455 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 1456 {
 1457         metaslab_rt_arg_t *mrap = arg;
 1458         zfs_btree_t *size_tree = mrap->mra_bt;
 1459 
 1460         if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
 1461             (1ULL << mrap->mra_floor_shift))
 1462                 return;
 1463 
 1464         zfs_btree_add(size_tree, rs);
 1465 }
 1466 
 1467 static void
 1468 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 1469 {
 1470         metaslab_rt_arg_t *mrap = arg;
 1471         zfs_btree_t *size_tree = mrap->mra_bt;
 1472 
 1473         if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL <<
 1474             mrap->mra_floor_shift))
 1475                 return;
 1476 
 1477         zfs_btree_remove(size_tree, rs);
 1478 }
 1479 
 1480 static void
 1481 metaslab_rt_vacate(range_tree_t *rt, void *arg)
 1482 {
 1483         metaslab_rt_arg_t *mrap = arg;
 1484         zfs_btree_t *size_tree = mrap->mra_bt;
 1485         zfs_btree_clear(size_tree);
 1486         zfs_btree_destroy(size_tree);
 1487 
 1488         metaslab_rt_create(rt, arg);
 1489 }
 1490 
 1491 static const range_tree_ops_t metaslab_rt_ops = {
 1492         .rtop_create = metaslab_rt_create,
 1493         .rtop_destroy = metaslab_rt_destroy,
 1494         .rtop_add = metaslab_rt_add,
 1495         .rtop_remove = metaslab_rt_remove,
 1496         .rtop_vacate = metaslab_rt_vacate
 1497 };
 1498 
 1499 /*
 1500  * ==========================================================================
 1501  * Common allocator routines
 1502  * ==========================================================================
 1503  */
 1504 
 1505 /*
 1506  * Return the maximum contiguous segment within the metaslab.
 1507  */
 1508 uint64_t
 1509 metaslab_largest_allocatable(metaslab_t *msp)
 1510 {
 1511         zfs_btree_t *t = &msp->ms_allocatable_by_size;
 1512         range_seg_t *rs;
 1513 
 1514         if (t == NULL)
 1515                 return (0);
 1516         if (zfs_btree_numnodes(t) == 0)
 1517                 metaslab_size_tree_full_load(msp->ms_allocatable);
 1518 
 1519         rs = zfs_btree_last(t, NULL);
 1520         if (rs == NULL)
 1521                 return (0);
 1522 
 1523         return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
 1524             msp->ms_allocatable));
 1525 }
 1526 
 1527 /*
 1528  * Return the maximum contiguous segment within the unflushed frees of this
 1529  * metaslab.
 1530  */
 1531 static uint64_t
 1532 metaslab_largest_unflushed_free(metaslab_t *msp)
 1533 {
 1534         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1535 
 1536         if (msp->ms_unflushed_frees == NULL)
 1537                 return (0);
 1538 
 1539         if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
 1540                 metaslab_size_tree_full_load(msp->ms_unflushed_frees);
 1541         range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
 1542             NULL);
 1543         if (rs == NULL)
 1544                 return (0);
 1545 
 1546         /*
 1547          * When a range is freed from the metaslab, that range is added to
 1548          * both the unflushed frees and the deferred frees. While the block
 1549          * will eventually be usable, if the metaslab were loaded the range
 1550          * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
 1551          * txgs had passed.  As a result, when attempting to estimate an upper
 1552          * bound for the largest currently-usable free segment in the
 1553          * metaslab, we need to not consider any ranges currently in the defer
 1554          * trees. This algorithm approximates the largest available chunk in
 1555          * the largest range in the unflushed_frees tree by taking the first
 1556          * chunk.  While this may be a poor estimate, it should only remain so
 1557          * briefly and should eventually self-correct as frees are no longer
 1558          * deferred. Similar logic applies to the ms_freed tree. See
 1559          * metaslab_load() for more details.
 1560          *
 1561          * There are two primary sources of inaccuracy in this estimate. Both
 1562          * are tolerated for performance reasons. The first source is that we
 1563          * only check the largest segment for overlaps. Smaller segments may
 1564          * have more favorable overlaps with the other trees, resulting in
 1565          * larger usable chunks.  Second, we only look at the first chunk in
 1566          * the largest segment; there may be other usable chunks in the
 1567          * largest segment, but we ignore them.
 1568          */
 1569         uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
 1570         uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
 1571         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 1572                 uint64_t start = 0;
 1573                 uint64_t size = 0;
 1574                 boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
 1575                     rsize, &start, &size);
 1576                 if (found) {
 1577                         if (rstart == start)
 1578                                 return (0);
 1579                         rsize = start - rstart;
 1580                 }
 1581         }
 1582 
 1583         uint64_t start = 0;
 1584         uint64_t size = 0;
 1585         boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
 1586             rsize, &start, &size);
 1587         if (found)
 1588                 rsize = start - rstart;
 1589 
 1590         return (rsize);
 1591 }
 1592 
 1593 static range_seg_t *
 1594 metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
 1595     uint64_t size, zfs_btree_index_t *where)
 1596 {
 1597         range_seg_t *rs;
 1598         range_seg_max_t rsearch;
 1599 
 1600         rs_set_start(&rsearch, rt, start);
 1601         rs_set_end(&rsearch, rt, start + size);
 1602 
 1603         rs = zfs_btree_find(t, &rsearch, where);
 1604         if (rs == NULL) {
 1605                 rs = zfs_btree_next(t, where, where);
 1606         }
 1607 
 1608         return (rs);
 1609 }
 1610 
 1611 #if defined(WITH_DF_BLOCK_ALLOCATOR) || \
 1612     defined(WITH_CF_BLOCK_ALLOCATOR)
 1613 
 1614 /*
 1615  * This is a helper function that can be used by the allocator to find a
 1616  * suitable block to allocate. This will search the specified B-tree looking
 1617  * for a block that matches the specified criteria.
 1618  */
 1619 static uint64_t
 1620 metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
 1621     uint64_t max_search)
 1622 {
 1623         if (*cursor == 0)
 1624                 *cursor = rt->rt_start;
 1625         zfs_btree_t *bt = &rt->rt_root;
 1626         zfs_btree_index_t where;
 1627         range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
 1628         uint64_t first_found;
 1629         int count_searched = 0;
 1630 
 1631         if (rs != NULL)
 1632                 first_found = rs_get_start(rs, rt);
 1633 
 1634         while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
 1635             max_search || count_searched < metaslab_min_search_count)) {
 1636                 uint64_t offset = rs_get_start(rs, rt);
 1637                 if (offset + size <= rs_get_end(rs, rt)) {
 1638                         *cursor = offset + size;
 1639                         return (offset);
 1640                 }
 1641                 rs = zfs_btree_next(bt, &where, &where);
 1642                 count_searched++;
 1643         }
 1644 
 1645         *cursor = 0;
 1646         return (-1ULL);
 1647 }
 1648 #endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
 1649 
 1650 #if defined(WITH_DF_BLOCK_ALLOCATOR)
 1651 /*
 1652  * ==========================================================================
 1653  * Dynamic Fit (df) block allocator
 1654  *
 1655  * Search for a free chunk of at least this size, starting from the last
 1656  * offset (for this alignment of block) looking for up to
 1657  * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
 1658  * found within 16MB, then return a free chunk of exactly the requested size (or
 1659  * larger).
 1660  *
 1661  * If it seems like searching from the last offset will be unproductive, skip
 1662  * that and just return a free chunk of exactly the requested size (or larger).
 1663  * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
 1664  * mechanism is probably not very useful and may be removed in the future.
 1665  *
 1666  * The behavior when not searching can be changed to return the largest free
 1667  * chunk, instead of a free chunk of exactly the requested size, by setting
 1668  * metaslab_df_use_largest_segment.
 1669  * ==========================================================================
 1670  */
 1671 static uint64_t
 1672 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 1673 {
 1674         /*
 1675          * Find the largest power of 2 block size that evenly divides the
 1676          * requested size. This is used to try to allocate blocks with similar
 1677          * alignment from the same area of the metaslab (i.e. same cursor
 1678          * bucket) but it does not guarantee that other allocations sizes
 1679          * may exist in the same region.
 1680          */
 1681         uint64_t align = size & -size;
 1682         uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
 1683         range_tree_t *rt = msp->ms_allocatable;
 1684         uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 1685         uint64_t offset;
 1686 
 1687         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1688 
 1689         /*
 1690          * If we're running low on space, find a segment based on size,
 1691          * rather than iterating based on offset.
 1692          */
 1693         if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
 1694             free_pct < metaslab_df_free_pct) {
 1695                 offset = -1;
 1696         } else {
 1697                 offset = metaslab_block_picker(rt,
 1698                     cursor, size, metaslab_df_max_search);
 1699         }
 1700 
 1701         if (offset == -1) {
 1702                 range_seg_t *rs;
 1703                 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
 1704                         metaslab_size_tree_full_load(msp->ms_allocatable);
 1705 
 1706                 if (metaslab_df_use_largest_segment) {
 1707                         /* use largest free segment */
 1708                         rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
 1709                 } else {
 1710                         zfs_btree_index_t where;
 1711                         /* use segment of this size, or next largest */
 1712                         rs = metaslab_block_find(&msp->ms_allocatable_by_size,
 1713                             rt, msp->ms_start, size, &where);
 1714                 }
 1715                 if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
 1716                     rt)) {
 1717                         offset = rs_get_start(rs, rt);
 1718                         *cursor = offset + size;
 1719                 }
 1720         }
 1721 
 1722         return (offset);
 1723 }
 1724 
 1725 const metaslab_ops_t zfs_metaslab_ops = {
 1726         metaslab_df_alloc
 1727 };
 1728 #endif /* WITH_DF_BLOCK_ALLOCATOR */
 1729 
 1730 #if defined(WITH_CF_BLOCK_ALLOCATOR)
 1731 /*
 1732  * ==========================================================================
 1733  * Cursor fit block allocator -
 1734  * Select the largest region in the metaslab, set the cursor to the beginning
 1735  * of the range and the cursor_end to the end of the range. As allocations
 1736  * are made advance the cursor. Continue allocating from the cursor until
 1737  * the range is exhausted and then find a new range.
 1738  * ==========================================================================
 1739  */
 1740 static uint64_t
 1741 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 1742 {
 1743         range_tree_t *rt = msp->ms_allocatable;
 1744         zfs_btree_t *t = &msp->ms_allocatable_by_size;
 1745         uint64_t *cursor = &msp->ms_lbas[0];
 1746         uint64_t *cursor_end = &msp->ms_lbas[1];
 1747         uint64_t offset = 0;
 1748 
 1749         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1750 
 1751         ASSERT3U(*cursor_end, >=, *cursor);
 1752 
 1753         if ((*cursor + size) > *cursor_end) {
 1754                 range_seg_t *rs;
 1755 
 1756                 if (zfs_btree_numnodes(t) == 0)
 1757                         metaslab_size_tree_full_load(msp->ms_allocatable);
 1758                 rs = zfs_btree_last(t, NULL);
 1759                 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
 1760                     size)
 1761                         return (-1ULL);
 1762 
 1763                 *cursor = rs_get_start(rs, rt);
 1764                 *cursor_end = rs_get_end(rs, rt);
 1765         }
 1766 
 1767         offset = *cursor;
 1768         *cursor += size;
 1769 
 1770         return (offset);
 1771 }
 1772 
 1773 const metaslab_ops_t zfs_metaslab_ops = {
 1774         metaslab_cf_alloc
 1775 };
 1776 #endif /* WITH_CF_BLOCK_ALLOCATOR */
 1777 
 1778 #if defined(WITH_NDF_BLOCK_ALLOCATOR)
 1779 /*
 1780  * ==========================================================================
 1781  * New dynamic fit allocator -
 1782  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 1783  * contiguous blocks. If no region is found then just use the largest segment
 1784  * that remains.
 1785  * ==========================================================================
 1786  */
 1787 
 1788 /*
 1789  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 1790  * to request from the allocator.
 1791  */
 1792 uint64_t metaslab_ndf_clump_shift = 4;
 1793 
 1794 static uint64_t
 1795 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 1796 {
 1797         zfs_btree_t *t = &msp->ms_allocatable->rt_root;
 1798         range_tree_t *rt = msp->ms_allocatable;
 1799         zfs_btree_index_t where;
 1800         range_seg_t *rs;
 1801         range_seg_max_t rsearch;
 1802         uint64_t hbit = highbit64(size);
 1803         uint64_t *cursor = &msp->ms_lbas[hbit - 1];
 1804         uint64_t max_size = metaslab_largest_allocatable(msp);
 1805 
 1806         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1807 
 1808         if (max_size < size)
 1809                 return (-1ULL);
 1810 
 1811         rs_set_start(&rsearch, rt, *cursor);
 1812         rs_set_end(&rsearch, rt, *cursor + size);
 1813 
 1814         rs = zfs_btree_find(t, &rsearch, &where);
 1815         if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
 1816                 t = &msp->ms_allocatable_by_size;
 1817 
 1818                 rs_set_start(&rsearch, rt, 0);
 1819                 rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
 1820                     metaslab_ndf_clump_shift)));
 1821 
 1822                 rs = zfs_btree_find(t, &rsearch, &where);
 1823                 if (rs == NULL)
 1824                         rs = zfs_btree_next(t, &where, &where);
 1825                 ASSERT(rs != NULL);
 1826         }
 1827 
 1828         if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
 1829                 *cursor = rs_get_start(rs, rt) + size;
 1830                 return (rs_get_start(rs, rt));
 1831         }
 1832         return (-1ULL);
 1833 }
 1834 
 1835 const metaslab_ops_t zfs_metaslab_ops = {
 1836         metaslab_ndf_alloc
 1837 };
 1838 #endif /* WITH_NDF_BLOCK_ALLOCATOR */
 1839 
 1840 
 1841 /*
 1842  * ==========================================================================
 1843  * Metaslabs
 1844  * ==========================================================================
 1845  */
 1846 
 1847 /*
 1848  * Wait for any in-progress metaslab loads to complete.
 1849  */
 1850 static void
 1851 metaslab_load_wait(metaslab_t *msp)
 1852 {
 1853         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1854 
 1855         while (msp->ms_loading) {
 1856                 ASSERT(!msp->ms_loaded);
 1857                 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
 1858         }
 1859 }
 1860 
 1861 /*
 1862  * Wait for any in-progress flushing to complete.
 1863  */
 1864 static void
 1865 metaslab_flush_wait(metaslab_t *msp)
 1866 {
 1867         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1868 
 1869         while (msp->ms_flushing)
 1870                 cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
 1871 }
 1872 
 1873 static unsigned int
 1874 metaslab_idx_func(multilist_t *ml, void *arg)
 1875 {
 1876         metaslab_t *msp = arg;
 1877 
 1878         /*
 1879          * ms_id values are allocated sequentially, so full 64bit
 1880          * division would be a waste of time, so limit it to 32 bits.
 1881          */
 1882         return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml));
 1883 }
 1884 
 1885 uint64_t
 1886 metaslab_allocated_space(metaslab_t *msp)
 1887 {
 1888         return (msp->ms_allocated_space);
 1889 }
 1890 
 1891 /*
 1892  * Verify that the space accounting on disk matches the in-core range_trees.
 1893  */
 1894 static void
 1895 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 1896 {
 1897         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 1898         uint64_t allocating = 0;
 1899         uint64_t sm_free_space, msp_free_space;
 1900 
 1901         ASSERT(MUTEX_HELD(&msp->ms_lock));
 1902         ASSERT(!msp->ms_condensing);
 1903 
 1904         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 1905                 return;
 1906 
 1907         /*
 1908          * We can only verify the metaslab space when we're called
 1909          * from syncing context with a loaded metaslab that has an
 1910          * allocated space map. Calling this in non-syncing context
 1911          * does not provide a consistent view of the metaslab since
 1912          * we're performing allocations in the future.
 1913          */
 1914         if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
 1915             !msp->ms_loaded)
 1916                 return;
 1917 
 1918         /*
 1919          * Even though the smp_alloc field can get negative,
 1920          * when it comes to a metaslab's space map, that should
 1921          * never be the case.
 1922          */
 1923         ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
 1924 
 1925         ASSERT3U(space_map_allocated(msp->ms_sm), >=,
 1926             range_tree_space(msp->ms_unflushed_frees));
 1927 
 1928         ASSERT3U(metaslab_allocated_space(msp), ==,
 1929             space_map_allocated(msp->ms_sm) +
 1930             range_tree_space(msp->ms_unflushed_allocs) -
 1931             range_tree_space(msp->ms_unflushed_frees));
 1932 
 1933         sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
 1934 
 1935         /*
 1936          * Account for future allocations since we would have
 1937          * already deducted that space from the ms_allocatable.
 1938          */
 1939         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 1940                 allocating +=
 1941                     range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
 1942         }
 1943         ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
 1944             msp->ms_allocating_total);
 1945 
 1946         ASSERT3U(msp->ms_deferspace, ==,
 1947             range_tree_space(msp->ms_defer[0]) +
 1948             range_tree_space(msp->ms_defer[1]));
 1949 
 1950         msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
 1951             msp->ms_deferspace + range_tree_space(msp->ms_freed);
 1952 
 1953         VERIFY3U(sm_free_space, ==, msp_free_space);
 1954 }
 1955 
 1956 static void
 1957 metaslab_aux_histograms_clear(metaslab_t *msp)
 1958 {
 1959         /*
 1960          * Auxiliary histograms are only cleared when resetting them,
 1961          * which can only happen while the metaslab is loaded.
 1962          */
 1963         ASSERT(msp->ms_loaded);
 1964 
 1965         memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
 1966         for (int t = 0; t < TXG_DEFER_SIZE; t++)
 1967                 memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t]));
 1968 }
 1969 
 1970 static void
 1971 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
 1972     range_tree_t *rt)
 1973 {
 1974         /*
 1975          * This is modeled after space_map_histogram_add(), so refer to that
 1976          * function for implementation details. We want this to work like
 1977          * the space map histogram, and not the range tree histogram, as we
 1978          * are essentially constructing a delta that will be later subtracted
 1979          * from the space map histogram.
 1980          */
 1981         int idx = 0;
 1982         for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 1983                 ASSERT3U(i, >=, idx + shift);
 1984                 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
 1985 
 1986                 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
 1987                         ASSERT3U(idx + shift, ==, i);
 1988                         idx++;
 1989                         ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
 1990                 }
 1991         }
 1992 }
 1993 
 1994 /*
 1995  * Called at every sync pass that the metaslab gets synced.
 1996  *
 1997  * The reason is that we want our auxiliary histograms to be updated
 1998  * wherever the metaslab's space map histogram is updated. This way
 1999  * we stay consistent on which parts of the metaslab space map's
 2000  * histogram are currently not available for allocations (e.g because
 2001  * they are in the defer, freed, and freeing trees).
 2002  */
 2003 static void
 2004 metaslab_aux_histograms_update(metaslab_t *msp)
 2005 {
 2006         space_map_t *sm = msp->ms_sm;
 2007         ASSERT(sm != NULL);
 2008 
 2009         /*
 2010          * This is similar to the metaslab's space map histogram updates
 2011          * that take place in metaslab_sync(). The only difference is that
 2012          * we only care about segments that haven't made it into the
 2013          * ms_allocatable tree yet.
 2014          */
 2015         if (msp->ms_loaded) {
 2016                 metaslab_aux_histograms_clear(msp);
 2017 
 2018                 metaslab_aux_histogram_add(msp->ms_synchist,
 2019                     sm->sm_shift, msp->ms_freed);
 2020 
 2021                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 2022                         metaslab_aux_histogram_add(msp->ms_deferhist[t],
 2023                             sm->sm_shift, msp->ms_defer[t]);
 2024                 }
 2025         }
 2026 
 2027         metaslab_aux_histogram_add(msp->ms_synchist,
 2028             sm->sm_shift, msp->ms_freeing);
 2029 }
 2030 
 2031 /*
 2032  * Called every time we are done syncing (writing to) the metaslab,
 2033  * i.e. at the end of each sync pass.
 2034  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
 2035  */
 2036 static void
 2037 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
 2038 {
 2039         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 2040         space_map_t *sm = msp->ms_sm;
 2041 
 2042         if (sm == NULL) {
 2043                 /*
 2044                  * We came here from metaslab_init() when creating/opening a
 2045                  * pool, looking at a metaslab that hasn't had any allocations
 2046                  * yet.
 2047                  */
 2048                 return;
 2049         }
 2050 
 2051         /*
 2052          * This is similar to the actions that we take for the ms_freed
 2053          * and ms_defer trees in metaslab_sync_done().
 2054          */
 2055         uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
 2056         if (defer_allowed) {
 2057                 memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist,
 2058                     sizeof (msp->ms_synchist));
 2059         } else {
 2060                 memset(msp->ms_deferhist[hist_index], 0,
 2061                     sizeof (msp->ms_deferhist[hist_index]));
 2062         }
 2063         memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
 2064 }
 2065 
 2066 /*
 2067  * Ensure that the metaslab's weight and fragmentation are consistent
 2068  * with the contents of the histogram (either the range tree's histogram
 2069  * or the space map's depending whether the metaslab is loaded).
 2070  */
 2071 static void
 2072 metaslab_verify_weight_and_frag(metaslab_t *msp)
 2073 {
 2074         ASSERT(MUTEX_HELD(&msp->ms_lock));
 2075 
 2076         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 2077                 return;
 2078 
 2079         /*
 2080          * We can end up here from vdev_remove_complete(), in which case we
 2081          * cannot do these assertions because we hold spa config locks and
 2082          * thus we are not allowed to read from the DMU.
 2083          *
 2084          * We check if the metaslab group has been removed and if that's
 2085          * the case we return immediately as that would mean that we are
 2086          * here from the aforementioned code path.
 2087          */
 2088         if (msp->ms_group == NULL)
 2089                 return;
 2090 
 2091         /*
 2092          * Devices being removed always return a weight of 0 and leave
 2093          * fragmentation and ms_max_size as is - there is nothing for
 2094          * us to verify here.
 2095          */
 2096         vdev_t *vd = msp->ms_group->mg_vd;
 2097         if (vd->vdev_removing)
 2098                 return;
 2099 
 2100         /*
 2101          * If the metaslab is dirty it probably means that we've done
 2102          * some allocations or frees that have changed our histograms
 2103          * and thus the weight.
 2104          */
 2105         for (int t = 0; t < TXG_SIZE; t++) {
 2106                 if (txg_list_member(&vd->vdev_ms_list, msp, t))
 2107                         return;
 2108         }
 2109 
 2110         /*
 2111          * This verification checks that our in-memory state is consistent
 2112          * with what's on disk. If the pool is read-only then there aren't
 2113          * any changes and we just have the initially-loaded state.
 2114          */
 2115         if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
 2116                 return;
 2117 
 2118         /* some extra verification for in-core tree if you can */
 2119         if (msp->ms_loaded) {
 2120                 range_tree_stat_verify(msp->ms_allocatable);
 2121                 VERIFY(space_map_histogram_verify(msp->ms_sm,
 2122                     msp->ms_allocatable));
 2123         }
 2124 
 2125         uint64_t weight = msp->ms_weight;
 2126         uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 2127         boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
 2128         uint64_t frag = msp->ms_fragmentation;
 2129         uint64_t max_segsize = msp->ms_max_size;
 2130 
 2131         msp->ms_weight = 0;
 2132         msp->ms_fragmentation = 0;
 2133 
 2134         /*
 2135          * This function is used for verification purposes and thus should
 2136          * not introduce any side-effects/mutations on the system's state.
 2137          *
 2138          * Regardless of whether metaslab_weight() thinks this metaslab
 2139          * should be active or not, we want to ensure that the actual weight
 2140          * (and therefore the value of ms_weight) would be the same if it
 2141          * was to be recalculated at this point.
 2142          *
 2143          * In addition we set the nodirty flag so metaslab_weight() does
 2144          * not dirty the metaslab for future TXGs (e.g. when trying to
 2145          * force condensing to upgrade the metaslab spacemaps).
 2146          */
 2147         msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
 2148 
 2149         VERIFY3U(max_segsize, ==, msp->ms_max_size);
 2150 
 2151         /*
 2152          * If the weight type changed then there is no point in doing
 2153          * verification. Revert fields to their original values.
 2154          */
 2155         if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
 2156             (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
 2157                 msp->ms_fragmentation = frag;
 2158                 msp->ms_weight = weight;
 2159                 return;
 2160         }
 2161 
 2162         VERIFY3U(msp->ms_fragmentation, ==, frag);
 2163         VERIFY3U(msp->ms_weight, ==, weight);
 2164 }
 2165 
 2166 /*
 2167  * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
 2168  * this class that was used longest ago, and attempt to unload it.  We don't
 2169  * want to spend too much time in this loop to prevent performance
 2170  * degradation, and we expect that most of the time this operation will
 2171  * succeed. Between that and the normal unloading processing during txg sync,
 2172  * we expect this to keep the metaslab memory usage under control.
 2173  */
 2174 static void
 2175 metaslab_potentially_evict(metaslab_class_t *mc)
 2176 {
 2177 #ifdef _KERNEL
 2178         uint64_t allmem = arc_all_memory();
 2179         uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 2180         uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
 2181         uint_t tries = 0;
 2182         for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
 2183             tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
 2184             tries++) {
 2185                 unsigned int idx = multilist_get_random_index(
 2186                     &mc->mc_metaslab_txg_list);
 2187                 multilist_sublist_t *mls =
 2188                     multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
 2189                 metaslab_t *msp = multilist_sublist_head(mls);
 2190                 multilist_sublist_unlock(mls);
 2191                 while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
 2192                     inuse * size) {
 2193                         VERIFY3P(mls, ==, multilist_sublist_lock(
 2194                             &mc->mc_metaslab_txg_list, idx));
 2195                         ASSERT3U(idx, ==,
 2196                             metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
 2197 
 2198                         if (!multilist_link_active(&msp->ms_class_txg_node)) {
 2199                                 multilist_sublist_unlock(mls);
 2200                                 break;
 2201                         }
 2202                         metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 2203                         multilist_sublist_unlock(mls);
 2204                         /*
 2205                          * If the metaslab is currently loading there are two
 2206                          * cases. If it's the metaslab we're evicting, we
 2207                          * can't continue on or we'll panic when we attempt to
 2208                          * recursively lock the mutex. If it's another
 2209                          * metaslab that's loading, it can be safely skipped,
 2210                          * since we know it's very new and therefore not a
 2211                          * good eviction candidate. We check later once the
 2212                          * lock is held that the metaslab is fully loaded
 2213                          * before actually unloading it.
 2214                          */
 2215                         if (msp->ms_loading) {
 2216                                 msp = next_msp;
 2217                                 inuse =
 2218                                     spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 2219                                 continue;
 2220                         }
 2221                         /*
 2222                          * We can't unload metaslabs with no spacemap because
 2223                          * they're not ready to be unloaded yet. We can't
 2224                          * unload metaslabs with outstanding allocations
 2225                          * because doing so could cause the metaslab's weight
 2226                          * to decrease while it's unloaded, which violates an
 2227                          * invariant that we use to prevent unnecessary
 2228                          * loading. We also don't unload metaslabs that are
 2229                          * currently active because they are high-weight
 2230                          * metaslabs that are likely to be used in the near
 2231                          * future.
 2232                          */
 2233                         mutex_enter(&msp->ms_lock);
 2234                         if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
 2235                             msp->ms_allocating_total == 0) {
 2236                                 metaslab_unload(msp);
 2237                         }
 2238                         mutex_exit(&msp->ms_lock);
 2239                         msp = next_msp;
 2240                         inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 2241                 }
 2242         }
 2243 #else
 2244         (void) mc, (void) zfs_metaslab_mem_limit;
 2245 #endif
 2246 }
 2247 
 2248 static int
 2249 metaslab_load_impl(metaslab_t *msp)
 2250 {
 2251         int error = 0;
 2252 
 2253         ASSERT(MUTEX_HELD(&msp->ms_lock));
 2254         ASSERT(msp->ms_loading);
 2255         ASSERT(!msp->ms_condensing);
 2256 
 2257         /*
 2258          * We temporarily drop the lock to unblock other operations while we
 2259          * are reading the space map. Therefore, metaslab_sync() and
 2260          * metaslab_sync_done() can run at the same time as we do.
 2261          *
 2262          * If we are using the log space maps, metaslab_sync() can't write to
 2263          * the metaslab's space map while we are loading as we only write to
 2264          * it when we are flushing the metaslab, and that can't happen while
 2265          * we are loading it.
 2266          *
 2267          * If we are not using log space maps though, metaslab_sync() can
 2268          * append to the space map while we are loading. Therefore we load
 2269          * only entries that existed when we started the load. Additionally,
 2270          * metaslab_sync_done() has to wait for the load to complete because
 2271          * there are potential races like metaslab_load() loading parts of the
 2272          * space map that are currently being appended by metaslab_sync(). If
 2273          * we didn't, the ms_allocatable would have entries that
 2274          * metaslab_sync_done() would try to re-add later.
 2275          *
 2276          * That's why before dropping the lock we remember the synced length
 2277          * of the metaslab and read up to that point of the space map,
 2278          * ignoring entries appended by metaslab_sync() that happen after we
 2279          * drop the lock.
 2280          */
 2281         uint64_t length = msp->ms_synced_length;
 2282         mutex_exit(&msp->ms_lock);
 2283 
 2284         hrtime_t load_start = gethrtime();
 2285         metaslab_rt_arg_t *mrap;
 2286         if (msp->ms_allocatable->rt_arg == NULL) {
 2287                 mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 2288         } else {
 2289                 mrap = msp->ms_allocatable->rt_arg;
 2290                 msp->ms_allocatable->rt_ops = NULL;
 2291                 msp->ms_allocatable->rt_arg = NULL;
 2292         }
 2293         mrap->mra_bt = &msp->ms_allocatable_by_size;
 2294         mrap->mra_floor_shift = metaslab_by_size_min_shift;
 2295 
 2296         if (msp->ms_sm != NULL) {
 2297                 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
 2298                     SM_FREE, length);
 2299 
 2300                 /* Now, populate the size-sorted tree. */
 2301                 metaslab_rt_create(msp->ms_allocatable, mrap);
 2302                 msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
 2303                 msp->ms_allocatable->rt_arg = mrap;
 2304 
 2305                 struct mssa_arg arg = {0};
 2306                 arg.rt = msp->ms_allocatable;
 2307                 arg.mra = mrap;
 2308                 range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
 2309                     &arg);
 2310         } else {
 2311                 /*
 2312                  * Add the size-sorted tree first, since we don't need to load
 2313                  * the metaslab from the spacemap.
 2314                  */
 2315                 metaslab_rt_create(msp->ms_allocatable, mrap);
 2316                 msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
 2317                 msp->ms_allocatable->rt_arg = mrap;
 2318                 /*
 2319                  * The space map has not been allocated yet, so treat
 2320                  * all the space in the metaslab as free and add it to the
 2321                  * ms_allocatable tree.
 2322                  */
 2323                 range_tree_add(msp->ms_allocatable,
 2324                     msp->ms_start, msp->ms_size);
 2325 
 2326                 if (msp->ms_new) {
 2327                         /*
 2328                          * If the ms_sm doesn't exist, this means that this
 2329                          * metaslab hasn't gone through metaslab_sync() and
 2330                          * thus has never been dirtied. So we shouldn't
 2331                          * expect any unflushed allocs or frees from previous
 2332                          * TXGs.
 2333                          */
 2334                         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 2335                         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 2336                 }
 2337         }
 2338 
 2339         /*
 2340          * We need to grab the ms_sync_lock to prevent metaslab_sync() from
 2341          * changing the ms_sm (or log_sm) and the metaslab's range trees
 2342          * while we are about to use them and populate the ms_allocatable.
 2343          * The ms_lock is insufficient for this because metaslab_sync() doesn't
 2344          * hold the ms_lock while writing the ms_checkpointing tree to disk.
 2345          */
 2346         mutex_enter(&msp->ms_sync_lock);
 2347         mutex_enter(&msp->ms_lock);
 2348 
 2349         ASSERT(!msp->ms_condensing);
 2350         ASSERT(!msp->ms_flushing);
 2351 
 2352         if (error != 0) {
 2353                 mutex_exit(&msp->ms_sync_lock);
 2354                 return (error);
 2355         }
 2356 
 2357         ASSERT3P(msp->ms_group, !=, NULL);
 2358         msp->ms_loaded = B_TRUE;
 2359 
 2360         /*
 2361          * Apply all the unflushed changes to ms_allocatable right
 2362          * away so any manipulations we do below have a clear view
 2363          * of what is allocated and what is free.
 2364          */
 2365         range_tree_walk(msp->ms_unflushed_allocs,
 2366             range_tree_remove, msp->ms_allocatable);
 2367         range_tree_walk(msp->ms_unflushed_frees,
 2368             range_tree_add, msp->ms_allocatable);
 2369 
 2370         ASSERT3P(msp->ms_group, !=, NULL);
 2371         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 2372         if (spa_syncing_log_sm(spa) != NULL) {
 2373                 ASSERT(spa_feature_is_enabled(spa,
 2374                     SPA_FEATURE_LOG_SPACEMAP));
 2375 
 2376                 /*
 2377                  * If we use a log space map we add all the segments
 2378                  * that are in ms_unflushed_frees so they are available
 2379                  * for allocation.
 2380                  *
 2381                  * ms_allocatable needs to contain all free segments
 2382                  * that are ready for allocations (thus not segments
 2383                  * from ms_freeing, ms_freed, and the ms_defer trees).
 2384                  * But if we grab the lock in this code path at a sync
 2385                  * pass later that 1, then it also contains the
 2386                  * segments of ms_freed (they were added to it earlier
 2387                  * in this path through ms_unflushed_frees). So we
 2388                  * need to remove all the segments that exist in
 2389                  * ms_freed from ms_allocatable as they will be added
 2390                  * later in metaslab_sync_done().
 2391                  *
 2392                  * When there's no log space map, the ms_allocatable
 2393                  * correctly doesn't contain any segments that exist
 2394                  * in ms_freed [see ms_synced_length].
 2395                  */
 2396                 range_tree_walk(msp->ms_freed,
 2397                     range_tree_remove, msp->ms_allocatable);
 2398         }
 2399 
 2400         /*
 2401          * If we are not using the log space map, ms_allocatable
 2402          * contains the segments that exist in the ms_defer trees
 2403          * [see ms_synced_length]. Thus we need to remove them
 2404          * from ms_allocatable as they will be added again in
 2405          * metaslab_sync_done().
 2406          *
 2407          * If we are using the log space map, ms_allocatable still
 2408          * contains the segments that exist in the ms_defer trees.
 2409          * Not because it read them through the ms_sm though. But
 2410          * because these segments are part of ms_unflushed_frees
 2411          * whose segments we add to ms_allocatable earlier in this
 2412          * code path.
 2413          */
 2414         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 2415                 range_tree_walk(msp->ms_defer[t],
 2416                     range_tree_remove, msp->ms_allocatable);
 2417         }
 2418 
 2419         /*
 2420          * Call metaslab_recalculate_weight_and_sort() now that the
 2421          * metaslab is loaded so we get the metaslab's real weight.
 2422          *
 2423          * Unless this metaslab was created with older software and
 2424          * has not yet been converted to use segment-based weight, we
 2425          * expect the new weight to be better or equal to the weight
 2426          * that the metaslab had while it was not loaded. This is
 2427          * because the old weight does not take into account the
 2428          * consolidation of adjacent segments between TXGs. [see
 2429          * comment for ms_synchist and ms_deferhist[] for more info]
 2430          */
 2431         uint64_t weight = msp->ms_weight;
 2432         uint64_t max_size = msp->ms_max_size;
 2433         metaslab_recalculate_weight_and_sort(msp);
 2434         if (!WEIGHT_IS_SPACEBASED(weight))
 2435                 ASSERT3U(weight, <=, msp->ms_weight);
 2436         msp->ms_max_size = metaslab_largest_allocatable(msp);
 2437         ASSERT3U(max_size, <=, msp->ms_max_size);
 2438         hrtime_t load_end = gethrtime();
 2439         msp->ms_load_time = load_end;
 2440         zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
 2441             "ms_id %llu, smp_length %llu, "
 2442             "unflushed_allocs %llu, unflushed_frees %llu, "
 2443             "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
 2444             "loading_time %lld ms, ms_max_size %llu, "
 2445             "max size error %lld, "
 2446             "old_weight %llx, new_weight %llx",
 2447             (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
 2448             (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 2449             (u_longlong_t)msp->ms_id,
 2450             (u_longlong_t)space_map_length(msp->ms_sm),
 2451             (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
 2452             (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
 2453             (u_longlong_t)range_tree_space(msp->ms_freed),
 2454             (u_longlong_t)range_tree_space(msp->ms_defer[0]),
 2455             (u_longlong_t)range_tree_space(msp->ms_defer[1]),
 2456             (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
 2457             (longlong_t)((load_end - load_start) / 1000000),
 2458             (u_longlong_t)msp->ms_max_size,
 2459             (u_longlong_t)msp->ms_max_size - max_size,
 2460             (u_longlong_t)weight, (u_longlong_t)msp->ms_weight);
 2461 
 2462         metaslab_verify_space(msp, spa_syncing_txg(spa));
 2463         mutex_exit(&msp->ms_sync_lock);
 2464         return (0);
 2465 }
 2466 
 2467 int
 2468 metaslab_load(metaslab_t *msp)
 2469 {
 2470         ASSERT(MUTEX_HELD(&msp->ms_lock));
 2471 
 2472         /*
 2473          * There may be another thread loading the same metaslab, if that's
 2474          * the case just wait until the other thread is done and return.
 2475          */
 2476         metaslab_load_wait(msp);
 2477         if (msp->ms_loaded)
 2478                 return (0);
 2479         VERIFY(!msp->ms_loading);
 2480         ASSERT(!msp->ms_condensing);
 2481 
 2482         /*
 2483          * We set the loading flag BEFORE potentially dropping the lock to
 2484          * wait for an ongoing flush (see ms_flushing below). This way other
 2485          * threads know that there is already a thread that is loading this
 2486          * metaslab.
 2487          */
 2488         msp->ms_loading = B_TRUE;
 2489 
 2490         /*
 2491          * Wait for any in-progress flushing to finish as we drop the ms_lock
 2492          * both here (during space_map_load()) and in metaslab_flush() (when
 2493          * we flush our changes to the ms_sm).
 2494          */
 2495         if (msp->ms_flushing)
 2496                 metaslab_flush_wait(msp);
 2497 
 2498         /*
 2499          * In the possibility that we were waiting for the metaslab to be
 2500          * flushed (where we temporarily dropped the ms_lock), ensure that
 2501          * no one else loaded the metaslab somehow.
 2502          */
 2503         ASSERT(!msp->ms_loaded);
 2504 
 2505         /*
 2506          * If we're loading a metaslab in the normal class, consider evicting
 2507          * another one to keep our memory usage under the limit defined by the
 2508          * zfs_metaslab_mem_limit tunable.
 2509          */
 2510         if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
 2511             msp->ms_group->mg_class) {
 2512                 metaslab_potentially_evict(msp->ms_group->mg_class);
 2513         }
 2514 
 2515         int error = metaslab_load_impl(msp);
 2516 
 2517         ASSERT(MUTEX_HELD(&msp->ms_lock));
 2518         msp->ms_loading = B_FALSE;
 2519         cv_broadcast(&msp->ms_load_cv);
 2520 
 2521         return (error);
 2522 }
 2523 
 2524 void
 2525 metaslab_unload(metaslab_t *msp)
 2526 {
 2527         ASSERT(MUTEX_HELD(&msp->ms_lock));
 2528 
 2529         /*
 2530          * This can happen if a metaslab is selected for eviction (in
 2531          * metaslab_potentially_evict) and then unloaded during spa_sync (via
 2532          * metaslab_class_evict_old).
 2533          */
 2534         if (!msp->ms_loaded)
 2535                 return;
 2536 
 2537         range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 2538         msp->ms_loaded = B_FALSE;
 2539         msp->ms_unload_time = gethrtime();
 2540 
 2541         msp->ms_activation_weight = 0;
 2542         msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 2543 
 2544         if (msp->ms_group != NULL) {
 2545                 metaslab_class_t *mc = msp->ms_group->mg_class;
 2546                 multilist_sublist_t *mls =
 2547                     multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 2548                 if (multilist_link_active(&msp->ms_class_txg_node))
 2549                         multilist_sublist_remove(mls, msp);
 2550                 multilist_sublist_unlock(mls);
 2551 
 2552                 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 2553                 zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
 2554                     "ms_id %llu, weight %llx, "
 2555                     "selected txg %llu (%llu ms ago), alloc_txg %llu, "
 2556                     "loaded %llu ms ago, max_size %llu",
 2557                     (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
 2558                     (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 2559                     (u_longlong_t)msp->ms_id,
 2560                     (u_longlong_t)msp->ms_weight,
 2561                     (u_longlong_t)msp->ms_selected_txg,
 2562                     (u_longlong_t)(msp->ms_unload_time -
 2563                     msp->ms_selected_time) / 1000 / 1000,
 2564                     (u_longlong_t)msp->ms_alloc_txg,
 2565                     (u_longlong_t)(msp->ms_unload_time -
 2566                     msp->ms_load_time) / 1000 / 1000,
 2567                     (u_longlong_t)msp->ms_max_size);
 2568         }
 2569 
 2570         /*
 2571          * We explicitly recalculate the metaslab's weight based on its space
 2572          * map (as it is now not loaded). We want unload metaslabs to always
 2573          * have their weights calculated from the space map histograms, while
 2574          * loaded ones have it calculated from their in-core range tree
 2575          * [see metaslab_load()]. This way, the weight reflects the information
 2576          * available in-core, whether it is loaded or not.
 2577          *
 2578          * If ms_group == NULL means that we came here from metaslab_fini(),
 2579          * at which point it doesn't make sense for us to do the recalculation
 2580          * and the sorting.
 2581          */
 2582         if (msp->ms_group != NULL)
 2583                 metaslab_recalculate_weight_and_sort(msp);
 2584 }
 2585 
 2586 /*
 2587  * We want to optimize the memory use of the per-metaslab range
 2588  * trees. To do this, we store the segments in the range trees in
 2589  * units of sectors, zero-indexing from the start of the metaslab. If
 2590  * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
 2591  * the ranges using two uint32_ts, rather than two uint64_ts.
 2592  */
 2593 range_seg_type_t
 2594 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
 2595     uint64_t *start, uint64_t *shift)
 2596 {
 2597         if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
 2598             !zfs_metaslab_force_large_segs) {
 2599                 *shift = vdev->vdev_ashift;
 2600                 *start = msp->ms_start;
 2601                 return (RANGE_SEG32);
 2602         } else {
 2603                 *shift = 0;
 2604                 *start = 0;
 2605                 return (RANGE_SEG64);
 2606         }
 2607 }
 2608 
 2609 void
 2610 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
 2611 {
 2612         ASSERT(MUTEX_HELD(&msp->ms_lock));
 2613         metaslab_class_t *mc = msp->ms_group->mg_class;
 2614         multilist_sublist_t *mls =
 2615             multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 2616         if (multilist_link_active(&msp->ms_class_txg_node))
 2617                 multilist_sublist_remove(mls, msp);
 2618         msp->ms_selected_txg = txg;
 2619         msp->ms_selected_time = gethrtime();
 2620         multilist_sublist_insert_tail(mls, msp);
 2621         multilist_sublist_unlock(mls);
 2622 }
 2623 
 2624 void
 2625 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
 2626     int64_t defer_delta, int64_t space_delta)
 2627 {
 2628         vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
 2629 
 2630         ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
 2631         ASSERT(vd->vdev_ms_count != 0);
 2632 
 2633         metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
 2634             vdev_deflated_space(vd, space_delta));
 2635 }
 2636 
 2637 int
 2638 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
 2639     uint64_t txg, metaslab_t **msp)
 2640 {
 2641         vdev_t *vd = mg->mg_vd;
 2642         spa_t *spa = vd->vdev_spa;
 2643         objset_t *mos = spa->spa_meta_objset;
 2644         metaslab_t *ms;
 2645         int error;
 2646 
 2647         ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 2648         mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 2649         mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 2650         cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
 2651         cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
 2652         multilist_link_init(&ms->ms_class_txg_node);
 2653 
 2654         ms->ms_id = id;
 2655         ms->ms_start = id << vd->vdev_ms_shift;
 2656         ms->ms_size = 1ULL << vd->vdev_ms_shift;
 2657         ms->ms_allocator = -1;
 2658         ms->ms_new = B_TRUE;
 2659 
 2660         vdev_ops_t *ops = vd->vdev_ops;
 2661         if (ops->vdev_op_metaslab_init != NULL)
 2662                 ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
 2663 
 2664         /*
 2665          * We only open space map objects that already exist. All others
 2666          * will be opened when we finally allocate an object for it. For
 2667          * readonly pools there is no need to open the space map object.
 2668          *
 2669          * Note:
 2670          * When called from vdev_expand(), we can't call into the DMU as
 2671          * we are holding the spa_config_lock as a writer and we would
 2672          * deadlock [see relevant comment in vdev_metaslab_init()]. in
 2673          * that case, the object parameter is zero though, so we won't
 2674          * call into the DMU.
 2675          */
 2676         if (object != 0 && !(spa->spa_mode == SPA_MODE_READ &&
 2677             !spa->spa_read_spacemaps)) {
 2678                 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
 2679                     ms->ms_size, vd->vdev_ashift);
 2680 
 2681                 if (error != 0) {
 2682                         kmem_free(ms, sizeof (metaslab_t));
 2683                         return (error);
 2684                 }
 2685 
 2686                 ASSERT(ms->ms_sm != NULL);
 2687                 ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
 2688         }
 2689 
 2690         uint64_t shift, start;
 2691         range_seg_type_t type =
 2692             metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
 2693 
 2694         ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
 2695         for (int t = 0; t < TXG_SIZE; t++) {
 2696                 ms->ms_allocating[t] = range_tree_create(NULL, type,
 2697                     NULL, start, shift);
 2698         }
 2699         ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift);
 2700         ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift);
 2701         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 2702                 ms->ms_defer[t] = range_tree_create(NULL, type, NULL,
 2703                     start, shift);
 2704         }
 2705         ms->ms_checkpointing =
 2706             range_tree_create(NULL, type, NULL, start, shift);
 2707         ms->ms_unflushed_allocs =
 2708             range_tree_create(NULL, type, NULL, start, shift);
 2709 
 2710         metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 2711         mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
 2712         mrap->mra_floor_shift = metaslab_by_size_min_shift;
 2713         ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
 2714             type, mrap, start, shift);
 2715 
 2716         ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
 2717 
 2718         metaslab_group_add(mg, ms);
 2719         metaslab_set_fragmentation(ms, B_FALSE);
 2720 
 2721         /*
 2722          * If we're opening an existing pool (txg == 0) or creating
 2723          * a new one (txg == TXG_INITIAL), all space is available now.
 2724          * If we're adding space to an existing pool, the new space
 2725          * does not become available until after this txg has synced.
 2726          * The metaslab's weight will also be initialized when we sync
 2727          * out this txg. This ensures that we don't attempt to allocate
 2728          * from it before we have initialized it completely.
 2729          */
 2730         if (txg <= TXG_INITIAL) {
 2731                 metaslab_sync_done(ms, 0);
 2732                 metaslab_space_update(vd, mg->mg_class,
 2733                     metaslab_allocated_space(ms), 0, 0);
 2734         }
 2735 
 2736         if (txg != 0) {
 2737                 vdev_dirty(vd, 0, NULL, txg);
 2738                 vdev_dirty(vd, VDD_METASLAB, ms, txg);
 2739         }
 2740 
 2741         *msp = ms;
 2742 
 2743         return (0);
 2744 }
 2745 
 2746 static void
 2747 metaslab_fini_flush_data(metaslab_t *msp)
 2748 {
 2749         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 2750 
 2751         if (metaslab_unflushed_txg(msp) == 0) {
 2752                 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
 2753                     ==, NULL);
 2754                 return;
 2755         }
 2756         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 2757 
 2758         mutex_enter(&spa->spa_flushed_ms_lock);
 2759         avl_remove(&spa->spa_metaslabs_by_flushed, msp);
 2760         mutex_exit(&spa->spa_flushed_ms_lock);
 2761 
 2762         spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
 2763         spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
 2764             metaslab_unflushed_dirty(msp));
 2765 }
 2766 
 2767 uint64_t
 2768 metaslab_unflushed_changes_memused(metaslab_t *ms)
 2769 {
 2770         return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
 2771             range_tree_numsegs(ms->ms_unflushed_frees)) *
 2772             ms->ms_unflushed_allocs->rt_root.bt_elem_size);
 2773 }
 2774 
 2775 void
 2776 metaslab_fini(metaslab_t *msp)
 2777 {
 2778         metaslab_group_t *mg = msp->ms_group;
 2779         vdev_t *vd = mg->mg_vd;
 2780         spa_t *spa = vd->vdev_spa;
 2781 
 2782         metaslab_fini_flush_data(msp);
 2783 
 2784         metaslab_group_remove(mg, msp);
 2785 
 2786         mutex_enter(&msp->ms_lock);
 2787         VERIFY(msp->ms_group == NULL);
 2788 
 2789         /*
 2790          * If this metaslab hasn't been through metaslab_sync_done() yet its
 2791          * space hasn't been accounted for in its vdev and doesn't need to be
 2792          * subtracted.
 2793          */
 2794         if (!msp->ms_new) {
 2795                 metaslab_space_update(vd, mg->mg_class,
 2796                     -metaslab_allocated_space(msp), 0, -msp->ms_size);
 2797 
 2798         }
 2799         space_map_close(msp->ms_sm);
 2800         msp->ms_sm = NULL;
 2801 
 2802         metaslab_unload(msp);
 2803 
 2804         range_tree_destroy(msp->ms_allocatable);
 2805         range_tree_destroy(msp->ms_freeing);
 2806         range_tree_destroy(msp->ms_freed);
 2807 
 2808         ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 2809             metaslab_unflushed_changes_memused(msp));
 2810         spa->spa_unflushed_stats.sus_memused -=
 2811             metaslab_unflushed_changes_memused(msp);
 2812         range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 2813         range_tree_destroy(msp->ms_unflushed_allocs);
 2814         range_tree_destroy(msp->ms_checkpointing);
 2815         range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 2816         range_tree_destroy(msp->ms_unflushed_frees);
 2817 
 2818         for (int t = 0; t < TXG_SIZE; t++) {
 2819                 range_tree_destroy(msp->ms_allocating[t]);
 2820         }
 2821         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 2822                 range_tree_destroy(msp->ms_defer[t]);
 2823         }
 2824         ASSERT0(msp->ms_deferspace);
 2825 
 2826         for (int t = 0; t < TXG_SIZE; t++)
 2827                 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
 2828 
 2829         range_tree_vacate(msp->ms_trim, NULL, NULL);
 2830         range_tree_destroy(msp->ms_trim);
 2831 
 2832         mutex_exit(&msp->ms_lock);
 2833         cv_destroy(&msp->ms_load_cv);
 2834         cv_destroy(&msp->ms_flush_cv);
 2835         mutex_destroy(&msp->ms_lock);
 2836         mutex_destroy(&msp->ms_sync_lock);
 2837         ASSERT3U(msp->ms_allocator, ==, -1);
 2838 
 2839         kmem_free(msp, sizeof (metaslab_t));
 2840 }
 2841 
 2842 #define FRAGMENTATION_TABLE_SIZE        17
 2843 
 2844 /*
 2845  * This table defines a segment size based fragmentation metric that will
 2846  * allow each metaslab to derive its own fragmentation value. This is done
 2847  * by calculating the space in each bucket of the spacemap histogram and
 2848  * multiplying that by the fragmentation metric in this table. Doing
 2849  * this for all buckets and dividing it by the total amount of free
 2850  * space in this metaslab (i.e. the total free space in all buckets) gives
 2851  * us the fragmentation metric. This means that a high fragmentation metric
 2852  * equates to most of the free space being comprised of small segments.
 2853  * Conversely, if the metric is low, then most of the free space is in
 2854  * large segments. A 10% change in fragmentation equates to approximately
 2855  * double the number of segments.
 2856  *
 2857  * This table defines 0% fragmented space using 16MB segments. Testing has
 2858  * shown that segments that are greater than or equal to 16MB do not suffer
 2859  * from drastic performance problems. Using this value, we derive the rest
 2860  * of the table. Since the fragmentation value is never stored on disk, it
 2861  * is possible to change these calculations in the future.
 2862  */
 2863 static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
 2864         100,    /* 512B */
 2865         100,    /* 1K   */
 2866         98,     /* 2K   */
 2867         95,     /* 4K   */
 2868         90,     /* 8K   */
 2869         80,     /* 16K  */
 2870         70,     /* 32K  */
 2871         60,     /* 64K  */
 2872         50,     /* 128K */
 2873         40,     /* 256K */
 2874         30,     /* 512K */
 2875         20,     /* 1M   */
 2876         15,     /* 2M   */
 2877         10,     /* 4M   */
 2878         5,      /* 8M   */
 2879         0       /* 16M  */
 2880 };
 2881 
 2882 /*
 2883  * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
 2884  * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
 2885  * been upgraded and does not support this metric. Otherwise, the return
 2886  * value should be in the range [0, 100].
 2887  */
 2888 static void
 2889 metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
 2890 {
 2891         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 2892         uint64_t fragmentation = 0;
 2893         uint64_t total = 0;
 2894         boolean_t feature_enabled = spa_feature_is_enabled(spa,
 2895             SPA_FEATURE_SPACEMAP_HISTOGRAM);
 2896 
 2897         if (!feature_enabled) {
 2898                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
 2899                 return;
 2900         }
 2901 
 2902         /*
 2903          * A null space map means that the entire metaslab is free
 2904          * and thus is not fragmented.
 2905          */
 2906         if (msp->ms_sm == NULL) {
 2907                 msp->ms_fragmentation = 0;
 2908                 return;
 2909         }
 2910 
 2911         /*
 2912          * If this metaslab's space map has not been upgraded, flag it
 2913          * so that we upgrade next time we encounter it.
 2914          */
 2915         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
 2916                 uint64_t txg = spa_syncing_txg(spa);
 2917                 vdev_t *vd = msp->ms_group->mg_vd;
 2918 
 2919                 /*
 2920                  * If we've reached the final dirty txg, then we must
 2921                  * be shutting down the pool. We don't want to dirty
 2922                  * any data past this point so skip setting the condense
 2923                  * flag. We can retry this action the next time the pool
 2924                  * is imported. We also skip marking this metaslab for
 2925                  * condensing if the caller has explicitly set nodirty.
 2926                  */
 2927                 if (!nodirty &&
 2928                     spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
 2929                         msp->ms_condense_wanted = B_TRUE;
 2930                         vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 2931                         zfs_dbgmsg("txg %llu, requesting force condense: "
 2932                             "ms_id %llu, vdev_id %llu", (u_longlong_t)txg,
 2933                             (u_longlong_t)msp->ms_id,
 2934                             (u_longlong_t)vd->vdev_id);
 2935                 }
 2936                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
 2937                 return;
 2938         }
 2939 
 2940         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 2941                 uint64_t space = 0;
 2942                 uint8_t shift = msp->ms_sm->sm_shift;
 2943 
 2944                 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
 2945                     FRAGMENTATION_TABLE_SIZE - 1);
 2946 
 2947                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
 2948                         continue;
 2949 
 2950                 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
 2951                 total += space;
 2952 
 2953                 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
 2954                 fragmentation += space * zfs_frag_table[idx];
 2955         }
 2956 
 2957         if (total > 0)
 2958                 fragmentation /= total;
 2959         ASSERT3U(fragmentation, <=, 100);
 2960 
 2961         msp->ms_fragmentation = fragmentation;
 2962 }
 2963 
 2964 /*
 2965  * Compute a weight -- a selection preference value -- for the given metaslab.
 2966  * This is based on the amount of free space, the level of fragmentation,
 2967  * the LBA range, and whether the metaslab is loaded.
 2968  */
 2969 static uint64_t
 2970 metaslab_space_weight(metaslab_t *msp)
 2971 {
 2972         metaslab_group_t *mg = msp->ms_group;
 2973         vdev_t *vd = mg->mg_vd;
 2974         uint64_t weight, space;
 2975 
 2976         ASSERT(MUTEX_HELD(&msp->ms_lock));
 2977 
 2978         /*
 2979          * The baseline weight is the metaslab's free space.
 2980          */
 2981         space = msp->ms_size - metaslab_allocated_space(msp);
 2982 
 2983         if (metaslab_fragmentation_factor_enabled &&
 2984             msp->ms_fragmentation != ZFS_FRAG_INVALID) {
 2985                 /*
 2986                  * Use the fragmentation information to inversely scale
 2987                  * down the baseline weight. We need to ensure that we
 2988                  * don't exclude this metaslab completely when it's 100%
 2989                  * fragmented. To avoid this we reduce the fragmented value
 2990                  * by 1.
 2991                  */
 2992                 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
 2993 
 2994                 /*
 2995                  * If space < SPA_MINBLOCKSIZE, then we will not allocate from
 2996                  * this metaslab again. The fragmentation metric may have
 2997                  * decreased the space to something smaller than
 2998                  * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
 2999                  * so that we can consume any remaining space.
 3000                  */
 3001                 if (space > 0 && space < SPA_MINBLOCKSIZE)
 3002                         space = SPA_MINBLOCKSIZE;
 3003         }
 3004         weight = space;
 3005 
 3006         /*
 3007          * Modern disks have uniform bit density and constant angular velocity.
 3008          * Therefore, the outer recording zones are faster (higher bandwidth)
 3009          * than the inner zones by the ratio of outer to inner track diameter,
 3010          * which is typically around 2:1.  We account for this by assigning
 3011          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
 3012          * In effect, this means that we'll select the metaslab with the most
 3013          * free bandwidth rather than simply the one with the most free space.
 3014          */
 3015         if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
 3016                 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
 3017                 ASSERT(weight >= space && weight <= 2 * space);
 3018         }
 3019 
 3020         /*
 3021          * If this metaslab is one we're actively using, adjust its
 3022          * weight to make it preferable to any inactive metaslab so
 3023          * we'll polish it off. If the fragmentation on this metaslab
 3024          * has exceed our threshold, then don't mark it active.
 3025          */
 3026         if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 3027             msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
 3028                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 3029         }
 3030 
 3031         WEIGHT_SET_SPACEBASED(weight);
 3032         return (weight);
 3033 }
 3034 
 3035 /*
 3036  * Return the weight of the specified metaslab, according to the segment-based
 3037  * weighting algorithm. The metaslab must be loaded. This function can
 3038  * be called within a sync pass since it relies only on the metaslab's
 3039  * range tree which is always accurate when the metaslab is loaded.
 3040  */
 3041 static uint64_t
 3042 metaslab_weight_from_range_tree(metaslab_t *msp)
 3043 {
 3044         uint64_t weight = 0;
 3045         uint32_t segments = 0;
 3046 
 3047         ASSERT(msp->ms_loaded);
 3048 
 3049         for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
 3050             i--) {
 3051                 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
 3052                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 3053 
 3054                 segments <<= 1;
 3055                 segments += msp->ms_allocatable->rt_histogram[i];
 3056 
 3057                 /*
 3058                  * The range tree provides more precision than the space map
 3059                  * and must be downgraded so that all values fit within the
 3060                  * space map's histogram. This allows us to compare loaded
 3061                  * vs. unloaded metaslabs to determine which metaslab is
 3062                  * considered "best".
 3063                  */
 3064                 if (i > max_idx)
 3065                         continue;
 3066 
 3067                 if (segments != 0) {
 3068                         WEIGHT_SET_COUNT(weight, segments);
 3069                         WEIGHT_SET_INDEX(weight, i);
 3070                         WEIGHT_SET_ACTIVE(weight, 0);
 3071                         break;
 3072                 }
 3073         }
 3074         return (weight);
 3075 }
 3076 
 3077 /*
 3078  * Calculate the weight based on the on-disk histogram. Should be applied
 3079  * only to unloaded metaslabs  (i.e no incoming allocations) in-order to
 3080  * give results consistent with the on-disk state
 3081  */
 3082 static uint64_t
 3083 metaslab_weight_from_spacemap(metaslab_t *msp)
 3084 {
 3085         space_map_t *sm = msp->ms_sm;
 3086         ASSERT(!msp->ms_loaded);
 3087         ASSERT(sm != NULL);
 3088         ASSERT3U(space_map_object(sm), !=, 0);
 3089         ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 3090 
 3091         /*
 3092          * Create a joint histogram from all the segments that have made
 3093          * it to the metaslab's space map histogram, that are not yet
 3094          * available for allocation because they are still in the freeing
 3095          * pipeline (e.g. freeing, freed, and defer trees). Then subtract
 3096          * these segments from the space map's histogram to get a more
 3097          * accurate weight.
 3098          */
 3099         uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
 3100         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
 3101                 deferspace_histogram[i] += msp->ms_synchist[i];
 3102         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 3103                 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 3104                         deferspace_histogram[i] += msp->ms_deferhist[t][i];
 3105                 }
 3106         }
 3107 
 3108         uint64_t weight = 0;
 3109         for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
 3110                 ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
 3111                     deferspace_histogram[i]);
 3112                 uint64_t count =
 3113                     sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
 3114                 if (count != 0) {
 3115                         WEIGHT_SET_COUNT(weight, count);
 3116                         WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
 3117                         WEIGHT_SET_ACTIVE(weight, 0);
 3118                         break;
 3119                 }
 3120         }
 3121         return (weight);
 3122 }
 3123 
 3124 /*
 3125  * Compute a segment-based weight for the specified metaslab. The weight
 3126  * is determined by highest bucket in the histogram. The information
 3127  * for the highest bucket is encoded into the weight value.
 3128  */
 3129 static uint64_t
 3130 metaslab_segment_weight(metaslab_t *msp)
 3131 {
 3132         metaslab_group_t *mg = msp->ms_group;
 3133         uint64_t weight = 0;
 3134         uint8_t shift = mg->mg_vd->vdev_ashift;
 3135 
 3136         ASSERT(MUTEX_HELD(&msp->ms_lock));
 3137 
 3138         /*
 3139          * The metaslab is completely free.
 3140          */
 3141         if (metaslab_allocated_space(msp) == 0) {
 3142                 int idx = highbit64(msp->ms_size) - 1;
 3143                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 3144 
 3145                 if (idx < max_idx) {
 3146                         WEIGHT_SET_COUNT(weight, 1ULL);
 3147                         WEIGHT_SET_INDEX(weight, idx);
 3148                 } else {
 3149                         WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
 3150                         WEIGHT_SET_INDEX(weight, max_idx);
 3151                 }
 3152                 WEIGHT_SET_ACTIVE(weight, 0);
 3153                 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
 3154                 return (weight);
 3155         }
 3156 
 3157         ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 3158 
 3159         /*
 3160          * If the metaslab is fully allocated then just make the weight 0.
 3161          */
 3162         if (metaslab_allocated_space(msp) == msp->ms_size)
 3163                 return (0);
 3164         /*
 3165          * If the metaslab is already loaded, then use the range tree to
 3166          * determine the weight. Otherwise, we rely on the space map information
 3167          * to generate the weight.
 3168          */
 3169         if (msp->ms_loaded) {
 3170                 weight = metaslab_weight_from_range_tree(msp);
 3171         } else {
 3172                 weight = metaslab_weight_from_spacemap(msp);
 3173         }
 3174 
 3175         /*
 3176          * If the metaslab was active the last time we calculated its weight
 3177          * then keep it active. We want to consume the entire region that
 3178          * is associated with this weight.
 3179          */
 3180         if (msp->ms_activation_weight != 0 && weight != 0)
 3181                 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
 3182         return (weight);
 3183 }
 3184 
 3185 /*
 3186  * Determine if we should attempt to allocate from this metaslab. If the
 3187  * metaslab is loaded, then we can determine if the desired allocation
 3188  * can be satisfied by looking at the size of the maximum free segment
 3189  * on that metaslab. Otherwise, we make our decision based on the metaslab's
 3190  * weight. For segment-based weighting we can determine the maximum
 3191  * allocation based on the index encoded in its value. For space-based
 3192  * weights we rely on the entire weight (excluding the weight-type bit).
 3193  */
 3194 static boolean_t
 3195 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
 3196 {
 3197         /*
 3198          * If the metaslab is loaded, ms_max_size is definitive and we can use
 3199          * the fast check. If it's not, the ms_max_size is a lower bound (once
 3200          * set), and we should use the fast check as long as we're not in
 3201          * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
 3202          * seconds since the metaslab was unloaded.
 3203          */
 3204         if (msp->ms_loaded ||
 3205             (msp->ms_max_size != 0 && !try_hard && gethrtime() <
 3206             msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
 3207                 return (msp->ms_max_size >= asize);
 3208 
 3209         boolean_t should_allocate;
 3210         if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
 3211                 /*
 3212                  * The metaslab segment weight indicates segments in the
 3213                  * range [2^i, 2^(i+1)), where i is the index in the weight.
 3214                  * Since the asize might be in the middle of the range, we
 3215                  * should attempt the allocation if asize < 2^(i+1).
 3216                  */
 3217                 should_allocate = (asize <
 3218                     1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
 3219         } else {
 3220                 should_allocate = (asize <=
 3221                     (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
 3222         }
 3223 
 3224         return (should_allocate);
 3225 }
 3226 
 3227 static uint64_t
 3228 metaslab_weight(metaslab_t *msp, boolean_t nodirty)
 3229 {
 3230         vdev_t *vd = msp->ms_group->mg_vd;
 3231         spa_t *spa = vd->vdev_spa;
 3232         uint64_t weight;
 3233 
 3234         ASSERT(MUTEX_HELD(&msp->ms_lock));
 3235 
 3236         metaslab_set_fragmentation(msp, nodirty);
 3237 
 3238         /*
 3239          * Update the maximum size. If the metaslab is loaded, this will
 3240          * ensure that we get an accurate maximum size if newly freed space
 3241          * has been added back into the free tree. If the metaslab is
 3242          * unloaded, we check if there's a larger free segment in the
 3243          * unflushed frees. This is a lower bound on the largest allocatable
 3244          * segment size. Coalescing of adjacent entries may reveal larger
 3245          * allocatable segments, but we aren't aware of those until loading
 3246          * the space map into a range tree.
 3247          */
 3248         if (msp->ms_loaded) {
 3249                 msp->ms_max_size = metaslab_largest_allocatable(msp);
 3250         } else {
 3251                 msp->ms_max_size = MAX(msp->ms_max_size,
 3252                     metaslab_largest_unflushed_free(msp));
 3253         }
 3254 
 3255         /*
 3256          * Segment-based weighting requires space map histogram support.
 3257          */
 3258         if (zfs_metaslab_segment_weight_enabled &&
 3259             spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
 3260             (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
 3261             sizeof (space_map_phys_t))) {
 3262                 weight = metaslab_segment_weight(msp);
 3263         } else {
 3264                 weight = metaslab_space_weight(msp);
 3265         }
 3266         return (weight);
 3267 }
 3268 
 3269 void
 3270 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
 3271 {
 3272         ASSERT(MUTEX_HELD(&msp->ms_lock));
 3273 
 3274         /* note: we preserve the mask (e.g. indication of primary, etc..) */
 3275         uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 3276         metaslab_group_sort(msp->ms_group, msp,
 3277             metaslab_weight(msp, B_FALSE) | was_active);
 3278 }
 3279 
 3280 static int
 3281 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
 3282     int allocator, uint64_t activation_weight)
 3283 {
 3284         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 3285         ASSERT(MUTEX_HELD(&msp->ms_lock));
 3286 
 3287         /*
 3288          * If we're activating for the claim code, we don't want to actually
 3289          * set the metaslab up for a specific allocator.
 3290          */
 3291         if (activation_weight == METASLAB_WEIGHT_CLAIM) {
 3292                 ASSERT0(msp->ms_activation_weight);
 3293                 msp->ms_activation_weight = msp->ms_weight;
 3294                 metaslab_group_sort(mg, msp, msp->ms_weight |
 3295                     activation_weight);
 3296                 return (0);
 3297         }
 3298 
 3299         metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
 3300             &mga->mga_primary : &mga->mga_secondary);
 3301 
 3302         mutex_enter(&mg->mg_lock);
 3303         if (*mspp != NULL) {
 3304                 mutex_exit(&mg->mg_lock);
 3305                 return (EEXIST);
 3306         }
 3307 
 3308         *mspp = msp;
 3309         ASSERT3S(msp->ms_allocator, ==, -1);
 3310         msp->ms_allocator = allocator;
 3311         msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
 3312 
 3313         ASSERT0(msp->ms_activation_weight);
 3314         msp->ms_activation_weight = msp->ms_weight;
 3315         metaslab_group_sort_impl(mg, msp,
 3316             msp->ms_weight | activation_weight);
 3317         mutex_exit(&mg->mg_lock);
 3318 
 3319         return (0);
 3320 }
 3321 
 3322 static int
 3323 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
 3324 {
 3325         ASSERT(MUTEX_HELD(&msp->ms_lock));
 3326 
 3327         /*
 3328          * The current metaslab is already activated for us so there
 3329          * is nothing to do. Already activated though, doesn't mean
 3330          * that this metaslab is activated for our allocator nor our
 3331          * requested activation weight. The metaslab could have started
 3332          * as an active one for our allocator but changed allocators
 3333          * while we were waiting to grab its ms_lock or we stole it
 3334          * [see find_valid_metaslab()]. This means that there is a
 3335          * possibility of passivating a metaslab of another allocator
 3336          * or from a different activation mask, from this thread.
 3337          */
 3338         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 3339                 ASSERT(msp->ms_loaded);
 3340                 return (0);
 3341         }
 3342 
 3343         int error = metaslab_load(msp);
 3344         if (error != 0) {
 3345                 metaslab_group_sort(msp->ms_group, msp, 0);
 3346                 return (error);
 3347         }
 3348 
 3349         /*
 3350          * When entering metaslab_load() we may have dropped the
 3351          * ms_lock because we were loading this metaslab, or we
 3352          * were waiting for another thread to load it for us. In
 3353          * that scenario, we recheck the weight of the metaslab
 3354          * to see if it was activated by another thread.
 3355          *
 3356          * If the metaslab was activated for another allocator or
 3357          * it was activated with a different activation weight (e.g.
 3358          * we wanted to make it a primary but it was activated as
 3359          * secondary) we return error (EBUSY).
 3360          *
 3361          * If the metaslab was activated for the same allocator
 3362          * and requested activation mask, skip activating it.
 3363          */
 3364         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 3365                 if (msp->ms_allocator != allocator)
 3366                         return (EBUSY);
 3367 
 3368                 if ((msp->ms_weight & activation_weight) == 0)
 3369                         return (SET_ERROR(EBUSY));
 3370 
 3371                 EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
 3372                     msp->ms_primary);
 3373                 return (0);
 3374         }
 3375 
 3376         /*
 3377          * If the metaslab has literally 0 space, it will have weight 0. In
 3378          * that case, don't bother activating it. This can happen if the
 3379          * metaslab had space during find_valid_metaslab, but another thread
 3380          * loaded it and used all that space while we were waiting to grab the
 3381          * lock.
 3382          */
 3383         if (msp->ms_weight == 0) {
 3384                 ASSERT0(range_tree_space(msp->ms_allocatable));
 3385                 return (SET_ERROR(ENOSPC));
 3386         }
 3387 
 3388         if ((error = metaslab_activate_allocator(msp->ms_group, msp,
 3389             allocator, activation_weight)) != 0) {
 3390                 return (error);
 3391         }
 3392 
 3393         ASSERT(msp->ms_loaded);
 3394         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 3395 
 3396         return (0);
 3397 }
 3398 
 3399 static void
 3400 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
 3401     uint64_t weight)
 3402 {
 3403         ASSERT(MUTEX_HELD(&msp->ms_lock));
 3404         ASSERT(msp->ms_loaded);
 3405 
 3406         if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
 3407                 metaslab_group_sort(mg, msp, weight);
 3408                 return;
 3409         }
 3410 
 3411         mutex_enter(&mg->mg_lock);
 3412         ASSERT3P(msp->ms_group, ==, mg);
 3413         ASSERT3S(0, <=, msp->ms_allocator);
 3414         ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
 3415 
 3416         metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
 3417         if (msp->ms_primary) {
 3418                 ASSERT3P(mga->mga_primary, ==, msp);
 3419                 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 3420                 mga->mga_primary = NULL;
 3421         } else {
 3422                 ASSERT3P(mga->mga_secondary, ==, msp);
 3423                 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 3424                 mga->mga_secondary = NULL;
 3425         }
 3426         msp->ms_allocator = -1;
 3427         metaslab_group_sort_impl(mg, msp, weight);
 3428         mutex_exit(&mg->mg_lock);
 3429 }
 3430 
 3431 static void
 3432 metaslab_passivate(metaslab_t *msp, uint64_t weight)
 3433 {
 3434         uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;
 3435 
 3436         /*
 3437          * If size < SPA_MINBLOCKSIZE, then we will not allocate from
 3438          * this metaslab again.  In that case, it had better be empty,
 3439          * or we would be leaving space on the table.
 3440          */
 3441         ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
 3442             size >= SPA_MINBLOCKSIZE ||
 3443             range_tree_space(msp->ms_allocatable) == 0);
 3444         ASSERT0(weight & METASLAB_ACTIVE_MASK);
 3445 
 3446         ASSERT(msp->ms_activation_weight != 0);
 3447         msp->ms_activation_weight = 0;
 3448         metaslab_passivate_allocator(msp->ms_group, msp, weight);
 3449         ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
 3450 }
 3451 
 3452 /*
 3453  * Segment-based metaslabs are activated once and remain active until
 3454  * we either fail an allocation attempt (similar to space-based metaslabs)
 3455  * or have exhausted the free space in zfs_metaslab_switch_threshold
 3456  * buckets since the metaslab was activated. This function checks to see
 3457  * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
 3458  * metaslab and passivates it proactively. This will allow us to select a
 3459  * metaslab with a larger contiguous region, if any, remaining within this
 3460  * metaslab group. If we're in sync pass > 1, then we continue using this
 3461  * metaslab so that we don't dirty more block and cause more sync passes.
 3462  */
 3463 static void
 3464 metaslab_segment_may_passivate(metaslab_t *msp)
 3465 {
 3466         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 3467 
 3468         if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
 3469                 return;
 3470 
 3471         /*
 3472          * Since we are in the middle of a sync pass, the most accurate
 3473          * information that is accessible to us is the in-core range tree
 3474          * histogram; calculate the new weight based on that information.
 3475          */
 3476         uint64_t weight = metaslab_weight_from_range_tree(msp);
 3477         int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
 3478         int current_idx = WEIGHT_GET_INDEX(weight);
 3479 
 3480         if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
 3481                 metaslab_passivate(msp, weight);
 3482 }
 3483 
 3484 static void
 3485 metaslab_preload(void *arg)
 3486 {
 3487         metaslab_t *msp = arg;
 3488         metaslab_class_t *mc = msp->ms_group->mg_class;
 3489         spa_t *spa = mc->mc_spa;
 3490         fstrans_cookie_t cookie = spl_fstrans_mark();
 3491 
 3492         ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 3493 
 3494         mutex_enter(&msp->ms_lock);
 3495         (void) metaslab_load(msp);
 3496         metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
 3497         mutex_exit(&msp->ms_lock);
 3498         spl_fstrans_unmark(cookie);
 3499 }
 3500 
 3501 static void
 3502 metaslab_group_preload(metaslab_group_t *mg)
 3503 {
 3504         spa_t *spa = mg->mg_vd->vdev_spa;
 3505         metaslab_t *msp;
 3506         avl_tree_t *t = &mg->mg_metaslab_tree;
 3507         int m = 0;
 3508 
 3509         if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
 3510                 taskq_wait_outstanding(mg->mg_taskq, 0);
 3511                 return;
 3512         }
 3513 
 3514         mutex_enter(&mg->mg_lock);
 3515 
 3516         /*
 3517          * Load the next potential metaslabs
 3518          */
 3519         for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
 3520                 ASSERT3P(msp->ms_group, ==, mg);
 3521 
 3522                 /*
 3523                  * We preload only the maximum number of metaslabs specified
 3524                  * by metaslab_preload_limit. If a metaslab is being forced
 3525                  * to condense then we preload it too. This will ensure
 3526                  * that force condensing happens in the next txg.
 3527                  */
 3528                 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
 3529                         continue;
 3530                 }
 3531 
 3532                 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
 3533                     msp, TQ_SLEEP) != TASKQID_INVALID);
 3534         }
 3535         mutex_exit(&mg->mg_lock);
 3536 }
 3537 
 3538 /*
 3539  * Determine if the space map's on-disk footprint is past our tolerance for
 3540  * inefficiency. We would like to use the following criteria to make our
 3541  * decision:
 3542  *
 3543  * 1. Do not condense if the size of the space map object would dramatically
 3544  *    increase as a result of writing out the free space range tree.
 3545  *
 3546  * 2. Condense if the on on-disk space map representation is at least
 3547  *    zfs_condense_pct/100 times the size of the optimal representation
 3548  *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
 3549  *
 3550  * 3. Do not condense if the on-disk size of the space map does not actually
 3551  *    decrease.
 3552  *
 3553  * Unfortunately, we cannot compute the on-disk size of the space map in this
 3554  * context because we cannot accurately compute the effects of compression, etc.
 3555  * Instead, we apply the heuristic described in the block comment for
 3556  * zfs_metaslab_condense_block_threshold - we only condense if the space used
 3557  * is greater than a threshold number of blocks.
 3558  */
 3559 static boolean_t
 3560 metaslab_should_condense(metaslab_t *msp)
 3561 {
 3562         space_map_t *sm = msp->ms_sm;
 3563         vdev_t *vd = msp->ms_group->mg_vd;
 3564         uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift;
 3565 
 3566         ASSERT(MUTEX_HELD(&msp->ms_lock));
 3567         ASSERT(msp->ms_loaded);
 3568         ASSERT(sm != NULL);
 3569         ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
 3570 
 3571         /*
 3572          * We always condense metaslabs that are empty and metaslabs for
 3573          * which a condense request has been made.
 3574          */
 3575         if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
 3576             msp->ms_condense_wanted)
 3577                 return (B_TRUE);
 3578 
 3579         uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
 3580         uint64_t object_size = space_map_length(sm);
 3581         uint64_t optimal_size = space_map_estimate_optimal_size(sm,
 3582             msp->ms_allocatable, SM_NO_VDEVID);
 3583 
 3584         return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
 3585             object_size > zfs_metaslab_condense_block_threshold * record_size);
 3586 }
 3587 
 3588 /*
 3589  * Condense the on-disk space map representation to its minimized form.
 3590  * The minimized form consists of a small number of allocations followed
 3591  * by the entries of the free range tree (ms_allocatable). The condensed
 3592  * spacemap contains all the entries of previous TXGs (including those in
 3593  * the pool-wide log spacemaps; thus this is effectively a superset of
 3594  * metaslab_flush()), but this TXG's entries still need to be written.
 3595  */
 3596 static void
 3597 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 3598 {
 3599         range_tree_t *condense_tree;
 3600         space_map_t *sm = msp->ms_sm;
 3601         uint64_t txg = dmu_tx_get_txg(tx);
 3602         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 3603 
 3604         ASSERT(MUTEX_HELD(&msp->ms_lock));
 3605         ASSERT(msp->ms_loaded);
 3606         ASSERT(msp->ms_sm != NULL);
 3607 
 3608         /*
 3609          * In order to condense the space map, we need to change it so it
 3610          * only describes which segments are currently allocated and free.
 3611          *
 3612          * All the current free space resides in the ms_allocatable, all
 3613          * the ms_defer trees, and all the ms_allocating trees. We ignore
 3614          * ms_freed because it is empty because we're in sync pass 1. We
 3615          * ignore ms_freeing because these changes are not yet reflected
 3616          * in the spacemap (they will be written later this txg).
 3617          *
 3618          * So to truncate the space map to represent all the entries of
 3619          * previous TXGs we do the following:
 3620          *
 3621          * 1] We create a range tree (condense tree) that is 100% empty.
 3622          * 2] We add to it all segments found in the ms_defer trees
 3623          *    as those segments are marked as free in the original space
 3624          *    map. We do the same with the ms_allocating trees for the same
 3625          *    reason. Adding these segments should be a relatively
 3626          *    inexpensive operation since we expect these trees to have a
 3627          *    small number of nodes.
 3628          * 3] We vacate any unflushed allocs, since they are not frees we
 3629          *    need to add to the condense tree. Then we vacate any
 3630          *    unflushed frees as they should already be part of ms_allocatable.
 3631          * 4] At this point, we would ideally like to add all segments
 3632          *    in the ms_allocatable tree from the condense tree. This way
 3633          *    we would write all the entries of the condense tree as the
 3634          *    condensed space map, which would only contain freed
 3635          *    segments with everything else assumed to be allocated.
 3636          *
 3637          *    Doing so can be prohibitively expensive as ms_allocatable can
 3638          *    be large, and therefore computationally expensive to add to
 3639          *    the condense_tree. Instead we first sync out an entry marking
 3640          *    everything as allocated, then the condense_tree and then the
 3641          *    ms_allocatable, in the condensed space map. While this is not
 3642          *    optimal, it is typically close to optimal and more importantly
 3643          *    much cheaper to compute.
 3644          *
 3645          * 5] Finally, as both of the unflushed trees were written to our
 3646          *    new and condensed metaslab space map, we basically flushed
 3647          *    all the unflushed changes to disk, thus we call
 3648          *    metaslab_flush_update().
 3649          */
 3650         ASSERT3U(spa_sync_pass(spa), ==, 1);
 3651         ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
 3652 
 3653         zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
 3654             "spa %s, smp size %llu, segments %llu, forcing condense=%s",
 3655             (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp,
 3656             (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 3657             spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm),
 3658             (u_longlong_t)range_tree_numsegs(msp->ms_allocatable),
 3659             msp->ms_condense_wanted ? "TRUE" : "FALSE");
 3660 
 3661         msp->ms_condense_wanted = B_FALSE;
 3662 
 3663         range_seg_type_t type;
 3664         uint64_t shift, start;
 3665         type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
 3666             &start, &shift);
 3667 
 3668         condense_tree = range_tree_create(NULL, type, NULL, start, shift);
 3669 
 3670         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 3671                 range_tree_walk(msp->ms_defer[t],
 3672                     range_tree_add, condense_tree);
 3673         }
 3674 
 3675         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 3676                 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
 3677                     range_tree_add, condense_tree);
 3678         }
 3679 
 3680         ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 3681             metaslab_unflushed_changes_memused(msp));
 3682         spa->spa_unflushed_stats.sus_memused -=
 3683             metaslab_unflushed_changes_memused(msp);
 3684         range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 3685         range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 3686 
 3687         /*
 3688          * We're about to drop the metaslab's lock thus allowing other
 3689          * consumers to change it's content. Set the metaslab's ms_condensing
 3690          * flag to ensure that allocations on this metaslab do not occur
 3691          * while we're in the middle of committing it to disk. This is only
 3692          * critical for ms_allocatable as all other range trees use per TXG
 3693          * views of their content.
 3694          */
 3695         msp->ms_condensing = B_TRUE;
 3696 
 3697         mutex_exit(&msp->ms_lock);
 3698         uint64_t object = space_map_object(msp->ms_sm);
 3699         space_map_truncate(sm,
 3700             spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
 3701             zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
 3702 
 3703         /*
 3704          * space_map_truncate() may have reallocated the spacemap object.
 3705          * If so, update the vdev_ms_array.
 3706          */
 3707         if (space_map_object(msp->ms_sm) != object) {
 3708                 object = space_map_object(msp->ms_sm);
 3709                 dmu_write(spa->spa_meta_objset,
 3710                     msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
 3711                     msp->ms_id, sizeof (uint64_t), &object, tx);
 3712         }
 3713 
 3714         /*
 3715          * Note:
 3716          * When the log space map feature is enabled, each space map will
 3717          * always have ALLOCS followed by FREES for each sync pass. This is
 3718          * typically true even when the log space map feature is disabled,
 3719          * except from the case where a metaslab goes through metaslab_sync()
 3720          * and gets condensed. In that case the metaslab's space map will have
 3721          * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
 3722          * followed by FREES (due to space_map_write() in metaslab_sync()) for
 3723          * sync pass 1.
 3724          */
 3725         range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
 3726             shift);
 3727         range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
 3728         space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 3729         space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
 3730         space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
 3731 
 3732         range_tree_vacate(condense_tree, NULL, NULL);
 3733         range_tree_destroy(condense_tree);
 3734         range_tree_vacate(tmp_tree, NULL, NULL);
 3735         range_tree_destroy(tmp_tree);
 3736         mutex_enter(&msp->ms_lock);
 3737 
 3738         msp->ms_condensing = B_FALSE;
 3739         metaslab_flush_update(msp, tx);
 3740 }
 3741 
 3742 static void
 3743 metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
 3744 {
 3745         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 3746         ASSERT(spa_syncing_log_sm(spa) != NULL);
 3747         ASSERT(msp->ms_sm != NULL);
 3748         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 3749         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 3750 
 3751         mutex_enter(&spa->spa_flushed_ms_lock);
 3752         metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
 3753         metaslab_set_unflushed_dirty(msp, B_TRUE);
 3754         avl_add(&spa->spa_metaslabs_by_flushed, msp);
 3755         mutex_exit(&spa->spa_flushed_ms_lock);
 3756 
 3757         spa_log_sm_increment_current_mscount(spa);
 3758         spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
 3759 }
 3760 
 3761 void
 3762 metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
 3763 {
 3764         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 3765         ASSERT(spa_syncing_log_sm(spa) != NULL);
 3766         ASSERT(msp->ms_sm != NULL);
 3767         ASSERT(metaslab_unflushed_txg(msp) != 0);
 3768         ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
 3769         ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 3770         ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 3771 
 3772         VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
 3773 
 3774         /* update metaslab's position in our flushing tree */
 3775         uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
 3776         boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
 3777         mutex_enter(&spa->spa_flushed_ms_lock);
 3778         avl_remove(&spa->spa_metaslabs_by_flushed, msp);
 3779         metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
 3780         metaslab_set_unflushed_dirty(msp, dirty);
 3781         avl_add(&spa->spa_metaslabs_by_flushed, msp);
 3782         mutex_exit(&spa->spa_flushed_ms_lock);
 3783 
 3784         /* update metaslab counts of spa_log_sm_t nodes */
 3785         spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
 3786         spa_log_sm_increment_current_mscount(spa);
 3787 
 3788         /* update log space map summary */
 3789         spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
 3790             ms_prev_flushed_dirty);
 3791         spa_log_summary_add_flushed_metaslab(spa, dirty);
 3792 
 3793         /* cleanup obsolete logs if any */
 3794         spa_cleanup_old_sm_logs(spa, tx);
 3795 }
 3796 
 3797 /*
 3798  * Called when the metaslab has been flushed (its own spacemap now reflects
 3799  * all the contents of the pool-wide spacemap log). Updates the metaslab's
 3800  * metadata and any pool-wide related log space map data (e.g. summary,
 3801  * obsolete logs, etc..) to reflect that.
 3802  */
 3803 static void
 3804 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
 3805 {
 3806         metaslab_group_t *mg = msp->ms_group;
 3807         spa_t *spa = mg->mg_vd->vdev_spa;
 3808 
 3809         ASSERT(MUTEX_HELD(&msp->ms_lock));
 3810 
 3811         ASSERT3U(spa_sync_pass(spa), ==, 1);
 3812 
 3813         /*
 3814          * Just because a metaslab got flushed, that doesn't mean that
 3815          * it will pass through metaslab_sync_done(). Thus, make sure to
 3816          * update ms_synced_length here in case it doesn't.
 3817          */
 3818         msp->ms_synced_length = space_map_length(msp->ms_sm);
 3819 
 3820         /*
 3821          * We may end up here from metaslab_condense() without the
 3822          * feature being active. In that case this is a no-op.
 3823          */
 3824         if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
 3825             metaslab_unflushed_txg(msp) == 0)
 3826                 return;
 3827 
 3828         metaslab_unflushed_bump(msp, tx, B_FALSE);
 3829 }
 3830 
 3831 boolean_t
 3832 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
 3833 {
 3834         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 3835 
 3836         ASSERT(MUTEX_HELD(&msp->ms_lock));
 3837         ASSERT3U(spa_sync_pass(spa), ==, 1);
 3838         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 3839 
 3840         ASSERT(msp->ms_sm != NULL);
 3841         ASSERT(metaslab_unflushed_txg(msp) != 0);
 3842         ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
 3843 
 3844         /*
 3845          * There is nothing wrong with flushing the same metaslab twice, as
 3846          * this codepath should work on that case. However, the current
 3847          * flushing scheme makes sure to avoid this situation as we would be
 3848          * making all these calls without having anything meaningful to write
 3849          * to disk. We assert this behavior here.
 3850          */
 3851         ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
 3852 
 3853         /*
 3854          * We can not flush while loading, because then we would
 3855          * not load the ms_unflushed_{allocs,frees}.
 3856          */
 3857         if (msp->ms_loading)
 3858                 return (B_FALSE);
 3859 
 3860         metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 3861         metaslab_verify_weight_and_frag(msp);
 3862 
 3863         /*
 3864          * Metaslab condensing is effectively flushing. Therefore if the
 3865          * metaslab can be condensed we can just condense it instead of
 3866          * flushing it.
 3867          *
 3868          * Note that metaslab_condense() does call metaslab_flush_update()
 3869          * so we can just return immediately after condensing. We also
 3870          * don't need to care about setting ms_flushing or broadcasting
 3871          * ms_flush_cv, even if we temporarily drop the ms_lock in
 3872          * metaslab_condense(), as the metaslab is already loaded.
 3873          */
 3874         if (msp->ms_loaded && metaslab_should_condense(msp)) {
 3875                 metaslab_group_t *mg = msp->ms_group;
 3876 
 3877                 /*
 3878                  * For all histogram operations below refer to the
 3879                  * comments of metaslab_sync() where we follow a
 3880                  * similar procedure.
 3881                  */
 3882                 metaslab_group_histogram_verify(mg);
 3883                 metaslab_class_histogram_verify(mg->mg_class);
 3884                 metaslab_group_histogram_remove(mg, msp);
 3885 
 3886                 metaslab_condense(msp, tx);
 3887 
 3888                 space_map_histogram_clear(msp->ms_sm);
 3889                 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 3890                 ASSERT(range_tree_is_empty(msp->ms_freed));
 3891                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 3892                         space_map_histogram_add(msp->ms_sm,
 3893                             msp->ms_defer[t], tx);
 3894                 }
 3895                 metaslab_aux_histograms_update(msp);
 3896 
 3897                 metaslab_group_histogram_add(mg, msp);
 3898                 metaslab_group_histogram_verify(mg);
 3899                 metaslab_class_histogram_verify(mg->mg_class);
 3900 
 3901                 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 3902 
 3903                 /*
 3904                  * Since we recreated the histogram (and potentially
 3905                  * the ms_sm too while condensing) ensure that the
 3906                  * weight is updated too because we are not guaranteed
 3907                  * that this metaslab is dirty and will go through
 3908                  * metaslab_sync_done().
 3909                  */
 3910                 metaslab_recalculate_weight_and_sort(msp);
 3911                 return (B_TRUE);
 3912         }
 3913 
 3914         msp->ms_flushing = B_TRUE;
 3915         uint64_t sm_len_before = space_map_length(msp->ms_sm);
 3916 
 3917         mutex_exit(&msp->ms_lock);
 3918         space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
 3919             SM_NO_VDEVID, tx);
 3920         space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
 3921             SM_NO_VDEVID, tx);
 3922         mutex_enter(&msp->ms_lock);
 3923 
 3924         uint64_t sm_len_after = space_map_length(msp->ms_sm);
 3925         if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
 3926                 zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
 3927                     "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
 3928                     "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx),
 3929                     spa_name(spa),
 3930                     (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 3931                     (u_longlong_t)msp->ms_id,
 3932                     (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
 3933                     (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
 3934                     (u_longlong_t)(sm_len_after - sm_len_before));
 3935         }
 3936 
 3937         ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 3938             metaslab_unflushed_changes_memused(msp));
 3939         spa->spa_unflushed_stats.sus_memused -=
 3940             metaslab_unflushed_changes_memused(msp);
 3941         range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 3942         range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 3943 
 3944         metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 3945         metaslab_verify_weight_and_frag(msp);
 3946 
 3947         metaslab_flush_update(msp, tx);
 3948 
 3949         metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 3950         metaslab_verify_weight_and_frag(msp);
 3951 
 3952         msp->ms_flushing = B_FALSE;
 3953         cv_broadcast(&msp->ms_flush_cv);
 3954         return (B_TRUE);
 3955 }
 3956 
 3957 /*
 3958  * Write a metaslab to disk in the context of the specified transaction group.
 3959  */
 3960 void
 3961 metaslab_sync(metaslab_t *msp, uint64_t txg)
 3962 {
 3963         metaslab_group_t *mg = msp->ms_group;
 3964         vdev_t *vd = mg->mg_vd;
 3965         spa_t *spa = vd->vdev_spa;
 3966         objset_t *mos = spa_meta_objset(spa);
 3967         range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
 3968         dmu_tx_t *tx;
 3969 
 3970         ASSERT(!vd->vdev_ishole);
 3971 
 3972         /*
 3973          * This metaslab has just been added so there's no work to do now.
 3974          */
 3975         if (msp->ms_new) {
 3976                 ASSERT0(range_tree_space(alloctree));
 3977                 ASSERT0(range_tree_space(msp->ms_freeing));
 3978                 ASSERT0(range_tree_space(msp->ms_freed));
 3979                 ASSERT0(range_tree_space(msp->ms_checkpointing));
 3980                 ASSERT0(range_tree_space(msp->ms_trim));
 3981                 return;
 3982         }
 3983 
 3984         /*
 3985          * Normally, we don't want to process a metaslab if there are no
 3986          * allocations or frees to perform. However, if the metaslab is being
 3987          * forced to condense, it's loaded and we're not beyond the final
 3988          * dirty txg, we need to let it through. Not condensing beyond the
 3989          * final dirty txg prevents an issue where metaslabs that need to be
 3990          * condensed but were loaded for other reasons could cause a panic
 3991          * here. By only checking the txg in that branch of the conditional,
 3992          * we preserve the utility of the VERIFY statements in all other
 3993          * cases.
 3994          */
 3995         if (range_tree_is_empty(alloctree) &&
 3996             range_tree_is_empty(msp->ms_freeing) &&
 3997             range_tree_is_empty(msp->ms_checkpointing) &&
 3998             !(msp->ms_loaded && msp->ms_condense_wanted &&
 3999             txg <= spa_final_dirty_txg(spa)))
 4000                 return;
 4001 
 4002 
 4003         VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
 4004 
 4005         /*
 4006          * The only state that can actually be changing concurrently
 4007          * with metaslab_sync() is the metaslab's ms_allocatable. No
 4008          * other thread can be modifying this txg's alloc, freeing,
 4009          * freed, or space_map_phys_t.  We drop ms_lock whenever we
 4010          * could call into the DMU, because the DMU can call down to
 4011          * us (e.g. via zio_free()) at any time.
 4012          *
 4013          * The spa_vdev_remove_thread() can be reading metaslab state
 4014          * concurrently, and it is locked out by the ms_sync_lock.
 4015          * Note that the ms_lock is insufficient for this, because it
 4016          * is dropped by space_map_write().
 4017          */
 4018         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 4019 
 4020         /*
 4021          * Generate a log space map if one doesn't exist already.
 4022          */
 4023         spa_generate_syncing_log_sm(spa, tx);
 4024 
 4025         if (msp->ms_sm == NULL) {
 4026                 uint64_t new_object = space_map_alloc(mos,
 4027                     spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
 4028                     zfs_metaslab_sm_blksz_with_log :
 4029                     zfs_metaslab_sm_blksz_no_log, tx);
 4030                 VERIFY3U(new_object, !=, 0);
 4031 
 4032                 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
 4033                     msp->ms_id, sizeof (uint64_t), &new_object, tx);
 4034 
 4035                 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
 4036                     msp->ms_start, msp->ms_size, vd->vdev_ashift));
 4037                 ASSERT(msp->ms_sm != NULL);
 4038 
 4039                 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 4040                 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 4041                 ASSERT0(metaslab_allocated_space(msp));
 4042         }
 4043 
 4044         if (!range_tree_is_empty(msp->ms_checkpointing) &&
 4045             vd->vdev_checkpoint_sm == NULL) {
 4046                 ASSERT(spa_has_checkpoint(spa));
 4047 
 4048                 uint64_t new_object = space_map_alloc(mos,
 4049                     zfs_vdev_standard_sm_blksz, tx);
 4050                 VERIFY3U(new_object, !=, 0);
 4051 
 4052                 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
 4053                     mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
 4054                 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 4055 
 4056                 /*
 4057                  * We save the space map object as an entry in vdev_top_zap
 4058                  * so it can be retrieved when the pool is reopened after an
 4059                  * export or through zdb.
 4060                  */
 4061                 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
 4062                     vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 4063                     sizeof (new_object), 1, &new_object, tx));
 4064         }
 4065 
 4066         mutex_enter(&msp->ms_sync_lock);
 4067         mutex_enter(&msp->ms_lock);
 4068 
 4069         /*
 4070          * Note: metaslab_condense() clears the space map's histogram.
 4071          * Therefore we must verify and remove this histogram before
 4072          * condensing.
 4073          */
 4074         metaslab_group_histogram_verify(mg);
 4075         metaslab_class_histogram_verify(mg->mg_class);
 4076         metaslab_group_histogram_remove(mg, msp);
 4077 
 4078         if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
 4079             metaslab_should_condense(msp))
 4080                 metaslab_condense(msp, tx);
 4081 
 4082         /*
 4083          * We'll be going to disk to sync our space accounting, thus we
 4084          * drop the ms_lock during that time so allocations coming from
 4085          * open-context (ZIL) for future TXGs do not block.
 4086          */
 4087         mutex_exit(&msp->ms_lock);
 4088         space_map_t *log_sm = spa_syncing_log_sm(spa);
 4089         if (log_sm != NULL) {
 4090                 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
 4091                 if (metaslab_unflushed_txg(msp) == 0)
 4092                         metaslab_unflushed_add(msp, tx);
 4093                 else if (!metaslab_unflushed_dirty(msp))
 4094                         metaslab_unflushed_bump(msp, tx, B_TRUE);
 4095 
 4096                 space_map_write(log_sm, alloctree, SM_ALLOC,
 4097                     vd->vdev_id, tx);
 4098                 space_map_write(log_sm, msp->ms_freeing, SM_FREE,
 4099                     vd->vdev_id, tx);
 4100                 mutex_enter(&msp->ms_lock);
 4101 
 4102                 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 4103                     metaslab_unflushed_changes_memused(msp));
 4104                 spa->spa_unflushed_stats.sus_memused -=
 4105                     metaslab_unflushed_changes_memused(msp);
 4106                 range_tree_remove_xor_add(alloctree,
 4107                     msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
 4108                 range_tree_remove_xor_add(msp->ms_freeing,
 4109                     msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
 4110                 spa->spa_unflushed_stats.sus_memused +=
 4111                     metaslab_unflushed_changes_memused(msp);
 4112         } else {
 4113                 ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
 4114 
 4115                 space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
 4116                     SM_NO_VDEVID, tx);
 4117                 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
 4118                     SM_NO_VDEVID, tx);
 4119                 mutex_enter(&msp->ms_lock);
 4120         }
 4121 
 4122         msp->ms_allocated_space += range_tree_space(alloctree);
 4123         ASSERT3U(msp->ms_allocated_space, >=,
 4124             range_tree_space(msp->ms_freeing));
 4125         msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
 4126 
 4127         if (!range_tree_is_empty(msp->ms_checkpointing)) {
 4128                 ASSERT(spa_has_checkpoint(spa));
 4129                 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 4130 
 4131                 /*
 4132                  * Since we are doing writes to disk and the ms_checkpointing
 4133                  * tree won't be changing during that time, we drop the
 4134                  * ms_lock while writing to the checkpoint space map, for the
 4135                  * same reason mentioned above.
 4136                  */
 4137                 mutex_exit(&msp->ms_lock);
 4138                 space_map_write(vd->vdev_checkpoint_sm,
 4139                     msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
 4140                 mutex_enter(&msp->ms_lock);
 4141 
 4142                 spa->spa_checkpoint_info.sci_dspace +=
 4143                     range_tree_space(msp->ms_checkpointing);
 4144                 vd->vdev_stat.vs_checkpoint_space +=
 4145                     range_tree_space(msp->ms_checkpointing);
 4146                 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
 4147                     -space_map_allocated(vd->vdev_checkpoint_sm));
 4148 
 4149                 range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
 4150         }
 4151 
 4152         if (msp->ms_loaded) {
 4153                 /*
 4154                  * When the space map is loaded, we have an accurate
 4155                  * histogram in the range tree. This gives us an opportunity
 4156                  * to bring the space map's histogram up-to-date so we clear
 4157                  * it first before updating it.
 4158                  */
 4159                 space_map_histogram_clear(msp->ms_sm);
 4160                 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 4161 
 4162                 /*
 4163                  * Since we've cleared the histogram we need to add back
 4164                  * any free space that has already been processed, plus
 4165                  * any deferred space. This allows the on-disk histogram
 4166                  * to accurately reflect all free space even if some space
 4167                  * is not yet available for allocation (i.e. deferred).
 4168                  */
 4169                 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
 4170 
 4171                 /*
 4172                  * Add back any deferred free space that has not been
 4173                  * added back into the in-core free tree yet. This will
 4174                  * ensure that we don't end up with a space map histogram
 4175                  * that is completely empty unless the metaslab is fully
 4176                  * allocated.
 4177                  */
 4178                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 4179                         space_map_histogram_add(msp->ms_sm,
 4180                             msp->ms_defer[t], tx);
 4181                 }
 4182         }
 4183 
 4184         /*
 4185          * Always add the free space from this sync pass to the space
 4186          * map histogram. We want to make sure that the on-disk histogram
 4187          * accounts for all free space. If the space map is not loaded,
 4188          * then we will lose some accuracy but will correct it the next
 4189          * time we load the space map.
 4190          */
 4191         space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
 4192         metaslab_aux_histograms_update(msp);
 4193 
 4194         metaslab_group_histogram_add(mg, msp);
 4195         metaslab_group_histogram_verify(mg);
 4196         metaslab_class_histogram_verify(mg->mg_class);
 4197 
 4198         /*
 4199          * For sync pass 1, we avoid traversing this txg's free range tree
 4200          * and instead will just swap the pointers for freeing and freed.
 4201          * We can safely do this since the freed_tree is guaranteed to be
 4202          * empty on the initial pass.
 4203          *
 4204          * Keep in mind that even if we are currently using a log spacemap
 4205          * we want current frees to end up in the ms_allocatable (but not
 4206          * get appended to the ms_sm) so their ranges can be reused as usual.
 4207          */
 4208         if (spa_sync_pass(spa) == 1) {
 4209                 range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
 4210                 ASSERT0(msp->ms_allocated_this_txg);
 4211         } else {
 4212                 range_tree_vacate(msp->ms_freeing,
 4213                     range_tree_add, msp->ms_freed);
 4214         }
 4215         msp->ms_allocated_this_txg += range_tree_space(alloctree);
 4216         range_tree_vacate(alloctree, NULL, NULL);
 4217 
 4218         ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 4219         ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
 4220             & TXG_MASK]));
 4221         ASSERT0(range_tree_space(msp->ms_freeing));
 4222         ASSERT0(range_tree_space(msp->ms_checkpointing));
 4223 
 4224         mutex_exit(&msp->ms_lock);
 4225 
 4226         /*
 4227          * Verify that the space map object ID has been recorded in the
 4228          * vdev_ms_array.
 4229          */
 4230         uint64_t object;
 4231         VERIFY0(dmu_read(mos, vd->vdev_ms_array,
 4232             msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
 4233         VERIFY3U(object, ==, space_map_object(msp->ms_sm));
 4234 
 4235         mutex_exit(&msp->ms_sync_lock);
 4236         dmu_tx_commit(tx);
 4237 }
 4238 
 4239 static void
 4240 metaslab_evict(metaslab_t *msp, uint64_t txg)
 4241 {
 4242         if (!msp->ms_loaded || msp->ms_disabled != 0)
 4243                 return;
 4244 
 4245         for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
 4246                 VERIFY0(range_tree_space(
 4247                     msp->ms_allocating[(txg + t) & TXG_MASK]));
 4248         }
 4249         if (msp->ms_allocator != -1)
 4250                 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
 4251 
 4252         if (!metaslab_debug_unload)
 4253                 metaslab_unload(msp);
 4254 }
 4255 
 4256 /*
 4257  * Called after a transaction group has completely synced to mark
 4258  * all of the metaslab's free space as usable.
 4259  */
 4260 void
 4261 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 4262 {
 4263         metaslab_group_t *mg = msp->ms_group;
 4264         vdev_t *vd = mg->mg_vd;
 4265         spa_t *spa = vd->vdev_spa;
 4266         range_tree_t **defer_tree;
 4267         int64_t alloc_delta, defer_delta;
 4268         boolean_t defer_allowed = B_TRUE;
 4269 
 4270         ASSERT(!vd->vdev_ishole);
 4271 
 4272         mutex_enter(&msp->ms_lock);
 4273 
 4274         if (msp->ms_new) {
 4275                 /* this is a new metaslab, add its capacity to the vdev */
 4276                 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
 4277 
 4278                 /* there should be no allocations nor frees at this point */
 4279                 VERIFY0(msp->ms_allocated_this_txg);
 4280                 VERIFY0(range_tree_space(msp->ms_freed));
 4281         }
 4282 
 4283         ASSERT0(range_tree_space(msp->ms_freeing));
 4284         ASSERT0(range_tree_space(msp->ms_checkpointing));
 4285 
 4286         defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
 4287 
 4288         uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
 4289             metaslab_class_get_alloc(spa_normal_class(spa));
 4290         if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
 4291                 defer_allowed = B_FALSE;
 4292         }
 4293 
 4294         defer_delta = 0;
 4295         alloc_delta = msp->ms_allocated_this_txg -
 4296             range_tree_space(msp->ms_freed);
 4297 
 4298         if (defer_allowed) {
 4299                 defer_delta = range_tree_space(msp->ms_freed) -
 4300                     range_tree_space(*defer_tree);
 4301         } else {
 4302                 defer_delta -= range_tree_space(*defer_tree);
 4303         }
 4304         metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
 4305             defer_delta, 0);
 4306 
 4307         if (spa_syncing_log_sm(spa) == NULL) {
 4308                 /*
 4309                  * If there's a metaslab_load() in progress and we don't have
 4310                  * a log space map, it means that we probably wrote to the
 4311                  * metaslab's space map. If this is the case, we need to
 4312                  * make sure that we wait for the load to complete so that we
 4313                  * have a consistent view at the in-core side of the metaslab.
 4314                  */
 4315                 metaslab_load_wait(msp);
 4316         } else {
 4317                 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 4318         }
 4319 
 4320         /*
 4321          * When auto-trimming is enabled, free ranges which are added to
 4322          * ms_allocatable are also be added to ms_trim.  The ms_trim tree is
 4323          * periodically consumed by the vdev_autotrim_thread() which issues
 4324          * trims for all ranges and then vacates the tree.  The ms_trim tree
 4325          * can be discarded at any time with the sole consequence of recent
 4326          * frees not being trimmed.
 4327          */
 4328         if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
 4329                 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
 4330                 if (!defer_allowed) {
 4331                         range_tree_walk(msp->ms_freed, range_tree_add,
 4332                             msp->ms_trim);
 4333                 }
 4334         } else {
 4335                 range_tree_vacate(msp->ms_trim, NULL, NULL);
 4336         }
 4337 
 4338         /*
 4339          * Move the frees from the defer_tree back to the free
 4340          * range tree (if it's loaded). Swap the freed_tree and
 4341          * the defer_tree -- this is safe to do because we've
 4342          * just emptied out the defer_tree.
 4343          */
 4344         range_tree_vacate(*defer_tree,
 4345             msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
 4346         if (defer_allowed) {
 4347                 range_tree_swap(&msp->ms_freed, defer_tree);
 4348         } else {
 4349                 range_tree_vacate(msp->ms_freed,
 4350                     msp->ms_loaded ? range_tree_add : NULL,
 4351                     msp->ms_allocatable);
 4352         }
 4353 
 4354         msp->ms_synced_length = space_map_length(msp->ms_sm);
 4355 
 4356         msp->ms_deferspace += defer_delta;
 4357         ASSERT3S(msp->ms_deferspace, >=, 0);
 4358         ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
 4359         if (msp->ms_deferspace != 0) {
 4360                 /*
 4361                  * Keep syncing this metaslab until all deferred frees
 4362                  * are back in circulation.
 4363                  */
 4364                 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 4365         }
 4366         metaslab_aux_histograms_update_done(msp, defer_allowed);
 4367 
 4368         if (msp->ms_new) {
 4369                 msp->ms_new = B_FALSE;
 4370                 mutex_enter(&mg->mg_lock);
 4371                 mg->mg_ms_ready++;
 4372                 mutex_exit(&mg->mg_lock);
 4373         }
 4374 
 4375         /*
 4376          * Re-sort metaslab within its group now that we've adjusted
 4377          * its allocatable space.
 4378          */
 4379         metaslab_recalculate_weight_and_sort(msp);
 4380 
 4381         ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 4382         ASSERT0(range_tree_space(msp->ms_freeing));
 4383         ASSERT0(range_tree_space(msp->ms_freed));
 4384         ASSERT0(range_tree_space(msp->ms_checkpointing));
 4385         msp->ms_allocating_total -= msp->ms_allocated_this_txg;
 4386         msp->ms_allocated_this_txg = 0;
 4387         mutex_exit(&msp->ms_lock);
 4388 }
 4389 
 4390 void
 4391 metaslab_sync_reassess(metaslab_group_t *mg)
 4392 {
 4393         spa_t *spa = mg->mg_class->mc_spa;
 4394 
 4395         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 4396         metaslab_group_alloc_update(mg);
 4397         mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 4398 
 4399         /*
 4400          * Preload the next potential metaslabs but only on active
 4401          * metaslab groups. We can get into a state where the metaslab
 4402          * is no longer active since we dirty metaslabs as we remove a
 4403          * a device, thus potentially making the metaslab group eligible
 4404          * for preloading.
 4405          */
 4406         if (mg->mg_activation_count > 0) {
 4407                 metaslab_group_preload(mg);
 4408         }
 4409         spa_config_exit(spa, SCL_ALLOC, FTAG);
 4410 }
 4411 
 4412 /*
 4413  * When writing a ditto block (i.e. more than one DVA for a given BP) on
 4414  * the same vdev as an existing DVA of this BP, then try to allocate it
 4415  * on a different metaslab than existing DVAs (i.e. a unique metaslab).
 4416  */
 4417 static boolean_t
 4418 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
 4419 {
 4420         uint64_t dva_ms_id;
 4421 
 4422         if (DVA_GET_ASIZE(dva) == 0)
 4423                 return (B_TRUE);
 4424 
 4425         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
 4426                 return (B_TRUE);
 4427 
 4428         dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
 4429 
 4430         return (msp->ms_id != dva_ms_id);
 4431 }
 4432 
 4433 /*
 4434  * ==========================================================================
 4435  * Metaslab allocation tracing facility
 4436  * ==========================================================================
 4437  */
 4438 
 4439 /*
 4440  * Add an allocation trace element to the allocation tracing list.
 4441  */
 4442 static void
 4443 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
 4444     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
 4445     int allocator)
 4446 {
 4447         metaslab_alloc_trace_t *mat;
 4448 
 4449         if (!metaslab_trace_enabled)
 4450                 return;
 4451 
 4452         /*
 4453          * When the tracing list reaches its maximum we remove
 4454          * the second element in the list before adding a new one.
 4455          * By removing the second element we preserve the original
 4456          * entry as a clue to what allocations steps have already been
 4457          * performed.
 4458          */
 4459         if (zal->zal_size == metaslab_trace_max_entries) {
 4460                 metaslab_alloc_trace_t *mat_next;
 4461 #ifdef ZFS_DEBUG
 4462                 panic("too many entries in allocation list");
 4463 #endif
 4464                 METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
 4465                 zal->zal_size--;
 4466                 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
 4467                 list_remove(&zal->zal_list, mat_next);
 4468                 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
 4469         }
 4470 
 4471         mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
 4472         list_link_init(&mat->mat_list_node);
 4473         mat->mat_mg = mg;
 4474         mat->mat_msp = msp;
 4475         mat->mat_size = psize;
 4476         mat->mat_dva_id = dva_id;
 4477         mat->mat_offset = offset;
 4478         mat->mat_weight = 0;
 4479         mat->mat_allocator = allocator;
 4480 
 4481         if (msp != NULL)
 4482                 mat->mat_weight = msp->ms_weight;
 4483 
 4484         /*
 4485          * The list is part of the zio so locking is not required. Only
 4486          * a single thread will perform allocations for a given zio.
 4487          */
 4488         list_insert_tail(&zal->zal_list, mat);
 4489         zal->zal_size++;
 4490 
 4491         ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
 4492 }
 4493 
 4494 void
 4495 metaslab_trace_init(zio_alloc_list_t *zal)
 4496 {
 4497         list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
 4498             offsetof(metaslab_alloc_trace_t, mat_list_node));
 4499         zal->zal_size = 0;
 4500 }
 4501 
 4502 void
 4503 metaslab_trace_fini(zio_alloc_list_t *zal)
 4504 {
 4505         metaslab_alloc_trace_t *mat;
 4506 
 4507         while ((mat = list_remove_head(&zal->zal_list)) != NULL)
 4508                 kmem_cache_free(metaslab_alloc_trace_cache, mat);
 4509         list_destroy(&zal->zal_list);
 4510         zal->zal_size = 0;
 4511 }
 4512 
 4513 /*
 4514  * ==========================================================================
 4515  * Metaslab block operations
 4516  * ==========================================================================
 4517  */
 4518 
 4519 static void
 4520 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag,
 4521     int flags, int allocator)
 4522 {
 4523         if (!(flags & METASLAB_ASYNC_ALLOC) ||
 4524             (flags & METASLAB_DONT_THROTTLE))
 4525                 return;
 4526 
 4527         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 4528         if (!mg->mg_class->mc_alloc_throttle_enabled)
 4529                 return;
 4530 
 4531         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 4532         (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
 4533 }
 4534 
 4535 static void
 4536 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
 4537 {
 4538         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 4539         metaslab_class_allocator_t *mca =
 4540             &mg->mg_class->mc_allocator[allocator];
 4541         uint64_t max = mg->mg_max_alloc_queue_depth;
 4542         uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
 4543         while (cur < max) {
 4544                 if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
 4545                     cur, cur + 1) == cur) {
 4546                         atomic_inc_64(&mca->mca_alloc_max_slots);
 4547                         return;
 4548                 }
 4549                 cur = mga->mga_cur_max_alloc_queue_depth;
 4550         }
 4551 }
 4552 
 4553 void
 4554 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag,
 4555     int flags, int allocator, boolean_t io_complete)
 4556 {
 4557         if (!(flags & METASLAB_ASYNC_ALLOC) ||
 4558             (flags & METASLAB_DONT_THROTTLE))
 4559                 return;
 4560 
 4561         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 4562         if (!mg->mg_class->mc_alloc_throttle_enabled)
 4563                 return;
 4564 
 4565         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 4566         (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
 4567         if (io_complete)
 4568                 metaslab_group_increment_qdepth(mg, allocator);
 4569 }
 4570 
 4571 void
 4572 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag,
 4573     int allocator)
 4574 {
 4575 #ifdef ZFS_DEBUG
 4576         const dva_t *dva = bp->blk_dva;
 4577         int ndvas = BP_GET_NDVAS(bp);
 4578 
 4579         for (int d = 0; d < ndvas; d++) {
 4580                 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
 4581                 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 4582                 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 4583                 VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
 4584         }
 4585 #endif
 4586 }
 4587 
 4588 static uint64_t
 4589 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 4590 {
 4591         uint64_t start;
 4592         range_tree_t *rt = msp->ms_allocatable;
 4593         metaslab_class_t *mc = msp->ms_group->mg_class;
 4594 
 4595         ASSERT(MUTEX_HELD(&msp->ms_lock));
 4596         VERIFY(!msp->ms_condensing);
 4597         VERIFY0(msp->ms_disabled);
 4598 
 4599         start = mc->mc_ops->msop_alloc(msp, size);
 4600         if (start != -1ULL) {
 4601                 metaslab_group_t *mg = msp->ms_group;
 4602                 vdev_t *vd = mg->mg_vd;
 4603 
 4604                 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
 4605                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 4606                 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
 4607                 range_tree_remove(rt, start, size);
 4608                 range_tree_clear(msp->ms_trim, start, size);
 4609 
 4610                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 4611                         vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 4612 
 4613                 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
 4614                 msp->ms_allocating_total += size;
 4615 
 4616                 /* Track the last successful allocation */
 4617                 msp->ms_alloc_txg = txg;
 4618                 metaslab_verify_space(msp, txg);
 4619         }
 4620 
 4621         /*
 4622          * Now that we've attempted the allocation we need to update the
 4623          * metaslab's maximum block size since it may have changed.
 4624          */
 4625         msp->ms_max_size = metaslab_largest_allocatable(msp);
 4626         return (start);
 4627 }
 4628 
 4629 /*
 4630  * Find the metaslab with the highest weight that is less than what we've
 4631  * already tried.  In the common case, this means that we will examine each
 4632  * metaslab at most once. Note that concurrent callers could reorder metaslabs
 4633  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
 4634  * activated by another thread, and we fail to allocate from the metaslab we
 4635  * have selected, we may not try the newly-activated metaslab, and instead
 4636  * activate another metaslab.  This is not optimal, but generally does not cause
 4637  * any problems (a possible exception being if every metaslab is completely full
 4638  * except for the newly-activated metaslab which we fail to examine).
 4639  */
 4640 static metaslab_t *
 4641 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
 4642     dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
 4643     boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
 4644     boolean_t *was_active)
 4645 {
 4646         avl_index_t idx;
 4647         avl_tree_t *t = &mg->mg_metaslab_tree;
 4648         metaslab_t *msp = avl_find(t, search, &idx);
 4649         if (msp == NULL)
 4650                 msp = avl_nearest(t, idx, AVL_AFTER);
 4651 
 4652         uint_t tries = 0;
 4653         for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
 4654                 int i;
 4655 
 4656                 if (!try_hard && tries > zfs_metaslab_find_max_tries) {
 4657                         METASLABSTAT_BUMP(metaslabstat_too_many_tries);
 4658                         return (NULL);
 4659                 }
 4660                 tries++;
 4661 
 4662                 if (!metaslab_should_allocate(msp, asize, try_hard)) {
 4663                         metaslab_trace_add(zal, mg, msp, asize, d,
 4664                             TRACE_TOO_SMALL, allocator);
 4665                         continue;
 4666                 }
 4667 
 4668                 /*
 4669                  * If the selected metaslab is condensing or disabled,
 4670                  * skip it.
 4671                  */
 4672                 if (msp->ms_condensing || msp->ms_disabled > 0)
 4673                         continue;
 4674 
 4675                 *was_active = msp->ms_allocator != -1;
 4676                 /*
 4677                  * If we're activating as primary, this is our first allocation
 4678                  * from this disk, so we don't need to check how close we are.
 4679                  * If the metaslab under consideration was already active,
 4680                  * we're getting desperate enough to steal another allocator's
 4681                  * metaslab, so we still don't care about distances.
 4682                  */
 4683                 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
 4684                         break;
 4685 
 4686                 for (i = 0; i < d; i++) {
 4687                         if (want_unique &&
 4688                             !metaslab_is_unique(msp, &dva[i]))
 4689                                 break;  /* try another metaslab */
 4690                 }
 4691                 if (i == d)
 4692                         break;
 4693         }
 4694 
 4695         if (msp != NULL) {
 4696                 search->ms_weight = msp->ms_weight;
 4697                 search->ms_start = msp->ms_start + 1;
 4698                 search->ms_allocator = msp->ms_allocator;
 4699                 search->ms_primary = msp->ms_primary;
 4700         }
 4701         return (msp);
 4702 }
 4703 
 4704 static void
 4705 metaslab_active_mask_verify(metaslab_t *msp)
 4706 {
 4707         ASSERT(MUTEX_HELD(&msp->ms_lock));
 4708 
 4709         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 4710                 return;
 4711 
 4712         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
 4713                 return;
 4714 
 4715         if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
 4716                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 4717                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
 4718                 VERIFY3S(msp->ms_allocator, !=, -1);
 4719                 VERIFY(msp->ms_primary);
 4720                 return;
 4721         }
 4722 
 4723         if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
 4724                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 4725                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
 4726                 VERIFY3S(msp->ms_allocator, !=, -1);
 4727                 VERIFY(!msp->ms_primary);
 4728                 return;
 4729         }
 4730 
 4731         if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
 4732                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 4733                 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 4734                 VERIFY3S(msp->ms_allocator, ==, -1);
 4735                 return;
 4736         }
 4737 }
 4738 
 4739 static uint64_t
 4740 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
 4741     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
 4742     int allocator, boolean_t try_hard)
 4743 {
 4744         metaslab_t *msp = NULL;
 4745         uint64_t offset = -1ULL;
 4746 
 4747         uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
 4748         for (int i = 0; i < d; i++) {
 4749                 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 4750                     DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 4751                         activation_weight = METASLAB_WEIGHT_SECONDARY;
 4752                 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 4753                     DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 4754                         activation_weight = METASLAB_WEIGHT_CLAIM;
 4755                         break;
 4756                 }
 4757         }
 4758 
 4759         /*
 4760          * If we don't have enough metaslabs active to fill the entire array, we
 4761          * just use the 0th slot.
 4762          */
 4763         if (mg->mg_ms_ready < mg->mg_allocators * 3)
 4764                 allocator = 0;
 4765         metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 4766 
 4767         ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
 4768 
 4769         metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
 4770         search->ms_weight = UINT64_MAX;
 4771         search->ms_start = 0;
 4772         /*
 4773          * At the end of the metaslab tree are the already-active metaslabs,
 4774          * first the primaries, then the secondaries. When we resume searching
 4775          * through the tree, we need to consider ms_allocator and ms_primary so
 4776          * we start in the location right after where we left off, and don't
 4777          * accidentally loop forever considering the same metaslabs.
 4778          */
 4779         search->ms_allocator = -1;
 4780         search->ms_primary = B_TRUE;
 4781         for (;;) {
 4782                 boolean_t was_active = B_FALSE;
 4783 
 4784                 mutex_enter(&mg->mg_lock);
 4785 
 4786                 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 4787                     mga->mga_primary != NULL) {
 4788                         msp = mga->mga_primary;
 4789 
 4790                         /*
 4791                          * Even though we don't hold the ms_lock for the
 4792                          * primary metaslab, those fields should not
 4793                          * change while we hold the mg_lock. Thus it is
 4794                          * safe to make assertions on them.
 4795                          */
 4796                         ASSERT(msp->ms_primary);
 4797                         ASSERT3S(msp->ms_allocator, ==, allocator);
 4798                         ASSERT(msp->ms_loaded);
 4799 
 4800                         was_active = B_TRUE;
 4801                         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 4802                 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 4803                     mga->mga_secondary != NULL) {
 4804                         msp = mga->mga_secondary;
 4805 
 4806                         /*
 4807                          * See comment above about the similar assertions
 4808                          * for the primary metaslab.
 4809                          */
 4810                         ASSERT(!msp->ms_primary);
 4811                         ASSERT3S(msp->ms_allocator, ==, allocator);
 4812                         ASSERT(msp->ms_loaded);
 4813 
 4814                         was_active = B_TRUE;
 4815                         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 4816                 } else {
 4817                         msp = find_valid_metaslab(mg, activation_weight, dva, d,
 4818                             want_unique, asize, allocator, try_hard, zal,
 4819                             search, &was_active);
 4820                 }
 4821 
 4822                 mutex_exit(&mg->mg_lock);
 4823                 if (msp == NULL) {
 4824                         kmem_free(search, sizeof (*search));
 4825                         return (-1ULL);
 4826                 }
 4827                 mutex_enter(&msp->ms_lock);
 4828 
 4829                 metaslab_active_mask_verify(msp);
 4830 
 4831                 /*
 4832                  * This code is disabled out because of issues with
 4833                  * tracepoints in non-gpl kernel modules.
 4834                  */
 4835 #if 0
 4836                 DTRACE_PROBE3(ms__activation__attempt,
 4837                     metaslab_t *, msp, uint64_t, activation_weight,
 4838                     boolean_t, was_active);
 4839 #endif
 4840 
 4841                 /*
 4842                  * Ensure that the metaslab we have selected is still
 4843                  * capable of handling our request. It's possible that
 4844                  * another thread may have changed the weight while we
 4845                  * were blocked on the metaslab lock. We check the
 4846                  * active status first to see if we need to set_selected_txg
 4847                  * a new metaslab.
 4848                  */
 4849                 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
 4850                         ASSERT3S(msp->ms_allocator, ==, -1);
 4851                         mutex_exit(&msp->ms_lock);
 4852                         continue;
 4853                 }
 4854 
 4855                 /*
 4856                  * If the metaslab was activated for another allocator
 4857                  * while we were waiting in the ms_lock above, or it's
 4858                  * a primary and we're seeking a secondary (or vice versa),
 4859                  * we go back and select a new metaslab.
 4860                  */
 4861                 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 4862                     (msp->ms_allocator != -1) &&
 4863                     (msp->ms_allocator != allocator || ((activation_weight ==
 4864                     METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
 4865                         ASSERT(msp->ms_loaded);
 4866                         ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
 4867                             msp->ms_allocator != -1);
 4868                         mutex_exit(&msp->ms_lock);
 4869                         continue;
 4870                 }
 4871 
 4872                 /*
 4873                  * This metaslab was used for claiming regions allocated
 4874                  * by the ZIL during pool import. Once these regions are
 4875                  * claimed we don't need to keep the CLAIM bit set
 4876                  * anymore. Passivate this metaslab to zero its activation
 4877                  * mask.
 4878                  */
 4879                 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
 4880                     activation_weight != METASLAB_WEIGHT_CLAIM) {
 4881                         ASSERT(msp->ms_loaded);
 4882                         ASSERT3S(msp->ms_allocator, ==, -1);
 4883                         metaslab_passivate(msp, msp->ms_weight &
 4884                             ~METASLAB_WEIGHT_CLAIM);
 4885                         mutex_exit(&msp->ms_lock);
 4886                         continue;
 4887                 }
 4888 
 4889                 metaslab_set_selected_txg(msp, txg);
 4890 
 4891                 int activation_error =
 4892                     metaslab_activate(msp, allocator, activation_weight);
 4893                 metaslab_active_mask_verify(msp);
 4894 
 4895                 /*
 4896                  * If the metaslab was activated by another thread for
 4897                  * another allocator or activation_weight (EBUSY), or it
 4898                  * failed because another metaslab was assigned as primary
 4899                  * for this allocator (EEXIST) we continue using this
 4900                  * metaslab for our allocation, rather than going on to a
 4901                  * worse metaslab (we waited for that metaslab to be loaded
 4902                  * after all).
 4903                  *
 4904                  * If the activation failed due to an I/O error or ENOSPC we
 4905                  * skip to the next metaslab.
 4906                  */
 4907                 boolean_t activated;
 4908                 if (activation_error == 0) {
 4909                         activated = B_TRUE;
 4910                 } else if (activation_error == EBUSY ||
 4911                     activation_error == EEXIST) {
 4912                         activated = B_FALSE;
 4913                 } else {
 4914                         mutex_exit(&msp->ms_lock);
 4915                         continue;
 4916                 }
 4917                 ASSERT(msp->ms_loaded);
 4918 
 4919                 /*
 4920                  * Now that we have the lock, recheck to see if we should
 4921                  * continue to use this metaslab for this allocation. The
 4922                  * the metaslab is now loaded so metaslab_should_allocate()
 4923                  * can accurately determine if the allocation attempt should
 4924                  * proceed.
 4925                  */
 4926                 if (!metaslab_should_allocate(msp, asize, try_hard)) {
 4927                         /* Passivate this metaslab and select a new one. */
 4928                         metaslab_trace_add(zal, mg, msp, asize, d,
 4929                             TRACE_TOO_SMALL, allocator);
 4930                         goto next;
 4931                 }
 4932 
 4933                 /*
 4934                  * If this metaslab is currently condensing then pick again
 4935                  * as we can't manipulate this metaslab until it's committed
 4936                  * to disk. If this metaslab is being initialized, we shouldn't
 4937                  * allocate from it since the allocated region might be
 4938                  * overwritten after allocation.
 4939                  */
 4940                 if (msp->ms_condensing) {
 4941                         metaslab_trace_add(zal, mg, msp, asize, d,
 4942                             TRACE_CONDENSING, allocator);
 4943                         if (activated) {
 4944                                 metaslab_passivate(msp, msp->ms_weight &
 4945                                     ~METASLAB_ACTIVE_MASK);
 4946                         }
 4947                         mutex_exit(&msp->ms_lock);
 4948                         continue;
 4949                 } else if (msp->ms_disabled > 0) {
 4950                         metaslab_trace_add(zal, mg, msp, asize, d,
 4951                             TRACE_DISABLED, allocator);
 4952                         if (activated) {
 4953                                 metaslab_passivate(msp, msp->ms_weight &
 4954                                     ~METASLAB_ACTIVE_MASK);
 4955                         }
 4956                         mutex_exit(&msp->ms_lock);
 4957                         continue;
 4958                 }
 4959 
 4960                 offset = metaslab_block_alloc(msp, asize, txg);
 4961                 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
 4962 
 4963                 if (offset != -1ULL) {
 4964                         /* Proactively passivate the metaslab, if needed */
 4965                         if (activated)
 4966                                 metaslab_segment_may_passivate(msp);
 4967                         break;
 4968                 }
 4969 next:
 4970                 ASSERT(msp->ms_loaded);
 4971 
 4972                 /*
 4973                  * This code is disabled out because of issues with
 4974                  * tracepoints in non-gpl kernel modules.
 4975                  */
 4976 #if 0
 4977                 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
 4978                     uint64_t, asize);
 4979 #endif
 4980 
 4981                 /*
 4982                  * We were unable to allocate from this metaslab so determine
 4983                  * a new weight for this metaslab. Now that we have loaded
 4984                  * the metaslab we can provide a better hint to the metaslab
 4985                  * selector.
 4986                  *
 4987                  * For space-based metaslabs, we use the maximum block size.
 4988                  * This information is only available when the metaslab
 4989                  * is loaded and is more accurate than the generic free
 4990                  * space weight that was calculated by metaslab_weight().
 4991                  * This information allows us to quickly compare the maximum
 4992                  * available allocation in the metaslab to the allocation
 4993                  * size being requested.
 4994                  *
 4995                  * For segment-based metaslabs, determine the new weight
 4996                  * based on the highest bucket in the range tree. We
 4997                  * explicitly use the loaded segment weight (i.e. the range
 4998                  * tree histogram) since it contains the space that is
 4999                  * currently available for allocation and is accurate
 5000                  * even within a sync pass.
 5001                  */
 5002                 uint64_t weight;
 5003                 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
 5004                         weight = metaslab_largest_allocatable(msp);
 5005                         WEIGHT_SET_SPACEBASED(weight);
 5006                 } else {
 5007                         weight = metaslab_weight_from_range_tree(msp);
 5008                 }
 5009 
 5010                 if (activated) {
 5011                         metaslab_passivate(msp, weight);
 5012                 } else {
 5013                         /*
 5014                          * For the case where we use the metaslab that is
 5015                          * active for another allocator we want to make
 5016                          * sure that we retain the activation mask.
 5017                          *
 5018                          * Note that we could attempt to use something like
 5019                          * metaslab_recalculate_weight_and_sort() that
 5020                          * retains the activation mask here. That function
 5021                          * uses metaslab_weight() to set the weight though
 5022                          * which is not as accurate as the calculations
 5023                          * above.
 5024                          */
 5025                         weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
 5026                         metaslab_group_sort(mg, msp, weight);
 5027                 }
 5028                 metaslab_active_mask_verify(msp);
 5029 
 5030                 /*
 5031                  * We have just failed an allocation attempt, check
 5032                  * that metaslab_should_allocate() agrees. Otherwise,
 5033                  * we may end up in an infinite loop retrying the same
 5034                  * metaslab.
 5035                  */
 5036                 ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
 5037 
 5038                 mutex_exit(&msp->ms_lock);
 5039         }
 5040         mutex_exit(&msp->ms_lock);
 5041         kmem_free(search, sizeof (*search));
 5042         return (offset);
 5043 }
 5044 
 5045 static uint64_t
 5046 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
 5047     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
 5048     int allocator, boolean_t try_hard)
 5049 {
 5050         uint64_t offset;
 5051         ASSERT(mg->mg_initialized);
 5052 
 5053         offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
 5054             dva, d, allocator, try_hard);
 5055 
 5056         mutex_enter(&mg->mg_lock);
 5057         if (offset == -1ULL) {
 5058                 mg->mg_failed_allocations++;
 5059                 metaslab_trace_add(zal, mg, NULL, asize, d,
 5060                     TRACE_GROUP_FAILURE, allocator);
 5061                 if (asize == SPA_GANGBLOCKSIZE) {
 5062                         /*
 5063                          * This metaslab group was unable to allocate
 5064                          * the minimum gang block size so it must be out of
 5065                          * space. We must notify the allocation throttle
 5066                          * to start skipping allocation attempts to this
 5067                          * metaslab group until more space becomes available.
 5068                          * Note: this failure cannot be caused by the
 5069                          * allocation throttle since the allocation throttle
 5070                          * is only responsible for skipping devices and
 5071                          * not failing block allocations.
 5072                          */
 5073                         mg->mg_no_free_space = B_TRUE;
 5074                 }
 5075         }
 5076         mg->mg_allocations++;
 5077         mutex_exit(&mg->mg_lock);
 5078         return (offset);
 5079 }
 5080 
 5081 /*
 5082  * Allocate a block for the specified i/o.
 5083  */
 5084 int
 5085 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 5086     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
 5087     zio_alloc_list_t *zal, int allocator)
 5088 {
 5089         metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 5090         metaslab_group_t *mg, *fast_mg, *rotor;
 5091         vdev_t *vd;
 5092         boolean_t try_hard = B_FALSE;
 5093 
 5094         ASSERT(!DVA_IS_VALID(&dva[d]));
 5095 
 5096         /*
 5097          * For testing, make some blocks above a certain size be gang blocks.
 5098          * This will result in more split blocks when using device removal,
 5099          * and a large number of split blocks coupled with ztest-induced
 5100          * damage can result in extremely long reconstruction times.  This
 5101          * will also test spilling from special to normal.
 5102          */
 5103         if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) {
 5104                 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
 5105                     allocator);
 5106                 return (SET_ERROR(ENOSPC));
 5107         }
 5108 
 5109         /*
 5110          * Start at the rotor and loop through all mgs until we find something.
 5111          * Note that there's no locking on mca_rotor or mca_aliquot because
 5112          * nothing actually breaks if we miss a few updates -- we just won't
 5113          * allocate quite as evenly.  It all balances out over time.
 5114          *
 5115          * If we are doing ditto or log blocks, try to spread them across
 5116          * consecutive vdevs.  If we're forced to reuse a vdev before we've
 5117          * allocated all of our ditto blocks, then try and spread them out on
 5118          * that vdev as much as possible.  If it turns out to not be possible,
 5119          * gradually lower our standards until anything becomes acceptable.
 5120          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
 5121          * gives us hope of containing our fault domains to something we're
 5122          * able to reason about.  Otherwise, any two top-level vdev failures
 5123          * will guarantee the loss of data.  With consecutive allocation,
 5124          * only two adjacent top-level vdev failures will result in data loss.
 5125          *
 5126          * If we are doing gang blocks (hintdva is non-NULL), try to keep
 5127          * ourselves on the same vdev as our gang block header.  That
 5128          * way, we can hope for locality in vdev_cache, plus it makes our
 5129          * fault domains something tractable.
 5130          */
 5131         if (hintdva) {
 5132                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
 5133 
 5134                 /*
 5135                  * It's possible the vdev we're using as the hint no
 5136                  * longer exists or its mg has been closed (e.g. by
 5137                  * device removal).  Consult the rotor when
 5138                  * all else fails.
 5139                  */
 5140                 if (vd != NULL && vd->vdev_mg != NULL) {
 5141                         mg = vdev_get_mg(vd, mc);
 5142 
 5143                         if (flags & METASLAB_HINTBP_AVOID)
 5144                                 mg = mg->mg_next;
 5145                 } else {
 5146                         mg = mca->mca_rotor;
 5147                 }
 5148         } else if (d != 0) {
 5149                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 5150                 mg = vd->vdev_mg->mg_next;
 5151         } else if (flags & METASLAB_FASTWRITE) {
 5152                 mg = fast_mg = mca->mca_rotor;
 5153 
 5154                 do {
 5155                         if (fast_mg->mg_vd->vdev_pending_fastwrite <
 5156                             mg->mg_vd->vdev_pending_fastwrite)
 5157                                 mg = fast_mg;
 5158                 } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
 5159 
 5160         } else {
 5161                 ASSERT(mca->mca_rotor != NULL);
 5162                 mg = mca->mca_rotor;
 5163         }
 5164 
 5165         /*
 5166          * If the hint put us into the wrong metaslab class, or into a
 5167          * metaslab group that has been passivated, just follow the rotor.
 5168          */
 5169         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
 5170                 mg = mca->mca_rotor;
 5171 
 5172         rotor = mg;
 5173 top:
 5174         do {
 5175                 boolean_t allocatable;
 5176 
 5177                 ASSERT(mg->mg_activation_count == 1);
 5178                 vd = mg->mg_vd;
 5179 
 5180                 /*
 5181                  * Don't allocate from faulted devices.
 5182                  */
 5183                 if (try_hard) {
 5184                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
 5185                         allocatable = vdev_allocatable(vd);
 5186                         spa_config_exit(spa, SCL_ZIO, FTAG);
 5187                 } else {
 5188                         allocatable = vdev_allocatable(vd);
 5189                 }
 5190 
 5191                 /*
 5192                  * Determine if the selected metaslab group is eligible
 5193                  * for allocations. If we're ganging then don't allow
 5194                  * this metaslab group to skip allocations since that would
 5195                  * inadvertently return ENOSPC and suspend the pool
 5196                  * even though space is still available.
 5197                  */
 5198                 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
 5199                         allocatable = metaslab_group_allocatable(mg, rotor,
 5200                             flags, psize, allocator, d);
 5201                 }
 5202 
 5203                 if (!allocatable) {
 5204                         metaslab_trace_add(zal, mg, NULL, psize, d,
 5205                             TRACE_NOT_ALLOCATABLE, allocator);
 5206                         goto next;
 5207                 }
 5208 
 5209                 ASSERT(mg->mg_initialized);
 5210 
 5211                 /*
 5212                  * Avoid writing single-copy data to an unhealthy,
 5213                  * non-redundant vdev, unless we've already tried all
 5214                  * other vdevs.
 5215                  */
 5216                 if (vd->vdev_state < VDEV_STATE_HEALTHY &&
 5217                     d == 0 && !try_hard && vd->vdev_children == 0) {
 5218                         metaslab_trace_add(zal, mg, NULL, psize, d,
 5219                             TRACE_VDEV_ERROR, allocator);
 5220                         goto next;
 5221                 }
 5222 
 5223                 ASSERT(mg->mg_class == mc);
 5224 
 5225                 uint64_t asize = vdev_psize_to_asize(vd, psize);
 5226                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 5227 
 5228                 /*
 5229                  * If we don't need to try hard, then require that the
 5230                  * block be on a different metaslab from any other DVAs
 5231                  * in this BP (unique=true).  If we are trying hard, then
 5232                  * allow any metaslab to be used (unique=false).
 5233                  */
 5234                 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
 5235                     !try_hard, dva, d, allocator, try_hard);
 5236 
 5237                 if (offset != -1ULL) {
 5238                         /*
 5239                          * If we've just selected this metaslab group,
 5240                          * figure out whether the corresponding vdev is
 5241                          * over- or under-used relative to the pool,
 5242                          * and set an allocation bias to even it out.
 5243                          *
 5244                          * Bias is also used to compensate for unequally
 5245                          * sized vdevs so that space is allocated fairly.
 5246                          */
 5247                         if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
 5248                                 vdev_stat_t *vs = &vd->vdev_stat;
 5249                                 int64_t vs_free = vs->vs_space - vs->vs_alloc;
 5250                                 int64_t mc_free = mc->mc_space - mc->mc_alloc;
 5251                                 int64_t ratio;
 5252 
 5253                                 /*
 5254                                  * Calculate how much more or less we should
 5255                                  * try to allocate from this device during
 5256                                  * this iteration around the rotor.
 5257                                  *
 5258                                  * This basically introduces a zero-centered
 5259                                  * bias towards the devices with the most
 5260                                  * free space, while compensating for vdev
 5261                                  * size differences.
 5262                                  *
 5263                                  * Examples:
 5264                                  *  vdev V1 = 16M/128M
 5265                                  *  vdev V2 = 16M/128M
 5266                                  *  ratio(V1) = 100% ratio(V2) = 100%
 5267                                  *
 5268                                  *  vdev V1 = 16M/128M
 5269                                  *  vdev V2 = 64M/128M
 5270                                  *  ratio(V1) = 127% ratio(V2) =  72%
 5271                                  *
 5272                                  *  vdev V1 = 16M/128M
 5273                                  *  vdev V2 = 64M/512M
 5274                                  *  ratio(V1) =  40% ratio(V2) = 160%
 5275                                  */
 5276                                 ratio = (vs_free * mc->mc_alloc_groups * 100) /
 5277                                     (mc_free + 1);
 5278                                 mg->mg_bias = ((ratio - 100) *
 5279                                     (int64_t)mg->mg_aliquot) / 100;
 5280                         } else if (!metaslab_bias_enabled) {
 5281                                 mg->mg_bias = 0;
 5282                         }
 5283 
 5284                         if ((flags & METASLAB_FASTWRITE) ||
 5285                             atomic_add_64_nv(&mca->mca_aliquot, asize) >=
 5286                             mg->mg_aliquot + mg->mg_bias) {
 5287                                 mca->mca_rotor = mg->mg_next;
 5288                                 mca->mca_aliquot = 0;
 5289                         }
 5290 
 5291                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
 5292                         DVA_SET_OFFSET(&dva[d], offset);
 5293                         DVA_SET_GANG(&dva[d],
 5294                             ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
 5295                         DVA_SET_ASIZE(&dva[d], asize);
 5296 
 5297                         if (flags & METASLAB_FASTWRITE) {
 5298                                 atomic_add_64(&vd->vdev_pending_fastwrite,
 5299                                     psize);
 5300                         }
 5301 
 5302                         return (0);
 5303                 }
 5304 next:
 5305                 mca->mca_rotor = mg->mg_next;
 5306                 mca->mca_aliquot = 0;
 5307         } while ((mg = mg->mg_next) != rotor);
 5308 
 5309         /*
 5310          * If we haven't tried hard, perhaps do so now.
 5311          */
 5312         if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
 5313             GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
 5314             psize <= 1 << spa->spa_min_ashift)) {
 5315                 METASLABSTAT_BUMP(metaslabstat_try_hard);
 5316                 try_hard = B_TRUE;
 5317                 goto top;
 5318         }
 5319 
 5320         memset(&dva[d], 0, sizeof (dva_t));
 5321 
 5322         metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
 5323         return (SET_ERROR(ENOSPC));
 5324 }
 5325 
 5326 void
 5327 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
 5328     boolean_t checkpoint)
 5329 {
 5330         metaslab_t *msp;
 5331         spa_t *spa = vd->vdev_spa;
 5332 
 5333         ASSERT(vdev_is_concrete(vd));
 5334         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 5335         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 5336 
 5337         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 5338 
 5339         VERIFY(!msp->ms_condensing);
 5340         VERIFY3U(offset, >=, msp->ms_start);
 5341         VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
 5342         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 5343         VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
 5344 
 5345         metaslab_check_free_impl(vd, offset, asize);
 5346 
 5347         mutex_enter(&msp->ms_lock);
 5348         if (range_tree_is_empty(msp->ms_freeing) &&
 5349             range_tree_is_empty(msp->ms_checkpointing)) {
 5350                 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
 5351         }
 5352 
 5353         if (checkpoint) {
 5354                 ASSERT(spa_has_checkpoint(spa));
 5355                 range_tree_add(msp->ms_checkpointing, offset, asize);
 5356         } else {
 5357                 range_tree_add(msp->ms_freeing, offset, asize);
 5358         }
 5359         mutex_exit(&msp->ms_lock);
 5360 }
 5361 
 5362 void
 5363 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
 5364     uint64_t size, void *arg)
 5365 {
 5366         (void) inner_offset;
 5367         boolean_t *checkpoint = arg;
 5368 
 5369         ASSERT3P(checkpoint, !=, NULL);
 5370 
 5371         if (vd->vdev_ops->vdev_op_remap != NULL)
 5372                 vdev_indirect_mark_obsolete(vd, offset, size);
 5373         else
 5374                 metaslab_free_impl(vd, offset, size, *checkpoint);
 5375 }
 5376 
 5377 static void
 5378 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
 5379     boolean_t checkpoint)
 5380 {
 5381         spa_t *spa = vd->vdev_spa;
 5382 
 5383         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 5384 
 5385         if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
 5386                 return;
 5387 
 5388         if (spa->spa_vdev_removal != NULL &&
 5389             spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
 5390             vdev_is_concrete(vd)) {
 5391                 /*
 5392                  * Note: we check if the vdev is concrete because when
 5393                  * we complete the removal, we first change the vdev to be
 5394                  * an indirect vdev (in open context), and then (in syncing
 5395                  * context) clear spa_vdev_removal.
 5396                  */
 5397                 free_from_removing_vdev(vd, offset, size);
 5398         } else if (vd->vdev_ops->vdev_op_remap != NULL) {
 5399                 vdev_indirect_mark_obsolete(vd, offset, size);
 5400                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
 5401                     metaslab_free_impl_cb, &checkpoint);
 5402         } else {
 5403                 metaslab_free_concrete(vd, offset, size, checkpoint);
 5404         }
 5405 }
 5406 
 5407 typedef struct remap_blkptr_cb_arg {
 5408         blkptr_t *rbca_bp;
 5409         spa_remap_cb_t rbca_cb;
 5410         vdev_t *rbca_remap_vd;
 5411         uint64_t rbca_remap_offset;
 5412         void *rbca_cb_arg;
 5413 } remap_blkptr_cb_arg_t;
 5414 
 5415 static void
 5416 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
 5417     uint64_t size, void *arg)
 5418 {
 5419         remap_blkptr_cb_arg_t *rbca = arg;
 5420         blkptr_t *bp = rbca->rbca_bp;
 5421 
 5422         /* We can not remap split blocks. */
 5423         if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
 5424                 return;
 5425         ASSERT0(inner_offset);
 5426 
 5427         if (rbca->rbca_cb != NULL) {
 5428                 /*
 5429                  * At this point we know that we are not handling split
 5430                  * blocks and we invoke the callback on the previous
 5431                  * vdev which must be indirect.
 5432                  */
 5433                 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
 5434 
 5435                 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
 5436                     rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
 5437 
 5438                 /* set up remap_blkptr_cb_arg for the next call */
 5439                 rbca->rbca_remap_vd = vd;
 5440                 rbca->rbca_remap_offset = offset;
 5441         }
 5442 
 5443         /*
 5444          * The phys birth time is that of dva[0].  This ensures that we know
 5445          * when each dva was written, so that resilver can determine which
 5446          * blocks need to be scrubbed (i.e. those written during the time
 5447          * the vdev was offline).  It also ensures that the key used in
 5448          * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
 5449          * we didn't change the phys_birth, a lookup in the ARC for a
 5450          * remapped BP could find the data that was previously stored at
 5451          * this vdev + offset.
 5452          */
 5453         vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
 5454             DVA_GET_VDEV(&bp->blk_dva[0]));
 5455         vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
 5456         bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
 5457             DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
 5458 
 5459         DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
 5460         DVA_SET_OFFSET(&bp->blk_dva[0], offset);
 5461 }
 5462 
 5463 /*
 5464  * If the block pointer contains any indirect DVAs, modify them to refer to
 5465  * concrete DVAs.  Note that this will sometimes not be possible, leaving
 5466  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
 5467  * segments in the mapping (i.e. it is a "split block").
 5468  *
 5469  * If the BP was remapped, calls the callback on the original dva (note the
 5470  * callback can be called multiple times if the original indirect DVA refers
 5471  * to another indirect DVA, etc).
 5472  *
 5473  * Returns TRUE if the BP was remapped.
 5474  */
 5475 boolean_t
 5476 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
 5477 {
 5478         remap_blkptr_cb_arg_t rbca;
 5479 
 5480         if (!zfs_remap_blkptr_enable)
 5481                 return (B_FALSE);
 5482 
 5483         if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
 5484                 return (B_FALSE);
 5485 
 5486         /*
 5487          * Dedup BP's can not be remapped, because ddt_phys_select() depends
 5488          * on DVA[0] being the same in the BP as in the DDT (dedup table).
 5489          */
 5490         if (BP_GET_DEDUP(bp))
 5491                 return (B_FALSE);
 5492 
 5493         /*
 5494          * Gang blocks can not be remapped, because
 5495          * zio_checksum_gang_verifier() depends on the DVA[0] that's in
 5496          * the BP used to read the gang block header (GBH) being the same
 5497          * as the DVA[0] that we allocated for the GBH.
 5498          */
 5499         if (BP_IS_GANG(bp))
 5500                 return (B_FALSE);
 5501 
 5502         /*
 5503          * Embedded BP's have no DVA to remap.
 5504          */
 5505         if (BP_GET_NDVAS(bp) < 1)
 5506                 return (B_FALSE);
 5507 
 5508         /*
 5509          * Note: we only remap dva[0].  If we remapped other dvas, we
 5510          * would no longer know what their phys birth txg is.
 5511          */
 5512         dva_t *dva = &bp->blk_dva[0];
 5513 
 5514         uint64_t offset = DVA_GET_OFFSET(dva);
 5515         uint64_t size = DVA_GET_ASIZE(dva);
 5516         vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 5517 
 5518         if (vd->vdev_ops->vdev_op_remap == NULL)
 5519                 return (B_FALSE);
 5520 
 5521         rbca.rbca_bp = bp;
 5522         rbca.rbca_cb = callback;
 5523         rbca.rbca_remap_vd = vd;
 5524         rbca.rbca_remap_offset = offset;
 5525         rbca.rbca_cb_arg = arg;
 5526 
 5527         /*
 5528          * remap_blkptr_cb() will be called in order for each level of
 5529          * indirection, until a concrete vdev is reached or a split block is
 5530          * encountered. old_vd and old_offset are updated within the callback
 5531          * as we go from the one indirect vdev to the next one (either concrete
 5532          * or indirect again) in that order.
 5533          */
 5534         vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
 5535 
 5536         /* Check if the DVA wasn't remapped because it is a split block */
 5537         if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
 5538                 return (B_FALSE);
 5539 
 5540         return (B_TRUE);
 5541 }
 5542 
 5543 /*
 5544  * Undo the allocation of a DVA which happened in the given transaction group.
 5545  */
 5546 void
 5547 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 5548 {
 5549         metaslab_t *msp;
 5550         vdev_t *vd;
 5551         uint64_t vdev = DVA_GET_VDEV(dva);
 5552         uint64_t offset = DVA_GET_OFFSET(dva);
 5553         uint64_t size = DVA_GET_ASIZE(dva);
 5554 
 5555         ASSERT(DVA_IS_VALID(dva));
 5556         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 5557 
 5558         if (txg > spa_freeze_txg(spa))
 5559                 return;
 5560 
 5561         if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
 5562             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
 5563                 zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
 5564                     (u_longlong_t)vdev, (u_longlong_t)offset,
 5565                     (u_longlong_t)size);
 5566                 return;
 5567         }
 5568 
 5569         ASSERT(!vd->vdev_removing);
 5570         ASSERT(vdev_is_concrete(vd));
 5571         ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 5572         ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
 5573 
 5574         if (DVA_GET_GANG(dva))
 5575                 size = vdev_gang_header_asize(vd);
 5576 
 5577         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 5578 
 5579         mutex_enter(&msp->ms_lock);
 5580         range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
 5581             offset, size);
 5582         msp->ms_allocating_total -= size;
 5583 
 5584         VERIFY(!msp->ms_condensing);
 5585         VERIFY3U(offset, >=, msp->ms_start);
 5586         VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
 5587         VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
 5588             msp->ms_size);
 5589         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 5590         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 5591         range_tree_add(msp->ms_allocatable, offset, size);
 5592         mutex_exit(&msp->ms_lock);
 5593 }
 5594 
 5595 /*
 5596  * Free the block represented by the given DVA.
 5597  */
 5598 void
 5599 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
 5600 {
 5601         uint64_t vdev = DVA_GET_VDEV(dva);
 5602         uint64_t offset = DVA_GET_OFFSET(dva);
 5603         uint64_t size = DVA_GET_ASIZE(dva);
 5604         vdev_t *vd = vdev_lookup_top(spa, vdev);
 5605 
 5606         ASSERT(DVA_IS_VALID(dva));
 5607         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 5608 
 5609         if (DVA_GET_GANG(dva)) {
 5610                 size = vdev_gang_header_asize(vd);
 5611         }
 5612 
 5613         metaslab_free_impl(vd, offset, size, checkpoint);
 5614 }
 5615 
 5616 /*
 5617  * Reserve some allocation slots. The reservation system must be called
 5618  * before we call into the allocator. If there aren't any available slots
 5619  * then the I/O will be throttled until an I/O completes and its slots are
 5620  * freed up. The function returns true if it was successful in placing
 5621  * the reservation.
 5622  */
 5623 boolean_t
 5624 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
 5625     zio_t *zio, int flags)
 5626 {
 5627         metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 5628         uint64_t max = mca->mca_alloc_max_slots;
 5629 
 5630         ASSERT(mc->mc_alloc_throttle_enabled);
 5631         if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
 5632             zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
 5633                 /*
 5634                  * The potential race between _count() and _add() is covered
 5635                  * by the allocator lock in most cases, or irrelevant due to
 5636                  * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others.
 5637                  * But even if we assume some other non-existing scenario, the
 5638                  * worst that can happen is few more I/Os get to allocation
 5639                  * earlier, that is not a problem.
 5640                  *
 5641                  * We reserve the slots individually so that we can unreserve
 5642                  * them individually when an I/O completes.
 5643                  */
 5644                 for (int d = 0; d < slots; d++)
 5645                         zfs_refcount_add(&mca->mca_alloc_slots, zio);
 5646                 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
 5647                 return (B_TRUE);
 5648         }
 5649         return (B_FALSE);
 5650 }
 5651 
 5652 void
 5653 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
 5654     int allocator, zio_t *zio)
 5655 {
 5656         metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 5657 
 5658         ASSERT(mc->mc_alloc_throttle_enabled);
 5659         for (int d = 0; d < slots; d++)
 5660                 zfs_refcount_remove(&mca->mca_alloc_slots, zio);
 5661 }
 5662 
 5663 static int
 5664 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
 5665     uint64_t txg)
 5666 {
 5667         metaslab_t *msp;
 5668         spa_t *spa = vd->vdev_spa;
 5669         int error = 0;
 5670 
 5671         if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
 5672                 return (SET_ERROR(ENXIO));
 5673 
 5674         ASSERT3P(vd->vdev_ms, !=, NULL);
 5675         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 5676 
 5677         mutex_enter(&msp->ms_lock);
 5678 
 5679         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
 5680                 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
 5681                 if (error == EBUSY) {
 5682                         ASSERT(msp->ms_loaded);
 5683                         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 5684                         error = 0;
 5685                 }
 5686         }
 5687 
 5688         if (error == 0 &&
 5689             !range_tree_contains(msp->ms_allocatable, offset, size))
 5690                 error = SET_ERROR(ENOENT);
 5691 
 5692         if (error || txg == 0) {        /* txg == 0 indicates dry run */
 5693                 mutex_exit(&msp->ms_lock);
 5694                 return (error);
 5695         }
 5696 
 5697         VERIFY(!msp->ms_condensing);
 5698         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 5699         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 5700         VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
 5701             msp->ms_size);
 5702         range_tree_remove(msp->ms_allocatable, offset, size);
 5703         range_tree_clear(msp->ms_trim, offset, size);
 5704 
 5705         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(8) */
 5706                 metaslab_class_t *mc = msp->ms_group->mg_class;
 5707                 multilist_sublist_t *mls =
 5708                     multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 5709                 if (!multilist_link_active(&msp->ms_class_txg_node)) {
 5710                         msp->ms_selected_txg = txg;
 5711                         multilist_sublist_insert_head(mls, msp);
 5712                 }
 5713                 multilist_sublist_unlock(mls);
 5714 
 5715                 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 5716                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
 5717                 range_tree_add(msp->ms_allocating[txg & TXG_MASK],
 5718                     offset, size);
 5719                 msp->ms_allocating_total += size;
 5720         }
 5721 
 5722         mutex_exit(&msp->ms_lock);
 5723 
 5724         return (0);
 5725 }
 5726 
 5727 typedef struct metaslab_claim_cb_arg_t {
 5728         uint64_t        mcca_txg;
 5729         int             mcca_error;
 5730 } metaslab_claim_cb_arg_t;
 5731 
 5732 static void
 5733 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
 5734     uint64_t size, void *arg)
 5735 {
 5736         (void) inner_offset;
 5737         metaslab_claim_cb_arg_t *mcca_arg = arg;
 5738 
 5739         if (mcca_arg->mcca_error == 0) {
 5740                 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
 5741                     size, mcca_arg->mcca_txg);
 5742         }
 5743 }
 5744 
 5745 int
 5746 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
 5747 {
 5748         if (vd->vdev_ops->vdev_op_remap != NULL) {
 5749                 metaslab_claim_cb_arg_t arg;
 5750 
 5751                 /*
 5752                  * Only zdb(8) can claim on indirect vdevs.  This is used
 5753                  * to detect leaks of mapped space (that are not accounted
 5754                  * for in the obsolete counts, spacemap, or bpobj).
 5755                  */
 5756                 ASSERT(!spa_writeable(vd->vdev_spa));
 5757                 arg.mcca_error = 0;
 5758                 arg.mcca_txg = txg;
 5759 
 5760                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
 5761                     metaslab_claim_impl_cb, &arg);
 5762 
 5763                 if (arg.mcca_error == 0) {
 5764                         arg.mcca_error = metaslab_claim_concrete(vd,
 5765                             offset, size, txg);
 5766                 }
 5767                 return (arg.mcca_error);
 5768         } else {
 5769                 return (metaslab_claim_concrete(vd, offset, size, txg));
 5770         }
 5771 }
 5772 
 5773 /*
 5774  * Intent log support: upon opening the pool after a crash, notify the SPA
 5775  * of blocks that the intent log has allocated for immediate write, but
 5776  * which are still considered free by the SPA because the last transaction
 5777  * group didn't commit yet.
 5778  */
 5779 static int
 5780 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 5781 {
 5782         uint64_t vdev = DVA_GET_VDEV(dva);
 5783         uint64_t offset = DVA_GET_OFFSET(dva);
 5784         uint64_t size = DVA_GET_ASIZE(dva);
 5785         vdev_t *vd;
 5786 
 5787         if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
 5788                 return (SET_ERROR(ENXIO));
 5789         }
 5790 
 5791         ASSERT(DVA_IS_VALID(dva));
 5792 
 5793         if (DVA_GET_GANG(dva))
 5794                 size = vdev_gang_header_asize(vd);
 5795 
 5796         return (metaslab_claim_impl(vd, offset, size, txg));
 5797 }
 5798 
 5799 int
 5800 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
 5801     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
 5802     zio_alloc_list_t *zal, zio_t *zio, int allocator)
 5803 {
 5804         dva_t *dva = bp->blk_dva;
 5805         dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
 5806         int error = 0;
 5807 
 5808         ASSERT(bp->blk_birth == 0);
 5809         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
 5810 
 5811         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 5812 
 5813         if (mc->mc_allocator[allocator].mca_rotor == NULL) {
 5814                 /* no vdevs in this class */
 5815                 spa_config_exit(spa, SCL_ALLOC, FTAG);
 5816                 return (SET_ERROR(ENOSPC));
 5817         }
 5818 
 5819         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
 5820         ASSERT(BP_GET_NDVAS(bp) == 0);
 5821         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 5822         ASSERT3P(zal, !=, NULL);
 5823 
 5824         for (int d = 0; d < ndvas; d++) {
 5825                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
 5826                     txg, flags, zal, allocator);
 5827                 if (error != 0) {
 5828                         for (d--; d >= 0; d--) {
 5829                                 metaslab_unalloc_dva(spa, &dva[d], txg);
 5830                                 metaslab_group_alloc_decrement(spa,
 5831                                     DVA_GET_VDEV(&dva[d]), zio, flags,
 5832                                     allocator, B_FALSE);
 5833                                 memset(&dva[d], 0, sizeof (dva_t));
 5834                         }
 5835                         spa_config_exit(spa, SCL_ALLOC, FTAG);
 5836                         return (error);
 5837                 } else {
 5838                         /*
 5839                          * Update the metaslab group's queue depth
 5840                          * based on the newly allocated dva.
 5841                          */
 5842                         metaslab_group_alloc_increment(spa,
 5843                             DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
 5844                 }
 5845         }
 5846         ASSERT(error == 0);
 5847         ASSERT(BP_GET_NDVAS(bp) == ndvas);
 5848 
 5849         spa_config_exit(spa, SCL_ALLOC, FTAG);
 5850 
 5851         BP_SET_BIRTH(bp, txg, 0);
 5852 
 5853         return (0);
 5854 }
 5855 
 5856 void
 5857 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 5858 {
 5859         const dva_t *dva = bp->blk_dva;
 5860         int ndvas = BP_GET_NDVAS(bp);
 5861 
 5862         ASSERT(!BP_IS_HOLE(bp));
 5863         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
 5864 
 5865         /*
 5866          * If we have a checkpoint for the pool we need to make sure that
 5867          * the blocks that we free that are part of the checkpoint won't be
 5868          * reused until the checkpoint is discarded or we revert to it.
 5869          *
 5870          * The checkpoint flag is passed down the metaslab_free code path
 5871          * and is set whenever we want to add a block to the checkpoint's
 5872          * accounting. That is, we "checkpoint" blocks that existed at the
 5873          * time the checkpoint was created and are therefore referenced by
 5874          * the checkpointed uberblock.
 5875          *
 5876          * Note that, we don't checkpoint any blocks if the current
 5877          * syncing txg <= spa_checkpoint_txg. We want these frees to sync
 5878          * normally as they will be referenced by the checkpointed uberblock.
 5879          */
 5880         boolean_t checkpoint = B_FALSE;
 5881         if (bp->blk_birth <= spa->spa_checkpoint_txg &&
 5882             spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
 5883                 /*
 5884                  * At this point, if the block is part of the checkpoint
 5885                  * there is no way it was created in the current txg.
 5886                  */
 5887                 ASSERT(!now);
 5888                 ASSERT3U(spa_syncing_txg(spa), ==, txg);
 5889                 checkpoint = B_TRUE;
 5890         }
 5891 
 5892         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 5893 
 5894         for (int d = 0; d < ndvas; d++) {
 5895                 if (now) {
 5896                         metaslab_unalloc_dva(spa, &dva[d], txg);
 5897                 } else {
 5898                         ASSERT3U(txg, ==, spa_syncing_txg(spa));
 5899                         metaslab_free_dva(spa, &dva[d], checkpoint);
 5900                 }
 5901         }
 5902 
 5903         spa_config_exit(spa, SCL_FREE, FTAG);
 5904 }
 5905 
 5906 int
 5907 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 5908 {
 5909         const dva_t *dva = bp->blk_dva;
 5910         int ndvas = BP_GET_NDVAS(bp);
 5911         int error = 0;
 5912 
 5913         ASSERT(!BP_IS_HOLE(bp));
 5914 
 5915         if (txg != 0) {
 5916                 /*
 5917                  * First do a dry run to make sure all DVAs are claimable,
 5918                  * so we don't have to unwind from partial failures below.
 5919                  */
 5920                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
 5921                         return (error);
 5922         }
 5923 
 5924         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 5925 
 5926         for (int d = 0; d < ndvas; d++) {
 5927                 error = metaslab_claim_dva(spa, &dva[d], txg);
 5928                 if (error != 0)
 5929                         break;
 5930         }
 5931 
 5932         spa_config_exit(spa, SCL_ALLOC, FTAG);
 5933 
 5934         ASSERT(error == 0 || txg == 0);
 5935 
 5936         return (error);
 5937 }
 5938 
 5939 void
 5940 metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
 5941 {
 5942         const dva_t *dva = bp->blk_dva;
 5943         int ndvas = BP_GET_NDVAS(bp);
 5944         uint64_t psize = BP_GET_PSIZE(bp);
 5945         int d;
 5946         vdev_t *vd;
 5947 
 5948         ASSERT(!BP_IS_HOLE(bp));
 5949         ASSERT(!BP_IS_EMBEDDED(bp));
 5950         ASSERT(psize > 0);
 5951 
 5952         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 5953 
 5954         for (d = 0; d < ndvas; d++) {
 5955                 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
 5956                         continue;
 5957                 atomic_add_64(&vd->vdev_pending_fastwrite, psize);
 5958         }
 5959 
 5960         spa_config_exit(spa, SCL_VDEV, FTAG);
 5961 }
 5962 
 5963 void
 5964 metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
 5965 {
 5966         const dva_t *dva = bp->blk_dva;
 5967         int ndvas = BP_GET_NDVAS(bp);
 5968         uint64_t psize = BP_GET_PSIZE(bp);
 5969         int d;
 5970         vdev_t *vd;
 5971 
 5972         ASSERT(!BP_IS_HOLE(bp));
 5973         ASSERT(!BP_IS_EMBEDDED(bp));
 5974         ASSERT(psize > 0);
 5975 
 5976         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 5977 
 5978         for (d = 0; d < ndvas; d++) {
 5979                 if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
 5980                         continue;
 5981                 ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
 5982                 atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
 5983         }
 5984 
 5985         spa_config_exit(spa, SCL_VDEV, FTAG);
 5986 }
 5987 
 5988 static void
 5989 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
 5990     uint64_t size, void *arg)
 5991 {
 5992         (void) inner, (void) arg;
 5993 
 5994         if (vd->vdev_ops == &vdev_indirect_ops)
 5995                 return;
 5996 
 5997         metaslab_check_free_impl(vd, offset, size);
 5998 }
 5999 
 6000 static void
 6001 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
 6002 {
 6003         metaslab_t *msp;
 6004         spa_t *spa __maybe_unused = vd->vdev_spa;
 6005 
 6006         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 6007                 return;
 6008 
 6009         if (vd->vdev_ops->vdev_op_remap != NULL) {
 6010                 vd->vdev_ops->vdev_op_remap(vd, offset, size,
 6011                     metaslab_check_free_impl_cb, NULL);
 6012                 return;
 6013         }
 6014 
 6015         ASSERT(vdev_is_concrete(vd));
 6016         ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 6017         ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 6018 
 6019         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 6020 
 6021         mutex_enter(&msp->ms_lock);
 6022         if (msp->ms_loaded) {
 6023                 range_tree_verify_not_present(msp->ms_allocatable,
 6024                     offset, size);
 6025         }
 6026 
 6027         /*
 6028          * Check all segments that currently exist in the freeing pipeline.
 6029          *
 6030          * It would intuitively make sense to also check the current allocating
 6031          * tree since metaslab_unalloc_dva() exists for extents that are
 6032          * allocated and freed in the same sync pass within the same txg.
 6033          * Unfortunately there are places (e.g. the ZIL) where we allocate a
 6034          * segment but then we free part of it within the same txg
 6035          * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
 6036          * current allocating tree.
 6037          */
 6038         range_tree_verify_not_present(msp->ms_freeing, offset, size);
 6039         range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
 6040         range_tree_verify_not_present(msp->ms_freed, offset, size);
 6041         for (int j = 0; j < TXG_DEFER_SIZE; j++)
 6042                 range_tree_verify_not_present(msp->ms_defer[j], offset, size);
 6043         range_tree_verify_not_present(msp->ms_trim, offset, size);
 6044         mutex_exit(&msp->ms_lock);
 6045 }
 6046 
 6047 void
 6048 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
 6049 {
 6050         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 6051                 return;
 6052 
 6053         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 6054         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 6055                 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 6056                 vdev_t *vd = vdev_lookup_top(spa, vdev);
 6057                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 6058                 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
 6059 
 6060                 if (DVA_GET_GANG(&bp->blk_dva[i]))
 6061                         size = vdev_gang_header_asize(vd);
 6062 
 6063                 ASSERT3P(vd, !=, NULL);
 6064 
 6065                 metaslab_check_free_impl(vd, offset, size);
 6066         }
 6067         spa_config_exit(spa, SCL_VDEV, FTAG);
 6068 }
 6069 
 6070 static void
 6071 metaslab_group_disable_wait(metaslab_group_t *mg)
 6072 {
 6073         ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
 6074         while (mg->mg_disabled_updating) {
 6075                 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
 6076         }
 6077 }
 6078 
 6079 static void
 6080 metaslab_group_disabled_increment(metaslab_group_t *mg)
 6081 {
 6082         ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
 6083         ASSERT(mg->mg_disabled_updating);
 6084 
 6085         while (mg->mg_ms_disabled >= max_disabled_ms) {
 6086                 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
 6087         }
 6088         mg->mg_ms_disabled++;
 6089         ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
 6090 }
 6091 
 6092 /*
 6093  * Mark the metaslab as disabled to prevent any allocations on this metaslab.
 6094  * We must also track how many metaslabs are currently disabled within a
 6095  * metaslab group and limit them to prevent allocation failures from
 6096  * occurring because all metaslabs are disabled.
 6097  */
 6098 void
 6099 metaslab_disable(metaslab_t *msp)
 6100 {
 6101         ASSERT(!MUTEX_HELD(&msp->ms_lock));
 6102         metaslab_group_t *mg = msp->ms_group;
 6103 
 6104         mutex_enter(&mg->mg_ms_disabled_lock);
 6105 
 6106         /*
 6107          * To keep an accurate count of how many threads have disabled
 6108          * a specific metaslab group, we only allow one thread to mark
 6109          * the metaslab group at a time. This ensures that the value of
 6110          * ms_disabled will be accurate when we decide to mark a metaslab
 6111          * group as disabled. To do this we force all other threads
 6112          * to wait till the metaslab's mg_disabled_updating flag is no
 6113          * longer set.
 6114          */
 6115         metaslab_group_disable_wait(mg);
 6116         mg->mg_disabled_updating = B_TRUE;
 6117         if (msp->ms_disabled == 0) {
 6118                 metaslab_group_disabled_increment(mg);
 6119         }
 6120         mutex_enter(&msp->ms_lock);
 6121         msp->ms_disabled++;
 6122         mutex_exit(&msp->ms_lock);
 6123 
 6124         mg->mg_disabled_updating = B_FALSE;
 6125         cv_broadcast(&mg->mg_ms_disabled_cv);
 6126         mutex_exit(&mg->mg_ms_disabled_lock);
 6127 }
 6128 
 6129 void
 6130 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
 6131 {
 6132         metaslab_group_t *mg = msp->ms_group;
 6133         spa_t *spa = mg->mg_vd->vdev_spa;
 6134 
 6135         /*
 6136          * Wait for the outstanding IO to be synced to prevent newly
 6137          * allocated blocks from being overwritten.  This used by
 6138          * initialize and TRIM which are modifying unallocated space.
 6139          */
 6140         if (sync)
 6141                 txg_wait_synced(spa_get_dsl(spa), 0);
 6142 
 6143         mutex_enter(&mg->mg_ms_disabled_lock);
 6144         mutex_enter(&msp->ms_lock);
 6145         if (--msp->ms_disabled == 0) {
 6146                 mg->mg_ms_disabled--;
 6147                 cv_broadcast(&mg->mg_ms_disabled_cv);
 6148                 if (unload)
 6149                         metaslab_unload(msp);
 6150         }
 6151         mutex_exit(&msp->ms_lock);
 6152         mutex_exit(&mg->mg_ms_disabled_lock);
 6153 }
 6154 
 6155 void
 6156 metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
 6157 {
 6158         ms->ms_unflushed_dirty = dirty;
 6159 }
 6160 
 6161 static void
 6162 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
 6163 {
 6164         vdev_t *vd = ms->ms_group->mg_vd;
 6165         spa_t *spa = vd->vdev_spa;
 6166         objset_t *mos = spa_meta_objset(spa);
 6167 
 6168         ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 6169 
 6170         metaslab_unflushed_phys_t entry = {
 6171                 .msp_unflushed_txg = metaslab_unflushed_txg(ms),
 6172         };
 6173         uint64_t entry_size = sizeof (entry);
 6174         uint64_t entry_offset = ms->ms_id * entry_size;
 6175 
 6176         uint64_t object = 0;
 6177         int err = zap_lookup(mos, vd->vdev_top_zap,
 6178             VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 6179             &object);
 6180         if (err == ENOENT) {
 6181                 object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
 6182                     SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
 6183                 VERIFY0(zap_add(mos, vd->vdev_top_zap,
 6184                     VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 6185                     &object, tx));
 6186         } else {
 6187                 VERIFY0(err);
 6188         }
 6189 
 6190         dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
 6191             &entry, tx);
 6192 }
 6193 
 6194 void
 6195 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
 6196 {
 6197         ms->ms_unflushed_txg = txg;
 6198         metaslab_update_ondisk_flush_data(ms, tx);
 6199 }
 6200 
 6201 boolean_t
 6202 metaslab_unflushed_dirty(metaslab_t *ms)
 6203 {
 6204         return (ms->ms_unflushed_dirty);
 6205 }
 6206 
 6207 uint64_t
 6208 metaslab_unflushed_txg(metaslab_t *ms)
 6209 {
 6210         return (ms->ms_unflushed_txg);
 6211 }
 6212 
 6213 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
 6214         "Allocation granularity (a.k.a. stripe size)");
 6215 
 6216 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
 6217         "Load all metaslabs when pool is first opened");
 6218 
 6219 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
 6220         "Prevent metaslabs from being unloaded");
 6221 
 6222 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
 6223         "Preload potential metaslabs during reassessment");
 6224 
 6225 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
 6226         "Delay in txgs after metaslab was last used before unloading");
 6227 
 6228 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW,
 6229         "Delay in milliseconds after metaslab was last used before unloading");
 6230 
 6231 /* BEGIN CSTYLED */
 6232 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW,
 6233         "Percentage of metaslab group size that should be free to make it "
 6234         "eligible for allocation");
 6235 
 6236 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW,
 6237         "Percentage of metaslab group size that should be considered eligible "
 6238         "for allocations unless all metaslab groups within the metaslab class "
 6239         "have also crossed this threshold");
 6240 
 6241 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT,
 6242         ZMOD_RW,
 6243         "Use the fragmentation metric to prefer less fragmented metaslabs");
 6244 /* END CSTYLED */
 6245 
 6246 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT,
 6247         ZMOD_RW, "Fragmentation for metaslab to allow allocation");
 6248 
 6249 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
 6250         "Prefer metaslabs with lower LBAs");
 6251 
 6252 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
 6253         "Enable metaslab group biasing");
 6254 
 6255 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
 6256         ZMOD_RW, "Enable segment-based metaslab selection");
 6257 
 6258 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
 6259         "Segment-based metaslab selection maximum buckets before switching");
 6260 
 6261 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
 6262         "Blocks larger than this size are forced to be gang blocks");
 6263 
 6264 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
 6265         "Max distance (bytes) to search forward before using size tree");
 6266 
 6267 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
 6268         "When looking in size tree, use largest segment instead of exact fit");
 6269 
 6270 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
 6271         ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
 6272 
 6273 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW,
 6274         "Percentage of memory that can be used to store metaslab range trees");
 6275 
 6276 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
 6277         ZMOD_RW, "Try hard to allocate before ganging");
 6278 
 6279 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
 6280         "Normally only consider this many of the best metaslabs in each vdev");

Cache object: 854f1ec17c5b8031434fd1c91d5d3875


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.