spa.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 
   22 /*
   23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   24  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
   25  * Copyright (c) 2018, Nexenta Systems, Inc.  All rights reserved.
   26  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
   27  * Copyright 2013 Saso Kiselkov. All rights reserved.
   28  * Copyright (c) 2014 Integros [integros.com]
   29  * Copyright 2016 Toomas Soome <tsoome@me.com>
   30  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
   31  * Copyright 2018 Joyent, Inc.
   32  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
   33  * Copyright 2017 Joyent, Inc.
   34  * Copyright (c) 2017, Intel Corporation.
   35  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
   36  */
   37 
   38 /*
   39  * SPA: Storage Pool Allocator
   40  *
   41  * This file contains all the routines used when modifying on-disk SPA state.
   42  * This includes opening, importing, destroying, exporting a pool, and syncing a
   43  * pool.
   44  */
   45 
   46 #include <sys/zfs_context.h>
   47 #include <sys/fm/fs/zfs.h>
   48 #include <sys/spa_impl.h>
   49 #include <sys/zio.h>
   50 #include <sys/zio_checksum.h>
   51 #include <sys/dmu.h>
   52 #include <sys/dmu_tx.h>
   53 #include <sys/zap.h>
   54 #include <sys/zil.h>
   55 #include <sys/ddt.h>
   56 #include <sys/vdev_impl.h>
   57 #include <sys/vdev_removal.h>
   58 #include <sys/vdev_indirect_mapping.h>
   59 #include <sys/vdev_indirect_births.h>
   60 #include <sys/vdev_initialize.h>
   61 #include <sys/vdev_rebuild.h>
   62 #include <sys/vdev_trim.h>
   63 #include <sys/vdev_disk.h>
   64 #include <sys/vdev_draid.h>
   65 #include <sys/metaslab.h>
   66 #include <sys/metaslab_impl.h>
   67 #include <sys/mmp.h>
   68 #include <sys/uberblock_impl.h>
   69 #include <sys/txg.h>
   70 #include <sys/avl.h>
   71 #include <sys/bpobj.h>
   72 #include <sys/dmu_traverse.h>
   73 #include <sys/dmu_objset.h>
   74 #include <sys/unique.h>
   75 #include <sys/dsl_pool.h>
   76 #include <sys/dsl_dataset.h>
   77 #include <sys/dsl_dir.h>
   78 #include <sys/dsl_prop.h>
   79 #include <sys/dsl_synctask.h>
   80 #include <sys/fs/zfs.h>
   81 #include <sys/arc.h>
   82 #include <sys/callb.h>
   83 #include <sys/systeminfo.h>
   84 #include <sys/zfs_ioctl.h>
   85 #include <sys/dsl_scan.h>
   86 #include <sys/zfeature.h>
   87 #include <sys/dsl_destroy.h>
   88 #include <sys/zvol.h>
   89 
   90 #ifdef  _KERNEL
   91 #include <sys/fm/protocol.h>
   92 #include <sys/fm/util.h>
   93 #include <sys/callb.h>
   94 #include <sys/zone.h>
   95 #include <sys/vmsystm.h>
   96 #endif  /* _KERNEL */
   97 
   98 #include "zfs_prop.h"
   99 #include "zfs_comutil.h"
  100 
  101 /*
  102  * The interval, in seconds, at which failed configuration cache file writes
  103  * should be retried.
  104  */
  105 int zfs_ccw_retry_interval = 300;
  106 
  107 typedef enum zti_modes {
  108         ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
  109         ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
  110         ZTI_MODE_SCALE,                 /* Taskqs scale with CPUs. */
  111         ZTI_MODE_NULL,                  /* don't create a taskq */
  112         ZTI_NMODES
  113 } zti_modes_t;
  114 
  115 #define ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
  116 #define ZTI_PCT(n)      { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
  117 #define ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
  118 #define ZTI_SCALE       { ZTI_MODE_SCALE, 0, 1 }
  119 #define ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
  120 
  121 #define ZTI_N(n)        ZTI_P(n, 1)
  122 #define ZTI_ONE         ZTI_N(1)
  123 
  124 typedef struct zio_taskq_info {
  125         zti_modes_t zti_mode;
  126         uint_t zti_value;
  127         uint_t zti_count;
  128 } zio_taskq_info_t;
  129 
  130 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
  131         "iss", "iss_h", "int", "int_h"
  132 };
  133 
  134 /*
  135  * This table defines the taskq settings for each ZFS I/O type. When
  136  * initializing a pool, we use this table to create an appropriately sized
  137  * taskq. Some operations are low volume and therefore have a small, static
  138  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
  139  * macros. Other operations process a large amount of data; the ZTI_BATCH
  140  * macro causes us to create a taskq oriented for throughput. Some operations
  141  * are so high frequency and short-lived that the taskq itself can become a
  142  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  143  * additional degree of parallelism specified by the number of threads per-
  144  * taskq and the number of taskqs; when dispatching an event in this case, the
  145  * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
  146  * but with number of taskqs also scaling with number of CPUs.
  147  *
  148  * The different taskq priorities are to handle the different contexts (issue
  149  * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
  150  * need to be handled with minimum delay.
  151  */
  152 static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
  153         /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
  154         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
  155         { ZTI_N(8),     ZTI_NULL,       ZTI_SCALE,      ZTI_NULL }, /* READ */
  156         { ZTI_BATCH,    ZTI_N(5),       ZTI_SCALE,      ZTI_N(5) }, /* WRITE */
  157         { ZTI_SCALE,    ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
  158         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
  159         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
  160         { ZTI_N(4),     ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* TRIM */
  161 };
  162 
  163 static void spa_sync_version(void *arg, dmu_tx_t *tx);
  164 static void spa_sync_props(void *arg, dmu_tx_t *tx);
  165 static boolean_t spa_has_active_shared_spare(spa_t *spa);
  166 static int spa_load_impl(spa_t *spa, spa_import_type_t type,
  167     const char **ereport);
  168 static void spa_vdev_resilver_done(spa_t *spa);
  169 
  170 static uint_t   zio_taskq_batch_pct = 80;         /* 1 thread per cpu in pset */
  171 static uint_t   zio_taskq_batch_tpq;              /* threads per taskq */
  172 static const boolean_t  zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
  173 static const uint_t     zio_taskq_basedc = 80;    /* base duty cycle */
  174 
  175 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
  176 
  177 /*
  178  * Report any spa_load_verify errors found, but do not fail spa_load.
  179  * This is used by zdb to analyze non-idle pools.
  180  */
  181 boolean_t       spa_load_verify_dryrun = B_FALSE;
  182 
  183 /*
  184  * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ).
  185  * This is used by zdb for spacemaps verification.
  186  */
  187 boolean_t       spa_mode_readable_spacemaps = B_FALSE;
  188 
  189 /*
  190  * This (illegal) pool name is used when temporarily importing a spa_t in order
  191  * to get the vdev stats associated with the imported devices.
  192  */
  193 #define TRYIMPORT_NAME  "$import"
  194 
  195 /*
  196  * For debugging purposes: print out vdev tree during pool import.
  197  */
  198 static int              spa_load_print_vdev_tree = B_FALSE;
  199 
  200 /*
  201  * A non-zero value for zfs_max_missing_tvds means that we allow importing
  202  * pools with missing top-level vdevs. This is strictly intended for advanced
  203  * pool recovery cases since missing data is almost inevitable. Pools with
  204  * missing devices can only be imported read-only for safety reasons, and their
  205  * fail-mode will be automatically set to "continue".
  206  *
  207  * With 1 missing vdev we should be able to import the pool and mount all
  208  * datasets. User data that was not modified after the missing device has been
  209  * added should be recoverable. This means that snapshots created prior to the
  210  * addition of that device should be completely intact.
  211  *
  212  * With 2 missing vdevs, some datasets may fail to mount since there are
  213  * dataset statistics that are stored as regular metadata. Some data might be
  214  * recoverable if those vdevs were added recently.
  215  *
  216  * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
  217  * may be missing entirely. Chances of data recovery are very low. Note that
  218  * there are also risks of performing an inadvertent rewind as we might be
  219  * missing all the vdevs with the latest uberblocks.
  220  */
  221 uint64_t        zfs_max_missing_tvds = 0;
  222 
  223 /*
  224  * The parameters below are similar to zfs_max_missing_tvds but are only
  225  * intended for a preliminary open of the pool with an untrusted config which
  226  * might be incomplete or out-dated.
  227  *
  228  * We are more tolerant for pools opened from a cachefile since we could have
  229  * an out-dated cachefile where a device removal was not registered.
  230  * We could have set the limit arbitrarily high but in the case where devices
  231  * are really missing we would want to return the proper error codes; we chose
  232  * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
  233  * and we get a chance to retrieve the trusted config.
  234  */
  235 uint64_t        zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
  236 
  237 /*
  238  * In the case where config was assembled by scanning device paths (/dev/dsks
  239  * by default) we are less tolerant since all the existing devices should have
  240  * been detected and we want spa_load to return the right error codes.
  241  */
  242 uint64_t        zfs_max_missing_tvds_scan = 0;
  243 
  244 /*
  245  * Debugging aid that pauses spa_sync() towards the end.
  246  */
  247 static const boolean_t  zfs_pause_spa_sync = B_FALSE;
  248 
  249 /*
  250  * Variables to indicate the livelist condense zthr func should wait at certain
  251  * points for the livelist to be removed - used to test condense/destroy races
  252  */
  253 static int zfs_livelist_condense_zthr_pause = 0;
  254 static int zfs_livelist_condense_sync_pause = 0;
  255 
  256 /*
  257  * Variables to track whether or not condense cancellation has been
  258  * triggered in testing.
  259  */
  260 static int zfs_livelist_condense_sync_cancel = 0;
  261 static int zfs_livelist_condense_zthr_cancel = 0;
  262 
  263 /*
  264  * Variable to track whether or not extra ALLOC blkptrs were added to a
  265  * livelist entry while it was being condensed (caused by the way we track
  266  * remapped blkptrs in dbuf_remap_impl)
  267  */
  268 static int zfs_livelist_condense_new_alloc = 0;
  269 
  270 /*
  271  * ==========================================================================
  272  * SPA properties routines
  273  * ==========================================================================
  274  */
  275 
  276 /*
  277  * Add a (source=src, propname=propval) list to an nvlist.
  278  */
  279 static void
  280 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
  281     uint64_t intval, zprop_source_t src)
  282 {
  283         const char *propname = zpool_prop_to_name(prop);
  284         nvlist_t *propval;
  285 
  286         propval = fnvlist_alloc();
  287         fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
  288 
  289         if (strval != NULL)
  290                 fnvlist_add_string(propval, ZPROP_VALUE, strval);
  291         else
  292                 fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
  293 
  294         fnvlist_add_nvlist(nvl, propname, propval);
  295         nvlist_free(propval);
  296 }
  297 
  298 /*
  299  * Get property values from the spa configuration.
  300  */
  301 static void
  302 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
  303 {
  304         vdev_t *rvd = spa->spa_root_vdev;
  305         dsl_pool_t *pool = spa->spa_dsl_pool;
  306         uint64_t size, alloc, cap, version;
  307         const zprop_source_t src = ZPROP_SRC_NONE;
  308         spa_config_dirent_t *dp;
  309         metaslab_class_t *mc = spa_normal_class(spa);
  310 
  311         ASSERT(MUTEX_HELD(&spa->spa_props_lock));
  312 
  313         if (rvd != NULL) {
  314                 alloc = metaslab_class_get_alloc(mc);
  315                 alloc += metaslab_class_get_alloc(spa_special_class(spa));
  316                 alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
  317                 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
  318 
  319                 size = metaslab_class_get_space(mc);
  320                 size += metaslab_class_get_space(spa_special_class(spa));
  321                 size += metaslab_class_get_space(spa_dedup_class(spa));
  322                 size += metaslab_class_get_space(spa_embedded_log_class(spa));
  323 
  324                 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
  325                 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
  326                 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
  327                 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
  328                     size - alloc, src);
  329                 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
  330                     spa->spa_checkpoint_info.sci_dspace, src);
  331 
  332                 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
  333                     metaslab_class_fragmentation(mc), src);
  334                 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
  335                     metaslab_class_expandable_space(mc), src);
  336                 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
  337                     (spa_mode(spa) == SPA_MODE_READ), src);
  338 
  339                 cap = (size == 0) ? 0 : (alloc * 100 / size);
  340                 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
  341 
  342                 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
  343                     ddt_get_pool_dedup_ratio(spa), src);
  344 
  345                 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
  346                     rvd->vdev_state, src);
  347 
  348                 version = spa_version(spa);
  349                 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
  350                         spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
  351                             version, ZPROP_SRC_DEFAULT);
  352                 } else {
  353                         spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
  354                             version, ZPROP_SRC_LOCAL);
  355                 }
  356                 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
  357                     NULL, spa_load_guid(spa), src);
  358         }
  359 
  360         if (pool != NULL) {
  361                 /*
  362                  * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
  363                  * when opening pools before this version freedir will be NULL.
  364                  */
  365                 if (pool->dp_free_dir != NULL) {
  366                         spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
  367                             dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
  368                             src);
  369                 } else {
  370                         spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
  371                             NULL, 0, src);
  372                 }
  373 
  374                 if (pool->dp_leak_dir != NULL) {
  375                         spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
  376                             dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
  377                             src);
  378                 } else {
  379                         spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
  380                             NULL, 0, src);
  381                 }
  382         }
  383 
  384         spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
  385 
  386         if (spa->spa_comment != NULL) {
  387                 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
  388                     0, ZPROP_SRC_LOCAL);
  389         }
  390 
  391         if (spa->spa_compatibility != NULL) {
  392                 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY,
  393                     spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
  394         }
  395 
  396         if (spa->spa_root != NULL)
  397                 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
  398                     0, ZPROP_SRC_LOCAL);
  399 
  400         if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
  401                 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
  402                     MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
  403         } else {
  404                 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
  405                     SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
  406         }
  407 
  408         if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
  409                 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
  410                     DNODE_MAX_SIZE, ZPROP_SRC_NONE);
  411         } else {
  412                 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
  413                     DNODE_MIN_SIZE, ZPROP_SRC_NONE);
  414         }
  415 
  416         if ((dp = list_head(&spa->spa_config_list)) != NULL) {
  417                 if (dp->scd_path == NULL) {
  418                         spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
  419                             "none", 0, ZPROP_SRC_LOCAL);
  420                 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
  421                         spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
  422                             dp->scd_path, 0, ZPROP_SRC_LOCAL);
  423                 }
  424         }
  425 }
  426 
  427 /*
  428  * Get zpool property values.
  429  */
  430 int
  431 spa_prop_get(spa_t *spa, nvlist_t **nvp)
  432 {
  433         objset_t *mos = spa->spa_meta_objset;
  434         zap_cursor_t zc;
  435         zap_attribute_t za;
  436         dsl_pool_t *dp;
  437         int err;
  438 
  439         err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
  440         if (err)
  441                 return (err);
  442 
  443         dp = spa_get_dsl(spa);
  444         dsl_pool_config_enter(dp, FTAG);
  445         mutex_enter(&spa->spa_props_lock);
  446 
  447         /*
  448          * Get properties from the spa config.
  449          */
  450         spa_prop_get_config(spa, nvp);
  451 
  452         /* If no pool property object, no more prop to get. */
  453         if (mos == NULL || spa->spa_pool_props_object == 0)
  454                 goto out;
  455 
  456         /*
  457          * Get properties from the MOS pool property object.
  458          */
  459         for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
  460             (err = zap_cursor_retrieve(&zc, &za)) == 0;
  461             zap_cursor_advance(&zc)) {
  462                 uint64_t intval = 0;
  463                 char *strval = NULL;
  464                 zprop_source_t src = ZPROP_SRC_DEFAULT;
  465                 zpool_prop_t prop;
  466 
  467                 if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
  468                         continue;
  469 
  470                 switch (za.za_integer_length) {
  471                 case 8:
  472                         /* integer property */
  473                         if (za.za_first_integer !=
  474                             zpool_prop_default_numeric(prop))
  475                                 src = ZPROP_SRC_LOCAL;
  476 
  477                         if (prop == ZPOOL_PROP_BOOTFS) {
  478                                 dsl_dataset_t *ds = NULL;
  479 
  480                                 err = dsl_dataset_hold_obj(dp,
  481                                     za.za_first_integer, FTAG, &ds);
  482                                 if (err != 0)
  483                                         break;
  484 
  485                                 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
  486                                     KM_SLEEP);
  487                                 dsl_dataset_name(ds, strval);
  488                                 dsl_dataset_rele(ds, FTAG);
  489                         } else {
  490                                 strval = NULL;
  491                                 intval = za.za_first_integer;
  492                         }
  493 
  494                         spa_prop_add_list(*nvp, prop, strval, intval, src);
  495 
  496                         if (strval != NULL)
  497                                 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
  498 
  499                         break;
  500 
  501                 case 1:
  502                         /* string property */
  503                         strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
  504                         err = zap_lookup(mos, spa->spa_pool_props_object,
  505                             za.za_name, 1, za.za_num_integers, strval);
  506                         if (err) {
  507                                 kmem_free(strval, za.za_num_integers);
  508                                 break;
  509                         }
  510                         spa_prop_add_list(*nvp, prop, strval, 0, src);
  511                         kmem_free(strval, za.za_num_integers);
  512                         break;
  513 
  514                 default:
  515                         break;
  516                 }
  517         }
  518         zap_cursor_fini(&zc);
  519 out:
  520         mutex_exit(&spa->spa_props_lock);
  521         dsl_pool_config_exit(dp, FTAG);
  522         if (err && err != ENOENT) {
  523                 nvlist_free(*nvp);
  524                 *nvp = NULL;
  525                 return (err);
  526         }
  527 
  528         return (0);
  529 }
  530 
  531 /*
  532  * Validate the given pool properties nvlist and modify the list
  533  * for the property values to be set.
  534  */
  535 static int
  536 spa_prop_validate(spa_t *spa, nvlist_t *props)
  537 {
  538         nvpair_t *elem;
  539         int error = 0, reset_bootfs = 0;
  540         uint64_t objnum = 0;
  541         boolean_t has_feature = B_FALSE;
  542 
  543         elem = NULL;
  544         while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
  545                 uint64_t intval;
  546                 char *strval, *slash, *check, *fname;
  547                 const char *propname = nvpair_name(elem);
  548                 zpool_prop_t prop = zpool_name_to_prop(propname);
  549 
  550                 switch (prop) {
  551                 case ZPOOL_PROP_INVAL:
  552                         if (!zpool_prop_feature(propname)) {
  553                                 error = SET_ERROR(EINVAL);
  554                                 break;
  555                         }
  556 
  557                         /*
  558                          * Sanitize the input.
  559                          */
  560                         if (nvpair_type(elem) != DATA_TYPE_UINT64) {
  561                                 error = SET_ERROR(EINVAL);
  562                                 break;
  563                         }
  564 
  565                         if (nvpair_value_uint64(elem, &intval) != 0) {
  566                                 error = SET_ERROR(EINVAL);
  567                                 break;
  568                         }
  569 
  570                         if (intval != 0) {
  571                                 error = SET_ERROR(EINVAL);
  572                                 break;
  573                         }
  574 
  575                         fname = strchr(propname, '@') + 1;
  576                         if (zfeature_lookup_name(fname, NULL) != 0) {
  577                                 error = SET_ERROR(EINVAL);
  578                                 break;
  579                         }
  580 
  581                         has_feature = B_TRUE;
  582                         break;
  583 
  584                 case ZPOOL_PROP_VERSION:
  585                         error = nvpair_value_uint64(elem, &intval);
  586                         if (!error &&
  587                             (intval < spa_version(spa) ||
  588                             intval > SPA_VERSION_BEFORE_FEATURES ||
  589                             has_feature))
  590                                 error = SET_ERROR(EINVAL);
  591                         break;
  592 
  593                 case ZPOOL_PROP_DELEGATION:
  594                 case ZPOOL_PROP_AUTOREPLACE:
  595                 case ZPOOL_PROP_LISTSNAPS:
  596                 case ZPOOL_PROP_AUTOEXPAND:
  597                 case ZPOOL_PROP_AUTOTRIM:
  598                         error = nvpair_value_uint64(elem, &intval);
  599                         if (!error && intval > 1)
  600                                 error = SET_ERROR(EINVAL);
  601                         break;
  602 
  603                 case ZPOOL_PROP_MULTIHOST:
  604                         error = nvpair_value_uint64(elem, &intval);
  605                         if (!error && intval > 1)
  606                                 error = SET_ERROR(EINVAL);
  607 
  608                         if (!error) {
  609                                 uint32_t hostid = zone_get_hostid(NULL);
  610                                 if (hostid)
  611                                         spa->spa_hostid = hostid;
  612                                 else
  613                                         error = SET_ERROR(ENOTSUP);
  614                         }
  615 
  616                         break;
  617 
  618                 case ZPOOL_PROP_BOOTFS:
  619                         /*
  620                          * If the pool version is less than SPA_VERSION_BOOTFS,
  621                          * or the pool is still being created (version == 0),
  622                          * the bootfs property cannot be set.
  623                          */
  624                         if (spa_version(spa) < SPA_VERSION_BOOTFS) {
  625                                 error = SET_ERROR(ENOTSUP);
  626                                 break;
  627                         }
  628 
  629                         /*
  630                          * Make sure the vdev config is bootable
  631                          */
  632                         if (!vdev_is_bootable(spa->spa_root_vdev)) {
  633                                 error = SET_ERROR(ENOTSUP);
  634                                 break;
  635                         }
  636 
  637                         reset_bootfs = 1;
  638 
  639                         error = nvpair_value_string(elem, &strval);
  640 
  641                         if (!error) {
  642                                 objset_t *os;
  643 
  644                                 if (strval == NULL || strval[0] == '\0') {
  645                                         objnum = zpool_prop_default_numeric(
  646                                             ZPOOL_PROP_BOOTFS);
  647                                         break;
  648                                 }
  649 
  650                                 error = dmu_objset_hold(strval, FTAG, &os);
  651                                 if (error != 0)
  652                                         break;
  653 
  654                                 /* Must be ZPL. */
  655                                 if (dmu_objset_type(os) != DMU_OST_ZFS) {
  656                                         error = SET_ERROR(ENOTSUP);
  657                                 } else {
  658                                         objnum = dmu_objset_id(os);
  659                                 }
  660                                 dmu_objset_rele(os, FTAG);
  661                         }
  662                         break;
  663 
  664                 case ZPOOL_PROP_FAILUREMODE:
  665                         error = nvpair_value_uint64(elem, &intval);
  666                         if (!error && intval > ZIO_FAILURE_MODE_PANIC)
  667                                 error = SET_ERROR(EINVAL);
  668 
  669                         /*
  670                          * This is a special case which only occurs when
  671                          * the pool has completely failed. This allows
  672                          * the user to change the in-core failmode property
  673                          * without syncing it out to disk (I/Os might
  674                          * currently be blocked). We do this by returning
  675                          * EIO to the caller (spa_prop_set) to trick it
  676                          * into thinking we encountered a property validation
  677                          * error.
  678                          */
  679                         if (!error && spa_suspended(spa)) {
  680                                 spa->spa_failmode = intval;
  681                                 error = SET_ERROR(EIO);
  682                         }
  683                         break;
  684 
  685                 case ZPOOL_PROP_CACHEFILE:
  686                         if ((error = nvpair_value_string(elem, &strval)) != 0)
  687                                 break;
  688 
  689                         if (strval[0] == '\0')
  690                                 break;
  691 
  692                         if (strcmp(strval, "none") == 0)
  693                                 break;
  694 
  695                         if (strval[0] != '/') {
  696                                 error = SET_ERROR(EINVAL);
  697                                 break;
  698                         }
  699 
  700                         slash = strrchr(strval, '/');
  701                         ASSERT(slash != NULL);
  702 
  703                         if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
  704                             strcmp(slash, "/..") == 0)
  705                                 error = SET_ERROR(EINVAL);
  706                         break;
  707 
  708                 case ZPOOL_PROP_COMMENT:
  709                         if ((error = nvpair_value_string(elem, &strval)) != 0)
  710                                 break;
  711                         for (check = strval; *check != '\0'; check++) {
  712                                 if (!isprint(*check)) {
  713                                         error = SET_ERROR(EINVAL);
  714                                         break;
  715                                 }
  716                         }
  717                         if (strlen(strval) > ZPROP_MAX_COMMENT)
  718                                 error = SET_ERROR(E2BIG);
  719                         break;
  720 
  721                 default:
  722                         break;
  723                 }
  724 
  725                 if (error)
  726                         break;
  727         }
  728 
  729         (void) nvlist_remove_all(props,
  730             zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
  731 
  732         if (!error && reset_bootfs) {
  733                 error = nvlist_remove(props,
  734                     zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
  735 
  736                 if (!error) {
  737                         error = nvlist_add_uint64(props,
  738                             zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
  739                 }
  740         }
  741 
  742         return (error);
  743 }
  744 
  745 void
  746 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
  747 {
  748         char *cachefile;
  749         spa_config_dirent_t *dp;
  750 
  751         if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
  752             &cachefile) != 0)
  753                 return;
  754 
  755         dp = kmem_alloc(sizeof (spa_config_dirent_t),
  756             KM_SLEEP);
  757 
  758         if (cachefile[0] == '\0')
  759                 dp->scd_path = spa_strdup(spa_config_path);
  760         else if (strcmp(cachefile, "none") == 0)
  761                 dp->scd_path = NULL;
  762         else
  763                 dp->scd_path = spa_strdup(cachefile);
  764 
  765         list_insert_head(&spa->spa_config_list, dp);
  766         if (need_sync)
  767                 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
  768 }
  769 
  770 int
  771 spa_prop_set(spa_t *spa, nvlist_t *nvp)
  772 {
  773         int error;
  774         nvpair_t *elem = NULL;
  775         boolean_t need_sync = B_FALSE;
  776 
  777         if ((error = spa_prop_validate(spa, nvp)) != 0)
  778                 return (error);
  779 
  780         while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
  781                 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
  782 
  783                 if (prop == ZPOOL_PROP_CACHEFILE ||
  784                     prop == ZPOOL_PROP_ALTROOT ||
  785                     prop == ZPOOL_PROP_READONLY)
  786                         continue;
  787 
  788                 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
  789                         uint64_t ver = 0;
  790 
  791                         if (prop == ZPOOL_PROP_VERSION) {
  792                                 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
  793                         } else {
  794                                 ASSERT(zpool_prop_feature(nvpair_name(elem)));
  795                                 ver = SPA_VERSION_FEATURES;
  796                                 need_sync = B_TRUE;
  797                         }
  798 
  799                         /* Save time if the version is already set. */
  800                         if (ver == spa_version(spa))
  801                                 continue;
  802 
  803                         /*
  804                          * In addition to the pool directory object, we might
  805                          * create the pool properties object, the features for
  806                          * read object, the features for write object, or the
  807                          * feature descriptions object.
  808                          */
  809                         error = dsl_sync_task(spa->spa_name, NULL,
  810                             spa_sync_version, &ver,
  811                             6, ZFS_SPACE_CHECK_RESERVED);
  812                         if (error)
  813                                 return (error);
  814                         continue;
  815                 }
  816 
  817                 need_sync = B_TRUE;
  818                 break;
  819         }
  820 
  821         if (need_sync) {
  822                 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
  823                     nvp, 6, ZFS_SPACE_CHECK_RESERVED));
  824         }
  825 
  826         return (0);
  827 }
  828 
  829 /*
  830  * If the bootfs property value is dsobj, clear it.
  831  */
  832 void
  833 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
  834 {
  835         if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
  836                 VERIFY(zap_remove(spa->spa_meta_objset,
  837                     spa->spa_pool_props_object,
  838                     zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
  839                 spa->spa_bootfs = 0;
  840         }
  841 }
  842 
  843 static int
  844 spa_change_guid_check(void *arg, dmu_tx_t *tx)
  845 {
  846         uint64_t *newguid __maybe_unused = arg;
  847         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
  848         vdev_t *rvd = spa->spa_root_vdev;
  849         uint64_t vdev_state;
  850 
  851         if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
  852                 int error = (spa_has_checkpoint(spa)) ?
  853                     ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
  854                 return (SET_ERROR(error));
  855         }
  856 
  857         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
  858         vdev_state = rvd->vdev_state;
  859         spa_config_exit(spa, SCL_STATE, FTAG);
  860 
  861         if (vdev_state != VDEV_STATE_HEALTHY)
  862                 return (SET_ERROR(ENXIO));
  863 
  864         ASSERT3U(spa_guid(spa), !=, *newguid);
  865 
  866         return (0);
  867 }
  868 
  869 static void
  870 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
  871 {
  872         uint64_t *newguid = arg;
  873         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
  874         uint64_t oldguid;
  875         vdev_t *rvd = spa->spa_root_vdev;
  876 
  877         oldguid = spa_guid(spa);
  878 
  879         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
  880         rvd->vdev_guid = *newguid;
  881         rvd->vdev_guid_sum += (*newguid - oldguid);
  882         vdev_config_dirty(rvd);
  883         spa_config_exit(spa, SCL_STATE, FTAG);
  884 
  885         spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
  886             (u_longlong_t)oldguid, (u_longlong_t)*newguid);
  887 }
  888 
  889 /*
  890  * Change the GUID for the pool.  This is done so that we can later
  891  * re-import a pool built from a clone of our own vdevs.  We will modify
  892  * the root vdev's guid, our own pool guid, and then mark all of our
  893  * vdevs dirty.  Note that we must make sure that all our vdevs are
  894  * online when we do this, or else any vdevs that weren't present
  895  * would be orphaned from our pool.  We are also going to issue a
  896  * sysevent to update any watchers.
  897  */
  898 int
  899 spa_change_guid(spa_t *spa)
  900 {
  901         int error;
  902         uint64_t guid;
  903 
  904         mutex_enter(&spa->spa_vdev_top_lock);
  905         mutex_enter(&spa_namespace_lock);
  906         guid = spa_generate_guid(NULL);
  907 
  908         error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
  909             spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
  910 
  911         if (error == 0) {
  912                 /*
  913                  * Clear the kobj flag from all the vdevs to allow
  914                  * vdev_cache_process_kobj_evt() to post events to all the
  915                  * vdevs since GUID is updated.
  916                  */
  917                 vdev_clear_kobj_evt(spa->spa_root_vdev);
  918                 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
  919                         vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]);
  920 
  921                 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
  922                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
  923         }
  924 
  925         mutex_exit(&spa_namespace_lock);
  926         mutex_exit(&spa->spa_vdev_top_lock);
  927 
  928         return (error);
  929 }
  930 
  931 /*
  932  * ==========================================================================
  933  * SPA state manipulation (open/create/destroy/import/export)
  934  * ==========================================================================
  935  */
  936 
  937 static int
  938 spa_error_entry_compare(const void *a, const void *b)
  939 {
  940         const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
  941         const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
  942         int ret;
  943 
  944         ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
  945             sizeof (zbookmark_phys_t));
  946 
  947         return (TREE_ISIGN(ret));
  948 }
  949 
  950 /*
  951  * Utility function which retrieves copies of the current logs and
  952  * re-initializes them in the process.
  953  */
  954 void
  955 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
  956 {
  957         ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
  958 
  959         memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t));
  960         memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t));
  961 
  962         avl_create(&spa->spa_errlist_scrub,
  963             spa_error_entry_compare, sizeof (spa_error_entry_t),
  964             offsetof(spa_error_entry_t, se_avl));
  965         avl_create(&spa->spa_errlist_last,
  966             spa_error_entry_compare, sizeof (spa_error_entry_t),
  967             offsetof(spa_error_entry_t, se_avl));
  968 }
  969 
/*
 * Create the taskqs for zio type `t` and taskq type `q` and store them in
 * spa->spa_zio_taskq[t][q].
 *
 * The sizing comes from the matching zio_taskqs[t][q] entry.  Its mode
 * selects how the number of taskqs (count) and the per-taskq thread
 * value are derived:
 *
 *   ZTI_MODE_FIXED  use the entry's value and count as given.
 *   ZTI_MODE_BATCH  value becomes zio_taskq_batch_pct (capped at 100) and
 *                   is passed with TASKQ_THREADS_CPU_PCT.
 *   ZTI_MODE_SCALE  count is computed from boot_ncpus and the tunables,
 *                   and the batch percentage is divided among the taskqs.
 *   ZTI_MODE_NULL   no taskqs are created; the slot is left empty.
 *
 * Any other mode panics.
 */
static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
        const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
        enum zti_modes mode = ztip->zti_mode;
        uint_t value = ztip->zti_value;
        uint_t count = ztip->zti_count;
        spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
        uint_t cpus, flags = TASKQ_DYNAMIC;
        boolean_t batch = B_FALSE;

        switch (mode) {
        case ZTI_MODE_FIXED:
                ASSERT3U(value, >, 0);
                break;

        case ZTI_MODE_BATCH:
                batch = B_TRUE;
                flags |= TASKQ_THREADS_CPU_PCT;
                value = MIN(zio_taskq_batch_pct, 100);
                break;

        case ZTI_MODE_SCALE:
                flags |= TASKQ_THREADS_CPU_PCT;
                /*
                 * We want more taskqs to reduce lock contention, but we want
                 * less for better request ordering and CPU utilization.
                 */
                cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
                if (zio_taskq_batch_tpq > 0) {
                        /* Explicit threads-per-taskq: round to nearest. */
                        count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
                            zio_taskq_batch_tpq);
                } else {
                        /*
                         * Prefer 6 threads per taskq, but no more taskqs
                         * than threads in them on large systems. For 80%:
                         *
                         *                 taskq   taskq   total
                         * cpus    taskqs  percent threads threads
                         * ------- ------- ------- ------- -------
                         * 1       1       80%     1       1
                         * 2       1       80%     1       1
                         * 4       1       80%     3       3
                         * 8       2       40%     3       6
                         * 16      3       27%     4       12
                         * 32      5       16%     5       25
                         * 64      7       11%     7       49
                         * 128     10      8%      10      100
                         * 256     14      6%      15      210
                         */
                        count = 1 + cpus / 6;
                        while (count * count > cpus)
                                count--;
                }
                /* Limit each taskq within 100% to not trigger assertion. */
                count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
                /* Split the batch percentage across the taskqs, rounded. */
                value = (zio_taskq_batch_pct + count / 2) / count;
                break;

        case ZTI_MODE_NULL:
                tqs->stqs_count = 0;
                tqs->stqs_taskq = NULL;
                return;

        default:
                panic("unrecognized mode for %s_%s taskq (%u:%u) in "
                    "spa_activate()",
                    zio_type_name[t], zio_taskq_types[q], mode, value);
                break;
        }

        ASSERT3U(count, >, 0);
        tqs->stqs_count = count;
        tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

        for (uint_t i = 0; i < count; i++) {
                taskq_t *tq;
                char name[32];

                /* Only number the taskqs when there is more than one. */
                if (count > 1)
                        (void) snprintf(name, sizeof (name), "%s_%s_%u",
                            zio_type_name[t], zio_taskq_types[q], i);
                else
                        (void) snprintf(name, sizeof (name), "%s_%s",
                            zio_type_name[t], zio_taskq_types[q]);

                if (zio_taskq_sysdc && spa->spa_proc != &p0) {
                        if (batch)
                                flags |= TASKQ_DC_BATCH;

                        /*
                         * NOTE(review): this no-op reference presumably
                         * keeps zio_taskq_basedc "used" where
                         * taskq_create_sysdc() ignores it -- confirm.
                         */
                        (void) zio_taskq_basedc;
                        tq = taskq_create_sysdc(name, value, 50, INT_MAX,
                            spa->spa_proc, zio_taskq_basedc, flags);
                } else {
                        pri_t pri = maxclsyspri;
                        /*
                         * The write issue taskq can be extremely CPU
                         * intensive.  Run it at slightly less important
                         * priority than the other taskqs.
                         *
                         * Under Linux and FreeBSD this means incrementing
                         * the priority value as opposed to platforms like
                         * illumos where it should be decremented.
                         *
                         * On FreeBSD, if priorities divided by four (RQ_PPQ)
                         * are equal then a difference between them is
                         * insignificant.
                         */
                        if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
#if defined(__linux__)
                                pri++;
#elif defined(__FreeBSD__)
                                pri += 4;
#else
#error "unknown OS"
#endif
                        }
                        tq = taskq_create_proc(name, value, pri, 50,
                            INT_MAX, spa->spa_proc, flags);
                }

                tqs->stqs_taskq[i] = tq;
        }
}
 1094 
 1095 static void
 1096 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 1097 {
 1098         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 1099 
 1100         if (tqs->stqs_taskq == NULL) {
 1101                 ASSERT3U(tqs->stqs_count, ==, 0);
 1102                 return;
 1103         }
 1104 
 1105         for (uint_t i = 0; i < tqs->stqs_count; i++) {
 1106                 ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
 1107                 taskq_destroy(tqs->stqs_taskq[i]);
 1108         }
 1109 
 1110         kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
 1111         tqs->stqs_taskq = NULL;
 1112 }
 1113 
 1114 /*
 1115  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 1116  * Note that a type may have multiple discrete taskqs to avoid lock contention
 1117  * on the taskq itself. In that case we choose which taskq at random by using
 1118  * the low bits of gethrtime().
 1119  */
 1120 void
 1121 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
 1122     task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
 1123 {
 1124         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 1125         taskq_t *tq;
 1126 
 1127         ASSERT3P(tqs->stqs_taskq, !=, NULL);
 1128         ASSERT3U(tqs->stqs_count, !=, 0);
 1129 
 1130         if (tqs->stqs_count == 1) {
 1131                 tq = tqs->stqs_taskq[0];
 1132         } else {
 1133                 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
 1134         }
 1135 
 1136         taskq_dispatch_ent(tq, func, arg, flags, ent);
 1137 }
 1138 
 1139 /*
 1140  * Same as spa_taskq_dispatch_ent() but block on the task until completion.
 1141  */
 1142 void
 1143 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
 1144     task_func_t *func, void *arg, uint_t flags)
 1145 {
 1146         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 1147         taskq_t *tq;
 1148         taskqid_t id;
 1149 
 1150         ASSERT3P(tqs->stqs_taskq, !=, NULL);
 1151         ASSERT3U(tqs->stqs_count, !=, 0);
 1152 
 1153         if (tqs->stqs_count == 1) {
 1154                 tq = tqs->stqs_taskq[0];
 1155         } else {
 1156                 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
 1157         }
 1158 
 1159         id = taskq_dispatch(tq, func, arg, flags);
 1160         if (id)
 1161                 taskq_wait_id(tq, id);
 1162 }
 1163 
 1164 static void
 1165 spa_create_zio_taskqs(spa_t *spa)
 1166 {
 1167         for (int t = 0; t < ZIO_TYPES; t++) {
 1168                 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 1169                         spa_taskqs_init(spa, t, q);
 1170                 }
 1171         }
 1172 }
 1173 
/*
 * Disabled until spa_thread() can be adapted for Linux.
 *
 * Because HAVE_SPA_THREAD is undefined right here, the function below is
 * currently compiled out.
 */
#undef HAVE_SPA_THREAD

#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
/*
 * Entry point of the per-pool process that spa_activate() starts with
 * newproc().  `arg` is the spa_t of the pool.
 *
 * The thread names its process "zpool-<poolname>", optionally binds to a
 * processor set and enters the SDC scheduling class, publishes itself in
 * spa_proc/spa_did, creates the zio taskqs, and then parks on spa_proc_cv
 * until spa_proc_state leaves SPA_PROC_ACTIVE.  On the way out it marks
 * the state SPA_PROC_GONE, resets spa_proc to &p0 and exits.
 */
static void
spa_thread(void *arg)
{
        /* Fixed to PS_NONE here, so the binding branch below is skipped. */
        psetid_t zio_taskq_psrset_bind = PS_NONE;
        callb_cpr_t cprinfo;

        spa_t *spa = arg;
        user_t *pu = PTOU(curproc);

        CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
            spa->spa_name);

        ASSERT(curproc != &p0);
        /* Name the process after the pool. */
        (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
            "zpool-%s", spa->spa_name);
        (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

        /* bind this thread to the requested psrset */
        if (zio_taskq_psrset_bind != PS_NONE) {
                pool_lock();
                mutex_enter(&cpu_lock);
                mutex_enter(&pidlock);
                mutex_enter(&curproc->p_lock);

                if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
                    0, NULL, NULL) == 0)  {
                        curthread->t_bind_pset = zio_taskq_psrset_bind;
                } else {
                        cmn_err(CE_WARN,
                            "Couldn't bind process for zfs pool \"%s\" to "
                            "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
                }

                /* Release in the reverse order of acquisition. */
                mutex_exit(&curproc->p_lock);
                mutex_exit(&pidlock);
                mutex_exit(&cpu_lock);
                pool_unlock();
        }

        if (zio_taskq_sysdc) {
                sysdc_thread_enter(curthread, 100, 0);
        }

        /* Publish the process and thread id before creating the taskqs. */
        spa->spa_proc = curproc;
        spa->spa_did = curthread->t_did;

        spa_create_zio_taskqs(spa);

        mutex_enter(&spa->spa_proc_lock);
        ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

        /* Tell spa_activate(), which waits on spa_proc_cv, that we are up. */
        spa->spa_proc_state = SPA_PROC_ACTIVE;
        cv_broadcast(&spa->spa_proc_cv);

        /* Sleep until the state is moved away from SPA_PROC_ACTIVE. */
        CALLB_CPR_SAFE_BEGIN(&cprinfo);
        while (spa->spa_proc_state == SPA_PROC_ACTIVE)
                cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
        CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

        ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
        spa->spa_proc_state = SPA_PROC_GONE;
        spa->spa_proc = &p0;
        cv_broadcast(&spa->spa_proc_cv);
        CALLB_CPR_EXIT(&cprinfo);       /* drops spa_proc_lock */

        mutex_enter(&curproc->p_lock);
        lwp_exit();
}
#endif
 1249 
/*
 * Activate an uninitialized pool.
 *
 * Moves the pool from POOL_STATE_UNINITIALIZED to POOL_STATE_ACTIVE with
 * the given open mode and sets up the in-core state created here: the
 * metaslab classes, the zio taskqs, the per-txg root zios, the dirty and
 * evicting lists, the error-log AVL trees, the keystore and the zvol,
 * prefetch and upgrade taskqs.  spa_deactivate() undoes this.
 */
static void
spa_activate(spa_t *spa, spa_mode_t mode)
{
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

        spa->spa_state = POOL_STATE_ACTIVE;
        spa->spa_mode = mode;
        spa->spa_read_spacemaps = spa_mode_readable_spacemaps;

        /* One metaslab class per allocation class, all with the same ops. */
        spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops);
        spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops);
        spa->spa_embedded_log_class =
            metaslab_class_create(spa, &zfs_metaslab_ops);
        spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops);
        spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops);

        /* Try to create a covering process */
        mutex_enter(&spa->spa_proc_lock);
        ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
        ASSERT(spa->spa_proc == &p0);
        spa->spa_did = 0;

        /*
         * NOTE(review): this no-op reference presumably keeps
         * spa_create_process "used" when HAVE_SPA_THREAD is not defined --
         * confirm.
         */
        (void) spa_create_process;
#ifdef HAVE_SPA_THREAD
        /* Only create a process if we're going to be around a while. */
        if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
                if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
                    NULL, 0) == 0) {
                        spa->spa_proc_state = SPA_PROC_CREATED;
                        /* Wait for spa_thread() to change the state. */
                        while (spa->spa_proc_state == SPA_PROC_CREATED) {
                                cv_wait(&spa->spa_proc_cv,
                                    &spa->spa_proc_lock);
                        }
                        ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
                        ASSERT(spa->spa_proc != &p0);
                        ASSERT(spa->spa_did != 0);
                } else {
#ifdef _KERNEL
                        cmn_err(CE_WARN,
                            "Couldn't create process for zfs pool \"%s\"\n",
                            spa->spa_name);
#endif
                }
        }
#endif /* HAVE_SPA_THREAD */
        mutex_exit(&spa->spa_proc_lock);

        /* If we didn't create a process, we need to create our taskqs. */
        if (spa->spa_proc == &p0) {
                spa_create_zio_taskqs(spa);
        }

        /* One root zio per txg slot; spa_deactivate() waits on each. */
        for (size_t i = 0; i < TXG_SIZE; i++) {
                spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
                    ZIO_FLAG_CANFAIL);
        }

        list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_config_dirty_node));
        list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
            offsetof(objset_t, os_evicting_node));
        list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_state_dirty_node));

        txg_list_create(&spa->spa_vdev_txg_list, spa,
            offsetof(struct vdev, vdev_txg_node));

        /* Error-log trees, all ordered by spa_error_entry_compare(). */
        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_healed,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));

        spa_activate_os(spa);

        spa_keystore_init(&spa->spa_keystore);

        /*
         * This taskq is used to perform zvol-minor-related tasks
         * asynchronously. This has several advantages, including easy
         * resolution of various deadlocks.
         *
         * The taskq must be single threaded to ensure tasks are always
         * processed in the order in which they were dispatched.
         *
         * A taskq per pool allows one to keep the pools independent.
         * This way if one pool is suspended, it will not impact another.
         *
         * The preferred location to dispatch a zvol minor task is a sync
         * task. In this context, there is easy access to the spa_t and minimal
         * error handling is required because the sync task must succeed.
         */
        spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
            1, INT_MAX, 0);

        /*
         * Taskq dedicated to prefetcher threads: this is used to prevent the
         * pool traverse code from monopolizing the global (and limited)
         * system_taskq by inappropriately scheduling long running tasks on it.
         */
        spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
            defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);

        /*
         * The taskq to upgrade datasets in this pool. Currently used by
         * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
         */
        spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
            defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
}
 1367 
/*
 * Opposite of spa_activate().
 *
 * Tears down the in-core state that spa_activate() created and returns
 * the pool to POOL_STATE_UNINITIALIZED.  The caller must already have
 * stopped syncing and released the DSL pool, the root vdev and the async
 * zio root (see the assertions at the top).  If a covering process was
 * created for the pool, it is told to exit and waited for.
 */
static void
spa_deactivate(spa_t *spa)
{
        ASSERT(spa->spa_sync_on == B_FALSE);
        ASSERT(spa->spa_dsl_pool == NULL);
        ASSERT(spa->spa_root_vdev == NULL);
        ASSERT(spa->spa_async_zio_root == NULL);
        ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

        spa_evicting_os_wait(spa);

        /* Destroy the auxiliary taskqs that spa_activate() created. */
        if (spa->spa_zvol_taskq) {
                taskq_destroy(spa->spa_zvol_taskq);
                spa->spa_zvol_taskq = NULL;
        }

        if (spa->spa_prefetch_taskq) {
                taskq_destroy(spa->spa_prefetch_taskq);
                spa->spa_prefetch_taskq = NULL;
        }

        if (spa->spa_upgrade_taskq) {
                taskq_destroy(spa->spa_upgrade_taskq);
                spa->spa_upgrade_taskq = NULL;
        }

        txg_list_destroy(&spa->spa_vdev_txg_list);

        list_destroy(&spa->spa_config_dirty_list);
        list_destroy(&spa->spa_evicting_os_list);
        list_destroy(&spa->spa_state_dirty_list);

        /* Cancel the deadman task identified by spa_deadman_tqid. */
        taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);

        for (int t = 0; t < ZIO_TYPES; t++) {
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
                        spa_taskqs_fini(spa, t, q);
                }
        }

        /* Wait for each per-txg root zio; each must complete with 0. */
        for (size_t i = 0; i < TXG_SIZE; i++) {
                ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
                VERIFY0(zio_wait(spa->spa_txg_zio[i]));
                spa->spa_txg_zio[i] = NULL;
        }

        metaslab_class_destroy(spa->spa_normal_class);
        spa->spa_normal_class = NULL;

        metaslab_class_destroy(spa->spa_log_class);
        spa->spa_log_class = NULL;

        metaslab_class_destroy(spa->spa_embedded_log_class);
        spa->spa_embedded_log_class = NULL;

        metaslab_class_destroy(spa->spa_special_class);
        spa->spa_special_class = NULL;

        metaslab_class_destroy(spa->spa_dedup_class);
        spa->spa_dedup_class = NULL;

        /*
         * If this was part of an import or the open otherwise failed, we may
         * still have errors left in the queues.  Empty them just in case.
         */
        spa_errlog_drain(spa);
        avl_destroy(&spa->spa_errlist_scrub);
        avl_destroy(&spa->spa_errlist_last);
        avl_destroy(&spa->spa_errlist_healed);

        spa_keystore_fini(&spa->spa_keystore);

        spa->spa_state = POOL_STATE_UNINITIALIZED;

        /*
         * If a covering process exists, request SPA_PROC_DEACTIVATE and
         * wait on spa_proc_cv until the state changes (spa_thread() sets
         * SPA_PROC_GONE), then reset the state to SPA_PROC_NONE.
         */
        mutex_enter(&spa->spa_proc_lock);
        if (spa->spa_proc_state != SPA_PROC_NONE) {
                ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
                spa->spa_proc_state = SPA_PROC_DEACTIVATE;
                cv_broadcast(&spa->spa_proc_cv);
                while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
                        ASSERT(spa->spa_proc != &p0);
                        cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
                }
                ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
                spa->spa_proc_state = SPA_PROC_NONE;
        }
        ASSERT(spa->spa_proc == &p0);
        mutex_exit(&spa->spa_proc_lock);

        /*
         * We want to make sure spa_thread() has actually exited the ZFS
         * module, so that the module can't be unloaded out from underneath
         * it.
         */
        if (spa->spa_did != 0) {
                thread_join(spa->spa_did);
                spa->spa_did = 0;
        }

        spa_deactivate_os(spa);

}
 1473 
 1474 /*
 1475  * Verify a pool configuration, and construct the vdev tree appropriately.  This
 1476  * will create all the necessary vdevs in the appropriate layout, with each vdev
 1477  * in the CLOSED state.  This will prep the pool before open/creation/import.
 1478  * All vdev validation is done by the vdev_alloc() routine.
 1479  */
 1480 int
 1481 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
 1482     uint_t id, int atype)
 1483 {
 1484         nvlist_t **child;
 1485         uint_t children;
 1486         int error;
 1487 
 1488         if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
 1489                 return (error);
 1490 
 1491         if ((*vdp)->vdev_ops->vdev_op_leaf)
 1492                 return (0);
 1493 
 1494         error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 1495             &child, &children);
 1496 
 1497         if (error == ENOENT)
 1498                 return (0);
 1499 
 1500         if (error) {
 1501                 vdev_free(*vdp);
 1502                 *vdp = NULL;
 1503                 return (SET_ERROR(EINVAL));
 1504         }
 1505 
 1506         for (int c = 0; c < children; c++) {
 1507                 vdev_t *vd;
 1508                 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
 1509                     atype)) != 0) {
 1510                         vdev_free(*vdp);
 1511                         *vdp = NULL;
 1512                         return (error);
 1513                 }
 1514         }
 1515 
 1516         ASSERT(*vdp != NULL);
 1517 
 1518         return (0);
 1519 }
 1520 
 1521 static boolean_t
 1522 spa_should_flush_logs_on_unload(spa_t *spa)
 1523 {
 1524         if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 1525                 return (B_FALSE);
 1526 
 1527         if (!spa_writeable(spa))
 1528                 return (B_FALSE);
 1529 
 1530         if (!spa->spa_sync_on)
 1531                 return (B_FALSE);
 1532 
 1533         if (spa_state(spa) != POOL_STATE_EXPORTED)
 1534                 return (B_FALSE);
 1535 
 1536         if (zfs_keep_log_spacemaps_at_export)
 1537                 return (B_FALSE);
 1538 
 1539         return (B_TRUE);
 1540 }
 1541 
 1542 /*
 1543  * Opens a transaction that will set the flag that will instruct
 1544  * spa_sync to attempt to flush all the metaslabs for that txg.
 1545  */
 1546 static void
 1547 spa_unload_log_sm_flush_all(spa_t *spa)
 1548 {
 1549         dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 1550         VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 1551 
 1552         ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
 1553         spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
 1554 
 1555         dmu_tx_commit(tx);
 1556         txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
 1557 }
 1558 
 1559 static void
 1560 spa_unload_log_sm_metadata(spa_t *spa)
 1561 {
 1562         void *cookie = NULL;
 1563         spa_log_sm_t *sls;
 1564         while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
 1565             &cookie)) != NULL) {
 1566                 VERIFY0(sls->sls_mscount);
 1567                 kmem_free(sls, sizeof (spa_log_sm_t));
 1568         }
 1569 
 1570         for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 1571             e != NULL; e = list_head(&spa->spa_log_summary)) {
 1572                 VERIFY0(e->lse_mscount);
 1573                 list_remove(&spa->spa_log_summary, e);
 1574                 kmem_free(e, sizeof (log_summary_entry_t));
 1575         }
 1576 
 1577         spa->spa_unflushed_stats.sus_nblocks = 0;
 1578         spa->spa_unflushed_stats.sus_memused = 0;
 1579         spa->spa_unflushed_stats.sus_blocklimit = 0;
 1580 }
 1581 
 1582 static void
 1583 spa_destroy_aux_threads(spa_t *spa)
 1584 {
 1585         if (spa->spa_condense_zthr != NULL) {
 1586                 zthr_destroy(spa->spa_condense_zthr);
 1587                 spa->spa_condense_zthr = NULL;
 1588         }
 1589         if (spa->spa_checkpoint_discard_zthr != NULL) {
 1590                 zthr_destroy(spa->spa_checkpoint_discard_zthr);
 1591                 spa->spa_checkpoint_discard_zthr = NULL;
 1592         }
 1593         if (spa->spa_livelist_delete_zthr != NULL) {
 1594                 zthr_destroy(spa->spa_livelist_delete_zthr);
 1595                 spa->spa_livelist_delete_zthr = NULL;
 1596         }
 1597         if (spa->spa_livelist_condense_zthr != NULL) {
 1598                 zthr_destroy(spa->spa_livelist_condense_zthr);
 1599                 spa->spa_livelist_condense_zthr = NULL;
 1600         }
 1601 }
 1602 
 1603 /*
 1604  * Opposite of spa_load().
 1605  */
 1606 static void
 1607 spa_unload(spa_t *spa)
 1608 {
 1609         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 1610         ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
 1611 
 1612         spa_import_progress_remove(spa_guid(spa));
 1613         spa_load_note(spa, "UNLOADING");
 1614 
 1615         spa_wake_waiters(spa);
 1616 
 1617         /*
 1618          * If we have set the spa_final_txg, we have already performed the
 1619          * tasks below in spa_export_common(). We should not redo it here since
 1620          * we delay the final TXGs beyond what spa_final_txg is set at.
 1621          */
 1622         if (spa->spa_final_txg == UINT64_MAX) {
 1623                 /*
 1624                  * If the log space map feature is enabled and the pool is
 1625                  * getting exported (but not destroyed), we want to spend some
 1626                  * time flushing as many metaslabs as we can in an attempt to
 1627                  * destroy log space maps and save import time.
 1628                  */
 1629                 if (spa_should_flush_logs_on_unload(spa))
 1630                         spa_unload_log_sm_flush_all(spa);
 1631 
 1632                 /*
 1633                  * Stop async tasks.
 1634                  */
 1635                 spa_async_suspend(spa);
 1636 
 1637                 if (spa->spa_root_vdev) {
 1638                         vdev_t *root_vdev = spa->spa_root_vdev;
 1639                         vdev_initialize_stop_all(root_vdev,
 1640                             VDEV_INITIALIZE_ACTIVE);
 1641                         vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
 1642                         vdev_autotrim_stop_all(spa);
 1643                         vdev_rebuild_stop_all(spa);
 1644                 }
 1645         }
 1646 
 1647         /*
 1648          * Stop syncing.
 1649          */
 1650         if (spa->spa_sync_on) {
 1651                 txg_sync_stop(spa->spa_dsl_pool);
 1652                 spa->spa_sync_on = B_FALSE;
 1653         }
 1654 
 1655         /*
 1656          * This ensures that there is no async metaslab prefetching
 1657          * while we attempt to unload the spa.
 1658          */
 1659         if (spa->spa_root_vdev != NULL) {
 1660                 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
 1661                         vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
 1662                         if (vc->vdev_mg != NULL)
 1663                                 taskq_wait(vc->vdev_mg->mg_taskq);
 1664                 }
 1665         }
 1666 
 1667         if (spa->spa_mmp.mmp_thread)
 1668                 mmp_thread_stop(spa);
 1669 
 1670         /*
 1671          * Wait for any outstanding async I/O to complete.
 1672          */
 1673         if (spa->spa_async_zio_root != NULL) {
 1674                 for (int i = 0; i < max_ncpus; i++)
 1675                         (void) zio_wait(spa->spa_async_zio_root[i]);
 1676                 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
 1677                 spa->spa_async_zio_root = NULL;
 1678         }
 1679 
 1680         if (spa->spa_vdev_removal != NULL) {
 1681                 spa_vdev_removal_destroy(spa->spa_vdev_removal);
 1682                 spa->spa_vdev_removal = NULL;
 1683         }
 1684 
 1685         spa_destroy_aux_threads(spa);
 1686 
 1687         spa_condense_fini(spa);
 1688 
 1689         bpobj_close(&spa->spa_deferred_bpobj);
 1690 
 1691         spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 1692 
 1693         /*
 1694          * Close all vdevs.
 1695          */
 1696         if (spa->spa_root_vdev)
 1697                 vdev_free(spa->spa_root_vdev);
 1698         ASSERT(spa->spa_root_vdev == NULL);
 1699 
 1700         /*
 1701          * Close the dsl pool.
 1702          */
 1703         if (spa->spa_dsl_pool) {
 1704                 dsl_pool_close(spa->spa_dsl_pool);
 1705                 spa->spa_dsl_pool = NULL;
 1706                 spa->spa_meta_objset = NULL;
 1707         }
 1708 
 1709         ddt_unload(spa);
 1710         spa_unload_log_sm_metadata(spa);
 1711 
 1712         /*
 1713          * Drop and purge level 2 cache
 1714          */
 1715         spa_l2cache_drop(spa);
 1716 
 1717         for (int i = 0; i < spa->spa_spares.sav_count; i++)
 1718                 vdev_free(spa->spa_spares.sav_vdevs[i]);
 1719         if (spa->spa_spares.sav_vdevs) {
 1720                 kmem_free(spa->spa_spares.sav_vdevs,
 1721                     spa->spa_spares.sav_count * sizeof (void *));
 1722                 spa->spa_spares.sav_vdevs = NULL;
 1723         }
 1724         if (spa->spa_spares.sav_config) {
 1725                 nvlist_free(spa->spa_spares.sav_config);
 1726                 spa->spa_spares.sav_config = NULL;
 1727         }
 1728         spa->spa_spares.sav_count = 0;
 1729 
 1730         for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
 1731                 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
 1732                 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
 1733         }
 1734         if (spa->spa_l2cache.sav_vdevs) {
 1735                 kmem_free(spa->spa_l2cache.sav_vdevs,
 1736                     spa->spa_l2cache.sav_count * sizeof (void *));
 1737                 spa->spa_l2cache.sav_vdevs = NULL;
 1738         }
 1739         if (spa->spa_l2cache.sav_config) {
 1740                 nvlist_free(spa->spa_l2cache.sav_config);
 1741                 spa->spa_l2cache.sav_config = NULL;
 1742         }
 1743         spa->spa_l2cache.sav_count = 0;
 1744 
 1745         spa->spa_async_suspended = 0;
 1746 
 1747         spa->spa_indirect_vdevs_loaded = B_FALSE;
 1748 
 1749         if (spa->spa_comment != NULL) {
 1750                 spa_strfree(spa->spa_comment);
 1751                 spa->spa_comment = NULL;
 1752         }
 1753         if (spa->spa_compatibility != NULL) {
 1754                 spa_strfree(spa->spa_compatibility);
 1755                 spa->spa_compatibility = NULL;
 1756         }
 1757 
 1758         spa_config_exit(spa, SCL_ALL, spa);
 1759 }
 1760 
 1761 /*
 1762  * Load (or re-load) the current list of vdevs describing the active spares for
 1763  * this pool.  When this is called, we have some form of basic information in
 1764  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 1765  * then re-generate a more complete list including status information.
 1766  */
 1767 void
 1768 spa_load_spares(spa_t *spa)
 1769 {
 1770         nvlist_t **spares;
 1771         uint_t nspares;
 1772         int i;
 1773         vdev_t *vd, *tvd;
 1774 
 1775 #ifndef _KERNEL
 1776         /*
 1777          * zdb opens both the current state of the pool and the
 1778          * checkpointed state (if present), with a different spa_t.
 1779          *
 1780          * As spare vdevs are shared among open pools, we skip loading
 1781          * them when we load the checkpointed state of the pool.
 1782          */
 1783         if (!spa_writeable(spa))
 1784                 return;
 1785 #endif
 1786 
 1787         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 1788 
 1789         /*
 1790          * First, close and free any existing spare vdevs.
 1791          */
 1792         for (i = 0; i < spa->spa_spares.sav_count; i++) {
 1793                 vd = spa->spa_spares.sav_vdevs[i];
 1794 
 1795                 /* Undo the call to spa_activate() below */
 1796                 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 1797                     B_FALSE)) != NULL && tvd->vdev_isspare)
 1798                         spa_spare_remove(tvd);
 1799                 vdev_close(vd);
 1800                 vdev_free(vd);
 1801         }
 1802 
 1803         if (spa->spa_spares.sav_vdevs)
 1804                 kmem_free(spa->spa_spares.sav_vdevs,
 1805                     spa->spa_spares.sav_count * sizeof (void *));
 1806 
 1807         if (spa->spa_spares.sav_config == NULL)
 1808                 nspares = 0;
 1809         else
 1810                 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 1811                     ZPOOL_CONFIG_SPARES, &spares, &nspares));
 1812 
 1813         spa->spa_spares.sav_count = (int)nspares;
 1814         spa->spa_spares.sav_vdevs = NULL;
 1815 
 1816         if (nspares == 0)
 1817                 return;
 1818 
 1819         /*
 1820          * Construct the array of vdevs, opening them to get status in the
 1821          * process.   For each spare, there is potentially two different vdev_t
 1822          * structures associated with it: one in the list of spares (used only
 1823          * for basic validation purposes) and one in the active vdev
 1824          * configuration (if it's spared in).  During this phase we open and
 1825          * validate each vdev on the spare list.  If the vdev also exists in the
 1826          * active configuration, then we also mark this vdev as an active spare.
 1827          */
 1828         spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
 1829             KM_SLEEP);
 1830         for (i = 0; i < spa->spa_spares.sav_count; i++) {
 1831                 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
 1832                     VDEV_ALLOC_SPARE) == 0);
 1833                 ASSERT(vd != NULL);
 1834 
 1835                 spa->spa_spares.sav_vdevs[i] = vd;
 1836 
 1837                 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 1838                     B_FALSE)) != NULL) {
 1839                         if (!tvd->vdev_isspare)
 1840                                 spa_spare_add(tvd);
 1841 
 1842                         /*
 1843                          * We only mark the spare active if we were successfully
 1844                          * able to load the vdev.  Otherwise, importing a pool
 1845                          * with a bad active spare would result in strange
 1846                          * behavior, because multiple pool would think the spare
 1847                          * is actively in use.
 1848                          *
 1849                          * There is a vulnerability here to an equally bizarre
 1850                          * circumstance, where a dead active spare is later
 1851                          * brought back to life (onlined or otherwise).  Given
 1852                          * the rarity of this scenario, and the extra complexity
 1853                          * it adds, we ignore the possibility.
 1854                          */
 1855                         if (!vdev_is_dead(tvd))
 1856                                 spa_spare_activate(tvd);
 1857                 }
 1858 
 1859                 vd->vdev_top = vd;
 1860                 vd->vdev_aux = &spa->spa_spares;
 1861 
 1862                 if (vdev_open(vd) != 0)
 1863                         continue;
 1864 
 1865                 if (vdev_validate_aux(vd) == 0)
 1866                         spa_spare_add(vd);
 1867         }
 1868 
 1869         /*
 1870          * Recompute the stashed list of spares, with status information
 1871          * this time.
 1872          */
 1873         fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES);
 1874 
 1875         spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
 1876             KM_SLEEP);
 1877         for (i = 0; i < spa->spa_spares.sav_count; i++)
 1878                 spares[i] = vdev_config_generate(spa,
 1879                     spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
 1880         fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 1881             ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 1882             spa->spa_spares.sav_count);
 1883         for (i = 0; i < spa->spa_spares.sav_count; i++)
 1884                 nvlist_free(spares[i]);
 1885         kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
 1886 }
 1887 
 1888 /*
 1889  * Load (or re-load) the current list of vdevs describing the active l2cache for
 1890  * this pool.  When this is called, we have some form of basic information in
 1891  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 1892  * then re-generate a more complete list including status information.
 1893  * Devices which are already active have their details maintained, and are
 1894  * not re-opened.
 1895  */
 1896 void
 1897 spa_load_l2cache(spa_t *spa)
 1898 {
 1899         nvlist_t **l2cache = NULL;
 1900         uint_t nl2cache;
 1901         int i, j, oldnvdevs;
 1902         uint64_t guid;
 1903         vdev_t *vd, **oldvdevs, **newvdevs;
 1904         spa_aux_vdev_t *sav = &spa->spa_l2cache;
 1905 
 1906 #ifndef _KERNEL
 1907         /*
 1908          * zdb opens both the current state of the pool and the
 1909          * checkpointed state (if present), with a different spa_t.
 1910          *
 1911          * As L2 caches are part of the ARC which is shared among open
 1912          * pools, we skip loading them when we load the checkpointed
 1913          * state of the pool.
 1914          */
 1915         if (!spa_writeable(spa))
 1916                 return;
 1917 #endif
 1918 
 1919         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 1920 
 1921         oldvdevs = sav->sav_vdevs;
 1922         oldnvdevs = sav->sav_count;
 1923         sav->sav_vdevs = NULL;
 1924         sav->sav_count = 0;
 1925 
 1926         if (sav->sav_config == NULL) {
 1927                 nl2cache = 0;
 1928                 newvdevs = NULL;
 1929                 goto out;
 1930         }
 1931 
 1932         VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
 1933             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
 1934         newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
 1935 
 1936         /*
 1937          * Process new nvlist of vdevs.
 1938          */
 1939         for (i = 0; i < nl2cache; i++) {
 1940                 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID);
 1941 
 1942                 newvdevs[i] = NULL;
 1943                 for (j = 0; j < oldnvdevs; j++) {
 1944                         vd = oldvdevs[j];
 1945                         if (vd != NULL && guid == vd->vdev_guid) {
 1946                                 /*
 1947                                  * Retain previous vdev for add/remove ops.
 1948                                  */
 1949                                 newvdevs[i] = vd;
 1950                                 oldvdevs[j] = NULL;
 1951                                 break;
 1952                         }
 1953                 }
 1954 
 1955                 if (newvdevs[i] == NULL) {
 1956                         /*
 1957                          * Create new vdev
 1958                          */
 1959                         VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
 1960                             VDEV_ALLOC_L2CACHE) == 0);
 1961                         ASSERT(vd != NULL);
 1962                         newvdevs[i] = vd;
 1963 
 1964                         /*
 1965                          * Commit this vdev as an l2cache device,
 1966                          * even if it fails to open.
 1967                          */
 1968                         spa_l2cache_add(vd);
 1969 
 1970                         vd->vdev_top = vd;
 1971                         vd->vdev_aux = sav;
 1972 
 1973                         spa_l2cache_activate(vd);
 1974 
 1975                         if (vdev_open(vd) != 0)
 1976                                 continue;
 1977 
 1978                         (void) vdev_validate_aux(vd);
 1979 
 1980                         if (!vdev_is_dead(vd))
 1981                                 l2arc_add_vdev(spa, vd);
 1982 
 1983                         /*
 1984                          * Upon cache device addition to a pool or pool
 1985                          * creation with a cache device or if the header
 1986                          * of the device is invalid we issue an async
 1987                          * TRIM command for the whole device which will
 1988                          * execute if l2arc_trim_ahead > 0.
 1989                          */
 1990                         spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
 1991                 }
 1992         }
 1993 
 1994         sav->sav_vdevs = newvdevs;
 1995         sav->sav_count = (int)nl2cache;
 1996 
 1997         /*
 1998          * Recompute the stashed list of l2cache devices, with status
 1999          * information this time.
 2000          */
 2001         fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE);
 2002 
 2003         if (sav->sav_count > 0)
 2004                 l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
 2005                     KM_SLEEP);
 2006         for (i = 0; i < sav->sav_count; i++)
 2007                 l2cache[i] = vdev_config_generate(spa,
 2008                     sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
 2009         fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
 2010             (const nvlist_t * const *)l2cache, sav->sav_count);
 2011 
 2012 out:
 2013         /*
 2014          * Purge vdevs that were dropped
 2015          */
 2016         for (i = 0; i < oldnvdevs; i++) {
 2017                 uint64_t pool;
 2018 
 2019                 vd = oldvdevs[i];
 2020                 if (vd != NULL) {
 2021                         ASSERT(vd->vdev_isl2cache);
 2022 
 2023                         if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 2024                             pool != 0ULL && l2arc_vdev_present(vd))
 2025                                 l2arc_remove_vdev(vd);
 2026                         vdev_clear_stats(vd);
 2027                         vdev_free(vd);
 2028                 }
 2029         }
 2030 
 2031         if (oldvdevs)
 2032                 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
 2033 
 2034         for (i = 0; i < sav->sav_count; i++)
 2035                 nvlist_free(l2cache[i]);
 2036         if (sav->sav_count)
 2037                 kmem_free(l2cache, sav->sav_count * sizeof (void *));
 2038 }
 2039 
 2040 static int
 2041 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 2042 {
 2043         dmu_buf_t *db;
 2044         char *packed = NULL;
 2045         size_t nvsize = 0;
 2046         int error;
 2047         *value = NULL;
 2048 
 2049         error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
 2050         if (error)
 2051                 return (error);
 2052 
 2053         nvsize = *(uint64_t *)db->db_data;
 2054         dmu_buf_rele(db, FTAG);
 2055 
 2056         packed = vmem_alloc(nvsize, KM_SLEEP);
 2057         error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
 2058             DMU_READ_PREFETCH);
 2059         if (error == 0)
 2060                 error = nvlist_unpack(packed, nvsize, value, 0);
 2061         vmem_free(packed, nvsize);
 2062 
 2063         return (error);
 2064 }
 2065 
 2066 /*
 2067  * Concrete top-level vdevs that are not missing and are not logs. At every
 2068  * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
 2069  */
 2070 static uint64_t
 2071 spa_healthy_core_tvds(spa_t *spa)
 2072 {
 2073         vdev_t *rvd = spa->spa_root_vdev;
 2074         uint64_t tvds = 0;
 2075 
 2076         for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 2077                 vdev_t *vd = rvd->vdev_child[i];
 2078                 if (vd->vdev_islog)
 2079                         continue;
 2080                 if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
 2081                         tvds++;
 2082         }
 2083 
 2084         return (tvds);
 2085 }
 2086 
 2087 /*
 2088  * Checks to see if the given vdev could not be opened, in which case we post a
 2089  * sysevent to notify the autoreplace code that the device has been removed.
 2090  */
 2091 static void
 2092 spa_check_removed(vdev_t *vd)
 2093 {
 2094         for (uint64_t c = 0; c < vd->vdev_children; c++)
 2095                 spa_check_removed(vd->vdev_child[c]);
 2096 
 2097         if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
 2098             vdev_is_concrete(vd)) {
 2099                 zfs_post_autoreplace(vd->vdev_spa, vd);
 2100                 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
 2101         }
 2102 }
 2103 
 2104 static int
 2105 spa_check_for_missing_logs(spa_t *spa)
 2106 {
 2107         vdev_t *rvd = spa->spa_root_vdev;
 2108 
 2109         /*
 2110          * If we're doing a normal import, then build up any additional
 2111          * diagnostic information about missing log devices.
 2112          * We'll pass this up to the user for further processing.
 2113          */
 2114         if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
 2115                 nvlist_t **child, *nv;
 2116                 uint64_t idx = 0;
 2117 
 2118                 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
 2119                     KM_SLEEP);
 2120                 nv = fnvlist_alloc();
 2121 
 2122                 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 2123                         vdev_t *tvd = rvd->vdev_child[c];
 2124 
 2125                         /*
 2126                          * We consider a device as missing only if it failed
 2127                          * to open (i.e. offline or faulted is not considered
 2128                          * as missing).
 2129                          */
 2130                         if (tvd->vdev_islog &&
 2131                             tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 2132                                 child[idx++] = vdev_config_generate(spa, tvd,
 2133                                     B_FALSE, VDEV_CONFIG_MISSING);
 2134                         }
 2135                 }
 2136 
 2137                 if (idx > 0) {
 2138                         fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 2139                             (const nvlist_t * const *)child, idx);
 2140                         fnvlist_add_nvlist(spa->spa_load_info,
 2141                             ZPOOL_CONFIG_MISSING_DEVICES, nv);
 2142 
 2143                         for (uint64_t i = 0; i < idx; i++)
 2144                                 nvlist_free(child[i]);
 2145                 }
 2146                 nvlist_free(nv);
 2147                 kmem_free(child, rvd->vdev_children * sizeof (char **));
 2148 
 2149                 if (idx > 0) {
 2150                         spa_load_failed(spa, "some log devices are missing");
 2151                         vdev_dbgmsg_print_tree(rvd, 2);
 2152                         return (SET_ERROR(ENXIO));
 2153                 }
 2154         } else {
 2155                 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 2156                         vdev_t *tvd = rvd->vdev_child[c];
 2157 
 2158                         if (tvd->vdev_islog &&
 2159                             tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 2160                                 spa_set_log_state(spa, SPA_LOG_CLEAR);
 2161                                 spa_load_note(spa, "some log devices are "
 2162                                     "missing, ZIL is dropped.");
 2163                                 vdev_dbgmsg_print_tree(rvd, 2);
 2164                                 break;
 2165                         }
 2166                 }
 2167         }
 2168 
 2169         return (0);
 2170 }
 2171 
 2172 /*
 2173  * Check for missing log devices
 2174  */
 2175 static boolean_t
 2176 spa_check_logs(spa_t *spa)
 2177 {
 2178         boolean_t rv = B_FALSE;
 2179         dsl_pool_t *dp = spa_get_dsl(spa);
 2180 
 2181         switch (spa->spa_log_state) {
 2182         default:
 2183                 break;
 2184         case SPA_LOG_MISSING:
 2185                 /* need to recheck in case slog has been restored */
 2186         case SPA_LOG_UNKNOWN:
 2187                 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 2188                     zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
 2189                 if (rv)
 2190                         spa_set_log_state(spa, SPA_LOG_MISSING);
 2191                 break;
 2192         }
 2193         return (rv);
 2194 }
 2195 
 2196 /*
 2197  * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
 2198  */
 2199 static boolean_t
 2200 spa_passivate_log(spa_t *spa)
 2201 {
 2202         vdev_t *rvd = spa->spa_root_vdev;
 2203         boolean_t slog_found = B_FALSE;
 2204 
 2205         ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 2206 
 2207         for (int c = 0; c < rvd->vdev_children; c++) {
 2208                 vdev_t *tvd = rvd->vdev_child[c];
 2209 
 2210                 if (tvd->vdev_islog) {
 2211                         ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 2212                         metaslab_group_passivate(tvd->vdev_mg);
 2213                         slog_found = B_TRUE;
 2214                 }
 2215         }
 2216 
 2217         return (slog_found);
 2218 }
 2219 
 2220 /*
 2221  * Activate any log vdevs (note, does not apply to embedded log metaslabs).
 2222  */
 2223 static void
 2224 spa_activate_log(spa_t *spa)
 2225 {
 2226         vdev_t *rvd = spa->spa_root_vdev;
 2227 
 2228         ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 2229 
 2230         for (int c = 0; c < rvd->vdev_children; c++) {
 2231                 vdev_t *tvd = rvd->vdev_child[c];
 2232 
 2233                 if (tvd->vdev_islog) {
 2234                         ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 2235                         metaslab_group_activate(tvd->vdev_mg);
 2236                 }
 2237         }
 2238 }
 2239 
 2240 int
 2241 spa_reset_logs(spa_t *spa)
 2242 {
 2243         int error;
 2244 
 2245         error = dmu_objset_find(spa_name(spa), zil_reset,
 2246             NULL, DS_FIND_CHILDREN);
 2247         if (error == 0) {
 2248                 /*
 2249                  * We successfully offlined the log device, sync out the
 2250                  * current txg so that the "stubby" block can be removed
 2251                  * by zil_sync().
 2252                  */
 2253                 txg_wait_synced(spa->spa_dsl_pool, 0);
 2254         }
 2255         return (error);
 2256 }
 2257 
 2258 static void
 2259 spa_aux_check_removed(spa_aux_vdev_t *sav)
 2260 {
 2261         for (int i = 0; i < sav->sav_count; i++)
 2262                 spa_check_removed(sav->sav_vdevs[i]);
 2263 }
 2264 
 2265 void
 2266 spa_claim_notify(zio_t *zio)
 2267 {
 2268         spa_t *spa = zio->io_spa;
 2269 
 2270         if (zio->io_error)
 2271                 return;
 2272 
 2273         mutex_enter(&spa->spa_props_lock);      /* any mutex will do */
 2274         if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
 2275                 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
 2276         mutex_exit(&spa->spa_props_lock);
 2277 }
 2278 
/*
 * Error tallies shared by the spa_load_verify callbacks; reached through
 * the io_private pointer of the zios they handle.
 */
typedef struct spa_load_error {
	/*
	 * NOTE(review): not referenced in the code visible here; presumably
	 * selects whether data blocks are verified -- confirm at the users.
	 */
	boolean_t	sle_verify_data;
	/*
	 * Failed blocks counted as metadata: level != 0 or a metadata object
	 * type (intent log blocks excluded), or a damaged block pointer.
	 */
	uint64_t	sle_meta_count;
	/* Every other failed block, including intent log blocks. */
	uint64_t	sle_data_count;
} spa_load_error_t;
 2284 
 2285 static void
 2286 spa_load_verify_done(zio_t *zio)
 2287 {
 2288         blkptr_t *bp = zio->io_bp;
 2289         spa_load_error_t *sle = zio->io_private;
 2290         dmu_object_type_t type = BP_GET_TYPE(bp);
 2291         int error = zio->io_error;
 2292         spa_t *spa = zio->io_spa;
 2293 
 2294         abd_free(zio->io_abd);
 2295         if (error) {
 2296                 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
 2297                     type != DMU_OT_INTENT_LOG)
 2298                         atomic_inc_64(&sle->sle_meta_count);
 2299                 else
 2300                         atomic_inc_64(&sle->sle_data_count);
 2301         }
 2302 
 2303         mutex_enter(&spa->spa_scrub_lock);
 2304         spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
 2305         cv_broadcast(&spa->spa_scrub_io_cv);
 2306         mutex_exit(&spa->spa_scrub_lock);
 2307 }
 2308 
/*
 * Maximum number of inflight bytes is the log2 fraction of the arc size.
 * By default, we set it to 1/16th of the arc.
 */
static uint_t spa_load_verify_shift = 4;
/*
 * When zero, spa_load_verify() does not traverse the pool, and
 * spa_load_verify_cb() returns immediately without checking the block.
 */
static int spa_load_verify_metadata = B_TRUE;
/*
 * When zero, spa_load_verify_cb() skips every block that is not metadata.
 */
static int spa_load_verify_data = B_TRUE;
 2316 
 2317 static int
 2318 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 2319     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 2320 {
 2321         zio_t *rio = arg;
 2322         spa_load_error_t *sle = rio->io_private;
 2323 
 2324         (void) zilog, (void) dnp;
 2325 
 2326         /*
 2327          * Note: normally this routine will not be called if
 2328          * spa_load_verify_metadata is not set.  However, it may be useful
 2329          * to manually set the flag after the traversal has begun.
 2330          */
 2331         if (!spa_load_verify_metadata)
 2332                 return (0);
 2333 
 2334         /*
 2335          * Sanity check the block pointer in order to detect obvious damage
 2336          * before using the contents in subsequent checks or in zio_read().
 2337          * When damaged consider it to be a metadata error since we cannot
 2338          * trust the BP_GET_TYPE and BP_GET_LEVEL values.
 2339          */
 2340         if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) {
 2341                 atomic_inc_64(&sle->sle_meta_count);
 2342                 return (0);
 2343         }
 2344 
 2345         if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
 2346             BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
 2347                 return (0);
 2348 
 2349         if (!BP_IS_METADATA(bp) &&
 2350             (!spa_load_verify_data || !sle->sle_verify_data))
 2351                 return (0);
 2352 
 2353         uint64_t maxinflight_bytes =
 2354             arc_target_bytes() >> spa_load_verify_shift;
 2355         size_t size = BP_GET_PSIZE(bp);
 2356 
 2357         mutex_enter(&spa->spa_scrub_lock);
 2358         while (spa->spa_load_verify_bytes >= maxinflight_bytes)
 2359                 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 2360         spa->spa_load_verify_bytes += size;
 2361         mutex_exit(&spa->spa_scrub_lock);
 2362 
 2363         zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
 2364             spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
 2365             ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
 2366             ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 2367         return (0);
 2368 }
 2369 
 2370 static int
 2371 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 2372 {
 2373         (void) dp, (void) arg;
 2374 
 2375         if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
 2376                 return (SET_ERROR(ENAMETOOLONG));
 2377 
 2378         return (0);
 2379 }
 2380 
 2381 static int
 2382 spa_load_verify(spa_t *spa)
 2383 {
 2384         zio_t *rio;
 2385         spa_load_error_t sle = { 0 };
 2386         zpool_load_policy_t policy;
 2387         boolean_t verify_ok = B_FALSE;
 2388         int error = 0;
 2389 
 2390         zpool_get_load_policy(spa->spa_config, &policy);
 2391 
 2392         if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
 2393             policy.zlp_maxmeta == UINT64_MAX)
 2394                 return (0);
 2395 
 2396         dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 2397         error = dmu_objset_find_dp(spa->spa_dsl_pool,
 2398             spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
 2399             DS_FIND_CHILDREN);
 2400         dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 2401         if (error != 0)
 2402                 return (error);
 2403 
 2404         /*
 2405          * Verify data only if we are rewinding or error limit was set.
 2406          * Otherwise nothing except dbgmsg care about it to waste time.
 2407          */
 2408         sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
 2409             (policy.zlp_maxdata < UINT64_MAX);
 2410 
 2411         rio = zio_root(spa, NULL, &sle,
 2412             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 2413 
 2414         if (spa_load_verify_metadata) {
 2415                 if (spa->spa_extreme_rewind) {
 2416                         spa_load_note(spa, "performing a complete scan of the "
 2417                             "pool since extreme rewind is on. This may take "
 2418                             "a very long time.\n  (spa_load_verify_data=%u, "
 2419                             "spa_load_verify_metadata=%u)",
 2420                             spa_load_verify_data, spa_load_verify_metadata);
 2421                 }
 2422 
 2423                 error = traverse_pool(spa, spa->spa_verify_min_txg,
 2424                     TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 2425                     TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
 2426         }
 2427 
 2428         (void) zio_wait(rio);
 2429         ASSERT0(spa->spa_load_verify_bytes);
 2430 
 2431         spa->spa_load_meta_errors = sle.sle_meta_count;
 2432         spa->spa_load_data_errors = sle.sle_data_count;
 2433 
 2434         if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
 2435                 spa_load_note(spa, "spa_load_verify found %llu metadata errors "
 2436                     "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
 2437                     (u_longlong_t)sle.sle_data_count);
 2438         }
 2439 
 2440         if (spa_load_verify_dryrun ||
 2441             (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
 2442             sle.sle_data_count <= policy.zlp_maxdata)) {
 2443                 int64_t loss = 0;
 2444 
 2445                 verify_ok = B_TRUE;
 2446                 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
 2447                 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
 2448 
 2449                 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
 2450                 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME,
 2451                     spa->spa_load_txg_ts);
 2452                 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME,
 2453                     loss);
 2454                 fnvlist_add_uint64(spa->spa_load_info,
 2455                     ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count);
 2456                 fnvlist_add_uint64(spa->spa_load_info,
 2457                     ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count);
 2458         } else {
 2459                 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
 2460         }
 2461 
 2462         if (spa_load_verify_dryrun)
 2463                 return (0);
 2464 
 2465         if (error) {
 2466                 if (error != ENXIO && error != EIO)
 2467                         error = SET_ERROR(EIO);
 2468                 return (error);
 2469         }
 2470 
 2471         return (verify_ok ? 0 : EIO);
 2472 }
 2473 
 2474 /*
 2475  * Find a value in the pool props object.
 2476  */
 2477 static void
 2478 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
 2479 {
 2480         (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
 2481             zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
 2482 }
 2483 
 2484 /*
 2485  * Find a value in the pool directory object.
 2486  */
 2487 static int
 2488 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
 2489 {
 2490         int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 2491             name, sizeof (uint64_t), 1, val);
 2492 
 2493         if (error != 0 && (error != ENOENT || log_enoent)) {
 2494                 spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
 2495                     "[error=%d]", name, error);
 2496         }
 2497 
 2498         return (error);
 2499 }
 2500 
 2501 static int
 2502 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 2503 {
 2504         vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
 2505         return (SET_ERROR(err));
 2506 }
 2507 
 2508 boolean_t
 2509 spa_livelist_delete_check(spa_t *spa)
 2510 {
 2511         return (spa->spa_livelists_to_delete != 0);
 2512 }
 2513 
 2514 static boolean_t
 2515 spa_livelist_delete_cb_check(void *arg, zthr_t *z)
 2516 {
 2517         (void) z;
 2518         spa_t *spa = arg;
 2519         return (spa_livelist_delete_check(spa));
 2520 }
 2521 
 2522 static int
 2523 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 2524 {
 2525         spa_t *spa = arg;
 2526         zio_free(spa, tx->tx_txg, bp);
 2527         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
 2528             -bp_get_dsize_sync(spa, bp),
 2529             -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
 2530         return (0);
 2531 }
 2532 
 2533 static int
 2534 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
 2535 {
 2536         int err;
 2537         zap_cursor_t zc;
 2538         zap_attribute_t za;
 2539         zap_cursor_init(&zc, os, zap_obj);
 2540         err = zap_cursor_retrieve(&zc, &za);
 2541         zap_cursor_fini(&zc);
 2542         if (err == 0)
 2543                 *llp = za.za_first_integer;
 2544         return (err);
 2545 }
 2546 
/*
 * Components of livelist deletion that must be performed in syncing
 * context: freeing block pointers and updating the pool-wide data
 * structures to indicate how much work is left to do
 *
 * Argument bundle handed to sublist_delete_sync().
 */
typedef struct sublist_delete_arg {
        /* pool; passed to delete_blkptr_cb() for each block pointer */
        spa_t *spa;
        /* deadlist from which the entry identified by 'key' is removed */
        dsl_deadlist_t *ll;
        /* key of the deadlist entry to remove */
        uint64_t key;
        /* block pointers to free */
        bplist_t *to_free;
} sublist_delete_arg_t;
 2558 
 2559 static void
 2560 sublist_delete_sync(void *arg, dmu_tx_t *tx)
 2561 {
 2562         sublist_delete_arg_t *sda = arg;
 2563         spa_t *spa = sda->spa;
 2564         dsl_deadlist_t *ll = sda->ll;
 2565         uint64_t key = sda->key;
 2566         bplist_t *to_free = sda->to_free;
 2567 
 2568         bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
 2569         dsl_deadlist_remove_entry(ll, key, tx);
 2570 }
 2571 
/*
 * Argument bundle handed to livelist_delete_sync().
 */
typedef struct livelist_delete_arg {
        /* pool that owns the livelist */
        spa_t *spa;
        /* livelist object to remove from 'zap_obj' and free */
        uint64_t ll_obj;
        /* ZAP listing the livelists to delete; destroyed once empty */
        uint64_t zap_obj;
} livelist_delete_arg_t;
 2577 
 2578 static void
 2579 livelist_delete_sync(void *arg, dmu_tx_t *tx)
 2580 {
 2581         livelist_delete_arg_t *lda = arg;
 2582         spa_t *spa = lda->spa;
 2583         uint64_t ll_obj = lda->ll_obj;
 2584         uint64_t zap_obj = lda->zap_obj;
 2585         objset_t *mos = spa->spa_meta_objset;
 2586         uint64_t count;
 2587 
 2588         /* free the livelist and decrement the feature count */
 2589         VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
 2590         dsl_deadlist_free(mos, ll_obj, tx);
 2591         spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
 2592         VERIFY0(zap_count(mos, zap_obj, &count));
 2593         if (count == 0) {
 2594                 /* no more livelists to delete */
 2595                 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
 2596                     DMU_POOL_DELETED_CLONES, tx));
 2597                 VERIFY0(zap_destroy(mos, zap_obj, tx));
 2598                 spa->spa_livelists_to_delete = 0;
 2599                 spa_notify_waiters(spa);
 2600         }
 2601 }
 2602 
 2603 /*
 2604  * Load in the value for the livelist to be removed and open it. Then,
 2605  * load its first sublist and determine which block pointers should actually
 2606  * be freed. Then, call a synctask which performs the actual frees and updates
 2607  * the pool-wide livelist data.
 2608  */
 2609 static void
 2610 spa_livelist_delete_cb(void *arg, zthr_t *z)
 2611 {
 2612         spa_t *spa = arg;
 2613         uint64_t ll_obj = 0, count;
 2614         objset_t *mos = spa->spa_meta_objset;
 2615         uint64_t zap_obj = spa->spa_livelists_to_delete;
 2616         /*
 2617          * Determine the next livelist to delete. This function should only
 2618          * be called if there is at least one deleted clone.
 2619          */
 2620         VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
 2621         VERIFY0(zap_count(mos, ll_obj, &count));
 2622         if (count > 0) {
 2623                 dsl_deadlist_t *ll;
 2624                 dsl_deadlist_entry_t *dle;
 2625                 bplist_t to_free;
 2626                 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
 2627                 dsl_deadlist_open(ll, mos, ll_obj);
 2628                 dle = dsl_deadlist_first(ll);
 2629                 ASSERT3P(dle, !=, NULL);
 2630                 bplist_create(&to_free);
 2631                 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
 2632                     z, NULL);
 2633                 if (err == 0) {
 2634                         sublist_delete_arg_t sync_arg = {
 2635                             .spa = spa,
 2636                             .ll = ll,
 2637                             .key = dle->dle_mintxg,
 2638                             .to_free = &to_free
 2639                         };
 2640                         zfs_dbgmsg("deleting sublist (id %llu) from"
 2641                             " livelist %llu, %lld remaining",
 2642                             (u_longlong_t)dle->dle_bpobj.bpo_object,
 2643                             (u_longlong_t)ll_obj, (longlong_t)count - 1);
 2644                         VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 2645                             sublist_delete_sync, &sync_arg, 0,
 2646                             ZFS_SPACE_CHECK_DESTROY));
 2647                 } else {
 2648                         VERIFY3U(err, ==, EINTR);
 2649                 }
 2650                 bplist_clear(&to_free);
 2651                 bplist_destroy(&to_free);
 2652                 dsl_deadlist_close(ll);
 2653                 kmem_free(ll, sizeof (dsl_deadlist_t));
 2654         } else {
 2655                 livelist_delete_arg_t sync_arg = {
 2656                     .spa = spa,
 2657                     .ll_obj = ll_obj,
 2658                     .zap_obj = zap_obj
 2659                 };
 2660                 zfs_dbgmsg("deletion of livelist %llu completed",
 2661                     (u_longlong_t)ll_obj);
 2662                 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
 2663                     &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
 2664         }
 2665 }
 2666 
 2667 static void
 2668 spa_start_livelist_destroy_thread(spa_t *spa)
 2669 {
 2670         ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
 2671         spa->spa_livelist_delete_zthr =
 2672             zthr_create("z_livelist_destroy",
 2673             spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa,
 2674             minclsyspri);
 2675 }
 2676 
/*
 * Argument for livelist_track_new_cb(): the two lists into which newly
 * seen block pointers are sorted.
 */
typedef struct livelist_new_arg {
        /* receives block pointers reported as not freed */
        bplist_t *allocs;
        /* receives block pointers reported as freed */
        bplist_t *frees;
} livelist_new_arg_t;
 2681 
 2682 static int
 2683 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 2684     dmu_tx_t *tx)
 2685 {
 2686         ASSERT(tx == NULL);
 2687         livelist_new_arg_t *lna = arg;
 2688         if (bp_freed) {
 2689                 bplist_append(lna->frees, bp);
 2690         } else {
 2691                 bplist_append(lna->allocs, bp);
 2692                 zfs_livelist_condense_new_alloc++;
 2693         }
 2694         return (0);
 2695 }
 2696 
/*
 * State handed from spa_livelist_condense_cb() (open context) to
 * spa_livelist_condense_sync() (sync task).  The sync task frees it.
 */
typedef struct livelist_condense_arg {
        /* pool being condensed */
        spa_t *spa;
        /* block pointers to re-insert into the livelist as allocations */
        bplist_t to_keep;
        /* blkptr count of the first entry as processed in open context */
        uint64_t first_size;
        /* blkptr count of the next entry as processed in open context */
        uint64_t next_size;
} livelist_condense_arg_t;
 2703 
/*
 * Sync task that completes a livelist condense started by
 * spa_livelist_condense_cb().  'arg' is the livelist_condense_arg_t built
 * in open context.
 *
 * Unless the condense was cancelled, any block pointers added to the two
 * entries since they were processed in open context are collected, the
 * first entry is cleared, the next entry is removed, and the surviving
 * block pointers are re-inserted into the livelist.
 *
 * In all cases the hold on the dataset's dbuf is released,
 * spa_to_condense.ds is reset, 'arg' is freed and the syncing flag is
 * cleared.
 */
static void
spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
{
        livelist_condense_arg_t *lca = arg;
        spa_t *spa = lca->spa;
        bplist_t new_frees;
        dsl_dataset_t *ds = spa->spa_to_condense.ds;

        /* Have we been cancelled? */
        if (spa->spa_to_condense.cancelled) {
                zfs_livelist_condense_sync_cancel++;
                goto out;
        }

        dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
        dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
        dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;

        /*
         * It's possible that the livelist was changed while the zthr was
         * running. Therefore, we need to check for new blkptrs in the two
         * entries being condensed and continue to track them in the livelist.
         * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
         * it's possible that the newly added blkptrs are FREEs or ALLOCs so
         * we need to sort them into two different bplists.
         */
        uint64_t first_obj = first->dle_bpobj.bpo_object;
        uint64_t next_obj = next->dle_bpobj.bpo_object;
        uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
        uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;

        bplist_create(&new_frees);
        /* New ALLOCs join the to_keep list; new FREEs are kept separately. */
        livelist_new_arg_t new_bps = {
            .allocs = &lca->to_keep,
            .frees = &new_frees,
        };

        /* Only walk an entry if it grew since open context processed it. */
        if (cur_first_size > lca->first_size) {
                VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
                    livelist_track_new_cb, &new_bps, lca->first_size));
        }
        if (cur_next_size > lca->next_size) {
                VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
                    livelist_track_new_cb, &new_bps, lca->next_size));
        }

        dsl_deadlist_clear_entry(first, ll, tx);
        ASSERT(bpobj_is_empty(&first->dle_bpobj));
        dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);

        /* Re-insert everything that survives the condense. */
        bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
        bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
        bplist_destroy(&new_frees);

        char dsname[ZFS_MAX_DATASET_NAME_LEN];
        dsl_dataset_name(ds, dsname);
        zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
            "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
            "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname,
            (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj,
            (u_longlong_t)cur_first_size, (u_longlong_t)next_obj,
            (u_longlong_t)cur_next_size,
            (u_longlong_t)first->dle_bpobj.bpo_object,
            (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
out:
        /* Common cleanup for both the cancelled and the completed case. */
        dmu_buf_rele(ds->ds_dbuf, spa);
        spa->spa_to_condense.ds = NULL;
        bplist_clear(&lca->to_keep);
        bplist_destroy(&lca->to_keep);
        kmem_free(lca, sizeof (livelist_condense_arg_t));
        spa->spa_to_condense.syncing = B_FALSE;
}
 2776 
 2777 static void
 2778 spa_livelist_condense_cb(void *arg, zthr_t *t)
 2779 {
 2780         while (zfs_livelist_condense_zthr_pause &&
 2781             !(zthr_has_waiters(t) || zthr_iscancelled(t)))
 2782                 delay(1);
 2783 
 2784         spa_t *spa = arg;
 2785         dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
 2786         dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
 2787         uint64_t first_size, next_size;
 2788 
 2789         livelist_condense_arg_t *lca =
 2790             kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
 2791         bplist_create(&lca->to_keep);
 2792 
 2793         /*
 2794          * Process the livelists (matching FREEs and ALLOCs) in open context
 2795          * so we have minimal work in syncing context to condense.
 2796          *
 2797          * We save bpobj sizes (first_size and next_size) to use later in
 2798          * syncing context to determine if entries were added to these sublists
 2799          * while in open context. This is possible because the clone is still
 2800          * active and open for normal writes and we want to make sure the new,
 2801          * unprocessed blockpointers are inserted into the livelist normally.
 2802          *
 2803          * Note that dsl_process_sub_livelist() both stores the size number of
 2804          * blockpointers and iterates over them while the bpobj's lock held, so
 2805          * the sizes returned to us are consistent which what was actually
 2806          * processed.
 2807          */
 2808         int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
 2809             &first_size);
 2810         if (err == 0)
 2811                 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
 2812                     t, &next_size);
 2813 
 2814         if (err == 0) {
 2815                 while (zfs_livelist_condense_sync_pause &&
 2816                     !(zthr_has_waiters(t) || zthr_iscancelled(t)))
 2817                         delay(1);
 2818 
 2819                 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 2820                 dmu_tx_mark_netfree(tx);
 2821                 dmu_tx_hold_space(tx, 1);
 2822                 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
 2823                 if (err == 0) {
 2824                         /*
 2825                          * Prevent the condense zthr restarting before
 2826                          * the synctask completes.
 2827                          */
 2828                         spa->spa_to_condense.syncing = B_TRUE;
 2829                         lca->spa = spa;
 2830                         lca->first_size = first_size;
 2831                         lca->next_size = next_size;
 2832                         dsl_sync_task_nowait(spa_get_dsl(spa),
 2833                             spa_livelist_condense_sync, lca, tx);
 2834                         dmu_tx_commit(tx);
 2835                         return;
 2836                 }
 2837         }
 2838         /*
 2839          * Condensing can not continue: either it was externally stopped or
 2840          * we were unable to assign to a tx because the pool has run out of
 2841          * space. In the second case, we'll just end up trying to condense
 2842          * again in a later txg.
 2843          */
 2844         ASSERT(err != 0);
 2845         bplist_clear(&lca->to_keep);
 2846         bplist_destroy(&lca->to_keep);
 2847         kmem_free(lca, sizeof (livelist_condense_arg_t));
 2848         dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
 2849         spa->spa_to_condense.ds = NULL;
 2850         if (err == EINTR)
 2851                 zfs_livelist_condense_zthr_cancel++;
 2852 }
 2853 
 2854 /*
 2855  * Check that there is something to condense but that a condense is not
 2856  * already in progress and that condensing has not been cancelled.
 2857  */
 2858 static boolean_t
 2859 spa_livelist_condense_cb_check(void *arg, zthr_t *z)
 2860 {
 2861         (void) z;
 2862         spa_t *spa = arg;
 2863         if ((spa->spa_to_condense.ds != NULL) &&
 2864             (spa->spa_to_condense.syncing == B_FALSE) &&
 2865             (spa->spa_to_condense.cancelled == B_FALSE)) {
 2866                 return (B_TRUE);
 2867         }
 2868         return (B_FALSE);
 2869 }
 2870 
 2871 static void
 2872 spa_start_livelist_condensing_thread(spa_t *spa)
 2873 {
 2874         spa->spa_to_condense.ds = NULL;
 2875         spa->spa_to_condense.first = NULL;
 2876         spa->spa_to_condense.next = NULL;
 2877         spa->spa_to_condense.syncing = B_FALSE;
 2878         spa->spa_to_condense.cancelled = B_FALSE;
 2879 
 2880         ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
 2881         spa->spa_livelist_condense_zthr =
 2882             zthr_create("z_livelist_condense",
 2883             spa_livelist_condense_cb_check,
 2884             spa_livelist_condense_cb, spa, minclsyspri);
 2885 }
 2886 
 2887 static void
 2888 spa_spawn_aux_threads(spa_t *spa)
 2889 {
 2890         ASSERT(spa_writeable(spa));
 2891 
 2892         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 2893 
 2894         spa_start_indirect_condensing_thread(spa);
 2895         spa_start_livelist_destroy_thread(spa);
 2896         spa_start_livelist_condensing_thread(spa);
 2897 
 2898         ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
 2899         spa->spa_checkpoint_discard_zthr =
 2900             zthr_create("z_checkpoint_discard",
 2901             spa_checkpoint_discard_thread_check,
 2902             spa_checkpoint_discard_thread, spa, minclsyspri);
 2903 }
 2904 
 2905 /*
 2906  * Fix up config after a partly-completed split.  This is done with the
 2907  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
 2908  * pool have that entry in their config, but only the splitting one contains
 2909  * a list of all the guids of the vdevs that are being split off.
 2910  *
 2911  * This function determines what to do with that list: either rejoin
 2912  * all the disks to the pool, or complete the splitting process.  To attempt
 2913  * the rejoin, each disk that is offlined is marked online again, and
 2914  * we do a reopen() call.  If the vdev label for every disk that was
 2915  * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
 2916  * then we call vdev_split() on each disk, and complete the split.
 2917  *
 2918  * Otherwise we leave the config alone, with all the vdevs in place in
 2919  * the original pool.
 2920  */
 2921 static void
 2922 spa_try_repair(spa_t *spa, nvlist_t *config)
 2923 {
 2924         uint_t extracted;
 2925         uint64_t *glist;
 2926         uint_t i, gcount;
 2927         nvlist_t *nvl;
 2928         vdev_t **vd;
 2929         boolean_t attempt_reopen;
 2930 
 2931         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
 2932                 return;
 2933 
 2934         /* check that the config is complete */
 2935         if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
 2936             &glist, &gcount) != 0)
 2937                 return;
 2938 
 2939         vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
 2940 
 2941         /* attempt to online all the vdevs & validate */
 2942         attempt_reopen = B_TRUE;
 2943         for (i = 0; i < gcount; i++) {
 2944                 if (glist[i] == 0)      /* vdev is hole */
 2945                         continue;
 2946 
 2947                 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
 2948                 if (vd[i] == NULL) {
 2949                         /*
 2950                          * Don't bother attempting to reopen the disks;
 2951                          * just do the split.
 2952                          */
 2953                         attempt_reopen = B_FALSE;
 2954                 } else {
 2955                         /* attempt to re-online it */
 2956                         vd[i]->vdev_offline = B_FALSE;
 2957                 }
 2958         }
 2959 
 2960         if (attempt_reopen) {
 2961                 vdev_reopen(spa->spa_root_vdev);
 2962 
 2963                 /* check each device to see what state it's in */
 2964                 for (extracted = 0, i = 0; i < gcount; i++) {
 2965                         if (vd[i] != NULL &&
 2966                             vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
 2967                                 break;
 2968                         ++extracted;
 2969                 }
 2970         }
 2971 
 2972         /*
 2973          * If every disk has been moved to the new pool, or if we never
 2974          * even attempted to look at them, then we split them off for
 2975          * good.
 2976          */
 2977         if (!attempt_reopen || gcount == extracted) {
 2978                 for (i = 0; i < gcount; i++)
 2979                         if (vd[i] != NULL)
 2980                                 vdev_split(vd[i]);
 2981                 vdev_reopen(spa->spa_root_vdev);
 2982         }
 2983 
 2984         kmem_free(vd, gcount * sizeof (vdev_t *));
 2985 }
 2986 
 2987 static int
 2988 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
 2989 {
 2990         const char *ereport = FM_EREPORT_ZFS_POOL;
 2991         int error;
 2992 
 2993         spa->spa_load_state = state;
 2994         (void) spa_import_progress_set_state(spa_guid(spa),
 2995             spa_load_state(spa));
 2996 
 2997         gethrestime(&spa->spa_loaded_ts);
 2998         error = spa_load_impl(spa, type, &ereport);
 2999 
 3000         /*
 3001          * Don't count references from objsets that are already closed
 3002          * and are making their way through the eviction process.
 3003          */
 3004         spa_evicting_os_wait(spa);
 3005         spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
 3006         if (error) {
 3007                 if (error != EEXIST) {
 3008                         spa->spa_loaded_ts.tv_sec = 0;
 3009                         spa->spa_loaded_ts.tv_nsec = 0;
 3010                 }
 3011                 if (error != EBADF) {
 3012                         (void) zfs_ereport_post(ereport, spa,
 3013                             NULL, NULL, NULL, 0);
 3014                 }
 3015         }
 3016         spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
 3017         spa->spa_ena = 0;
 3018 
 3019         (void) spa_import_progress_set_state(spa_guid(spa),
 3020             spa_load_state(spa));
 3021 
 3022         return (error);
 3023 }
 3024 
 3025 #ifdef ZFS_DEBUG
 3026 /*
 3027  * Count the number of per-vdev ZAPs associated with all of the vdevs in the
 3028  * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
 3029  * spa's per-vdev ZAP list.
 3030  */
 3031 static uint64_t
 3032 vdev_count_verify_zaps(vdev_t *vd)
 3033 {
 3034         spa_t *spa = vd->vdev_spa;
 3035         uint64_t total = 0;
 3036 
 3037         if (vd->vdev_top_zap != 0) {
 3038                 total++;
 3039                 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
 3040                     spa->spa_all_vdev_zaps, vd->vdev_top_zap));
 3041         }
 3042         if (vd->vdev_leaf_zap != 0) {
 3043                 total++;
 3044                 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
 3045                     spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
 3046         }
 3047 
 3048         for (uint64_t i = 0; i < vd->vdev_children; i++) {
 3049                 total += vdev_count_verify_zaps(vd->vdev_child[i]);
 3050         }
 3051 
 3052         return (total);
 3053 }
 3054 #else
 3055 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0)
 3056 #endif
 3057 
 3058 /*
 3059  * Determine whether the activity check is required.
 3060  */
 3061 static boolean_t
 3062 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
 3063     nvlist_t *config)
 3064 {
 3065         uint64_t state = 0;
 3066         uint64_t hostid = 0;
 3067         uint64_t tryconfig_txg = 0;
 3068         uint64_t tryconfig_timestamp = 0;
 3069         uint16_t tryconfig_mmp_seq = 0;
 3070         nvlist_t *nvinfo;
 3071 
 3072         if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
 3073                 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
 3074                 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
 3075                     &tryconfig_txg);
 3076                 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 3077                     &tryconfig_timestamp);
 3078                 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
 3079                     &tryconfig_mmp_seq);
 3080         }
 3081 
 3082         (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
 3083 
 3084         /*
 3085          * Disable the MMP activity check - This is used by zdb which
 3086          * is intended to be used on potentially active pools.
 3087          */
 3088         if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
 3089                 return (B_FALSE);
 3090 
 3091         /*
 3092          * Skip the activity check when the MMP feature is disabled.
 3093          */
 3094         if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
 3095                 return (B_FALSE);
 3096 
 3097         /*
 3098          * If the tryconfig_ values are nonzero, they are the results of an
 3099          * earlier tryimport.  If they all match the uberblock we just found,
 3100          * then the pool has not changed and we return false so we do not test
 3101          * a second time.
 3102          */
 3103         if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
 3104             tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
 3105             tryconfig_mmp_seq && tryconfig_mmp_seq ==
 3106             (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
 3107                 return (B_FALSE);
 3108 
 3109         /*
 3110          * Allow the activity check to be skipped when importing the pool
 3111          * on the same host which last imported it.  Since the hostid from
 3112          * configuration may be stale use the one read from the label.
 3113          */
 3114         if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
 3115                 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
 3116 
 3117         if (hostid == spa_get_hostid(spa))
 3118                 return (B_FALSE);
 3119 
 3120         /*
 3121          * Skip the activity test when the pool was cleanly exported.
 3122          */
 3123         if (state != POOL_STATE_ACTIVE)
 3124                 return (B_FALSE);
 3125 
 3126         return (B_TRUE);
 3127 }
 3128 
 3129 /*
 3130  * Nanoseconds the activity check must watch for changes on-disk.
 3131  */
 3132 static uint64_t
 3133 spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
 3134 {
 3135         uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
 3136         uint64_t multihost_interval = MSEC2NSEC(
 3137             MMP_INTERVAL_OK(zfs_multihost_interval));
 3138         uint64_t import_delay = MAX(NANOSEC, import_intervals *
 3139             multihost_interval);
 3140 
 3141         /*
 3142          * Local tunables determine a minimum duration except for the case
 3143          * where we know when the remote host will suspend the pool if MMP
 3144          * writes do not land.
 3145          *
 3146          * See Big Theory comment at the top of mmp.c for the reasoning behind
 3147          * these cases and times.
 3148          */
 3149 
 3150         ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
 3151 
 3152         if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
 3153             MMP_FAIL_INT(ub) > 0) {
 3154 
 3155                 /* MMP on remote host will suspend pool after failed writes */
 3156                 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
 3157                     MMP_IMPORT_SAFETY_FACTOR / 100;
 3158 
 3159                 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
 3160                     "mmp_fails=%llu ub_mmp mmp_interval=%llu "
 3161                     "import_intervals=%llu", (u_longlong_t)import_delay,
 3162                     (u_longlong_t)MMP_FAIL_INT(ub),
 3163                     (u_longlong_t)MMP_INTERVAL(ub),
 3164                     (u_longlong_t)import_intervals);
 3165 
 3166         } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
 3167             MMP_FAIL_INT(ub) == 0) {
 3168 
 3169                 /* MMP on remote host will never suspend pool */
 3170                 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
 3171                     ub->ub_mmp_delay) * import_intervals);
 3172 
 3173                 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
 3174                     "mmp_interval=%llu ub_mmp_delay=%llu "
 3175                     "import_intervals=%llu", (u_longlong_t)import_delay,
 3176                     (u_longlong_t)MMP_INTERVAL(ub),
 3177                     (u_longlong_t)ub->ub_mmp_delay,
 3178                     (u_longlong_t)import_intervals);
 3179 
 3180         } else if (MMP_VALID(ub)) {
 3181                 /*
 3182                  * zfs-0.7 compatibility case
 3183                  */
 3184 
 3185                 import_delay = MAX(import_delay, (multihost_interval +
 3186                     ub->ub_mmp_delay) * import_intervals);
 3187 
 3188                 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
 3189                     "import_intervals=%llu leaves=%u",
 3190                     (u_longlong_t)import_delay,
 3191                     (u_longlong_t)ub->ub_mmp_delay,
 3192                     (u_longlong_t)import_intervals,
 3193                     vdev_count_leaves(spa));
 3194         } else {
 3195                 /* Using local tunings is the only reasonable option */
 3196                 zfs_dbgmsg("pool last imported on non-MMP aware "
 3197                     "host using import_delay=%llu multihost_interval=%llu "
 3198                     "import_intervals=%llu", (u_longlong_t)import_delay,
 3199                     (u_longlong_t)multihost_interval,
 3200                     (u_longlong_t)import_intervals);
 3201         }
 3202 
 3203         return (import_delay);
 3204 }
 3205 
 3206 /*
 3207  * Perform the import activity check.  If the user canceled the import or
 3208  * we detected activity then fail.
 3209  */
 3210 static int
 3211 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
 3212 {
 3213         uint64_t txg = ub->ub_txg;
 3214         uint64_t timestamp = ub->ub_timestamp;
 3215         uint64_t mmp_config = ub->ub_mmp_config;
 3216         uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
 3217         uint64_t import_delay;
 3218         hrtime_t import_expire;
 3219         nvlist_t *mmp_label = NULL;
 3220         vdev_t *rvd = spa->spa_root_vdev;
 3221         kcondvar_t cv;
 3222         kmutex_t mtx;
 3223         int error = 0;
 3224 
 3225         cv_init(&cv, NULL, CV_DEFAULT, NULL);
 3226         mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
 3227         mutex_enter(&mtx);
 3228 
 3229         /*
 3230          * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
 3231          * during the earlier tryimport.  If the txg recorded there is 0 then
 3232          * the pool is known to be active on another host.
 3233          *
 3234          * Otherwise, the pool might be in use on another host.  Check for
 3235          * changes in the uberblocks on disk if necessary.
 3236          */
 3237         if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
 3238                 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
 3239                     ZPOOL_CONFIG_LOAD_INFO);
 3240 
 3241                 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
 3242                     fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
 3243                         vdev_uberblock_load(rvd, ub, &mmp_label);
 3244                         error = SET_ERROR(EREMOTEIO);
 3245                         goto out;
 3246                 }
 3247         }
 3248 
 3249         import_delay = spa_activity_check_duration(spa, ub);
 3250 
 3251         /* Add a small random factor in case of simultaneous imports (0-25%) */
 3252         import_delay += import_delay * random_in_range(250) / 1000;
 3253 
 3254         import_expire = gethrtime() + import_delay;
 3255 
 3256         while (gethrtime() < import_expire) {
 3257                 (void) spa_import_progress_set_mmp_check(spa_guid(spa),
 3258                     NSEC2SEC(import_expire - gethrtime()));
 3259 
 3260                 vdev_uberblock_load(rvd, ub, &mmp_label);
 3261 
 3262                 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
 3263                     mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
 3264                         zfs_dbgmsg("multihost activity detected "
 3265                             "txg %llu ub_txg  %llu "
 3266                             "timestamp %llu ub_timestamp  %llu "
 3267                             "mmp_config %#llx ub_mmp_config %#llx",
 3268                             (u_longlong_t)txg, (u_longlong_t)ub->ub_txg,
 3269                             (u_longlong_t)timestamp,
 3270                             (u_longlong_t)ub->ub_timestamp,
 3271                             (u_longlong_t)mmp_config,
 3272                             (u_longlong_t)ub->ub_mmp_config);
 3273 
 3274                         error = SET_ERROR(EREMOTEIO);
 3275                         break;
 3276                 }
 3277 
 3278                 if (mmp_label) {
 3279                         nvlist_free(mmp_label);
 3280                         mmp_label = NULL;
 3281                 }
 3282 
 3283                 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
 3284                 if (error != -1) {
 3285                         error = SET_ERROR(EINTR);
 3286                         break;
 3287                 }
 3288                 error = 0;
 3289         }
 3290 
 3291 out:
 3292         mutex_exit(&mtx);
 3293         mutex_destroy(&mtx);
 3294         cv_destroy(&cv);
 3295 
 3296         /*
 3297          * If the pool is determined to be active store the status in the
 3298          * spa->spa_load_info nvlist.  If the remote hostname or hostid are
 3299          * available from configuration read from disk store them as well.
 3300          * This allows 'zpool import' to generate a more useful message.
 3301          *
 3302          * ZPOOL_CONFIG_MMP_STATE    - observed pool status (mandatory)
 3303          * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
 3304          * ZPOOL_CONFIG_MMP_HOSTID   - hostid from the active pool
 3305          */
 3306         if (error == EREMOTEIO) {
 3307                 const char *hostname = "<unknown>";
 3308                 uint64_t hostid = 0;
 3309 
 3310                 if (mmp_label) {
 3311                         if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
 3312                                 hostname = fnvlist_lookup_string(mmp_label,
 3313                                     ZPOOL_CONFIG_HOSTNAME);
 3314                                 fnvlist_add_string(spa->spa_load_info,
 3315                                     ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
 3316                         }
 3317 
 3318                         if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
 3319                                 hostid = fnvlist_lookup_uint64(mmp_label,
 3320                                     ZPOOL_CONFIG_HOSTID);
 3321                                 fnvlist_add_uint64(spa->spa_load_info,
 3322                                     ZPOOL_CONFIG_MMP_HOSTID, hostid);
 3323                         }
 3324                 }
 3325 
 3326                 fnvlist_add_uint64(spa->spa_load_info,
 3327                     ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
 3328                 fnvlist_add_uint64(spa->spa_load_info,
 3329                     ZPOOL_CONFIG_MMP_TXG, 0);
 3330 
 3331                 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
 3332         }
 3333 
 3334         if (mmp_label)
 3335                 nvlist_free(mmp_label);
 3336 
 3337         return (error);
 3338 }
 3339 
 3340 static int
 3341 spa_verify_host(spa_t *spa, nvlist_t *mos_config)
 3342 {
 3343         uint64_t hostid;
 3344         char *hostname;
 3345         uint64_t myhostid = 0;
 3346 
 3347         if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
 3348             ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
 3349                 hostname = fnvlist_lookup_string(mos_config,
 3350                     ZPOOL_CONFIG_HOSTNAME);
 3351 
 3352                 myhostid = zone_get_hostid(NULL);
 3353 
 3354                 if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
 3355                         cmn_err(CE_WARN, "pool '%s' could not be "
 3356                             "loaded as it was last accessed by "
 3357                             "another system (host: %s hostid: 0x%llx). "
 3358                             "See: https://openzfs.github.io/openzfs-docs/msg/"
 3359                             "ZFS-8000-EY",
 3360                             spa_name(spa), hostname, (u_longlong_t)hostid);
 3361                         spa_load_failed(spa, "hostid verification failed: pool "
 3362                             "last accessed by host: %s (hostid: 0x%llx)",
 3363                             hostname, (u_longlong_t)hostid);
 3364                         return (SET_ERROR(EBADF));
 3365                 }
 3366         }
 3367 
 3368         return (0);
 3369 }
 3370 
 3371 static int
 3372 spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
 3373 {
 3374         int error = 0;
 3375         nvlist_t *nvtree, *nvl, *config = spa->spa_config;
 3376         int parse;
 3377         vdev_t *rvd;
 3378         uint64_t pool_guid;
 3379         char *comment;
 3380         char *compatibility;
 3381 
 3382         /*
 3383          * Versioning wasn't explicitly added to the label until later, so if
 3384          * it's not present treat it as the initial version.
 3385          */
 3386         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 3387             &spa->spa_ubsync.ub_version) != 0)
 3388                 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 3389 
 3390         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
 3391                 spa_load_failed(spa, "invalid config provided: '%s' missing",
 3392                     ZPOOL_CONFIG_POOL_GUID);
 3393                 return (SET_ERROR(EINVAL));
 3394         }
 3395 
 3396         /*
 3397          * If we are doing an import, ensure that the pool is not already
 3398          * imported by checking if its pool guid already exists in the
 3399          * spa namespace.
 3400          *
 3401          * The only case that we allow an already imported pool to be
 3402          * imported again, is when the pool is checkpointed and we want to
 3403          * look at its checkpointed state from userland tools like zdb.
 3404          */
 3405 #ifdef _KERNEL
 3406         if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
 3407             spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
 3408             spa_guid_exists(pool_guid, 0)) {
 3409 #else
 3410         if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
 3411             spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
 3412             spa_guid_exists(pool_guid, 0) &&
 3413             !spa_importing_readonly_checkpoint(spa)) {
 3414 #endif
 3415                 spa_load_failed(spa, "a pool with guid %llu is already open",
 3416                     (u_longlong_t)pool_guid);
 3417                 return (SET_ERROR(EEXIST));
 3418         }
 3419 
 3420         spa->spa_config_guid = pool_guid;
 3421 
 3422         nvlist_free(spa->spa_load_info);
 3423         spa->spa_load_info = fnvlist_alloc();
 3424 
 3425         ASSERT(spa->spa_comment == NULL);
 3426         if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
 3427                 spa->spa_comment = spa_strdup(comment);
 3428 
 3429         ASSERT(spa->spa_compatibility == NULL);
 3430         if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
 3431             &compatibility) == 0)
 3432                 spa->spa_compatibility = spa_strdup(compatibility);
 3433 
 3434         (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 3435             &spa->spa_config_txg);
 3436 
 3437         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
 3438                 spa->spa_config_splitting = fnvlist_dup(nvl);
 3439 
 3440         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
 3441                 spa_load_failed(spa, "invalid config provided: '%s' missing",
 3442                     ZPOOL_CONFIG_VDEV_TREE);
 3443                 return (SET_ERROR(EINVAL));
 3444         }
 3445 
 3446         /*
 3447          * Create "The Godfather" zio to hold all async IOs
 3448          */
 3449         spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 3450             KM_SLEEP);
 3451         for (int i = 0; i < max_ncpus; i++) {
 3452                 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 3453                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 3454                     ZIO_FLAG_GODFATHER);
 3455         }
 3456 
 3457         /*
 3458          * Parse the configuration into a vdev tree.  We explicitly set the
 3459          * value that will be returned by spa_version() since parsing the
 3460          * configuration requires knowing the version number.
 3461          */
 3462         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 3463         parse = (type == SPA_IMPORT_EXISTING ?
 3464             VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
 3465         error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
 3466         spa_config_exit(spa, SCL_ALL, FTAG);
 3467 
 3468         if (error != 0) {
 3469                 spa_load_failed(spa, "unable to parse config [error=%d]",
 3470                     error);
 3471                 return (error);
 3472         }
 3473 
 3474         ASSERT(spa->spa_root_vdev == rvd);
 3475         ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
 3476         ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
 3477 
 3478         if (type != SPA_IMPORT_ASSEMBLE) {
 3479                 ASSERT(spa_guid(spa) == pool_guid);
 3480         }
 3481 
 3482         return (0);
 3483 }
 3484 
 3485 /*
 3486  * Recursively open all vdevs in the vdev tree. This function is called twice:
 3487  * first with the untrusted config, then with the trusted config.
 3488  */
 3489 static int
 3490 spa_ld_open_vdevs(spa_t *spa)
 3491 {
 3492         int error = 0;
 3493 
 3494         /*
 3495          * spa_missing_tvds_allowed defines how many top-level vdevs can be
 3496          * missing/unopenable for the root vdev to be still considered openable.
 3497          */
 3498         if (spa->spa_trust_config) {
 3499                 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
 3500         } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
 3501                 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
 3502         } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
 3503                 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
 3504         } else {
 3505                 spa->spa_missing_tvds_allowed = 0;
 3506         }
 3507 
 3508         spa->spa_missing_tvds_allowed =
 3509             MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
 3510 
 3511         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 3512         error = vdev_open(spa->spa_root_vdev);
 3513         spa_config_exit(spa, SCL_ALL, FTAG);
 3514 
 3515         if (spa->spa_missing_tvds != 0) {
 3516                 spa_load_note(spa, "vdev tree has %lld missing top-level "
 3517                     "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
 3518                 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
 3519                         /*
 3520                          * Although theoretically we could allow users to open
 3521                          * incomplete pools in RW mode, we'd need to add a lot
 3522                          * of extra logic (e.g. adjust pool space to account
 3523                          * for missing vdevs).
 3524                          * This limitation also prevents users from accidentally
 3525                          * opening the pool in RW mode during data recovery and
 3526                          * damaging it further.
 3527                          */
 3528                         spa_load_note(spa, "pools with missing top-level "
 3529                             "vdevs can only be opened in read-only mode.");
 3530                         error = SET_ERROR(ENXIO);
 3531                 } else {
 3532                         spa_load_note(spa, "current settings allow for maximum "
 3533                             "%lld missing top-level vdevs at this stage.",
 3534                             (u_longlong_t)spa->spa_missing_tvds_allowed);
 3535                 }
 3536         }
 3537         if (error != 0) {
 3538                 spa_load_failed(spa, "unable to open vdev tree [error=%d]",
 3539                     error);
 3540         }
 3541         if (spa->spa_missing_tvds != 0 || error != 0)
 3542                 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
 3543 
 3544         return (error);
 3545 }
 3546 
 3547 /*
 3548  * We need to validate the vdev labels against the configuration that
 3549  * we have in hand. This function is called twice: first with an untrusted
 3550  * config, then with a trusted config. The validation is more strict when the
 3551  * config is trusted.
 3552  */
 3553 static int
 3554 spa_ld_validate_vdevs(spa_t *spa)
 3555 {
 3556         int error = 0;
 3557         vdev_t *rvd = spa->spa_root_vdev;
 3558 
 3559         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 3560         error = vdev_validate(rvd);
 3561         spa_config_exit(spa, SCL_ALL, FTAG);
 3562 
 3563         if (error != 0) {
 3564                 spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
 3565                 return (error);
 3566         }
 3567 
 3568         if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
 3569                 spa_load_failed(spa, "cannot open vdev tree after invalidating "
 3570                     "some vdevs");
 3571                 vdev_dbgmsg_print_tree(rvd, 2);
 3572                 return (SET_ERROR(ENXIO));
 3573         }
 3574 
 3575         return (0);
 3576 }
 3577 
 3578 static void
 3579 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
 3580 {
 3581         spa->spa_state = POOL_STATE_ACTIVE;
 3582         spa->spa_ubsync = spa->spa_uberblock;
 3583         spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
 3584             TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
 3585         spa->spa_first_txg = spa->spa_last_ubsync_txg ?
 3586             spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
 3587         spa->spa_claim_max_txg = spa->spa_first_txg;
 3588         spa->spa_prev_software_version = ub->ub_software_version;
 3589 }
 3590 
 3591 static int
 3592 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
 3593 {
 3594         vdev_t *rvd = spa->spa_root_vdev;
 3595         nvlist_t *label;
 3596         uberblock_t *ub = &spa->spa_uberblock;
 3597         boolean_t activity_check = B_FALSE;
 3598 
 3599         /*
 3600          * If we are opening the checkpointed state of the pool by
 3601          * rewinding to it, at this point we will have written the
 3602          * checkpointed uberblock to the vdev labels, so searching
 3603          * the labels will find the right uberblock.  However, if
 3604          * we are opening the checkpointed state read-only, we have
 3605          * not modified the labels. Therefore, we must ignore the
 3606          * labels and continue using the spa_uberblock that was set
 3607          * by spa_ld_checkpoint_rewind.
 3608          *
 3609          * Note that it would be fine to ignore the labels when
 3610          * rewinding (opening writeable) as well. However, if we
 3611          * crash just after writing the labels, we will end up
 3612          * searching the labels. Doing so in the common case means
 3613          * that this code path gets exercised normally, rather than
 3614          * just in the edge case.
 3615          */
 3616         if (ub->ub_checkpoint_txg != 0 &&
 3617             spa_importing_readonly_checkpoint(spa)) {
 3618                 spa_ld_select_uberblock_done(spa, ub);
 3619                 return (0);
 3620         }
 3621 
 3622         /*
 3623          * Find the best uberblock.
 3624          */
 3625         vdev_uberblock_load(rvd, ub, &label);
 3626 
 3627         /*
 3628          * If we weren't able to find a single valid uberblock, return failure.
 3629          */
 3630         if (ub->ub_txg == 0) {
 3631                 nvlist_free(label);
 3632                 spa_load_failed(spa, "no valid uberblock found");
 3633                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
 3634         }
 3635 
 3636         if (spa->spa_load_max_txg != UINT64_MAX) {
 3637                 (void) spa_import_progress_set_max_txg(spa_guid(spa),
 3638                     (u_longlong_t)spa->spa_load_max_txg);
 3639         }
 3640         spa_load_note(spa, "using uberblock with txg=%llu",
 3641             (u_longlong_t)ub->ub_txg);
 3642 
 3643 
 3644         /*
 3645          * For pools which have the multihost property on determine if the
 3646          * pool is truly inactive and can be safely imported.  Prevent
 3647          * hosts which don't have a hostid set from importing the pool.
 3648          */
 3649         activity_check = spa_activity_check_required(spa, ub, label,
 3650             spa->spa_config);
 3651         if (activity_check) {
 3652                 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
 3653                     spa_get_hostid(spa) == 0) {
 3654                         nvlist_free(label);
 3655                         fnvlist_add_uint64(spa->spa_load_info,
 3656                             ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
 3657                         return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
 3658                 }
 3659 
 3660                 int error = spa_activity_check(spa, ub, spa->spa_config);
 3661                 if (error) {
 3662                         nvlist_free(label);
 3663                         return (error);
 3664                 }
 3665 
 3666                 fnvlist_add_uint64(spa->spa_load_info,
 3667                     ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
 3668                 fnvlist_add_uint64(spa->spa_load_info,
 3669                     ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
 3670                 fnvlist_add_uint16(spa->spa_load_info,
 3671                     ZPOOL_CONFIG_MMP_SEQ,
 3672                     (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
 3673         }
 3674 
 3675         /*
 3676          * If the pool has an unsupported version we can't open it.
 3677          */
 3678         if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
 3679                 nvlist_free(label);
 3680                 spa_load_failed(spa, "version %llu is not supported",
 3681                     (u_longlong_t)ub->ub_version);
 3682                 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
 3683         }
 3684 
 3685         if (ub->ub_version >= SPA_VERSION_FEATURES) {
 3686                 nvlist_t *features;
 3687 
 3688                 /*
 3689                  * If we weren't able to find what's necessary for reading the
 3690                  * MOS in the label, return failure.
 3691                  */
 3692                 if (label == NULL) {
 3693                         spa_load_failed(spa, "label config unavailable");
 3694                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 3695                             ENXIO));
 3696                 }
 3697 
 3698                 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
 3699                     &features) != 0) {
 3700                         nvlist_free(label);
 3701                         spa_load_failed(spa, "invalid label: '%s' missing",
 3702                             ZPOOL_CONFIG_FEATURES_FOR_READ);
 3703                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 3704                             ENXIO));
 3705                 }
 3706 
 3707                 /*
 3708                  * Update our in-core representation with the definitive values
 3709                  * from the label.
 3710                  */
 3711                 nvlist_free(spa->spa_label_features);
 3712                 spa->spa_label_features = fnvlist_dup(features);
 3713         }
 3714 
 3715         nvlist_free(label);
 3716 
 3717         /*
 3718          * Look through entries in the label nvlist's features_for_read. If
 3719          * there is a feature listed there which we don't understand then we
 3720          * cannot open a pool.
 3721          */
 3722         if (ub->ub_version >= SPA_VERSION_FEATURES) {
 3723                 nvlist_t *unsup_feat;
 3724 
 3725                 unsup_feat = fnvlist_alloc();
 3726 
 3727                 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
 3728                     NULL); nvp != NULL;
 3729                     nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
 3730                         if (!zfeature_is_supported(nvpair_name(nvp))) {
 3731                                 fnvlist_add_string(unsup_feat,
 3732                                     nvpair_name(nvp), "");
 3733                         }
 3734                 }
 3735 
 3736                 if (!nvlist_empty(unsup_feat)) {
 3737                         fnvlist_add_nvlist(spa->spa_load_info,
 3738                             ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 3739                         nvlist_free(unsup_feat);
 3740                         spa_load_failed(spa, "some features are unsupported");
 3741                         return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 3742                             ENOTSUP));
 3743                 }
 3744 
 3745                 nvlist_free(unsup_feat);
 3746         }
 3747 
 3748         if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
 3749                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 3750                 spa_try_repair(spa, spa->spa_config);
 3751                 spa_config_exit(spa, SCL_ALL, FTAG);
 3752                 nvlist_free(spa->spa_config_splitting);
 3753                 spa->spa_config_splitting = NULL;
 3754         }
 3755 
 3756         /*
 3757          * Initialize internal SPA structures.
 3758          */
 3759         spa_ld_select_uberblock_done(spa, ub);
 3760 
 3761         return (0);
 3762 }
 3763 
 3764 static int
 3765 spa_ld_open_rootbp(spa_t *spa)
 3766 {
 3767         int error = 0;
 3768         vdev_t *rvd = spa->spa_root_vdev;
 3769 
 3770         error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 3771         if (error != 0) {
 3772                 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
 3773                     "[error=%d]", error);
 3774                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 3775         }
 3776         spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
 3777 
 3778         return (0);
 3779 }
 3780 
 3781 static int
 3782 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
 3783     boolean_t reloading)
 3784 {
 3785         vdev_t *mrvd, *rvd = spa->spa_root_vdev;
 3786         nvlist_t *nv, *mos_config, *policy;
 3787         int error = 0, copy_error;
 3788         uint64_t healthy_tvds, healthy_tvds_mos;
 3789         uint64_t mos_config_txg;
 3790 
 3791         if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
 3792             != 0)
 3793                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 3794 
 3795         /*
 3796          * If we're assembling a pool from a split, the config provided is
 3797          * already trusted so there is nothing to do.
 3798          */
 3799         if (type == SPA_IMPORT_ASSEMBLE)
 3800                 return (0);
 3801 
 3802         healthy_tvds = spa_healthy_core_tvds(spa);
 3803 
 3804         if (load_nvlist(spa, spa->spa_config_object, &mos_config)
 3805             != 0) {
 3806                 spa_load_failed(spa, "unable to retrieve MOS config");
 3807                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 3808         }
 3809 
 3810         /*
 3811          * If we are doing an open, pool owner wasn't verified yet, thus do
 3812          * the verification here.
 3813          */
 3814         if (spa->spa_load_state == SPA_LOAD_OPEN) {
 3815                 error = spa_verify_host(spa, mos_config);
 3816                 if (error != 0) {
 3817                         nvlist_free(mos_config);
 3818                         return (error);
 3819                 }
 3820         }
 3821 
 3822         nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
 3823 
 3824         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 3825 
 3826         /*
 3827          * Build a new vdev tree from the trusted config
 3828          */
 3829         error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
 3830         if (error != 0) {
 3831                 nvlist_free(mos_config);
 3832                 spa_config_exit(spa, SCL_ALL, FTAG);
 3833                 spa_load_failed(spa, "spa_config_parse failed [error=%d]",
 3834                     error);
 3835                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
 3836         }
 3837 
 3838         /*
 3839          * Vdev paths in the MOS may be obsolete. If the untrusted config was
 3840          * obtained by scanning /dev/dsk, then it will have the right vdev
 3841          * paths. We update the trusted MOS config with this information.
 3842          * We first try to copy the paths with vdev_copy_path_strict, which
 3843          * succeeds only when both configs have exactly the same vdev tree.
 3844          * If that fails, we fall back to a more flexible method that has a
 3845          * best effort policy.
 3846          */
 3847         copy_error = vdev_copy_path_strict(rvd, mrvd);
 3848         if (copy_error != 0 || spa_load_print_vdev_tree) {
 3849                 spa_load_note(spa, "provided vdev tree:");
 3850                 vdev_dbgmsg_print_tree(rvd, 2);
 3851                 spa_load_note(spa, "MOS vdev tree:");
 3852                 vdev_dbgmsg_print_tree(mrvd, 2);
 3853         }
 3854         if (copy_error != 0) {
 3855                 spa_load_note(spa, "vdev_copy_path_strict failed, falling "
 3856                     "back to vdev_copy_path_relaxed");
 3857                 vdev_copy_path_relaxed(rvd, mrvd);
 3858         }
 3859 
 3860         vdev_close(rvd);
 3861         vdev_free(rvd);
 3862         spa->spa_root_vdev = mrvd;
 3863         rvd = mrvd;
 3864         spa_config_exit(spa, SCL_ALL, FTAG);
 3865 
 3866         /*
 3867          * We will use spa_config if we decide to reload the spa or if spa_load
 3868          * fails and we rewind. We must thus regenerate the config using the
 3869          * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
 3870          * pass settings on how to load the pool and is not stored in the MOS.
 3871          * We copy it over to our new, trusted config.
 3872          */
 3873         mos_config_txg = fnvlist_lookup_uint64(mos_config,
 3874             ZPOOL_CONFIG_POOL_TXG);
 3875         nvlist_free(mos_config);
 3876         mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
 3877         if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
 3878             &policy) == 0)
 3879                 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
 3880         spa_config_set(spa, mos_config);
 3881         spa->spa_config_source = SPA_CONFIG_SRC_MOS;
 3882 
 3883         /*
 3884          * Now that we got the config from the MOS, we should be more strict
 3885          * in checking blkptrs and can make assumptions about the consistency
 3886          * of the vdev tree. spa_trust_config must be set to true before opening
 3887          * vdevs in order for them to be writeable.
 3888          */
 3889         spa->spa_trust_config = B_TRUE;
 3890 
 3891         /*
 3892          * Open and validate the new vdev tree
 3893          */
 3894         error = spa_ld_open_vdevs(spa);
 3895         if (error != 0)
 3896                 return (error);
 3897 
 3898         error = spa_ld_validate_vdevs(spa);
 3899         if (error != 0)
 3900                 return (error);
 3901 
 3902         if (copy_error != 0 || spa_load_print_vdev_tree) {
 3903                 spa_load_note(spa, "final vdev tree:");
 3904                 vdev_dbgmsg_print_tree(rvd, 2);
 3905         }
 3906 
 3907         if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
 3908             !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
 3909                 /*
 3910                  * Sanity check to make sure that we are indeed loading the
 3911                  * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
 3912                  * in the config provided and they happened to be the only ones
 3913                  * to have the latest uberblock, we could involuntarily perform
 3914                  * an extreme rewind.
 3915                  */
 3916                 healthy_tvds_mos = spa_healthy_core_tvds(spa);
 3917                 if (healthy_tvds_mos - healthy_tvds >=
 3918                     SPA_SYNC_MIN_VDEVS) {
 3919                         spa_load_note(spa, "config provided misses too many "
 3920                             "top-level vdevs compared to MOS (%lld vs %lld). ",
 3921                             (u_longlong_t)healthy_tvds,
 3922                             (u_longlong_t)healthy_tvds_mos);
 3923                         spa_load_note(spa, "vdev tree:");
 3924                         vdev_dbgmsg_print_tree(rvd, 2);
 3925                         if (reloading) {
 3926                                 spa_load_failed(spa, "config was already "
 3927                                     "provided from MOS. Aborting.");
 3928                                 return (spa_vdev_err(rvd,
 3929                                     VDEV_AUX_CORRUPT_DATA, EIO));
 3930                         }
 3931                         spa_load_note(spa, "spa must be reloaded using MOS "
 3932                             "config");
 3933                         return (SET_ERROR(EAGAIN));
 3934                 }
 3935         }
 3936 
 3937         error = spa_check_for_missing_logs(spa);
 3938         if (error != 0)
 3939                 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
 3940 
 3941         if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
 3942                 spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
 3943                     "guid sum (%llu != %llu)",
 3944                     (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
 3945                     (u_longlong_t)rvd->vdev_guid_sum);
 3946                 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
 3947                     ENXIO));
 3948         }
 3949 
 3950         return (0);
 3951 }
 3952 
 3953 static int
 3954 spa_ld_open_indirect_vdev_metadata(spa_t *spa)
 3955 {
 3956         int error = 0;
 3957         vdev_t *rvd = spa->spa_root_vdev;
 3958 
 3959         /*
 3960          * Everything that we read before spa_remove_init() must be stored
 3961          * on concreted vdevs.  Therefore we do this as early as possible.
 3962          */
 3963         error = spa_remove_init(spa);
 3964         if (error != 0) {
 3965                 spa_load_failed(spa, "spa_remove_init failed [error=%d]",
 3966                     error);
 3967                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 3968         }
 3969 
 3970         /*
 3971          * Retrieve information needed to condense indirect vdev mappings.
 3972          */
 3973         error = spa_condense_init(spa);
 3974         if (error != 0) {
 3975                 spa_load_failed(spa, "spa_condense_init failed [error=%d]",
 3976                     error);
 3977                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
 3978         }
 3979 
 3980         return (0);
 3981 }
 3982 
 3983 static int
 3984 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
 3985 {
 3986         int error = 0;
 3987         vdev_t *rvd = spa->spa_root_vdev;
 3988 
 3989         if (spa_version(spa) >= SPA_VERSION_FEATURES) {
 3990                 boolean_t missing_feat_read = B_FALSE;
 3991                 nvlist_t *unsup_feat, *enabled_feat;
 3992 
 3993                 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
 3994                     &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
 3995                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 3996                 }
 3997 
 3998                 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
 3999                     &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
 4000                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4001                 }
 4002 
 4003                 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
 4004                     &spa->spa_feat_desc_obj, B_TRUE) != 0) {
 4005                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4006                 }
 4007 
 4008                 enabled_feat = fnvlist_alloc();
 4009                 unsup_feat = fnvlist_alloc();
 4010 
 4011                 if (!spa_features_check(spa, B_FALSE,
 4012                     unsup_feat, enabled_feat))
 4013                         missing_feat_read = B_TRUE;
 4014 
 4015                 if (spa_writeable(spa) ||
 4016                     spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
 4017                         if (!spa_features_check(spa, B_TRUE,
 4018                             unsup_feat, enabled_feat)) {
 4019                                 *missing_feat_writep = B_TRUE;
 4020                         }
 4021                 }
 4022 
 4023                 fnvlist_add_nvlist(spa->spa_load_info,
 4024                     ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
 4025 
 4026                 if (!nvlist_empty(unsup_feat)) {
 4027                         fnvlist_add_nvlist(spa->spa_load_info,
 4028                             ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 4029                 }
 4030 
 4031                 fnvlist_free(enabled_feat);
 4032                 fnvlist_free(unsup_feat);
 4033 
 4034                 if (!missing_feat_read) {
 4035                         fnvlist_add_boolean(spa->spa_load_info,
 4036                             ZPOOL_CONFIG_CAN_RDONLY);
 4037                 }
 4038 
 4039                 /*
 4040                  * If the state is SPA_LOAD_TRYIMPORT, our objective is
 4041                  * twofold: to determine whether the pool is available for
 4042                  * import in read-write mode and (if it is not) whether the
 4043                  * pool is available for import in read-only mode. If the pool
 4044                  * is available for import in read-write mode, it is displayed
 4045                  * as available in userland; if it is not available for import
 4046                  * in read-only mode, it is displayed as unavailable in
 4047                  * userland. If the pool is available for import in read-only
 4048                  * mode but not read-write mode, it is displayed as unavailable
 4049                  * in userland with a special note that the pool is actually
 4050                  * available for open in read-only mode.
 4051                  *
 4052                  * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
 4053                  * missing a feature for write, we must first determine whether
 4054                  * the pool can be opened read-only before returning to
 4055                  * userland in order to know whether to display the
 4056                  * abovementioned note.
 4057                  */
 4058                 if (missing_feat_read || (*missing_feat_writep &&
 4059                     spa_writeable(spa))) {
 4060                         spa_load_failed(spa, "pool uses unsupported features");
 4061                         return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 4062                             ENOTSUP));
 4063                 }
 4064 
 4065                 /*
 4066                  * Load refcounts for ZFS features from disk into an in-memory
 4067                  * cache during SPA initialization.
 4068                  */
 4069                 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
 4070                         uint64_t refcount;
 4071 
 4072                         error = feature_get_refcount_from_disk(spa,
 4073                             &spa_feature_table[i], &refcount);
 4074                         if (error == 0) {
 4075                                 spa->spa_feat_refcount_cache[i] = refcount;
 4076                         } else if (error == ENOTSUP) {
 4077                                 spa->spa_feat_refcount_cache[i] =
 4078                                     SPA_FEATURE_DISABLED;
 4079                         } else {
 4080                                 spa_load_failed(spa, "error getting refcount "
 4081                                     "for feature %s [error=%d]",
 4082                                     spa_feature_table[i].fi_guid, error);
 4083                                 return (spa_vdev_err(rvd,
 4084                                     VDEV_AUX_CORRUPT_DATA, EIO));
 4085                         }
 4086                 }
 4087         }
 4088 
 4089         if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
 4090                 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
 4091                     &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
 4092                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4093         }
 4094 
 4095         /*
 4096          * Encryption was added before bookmark_v2, even though bookmark_v2
 4097          * is now a dependency. If this pool has encryption enabled without
 4098          * bookmark_v2, trigger an errata message.
 4099          */
 4100         if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
 4101             !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
 4102                 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
 4103         }
 4104 
 4105         return (0);
 4106 }
 4107 
 4108 static int
 4109 spa_ld_load_special_directories(spa_t *spa)
 4110 {
 4111         int error = 0;
 4112         vdev_t *rvd = spa->spa_root_vdev;
 4113 
 4114         spa->spa_is_initializing = B_TRUE;
 4115         error = dsl_pool_open(spa->spa_dsl_pool);
 4116         spa->spa_is_initializing = B_FALSE;
 4117         if (error != 0) {
 4118                 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
 4119                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4120         }
 4121 
 4122         return (0);
 4123 }
 4124 
 4125 static int
 4126 spa_ld_get_props(spa_t *spa)
 4127 {
 4128         int error = 0;
 4129         uint64_t obj;
 4130         vdev_t *rvd = spa->spa_root_vdev;
 4131 
 4132         /* Grab the checksum salt from the MOS. */
 4133         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 4134             DMU_POOL_CHECKSUM_SALT, 1,
 4135             sizeof (spa->spa_cksum_salt.zcs_bytes),
 4136             spa->spa_cksum_salt.zcs_bytes);
 4137         if (error == ENOENT) {
 4138                 /* Generate a new salt for subsequent use */
 4139                 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
 4140                     sizeof (spa->spa_cksum_salt.zcs_bytes));
 4141         } else if (error != 0) {
 4142                 spa_load_failed(spa, "unable to retrieve checksum salt from "
 4143                     "MOS [error=%d]", error);
 4144                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4145         }
 4146 
 4147         if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
 4148                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4149         error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
 4150         if (error != 0) {
 4151                 spa_load_failed(spa, "error opening deferred-frees bpobj "
 4152                     "[error=%d]", error);
 4153                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4154         }
 4155 
 4156         /*
 4157          * Load the bit that tells us to use the new accounting function
 4158          * (raid-z deflation).  If we have an older pool, this will not
 4159          * be present.
 4160          */
 4161         error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
 4162         if (error != 0 && error != ENOENT)
 4163                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4164 
 4165         error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
 4166             &spa->spa_creation_version, B_FALSE);
 4167         if (error != 0 && error != ENOENT)
 4168                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4169 
 4170         /*
 4171          * Load the persistent error log.  If we have an older pool, this will
 4172          * not be present.
 4173          */
 4174         error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
 4175             B_FALSE);
 4176         if (error != 0 && error != ENOENT)
 4177                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4178 
 4179         error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
 4180             &spa->spa_errlog_scrub, B_FALSE);
 4181         if (error != 0 && error != ENOENT)
 4182                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4183 
 4184         /*
 4185          * Load the livelist deletion field. If a livelist is queued for
 4186          * deletion, indicate that in the spa
 4187          */
 4188         error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
 4189             &spa->spa_livelists_to_delete, B_FALSE);
 4190         if (error != 0 && error != ENOENT)
 4191                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4192 
 4193         /*
 4194          * Load the history object.  If we have an older pool, this
 4195          * will not be present.
 4196          */
 4197         error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
 4198         if (error != 0 && error != ENOENT)
 4199                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4200 
 4201         /*
 4202          * Load the per-vdev ZAP map. If we have an older pool, this will not
 4203          * be present; in this case, defer its creation to a later time to
 4204          * avoid dirtying the MOS this early / out of sync context. See
 4205          * spa_sync_config_object.
 4206          */
 4207 
 4208         /* The sentinel is only available in the MOS config. */
 4209         nvlist_t *mos_config;
 4210         if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
 4211                 spa_load_failed(spa, "unable to retrieve MOS config");
 4212                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4213         }
 4214 
 4215         error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
 4216             &spa->spa_all_vdev_zaps, B_FALSE);
 4217 
 4218         if (error == ENOENT) {
 4219                 VERIFY(!nvlist_exists(mos_config,
 4220                     ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
 4221                 spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
 4222                 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
 4223         } else if (error != 0) {
 4224                 nvlist_free(mos_config);
 4225                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4226         } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
 4227                 /*
 4228                  * An older version of ZFS overwrote the sentinel value, so
 4229                  * we have orphaned per-vdev ZAPs in the MOS. Defer their
 4230                  * destruction to later; see spa_sync_config_object.
 4231                  */
 4232                 spa->spa_avz_action = AVZ_ACTION_DESTROY;
 4233                 /*
 4234                  * We're assuming that no vdevs have had their ZAPs created
 4235                  * before this. Better be sure of it.
 4236                  */
 4237                 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
 4238         }
 4239         nvlist_free(mos_config);
 4240 
 4241         spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 4242 
 4243         error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
 4244             B_FALSE);
 4245         if (error && error != ENOENT)
 4246                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4247 
 4248         if (error == 0) {
 4249                 uint64_t autoreplace = 0;
 4250 
 4251                 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
 4252                 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
 4253                 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
 4254                 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
 4255                 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
 4256                 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
 4257                 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
 4258                 spa->spa_autoreplace = (autoreplace != 0);
 4259         }
 4260 
 4261         /*
 4262          * If we are importing a pool with missing top-level vdevs,
 4263          * we enforce that the pool doesn't panic or get suspended on
 4264          * error since the likelihood of missing data is extremely high.
 4265          */
 4266         if (spa->spa_missing_tvds > 0 &&
 4267             spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
 4268             spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
 4269                 spa_load_note(spa, "forcing failmode to 'continue' "
 4270                     "as some top level vdevs are missing");
 4271                 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
 4272         }
 4273 
 4274         return (0);
 4275 }
 4276 
 4277 static int
 4278 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
 4279 {
 4280         int error = 0;
 4281         vdev_t *rvd = spa->spa_root_vdev;
 4282 
 4283         /*
 4284          * If we're assembling the pool from the split-off vdevs of
 4285          * an existing pool, we don't want to attach the spares & cache
 4286          * devices.
 4287          */
 4288 
 4289         /*
 4290          * Load any hot spares for this pool.
 4291          */
 4292         error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
 4293             B_FALSE);
 4294         if (error != 0 && error != ENOENT)
 4295                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4296         if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 4297                 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
 4298                 if (load_nvlist(spa, spa->spa_spares.sav_object,
 4299                     &spa->spa_spares.sav_config) != 0) {
 4300                         spa_load_failed(spa, "error loading spares nvlist");
 4301                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4302                 }
 4303 
 4304                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 4305                 spa_load_spares(spa);
 4306                 spa_config_exit(spa, SCL_ALL, FTAG);
 4307         } else if (error == 0) {
 4308                 spa->spa_spares.sav_sync = B_TRUE;
 4309         }
 4310 
 4311         /*
 4312          * Load any level 2 ARC devices for this pool.
 4313          */
 4314         error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
 4315             &spa->spa_l2cache.sav_object, B_FALSE);
 4316         if (error != 0 && error != ENOENT)
 4317                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4318         if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 4319                 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
 4320                 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
 4321                     &spa->spa_l2cache.sav_config) != 0) {
 4322                         spa_load_failed(spa, "error loading l2cache nvlist");
 4323                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4324                 }
 4325 
 4326                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 4327                 spa_load_l2cache(spa);
 4328                 spa_config_exit(spa, SCL_ALL, FTAG);
 4329         } else if (error == 0) {
 4330                 spa->spa_l2cache.sav_sync = B_TRUE;
 4331         }
 4332 
 4333         return (0);
 4334 }
 4335 
 4336 static int
 4337 spa_ld_load_vdev_metadata(spa_t *spa)
 4338 {
 4339         int error = 0;
 4340         vdev_t *rvd = spa->spa_root_vdev;
 4341 
 4342         /*
 4343          * If the 'multihost' property is set, then never allow a pool to
 4344          * be imported when the system hostid is zero.  The exception to
 4345          * this rule is zdb which is always allowed to access pools.
 4346          */
 4347         if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
 4348             (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
 4349                 fnvlist_add_uint64(spa->spa_load_info,
 4350                     ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
 4351                 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
 4352         }
 4353 
 4354         /*
 4355          * If the 'autoreplace' property is set, then post a resource notifying
 4356          * the ZFS DE that it should not issue any faults for unopenable
 4357          * devices.  We also iterate over the vdevs, and post a sysevent for any
 4358          * unopenable vdevs so that the normal autoreplace handler can take
 4359          * over.
 4360          */
 4361         if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
 4362                 spa_check_removed(spa->spa_root_vdev);
 4363                 /*
 4364                  * For the import case, this is done in spa_import(), because
 4365                  * at this point we're using the spare definitions from
 4366                  * the MOS config, not necessarily from the userland config.
 4367                  */
 4368                 if (spa->spa_load_state != SPA_LOAD_IMPORT) {
 4369                         spa_aux_check_removed(&spa->spa_spares);
 4370                         spa_aux_check_removed(&spa->spa_l2cache);
 4371                 }
 4372         }
 4373 
 4374         /*
 4375          * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
 4376          */
 4377         error = vdev_load(rvd);
 4378         if (error != 0) {
 4379                 spa_load_failed(spa, "vdev_load failed [error=%d]", error);
 4380                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
 4381         }
 4382 
 4383         error = spa_ld_log_spacemaps(spa);
 4384         if (error != 0) {
 4385                 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
 4386                     error);
 4387                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
 4388         }
 4389 
 4390         /*
 4391          * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
 4392          */
 4393         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 4394         vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
 4395         spa_config_exit(spa, SCL_ALL, FTAG);
 4396 
 4397         return (0);
 4398 }
 4399 
 4400 static int
 4401 spa_ld_load_dedup_tables(spa_t *spa)
 4402 {
 4403         int error = 0;
 4404         vdev_t *rvd = spa->spa_root_vdev;
 4405 
 4406         error = ddt_load(spa);
 4407         if (error != 0) {
 4408                 spa_load_failed(spa, "ddt_load failed [error=%d]", error);
 4409                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 4410         }
 4411 
 4412         return (0);
 4413 }
 4414 
 4415 static int
 4416 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
 4417 {
 4418         vdev_t *rvd = spa->spa_root_vdev;
 4419 
 4420         if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
 4421                 boolean_t missing = spa_check_logs(spa);
 4422                 if (missing) {
 4423                         if (spa->spa_missing_tvds != 0) {
 4424                                 spa_load_note(spa, "spa_check_logs failed "
 4425                                     "so dropping the logs");
 4426                         } else {
 4427                                 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
 4428                                 spa_load_failed(spa, "spa_check_logs failed");
 4429                                 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
 4430                                     ENXIO));
 4431                         }
 4432                 }
 4433         }
 4434 
 4435         return (0);
 4436 }
 4437 
 4438 static int
 4439 spa_ld_verify_pool_data(spa_t *spa)
 4440 {
 4441         int error = 0;
 4442         vdev_t *rvd = spa->spa_root_vdev;
 4443 
 4444         /*
 4445          * We've successfully opened the pool, verify that we're ready
 4446          * to start pushing transactions.
 4447          */
 4448         if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
 4449                 error = spa_load_verify(spa);
 4450                 if (error != 0) {
 4451                         spa_load_failed(spa, "spa_load_verify failed "
 4452                             "[error=%d]", error);
 4453                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 4454                             error));
 4455                 }
 4456         }
 4457 
 4458         return (0);
 4459 }
 4460 
 4461 static void
 4462 spa_ld_claim_log_blocks(spa_t *spa)
 4463 {
 4464         dmu_tx_t *tx;
 4465         dsl_pool_t *dp = spa_get_dsl(spa);
 4466 
 4467         /*
 4468          * Claim log blocks that haven't been committed yet.
 4469          * This must all happen in a single txg.
 4470          * Note: spa_claim_max_txg is updated by spa_claim_notify(),
 4471          * invoked from zil_claim_log_block()'s i/o done callback.
 4472          * Price of rollback is that we abandon the log.
 4473          */
 4474         spa->spa_claiming = B_TRUE;
 4475 
 4476         tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
 4477         (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 4478             zil_claim, tx, DS_FIND_CHILDREN);
 4479         dmu_tx_commit(tx);
 4480 
 4481         spa->spa_claiming = B_FALSE;
 4482 
 4483         spa_set_log_state(spa, SPA_LOG_GOOD);
 4484 }
 4485 
 4486 static void
 4487 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
 4488     boolean_t update_config_cache)
 4489 {
 4490         vdev_t *rvd = spa->spa_root_vdev;
 4491         int need_update = B_FALSE;
 4492 
 4493         /*
 4494          * If the config cache is stale, or we have uninitialized
 4495          * metaslabs (see spa_vdev_add()), then update the config.
 4496          *
 4497          * If this is a verbatim import, trust the current
 4498          * in-core spa_config and update the disk labels.
 4499          */
 4500         if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
 4501             spa->spa_load_state == SPA_LOAD_IMPORT ||
 4502             spa->spa_load_state == SPA_LOAD_RECOVER ||
 4503             (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
 4504                 need_update = B_TRUE;
 4505 
 4506         for (int c = 0; c < rvd->vdev_children; c++)
 4507                 if (rvd->vdev_child[c]->vdev_ms_array == 0)
 4508                         need_update = B_TRUE;
 4509 
 4510         /*
 4511          * Update the config cache asynchronously in case we're the
 4512          * root pool, in which case the config cache isn't writable yet.
 4513          */
 4514         if (need_update)
 4515                 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 4516 }
 4517 
 4518 static void
 4519 spa_ld_prepare_for_reload(spa_t *spa)
 4520 {
 4521         spa_mode_t mode = spa->spa_mode;
 4522         int async_suspended = spa->spa_async_suspended;
 4523 
 4524         spa_unload(spa);
 4525         spa_deactivate(spa);
 4526         spa_activate(spa, mode);
 4527 
 4528         /*
 4529          * We save the value of spa_async_suspended as it gets reset to 0 by
 4530          * spa_unload(). We want to restore it back to the original value before
 4531          * returning as we might be calling spa_async_resume() later.
 4532          */
 4533         spa->spa_async_suspended = async_suspended;
 4534 }
 4535 
 4536 static int
 4537 spa_ld_read_checkpoint_txg(spa_t *spa)
 4538 {
 4539         uberblock_t checkpoint;
 4540         int error = 0;
 4541 
 4542         ASSERT0(spa->spa_checkpoint_txg);
 4543         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 4544 
 4545         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 4546             DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 4547             sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 4548 
 4549         if (error == ENOENT)
 4550                 return (0);
 4551 
 4552         if (error != 0)
 4553                 return (error);
 4554 
 4555         ASSERT3U(checkpoint.ub_txg, !=, 0);
 4556         ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
 4557         ASSERT3U(checkpoint.ub_timestamp, !=, 0);
 4558         spa->spa_checkpoint_txg = checkpoint.ub_txg;
 4559         spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
 4560 
 4561         return (0);
 4562 }
 4563 
 4564 static int
 4565 spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
 4566 {
 4567         int error = 0;
 4568 
 4569         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 4570         ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
 4571 
 4572         /*
 4573          * Never trust the config that is provided unless we are assembling
 4574          * a pool following a split.
 4575          * This means don't trust blkptrs and the vdev tree in general. This
 4576          * also effectively puts the spa in read-only mode since
 4577          * spa_writeable() checks for spa_trust_config to be true.
 4578          * We will later load a trusted config from the MOS.
 4579          */
 4580         if (type != SPA_IMPORT_ASSEMBLE)
 4581                 spa->spa_trust_config = B_FALSE;
 4582 
 4583         /*
 4584          * Parse the config provided to create a vdev tree.
 4585          */
 4586         error = spa_ld_parse_config(spa, type);
 4587         if (error != 0)
 4588                 return (error);
 4589 
 4590         spa_import_progress_add(spa);
 4591 
 4592         /*
 4593          * Now that we have the vdev tree, try to open each vdev. This involves
 4594          * opening the underlying physical device, retrieving its geometry and
 4595          * probing the vdev with a dummy I/O. The state of each vdev will be set
 4596          * based on the success of those operations. After this we'll be ready
 4597          * to read from the vdevs.
 4598          */
 4599         error = spa_ld_open_vdevs(spa);
 4600         if (error != 0)
 4601                 return (error);
 4602 
 4603         /*
 4604          * Read the label of each vdev and make sure that the GUIDs stored
 4605          * there match the GUIDs in the config provided.
 4606          * If we're assembling a new pool that's been split off from an
 4607          * existing pool, the labels haven't yet been updated so we skip
 4608          * validation for now.
 4609          */
 4610         if (type != SPA_IMPORT_ASSEMBLE) {
 4611                 error = spa_ld_validate_vdevs(spa);
 4612                 if (error != 0)
 4613                         return (error);
 4614         }
 4615 
 4616         /*
 4617          * Read all vdev labels to find the best uberblock (i.e. latest,
 4618          * unless spa_load_max_txg is set) and store it in spa_uberblock. We
 4619          * get the list of features required to read blkptrs in the MOS from
 4620          * the vdev label with the best uberblock and verify that our version
 4621          * of zfs supports them all.
 4622          */
 4623         error = spa_ld_select_uberblock(spa, type);
 4624         if (error != 0)
 4625                 return (error);
 4626 
 4627         /*
 4628          * Pass that uberblock to the dsl_pool layer which will open the root
 4629          * blkptr. This blkptr points to the latest version of the MOS and will
 4630          * allow us to read its contents.
 4631          */
 4632         error = spa_ld_open_rootbp(spa);
 4633         if (error != 0)
 4634                 return (error);
 4635 
 4636         return (0);
 4637 }
 4638 
 4639 static int
 4640 spa_ld_checkpoint_rewind(spa_t *spa)
 4641 {
 4642         uberblock_t checkpoint;
 4643         int error = 0;
 4644 
 4645         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 4646         ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 4647 
 4648         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 4649             DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 4650             sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 4651 
 4652         if (error != 0) {
 4653                 spa_load_failed(spa, "unable to retrieve checkpointed "
 4654                     "uberblock from the MOS config [error=%d]", error);
 4655 
 4656                 if (error == ENOENT)
 4657                         error = ZFS_ERR_NO_CHECKPOINT;
 4658 
 4659                 return (error);
 4660         }
 4661 
 4662         ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
 4663         ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
 4664 
 4665         /*
 4666          * We need to update the txg and timestamp of the checkpointed
 4667          * uberblock to be higher than the latest one. This ensures that
 4668          * the checkpointed uberblock is selected if we were to close and
 4669          * reopen the pool right after we've written it in the vdev labels.
 4670          * (also see block comment in vdev_uberblock_compare)
 4671          */
 4672         checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
 4673         checkpoint.ub_timestamp = gethrestime_sec();
 4674 
 4675         /*
 4676          * Set current uberblock to be the checkpointed uberblock.
 4677          */
 4678         spa->spa_uberblock = checkpoint;
 4679 
 4680         /*
 4681          * If we are doing a normal rewind, then the pool is open for
 4682          * writing and we sync the "updated" checkpointed uberblock to
 4683          * disk. Once this is done, we've basically rewound the whole
 4684          * pool and there is no way back.
 4685          *
 4686          * There are cases when we don't want to attempt and sync the
 4687          * checkpointed uberblock to disk because we are opening a
 4688          * pool as read-only. Specifically, verifying the checkpointed
 4689          * state with zdb, and importing the checkpointed state to get
 4690          * a "preview" of its content.
 4691          */
 4692         if (spa_writeable(spa)) {
 4693                 vdev_t *rvd = spa->spa_root_vdev;
 4694 
 4695                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 4696                 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
 4697                 int svdcount = 0;
 4698                 int children = rvd->vdev_children;
 4699                 int c0 = random_in_range(children);
 4700 
 4701                 for (int c = 0; c < children; c++) {
 4702                         vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
 4703 
 4704                         /* Stop when revisiting the first vdev */
 4705                         if (c > 0 && svd[0] == vd)
 4706                                 break;
 4707 
 4708                         if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
 4709                             !vdev_is_concrete(vd))
 4710                                 continue;
 4711 
 4712                         svd[svdcount++] = vd;
 4713                         if (svdcount == SPA_SYNC_MIN_VDEVS)
 4714                                 break;
 4715                 }
 4716                 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
 4717                 if (error == 0)
 4718                         spa->spa_last_synced_guid = rvd->vdev_guid;
 4719                 spa_config_exit(spa, SCL_ALL, FTAG);
 4720 
 4721                 if (error != 0) {
 4722                         spa_load_failed(spa, "failed to write checkpointed "
 4723                             "uberblock to the vdev labels [error=%d]", error);
 4724                         return (error);
 4725                 }
 4726         }
 4727 
 4728         return (0);
 4729 }
 4730 
 4731 static int
 4732 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
 4733     boolean_t *update_config_cache)
 4734 {
 4735         int error;
 4736 
 4737         /*
 4738          * Parse the config for pool, open and validate vdevs,
 4739          * select an uberblock, and use that uberblock to open
 4740          * the MOS.
 4741          */
 4742         error = spa_ld_mos_init(spa, type);
 4743         if (error != 0)
 4744                 return (error);
 4745 
 4746         /*
 4747          * Retrieve the trusted config stored in the MOS and use it to create
 4748          * a new, exact version of the vdev tree, then reopen all vdevs.
 4749          */
 4750         error = spa_ld_trusted_config(spa, type, B_FALSE);
 4751         if (error == EAGAIN) {
 4752                 if (update_config_cache != NULL)
 4753                         *update_config_cache = B_TRUE;
 4754 
 4755                 /*
 4756                  * Redo the loading process with the trusted config if it is
 4757                  * too different from the untrusted config.
 4758                  */
 4759                 spa_ld_prepare_for_reload(spa);
 4760                 spa_load_note(spa, "RELOADING");
 4761                 error = spa_ld_mos_init(spa, type);
 4762                 if (error != 0)
 4763                         return (error);
 4764 
 4765                 error = spa_ld_trusted_config(spa, type, B_TRUE);
 4766                 if (error != 0)
 4767                         return (error);
 4768 
 4769         } else if (error != 0) {
 4770                 return (error);
 4771         }
 4772 
 4773         return (0);
 4774 }
 4775 
 4776 /*
 4777  * Load an existing storage pool, using the config provided. This config
 4778  * describes which vdevs are part of the pool and is later validated against
 4779  * partial configs present in each vdev's label and an entire copy of the
 4780  * config stored in the MOS.
 4781  */
 4782 static int
 4783 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 4784 {
 4785         int error = 0;
 4786         boolean_t missing_feat_write = B_FALSE;
 4787         boolean_t checkpoint_rewind =
 4788             (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 4789         boolean_t update_config_cache = B_FALSE;
 4790 
 4791         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 4792         ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
 4793 
 4794         spa_load_note(spa, "LOADING");
 4795 
 4796         error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
 4797         if (error != 0)
 4798                 return (error);
 4799 
 4800         /*
 4801          * If we are rewinding to the checkpoint then we need to repeat
 4802          * everything we've done so far in this function but this time
 4803          * selecting the checkpointed uberblock and using that to open
 4804          * the MOS.
 4805          */
 4806         if (checkpoint_rewind) {
 4807                 /*
 4808                  * If we are rewinding to the checkpoint update config cache
 4809                  * anyway.
 4810                  */
 4811                 update_config_cache = B_TRUE;
 4812 
 4813                 /*
 4814                  * Extract the checkpointed uberblock from the current MOS
 4815                  * and use this as the pool's uberblock from now on. If the
 4816                  * pool is imported as writeable we also write the checkpoint
 4817                  * uberblock to the labels, making the rewind permanent.
 4818                  */
 4819                 error = spa_ld_checkpoint_rewind(spa);
 4820                 if (error != 0)
 4821                         return (error);
 4822 
 4823                 /*
 4824                  * Redo the loading process again with the
 4825                  * checkpointed uberblock.
 4826                  */
 4827                 spa_ld_prepare_for_reload(spa);
 4828                 spa_load_note(spa, "LOADING checkpointed uberblock");
 4829                 error = spa_ld_mos_with_trusted_config(spa, type, NULL);
 4830                 if (error != 0)
 4831                         return (error);
 4832         }
 4833 
 4834         /*
 4835          * Retrieve the checkpoint txg if the pool has a checkpoint.
 4836          */
 4837         error = spa_ld_read_checkpoint_txg(spa);
 4838         if (error != 0)
 4839                 return (error);
 4840 
 4841         /*
 4842          * Retrieve the mapping of indirect vdevs. Those vdevs were removed
 4843          * from the pool and their contents were re-mapped to other vdevs. Note
 4844          * that everything that we read before this step must have been
 4845          * rewritten on concrete vdevs after the last device removal was
 4846          * initiated. Otherwise we could be reading from indirect vdevs before
 4847          * we have loaded their mappings.
 4848          */
 4849         error = spa_ld_open_indirect_vdev_metadata(spa);
 4850         if (error != 0)
 4851                 return (error);
 4852 
 4853         /*
 4854          * Retrieve the full list of active features from the MOS and check if
 4855          * they are all supported.
 4856          */
 4857         error = spa_ld_check_features(spa, &missing_feat_write);
 4858         if (error != 0)
 4859                 return (error);
 4860 
 4861         /*
 4862          * Load several special directories from the MOS needed by the dsl_pool
 4863          * layer.
 4864          */
 4865         error = spa_ld_load_special_directories(spa);
 4866         if (error != 0)
 4867                 return (error);
 4868 
 4869         /*
 4870          * Retrieve pool properties from the MOS.
 4871          */
 4872         error = spa_ld_get_props(spa);
 4873         if (error != 0)
 4874                 return (error);
 4875 
 4876         /*
 4877          * Retrieve the list of auxiliary devices - cache devices and spares -
 4878          * and open them.
 4879          */
 4880         error = spa_ld_open_aux_vdevs(spa, type);
 4881         if (error != 0)
 4882                 return (error);
 4883 
 4884         /*
 4885          * Load the metadata for all vdevs. Also check if unopenable devices
 4886          * should be autoreplaced.
 4887          */
 4888         error = spa_ld_load_vdev_metadata(spa);
 4889         if (error != 0)
 4890                 return (error);
 4891 
 4892         error = spa_ld_load_dedup_tables(spa);
 4893         if (error != 0)
 4894                 return (error);
 4895 
 4896         /*
 4897          * Verify the logs now to make sure we don't have any unexpected errors
 4898          * when we claim log blocks later.
 4899          */
 4900         error = spa_ld_verify_logs(spa, type, ereport);
 4901         if (error != 0)
 4902                 return (error);
 4903 
 4904         if (missing_feat_write) {
 4905                 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
 4906 
 4907                 /*
 4908                  * At this point, we know that we can open the pool in
 4909                  * read-only mode but not read-write mode. We now have enough
 4910                  * information and can return to userland.
 4911                  */
 4912                 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
 4913                     ENOTSUP));
 4914         }
 4915 
 4916         /*
 4917          * Traverse the last txgs to make sure the pool was left off in a safe
 4918          * state. When performing an extreme rewind, we verify the whole pool,
 4919          * which can take a very long time.
 4920          */
 4921         error = spa_ld_verify_pool_data(spa);
 4922         if (error != 0)
 4923                 return (error);
 4924 
 4925         /*
 4926          * Calculate the deflated space for the pool. This must be done before
 4927          * we write anything to the pool because we'd need to update the space
 4928          * accounting using the deflated sizes.
 4929          */
 4930         spa_update_dspace(spa);
 4931 
 4932         /*
 4933          * We have now retrieved all the information we needed to open the
 4934          * pool. If we are importing the pool in read-write mode, a few
 4935          * additional steps must be performed to finish the import.
 4936          */
 4937         if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
 4938             spa->spa_load_max_txg == UINT64_MAX)) {
 4939                 uint64_t config_cache_txg = spa->spa_config_txg;
 4940 
 4941                 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
 4942 
 4943                 /*
 4944                  * In case of a checkpoint rewind, log the original txg
 4945                  * of the checkpointed uberblock.
 4946                  */
 4947                 if (checkpoint_rewind) {
 4948                         spa_history_log_internal(spa, "checkpoint rewind",
 4949                             NULL, "rewound state to txg=%llu",
 4950                             (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
 4951                 }
 4952 
 4953                 /*
 4954                  * Traverse the ZIL and claim all blocks.
 4955                  */
 4956                 spa_ld_claim_log_blocks(spa);
 4957 
 4958                 /*
 4959                  * Kick-off the syncing thread.
 4960                  */
 4961                 spa->spa_sync_on = B_TRUE;
 4962                 txg_sync_start(spa->spa_dsl_pool);
 4963                 mmp_thread_start(spa);
 4964 
 4965                 /*
 4966                  * Wait for all claims to sync.  We sync up to the highest
 4967                  * claimed log block birth time so that claimed log blocks
 4968                  * don't appear to be from the future.  spa_claim_max_txg
 4969                  * will have been set for us by ZIL traversal operations
 4970                  * performed above.
 4971                  */
 4972                 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 4973 
 4974                 /*
 4975                  * Check if we need to request an update of the config. On the
 4976                  * next sync, we would update the config stored in vdev labels
 4977                  * and the cachefile (by default /etc/zfs/zpool.cache).
 4978                  */
 4979                 spa_ld_check_for_config_update(spa, config_cache_txg,
 4980                     update_config_cache);
 4981 
 4982                 /*
 4983                  * Check if a rebuild was in progress and if so resume it.
 4984                  * Then check all DTLs to see if anything needs resilvering.
 4985                  * The resilver will be deferred if a rebuild was started.
 4986                  */
 4987                 if (vdev_rebuild_active(spa->spa_root_vdev)) {
 4988                         vdev_rebuild_restart(spa);
 4989                 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
 4990                     vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
 4991                         spa_async_request(spa, SPA_ASYNC_RESILVER);
 4992                 }
 4993 
 4994                 /*
 4995                  * Log the fact that we booted up (so that we can detect if
 4996                  * we rebooted in the middle of an operation).
 4997                  */
 4998                 spa_history_log_version(spa, "open", NULL);
 4999 
 5000                 spa_restart_removal(spa);
 5001                 spa_spawn_aux_threads(spa);
 5002 
 5003                 /*
 5004                  * Delete any inconsistent datasets.
 5005                  *
 5006                  * Note:
 5007                  * Since we may be issuing deletes for clones here,
 5008                  * we make sure to do so after we've spawned all the
 5009                  * auxiliary threads above (from which the livelist
 5010                  * deletion zthr is part of).
 5011                  */
 5012                 (void) dmu_objset_find(spa_name(spa),
 5013                     dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
 5014 
 5015                 /*
 5016                  * Clean up any stale temporary dataset userrefs.
 5017                  */
 5018                 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 5019 
 5020                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 5021                 vdev_initialize_restart(spa->spa_root_vdev);
 5022                 vdev_trim_restart(spa->spa_root_vdev);
 5023                 vdev_autotrim_restart(spa);
 5024                 spa_config_exit(spa, SCL_CONFIG, FTAG);
 5025         }
 5026 
 5027         spa_import_progress_remove(spa_guid(spa));
 5028         spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
 5029 
 5030         spa_load_note(spa, "LOADED");
 5031 
 5032         return (0);
 5033 }
 5034 
 5035 static int
 5036 spa_load_retry(spa_t *spa, spa_load_state_t state)
 5037 {
 5038         spa_mode_t mode = spa->spa_mode;
 5039 
 5040         spa_unload(spa);
 5041         spa_deactivate(spa);
 5042 
 5043         spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
 5044 
 5045         spa_activate(spa, mode);
 5046         spa_async_suspend(spa);
 5047 
 5048         spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
 5049             (u_longlong_t)spa->spa_load_max_txg);
 5050 
 5051         return (spa_load(spa, state, SPA_IMPORT_EXISTING));
 5052 }
 5053 
 5054 /*
 5055  * If spa_load() fails this function will try loading prior txg's. If
 5056  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
 5057  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
 5058  * function will not rewind the pool and will return the same error as
 5059  * spa_load().
 5060  */
 5061 static int
 5062 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
 5063     int rewind_flags)
 5064 {
 5065         nvlist_t *loadinfo = NULL;
 5066         nvlist_t *config = NULL;
 5067         int load_error, rewind_error;
 5068         uint64_t safe_rewind_txg;
 5069         uint64_t min_txg;
 5070 
 5071         if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
 5072                 spa->spa_load_max_txg = spa->spa_load_txg;
 5073                 spa_set_log_state(spa, SPA_LOG_CLEAR);
 5074         } else {
 5075                 spa->spa_load_max_txg = max_request;
 5076                 if (max_request != UINT64_MAX)
 5077                         spa->spa_extreme_rewind = B_TRUE;
 5078         }
 5079 
 5080         load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
 5081         if (load_error == 0)
 5082                 return (0);
 5083         if (load_error == ZFS_ERR_NO_CHECKPOINT) {
 5084                 /*
 5085                  * When attempting checkpoint-rewind on a pool with no
 5086                  * checkpoint, we should not attempt to load uberblocks
 5087                  * from previous txgs when spa_load fails.
 5088                  */
 5089                 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 5090                 spa_import_progress_remove(spa_guid(spa));
 5091                 return (load_error);
 5092         }
 5093 
 5094         if (spa->spa_root_vdev != NULL)
 5095                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 5096 
 5097         spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
 5098         spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
 5099 
 5100         if (rewind_flags & ZPOOL_NEVER_REWIND) {
 5101                 nvlist_free(config);
 5102                 spa_import_progress_remove(spa_guid(spa));
 5103                 return (load_error);
 5104         }
 5105 
 5106         if (state == SPA_LOAD_RECOVER) {
 5107                 /* Price of rolling back is discarding txgs, including log */
 5108                 spa_set_log_state(spa, SPA_LOG_CLEAR);
 5109         } else {
 5110                 /*
 5111                  * If we aren't rolling back save the load info from our first
 5112                  * import attempt so that we can restore it after attempting
 5113                  * to rewind.
 5114                  */
 5115                 loadinfo = spa->spa_load_info;
 5116                 spa->spa_load_info = fnvlist_alloc();
 5117         }
 5118 
 5119         spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
 5120         safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
 5121         min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
 5122             TXG_INITIAL : safe_rewind_txg;
 5123 
 5124         /*
 5125          * Continue as long as we're finding errors, we're still within
 5126          * the acceptable rewind range, and we're still finding uberblocks
 5127          */
 5128         while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
 5129             spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
 5130                 if (spa->spa_load_max_txg < safe_rewind_txg)
 5131                         spa->spa_extreme_rewind = B_TRUE;
 5132                 rewind_error = spa_load_retry(spa, state);
 5133         }
 5134 
 5135         spa->spa_extreme_rewind = B_FALSE;
 5136         spa->spa_load_max_txg = UINT64_MAX;
 5137 
 5138         if (config && (rewind_error || state != SPA_LOAD_RECOVER))
 5139                 spa_config_set(spa, config);
 5140         else
 5141                 nvlist_free(config);
 5142 
 5143         if (state == SPA_LOAD_RECOVER) {
 5144                 ASSERT3P(loadinfo, ==, NULL);
 5145                 spa_import_progress_remove(spa_guid(spa));
 5146                 return (rewind_error);
 5147         } else {
 5148                 /* Store the rewind info as part of the initial load info */
 5149                 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
 5150                     spa->spa_load_info);
 5151 
 5152                 /* Restore the initial load info */
 5153                 fnvlist_free(spa->spa_load_info);
 5154                 spa->spa_load_info = loadinfo;
 5155 
 5156                 spa_import_progress_remove(spa_guid(spa));
 5157                 return (load_error);
 5158         }
 5159 }
 5160 
 5161 /*
 5162  * Pool Open/Import
 5163  *
 5164  * The import case is identical to an open except that the configuration is sent
 5165  * down from userland, instead of grabbed from the configuration cache.  For the
 5166  * case of an open, the pool configuration will exist in the
 5167  * POOL_STATE_UNINITIALIZED state.
 5168  *
 5169  * The stats information (gen/count/ustats) is used to gather vdev statistics
 5170  * while the pool is being opened, without having to keep the spa_t around in
 5171  * some ambiguous state.
 5172  */
 5173 static int
 5174 spa_open_common(const char *pool, spa_t **spapp, const void *tag,
 5175     nvlist_t *nvpolicy, nvlist_t **config)
 5176 {
 5177         spa_t *spa;
 5178         spa_load_state_t state = SPA_LOAD_OPEN;
 5179         int error;
 5180         int locked = B_FALSE;
 5181         int firstopen = B_FALSE;
 5182 
 5183         *spapp = NULL;
 5184 
 5185         /*
 5186          * As disgusting as this is, we need to support recursive calls to this
 5187          * function because dsl_dir_open() is called during spa_load(), and ends
 5188          * up calling spa_open() again.  The real fix is to figure out how to
 5189          * avoid dsl_dir_open() calling this in the first place.
 5190          */
 5191         if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
 5192                 mutex_enter(&spa_namespace_lock);
 5193                 locked = B_TRUE;
 5194         }
 5195 
 5196         if ((spa = spa_lookup(pool)) == NULL) {
 5197                 if (locked)
 5198                         mutex_exit(&spa_namespace_lock);
 5199                 return (SET_ERROR(ENOENT));
 5200         }
 5201 
 5202         if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 5203                 zpool_load_policy_t policy;
 5204 
 5205                 firstopen = B_TRUE;
 5206 
 5207                 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
 5208                     &policy);
 5209                 if (policy.zlp_rewind & ZPOOL_DO_REWIND)
 5210                         state = SPA_LOAD_RECOVER;
 5211 
 5212                 spa_activate(spa, spa_mode_global);
 5213 
 5214                 if (state != SPA_LOAD_RECOVER)
 5215                         spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 5216                 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
 5217 
 5218                 zfs_dbgmsg("spa_open_common: opening %s", pool);
 5219                 error = spa_load_best(spa, state, policy.zlp_txg,
 5220                     policy.zlp_rewind);
 5221 
 5222                 if (error == EBADF) {
 5223                         /*
 5224                          * If vdev_validate() returns failure (indicated by
 5225                          * EBADF), it indicates that one of the vdevs indicates
 5226                          * that the pool has been exported or destroyed.  If
 5227                          * this is the case, the config cache is out of sync and
 5228                          * we should remove the pool from the namespace.
 5229                          */
 5230                         spa_unload(spa);
 5231                         spa_deactivate(spa);
 5232                         spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
 5233                         spa_remove(spa);
 5234                         if (locked)
 5235                                 mutex_exit(&spa_namespace_lock);
 5236                         return (SET_ERROR(ENOENT));
 5237                 }
 5238 
 5239                 if (error) {
 5240                         /*
 5241                          * We can't open the pool, but we still have useful
 5242                          * information: the state of each vdev after the
 5243                          * attempted vdev_open().  Return this to the user.
 5244                          */
 5245                         if (config != NULL && spa->spa_config) {
 5246                                 *config = fnvlist_dup(spa->spa_config);
 5247                                 fnvlist_add_nvlist(*config,
 5248                                     ZPOOL_CONFIG_LOAD_INFO,
 5249                                     spa->spa_load_info);
 5250                         }
 5251                         spa_unload(spa);
 5252                         spa_deactivate(spa);
 5253                         spa->spa_last_open_failed = error;
 5254                         if (locked)
 5255                                 mutex_exit(&spa_namespace_lock);
 5256                         *spapp = NULL;
 5257                         return (error);
 5258                 }
 5259         }
 5260 
 5261         spa_open_ref(spa, tag);
 5262 
 5263         if (config != NULL)
 5264                 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 5265 
 5266         /*
 5267          * If we've recovered the pool, pass back any information we
 5268          * gathered while doing the load.
 5269          */
 5270         if (state == SPA_LOAD_RECOVER && config != NULL) {
 5271                 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 5272                     spa->spa_load_info);
 5273         }
 5274 
 5275         if (locked) {
 5276                 spa->spa_last_open_failed = 0;
 5277                 spa->spa_last_ubsync_txg = 0;
 5278                 spa->spa_load_txg = 0;
 5279                 mutex_exit(&spa_namespace_lock);
 5280         }
 5281 
 5282         if (firstopen)
 5283                 zvol_create_minors_recursive(spa_name(spa));
 5284 
 5285         *spapp = spa;
 5286 
 5287         return (0);
 5288 }
 5289 
 5290 int
 5291 spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
 5292     nvlist_t *policy, nvlist_t **config)
 5293 {
 5294         return (spa_open_common(name, spapp, tag, policy, config));
 5295 }
 5296 
 5297 int
 5298 spa_open(const char *name, spa_t **spapp, const void *tag)
 5299 {
 5300         return (spa_open_common(name, spapp, tag, NULL, NULL));
 5301 }
 5302 
 5303 /*
 5304  * Lookup the given spa_t, incrementing the inject count in the process,
 5305  * preventing it from being exported or destroyed.
 5306  */
 5307 spa_t *
 5308 spa_inject_addref(char *name)
 5309 {
 5310         spa_t *spa;
 5311 
 5312         mutex_enter(&spa_namespace_lock);
 5313         if ((spa = spa_lookup(name)) == NULL) {
 5314                 mutex_exit(&spa_namespace_lock);
 5315                 return (NULL);
 5316         }
 5317         spa->spa_inject_ref++;
 5318         mutex_exit(&spa_namespace_lock);
 5319 
 5320         return (spa);
 5321 }
 5322 
 5323 void
 5324 spa_inject_delref(spa_t *spa)
 5325 {
 5326         mutex_enter(&spa_namespace_lock);
 5327         spa->spa_inject_ref--;
 5328         mutex_exit(&spa_namespace_lock);
 5329 }
 5330 
 5331 /*
 5332  * Add spares device information to the nvlist.
 5333  */
 5334 static void
 5335 spa_add_spares(spa_t *spa, nvlist_t *config)
 5336 {
 5337         nvlist_t **spares;
 5338         uint_t i, nspares;
 5339         nvlist_t *nvroot;
 5340         uint64_t guid;
 5341         vdev_stat_t *vs;
 5342         uint_t vsc;
 5343         uint64_t pool;
 5344 
 5345         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 5346 
 5347         if (spa->spa_spares.sav_count == 0)
 5348                 return;
 5349 
 5350         nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 5351         VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 5352             ZPOOL_CONFIG_SPARES, &spares, &nspares));
 5353         if (nspares != 0) {
 5354                 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 5355                     (const nvlist_t * const *)spares, nspares);
 5356                 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 5357                     &spares, &nspares));
 5358 
 5359                 /*
 5360                  * Go through and find any spares which have since been
 5361                  * repurposed as an active spare.  If this is the case, update
 5362                  * their status appropriately.
 5363                  */
 5364                 for (i = 0; i < nspares; i++) {
 5365                         guid = fnvlist_lookup_uint64(spares[i],
 5366                             ZPOOL_CONFIG_GUID);
 5367                         if (spa_spare_exists(guid, &pool, NULL) &&
 5368                             pool != 0ULL) {
 5369                                 VERIFY0(nvlist_lookup_uint64_array(spares[i],
 5370                                     ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs,
 5371                                     &vsc));
 5372                                 vs->vs_state = VDEV_STATE_CANT_OPEN;
 5373                                 vs->vs_aux = VDEV_AUX_SPARED;
 5374                         }
 5375                 }
 5376         }
 5377 }
 5378 
 5379 /*
 5380  * Add l2cache device information to the nvlist, including vdev stats.
 5381  */
 5382 static void
 5383 spa_add_l2cache(spa_t *spa, nvlist_t *config)
 5384 {
 5385         nvlist_t **l2cache;
 5386         uint_t i, j, nl2cache;
 5387         nvlist_t *nvroot;
 5388         uint64_t guid;
 5389         vdev_t *vd;
 5390         vdev_stat_t *vs;
 5391         uint_t vsc;
 5392 
 5393         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 5394 
 5395         if (spa->spa_l2cache.sav_count == 0)
 5396                 return;
 5397 
 5398         nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 5399         VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 5400             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
 5401         if (nl2cache != 0) {
 5402                 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 5403                     (const nvlist_t * const *)l2cache, nl2cache);
 5404                 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 5405                     &l2cache, &nl2cache));
 5406 
 5407                 /*
 5408                  * Update level 2 cache device stats.
 5409                  */
 5410 
 5411                 for (i = 0; i < nl2cache; i++) {
 5412                         guid = fnvlist_lookup_uint64(l2cache[i],
 5413                             ZPOOL_CONFIG_GUID);
 5414 
 5415                         vd = NULL;
 5416                         for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
 5417                                 if (guid ==
 5418                                     spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
 5419                                         vd = spa->spa_l2cache.sav_vdevs[j];
 5420                                         break;
 5421                                 }
 5422                         }
 5423                         ASSERT(vd != NULL);
 5424 
 5425                         VERIFY0(nvlist_lookup_uint64_array(l2cache[i],
 5426                             ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
 5427                         vdev_get_stats(vd, vs);
 5428                         vdev_config_generate_stats(vd, l2cache[i]);
 5429 
 5430                 }
 5431         }
 5432 }
 5433 
 5434 static void
 5435 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
 5436 {
 5437         zap_cursor_t zc;
 5438         zap_attribute_t za;
 5439 
 5440         if (spa->spa_feat_for_read_obj != 0) {
 5441                 for (zap_cursor_init(&zc, spa->spa_meta_objset,
 5442                     spa->spa_feat_for_read_obj);
 5443                     zap_cursor_retrieve(&zc, &za) == 0;
 5444                     zap_cursor_advance(&zc)) {
 5445                         ASSERT(za.za_integer_length == sizeof (uint64_t) &&
 5446                             za.za_num_integers == 1);
 5447                         VERIFY0(nvlist_add_uint64(features, za.za_name,
 5448                             za.za_first_integer));
 5449                 }
 5450                 zap_cursor_fini(&zc);
 5451         }
 5452 
 5453         if (spa->spa_feat_for_write_obj != 0) {
 5454                 for (zap_cursor_init(&zc, spa->spa_meta_objset,
 5455                     spa->spa_feat_for_write_obj);
 5456                     zap_cursor_retrieve(&zc, &za) == 0;
 5457                     zap_cursor_advance(&zc)) {
 5458                         ASSERT(za.za_integer_length == sizeof (uint64_t) &&
 5459                             za.za_num_integers == 1);
 5460                         VERIFY0(nvlist_add_uint64(features, za.za_name,
 5461                             za.za_first_integer));
 5462                 }
 5463                 zap_cursor_fini(&zc);
 5464         }
 5465 }
 5466 
 5467 static void
 5468 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
 5469 {
 5470         int i;
 5471 
 5472         for (i = 0; i < SPA_FEATURES; i++) {
 5473                 zfeature_info_t feature = spa_feature_table[i];
 5474                 uint64_t refcount;
 5475 
 5476                 if (feature_get_refcount(spa, &feature, &refcount) != 0)
 5477                         continue;
 5478 
 5479                 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
 5480         }
 5481 }
 5482 
 5483 /*
 5484  * Store a list of pool features and their reference counts in the
 5485  * config.
 5486  *
 5487  * The first time this is called on a spa, allocate a new nvlist, fetch
 5488  * the pool features and reference counts from disk, then save the list
 5489  * in the spa. In subsequent calls on the same spa use the saved nvlist
 5490  * and refresh its values from the cached reference counts.  This
 5491  * ensures we don't block here on I/O on a suspended pool so 'zpool
 5492  * clear' can resume the pool.
 5493  */
 5494 static void
 5495 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
 5496 {
 5497         nvlist_t *features;
 5498 
 5499         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 5500 
 5501         mutex_enter(&spa->spa_feat_stats_lock);
 5502         features = spa->spa_feat_stats;
 5503 
 5504         if (features != NULL) {
 5505                 spa_feature_stats_from_cache(spa, features);
 5506         } else {
 5507                 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
 5508                 spa->spa_feat_stats = features;
 5509                 spa_feature_stats_from_disk(spa, features);
 5510         }
 5511 
 5512         VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
 5513             features));
 5514 
 5515         mutex_exit(&spa->spa_feat_stats_lock);
 5516 }
 5517 
 5518 int
 5519 spa_get_stats(const char *name, nvlist_t **config,
 5520     char *altroot, size_t buflen)
 5521 {
 5522         int error;
 5523         spa_t *spa;
 5524 
 5525         *config = NULL;
 5526         error = spa_open_common(name, &spa, FTAG, NULL, config);
 5527 
 5528         if (spa != NULL) {
 5529                 /*
 5530                  * This still leaves a window of inconsistency where the spares
 5531                  * or l2cache devices could change and the config would be
 5532                  * self-inconsistent.
 5533                  */
 5534                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 5535 
 5536                 if (*config != NULL) {
 5537                         uint64_t loadtimes[2];
 5538 
 5539                         loadtimes[0] = spa->spa_loaded_ts.tv_sec;
 5540                         loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
 5541                         fnvlist_add_uint64_array(*config,
 5542                             ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2);
 5543 
 5544                         fnvlist_add_uint64(*config,
 5545                             ZPOOL_CONFIG_ERRCOUNT,
 5546                             spa_approx_errlog_size(spa));
 5547 
 5548                         if (spa_suspended(spa)) {
 5549                                 fnvlist_add_uint64(*config,
 5550                                     ZPOOL_CONFIG_SUSPENDED,
 5551                                     spa->spa_failmode);
 5552                                 fnvlist_add_uint64(*config,
 5553                                     ZPOOL_CONFIG_SUSPENDED_REASON,
 5554                                     spa->spa_suspended);
 5555                         }
 5556 
 5557                         spa_add_spares(spa, *config);
 5558                         spa_add_l2cache(spa, *config);
 5559                         spa_add_feature_stats(spa, *config);
 5560                 }
 5561         }
 5562 
 5563         /*
 5564          * We want to get the alternate root even for faulted pools, so we cheat
 5565          * and call spa_lookup() directly.
 5566          */
 5567         if (altroot) {
 5568                 if (spa == NULL) {
 5569                         mutex_enter(&spa_namespace_lock);
 5570                         spa = spa_lookup(name);
 5571                         if (spa)
 5572                                 spa_altroot(spa, altroot, buflen);
 5573                         else
 5574                                 altroot[0] = '\0';
 5575                         spa = NULL;
 5576                         mutex_exit(&spa_namespace_lock);
 5577                 } else {
 5578                         spa_altroot(spa, altroot, buflen);
 5579                 }
 5580         }
 5581 
 5582         if (spa != NULL) {
 5583                 spa_config_exit(spa, SCL_CONFIG, FTAG);
 5584                 spa_close(spa, FTAG);
 5585         }
 5586 
 5587         return (error);
 5588 }
 5589 
 5590 /*
 5591  * Validate that the auxiliary device array is well formed.  We must have an
 5592  * array of nvlists, each which describes a valid leaf vdev.  If this is an
 5593  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 5594  * specified, as long as they are well-formed.
 5595  */
 5596 static int
 5597 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
 5598     spa_aux_vdev_t *sav, const char *config, uint64_t version,
 5599     vdev_labeltype_t label)
 5600 {
 5601         nvlist_t **dev;
 5602         uint_t i, ndev;
 5603         vdev_t *vd;
 5604         int error;
 5605 
 5606         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 5607 
 5608         /*
 5609          * It's acceptable to have no devs specified.
 5610          */
 5611         if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
 5612                 return (0);
 5613 
 5614         if (ndev == 0)
 5615                 return (SET_ERROR(EINVAL));
 5616 
 5617         /*
 5618          * Make sure the pool is formatted with a version that supports this
 5619          * device type.
 5620          */
 5621         if (spa_version(spa) < version)
 5622                 return (SET_ERROR(ENOTSUP));
 5623 
 5624         /*
 5625          * Set the pending device list so we correctly handle device in-use
 5626          * checking.
 5627          */
 5628         sav->sav_pending = dev;
 5629         sav->sav_npending = ndev;
 5630 
 5631         for (i = 0; i < ndev; i++) {
 5632                 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
 5633                     mode)) != 0)
 5634                         goto out;
 5635 
 5636                 if (!vd->vdev_ops->vdev_op_leaf) {
 5637                         vdev_free(vd);
 5638                         error = SET_ERROR(EINVAL);
 5639                         goto out;
 5640                 }
 5641 
 5642                 vd->vdev_top = vd;
 5643 
 5644                 if ((error = vdev_open(vd)) == 0 &&
 5645                     (error = vdev_label_init(vd, crtxg, label)) == 0) {
 5646                         fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
 5647                             vd->vdev_guid);
 5648                 }
 5649 
 5650                 vdev_free(vd);
 5651 
 5652                 if (error &&
 5653                     (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
 5654                         goto out;
 5655                 else
 5656                         error = 0;
 5657         }
 5658 
 5659 out:
 5660         sav->sav_pending = NULL;
 5661         sav->sav_npending = 0;
 5662         return (error);
 5663 }
 5664 
 5665 static int
 5666 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
 5667 {
 5668         int error;
 5669 
 5670         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 5671 
 5672         if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 5673             &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
 5674             VDEV_LABEL_SPARE)) != 0) {
 5675                 return (error);
 5676         }
 5677 
 5678         return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 5679             &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
 5680             VDEV_LABEL_L2CACHE));
 5681 }
 5682 
 5683 static void
 5684 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
 5685     const char *config)
 5686 {
 5687         int i;
 5688 
 5689         if (sav->sav_config != NULL) {
 5690                 nvlist_t **olddevs;
 5691                 uint_t oldndevs;
 5692                 nvlist_t **newdevs;
 5693 
 5694                 /*
 5695                  * Generate new dev list by concatenating with the
 5696                  * current dev list.
 5697                  */
 5698                 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config,
 5699                     &olddevs, &oldndevs));
 5700 
 5701                 newdevs = kmem_alloc(sizeof (void *) *
 5702                     (ndevs + oldndevs), KM_SLEEP);
 5703                 for (i = 0; i < oldndevs; i++)
 5704                         newdevs[i] = fnvlist_dup(olddevs[i]);
 5705                 for (i = 0; i < ndevs; i++)
 5706                         newdevs[i + oldndevs] = fnvlist_dup(devs[i]);
 5707 
 5708                 fnvlist_remove(sav->sav_config, config);
 5709 
 5710                 fnvlist_add_nvlist_array(sav->sav_config, config,
 5711                     (const nvlist_t * const *)newdevs, ndevs + oldndevs);
 5712                 for (i = 0; i < oldndevs + ndevs; i++)
 5713                         nvlist_free(newdevs[i]);
 5714                 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
 5715         } else {
 5716                 /*
 5717                  * Generate a new dev list.
 5718                  */
 5719                 sav->sav_config = fnvlist_alloc();
 5720                 fnvlist_add_nvlist_array(sav->sav_config, config,
 5721                     (const nvlist_t * const *)devs, ndevs);
 5722         }
 5723 }
 5724 
 5725 /*
 5726  * Stop and drop level 2 ARC devices
 5727  */
 5728 void
 5729 spa_l2cache_drop(spa_t *spa)
 5730 {
 5731         vdev_t *vd;
 5732         int i;
 5733         spa_aux_vdev_t *sav = &spa->spa_l2cache;
 5734 
 5735         for (i = 0; i < sav->sav_count; i++) {
 5736                 uint64_t pool;
 5737 
 5738                 vd = sav->sav_vdevs[i];
 5739                 ASSERT(vd != NULL);
 5740 
 5741                 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 5742                     pool != 0ULL && l2arc_vdev_present(vd))
 5743                         l2arc_remove_vdev(vd);
 5744         }
 5745 }
 5746 
 5747 /*
 5748  * Verify encryption parameters for spa creation. If we are encrypting, we must
 5749  * have the encryption feature flag enabled.
 5750  */
 5751 static int
 5752 spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
 5753     boolean_t has_encryption)
 5754 {
 5755         if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
 5756             dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
 5757             !has_encryption)
 5758                 return (SET_ERROR(ENOTSUP));
 5759 
 5760         return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
 5761 }
 5762 
 5763 /*
 5764  * Pool Creation
 5765  */
 5766 int
 5767 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 5768     nvlist_t *zplprops, dsl_crypto_params_t *dcp)
 5769 {
 5770         spa_t *spa;
 5771         char *altroot = NULL;
 5772         vdev_t *rvd;
 5773         dsl_pool_t *dp;
 5774         dmu_tx_t *tx;
 5775         int error = 0;
 5776         uint64_t txg = TXG_INITIAL;
 5777         nvlist_t **spares, **l2cache;
 5778         uint_t nspares, nl2cache;
 5779         uint64_t version, obj, ndraid = 0;
 5780         boolean_t has_features;
 5781         boolean_t has_encryption;
 5782         boolean_t has_allocclass;
 5783         spa_feature_t feat;
 5784         char *feat_name;
 5785         char *poolname;
 5786         nvlist_t *nvl;
 5787 
 5788         if (props == NULL ||
 5789             nvlist_lookup_string(props, "tname", &poolname) != 0)
 5790                 poolname = (char *)pool;
 5791 
 5792         /*
 5793          * If this pool already exists, return failure.
 5794          */
 5795         mutex_enter(&spa_namespace_lock);
 5796         if (spa_lookup(poolname) != NULL) {
 5797                 mutex_exit(&spa_namespace_lock);
 5798                 return (SET_ERROR(EEXIST));
 5799         }
 5800 
 5801         /*
 5802          * Allocate a new spa_t structure.
 5803          */
 5804         nvl = fnvlist_alloc();
 5805         fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
 5806         (void) nvlist_lookup_string(props,
 5807             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 5808         spa = spa_add(poolname, nvl, altroot);
 5809         fnvlist_free(nvl);
 5810         spa_activate(spa, spa_mode_global);
 5811 
 5812         if (props && (error = spa_prop_validate(spa, props))) {
 5813                 spa_deactivate(spa);
 5814                 spa_remove(spa);
 5815                 mutex_exit(&spa_namespace_lock);
 5816                 return (error);
 5817         }
 5818 
 5819         /*
 5820          * Temporary pool names should never be written to disk.
 5821          */
 5822         if (poolname != pool)
 5823                 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
 5824 
 5825         has_features = B_FALSE;
 5826         has_encryption = B_FALSE;
 5827         has_allocclass = B_FALSE;
 5828         for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
 5829             elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
 5830                 if (zpool_prop_feature(nvpair_name(elem))) {
 5831                         has_features = B_TRUE;
 5832 
 5833                         feat_name = strchr(nvpair_name(elem), '@') + 1;
 5834                         VERIFY0(zfeature_lookup_name(feat_name, &feat));
 5835                         if (feat == SPA_FEATURE_ENCRYPTION)
 5836                                 has_encryption = B_TRUE;
 5837                         if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
 5838                                 has_allocclass = B_TRUE;
 5839                 }
 5840         }
 5841 
 5842         /* verify encryption params, if they were provided */
 5843         if (dcp != NULL) {
 5844                 error = spa_create_check_encryption_params(dcp, has_encryption);
 5845                 if (error != 0) {
 5846                         spa_deactivate(spa);
 5847                         spa_remove(spa);
 5848                         mutex_exit(&spa_namespace_lock);
 5849                         return (error);
 5850                 }
 5851         }
 5852         if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
 5853                 spa_deactivate(spa);
 5854                 spa_remove(spa);
 5855                 mutex_exit(&spa_namespace_lock);
 5856                 return (ENOTSUP);
 5857         }
 5858 
 5859         if (has_features || nvlist_lookup_uint64(props,
 5860             zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
 5861                 version = SPA_VERSION;
 5862         }
 5863         ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 5864 
 5865         spa->spa_first_txg = txg;
 5866         spa->spa_uberblock.ub_txg = txg - 1;
 5867         spa->spa_uberblock.ub_version = version;
 5868         spa->spa_ubsync = spa->spa_uberblock;
 5869         spa->spa_load_state = SPA_LOAD_CREATE;
 5870         spa->spa_removing_phys.sr_state = DSS_NONE;
 5871         spa->spa_removing_phys.sr_removing_vdev = -1;
 5872         spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
 5873         spa->spa_indirect_vdevs_loaded = B_TRUE;
 5874 
 5875         /*
 5876          * Create "The Godfather" zio to hold all async IOs
 5877          */
 5878         spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 5879             KM_SLEEP);
 5880         for (int i = 0; i < max_ncpus; i++) {
 5881                 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 5882                     ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 5883                     ZIO_FLAG_GODFATHER);
 5884         }
 5885 
 5886         /*
 5887          * Create the root vdev.
 5888          */
 5889         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 5890 
 5891         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
 5892 
 5893         ASSERT(error != 0 || rvd != NULL);
 5894         ASSERT(error != 0 || spa->spa_root_vdev == rvd);
 5895 
 5896         if (error == 0 && !zfs_allocatable_devs(nvroot))
 5897                 error = SET_ERROR(EINVAL);
 5898 
 5899         if (error == 0 &&
 5900             (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
 5901             (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
 5902             (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
 5903                 /*
 5904                  * instantiate the metaslab groups (this will dirty the vdevs)
 5905                  * we can no longer error exit past this point
 5906                  */
 5907                 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
 5908                         vdev_t *vd = rvd->vdev_child[c];
 5909 
 5910                         vdev_metaslab_set_size(vd);
 5911                         vdev_expand(vd, txg);
 5912                 }
 5913         }
 5914 
 5915         spa_config_exit(spa, SCL_ALL, FTAG);
 5916 
 5917         if (error != 0) {
 5918                 spa_unload(spa);
 5919                 spa_deactivate(spa);
 5920                 spa_remove(spa);
 5921                 mutex_exit(&spa_namespace_lock);
 5922                 return (error);
 5923         }
 5924 
 5925         /*
 5926          * Get the list of spares, if specified.
 5927          */
 5928         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 5929             &spares, &nspares) == 0) {
 5930                 spa->spa_spares.sav_config = fnvlist_alloc();
 5931                 fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 5932                     ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 5933                     nspares);
 5934                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 5935                 spa_load_spares(spa);
 5936                 spa_config_exit(spa, SCL_ALL, FTAG);
 5937                 spa->spa_spares.sav_sync = B_TRUE;
 5938         }
 5939 
 5940         /*
 5941          * Get the list of level 2 cache devices, if specified.
 5942          */
 5943         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 5944             &l2cache, &nl2cache) == 0) {
 5945                 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config,
 5946                     NV_UNIQUE_NAME, KM_SLEEP));
 5947                 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 5948                     ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
 5949                     nl2cache);
 5950                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 5951                 spa_load_l2cache(spa);
 5952                 spa_config_exit(spa, SCL_ALL, FTAG);
 5953                 spa->spa_l2cache.sav_sync = B_TRUE;
 5954         }
 5955 
 5956         spa->spa_is_initializing = B_TRUE;
 5957         spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
 5958         spa->spa_is_initializing = B_FALSE;
 5959 
 5960         /*
 5961          * Create DDTs (dedup tables).
 5962          */
 5963         ddt_create(spa);
 5964 
 5965         spa_update_dspace(spa);
 5966 
 5967         tx = dmu_tx_create_assigned(dp, txg);
 5968 
 5969         /*
 5970          * Create the pool's history object.
 5971          */
 5972         if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history)
 5973                 spa_history_create_obj(spa, tx);
 5974 
 5975         spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
 5976         spa_history_log_version(spa, "create", tx);
 5977 
 5978         /*
 5979          * Create the pool config object.
 5980          */
 5981         spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
 5982             DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
 5983             DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
 5984 
 5985         if (zap_add(spa->spa_meta_objset,
 5986             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
 5987             sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
 5988                 cmn_err(CE_PANIC, "failed to add pool config");
 5989         }
 5990 
 5991         if (zap_add(spa->spa_meta_objset,
 5992             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
 5993             sizeof (uint64_t), 1, &version, tx) != 0) {
 5994                 cmn_err(CE_PANIC, "failed to add pool version");
 5995         }
 5996 
 5997         /* Newly created pools with the right version are always deflated. */
 5998         if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
 5999                 spa->spa_deflate = TRUE;
 6000                 if (zap_add(spa->spa_meta_objset,
 6001                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 6002                     sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
 6003                         cmn_err(CE_PANIC, "failed to add deflate");
 6004                 }
 6005         }
 6006 
 6007         /*
 6008          * Create the deferred-free bpobj.  Turn off compression
 6009          * because sync-to-convergence takes longer if the blocksize
 6010          * keeps changing.
 6011          */
 6012         obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
 6013         dmu_object_set_compress(spa->spa_meta_objset, obj,
 6014             ZIO_COMPRESS_OFF, tx);
 6015         if (zap_add(spa->spa_meta_objset,
 6016             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
 6017             sizeof (uint64_t), 1, &obj, tx) != 0) {
 6018                 cmn_err(CE_PANIC, "failed to add bpobj");
 6019         }
 6020         VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
 6021             spa->spa_meta_objset, obj));
 6022 
 6023         /*
 6024          * Generate some random noise for salted checksums to operate on.
 6025          */
 6026         (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
 6027             sizeof (spa->spa_cksum_salt.zcs_bytes));
 6028 
 6029         /*
 6030          * Set pool properties.
 6031          */
 6032         spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 6033         spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 6034         spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
 6035         spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
 6036         spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
 6037         spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
 6038 
 6039         if (props != NULL) {
 6040                 spa_configfile_set(spa, props, B_FALSE);
 6041                 spa_sync_props(props, tx);
 6042         }
 6043 
 6044         for (int i = 0; i < ndraid; i++)
 6045                 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
 6046 
 6047         dmu_tx_commit(tx);
 6048 
 6049         spa->spa_sync_on = B_TRUE;
 6050         txg_sync_start(dp);
 6051         mmp_thread_start(spa);
 6052         txg_wait_synced(dp, txg);
 6053 
 6054         spa_spawn_aux_threads(spa);
 6055 
 6056         spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
 6057 
 6058         /*
 6059          * Don't count references from objsets that are already closed
 6060          * and are making their way through the eviction process.
 6061          */
 6062         spa_evicting_os_wait(spa);
 6063         spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
 6064         spa->spa_load_state = SPA_LOAD_NONE;
 6065 
 6066         spa_import_os(spa);
 6067 
 6068         mutex_exit(&spa_namespace_lock);
 6069 
 6070         return (0);
 6071 }
 6072 
 6073 /*
 6074  * Import a non-root pool into the system.
       *
       * pool   - name under which the pool is added to the namespace.
       * config - pool configuration handed to spa_add(); the load policy and
       *          the vdev tree are read from it, and ZPOOL_CONFIG_LOAD_INFO
       *          is added to it after the load attempt.
       * props  - property list; altroot and readonly are looked up in it,
       *          and it is applied with spa_prop_set() when the pool is
       *          writeable.  Most uses below are guarded by a NULL check.
       *          NOTE(review): the altroot/readonly lookups run even when
       *          props is NULL; presumably nvlist_lookup_*() tolerates a
       *          NULL list -- confirm.
       * flags  - stored in spa_import_flags; ZFS_IMPORT_VERBATIM makes the
       *          function return right after updating the cachefile.
       *
       * Returns 0 on success, EEXIST if spa_lookup() already finds 'pool',
       * or the error from spa_load_best()/spa_prop_set(), in which case the
       * new spa_t is unloaded, deactivated and removed before returning.
 6075  */
 6076 int
 6077 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 6078 {
 6079         spa_t *spa;
 6080         char *altroot = NULL;
 6081         spa_load_state_t state = SPA_LOAD_IMPORT;
 6082         zpool_load_policy_t policy;
 6083         spa_mode_t mode = spa_mode_global;
 6084         uint64_t readonly = B_FALSE;
 6085         int error;
 6086         nvlist_t *nvroot;
 6087         nvlist_t **spares, **l2cache;
 6088         uint_t nspares, nl2cache;
 6089 
 6090         /*
 6091          * If a pool with this name exists, return failure.
 6092          */
 6093         mutex_enter(&spa_namespace_lock);
 6094         if (spa_lookup(pool) != NULL) {
 6095                 mutex_exit(&spa_namespace_lock);
 6096                 return (SET_ERROR(EEXIST));
 6097         }
 6098 
 6099         /*
 6100          * Create and initialize the spa structure.
 6101          */
 6102         (void) nvlist_lookup_string(props,
 6103             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 6104         (void) nvlist_lookup_uint64(props,
 6105             zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
              /* A readonly property overrides the global open mode. */
 6106         if (readonly)
 6107                 mode = SPA_MODE_READ;
 6108         spa = spa_add(pool, config, altroot);
 6109         spa->spa_import_flags = flags;
 6110 
 6111         /*
 6112          * Verbatim import - Take a pool and insert it into the namespace
 6113          * as if it had been loaded at boot.
 6114          */
 6115         if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
 6116                 if (props != NULL)
 6117                         spa_configfile_set(spa, props, B_FALSE);
 6118 
 6119                 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
 6120                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 6121                 zfs_dbgmsg("spa_import: verbatim import of %s", pool);
 6122                 mutex_exit(&spa_namespace_lock);
 6123                 return (0);
 6124         }
 6125 
 6126         spa_activate(spa, mode);
 6127 
 6128         /*
 6129          * Don't start async tasks until we know everything is healthy.
 6130          */
 6131         spa_async_suspend(spa);
 6132 
              /* A rewind request in the load policy selects recovery mode. */
 6133         zpool_get_load_policy(config, &policy);
 6134         if (policy.zlp_rewind & ZPOOL_DO_REWIND)
 6135                 state = SPA_LOAD_RECOVER;
 6136 
 6137         spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
 6138 
 6139         if (state != SPA_LOAD_RECOVER) {
 6140                 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 6141                 zfs_dbgmsg("spa_import: importing %s", pool);
 6142         } else {
 6143                 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
 6144                     "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
 6145         }
 6146         error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
 6147 
 6148         /*
 6149          * Propagate anything learned while loading the pool and pass it
 6150          * back to caller (i.e. rewind info, missing devices, etc).
 6151          */
 6152         fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info);
 6153 
 6154         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 6155         /*
 6156          * Toss any existing sparelist, as it doesn't have any validity
 6157          * anymore, and conflicts with spa_has_spare().
 6158          */
 6159         if (spa->spa_spares.sav_config) {
 6160                 nvlist_free(spa->spa_spares.sav_config);
 6161                 spa->spa_spares.sav_config = NULL;
 6162                 spa_load_spares(spa);
 6163         }
 6164         if (spa->spa_l2cache.sav_config) {
 6165                 nvlist_free(spa->spa_l2cache.sav_config);
 6166                 spa->spa_l2cache.sav_config = NULL;
 6167                 spa_load_l2cache(spa);
 6168         }
 6169 
              /*
               * The vdev tree is taken from the caller's config; it is used
               * further down to override the spare and l2cache lists.
               */
 6170         nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 6171         spa_config_exit(spa, SCL_ALL, FTAG);
 6172 
 6173         if (props != NULL)
 6174                 spa_configfile_set(spa, props, B_FALSE);
 6175 
              /*
               * Give up if the load failed, or if the pool is writeable and
               * the supplied properties cannot be set.  Async tasks are
               * still suspended on this path; the spa_t is torn down
               * instead of being resumed.
               */
 6176         if (error != 0 || (props && spa_writeable(spa) &&
 6177             (error = spa_prop_set(spa, props)))) {
 6178                 spa_unload(spa);
 6179                 spa_deactivate(spa);
 6180                 spa_remove(spa);
 6181                 mutex_exit(&spa_namespace_lock);
 6182                 return (error);
 6183         }
 6184 
 6185         spa_async_resume(spa);
 6186 
 6187         /*
 6188          * Override any spares and level 2 cache devices as specified by
 6189          * the user, as these may have correct device names/devids, etc.
 6190          */
 6191         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 6192             &spares, &nspares) == 0) {
 6193                 if (spa->spa_spares.sav_config)
 6194                         fnvlist_remove(spa->spa_spares.sav_config,
 6195                             ZPOOL_CONFIG_SPARES);
 6196                 else
 6197                         spa->spa_spares.sav_config = fnvlist_alloc();
 6198                 fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 6199                     ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 6200                     nspares);
 6201                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 6202                 spa_load_spares(spa);
 6203                 spa_config_exit(spa, SCL_ALL, FTAG);
 6204                 spa->spa_spares.sav_sync = B_TRUE;
 6205         }
 6206         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 6207             &l2cache, &nl2cache) == 0) {
 6208                 if (spa->spa_l2cache.sav_config)
 6209                         fnvlist_remove(spa->spa_l2cache.sav_config,
 6210                             ZPOOL_CONFIG_L2CACHE);
 6211                 else
 6212                         spa->spa_l2cache.sav_config = fnvlist_alloc();
 6213                 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 6214                     ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
 6215                     nl2cache);
 6216                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 6217                 spa_load_l2cache(spa);
 6218                 spa_config_exit(spa, SCL_ALL, FTAG);
 6219                 spa->spa_l2cache.sav_sync = B_TRUE;
 6220         }
 6221 
 6222         /*
 6223          * Check for any removed devices.
 6224          */
 6225         if (spa->spa_autoreplace) {
 6226                 spa_aux_check_removed(&spa->spa_spares);
 6227                 spa_aux_check_removed(&spa->spa_l2cache);
 6228         }
 6229 
 6230         if (spa_writeable(spa)) {
 6231                 /*
 6232                  * Update the config cache to include the newly-imported pool.
 6233                  */
 6234                 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 6235         }
 6236 
 6237         /*
 6238          * It's possible that the pool was expanded while it was exported.
 6239          * We kick off an async task to handle this for us.
 6240          */
 6241         spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 6242 
 6243         spa_history_log_version(spa, "import", NULL);
 6244 
 6245         spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 6246 
 6247         mutex_exit(&spa_namespace_lock);
 6248 
              /* The remaining steps run without spa_namespace_lock held. */
 6249         zvol_create_minors_recursive(pool);
 6250 
 6251         spa_import_os(spa);
 6252 
 6253         return (0);
 6254 }
 6255 
 6256 nvlist_t *
 6257 spa_tryimport(nvlist_t *tryconfig)
 6258 {
 6259         nvlist_t *config = NULL;
 6260         char *poolname, *cachefile;
 6261         spa_t *spa;
 6262         uint64_t state;
 6263         int error;
 6264         zpool_load_policy_t policy;
 6265 
 6266         if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
 6267                 return (NULL);
 6268 
 6269         if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
 6270                 return (NULL);
 6271 
 6272         /*
 6273          * Create and initialize the spa structure.
 6274          */
 6275         mutex_enter(&spa_namespace_lock);
 6276         spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
 6277         spa_activate(spa, SPA_MODE_READ);
 6278 
 6279         /*
 6280          * Rewind pool if a max txg was provided.
 6281          */
 6282         zpool_get_load_policy(spa->spa_config, &policy);
 6283         if (policy.zlp_txg != UINT64_MAX) {
 6284                 spa->spa_load_max_txg = policy.zlp_txg;
 6285                 spa->spa_extreme_rewind = B_TRUE;
 6286                 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
 6287                     poolname, (longlong_t)policy.zlp_txg);
 6288         } else {
 6289                 zfs_dbgmsg("spa_tryimport: importing %s", poolname);
 6290         }
 6291 
 6292         if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
 6293             == 0) {
 6294                 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
 6295                 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
 6296         } else {
 6297                 spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
 6298         }
 6299 
 6300         error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
 6301 
 6302         /*
 6303          * If 'tryconfig' was at least parsable, return the current config.
 6304          */
 6305         if (spa->spa_root_vdev != NULL) {
 6306                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 6307                 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
 6308                 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
 6309                 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 6310                     spa->spa_uberblock.ub_timestamp);
 6311                 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 6312                     spa->spa_load_info);
 6313                 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
 6314                     spa->spa_errata);
 6315 
 6316                 /*
 6317                  * If the bootfs property exists on this pool then we
 6318                  * copy it out so that external consumers can tell which
 6319                  * pools are bootable.
 6320                  */
 6321                 if ((!error || error == EEXIST) && spa->spa_bootfs) {
 6322                         char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 6323 
 6324                         /*
 6325                          * We have to play games with the name since the
 6326                          * pool was opened as TRYIMPORT_NAME.
 6327                          */
 6328                         if (dsl_dsobj_to_dsname(spa_name(spa),
 6329                             spa->spa_bootfs, tmpname) == 0) {
 6330                                 char *cp;
 6331                                 char *dsname;
 6332 
 6333                                 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 6334 
 6335                                 cp = strchr(tmpname, '/');
 6336                                 if (cp == NULL) {
 6337                                         (void) strlcpy(dsname, tmpname,
 6338                                             MAXPATHLEN);
 6339                                 } else {
 6340                                         (void) snprintf(dsname, MAXPATHLEN,
 6341                                             "%s/%s", poolname, ++cp);
 6342                                 }
 6343                                 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS,
 6344                                     dsname);
 6345                                 kmem_free(dsname, MAXPATHLEN);
 6346                         }
 6347                         kmem_free(tmpname, MAXPATHLEN);
 6348                 }
 6349 
 6350                 /*
 6351                  * Add the list of hot spares and level 2 cache devices.
 6352                  */
 6353                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 6354                 spa_add_spares(spa, config);
 6355                 spa_add_l2cache(spa, config);
 6356                 spa_config_exit(spa, SCL_CONFIG, FTAG);
 6357         }
 6358 
 6359         spa_unload(spa);
 6360         spa_deactivate(spa);
 6361         spa_remove(spa);
 6362         mutex_exit(&spa_namespace_lock);
 6363 
 6364         return (config);
 6365 }
 6366 
 6367 /*
 6368  * Pool export/destroy
 6369  *
 6370  * The act of destroying or exporting a pool is very simple.  We make sure there
 6371  * is no more pending I/O and any references to the pool are gone.  Then, we
 6372  * update the pool state and sync all the labels to disk, removing the
 6373  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
 6374  * we don't sync the labels or remove the configuration cache.
       *
       * pool      - name of the pool to look up.
       * new_state - target state; the callers in this file pass
       *             POOL_STATE_DESTROYED, POOL_STATE_EXPORTED or
       *             POOL_STATE_UNINITIALIZED (reset).
       * oldconfig - optional; cleared to NULL on entry and, when the pool has
       *             a spa_config, set to a duplicate of it (presumably owned
       *             by the caller -- confirm).
       * force     - skips the active-shared-spare check for an export.
       * hardforce - skips dirtying the labels, setting spa_final_txg and
       *             rewriting the cachefile.
       *
       * Returns 0 on success, or EROFS, ENOENT, ZFS_ERR_EXPORT_IN_PROGRESS,
       * EBUSY or EXDEV as set on the failure paths below.
 6375  */
 6376 static int
 6377 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
 6378     boolean_t force, boolean_t hardforce)
 6379 {
 6380         int error;
 6381         spa_t *spa;
 6382 
 6383         if (oldconfig)
 6384                 *oldconfig = NULL;
 6385 
 6386         if (!(spa_mode_global & SPA_MODE_WRITE))
 6387                 return (SET_ERROR(EROFS));
 6388 
 6389         mutex_enter(&spa_namespace_lock);
 6390         if ((spa = spa_lookup(pool)) == NULL) {
 6391                 mutex_exit(&spa_namespace_lock);
 6392                 return (SET_ERROR(ENOENT));
 6393         }
 6394 
 6395         if (spa->spa_is_exporting) {
 6396                 /* the pool is being exported by another thread */
 6397                 mutex_exit(&spa_namespace_lock);
 6398                 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
 6399         }
 6400         spa->spa_is_exporting = B_TRUE;
 6401 
 6402         /*
 6403          * Put a hold on the pool, drop the namespace lock, stop async tasks,
 6404          * reacquire the namespace lock, and see if we can export.
 6405          */
 6406         spa_open_ref(spa, FTAG);
 6407         mutex_exit(&spa_namespace_lock);
 6408         spa_async_suspend(spa);
 6409         if (spa->spa_zvol_taskq) {
 6410                 zvol_remove_minors(spa, spa_name(spa), B_TRUE);
 6411                 taskq_wait(spa->spa_zvol_taskq);
 6412         }
 6413         mutex_enter(&spa_namespace_lock);
 6414         spa_close(spa, FTAG);
 6415 
              /* An uninitialized pool skips the sync and refcount checks. */
 6416         if (spa->spa_state == POOL_STATE_UNINITIALIZED)
 6417                 goto export_spa;
 6418         /*
 6419          * The pool will be in core if it's openable, in which case we can
 6420          * modify its state.  Objsets may be open only because they're dirty,
 6421          * so we have to force it to sync before checking spa_refcnt.
 6422          */
 6423         if (spa->spa_sync_on) {
 6424                 txg_wait_synced(spa->spa_dsl_pool, 0);
 6425                 spa_evicting_os_wait(spa);
 6426         }
 6427 
 6428         /*
 6429          * A pool cannot be exported or destroyed if there are active
 6430          * references.  If we are resetting a pool, allow references by
 6431          * fault injection handlers.
               *
               * NOTE(review): the condition below does not look at new_state,
               * so a non-zero spa_inject_ref yields EBUSY for a reset as
               * well; the sentence about resets may be stale -- confirm.
 6432          */
 6433         if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
 6434                 error = SET_ERROR(EBUSY);
 6435                 goto fail;
 6436         }
 6437 
 6438         if (spa->spa_sync_on) {
 6439                 vdev_t *rvd = spa->spa_root_vdev;
 6440                 /*
 6441                  * A pool cannot be exported if it has an active shared spare.
 6442                  * This is to prevent other pools stealing the active spare
 6443                  * from an exported pool. At user's own will, such pool can
 6444                  * be forcedly exported.
 6445                  */
 6446                 if (!force && new_state == POOL_STATE_EXPORTED &&
 6447                     spa_has_active_shared_spare(spa)) {
 6448                         error = SET_ERROR(EXDEV);
 6449                         goto fail;
 6450                 }
 6451 
 6452                 /*
 6453                  * We're about to export or destroy this pool. Make sure
 6454                  * we stop all initialization and trim activity here before
 6455                  * we set the spa_final_txg. This will ensure that all
 6456                  * dirty data resulting from the initialization is
 6457                  * committed to disk before we unload the pool.
 6458                  */
 6459                 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
 6460                 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
 6461                 vdev_autotrim_stop_all(spa);
 6462                 vdev_rebuild_stop_all(spa);
 6463 
 6464                 /*
 6465                  * We want this to be reflected on every label,
 6466                  * so mark them all dirty.  spa_unload() will do the
 6467                  * final sync that pushes these changes out.
 6468                  */
 6469                 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 6470                         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 6471                         spa->spa_state = new_state;
 6472                         vdev_config_dirty(rvd);
 6473                         spa_config_exit(spa, SCL_ALL, FTAG);
 6474                 }
 6475 
 6476                 /*
 6477                  * If the log space map feature is enabled and the pool is
 6478                  * getting exported (but not destroyed), we want to spend some
 6479                  * time flushing as many metaslabs as we can in an attempt to
 6480                  * destroy log space maps and save import time. This has to be
 6481                  * done before we set the spa_final_txg, otherwise
 6482                  * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
 6483                  * spa_should_flush_logs_on_unload() should be called after
 6484                  * spa_state has been set to the new_state.
 6485                  */
 6486                 if (spa_should_flush_logs_on_unload(spa))
 6487                         spa_unload_log_sm_flush_all(spa);
 6488 
 6489                 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 6490                         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 6491                         spa->spa_final_txg = spa_last_synced_txg(spa) +
 6492                             TXG_DEFER_SIZE + 1;
 6493                         spa_config_exit(spa, SCL_ALL, FTAG);
 6494                 }
 6495         }
 6496 
      /*
       * Common tail; also reached directly for a pool that was found in
       * POOL_STATE_UNINITIALIZED above.
       */
 6497 export_spa:
 6498         spa_export_os(spa);
 6499 
 6500         if (new_state == POOL_STATE_DESTROYED)
 6501                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
 6502         else if (new_state == POOL_STATE_EXPORTED)
 6503                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
 6504 
 6505         if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 6506                 spa_unload(spa);
 6507                 spa_deactivate(spa);
 6508         }
 6509 
              /* Hand a copy of the config back before the spa_t may go away. */
 6510         if (oldconfig && spa->spa_config)
 6511                 *oldconfig = fnvlist_dup(spa->spa_config);
 6512 
 6513         if (new_state != POOL_STATE_UNINITIALIZED) {
 6514                 if (!hardforce)
 6515                         spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
 6516                 spa_remove(spa);
 6517         } else {
 6518                 /*
 6519                  * If spa_remove() is not called for this spa_t and
 6520                  * there is any possibility that it can be reused,
 6521                  * we make sure to reset the exporting flag.
 6522                  */
 6523                 spa->spa_is_exporting = B_FALSE;
 6524         }
 6525 
 6526         mutex_exit(&spa_namespace_lock);
 6527         return (0);
 6528 
      /*
       * Failure after the pool was marked as exporting: clear the marker,
       * resume the async tasks suspended above and report 'error'.
       */
 6529 fail:
 6530         spa->spa_is_exporting = B_FALSE;
 6531         spa_async_resume(spa);
 6532         mutex_exit(&spa_namespace_lock);
 6533         return (error);
 6534 }
 6535 
 6536 /*
 6537  * Destroy a storage pool.
 6538  */
 6539 int
 6540 spa_destroy(const char *pool)
 6541 {
 6542         return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
 6543             B_FALSE, B_FALSE));
 6544 }
 6545 
 6546 /*
 6547  * Export a storage pool.
 6548  */
 6549 int
 6550 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
 6551     boolean_t hardforce)
 6552 {
 6553         return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
 6554             force, hardforce));
 6555 }
 6556 
 6557 /*
 6558  * Similar to spa_export(), this unloads the spa_t without actually removing it
 6559  * from the namespace in any way.
 6560  */
 6561 int
 6562 spa_reset(const char *pool)
 6563 {
 6564         return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
 6565             B_FALSE, B_FALSE));
 6566 }
 6567 
 6568 /*
 6569  * ==========================================================================
 6570  * Device manipulation
 6571  * ==========================================================================
 6572  */
 6573 
 6574 /*
 6575  * This is called as a synctask to increment the draid feature flag
 6576  */
 6577 static void
 6578 spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
 6579 {
 6580         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 6581         int draid = (int)(uintptr_t)arg;
 6582 
 6583         for (int c = 0; c < draid; c++)
 6584                 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
 6585 }
 6586 
 6587 /*
 6588  * Add a device to a storage pool.
       *
       * spa    - pool to extend; asserted to be writeable.
       * nvroot - nvlist describing the new top-level vdevs and, optionally,
       *          ZPOOL_CONFIG_SPARES / ZPOOL_CONFIG_L2CACHE arrays.
       *
       * Every exit taken after spa_vdev_enter() goes through
       * spa_vdev_exit().  Returns 0 on success; on failure returns what
       * spa_vdev_exit() yields for the error passed to it (EINVAL when
       * nvroot carries no children, spares or l2cache devices, or when the
       * new vdevs are not allowed during a device removal; otherwise the
       * error from the failing helper).
 6589  */
 6590 int
 6591 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 6592 {
 6593         uint64_t txg, ndraid = 0;
 6594         int error;
 6595         vdev_t *rvd = spa->spa_root_vdev;
 6596         vdev_t *vd, *tvd;
 6597         nvlist_t **spares, **l2cache;
 6598         uint_t nspares, nl2cache;
 6599 
 6600         ASSERT(spa_writeable(spa));
 6601 
 6602         txg = spa_vdev_enter(spa);
 6603 
              /* Parse nvroot into a temporary vdev tree rooted at 'vd'. */
 6604         if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
 6605             VDEV_ALLOC_ADD)) != 0)
 6606                 return (spa_vdev_exit(spa, NULL, txg, error));
 6607 
 6608         spa->spa_pending_vdev = vd;     /* spa_vdev_exit() will clear this */
 6609 
              /* A missing array simply means there are none of that kind. */
 6610         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
 6611             &nspares) != 0)
 6612                 nspares = 0;
 6613 
 6614         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
 6615             &nl2cache) != 0)
 6616                 nl2cache = 0;
 6617 
              /* Nothing at all to add is an error. */
 6618         if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
 6619                 return (spa_vdev_exit(spa, vd, txg, EINVAL));
 6620 
 6621         if (vd->vdev_children != 0 &&
 6622             (error = vdev_create(vd, txg, B_FALSE)) != 0) {
 6623                 return (spa_vdev_exit(spa, vd, txg, error));
 6624         }
 6625 
 6626         /*
 6627          * The virtual dRAID spares must be added after vdev tree is created
 6628          * and the vdev guids are generated.  The guid of their associated
 6629          * dRAID is stored in the config and used when opening the spare.
               *
               * When dRAID vdevs were found (ndraid > 0) the spares array is
               * looked up again, since nspares/spares are re-read from nvroot
               * after vdev_draid_spare_create() has run.
 6630          */
 6631         if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
 6632             rvd->vdev_children)) == 0) {
 6633                 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
 6634                     ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
 6635                         nspares = 0;
 6636         } else {
 6637                 return (spa_vdev_exit(spa, vd, txg, error));
 6638         }
 6639 
 6640         /*
 6641          * We must validate the spares and l2cache devices after checking the
 6642          * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
 6643          */
 6644         if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
 6645                 return (spa_vdev_exit(spa, vd, txg, error));
 6646 
 6647         /*
 6648          * If we are in the middle of a device removal, we can only add
 6649          * devices which match the existing devices in the pool.
 6650          * If we are in the middle of a removal, or have some indirect
 6651          * vdevs, we can not add raidz or dRAID top levels.
 6652          */
 6653         if (spa->spa_vdev_removal != NULL ||
 6654             spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
 6655                 for (int c = 0; c < vd->vdev_children; c++) {
 6656                         tvd = vd->vdev_child[c];
                              /* The ashift check applies only mid-removal. */
 6657                         if (spa->spa_vdev_removal != NULL &&
 6658                             tvd->vdev_ashift != spa->spa_max_ashift) {
 6659                                 return (spa_vdev_exit(spa, vd, txg, EINVAL));
 6660                         }
 6661                         /* Fail if top level vdev is raidz or a dRAID */
 6662                         if (vdev_get_nparity(tvd) != 0)
 6663                                 return (spa_vdev_exit(spa, vd, txg, EINVAL));
 6664 
 6665                         /*
 6666                          * Need the top level mirror to be
 6667                          * a mirror of leaf vdevs only
 6668                          */
 6669                         if (tvd->vdev_ops == &vdev_mirror_ops) {
 6670                                 for (uint64_t cid = 0;
 6671                                     cid < tvd->vdev_children; cid++) {
 6672                                         vdev_t *cvd = tvd->vdev_child[cid];
 6673                                         if (!cvd->vdev_ops->vdev_op_leaf) {
 6674                                                 return (spa_vdev_exit(spa, vd,
 6675                                                     txg, EINVAL));
 6676                                         }
 6677                                 }
 6678                         }
 6679                 }
 6680         }
 6681 
              /*
               * Move each new top-level vdev from the temporary root 'vd' to
               * the pool's root vdev, giving it the next free child id, and
               * mark its config dirty.
               */
 6682         for (int c = 0; c < vd->vdev_children; c++) {
 6683                 tvd = vd->vdev_child[c];
 6684                 vdev_remove_child(vd, tvd);
 6685                 tvd->vdev_id = rvd->vdev_children;
 6686                 vdev_add_child(rvd, tvd);
 6687                 vdev_config_dirty(tvd);
 6688         }
 6689 
 6690         if (nspares != 0) {
 6691                 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
 6692                     ZPOOL_CONFIG_SPARES);
 6693                 spa_load_spares(spa);
 6694                 spa->spa_spares.sav_sync = B_TRUE;
 6695         }
 6696 
 6697         if (nl2cache != 0) {
 6698                 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
 6699                     ZPOOL_CONFIG_L2CACHE);
 6700                 spa_load_l2cache(spa);
 6701                 spa->spa_l2cache.sav_sync = B_TRUE;
 6702         }
 6703 
 6704         /*
 6705          * We can't increment a feature while holding spa_vdev so we
 6706          * have to do it in a synctask.
               *
               * The dRAID count travels in the synctask's void * argument and
               * is decoded again by spa_draid_feature_incr().
 6707          */
 6708         if (ndraid != 0) {
 6709                 dmu_tx_t *tx;
 6710 
 6711                 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 6712                 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
 6713                     (void *)(uintptr_t)ndraid, tx);
 6714                 dmu_tx_commit(tx);
 6715         }
 6716 
 6717         /*
 6718          * We have to be careful when adding new vdevs to an existing pool.
 6719          * If other threads start allocating from these vdevs before we
 6720          * sync the config cache, and we lose power, then upon reboot we may
 6721          * fail to open the pool because there are DVAs that the config cache
 6722          * can't translate.  Therefore, we first add the vdevs without
 6723          * initializing metaslabs; sync the config cache (via spa_vdev_exit());
 6724          * and then let spa_config_update() initialize the new metaslabs.
 6725          *
 6726          * spa_load() checks for added-but-not-initialized vdevs, so that
 6727          * if we lose power at any point in this sequence, the remaining
 6728          * steps will be completed the next time we load the pool.
 6729          */
 6730         (void) spa_vdev_exit(spa, vd, txg, 0);
 6731 
              /* The config update and event are issued under the namespace lock. */
 6732         mutex_enter(&spa_namespace_lock);
 6733         spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 6734         spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
 6735         mutex_exit(&spa_namespace_lock);
 6736 
 6737         return (0);
 6738 }
 6739 
 6740 /*
 6741  * Attach a device to a mirror.  The arguments are the path to any device
 6742  * in the mirror, and the nvroot for the new device.  If the path specifies
 6743  * a device that is not mirrored, we automatically insert the mirror vdev.
 6744  *
 6745  * If 'replacing' is specified, the new device is intended to replace the
 6746  * existing device; in this case the two devices are made into their own
 6747  * mirror using the 'replacing' vdev, which is functionally identical to
 6748  * the mirror vdev (it actually reuses all the same ops) but has a few
 6749  * extra rules: you can't attach to it after it's been created, and upon
 6750  * completion of resilvering, the first disk (the one being replaced)
 6751  * is automatically detached.
 6752  *
 6753  * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
 6754  * should be performed instead of traditional healing reconstruction.  From
 6755  * an administrator's perspective these are both resilver operations.
 6756  */
 6757 int
 6758 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
 6759     int rebuild)
 6760 {
 6761         uint64_t txg, dtl_max_txg;
 6762         vdev_t *rvd = spa->spa_root_vdev;
 6763         vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
 6764         vdev_ops_t *pvops;
 6765         char *oldvdpath, *newvdpath;
 6766         int newvd_isspare;
 6767         int error;
 6768 
 6769         ASSERT(spa_writeable(spa));
 6770 
 6771         txg = spa_vdev_enter(spa);
 6772 
 6773         oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
 6774 
 6775         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 6776         if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 6777                 error = (spa_has_checkpoint(spa)) ?
 6778                     ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 6779                 return (spa_vdev_exit(spa, NULL, txg, error));
 6780         }
 6781 
 6782         if (rebuild) {
 6783                 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
 6784                         return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 6785 
 6786                 if (dsl_scan_resilvering(spa_get_dsl(spa)))
 6787                         return (spa_vdev_exit(spa, NULL, txg,
 6788                             ZFS_ERR_RESILVER_IN_PROGRESS));
 6789         } else {
 6790                 if (vdev_rebuild_active(rvd))
 6791                         return (spa_vdev_exit(spa, NULL, txg,
 6792                             ZFS_ERR_REBUILD_IN_PROGRESS));
 6793         }
 6794 
 6795         if (spa->spa_vdev_removal != NULL)
 6796                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 6797 
 6798         if (oldvd == NULL)
 6799                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 6800 
 6801         if (!oldvd->vdev_ops->vdev_op_leaf)
 6802                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 6803 
 6804         pvd = oldvd->vdev_parent;
 6805 
 6806         if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
 6807             VDEV_ALLOC_ATTACH) != 0)
 6808                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 6809 
 6810         if (newrootvd->vdev_children != 1)
 6811                 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 6812 
 6813         newvd = newrootvd->vdev_child[0];
 6814 
 6815         if (!newvd->vdev_ops->vdev_op_leaf)
 6816                 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 6817 
 6818         if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
 6819                 return (spa_vdev_exit(spa, newrootvd, txg, error));
 6820 
 6821         /*
 6822          * log, dedup and special vdevs should not be replaced by spares.
 6823          */
 6824         if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
 6825             oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
 6826                 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 6827         }
 6828 
 6829         /*
 6830          * A dRAID spare can only replace a child of its parent dRAID vdev.
 6831          */
 6832         if (newvd->vdev_ops == &vdev_draid_spare_ops &&
 6833             oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
 6834                 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 6835         }
 6836 
 6837         if (rebuild) {
 6838                 /*
 6839                  * For rebuilds, the top vdev must support reconstruction
 6840                  * using only space maps.  This means the only allowable
 6841                  * vdevs types are the root vdev, a mirror, or dRAID.
 6842                  */
 6843                 tvd = pvd;
 6844                 if (pvd->vdev_top != NULL)
 6845                         tvd = pvd->vdev_top;
 6846 
 6847                 if (tvd->vdev_ops != &vdev_mirror_ops &&
 6848                     tvd->vdev_ops != &vdev_root_ops &&
 6849                     tvd->vdev_ops != &vdev_draid_ops) {
 6850                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 6851                 }
 6852         }
 6853 
 6854         if (!replacing) {
 6855                 /*
 6856                  * For attach, the only allowable parent is a mirror or the root
 6857                  * vdev.
 6858                  */
 6859                 if (pvd->vdev_ops != &vdev_mirror_ops &&
 6860                     pvd->vdev_ops != &vdev_root_ops)
 6861                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 6862 
 6863                 pvops = &vdev_mirror_ops;
 6864         } else {
 6865                 /*
 6866                  * Active hot spares can only be replaced by inactive hot
 6867                  * spares.
 6868                  */
 6869                 if (pvd->vdev_ops == &vdev_spare_ops &&
 6870                     oldvd->vdev_isspare &&
 6871                     !spa_has_spare(spa, newvd->vdev_guid))
 6872                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 6873 
 6874                 /*
 6875                  * If the source is a hot spare, and the parent isn't already a
 6876                  * spare, then we want to create a new hot spare.  Otherwise, we
 6877                  * want to create a replacing vdev.  The user is not allowed to
 6878                  * attach to a spared vdev child unless the 'isspare' state is
 6879                  * the same (spare replaces spare, non-spare replaces
 6880                  * non-spare).
 6881                  */
 6882                 if (pvd->vdev_ops == &vdev_replacing_ops &&
 6883                     spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
 6884                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 6885                 } else if (pvd->vdev_ops == &vdev_spare_ops &&
 6886                     newvd->vdev_isspare != oldvd->vdev_isspare) {
 6887                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 6888                 }
 6889 
 6890                 if (newvd->vdev_isspare)
 6891                         pvops = &vdev_spare_ops;
 6892                 else
 6893                         pvops = &vdev_replacing_ops;
 6894         }
 6895 
 6896         /*
 6897          * Make sure the new device is big enough.
 6898          */
 6899         if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
 6900                 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 6901 
 6902         /*
 6903          * The new device cannot have a higher alignment requirement
 6904          * than the top-level vdev.
 6905          */
 6906         if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
 6907                 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 6908 
 6909         /*
 6910          * If this is an in-place replacement, update oldvd's path and devid
 6911          * to make it distinguishable from newvd, and unopenable from now on.
 6912          */
 6913         if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
 6914                 spa_strfree(oldvd->vdev_path);
 6915                 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
 6916                     KM_SLEEP);
 6917                 (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
 6918                     "%s/%s", newvd->vdev_path, "old");
 6919                 if (oldvd->vdev_devid != NULL) {
 6920                         spa_strfree(oldvd->vdev_devid);
 6921                         oldvd->vdev_devid = NULL;
 6922                 }
 6923         }
 6924 
 6925         /*
 6926          * If the parent is not a mirror, or if we're replacing, insert the new
 6927          * mirror/replacing/spare vdev above oldvd.
 6928          */
 6929         if (pvd->vdev_ops != pvops)
 6930                 pvd = vdev_add_parent(oldvd, pvops);
 6931 
 6932         ASSERT(pvd->vdev_top->vdev_parent == rvd);
 6933         ASSERT(pvd->vdev_ops == pvops);
 6934         ASSERT(oldvd->vdev_parent == pvd);
 6935 
 6936         /*
 6937          * Extract the new device from its root and add it to pvd.
 6938          */
 6939         vdev_remove_child(newrootvd, newvd);
 6940         newvd->vdev_id = pvd->vdev_children;
 6941         newvd->vdev_crtxg = oldvd->vdev_crtxg;
 6942         vdev_add_child(pvd, newvd);
 6943 
 6944         /*
 6945          * Reevaluate the parent vdev state.
 6946          */
 6947         vdev_propagate_state(pvd);
 6948 
 6949         tvd = newvd->vdev_top;
 6950         ASSERT(pvd->vdev_top == tvd);
 6951         ASSERT(tvd->vdev_parent == rvd);
 6952 
 6953         vdev_config_dirty(tvd);
 6954 
 6955         /*
 6956          * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
 6957          * for any dmu_sync-ed blocks.  It will propagate upward when
 6958          * spa_vdev_exit() calls vdev_dtl_reassess().
 6959          */
 6960         dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 6961 
 6962         vdev_dtl_dirty(newvd, DTL_MISSING,
 6963             TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
 6964 
 6965         if (newvd->vdev_isspare) {
 6966                 spa_spare_activate(newvd);
 6967                 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
 6968         }
 6969 
 6970         oldvdpath = spa_strdup(oldvd->vdev_path);
 6971         newvdpath = spa_strdup(newvd->vdev_path);
 6972         newvd_isspare = newvd->vdev_isspare;
 6973 
 6974         /*
 6975          * Mark newvd's DTL dirty in this txg.
 6976          */
 6977         vdev_dirty(tvd, VDD_DTL, newvd, txg);
 6978 
 6979         /*
 6980          * Schedule the resilver or rebuild to restart in the future. We do
 6981          * this to ensure that dmu_sync-ed blocks have been stitched into the
 6982          * respective datasets.
 6983          */
 6984         if (rebuild) {
 6985                 newvd->vdev_rebuild_txg = txg;
 6986 
 6987                 vdev_rebuild(tvd);
 6988         } else {
 6989                 newvd->vdev_resilver_txg = txg;
 6990 
 6991                 if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
 6992                     spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
 6993                         vdev_defer_resilver(newvd);
 6994                 } else {
 6995                         dsl_scan_restart_resilver(spa->spa_dsl_pool,
 6996                             dtl_max_txg);
 6997                 }
 6998         }
 6999 
 7000         if (spa->spa_bootfs)
 7001                 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
 7002 
 7003         spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
 7004 
 7005         /*
 7006          * Commit the config
 7007          */
 7008         (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 7009 
 7010         spa_history_log_internal(spa, "vdev attach", NULL,
 7011             "%s vdev=%s %s vdev=%s",
 7012             replacing && newvd_isspare ? "spare in" :
 7013             replacing ? "replace" : "attach", newvdpath,
 7014             replacing ? "for" : "to", oldvdpath);
 7015 
 7016         spa_strfree(oldvdpath);
 7017         spa_strfree(newvdpath);
 7018 
 7019         return (0);
 7020 }
 7021 
 7022 /*
 7023  * Detach a device from a mirror or replacing vdev.
 7024  *
 7025  * If 'replace_done' is specified, only detach if the parent
 7026  * is a replacing vdev.
 7027  */
 7028 int
 7029 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 7030 {
 7031         uint64_t txg;
 7032         int error;
 7033         vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
 7034         vdev_t *vd, *pvd, *cvd, *tvd;
 7035         boolean_t unspare = B_FALSE;
 7036         uint64_t unspare_guid = 0;
 7037         char *vdpath;
 7038 
 7039         ASSERT(spa_writeable(spa));
 7040 
 7041         txg = spa_vdev_detach_enter(spa, guid);
 7042 
 7043         vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 7044 
 7045         /*
 7046          * Besides being called directly from the userland through the
 7047          * ioctl interface, spa_vdev_detach() can be potentially called
 7048          * at the end of spa_vdev_resilver_done().
 7049          *
 7050          * In the regular case, when we have a checkpoint this shouldn't
 7051          * happen as we never empty the DTLs of a vdev during the scrub
 7052          * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done()
 7053          * should never get here when we have a checkpoint.
 7054          *
 7055          * That said, even in a case when we checkpoint the pool exactly
 7056          * as spa_vdev_resilver_done() calls this function everything
 7057          * should be fine as the resilver will return right away.
 7058          */
 7059         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 7060         if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 7061                 error = (spa_has_checkpoint(spa)) ?
 7062                     ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 7063                 return (spa_vdev_exit(spa, NULL, txg, error));
 7064         }
 7065 
 7066         if (vd == NULL)
 7067                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 7068 
 7069         if (!vd->vdev_ops->vdev_op_leaf)
 7070                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 7071 
 7072         pvd = vd->vdev_parent;
 7073 
 7074         /*
 7075          * If the parent/child relationship is not as expected, don't do it.
 7076          * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
 7077          * vdev that's replacing B with C.  The user's intent in replacing
 7078          * is to go from M(A,B) to M(A,C).  If the user decides to cancel
 7079          * the replace by detaching C, the expected behavior is to end up
 7080          * M(A,B).  But suppose that right after deciding to detach C,
 7081          * the replacement of B completes.  We would have M(A,C), and then
 7082          * ask to detach C, which would leave us with just A -- not what
 7083          * the user wanted.  To prevent this, we make sure that the
 7084          * parent/child relationship hasn't changed -- in this example,
 7085          * that C's parent is still the replacing vdev R.
 7086          */
 7087         if (pvd->vdev_guid != pguid && pguid != 0)
 7088                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 7089 
 7090         /*
 7091          * Only 'replacing' or 'spare' vdevs can be replaced.
 7092          */
 7093         if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
 7094             pvd->vdev_ops != &vdev_spare_ops)
 7095                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 7096 
 7097         ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
 7098             spa_version(spa) >= SPA_VERSION_SPARES);
 7099 
 7100         /*
 7101          * Only mirror, replacing, and spare vdevs support detach.
 7102          */
 7103         if (pvd->vdev_ops != &vdev_replacing_ops &&
 7104             pvd->vdev_ops != &vdev_mirror_ops &&
 7105             pvd->vdev_ops != &vdev_spare_ops)
 7106                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 7107 
 7108         /*
 7109          * If this device has the only valid copy of some data,
 7110          * we cannot safely detach it.
 7111          */
 7112         if (vdev_dtl_required(vd))
 7113                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 7114 
 7115         ASSERT(pvd->vdev_children >= 2);
 7116 
 7117         /*
 7118          * If we are detaching the second disk from a replacing vdev, then
 7119          * check to see if we changed the original vdev's path to have "/old"
 7120          * at the end in spa_vdev_attach().  If so, undo that change now.
 7121          */
 7122         if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
 7123             vd->vdev_path != NULL) {
 7124                 size_t len = strlen(vd->vdev_path);
 7125 
 7126                 for (int c = 0; c < pvd->vdev_children; c++) {
 7127                         cvd = pvd->vdev_child[c];
 7128 
 7129                         if (cvd == vd || cvd->vdev_path == NULL)
 7130                                 continue;
 7131 
 7132                         if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
 7133                             strcmp(cvd->vdev_path + len, "/old") == 0) {
 7134                                 spa_strfree(cvd->vdev_path);
 7135                                 cvd->vdev_path = spa_strdup(vd->vdev_path);
 7136                                 break;
 7137                         }
 7138                 }
 7139         }
 7140 
 7141         /*
 7142          * If we are detaching the original disk from a normal spare, then it
 7143          * implies that the spare should become a real disk, and be removed
 7144          * from the active spare list for the pool.  dRAID spares on the
 7145          * other hand are coupled to the pool and thus should never be removed
 7146          * from the spares list.
 7147          */
 7148         if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
 7149                 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
 7150 
 7151                 if (last_cvd->vdev_isspare &&
 7152                     last_cvd->vdev_ops != &vdev_draid_spare_ops) {
 7153                         unspare = B_TRUE;
 7154                 }
 7155         }
 7156 
 7157         /*
 7158          * Erase the disk labels so the disk can be used for other things.
 7159          * This must be done after all other error cases are handled,
 7160          * but before we disembowel vd (so we can still do I/O to it).
 7161          * But if we can't do it, don't treat the error as fatal --
 7162          * it may be that the unwritability of the disk is the reason
 7163          * it's being detached!
 7164          */
 7165         (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 7166 
 7167         /*
 7168          * Remove vd from its parent and compact the parent's children.
 7169          */
 7170         vdev_remove_child(pvd, vd);
 7171         vdev_compact_children(pvd);
 7172 
 7173         /*
 7174          * Remember one of the remaining children so we can get tvd below.
 7175          */
 7176         cvd = pvd->vdev_child[pvd->vdev_children - 1];
 7177 
 7178         /*
 7179          * If we need to remove the remaining child from the list of hot spares,
 7180          * do it now, marking the vdev as no longer a spare in the process.
 7181          * We must do this before vdev_remove_parent(), because that can
 7182          * change the GUID if it creates a new toplevel GUID.  For a similar
 7183          * reason, we must remove the spare now, in the same txg as the detach;
 7184          * otherwise someone could attach a new sibling, change the GUID, and
 7185          * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
 7186          */
 7187         if (unspare) {
 7188                 ASSERT(cvd->vdev_isspare);
 7189                 spa_spare_remove(cvd);
 7190                 unspare_guid = cvd->vdev_guid;
 7191                 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 7192                 cvd->vdev_unspare = B_TRUE;
 7193         }
 7194 
 7195         /*
 7196          * If the parent mirror/replacing vdev only has one child,
 7197          * the parent is no longer needed.  Remove it from the tree.
 7198          */
 7199         if (pvd->vdev_children == 1) {
 7200                 if (pvd->vdev_ops == &vdev_spare_ops)
 7201                         cvd->vdev_unspare = B_FALSE;
 7202                 vdev_remove_parent(cvd);
 7203         }
 7204 
 7205         /*
 7206          * We don't set tvd until now because the parent we just removed
 7207          * may have been the previous top-level vdev.
 7208          */
 7209         tvd = cvd->vdev_top;
 7210         ASSERT(tvd->vdev_parent == rvd);
 7211 
 7212         /*
 7213          * Reevaluate the parent vdev state.
 7214          */
 7215         vdev_propagate_state(cvd);
 7216 
 7217         /*
 7218          * If the 'autoexpand' property is set on the pool then automatically
 7219          * try to expand the size of the pool. For example if the device we
 7220          * just detached was smaller than the others, it may be possible to
 7221          * add metaslabs (i.e. grow the pool). We need to reopen the vdev
 7222          * first so that we can obtain the updated sizes of the leaf vdevs.
 7223          */
 7224         if (spa->spa_autoexpand) {
 7225                 vdev_reopen(tvd);
 7226                 vdev_expand(tvd, txg);
 7227         }
 7228 
 7229         vdev_config_dirty(tvd);
 7230 
 7231         /*
 7232          * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
 7233          * vd->vdev_detached is set and free vd's DTL object in syncing context.
 7234          * But first make sure we're not on any *other* txg's DTL list, to
 7235          * prevent vd from being accessed after it's freed.
 7236          */
 7237         vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
 7238         for (int t = 0; t < TXG_SIZE; t++)
 7239                 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 7240         vd->vdev_detached = B_TRUE;
 7241         vdev_dirty(tvd, VDD_DTL, vd, txg);
 7242 
 7243         spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
 7244         spa_notify_waiters(spa);
 7245 
 7246         /* hang on to the spa before we release the lock */
 7247         spa_open_ref(spa, FTAG);
 7248 
 7249         error = spa_vdev_exit(spa, vd, txg, 0);
 7250 
 7251         spa_history_log_internal(spa, "detach", NULL,
 7252             "vdev=%s", vdpath);
 7253         spa_strfree(vdpath);
 7254 
 7255         /*
 7256          * If this was the removal of the original device in a hot spare vdev,
 7257          * then we want to go through and remove the device from the hot spare
 7258          * list of every other pool.
 7259          */
 7260         if (unspare) {
 7261                 spa_t *altspa = NULL;
 7262 
 7263                 mutex_enter(&spa_namespace_lock);
 7264                 while ((altspa = spa_next(altspa)) != NULL) {
 7265                         if (altspa->spa_state != POOL_STATE_ACTIVE ||
 7266                             altspa == spa)
 7267                                 continue;
 7268 
 7269                         spa_open_ref(altspa, FTAG);
 7270                         mutex_exit(&spa_namespace_lock);
 7271                         (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
 7272                         mutex_enter(&spa_namespace_lock);
 7273                         spa_close(altspa, FTAG);
 7274                 }
 7275                 mutex_exit(&spa_namespace_lock);
 7276 
 7277                 /* search the rest of the vdevs for spares to remove */
 7278                 spa_vdev_resilver_done(spa);
 7279         }
 7280 
 7281         /* all done with the spa; OK to release */
 7282         mutex_enter(&spa_namespace_lock);
 7283         spa_close(spa, FTAG);
 7284         mutex_exit(&spa_namespace_lock);
 7285 
 7286         return (error);
 7287 }
 7288 
 7289 static int
 7290 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
 7291     list_t *vd_list)
 7292 {
 7293         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 7294 
 7295         spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 7296 
 7297         /* Look up vdev and ensure it's a leaf. */
 7298         vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 7299         if (vd == NULL || vd->vdev_detached) {
 7300                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 7301                 return (SET_ERROR(ENODEV));
 7302         } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
 7303                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 7304                 return (SET_ERROR(EINVAL));
 7305         } else if (!vdev_writeable(vd)) {
 7306                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 7307                 return (SET_ERROR(EROFS));
 7308         }
 7309         mutex_enter(&vd->vdev_initialize_lock);
 7310         spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 7311 
 7312         /*
 7313          * When we activate an initialize action we check to see
 7314          * if the vdev_initialize_thread is NULL. We do this instead
 7315          * of using the vdev_initialize_state since there might be
 7316          * a previous initialization process which has completed but
 7317          * the thread is not exited.
 7318          */
 7319         if (cmd_type == POOL_INITIALIZE_START &&
 7320             (vd->vdev_initialize_thread != NULL ||
 7321             vd->vdev_top->vdev_removing)) {
 7322                 mutex_exit(&vd->vdev_initialize_lock);
 7323                 return (SET_ERROR(EBUSY));
 7324         } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
 7325             (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
 7326             vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
 7327                 mutex_exit(&vd->vdev_initialize_lock);
 7328                 return (SET_ERROR(ESRCH));
 7329         } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
 7330             vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
 7331                 mutex_exit(&vd->vdev_initialize_lock);
 7332                 return (SET_ERROR(ESRCH));
 7333         }
 7334 
 7335         switch (cmd_type) {
 7336         case POOL_INITIALIZE_START:
 7337                 vdev_initialize(vd);
 7338                 break;
 7339         case POOL_INITIALIZE_CANCEL:
 7340                 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
 7341                 break;
 7342         case POOL_INITIALIZE_SUSPEND:
 7343                 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
 7344                 break;
 7345         default:
 7346                 panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
 7347         }
 7348         mutex_exit(&vd->vdev_initialize_lock);
 7349 
 7350         return (0);
 7351 }
 7352 
 7353 int
 7354 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
 7355     nvlist_t *vdev_errlist)
 7356 {
 7357         int total_errors = 0;
 7358         list_t vd_list;
 7359 
 7360         list_create(&vd_list, sizeof (vdev_t),
 7361             offsetof(vdev_t, vdev_initialize_node));
 7362 
 7363         /*
 7364          * We hold the namespace lock through the whole function
 7365          * to prevent any changes to the pool while we're starting or
 7366          * stopping initialization. The config and state locks are held so that
 7367          * we can properly assess the vdev state before we commit to
 7368          * the initializing operation.
 7369          */
 7370         mutex_enter(&spa_namespace_lock);
 7371 
 7372         for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
 7373             pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
 7374                 uint64_t vdev_guid = fnvpair_value_uint64(pair);
 7375 
 7376                 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
 7377                     &vd_list);
 7378                 if (error != 0) {
 7379                         char guid_as_str[MAXNAMELEN];
 7380 
 7381                         (void) snprintf(guid_as_str, sizeof (guid_as_str),
 7382                             "%llu", (unsigned long long)vdev_guid);
 7383                         fnvlist_add_int64(vdev_errlist, guid_as_str, error);
 7384                         total_errors++;
 7385                 }
 7386         }
 7387 
 7388         /* Wait for all initialize threads to stop. */
 7389         vdev_initialize_stop_wait(spa, &vd_list);
 7390 
 7391         /* Sync out the initializing state */
 7392         txg_wait_synced(spa->spa_dsl_pool, 0);
 7393         mutex_exit(&spa_namespace_lock);
 7394 
 7395         list_destroy(&vd_list);
 7396 
 7397         return (total_errors);
 7398 }
 7399 
 7400 static int
 7401 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
 7402     uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
 7403 {
 7404         ASSERT(MUTEX_HELD(&spa_namespace_lock));
 7405 
 7406         spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 7407 
 7408         /* Look up vdev and ensure it's a leaf. */
 7409         vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 7410         if (vd == NULL || vd->vdev_detached) {
 7411                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 7412                 return (SET_ERROR(ENODEV));
 7413         } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
 7414                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 7415                 return (SET_ERROR(EINVAL));
 7416         } else if (!vdev_writeable(vd)) {
 7417                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 7418                 return (SET_ERROR(EROFS));
 7419         } else if (!vd->vdev_has_trim) {
 7420                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 7421                 return (SET_ERROR(EOPNOTSUPP));
 7422         } else if (secure && !vd->vdev_has_securetrim) {
 7423                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 7424                 return (SET_ERROR(EOPNOTSUPP));
 7425         }
 7426         mutex_enter(&vd->vdev_trim_lock);
 7427         spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 7428 
 7429         /*
 7430          * When we activate a TRIM action we check to see if the
 7431          * vdev_trim_thread is NULL. We do this instead of using the
 7432          * vdev_trim_state since there might be a previous TRIM process
 7433          * which has completed but the thread is not exited.
 7434          */
 7435         if (cmd_type == POOL_TRIM_START &&
 7436             (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
 7437                 mutex_exit(&vd->vdev_trim_lock);
 7438                 return (SET_ERROR(EBUSY));
 7439         } else if (cmd_type == POOL_TRIM_CANCEL &&
 7440             (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
 7441             vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
 7442                 mutex_exit(&vd->vdev_trim_lock);
 7443                 return (SET_ERROR(ESRCH));
 7444         } else if (cmd_type == POOL_TRIM_SUSPEND &&
 7445             vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
 7446                 mutex_exit(&vd->vdev_trim_lock);
 7447                 return (SET_ERROR(ESRCH));
 7448         }
 7449 
 7450         switch (cmd_type) {
 7451         case POOL_TRIM_START:
 7452                 vdev_trim(vd, rate, partial, secure);
 7453                 break;
 7454         case POOL_TRIM_CANCEL:
 7455                 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
 7456                 break;
 7457         case POOL_TRIM_SUSPEND:
 7458                 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
 7459                 break;
 7460         default:
 7461                 panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
 7462         }
 7463         mutex_exit(&vd->vdev_trim_lock);
 7464 
 7465         return (0);
 7466 }
 7467 
 7468 /*
 7469  * Initiates a manual TRIM for the requested vdevs. This kicks off individual
 7470  * TRIM threads for each child vdev.  These threads pass over all of the free
 7471  * space in the vdev's metaslabs and issues TRIM commands for that space.
 7472  */
 7473 int
 7474 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
 7475     boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
 7476 {
 7477         int total_errors = 0;
 7478         list_t vd_list;
 7479 
 7480         list_create(&vd_list, sizeof (vdev_t),
 7481             offsetof(vdev_t, vdev_trim_node));
 7482 
 7483         /*
 7484          * We hold the namespace lock through the whole function
 7485          * to prevent any changes to the pool while we're starting or
 7486          * stopping TRIM. The config and state locks are held so that
 7487          * we can properly assess the vdev state before we commit to
 7488          * the TRIM operation.
 7489          */
 7490         mutex_enter(&spa_namespace_lock);
 7491 
 7492         for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
 7493             pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
 7494                 uint64_t vdev_guid = fnvpair_value_uint64(pair);
 7495 
 7496                 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
 7497                     rate, partial, secure, &vd_list);
 7498                 if (error != 0) {
 7499                         char guid_as_str[MAXNAMELEN];
 7500 
 7501                         (void) snprintf(guid_as_str, sizeof (guid_as_str),
 7502                             "%llu", (unsigned long long)vdev_guid);
 7503                         fnvlist_add_int64(vdev_errlist, guid_as_str, error);
 7504                         total_errors++;
 7505                 }
 7506         }
 7507 
 7508         /* Wait for all TRIM threads to stop. */
 7509         vdev_trim_stop_wait(spa, &vd_list);
 7510 
 7511         /* Sync out the TRIM state */
 7512         txg_wait_synced(spa->spa_dsl_pool, 0);
 7513         mutex_exit(&spa_namespace_lock);
 7514 
 7515         list_destroy(&vd_list);
 7516 
 7517         return (total_errors);
 7518 }
 7519 
/*
 * Split a set of devices from their mirrors, and create a new pool from them.
 *
 *   spa     - the existing pool being split; must be writeable.
 *   newname - name of the new pool; EEXIST is returned if spa_lookup()
 *             already finds a pool with that name.
 *   config  - nvlist describing the new pool.  Its vdev tree must carry a
 *             children array whose length matches the top-level vdevs of
 *             'spa' (see the child-count check below).  This function adds
 *             further entries to 'config' and to its child nvlists.
 *   props   - properties to set on the new pool; the property-setting step
 *             is skipped when this is NULL.
 *   exp     - if true the new pool is created with POOL_STATE_EXPORTED and
 *             is exported before returning.
 *
 * Returns 0 on success, otherwise an error code.  Every failure path leaves
 * through spa_vdev_exit() on the original pool.
 */
int
spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config,
    nvlist_t *props, boolean_t exp)
{
        int error = 0;
        uint64_t txg, *glist;
        spa_t *newspa;
        uint_t c, children, lastlog;
        nvlist_t **child, *nvl, *tmp;
        dmu_tx_t *tx;
        char *altroot = NULL;
        vdev_t *rvd, **vml = NULL;                      /* vdev modify list */
        boolean_t activate_slog;

        ASSERT(spa_writeable(spa));

        txg = spa_vdev_enter(spa);

        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        /* a split is refused while the pool checkpoint feature is active */
        if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
                error = (spa_has_checkpoint(spa)) ?
                    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
                return (spa_vdev_exit(spa, NULL, txg, error));
        }

        /* clear the log and flush everything up to now */
        activate_slog = spa_passivate_log(spa);
        (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
        error = spa_reset_logs(spa);
        txg = spa_vdev_config_enter(spa);

        /* undo the passivation regardless of whether the reset succeeded */
        if (activate_slog)
                spa_activate_log(spa);

        if (error != 0)
                return (spa_vdev_exit(spa, NULL, txg, error));

        /* check new spa name before going any further */
        if (spa_lookup(newname) != NULL)
                return (spa_vdev_exit(spa, NULL, txg, EEXIST));

        /*
         * scan through all the children to ensure they're all mirrors
         */
        if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
            nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
            &children) != 0)
                return (spa_vdev_exit(spa, NULL, txg, EINVAL));

        /* first, check to ensure we've got the right child count */
        /*
         * lastlog remembers the index at which the current run of skipped
         * (log / non-concrete, non-indirect) vdevs began; it is reset to 0
         * whenever a countable vdev is seen.  After the loop a non-zero
         * lastlog is therefore the start of a trailing run of skipped
         * vdevs, and 'children' must equal it.
         */
        rvd = spa->spa_root_vdev;
        lastlog = 0;
        for (c = 0; c < rvd->vdev_children; c++) {
                vdev_t *vd = rvd->vdev_child[c];

                /* don't count the holes & logs as children */
                if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
                    !vdev_is_concrete(vd))) {
                        if (lastlog == 0)
                                lastlog = c;
                        continue;
                }

                lastlog = 0;
        }
        if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
                return (spa_vdev_exit(spa, NULL, txg, EINVAL));

        /* next, ensure no spare or cache devices are part of the split */
        if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
            nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
                return (spa_vdev_exit(spa, NULL, txg, EINVAL));

        /*
         * One slot per child.  Slots for children skipped in the loop
         * below (holes, logs, indirect vdevs) are never assigned; the
         * later loops test vml[c] != NULL before touching a slot.
         */
        vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
        glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);

        /* then, loop over each vdev and validate it */
        for (c = 0; c < children; c++) {
                uint64_t is_hole = 0;

                (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
                    &is_hole);

                /*
                 * A child flagged as a hole in 'config' is only accepted
                 * if the matching top-level vdev is a hole or a log.
                 */
                if (is_hole != 0) {
                        if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
                            spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
                                continue;
                        } else {
                                error = SET_ERROR(EINVAL);
                                break;
                        }
                }

                /* deal with indirect vdevs */
                if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
                    &vdev_indirect_ops)
                        continue;

                /* which disk is going to be split? */
                if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
                    &glist[c]) != 0) {
                        error = SET_ERROR(EINVAL);
                        break;
                }

                /* look it up in the spa */
                vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
                if (vml[c] == NULL) {
                        error = SET_ERROR(ENODEV);
                        break;
                }

                /* make sure there's nothing stopping the split */
                if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
                    vml[c]->vdev_islog ||
                    !vdev_is_concrete(vml[c]) ||
                    vml[c]->vdev_isspare ||
                    vml[c]->vdev_isl2cache ||
                    !vdev_writeable(vml[c]) ||
                    vml[c]->vdev_children != 0 ||
                    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
                    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
                        error = SET_ERROR(EINVAL);
                        break;
                }

                if (vdev_dtl_required(vml[c]) ||
                    vdev_resilver_needed(vml[c], NULL, NULL)) {
                        error = SET_ERROR(EBUSY);
                        break;
                }

                /* we need certain info from the top level */
                fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
                    vml[c]->vdev_top->vdev_ms_array);
                fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
                    vml[c]->vdev_top->vdev_ms_shift);
                fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
                    vml[c]->vdev_top->vdev_asize);
                fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
                    vml[c]->vdev_top->vdev_ashift);

                /* transfer per-vdev ZAPs */
                ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
                VERIFY0(nvlist_add_uint64(child[c],
                    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));

                /*
                 * NOTE(review): the assertion checks vdev_top while the
                 * value stored comes from vdev_parent; presumably these
                 * are the same vdev here (the parent was verified to be a
                 * mirror above) -- confirm.
                 */
                ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
                VERIFY0(nvlist_add_uint64(child[c],
                    ZPOOL_CONFIG_VDEV_TOP_ZAP,
                    vml[c]->vdev_parent->vdev_top_zap));
        }

        /* validation failed: nothing has been modified yet, just clean up */
        if (error != 0) {
                kmem_free(vml, children * sizeof (vdev_t *));
                kmem_free(glist, children * sizeof (uint64_t));
                return (spa_vdev_exit(spa, NULL, txg, error));
        }

        /* stop writers from using the disks */
        for (c = 0; c < children; c++) {
                if (vml[c] != NULL)
                        vml[c]->vdev_offline = B_TRUE;
        }
        vdev_reopen(spa->spa_root_vdev);

        /*
         * Temporarily record the splitting vdevs in the spa config.  This
         * will disappear once the config is regenerated.
         */
        nvl = fnvlist_alloc();
        fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children);
        kmem_free(glist, children * sizeof (uint64_t));

        mutex_enter(&spa->spa_props_lock);
        fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl);
        mutex_exit(&spa->spa_props_lock);
        spa->spa_config_splitting = nvl;
        vdev_config_dirty(spa->spa_root_vdev);

        /* configure and create the new pool */
        fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname);
        fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
            exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE);
        fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
        fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
        fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
            spa_generate_guid(NULL));
        VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
        /* altroot stays NULL if the lookup fails; the result is ignored */
        (void) nvlist_lookup_string(props,
            zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);

        /* add the new pool to the namespace */
        newspa = spa_add(newname, config, altroot);
        newspa->spa_avz_action = AVZ_ACTION_REBUILD;
        newspa->spa_config_txg = spa->spa_config_txg;
        spa_set_log_state(newspa, SPA_LOG_CLEAR);

        /* release the spa config lock, retaining the namespace lock */
        spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

        if (zio_injection_enabled)
                zio_handle_panic_injection(spa, FTAG, 1);

        spa_activate(newspa, spa_mode_global);
        spa_async_suspend(newspa);

        /*
         * Temporarily stop the initializing and TRIM activity.  We set the
         * state to ACTIVE so that we know to resume initializing or TRIM
         * once the split has completed.
         */
        list_t vd_initialize_list;
        list_create(&vd_initialize_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_initialize_node));

        list_t vd_trim_list;
        list_create(&vd_trim_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_trim_node));

        for (c = 0; c < children; c++) {
                if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
                        mutex_enter(&vml[c]->vdev_initialize_lock);
                        vdev_initialize_stop(vml[c],
                            VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
                        mutex_exit(&vml[c]->vdev_initialize_lock);

                        mutex_enter(&vml[c]->vdev_trim_lock);
                        vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
                        mutex_exit(&vml[c]->vdev_trim_lock);
                }
        }

        vdev_initialize_stop_wait(spa, &vd_initialize_list);
        vdev_trim_stop_wait(spa, &vd_trim_list);

        list_destroy(&vd_initialize_list);
        list_destroy(&vd_trim_list);

        newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
        newspa->spa_is_splitting = B_TRUE;

        /* create the new pool from the disks of the original pool */
        error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
        if (error)
                goto out;

        /* if that worked, generate a real config for the new pool */
        if (newspa->spa_root_vdev != NULL) {
                newspa->spa_config_splitting = fnvlist_alloc();
                fnvlist_add_uint64(newspa->spa_config_splitting,
                    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa));
                spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
                    B_TRUE));
        }

        /* set the props */
        if (props != NULL) {
                spa_configfile_set(newspa, props, B_FALSE);
                error = spa_prop_set(newspa, props);
                if (error)
                        goto out;
        }

        /* flush everything */
        txg = spa_vdev_config_enter(newspa);
        vdev_config_dirty(newspa->spa_root_vdev);
        (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);

        if (zio_injection_enabled)
                zio_handle_panic_injection(spa, FTAG, 2);

        spa_async_resume(newspa);

        /* finally, update the original pool's config */
        txg = spa_vdev_config_enter(spa);
        tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
        /*
         * A failed assignment does not abort the split: the tx is dropped
         * here, and below the per-vdev "detach" history records and the
         * commit are skipped while the vdevs are still split and freed.
         */
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error != 0)
                dmu_tx_abort(tx);
        for (c = 0; c < children; c++) {
                if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
                        vdev_t *tvd = vml[c]->vdev_top;

                        /*
                         * Need to be sure the detachable VDEV is not
                         * on any *other* txg's DTL list to prevent it
                         * from being accessed after it's freed.
                         */
                        for (int t = 0; t < TXG_SIZE; t++) {
                                (void) txg_list_remove_this(
                                    &tvd->vdev_dtl_list, vml[c], t);
                        }

                        vdev_split(vml[c]);
                        if (error == 0)
                                spa_history_log_internal(spa, "detach", tx,
                                    "vdev=%s", vml[c]->vdev_path);

                        vdev_free(vml[c]);
                }
        }
        spa->spa_avz_action = AVZ_ACTION_REBUILD;
        vdev_config_dirty(spa->spa_root_vdev);
        /* drop the temporary split record created earlier */
        spa->spa_config_splitting = NULL;
        nvlist_free(nvl);
        if (error == 0)
                dmu_tx_commit(tx);
        (void) spa_vdev_exit(spa, NULL, txg, 0);

        if (zio_injection_enabled)
                zio_handle_panic_injection(spa, FTAG, 3);

        /* split is complete; log a history record */
        spa_history_log_internal(newspa, "split", NULL,
            "from pool %s", spa_name(spa));

        newspa->spa_is_splitting = B_FALSE;
        kmem_free(vml, children * sizeof (vdev_t *));

        /* if we're not going to mount the filesystems in userland, export */
        if (exp)
                error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
                    B_FALSE, B_FALSE);

        return (error);

out:
        /*
         * Loading or configuring the new pool failed: tear the new spa
         * down again and undo the changes made to the original pool.
         */
        spa_unload(newspa);
        spa_deactivate(newspa);
        spa_remove(newspa);

        txg = spa_vdev_config_enter(spa);

        /* re-online all offlined disks */
        for (c = 0; c < children; c++) {
                if (vml[c] != NULL)
                        vml[c]->vdev_offline = B_FALSE;
        }

        /* restart initializing or trimming disks as necessary */
        spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
        spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
        spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);

        vdev_reopen(spa->spa_root_vdev);

        nvlist_free(spa->spa_config_splitting);
        spa->spa_config_splitting = NULL;
        (void) spa_vdev_exit(spa, NULL, txg, error);

        kmem_free(vml, children * sizeof (vdev_t *));
        return (error);
}
 7877 
 7878 /*
 7879  * Find any device that's done replacing, or a vdev marked 'unspare' that's
 7880  * currently spared, so we can detach it.
 7881  */
 7882 static vdev_t *
 7883 spa_vdev_resilver_done_hunt(vdev_t *vd)
 7884 {
 7885         vdev_t *newvd, *oldvd;
 7886 
 7887         for (int c = 0; c < vd->vdev_children; c++) {
 7888                 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
 7889                 if (oldvd != NULL)
 7890                         return (oldvd);
 7891         }
 7892 
 7893         /*
 7894          * Check for a completed replacement.  We always consider the first
 7895          * vdev in the list to be the oldest vdev, and the last one to be
 7896          * the newest (see spa_vdev_attach() for how that works).  In
 7897          * the case where the newest vdev is faulted, we will not automatically
 7898          * remove it after a resilver completes.  This is OK as it will require
 7899          * user intervention to determine which disk the admin wishes to keep.
 7900          */
 7901         if (vd->vdev_ops == &vdev_replacing_ops) {
 7902                 ASSERT(vd->vdev_children > 1);
 7903 
 7904                 newvd = vd->vdev_child[vd->vdev_children - 1];
 7905                 oldvd = vd->vdev_child[0];
 7906 
 7907                 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
 7908                     vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 7909                     !vdev_dtl_required(oldvd))
 7910                         return (oldvd);
 7911         }
 7912 
 7913         /*
 7914          * Check for a completed resilver with the 'unspare' flag set.
 7915          * Also potentially update faulted state.
 7916          */
 7917         if (vd->vdev_ops == &vdev_spare_ops) {
 7918                 vdev_t *first = vd->vdev_child[0];
 7919                 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
 7920 
 7921                 if (last->vdev_unspare) {
 7922                         oldvd = first;
 7923                         newvd = last;
 7924                 } else if (first->vdev_unspare) {
 7925                         oldvd = last;
 7926                         newvd = first;
 7927                 } else {
 7928                         oldvd = NULL;
 7929                 }
 7930 
 7931                 if (oldvd != NULL &&
 7932                     vdev_dtl_empty(newvd, DTL_MISSING) &&
 7933                     vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 7934                     !vdev_dtl_required(oldvd))
 7935                         return (oldvd);
 7936 
 7937                 vdev_propagate_state(vd);
 7938 
 7939                 /*
 7940                  * If there are more than two spares attached to a disk,
 7941                  * and those spares are not required, then we want to
 7942                  * attempt to free them up now so that they can be used
 7943                  * by other pools.  Once we're back down to a single
 7944                  * disk+spare, we stop removing them.
 7945                  */
 7946                 if (vd->vdev_children > 2) {
 7947                         newvd = vd->vdev_child[1];
 7948 
 7949                         if (newvd->vdev_isspare && last->vdev_isspare &&
 7950                             vdev_dtl_empty(last, DTL_MISSING) &&
 7951                             vdev_dtl_empty(last, DTL_OUTAGE) &&
 7952                             !vdev_dtl_required(newvd))
 7953                                 return (newvd);
 7954                 }
 7955         }
 7956 
 7957         return (NULL);
 7958 }
 7959 
 7960 static void
 7961 spa_vdev_resilver_done(spa_t *spa)
 7962 {
 7963         vdev_t *vd, *pvd, *ppvd;
 7964         uint64_t guid, sguid, pguid, ppguid;
 7965 
 7966         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 7967 
 7968         while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
 7969                 pvd = vd->vdev_parent;
 7970                 ppvd = pvd->vdev_parent;
 7971                 guid = vd->vdev_guid;
 7972                 pguid = pvd->vdev_guid;
 7973                 ppguid = ppvd->vdev_guid;
 7974                 sguid = 0;
 7975                 /*
 7976                  * If we have just finished replacing a hot spared device, then
 7977                  * we need to detach the parent's first child (the original hot
 7978                  * spare) as well.
 7979                  */
 7980                 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
 7981                     ppvd->vdev_children == 2) {
 7982                         ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
 7983                         sguid = ppvd->vdev_child[1]->vdev_guid;
 7984                 }
 7985                 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
 7986 
 7987                 spa_config_exit(spa, SCL_ALL, FTAG);
 7988                 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 7989                         return;
 7990                 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
 7991                         return;
 7992                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 7993         }
 7994 
 7995         spa_config_exit(spa, SCL_ALL, FTAG);
 7996 
 7997         /*
 7998          * If a detach was not performed above replace waiters will not have
 7999          * been notified.  In which case we must do so now.
 8000          */
 8001         spa_notify_waiters(spa);
 8002 }
 8003 
 8004 /*
 8005  * Update the stored path or FRU for this vdev.
 8006  */
 8007 static int
 8008 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
 8009     boolean_t ispath)
 8010 {
 8011         vdev_t *vd;
 8012         boolean_t sync = B_FALSE;
 8013 
 8014         ASSERT(spa_writeable(spa));
 8015 
 8016         spa_vdev_state_enter(spa, SCL_ALL);
 8017 
 8018         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 8019                 return (spa_vdev_state_exit(spa, NULL, ENOENT));
 8020 
 8021         if (!vd->vdev_ops->vdev_op_leaf)
 8022                 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 8023 
 8024         if (ispath) {
 8025                 if (strcmp(value, vd->vdev_path) != 0) {
 8026                         spa_strfree(vd->vdev_path);
 8027                         vd->vdev_path = spa_strdup(value);
 8028                         sync = B_TRUE;
 8029                 }
 8030         } else {
 8031                 if (vd->vdev_fru == NULL) {
 8032                         vd->vdev_fru = spa_strdup(value);
 8033                         sync = B_TRUE;
 8034                 } else if (strcmp(value, vd->vdev_fru) != 0) {
 8035                         spa_strfree(vd->vdev_fru);
 8036                         vd->vdev_fru = spa_strdup(value);
 8037                         sync = B_TRUE;
 8038                 }
 8039         }
 8040 
 8041         return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
 8042 }
 8043 
 8044 int
 8045 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
 8046 {
 8047         return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
 8048 }
 8049 
 8050 int
 8051 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
 8052 {
 8053         return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
 8054 }
 8055 
 8056 /*
 8057  * ==========================================================================
 8058  * SPA Scanning
 8059  * ==========================================================================
 8060  */
 8061 int
 8062 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
 8063 {
 8064         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 8065 
 8066         if (dsl_scan_resilvering(spa->spa_dsl_pool))
 8067                 return (SET_ERROR(EBUSY));
 8068 
 8069         return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
 8070 }
 8071 
 8072 int
 8073 spa_scan_stop(spa_t *spa)
 8074 {
 8075         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 8076         if (dsl_scan_resilvering(spa->spa_dsl_pool))
 8077                 return (SET_ERROR(EBUSY));
 8078         return (dsl_scan_cancel(spa->spa_dsl_pool));
 8079 }
 8080 
 8081 int
 8082 spa_scan(spa_t *spa, pool_scan_func_t func)
 8083 {
 8084         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 8085 
 8086         if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
 8087                 return (SET_ERROR(ENOTSUP));
 8088 
 8089         if (func == POOL_SCAN_RESILVER &&
 8090             !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
 8091                 return (SET_ERROR(ENOTSUP));
 8092 
 8093         /*
 8094          * If a resilver was requested, but there is no DTL on a
 8095          * writeable leaf device, we have nothing to do.
 8096          */
 8097         if (func == POOL_SCAN_RESILVER &&
 8098             !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
 8099                 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 8100                 return (0);
 8101         }
 8102 
 8103         return (dsl_scan(spa->spa_dsl_pool, func));
 8104 }
 8105 
 8106 /*
 8107  * ==========================================================================
 8108  * SPA async task processing
 8109  * ==========================================================================
 8110  */
 8111 
 8112 static void
 8113 spa_async_remove(spa_t *spa, vdev_t *vd)
 8114 {
 8115         if (vd->vdev_remove_wanted) {
 8116                 vd->vdev_remove_wanted = B_FALSE;
 8117                 vd->vdev_delayed_close = B_FALSE;
 8118                 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
 8119 
 8120                 /*
 8121                  * We want to clear the stats, but we don't want to do a full
 8122                  * vdev_clear() as that will cause us to throw away
 8123                  * degraded/faulted state as well as attempt to reopen the
 8124                  * device, all of which is a waste.
 8125                  */
 8126                 vd->vdev_stat.vs_read_errors = 0;
 8127                 vd->vdev_stat.vs_write_errors = 0;
 8128                 vd->vdev_stat.vs_checksum_errors = 0;
 8129 
 8130                 vdev_state_dirty(vd->vdev_top);
 8131 
 8132                 /* Tell userspace that the vdev is gone. */
 8133                 zfs_post_remove(spa, vd);
 8134         }
 8135 
 8136         for (int c = 0; c < vd->vdev_children; c++)
 8137                 spa_async_remove(spa, vd->vdev_child[c]);
 8138 }
 8139 
 8140 static void
 8141 spa_async_probe(spa_t *spa, vdev_t *vd)
 8142 {
 8143         if (vd->vdev_probe_wanted) {
 8144                 vd->vdev_probe_wanted = B_FALSE;
 8145                 vdev_reopen(vd);        /* vdev_open() does the actual probe */
 8146         }
 8147 
 8148         for (int c = 0; c < vd->vdev_children; c++)
 8149                 spa_async_probe(spa, vd->vdev_child[c]);
 8150 }
 8151 
 8152 static void
 8153 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
 8154 {
 8155         if (!spa->spa_autoexpand)
 8156                 return;
 8157 
 8158         for (int c = 0; c < vd->vdev_children; c++) {
 8159                 vdev_t *cvd = vd->vdev_child[c];
 8160                 spa_async_autoexpand(spa, cvd);
 8161         }
 8162 
 8163         if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
 8164                 return;
 8165 
 8166         spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND);
 8167 }
 8168 
 8169 static __attribute__((noreturn)) void
 8170 spa_async_thread(void *arg)
 8171 {
 8172         spa_t *spa = (spa_t *)arg;
 8173         dsl_pool_t *dp = spa->spa_dsl_pool;
 8174         int tasks;
 8175 
 8176         ASSERT(spa->spa_sync_on);
 8177 
 8178         mutex_enter(&spa->spa_async_lock);
 8179         tasks = spa->spa_async_tasks;
 8180         spa->spa_async_tasks = 0;
 8181         mutex_exit(&spa->spa_async_lock);
 8182 
 8183         /*
 8184          * See if the config needs to be updated.
 8185          */
 8186         if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
 8187                 uint64_t old_space, new_space;
 8188 
 8189                 mutex_enter(&spa_namespace_lock);
 8190                 old_space = metaslab_class_get_space(spa_normal_class(spa));
 8191                 old_space += metaslab_class_get_space(spa_special_class(spa));
 8192                 old_space += metaslab_class_get_space(spa_dedup_class(spa));
 8193                 old_space += metaslab_class_get_space(
 8194                     spa_embedded_log_class(spa));
 8195 
 8196                 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 8197 
 8198                 new_space = metaslab_class_get_space(spa_normal_class(spa));
 8199                 new_space += metaslab_class_get_space(spa_special_class(spa));
 8200                 new_space += metaslab_class_get_space(spa_dedup_class(spa));
 8201                 new_space += metaslab_class_get_space(
 8202                     spa_embedded_log_class(spa));
 8203                 mutex_exit(&spa_namespace_lock);
 8204 
 8205                 /*
 8206                  * If the pool grew as a result of the config update,
 8207                  * then log an internal history event.
 8208                  */
 8209                 if (new_space != old_space) {
 8210                         spa_history_log_internal(spa, "vdev online", NULL,
 8211                             "pool '%s' size: %llu(+%llu)",
 8212                             spa_name(spa), (u_longlong_t)new_space,
 8213                             (u_longlong_t)(new_space - old_space));
 8214                 }
 8215         }
 8216 
 8217         /*
 8218          * See if any devices need to be marked REMOVED.
 8219          */
 8220         if (tasks & SPA_ASYNC_REMOVE) {
 8221                 spa_vdev_state_enter(spa, SCL_NONE);
 8222                 spa_async_remove(spa, spa->spa_root_vdev);
 8223                 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 8224                         spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
 8225                 for (int i = 0; i < spa->spa_spares.sav_count; i++)
 8226                         spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
 8227                 (void) spa_vdev_state_exit(spa, NULL, 0);
 8228         }
 8229 
 8230         if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
 8231                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 8232                 spa_async_autoexpand(spa, spa->spa_root_vdev);
 8233                 spa_config_exit(spa, SCL_CONFIG, FTAG);
 8234         }
 8235 
 8236         /*
 8237          * See if any devices need to be probed.
 8238          */
 8239         if (tasks & SPA_ASYNC_PROBE) {
 8240                 spa_vdev_state_enter(spa, SCL_NONE);
 8241                 spa_async_probe(spa, spa->spa_root_vdev);
 8242                 (void) spa_vdev_state_exit(spa, NULL, 0);
 8243         }
 8244 
 8245         /*
 8246          * If any devices are done replacing, detach them.
 8247          */
 8248         if (tasks & SPA_ASYNC_RESILVER_DONE ||
 8249             tasks & SPA_ASYNC_REBUILD_DONE) {
 8250                 spa_vdev_resilver_done(spa);
 8251         }
 8252 
 8253         /*
 8254          * Kick off a resilver.
 8255          */
 8256         if (tasks & SPA_ASYNC_RESILVER &&
 8257             !vdev_rebuild_active(spa->spa_root_vdev) &&
 8258             (!dsl_scan_resilvering(dp) ||
 8259             !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
 8260                 dsl_scan_restart_resilver(dp, 0);
 8261 
 8262         if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
 8263                 mutex_enter(&spa_namespace_lock);
 8264                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 8265                 vdev_initialize_restart(spa->spa_root_vdev);
 8266                 spa_config_exit(spa, SCL_CONFIG, FTAG);
 8267                 mutex_exit(&spa_namespace_lock);
 8268         }
 8269 
 8270         if (tasks & SPA_ASYNC_TRIM_RESTART) {
 8271                 mutex_enter(&spa_namespace_lock);
 8272                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 8273                 vdev_trim_restart(spa->spa_root_vdev);
 8274                 spa_config_exit(spa, SCL_CONFIG, FTAG);
 8275                 mutex_exit(&spa_namespace_lock);
 8276         }
 8277 
 8278         if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
 8279                 mutex_enter(&spa_namespace_lock);
 8280                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 8281                 vdev_autotrim_restart(spa);
 8282                 spa_config_exit(spa, SCL_CONFIG, FTAG);
 8283                 mutex_exit(&spa_namespace_lock);
 8284         }
 8285 
 8286         /*
 8287          * Kick off L2 cache whole device TRIM.
 8288          */
 8289         if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
 8290                 mutex_enter(&spa_namespace_lock);
 8291                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 8292                 vdev_trim_l2arc(spa);
 8293                 spa_config_exit(spa, SCL_CONFIG, FTAG);
 8294                 mutex_exit(&spa_namespace_lock);
 8295         }
 8296 
 8297         /*
 8298          * Kick off L2 cache rebuilding.
 8299          */
 8300         if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
 8301                 mutex_enter(&spa_namespace_lock);
 8302                 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
 8303                 l2arc_spa_rebuild_start(spa);
 8304                 spa_config_exit(spa, SCL_L2ARC, FTAG);
 8305                 mutex_exit(&spa_namespace_lock);
 8306         }
 8307 
 8308         /*
 8309          * Let the world know that we're done.
 8310          */
 8311         mutex_enter(&spa->spa_async_lock);
 8312         spa->spa_async_thread = NULL;
 8313         cv_broadcast(&spa->spa_async_cv);
 8314         mutex_exit(&spa->spa_async_lock);
 8315         thread_exit();
 8316 }
 8317 
 8318 void
 8319 spa_async_suspend(spa_t *spa)
 8320 {
 8321         mutex_enter(&spa->spa_async_lock);
 8322         spa->spa_async_suspended++;
 8323         while (spa->spa_async_thread != NULL)
 8324                 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 8325         mutex_exit(&spa->spa_async_lock);
 8326 
 8327         spa_vdev_remove_suspend(spa);
 8328 
 8329         zthr_t *condense_thread = spa->spa_condense_zthr;
 8330         if (condense_thread != NULL)
 8331                 zthr_cancel(condense_thread);
 8332 
 8333         zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
 8334         if (discard_thread != NULL)
 8335                 zthr_cancel(discard_thread);
 8336 
 8337         zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
 8338         if (ll_delete_thread != NULL)
 8339                 zthr_cancel(ll_delete_thread);
 8340 
 8341         zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
 8342         if (ll_condense_thread != NULL)
 8343                 zthr_cancel(ll_condense_thread);
 8344 }
 8345 
 8346 void
 8347 spa_async_resume(spa_t *spa)
 8348 {
 8349         mutex_enter(&spa->spa_async_lock);
 8350         ASSERT(spa->spa_async_suspended != 0);
 8351         spa->spa_async_suspended--;
 8352         mutex_exit(&spa->spa_async_lock);
 8353         spa_restart_removal(spa);
 8354 
 8355         zthr_t *condense_thread = spa->spa_condense_zthr;
 8356         if (condense_thread != NULL)
 8357                 zthr_resume(condense_thread);
 8358 
 8359         zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
 8360         if (discard_thread != NULL)
 8361                 zthr_resume(discard_thread);
 8362 
 8363         zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
 8364         if (ll_delete_thread != NULL)
 8365                 zthr_resume(ll_delete_thread);
 8366 
 8367         zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
 8368         if (ll_condense_thread != NULL)
 8369                 zthr_resume(ll_condense_thread);
 8370 }
 8371 
 8372 static boolean_t
 8373 spa_async_tasks_pending(spa_t *spa)
 8374 {
 8375         uint_t non_config_tasks;
 8376         uint_t config_task;
 8377         boolean_t config_task_suspended;
 8378 
 8379         non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
 8380         config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
 8381         if (spa->spa_ccw_fail_time == 0) {
 8382                 config_task_suspended = B_FALSE;
 8383         } else {
 8384                 config_task_suspended =
 8385                     (gethrtime() - spa->spa_ccw_fail_time) <
 8386                     ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
 8387         }
 8388 
 8389         return (non_config_tasks || (config_task && !config_task_suspended));
 8390 }
 8391 
 8392 static void
 8393 spa_async_dispatch(spa_t *spa)
 8394 {
 8395         mutex_enter(&spa->spa_async_lock);
 8396         if (spa_async_tasks_pending(spa) &&
 8397             !spa->spa_async_suspended &&
 8398             spa->spa_async_thread == NULL)
 8399                 spa->spa_async_thread = thread_create(NULL, 0,
 8400                     spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
 8401         mutex_exit(&spa->spa_async_lock);
 8402 }
 8403 
 8404 void
 8405 spa_async_request(spa_t *spa, int task)
 8406 {
 8407         zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 8408         mutex_enter(&spa->spa_async_lock);
 8409         spa->spa_async_tasks |= task;
 8410         mutex_exit(&spa->spa_async_lock);
 8411 }
 8412 
 8413 int
 8414 spa_async_tasks(spa_t *spa)
 8415 {
 8416         return (spa->spa_async_tasks);
 8417 }
 8418 
 8419 /*
 8420  * ==========================================================================
 8421  * SPA syncing routines
 8422  * ==========================================================================
 8423  */
 8424 
 8425 
 8426 static int
 8427 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 8428     dmu_tx_t *tx)
 8429 {
 8430         bpobj_t *bpo = arg;
 8431         bpobj_enqueue(bpo, bp, bp_freed, tx);
 8432         return (0);
 8433 }
 8434 
 8435 int
 8436 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 8437 {
 8438         return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
 8439 }
 8440 
 8441 int
 8442 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 8443 {
 8444         return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
 8445 }
 8446 
 8447 static int
 8448 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 8449 {
 8450         zio_t *pio = arg;
 8451 
 8452         zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
 8453             pio->io_flags));
 8454         return (0);
 8455 }
 8456 
 8457 static int
 8458 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 8459     dmu_tx_t *tx)
 8460 {
 8461         ASSERT(!bp_freed);
 8462         return (spa_free_sync_cb(arg, bp, tx));
 8463 }
 8464 
 8465 /*
 8466  * Note: this simple function is not inlined to make it easier to dtrace the
 8467  * amount of time spent syncing frees.
 8468  */
 8469 static void
 8470 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
 8471 {
 8472         zio_t *zio = zio_root(spa, NULL, NULL, 0);
 8473         bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
 8474         VERIFY(zio_wait(zio) == 0);
 8475 }
 8476 
 8477 /*
 8478  * Note: this simple function is not inlined to make it easier to dtrace the
 8479  * amount of time spent syncing deferred frees.
 8480  */
 8481 static void
 8482 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
 8483 {
 8484         if (spa_sync_pass(spa) != 1)
 8485                 return;
 8486 
 8487         /*
 8488          * Note:
 8489          * If the log space map feature is active, we stop deferring
 8490          * frees to the next TXG and therefore running this function
 8491          * would be considered a no-op as spa_deferred_bpobj should
 8492          * not have any entries.
 8493          *
 8494          * That said we run this function anyway (instead of returning
 8495          * immediately) for the edge-case scenario where we just
 8496          * activated the log space map feature in this TXG but we have
 8497          * deferred frees from the previous TXG.
 8498          */
 8499         zio_t *zio = zio_root(spa, NULL, NULL, 0);
 8500         VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
 8501             bpobj_spa_free_sync_cb, zio, tx), ==, 0);
 8502         VERIFY0(zio_wait(zio));
 8503 }
 8504 
 8505 static void
 8506 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 8507 {
 8508         char *packed = NULL;
 8509         size_t bufsize;
 8510         size_t nvsize = 0;
 8511         dmu_buf_t *db;
 8512 
 8513         VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
 8514 
 8515         /*
 8516          * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
 8517          * information.  This avoids the dmu_buf_will_dirty() path and
 8518          * saves us a pre-read to get data we don't actually care about.
 8519          */
 8520         bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
 8521         packed = vmem_alloc(bufsize, KM_SLEEP);
 8522 
 8523         VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
 8524             KM_SLEEP) == 0);
 8525         memset(packed + nvsize, 0, bufsize - nvsize);
 8526 
 8527         dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
 8528 
 8529         vmem_free(packed, bufsize);
 8530 
 8531         VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
 8532         dmu_buf_will_dirty(db, tx);
 8533         *(uint64_t *)db->db_data = nvsize;
 8534         dmu_buf_rele(db, FTAG);
 8535 }
 8536 
 8537 static void
 8538 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
 8539     const char *config, const char *entry)
 8540 {
 8541         nvlist_t *nvroot;
 8542         nvlist_t **list;
 8543         int i;
 8544 
 8545         if (!sav->sav_sync)
 8546                 return;
 8547 
 8548         /*
 8549          * Update the MOS nvlist describing the list of available devices.
 8550          * spa_validate_aux() will have already made sure this nvlist is
 8551          * valid and the vdevs are labeled appropriately.
 8552          */
 8553         if (sav->sav_object == 0) {
 8554                 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
 8555                     DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
 8556                     sizeof (uint64_t), tx);
 8557                 VERIFY(zap_update(spa->spa_meta_objset,
 8558                     DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
 8559                     &sav->sav_object, tx) == 0);
 8560         }
 8561 
 8562         nvroot = fnvlist_alloc();
 8563         if (sav->sav_count == 0) {
 8564                 fnvlist_add_nvlist_array(nvroot, config,
 8565                     (const nvlist_t * const *)NULL, 0);
 8566         } else {
 8567                 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
 8568                 for (i = 0; i < sav->sav_count; i++)
 8569                         list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
 8570                             B_FALSE, VDEV_CONFIG_L2CACHE);
 8571                 fnvlist_add_nvlist_array(nvroot, config,
 8572                     (const nvlist_t * const *)list, sav->sav_count);
 8573                 for (i = 0; i < sav->sav_count; i++)
 8574                         nvlist_free(list[i]);
 8575                 kmem_free(list, sav->sav_count * sizeof (void *));
 8576         }
 8577 
 8578         spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
 8579         nvlist_free(nvroot);
 8580 
 8581         sav->sav_sync = B_FALSE;
 8582 }
 8583 
 8584 /*
 8585  * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
 8586  * The all-vdev ZAP must be empty.
 8587  */
 8588 static void
 8589 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
 8590 {
 8591         spa_t *spa = vd->vdev_spa;
 8592 
 8593         if (vd->vdev_top_zap != 0) {
 8594                 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 8595                     vd->vdev_top_zap, tx));
 8596         }
 8597         if (vd->vdev_leaf_zap != 0) {
 8598                 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 8599                     vd->vdev_leaf_zap, tx));
 8600         }
 8601         for (uint64_t i = 0; i < vd->vdev_children; i++) {
 8602                 spa_avz_build(vd->vdev_child[i], avz, tx);
 8603         }
 8604 }
 8605 
/*
 * Sync the pool configuration to the MOS config object as part of 'tx'.
 *
 * Returns without doing anything when the config dirty list is empty and
 * no all-vdev ZAP (AVZ) action is pending.  Otherwise, while holding
 * SCL_STATE as reader, it:
 *   - applies a pending AVZ_ACTION_REBUILD or AVZ_ACTION_DESTROY;
 *   - creates and links an AVZ if the pool has none;
 *   - resets spa_avz_action to AVZ_ACTION_NONE;
 *   - creates ZAPs for vdevs that lack them;
 *   - generates a new config nvlist for this txg.
 * After dropping SCL_STATE, the new nvlist replaces spa_config_syncing and
 * is written to spa_config_object via spa_sync_nvlist().
 */
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
        nvlist_t *config;

        /*
         * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
         * its config may not be dirty but we still need to build per-vdev ZAPs.
         * Similarly, if the pool is being assembled (e.g. after a split), we
         * need to rebuild the AVZ although the config may not be dirty.
         */
        if (list_is_empty(&spa->spa_config_dirty_list) &&
            spa->spa_avz_action == AVZ_ACTION_NONE)
                return;

        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

        /* REBUILD and DESTROY both require an existing AVZ. */
        ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
            spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
            spa->spa_all_vdev_zaps != 0);

        if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
                /* Make and build the new AVZ */
                uint64_t new_avz = zap_create(spa->spa_meta_objset,
                    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
                spa_avz_build(spa->spa_root_vdev, new_avz, tx);

                /* Diff old AVZ with new one */
                zap_cursor_t zc;
                zap_attribute_t za;

                for (zap_cursor_init(&zc, spa->spa_meta_objset,
                    spa->spa_all_vdev_zaps);
                    zap_cursor_retrieve(&zc, &za) == 0;
                    zap_cursor_advance(&zc)) {
                        uint64_t vdzap = za.za_first_integer;
                        if (zap_lookup_int(spa->spa_meta_objset, new_avz,
                            vdzap) == ENOENT) {
                                /*
                                 * ZAP is listed in old AVZ but not in new one;
                                 * destroy it
                                 */
                                VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
                                    tx));
                        }
                }

                zap_cursor_fini(&zc);

                /* Destroy the old AVZ */
                VERIFY0(zap_destroy(spa->spa_meta_objset,
                    spa->spa_all_vdev_zaps, tx));

                /* Replace the old AVZ in the dir obj with the new one */
                VERIFY0(zap_update(spa->spa_meta_objset,
                    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
                    sizeof (new_avz), 1, &new_avz, tx));

                spa->spa_all_vdev_zaps = new_avz;
        } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
                zap_cursor_t zc;
                zap_attribute_t za;

                /* Walk through the AVZ and destroy all listed ZAPs */
                for (zap_cursor_init(&zc, spa->spa_meta_objset,
                    spa->spa_all_vdev_zaps);
                    zap_cursor_retrieve(&zc, &za) == 0;
                    zap_cursor_advance(&zc)) {
                        uint64_t zap = za.za_first_integer;
                        VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
                }

                zap_cursor_fini(&zc);

                /* Destroy and unlink the AVZ itself */
                VERIFY0(zap_destroy(spa->spa_meta_objset,
                    spa->spa_all_vdev_zaps, tx));
                VERIFY0(zap_remove(spa->spa_meta_objset,
                    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
                spa->spa_all_vdev_zaps = 0;
        }

        /*
         * Make sure an AVZ exists.  This also runs right after the DESTROY
         * branch above, which leaves spa_all_vdev_zaps at 0.
         */
        if (spa->spa_all_vdev_zaps == 0) {
                spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
                    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_VDEV_ZAP_MAP, tx);
        }
        spa->spa_avz_action = AVZ_ACTION_NONE;

        /* Create ZAPs for vdevs that don't have them. */
        vdev_construct_zaps(spa->spa_root_vdev, tx);

        config = spa_config_generate(spa, spa->spa_root_vdev,
            dmu_tx_get_txg(tx), B_FALSE);

        /*
         * If we're upgrading the spa version then make sure that
         * the config object gets updated with the correct version.
         */
        if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
                fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
                    spa->spa_uberblock.ub_version);

        spa_config_exit(spa, SCL_STATE, FTAG);

        /* The previously held syncing config is freed and replaced. */
        nvlist_free(spa->spa_config_syncing);
        spa->spa_config_syncing = config;

        spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}
 8716 
 8717 static void
 8718 spa_sync_version(void *arg, dmu_tx_t *tx)
 8719 {
 8720         uint64_t *versionp = arg;
 8721         uint64_t version = *versionp;
 8722         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 8723 
 8724         /*
 8725          * Setting the version is special cased when first creating the pool.
 8726          */
 8727         ASSERT(tx->tx_txg != TXG_INITIAL);
 8728 
 8729         ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 8730         ASSERT(version >= spa_version(spa));
 8731 
 8732         spa->spa_uberblock.ub_version = version;
 8733         vdev_config_dirty(spa->spa_root_vdev);
 8734         spa_history_log_internal(spa, "set", tx, "version=%lld",
 8735             (longlong_t)version);
 8736 }
 8737 
/*
 * Set zpool properties.
 *
 * 'arg' is an nvlist of properties to apply; each pair is handled in turn
 * while spa_props_lock is held:
 *   - names that do not map to a zpool property are treated as feature
 *     properties ("...@<feature>") and enable that feature;
 *   - version, altroot, readonly and cachefile are not stored here;
 *   - comment and compatibility are kept in the spa_t and dirty the vdev
 *     configuration (except in the initial txg);
 *   - every other property is written to the pool props ZAP in the MOS,
 *     and some uint64 properties are also cached in the spa_t.
 * Each change that is applied is recorded in the internal pool history.
 */
static void
spa_sync_props(void *arg, dmu_tx_t *tx)
{
        nvlist_t *nvp = arg;
        spa_t *spa = dmu_tx_pool(tx)->dp_spa;
        objset_t *mos = spa->spa_meta_objset;
        nvpair_t *elem = NULL;

        mutex_enter(&spa->spa_props_lock);

        /* Walk every pair in the nvlist; elem == NULL starts the walk. */
        while ((elem = nvlist_next_nvpair(nvp, elem))) {
                uint64_t intval;
                char *strval, *fname;
                zpool_prop_t prop;
                const char *propname;
                zprop_type_t proptype;
                spa_feature_t fid;

                switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
                case ZPOOL_PROP_INVAL:
                        /*
                         * We checked this earlier in spa_prop_validate().
                         */
                        ASSERT(zpool_prop_feature(nvpair_name(elem)));

                        /* The feature name is the text following the '@'. */
                        fname = strchr(nvpair_name(elem), '@') + 1;
                        VERIFY0(zfeature_lookup_name(fname, &fid));

                        spa_feature_enable(spa, fid, tx);
                        spa_history_log_internal(spa, "set", tx,
                            "%s=enabled", nvpair_name(elem));
                        break;

                case ZPOOL_PROP_VERSION:
                        intval = fnvpair_value_uint64(elem);
                        /*
                         * The version is synced separately before other
                         * properties and should be correct by now.
                         */
                        ASSERT3U(spa_version(spa), >=, intval);
                        break;

                case ZPOOL_PROP_ALTROOT:
                        /*
                         * 'altroot' is a non-persistent property. It should
                         * have been set temporarily at creation or import time.
                         */
                        ASSERT(spa->spa_root != NULL);
                        break;

                case ZPOOL_PROP_READONLY:
                case ZPOOL_PROP_CACHEFILE:
                        /*
                         * 'readonly' and 'cachefile' are also non-persistent
                         * properties.
                         */
                        break;
                case ZPOOL_PROP_COMMENT:
                        strval = fnvpair_value_string(elem);
                        /* Replace any previous comment with a private copy. */
                        if (spa->spa_comment != NULL)
                                spa_strfree(spa->spa_comment);
                        spa->spa_comment = spa_strdup(strval);
                        /*
                         * We need to dirty the configuration on all the vdevs
                         * so that their labels get updated.  We also need to
                         * update the cache file to keep it in sync with the
                         * MOS version. It's unnecessary to do this for pool
                         * creation since the vdev's configuration has already
                         * been dirtied.
                         */
                        if (tx->tx_txg != TXG_INITIAL) {
                                vdev_config_dirty(spa->spa_root_vdev);
                                spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
                        }
                        spa_history_log_internal(spa, "set", tx,
                            "%s=%s", nvpair_name(elem), strval);
                        break;
                case ZPOOL_PROP_COMPATIBILITY:
                        strval = fnvpair_value_string(elem);
                        /* Replace any previous value with a private copy. */
                        if (spa->spa_compatibility != NULL)
                                spa_strfree(spa->spa_compatibility);
                        spa->spa_compatibility = spa_strdup(strval);
                        /*
                         * Dirty the configuration on vdevs as above.
                         */
                        if (tx->tx_txg != TXG_INITIAL) {
                                vdev_config_dirty(spa->spa_root_vdev);
                                spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
                        }

                        spa_history_log_internal(spa, "set", tx,
                            "%s=%s", nvpair_name(elem), strval);
                        break;

                default:
                        /*
                         * Set pool property values in the poolprops mos object.
                         */
                        if (spa->spa_pool_props_object == 0) {
                                /* First stored property: create the ZAP. */
                                spa->spa_pool_props_object =
                                    zap_create_link(mos, DMU_OT_POOL_PROPS,
                                    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
                                    tx);
                        }

                        /* normalize the property name */
                        propname = zpool_prop_to_name(prop);
                        proptype = zpool_prop_get_type(prop);

                        if (nvpair_type(elem) == DATA_TYPE_STRING) {
                                ASSERT(proptype == PROP_TYPE_STRING);
                                strval = fnvpair_value_string(elem);
                                /* Stored as bytes, including the NUL. */
                                VERIFY0(zap_update(mos,
                                    spa->spa_pool_props_object, propname,
                                    1, strlen(strval) + 1, strval, tx));
                                spa_history_log_internal(spa, "set", tx,
                                    "%s=%s", nvpair_name(elem), strval);
                        } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
                                intval = fnvpair_value_uint64(elem);

                                if (proptype == PROP_TYPE_INDEX) {
                                        /*
                                         * Only the return code is checked;
                                         * the resulting string is not used.
                                         */
                                        const char *unused;
                                        VERIFY0(zpool_prop_index_to_string(
                                            prop, intval, &unused));
                                }
                                /* Stored as a single 8-byte integer. */
                                VERIFY0(zap_update(mos,
                                    spa->spa_pool_props_object, propname,
                                    8, 1, &intval, tx));
                                spa_history_log_internal(spa, "set", tx,
                                    "%s=%lld", nvpair_name(elem),
                                    (longlong_t)intval);

                                /* Mirror selected properties into the spa_t. */
                                switch (prop) {
                                case ZPOOL_PROP_DELEGATION:
                                        spa->spa_delegation = intval;
                                        break;
                                case ZPOOL_PROP_BOOTFS:
                                        spa->spa_bootfs = intval;
                                        break;
                                case ZPOOL_PROP_FAILUREMODE:
                                        spa->spa_failmode = intval;
                                        break;
                                case ZPOOL_PROP_AUTOTRIM:
                                        spa->spa_autotrim = intval;
                                        spa_async_request(spa,
                                            SPA_ASYNC_AUTOTRIM_RESTART);
                                        break;
                                case ZPOOL_PROP_AUTOEXPAND:
                                        spa->spa_autoexpand = intval;
                                        if (tx->tx_txg != TXG_INITIAL)
                                                spa_async_request(spa,
                                                    SPA_ASYNC_AUTOEXPAND);
                                        break;
                                case ZPOOL_PROP_MULTIHOST:
                                        spa->spa_multihost = intval;
                                        break;
                                default:
                                        break;
                                }
                        } else {
                                ASSERT(0); /* not allowed */
                        }
                }

        }

        mutex_exit(&spa->spa_props_lock);
}
 8909 
 8910 /*
 8911  * Perform one-time upgrade on-disk changes.  spa_version() does not
 8912  * reflect the new version this txg, so there must be no changes this
 8913  * txg to anything that the upgrade code depends on after it executes.
 8914  * Therefore this must be called after dsl_pool_sync() does the sync
 8915  * tasks.
 8916  */
 8917 static void
 8918 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 8919 {
 8920         if (spa_sync_pass(spa) != 1)
 8921                 return;
 8922 
 8923         dsl_pool_t *dp = spa->spa_dsl_pool;
 8924         rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 8925 
 8926         if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
 8927             spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
 8928                 dsl_pool_create_origin(dp, tx);
 8929 
 8930                 /* Keeping the origin open increases spa_minref */
 8931                 spa->spa_minref += 3;
 8932         }
 8933 
 8934         if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
 8935             spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
 8936                 dsl_pool_upgrade_clones(dp, tx);
 8937         }
 8938 
 8939         if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
 8940             spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
 8941                 dsl_pool_upgrade_dir_clones(dp, tx);
 8942 
 8943                 /* Keeping the freedir open increases spa_minref */
 8944                 spa->spa_minref += 3;
 8945         }
 8946 
 8947         if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
 8948             spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 8949                 spa_feature_create_zap_objects(spa, tx);
 8950         }
 8951 
 8952         /*
 8953          * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
 8954          * when possibility to use lz4 compression for metadata was added
 8955          * Old pools that have this feature enabled must be upgraded to have
 8956          * this feature active
 8957          */
 8958         if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 8959                 boolean_t lz4_en = spa_feature_is_enabled(spa,
 8960                     SPA_FEATURE_LZ4_COMPRESS);
 8961                 boolean_t lz4_ac = spa_feature_is_active(spa,
 8962                     SPA_FEATURE_LZ4_COMPRESS);
 8963 
 8964                 if (lz4_en && !lz4_ac)
 8965                         spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
 8966         }
 8967 
 8968         /*
 8969          * If we haven't written the salt, do so now.  Note that the
 8970          * feature may not be activated yet, but that's fine since
 8971          * the presence of this ZAP entry is backwards compatible.
 8972          */
 8973         if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 8974             DMU_POOL_CHECKSUM_SALT) == ENOENT) {
 8975                 VERIFY0(zap_add(spa->spa_meta_objset,
 8976                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
 8977                     sizeof (spa->spa_cksum_salt.zcs_bytes),
 8978                     spa->spa_cksum_salt.zcs_bytes, tx));
 8979         }
 8980 
 8981         rrw_exit(&dp->dp_config_rwlock, FTAG);
 8982 }
 8983 
/*
 * Consistency checks on the indirect-vdev bookkeeping of 'vd'.  Every
 * check is an assertion; the function makes no other change.
 *
 * Invariants asserted:
 *  - a vdev whose ops are vdev_indirect_ops has both an indirect mapping
 *    and an indirect births object;
 *  - if vdev_obsolete_sm_object() reports a nonzero object, then the
 *    vdev's obsolete space map is open and refers to that same object,
 *    the vdev is either being removed or is indirect, its mapping has at
 *    least one entry and maps a nonzero number of bytes, and the space
 *    map's allocated size does not exceed the bytes mapped;
 *  - the obsolete segments range tree exists and holds no space.
 */
static void
vdev_indirect_state_sync_verify(vdev_t *vd)
{
        /*
         * Marked __maybe_unused because they are referenced only from
         * assertions (presumably compiled out in non-debug builds).
         */
        vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
        vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;

        if (vd->vdev_ops == &vdev_indirect_ops) {
                ASSERT(vim != NULL);
                ASSERT(vib != NULL);
        }

        /* Stays 0 unless vdev_obsolete_sm_object() fills it in. */
        uint64_t obsolete_sm_object = 0;
        ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
        if (obsolete_sm_object != 0) {
                ASSERT(vd->vdev_obsolete_sm != NULL);
                ASSERT(vd->vdev_removing ||
                    vd->vdev_ops == &vdev_indirect_ops);
                ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
                ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
                ASSERT3U(obsolete_sm_object, ==,
                    space_map_object(vd->vdev_obsolete_sm));
                ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
                    space_map_allocated(vd->vdev_obsolete_sm));
        }
        ASSERT(vd->vdev_obsolete_segments != NULL);

        /*
         * Since frees / remaps to an indirect vdev can only
         * happen in syncing context, the obsolete segments
         * tree must be empty when we start syncing.
         */
        ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
}
 9017 
 9018 /*
 9019  * Set the top-level vdev's max queue depth. Evaluate each top-level's
 9020  * async write queue depth in case it changed. The max queue depth will
 9021  * not change in the middle of syncing out this txg.
 9022  */
 9023 static void
 9024 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
 9025 {
 9026         ASSERT(spa_writeable(spa));
 9027 
 9028         vdev_t *rvd = spa->spa_root_vdev;
 9029         uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
 9030             zfs_vdev_queue_depth_pct / 100;
 9031         metaslab_class_t *normal = spa_normal_class(spa);
 9032         metaslab_class_t *special = spa_special_class(spa);
 9033         metaslab_class_t *dedup = spa_dedup_class(spa);
 9034 
 9035         uint64_t slots_per_allocator = 0;
 9036         for (int c = 0; c < rvd->vdev_children; c++) {
 9037                 vdev_t *tvd = rvd->vdev_child[c];
 9038 
 9039                 metaslab_group_t *mg = tvd->vdev_mg;
 9040                 if (mg == NULL || !metaslab_group_initialized(mg))
 9041                         continue;
 9042 
 9043                 metaslab_class_t *mc = mg->mg_class;
 9044                 if (mc != normal && mc != special && mc != dedup)
 9045                         continue;
 9046 
 9047                 /*
 9048                  * It is safe to do a lock-free check here because only async
 9049                  * allocations look at mg_max_alloc_queue_depth, and async
 9050                  * allocations all happen from spa_sync().
 9051                  */
 9052                 for (int i = 0; i < mg->mg_allocators; i++) {
 9053                         ASSERT0(zfs_refcount_count(
 9054                             &(mg->mg_allocator[i].mga_alloc_queue_depth)));
 9055                 }
 9056                 mg->mg_max_alloc_queue_depth = max_queue_depth;
 9057 
 9058                 for (int i = 0; i < mg->mg_allocators; i++) {
 9059                         mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
 9060                             zfs_vdev_def_queue_depth;
 9061                 }
 9062                 slots_per_allocator += zfs_vdev_def_queue_depth;
 9063         }
 9064 
 9065         for (int i = 0; i < spa->spa_alloc_count; i++) {
 9066                 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i].
 9067                     mca_alloc_slots));
 9068                 ASSERT0(zfs_refcount_count(&special->mc_allocator[i].
 9069                     mca_alloc_slots));
 9070                 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i].
 9071                     mca_alloc_slots));
 9072                 normal->mc_allocator[i].mca_alloc_max_slots =
 9073                     slots_per_allocator;
 9074                 special->mc_allocator[i].mca_alloc_max_slots =
 9075                     slots_per_allocator;
 9076                 dedup->mc_allocator[i].mca_alloc_max_slots =
 9077                     slots_per_allocator;
 9078         }
 9079         normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 9080         special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 9081         dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 9082 }
 9083 
 9084 static void
 9085 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
 9086 {
 9087         ASSERT(spa_writeable(spa));
 9088 
 9089         vdev_t *rvd = spa->spa_root_vdev;
 9090         for (int c = 0; c < rvd->vdev_children; c++) {
 9091                 vdev_t *vd = rvd->vdev_child[c];
 9092                 vdev_indirect_state_sync_verify(vd);
 9093 
 9094                 if (vdev_indirect_should_condense(vd)) {
 9095                         spa_condense_indirect_start_sync(vd, tx);
 9096                         break;
 9097                 }
 9098         }
 9099 }
 9100 
/*
 * Sync the pool's dirty state for tx's txg over and over until the MOS is
 * no longer dirty.  Each trip through the loop is one sync pass;
 * spa->spa_sync_pass is incremented at the top of every pass.
 *
 * Blocks queued on this txg's free bplist are freed right away in passes
 * below zfs_sync_pass_deferred_free, or in any pass when the log space map
 * feature is active; otherwise they are moved onto the deferred bpobj.
 *
 * If the first pass leaves the uberblock's root block pointer older than
 * this txg and the MOS clean, the txg is a no-op and the loop exits before
 * the deferred frees are synced.
 */
static void
spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
{
        objset_t *mos = spa->spa_meta_objset;
        dsl_pool_t *dp = spa->spa_dsl_pool;
        uint64_t txg = tx->tx_txg;
        bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];

        do {
                int pass = ++spa->spa_sync_pass;

                spa_sync_config_object(spa, tx);
                spa_sync_aux_dev(spa, &spa->spa_spares, tx,
                    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
                spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
                    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
                spa_errlog_sync(spa, txg);
                dsl_pool_sync(dp, txg);

                if (pass < zfs_sync_pass_deferred_free ||
                    spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
                        /*
                         * If the log space map feature is active we don't
                         * care about deferred frees and the deferred bpobj
                         * as the log space map should effectively have the
                         * same results (i.e. appending only to one object).
                         */
                        spa_sync_frees(spa, free_bpl, tx);
                } else {
                        /*
                         * We can not defer frees in pass 1, because
                         * we sync the deferred frees later in pass 1.
                         */
                        ASSERT3U(pass, >, 1);
                        bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
                            &spa->spa_deferred_bpobj, tx);
                }

                ddt_sync(spa, txg);
                dsl_scan_sync(dp, tx);
                svr_sync(spa, tx);
                spa_sync_upgrades(spa, tx);

                spa_flush_metaslabs(spa, tx);

                /* Drain this txg's vdev list, syncing each vdev on it. */
                vdev_t *vd = NULL;
                while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
                    != NULL)
                        vdev_sync(vd, txg);

                /*
                 * Note: We need to check if the MOS is dirty because we could
                 * have marked the MOS dirty without updating the uberblock
                 * (e.g. if we have sync tasks but no dirty user data). We need
                 * to check the uberblock's rootbp because it is updated if we
                 * have synced out dirty data (though in this case the MOS will
                 * most likely also be dirty due to second order effects, we
                 * don't want to rely on that here).
                 */
                if (pass == 1 &&
                    spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
                    !dmu_objset_is_dirty(mos, txg)) {
                        /*
                         * Nothing changed on the first pass, therefore this
                         * TXG is a no-op. Avoid syncing deferred frees, so
                         * that we can keep this TXG as a no-op.
                         */
                        ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
                        ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
                        ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
                        ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
                        break;
                }

                spa_sync_deferred_frees(spa, tx);
        } while (dmu_objset_is_dirty(mos, txg));
}
 9178 
 9179 /*
 9180  * Rewrite the vdev configuration (which includes the uberblock) to
 9181  * commit the transaction group.
 9182  *
 9183  * If there are no dirty vdevs, we sync the uberblock to a few random
 9184  * top-level vdevs that are known to be visible in the config cache
 9185  * (see spa_vdev_add() for a complete description). If there *are* dirty
 9186  * vdevs, sync the uberblock to all vdevs.
 9187  */
 9188 static void
 9189 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
 9190 {
 9191         vdev_t *rvd = spa->spa_root_vdev;
 9192         uint64_t txg = tx->tx_txg;
 9193 
 9194         for (;;) {
 9195                 int error = 0;
 9196 
 9197                 /*
 9198                  * We hold SCL_STATE to prevent vdev open/close/etc.
 9199                  * while we're attempting to write the vdev labels.
 9200                  */
 9201                 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 9202 
 9203                 if (list_is_empty(&spa->spa_config_dirty_list)) {
 9204                         vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
 9205                         int svdcount = 0;
 9206                         int children = rvd->vdev_children;
 9207                         int c0 = random_in_range(children);
 9208 
 9209                         for (int c = 0; c < children; c++) {
 9210                                 vdev_t *vd =
 9211                                     rvd->vdev_child[(c0 + c) % children];
 9212 
 9213                                 /* Stop when revisiting the first vdev */
 9214                                 if (c > 0 && svd[0] == vd)
 9215                                         break;
 9216 
 9217                                 if (vd->vdev_ms_array == 0 ||
 9218                                     vd->vdev_islog ||
 9219                                     !vdev_is_concrete(vd))
 9220                                         continue;
 9221 
 9222                                 svd[svdcount++] = vd;
 9223                                 if (svdcount == SPA_SYNC_MIN_VDEVS)
 9224                                         break;
 9225                         }
 9226                         error = vdev_config_sync(svd, svdcount, txg);
 9227                 } else {
 9228                         error = vdev_config_sync(rvd->vdev_child,
 9229                             rvd->vdev_children, txg);
 9230                 }
 9231 
 9232                 if (error == 0)
 9233                         spa->spa_last_synced_guid = rvd->vdev_guid;
 9234 
 9235                 spa_config_exit(spa, SCL_STATE, FTAG);
 9236 
 9237                 if (error == 0)
 9238                         break;
 9239                 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
 9240                 zio_resume_wait(spa);
 9241         }
 9242 }
 9243 
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 *
 * Outline, in the order performed below:
 *  1. wait for the txg's open-context root zio and install a fresh one;
 *  2. take SCL_CONFIG as reader (held until step 8);
 *  3. turn pending vdev state changes into config changes;
 *  4. arm the deadman and, on an upgrade that crosses
 *     SPA_VERSION_RAIDZ_DEFLATE, possibly enable spa_deflate;
 *  5. refresh allocation throttle limits, maybe start an indirect
 *     condense, then run the sync passes to convergence;
 *  6. rewrite the vdev config / uberblock and commit the tx;
 *  7. post-sync cleanup (dirty config list, config cache, per-vdev
 *     sync-done processing, metaslab eviction, space statistics);
 *  8. publish the synced uberblock, drop SCL_CONFIG and dispatch any
 *     requested async tasks.
 *
 * The pool must be writeable (verified on entry).
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
        vdev_t *vd = NULL;

        VERIFY(spa_writeable(spa));

        /*
         * Wait for i/os issued in open context that need to complete
         * before this txg syncs.
         */
        (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
        /* Install a new root zio in the slot that was just waited on. */
        spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
            ZIO_FLAG_CANFAIL);

        /*
         * Lock out configuration changes.
         */
        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

        spa->spa_syncing_txg = txg;
        spa->spa_sync_pass = 0;

        /* Every allocator's tree must be empty when the sync starts. */
        for (int i = 0; i < spa->spa_alloc_count; i++) {
                mutex_enter(&spa->spa_allocs[i].spaa_lock);
                VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
                mutex_exit(&spa->spa_allocs[i].spaa_lock);
        }

        /*
         * If there are any pending vdev state changes, convert them
         * into config changes that go out with this transaction group.
         */
        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
        while (list_head(&spa->spa_state_dirty_list) != NULL) {
                /*
                 * We need the write lock here because, for aux vdevs,
                 * calling vdev_config_dirty() modifies sav_config.
                 * This is ugly and will become unnecessary when we
                 * eliminate the aux vdev wart by integrating all vdevs
                 * into the root vdev tree.
                 */
                spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
                spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
                while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
                        vdev_state_clean(vd);
                        vdev_config_dirty(vd);
                }
                spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
                spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
                /*
                 * Both locks were dropped above, so the outer loop
                 * re-checks the list for entries added in the meantime.
                 */
        }
        spa_config_exit(spa, SCL_STATE, FTAG);

        dsl_pool_t *dp = spa->spa_dsl_pool;
        dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);

        /*
         * (Re)arm the deadman: cancel any earlier dispatch and schedule
         * spa_deadman() to run spa_deadman_synctime from now.
         */
        spa->spa_sync_starttime = gethrtime();
        taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
        spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
            spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
            NSEC_TO_TICK(spa->spa_deadman_synctime));

        /*
         * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
         * set spa_deflate if we have no raid-z vdevs.
         */
        if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
            spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
                vdev_t *rvd = spa->spa_root_vdev;

                /*
                 * Stop at the first top-level vdev whose deflate ratio
                 * is not SPA_MINBLOCKSIZE; 'i' reaches vdev_children
                 * only if there is none.
                 */
                int i;
                for (i = 0; i < rvd->vdev_children; i++) {
                        vd = rvd->vdev_child[i];
                        if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
                                break;
                }
                if (i == rvd->vdev_children) {
                        spa->spa_deflate = TRUE;
                        VERIFY0(zap_add(spa->spa_meta_objset,
                            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
                            sizeof (uint64_t), 1, &spa->spa_deflate, tx));
                }
        }

        spa_sync_adjust_vdev_max_queue_depth(spa);

        spa_sync_condense_indirect(spa, tx);

        spa_sync_iterate_to_convergence(spa, tx);

#ifdef ZFS_DEBUG
        if (!list_is_empty(&spa->spa_config_dirty_list)) {
        /*
         * Make sure that the number of ZAPs for all the vdevs matches
         * the number of ZAPs in the per-vdev ZAP list. This only gets
         * called if the config is dirty; otherwise there may be
         * outstanding AVZ operations that weren't completed in
         * spa_sync_config_object.
         */
                uint64_t all_vdev_zap_entry_count;
                ASSERT0(zap_count(spa->spa_meta_objset,
                    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
                ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
                    all_vdev_zap_entry_count);
        }
#endif

        if (spa->spa_vdev_removal != NULL) {
                ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
        }

        spa_sync_rewrite_vdev_config(spa, tx);
        dmu_tx_commit(tx);

        /* The tx is committed; disarm the deadman. */
        taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
        spa->spa_deadman_tqid = 0;

        /*
         * Clear the dirty config list.
         */
        while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
                vdev_config_clean(vd);

        /*
         * Now that the new config has synced transactionally,
         * let it become visible to the config cache.
         */
        if (spa->spa_config_syncing != NULL) {
                spa_config_set(spa, spa->spa_config_syncing);
                spa->spa_config_txg = txg;
                spa->spa_config_syncing = NULL;
        }

        dsl_pool_sync_done(dp, txg);

        /* The allocator trees must be empty again once the sync is done. */
        for (int i = 0; i < spa->spa_alloc_count; i++) {
                mutex_enter(&spa->spa_allocs[i].spaa_lock);
                VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
                mutex_exit(&spa->spa_allocs[i].spaa_lock);
        }

        /*
         * Update usable space statistics.
         */
        while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
            != NULL)
                vdev_sync_done(vd, txg);

        metaslab_class_evict_old(spa->spa_normal_class, txg);
        metaslab_class_evict_old(spa->spa_log_class, txg);

        spa_sync_close_syncing_log_sm(spa);

        spa_update_dspace(spa);

        /*
         * It had better be the case that we didn't dirty anything
         * since vdev_config_sync().
         */
        ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
        ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
        ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

        /*
         * Stall here, one delay(1) at a time, for as long as
         * zfs_pause_spa_sync is nonzero.
         */
        while (zfs_pause_spa_sync)
                delay(1);

        spa->spa_sync_pass = 0;

        /*
         * Update the last synced uberblock here. We want to do this at
         * the end of spa_sync() so that consumers of spa_last_synced_txg()
         * will be guaranteed that all the processing associated with
         * that txg has been completed.
         */
        spa->spa_ubsync = spa->spa_uberblock;
        spa_config_exit(spa, SCL_CONFIG, FTAG);

        spa_handle_ignored_writes(spa);

        /*
         * If any async tasks have been requested, kick them off.
         */
        spa_async_dispatch(spa);
}
 9432 
 9433 /*
 9434  * Sync all pools.  We don't want to hold the namespace lock across these
 9435  * operations, so we take a reference on the spa_t and drop the lock during the
 9436  * sync.
 9437  */
 9438 void
 9439 spa_sync_allpools(void)
 9440 {
 9441         spa_t *spa = NULL;
 9442         mutex_enter(&spa_namespace_lock);
 9443         while ((spa = spa_next(spa)) != NULL) {
 9444                 if (spa_state(spa) != POOL_STATE_ACTIVE ||
 9445                     !spa_writeable(spa) || spa_suspended(spa))
 9446                         continue;
 9447                 spa_open_ref(spa, FTAG);
 9448                 mutex_exit(&spa_namespace_lock);
 9449                 txg_wait_synced(spa_get_dsl(spa), 0);
 9450                 mutex_enter(&spa_namespace_lock);
 9451                 spa_close(spa, FTAG);
 9452         }
 9453         mutex_exit(&spa_namespace_lock);
 9454 }
 9455 
 9456 /*
 9457  * ==========================================================================
 9458  * Miscellaneous routines
 9459  * ==========================================================================
 9460  */
 9461 
 9462 /*
 9463  * Remove all pools in the system.
 9464  */
 9465 void
 9466 spa_evict_all(void)
 9467 {
 9468         spa_t *spa;
 9469 
 9470         /*
 9471          * Remove all cached state.  All pools should be closed now,
 9472          * so every spa in the AVL tree should be unreferenced.
 9473          */
 9474         mutex_enter(&spa_namespace_lock);
 9475         while ((spa = spa_next(NULL)) != NULL) {
 9476                 /*
 9477                  * Stop async tasks.  The async thread may need to detach
 9478                  * a device that's been replaced, which requires grabbing
 9479                  * spa_namespace_lock, so we must drop it here.
 9480                  */
 9481                 spa_open_ref(spa, FTAG);
 9482                 mutex_exit(&spa_namespace_lock);
 9483                 spa_async_suspend(spa);
 9484                 mutex_enter(&spa_namespace_lock);
 9485                 spa_close(spa, FTAG);
 9486 
 9487                 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 9488                         spa_unload(spa);
 9489                         spa_deactivate(spa);
 9490                 }
 9491                 spa_remove(spa);
 9492         }
 9493         mutex_exit(&spa_namespace_lock);
 9494 }
 9495 
 9496 vdev_t *
 9497 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
 9498 {
 9499         vdev_t *vd;
 9500         int i;
 9501 
 9502         if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
 9503                 return (vd);
 9504 
 9505         if (aux) {
 9506                 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
 9507                         vd = spa->spa_l2cache.sav_vdevs[i];
 9508                         if (vd->vdev_guid == guid)
 9509                                 return (vd);
 9510                 }
 9511 
 9512                 for (i = 0; i < spa->spa_spares.sav_count; i++) {
 9513                         vd = spa->spa_spares.sav_vdevs[i];
 9514                         if (vd->vdev_guid == guid)
 9515                                 return (vd);
 9516                 }
 9517         }
 9518 
 9519         return (NULL);
 9520 }
 9521 
/*
 * Set the pool's version to 'version' and wait for the change to sync.
 *
 * The new version is stored in the in-core uberblock and the root vdev's
 * config is marked dirty, both while all config locks (SCL_ALL) are held
 * as writer.  After the locks are dropped the caller blocks in
 * txg_wait_synced().
 *
 * The pool must be writeable and 'version' must not be lower than the
 * current uberblock version; both conditions are asserted only.
 */
void
spa_upgrade(spa_t *spa, uint64_t version)
{
        ASSERT(spa_writeable(spa));

        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

        /*
         * This should only be called for a non-faulted pool, and since a
         * future version would result in an unopenable pool, this shouldn't be
         * possible.
         */
        ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
        ASSERT3U(version, >=, spa->spa_uberblock.ub_version);

        spa->spa_uberblock.ub_version = version;
        /* Dirty the root vdev's config under the same lock hold. */
        vdev_config_dirty(spa->spa_root_vdev);

        spa_config_exit(spa, SCL_ALL, FTAG);

        /* NOTE(review): txg 0 presumably means "the current txg" - confirm. */
        txg_wait_synced(spa_get_dsl(spa), 0);
}
 9544 
 9545 static boolean_t
 9546 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav)
 9547 {
 9548         (void) spa;
 9549         int i;
 9550         uint64_t vdev_guid;
 9551 
 9552         for (i = 0; i < sav->sav_count; i++)
 9553                 if (sav->sav_vdevs[i]->vdev_guid == guid)
 9554                         return (B_TRUE);
 9555 
 9556         for (i = 0; i < sav->sav_npending; i++) {
 9557                 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
 9558                     &vdev_guid) == 0 && vdev_guid == guid)
 9559                         return (B_TRUE);
 9560         }
 9561 
 9562         return (B_FALSE);
 9563 }
 9564 
 9565 boolean_t
 9566 spa_has_l2cache(spa_t *spa, uint64_t guid)
 9567 {
 9568         return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache));
 9569 }
 9570 
 9571 boolean_t
 9572 spa_has_spare(spa_t *spa, uint64_t guid)
 9573 {
 9574         return (spa_has_aux_vdev(spa, guid, &spa->spa_spares));
 9575 }
 9576 
 9577 /*
 9578  * Check if a pool has an active shared spare device.
 9579  * Note: reference count of an active spare is 2, as a spare and as a replace
 9580  */
 9581 static boolean_t
 9582 spa_has_active_shared_spare(spa_t *spa)
 9583 {
 9584         int i, refcnt;
 9585         uint64_t pool;
 9586         spa_aux_vdev_t *sav = &spa->spa_spares;
 9587 
 9588         for (i = 0; i < sav->sav_count; i++) {
 9589                 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
 9590                     &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
 9591                     refcnt > 2)
 9592                         return (B_TRUE);
 9593         }
 9594 
 9595         return (B_FALSE);
 9596 }
 9597 
 9598 uint64_t
 9599 spa_total_metaslabs(spa_t *spa)
 9600 {
 9601         vdev_t *rvd = spa->spa_root_vdev;
 9602 
 9603         uint64_t m = 0;
 9604         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 9605                 vdev_t *vd = rvd->vdev_child[c];
 9606                 if (!vdev_is_concrete(vd))
 9607                         continue;
 9608                 m += vd->vdev_ms_count;
 9609         }
 9610         return (m);
 9611 }
 9612 
 9613 /*
 9614  * Notify any waiting threads that some activity has switched from being in-
 9615  * progress to not-in-progress so that the thread can wake up and determine
 9616  * whether it is finished waiting.
 9617  */
 9618 void
 9619 spa_notify_waiters(spa_t *spa)
 9620 {
 9621         /*
 9622          * Acquiring spa_activities_lock here prevents the cv_broadcast from
 9623          * happening between the waiting thread's check and cv_wait.
 9624          */
 9625         mutex_enter(&spa->spa_activities_lock);
 9626         cv_broadcast(&spa->spa_activities_cv);
 9627         mutex_exit(&spa->spa_activities_lock);
 9628 }
 9629 
 9630 /*
 9631  * Notify any waiting threads that the pool is exporting, and then block until
 9632  * they are finished using the spa_t.
 9633  */
 9634 void
 9635 spa_wake_waiters(spa_t *spa)
 9636 {
 9637         mutex_enter(&spa->spa_activities_lock);
 9638         spa->spa_waiters_cancel = B_TRUE;
 9639         cv_broadcast(&spa->spa_activities_cv);
 9640         while (spa->spa_waiters != 0)
 9641                 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
 9642         spa->spa_waiters_cancel = B_FALSE;
 9643         mutex_exit(&spa->spa_activities_lock);
 9644 }
 9645 
 9646 /* Whether the vdev or any of its descendants are being initialized/trimmed. */
 9647 static boolean_t
 9648 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
 9649 {
 9650         spa_t *spa = vd->vdev_spa;
 9651 
 9652         ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
 9653         ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
 9654         ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
 9655             activity == ZPOOL_WAIT_TRIM);
 9656 
 9657         kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
 9658             &vd->vdev_initialize_lock : &vd->vdev_trim_lock;
 9659 
 9660         mutex_exit(&spa->spa_activities_lock);
 9661         mutex_enter(lock);
 9662         mutex_enter(&spa->spa_activities_lock);
 9663 
 9664         boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
 9665             (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
 9666             (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
 9667         mutex_exit(lock);
 9668 
 9669         if (in_progress)
 9670                 return (B_TRUE);
 9671 
 9672         for (int i = 0; i < vd->vdev_children; i++) {
 9673                 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
 9674                     activity))
 9675                         return (B_TRUE);
 9676         }
 9677 
 9678         return (B_FALSE);
 9679 }
 9680 
 9681 /*
 9682  * If use_guid is true, this checks whether the vdev specified by guid is
 9683  * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
 9684  * is being initialized/trimmed. The caller must hold the config lock and
 9685  * spa_activities_lock.
 9686  */
 9687 static int
 9688 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
 9689     zpool_wait_activity_t activity, boolean_t *in_progress)
 9690 {
 9691         mutex_exit(&spa->spa_activities_lock);
 9692         spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 9693         mutex_enter(&spa->spa_activities_lock);
 9694 
 9695         vdev_t *vd;
 9696         if (use_guid) {
 9697                 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 9698                 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
 9699                         spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 9700                         return (EINVAL);
 9701                 }
 9702         } else {
 9703                 vd = spa->spa_root_vdev;
 9704         }
 9705 
 9706         *in_progress = spa_vdev_activity_in_progress_impl(vd, activity);
 9707 
 9708         spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 9709         return (0);
 9710 }
 9711 
 9712 /*
 9713  * Locking for waiting threads
 9714  * ---------------------------
 9715  *
 9716  * Waiting threads need a way to check whether a given activity is in progress,
 9717  * and then, if it is, wait for it to complete. Each activity will have some
 9718  * in-memory representation of the relevant on-disk state which can be used to
 9719  * determine whether or not the activity is in progress. The in-memory state and
 9720  * the locking used to protect it will be different for each activity, and may
 9721  * not be suitable for use with a cvar (e.g., some state is protected by the
 9722  * config lock). To allow waiting threads to wait without any races, another
 9723  * lock, spa_activities_lock, is used.
 9724  *
 9725  * When the state is checked, both the activity-specific lock (if there is one)
 9726  * and spa_activities_lock are held. In some cases, the activity-specific lock
 9727  * is acquired explicitly (e.g. the config lock). In others, the locking is
 9728  * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
 9729  * thread releases the activity-specific lock and, if the activity is in
 9730  * progress, then cv_waits using spa_activities_lock.
 9731  *
 9732  * The waiting thread is woken when another thread, one completing some
 9733  * activity, updates the state of the activity and then calls
 9734  * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
 9735  * needs to hold its activity-specific lock when updating the state, and this
 9736  * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
 9737  *
 9738  * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
 9739  * and because it is held when the waiting thread checks the state of the
 9740  * activity, it can never be the case that the completing thread both updates
 9741  * the activity state and cv_broadcasts in between the waiting thread's check
 9742  * and cv_wait. Thus, a waiting thread can never miss a wakeup.
 9743  *
 9744  * In order to prevent deadlock, when the waiting thread does its check, in some
 9745  * cases it will temporarily drop spa_activities_lock in order to acquire the
 9746  * activity-specific lock. The order in which spa_activities_lock and the
 * activity-specific lock are acquired in the waiting thread is determined by
 9748  * the order in which they are acquired in the completing thread; if the
 9749  * completing thread calls spa_notify_waiters with the activity-specific lock
 9750  * held, then the waiting thread must also acquire the activity-specific lock
 9751  * first.
 9752  */
 9753 
 9754 static int
 9755 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
 9756     boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
 9757 {
 9758         int error = 0;
 9759 
 9760         ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
 9761 
 9762         switch (activity) {
 9763         case ZPOOL_WAIT_CKPT_DISCARD:
 9764                 *in_progress =
 9765                     (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
 9766                     zap_contains(spa_meta_objset(spa),
 9767                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
 9768                     ENOENT);
 9769                 break;
 9770         case ZPOOL_WAIT_FREE:
 9771                 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
 9772                     !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
 9773                     spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
 9774                     spa_livelist_delete_check(spa));
 9775                 break;
 9776         case ZPOOL_WAIT_INITIALIZE:
 9777         case ZPOOL_WAIT_TRIM:
 9778                 error = spa_vdev_activity_in_progress(spa, use_tag, tag,
 9779                     activity, in_progress);
 9780                 break;
 9781         case ZPOOL_WAIT_REPLACE:
 9782                 mutex_exit(&spa->spa_activities_lock);
 9783                 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 9784                 mutex_enter(&spa->spa_activities_lock);
 9785 
 9786                 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
 9787                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 9788                 break;
 9789         case ZPOOL_WAIT_REMOVE:
 9790                 *in_progress = (spa->spa_removing_phys.sr_state ==
 9791                     DSS_SCANNING);
 9792                 break;
 9793         case ZPOOL_WAIT_RESILVER:
 9794                 if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
 9795                         break;
 9796                 zfs_fallthrough;
 9797         case ZPOOL_WAIT_SCRUB:
 9798         {
 9799                 boolean_t scanning, paused, is_scrub;
 9800                 dsl_scan_t *scn =  spa->spa_dsl_pool->dp_scan;
 9801 
 9802                 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
 9803                 scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
 9804                 paused = dsl_scan_is_paused_scrub(scn);
 9805                 *in_progress = (scanning && !paused &&
 9806                     is_scrub == (activity == ZPOOL_WAIT_SCRUB));
 9807                 break;
 9808         }
 9809         default:
 9810                 panic("unrecognized value for activity %d", activity);
 9811         }
 9812 
 9813         return (error);
 9814 }
 9815 
 9816 static int
 9817 spa_wait_common(const char *pool, zpool_wait_activity_t activity,
 9818     boolean_t use_tag, uint64_t tag, boolean_t *waited)
 9819 {
 9820         /*
 9821          * The tag is used to distinguish between instances of an activity.
 9822          * 'initialize' and 'trim' are the only activities that we use this for.
 9823          * The other activities can only have a single instance in progress in a
 9824          * pool at one time, making the tag unnecessary.
 9825          *
 9826          * There can be multiple devices being replaced at once, but since they
 9827          * all finish once resilvering finishes, we don't bother keeping track
 9828          * of them individually, we just wait for them all to finish.
 9829          */
 9830         if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
 9831             activity != ZPOOL_WAIT_TRIM)
 9832                 return (EINVAL);
 9833 
 9834         if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
 9835                 return (EINVAL);
 9836 
 9837         spa_t *spa;
 9838         int error = spa_open(pool, &spa, FTAG);
 9839         if (error != 0)
 9840                 return (error);
 9841 
 9842         /*
 9843          * Increment the spa's waiter count so that we can call spa_close and
 9844          * still ensure that the spa_t doesn't get freed before this thread is
 9845          * finished with it when the pool is exported. We want to call spa_close
 9846          * before we start waiting because otherwise the additional ref would
 9847          * prevent the pool from being exported or destroyed throughout the
 9848          * potentially long wait.
 9849          */
 9850         mutex_enter(&spa->spa_activities_lock);
 9851         spa->spa_waiters++;
 9852         spa_close(spa, FTAG);
 9853 
 9854         *waited = B_FALSE;
 9855         for (;;) {
 9856                 boolean_t in_progress;
 9857                 error = spa_activity_in_progress(spa, activity, use_tag, tag,
 9858                     &in_progress);
 9859 
 9860                 if (error || !in_progress || spa->spa_waiters_cancel)
 9861                         break;
 9862 
 9863                 *waited = B_TRUE;
 9864 
 9865                 if (cv_wait_sig(&spa->spa_activities_cv,
 9866                     &spa->spa_activities_lock) == 0) {
 9867                         error = EINTR;
 9868                         break;
 9869                 }
 9870         }
 9871 
 9872         spa->spa_waiters--;
 9873         cv_signal(&spa->spa_waiters_cv);
 9874         mutex_exit(&spa->spa_activities_lock);
 9875 
 9876         return (error);
 9877 }
 9878 
 9879 /*
 9880  * Wait for a particular instance of the specified activity to complete, where
 9881  * the instance is identified by 'tag'
 9882  */
 9883 int
 9884 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
 9885     boolean_t *waited)
 9886 {
 9887         return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
 9888 }
 9889 
 9890 /*
 9891  * Wait for all instances of the specified activity complete
 9892  */
 9893 int
 9894 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
 9895 {
 9896 
 9897         return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
 9898 }
 9899 
 9900 sysevent_t *
 9901 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 9902 {
 9903         sysevent_t *ev = NULL;
 9904 #ifdef _KERNEL
 9905         nvlist_t *resource;
 9906 
 9907         resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
 9908         if (resource) {
 9909                 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
 9910                 ev->resource = resource;
 9911         }
 9912 #else
 9913         (void) spa, (void) vd, (void) hist_nvl, (void) name;
 9914 #endif
 9915         return (ev);
 9916 }
 9917 
 9918 void
 9919 spa_event_post(sysevent_t *ev)
 9920 {
 9921 #ifdef _KERNEL
 9922         if (ev) {
 9923                 zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
 9924                 kmem_free(ev, sizeof (*ev));
 9925         }
 9926 #else
 9927         (void) ev;
 9928 #endif
 9929 }
 9930 
 9931 /*
 9932  * Post a zevent corresponding to the given sysevent.   The 'name' must be one
 9933  * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 9934  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 9935  * in the userland libzpool, as we don't want consumers to misinterpret ztest
 9936  * or zdb as real changes.
 9937  */
 9938 void
 9939 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 9940 {
 9941         spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
 9942 }
 9943 
/*
 * Symbols from this file published via EXPORT_SYMBOL, grouped by purpose.
 */

/* state manipulation functions */
EXPORT_SYMBOL(spa_open);
EXPORT_SYMBOL(spa_open_rewind);
EXPORT_SYMBOL(spa_get_stats);
EXPORT_SYMBOL(spa_create);
EXPORT_SYMBOL(spa_import);
EXPORT_SYMBOL(spa_tryimport);
EXPORT_SYMBOL(spa_destroy);
EXPORT_SYMBOL(spa_export);
EXPORT_SYMBOL(spa_reset);
EXPORT_SYMBOL(spa_async_request);
EXPORT_SYMBOL(spa_async_suspend);
EXPORT_SYMBOL(spa_async_resume);
EXPORT_SYMBOL(spa_inject_addref);
EXPORT_SYMBOL(spa_inject_delref);
EXPORT_SYMBOL(spa_scan_stat_init);
EXPORT_SYMBOL(spa_scan_get_stats);

/* device manipulation */
EXPORT_SYMBOL(spa_vdev_add);
EXPORT_SYMBOL(spa_vdev_attach);
EXPORT_SYMBOL(spa_vdev_detach);
EXPORT_SYMBOL(spa_vdev_setpath);
EXPORT_SYMBOL(spa_vdev_setfru);
EXPORT_SYMBOL(spa_vdev_split_mirror);

/* spare state (which is global across all pools) */
EXPORT_SYMBOL(spa_spare_add);
EXPORT_SYMBOL(spa_spare_remove);
EXPORT_SYMBOL(spa_spare_exists);
EXPORT_SYMBOL(spa_spare_activate);

/* L2ARC state (which is global across all pools) */
EXPORT_SYMBOL(spa_l2cache_add);
EXPORT_SYMBOL(spa_l2cache_remove);
EXPORT_SYMBOL(spa_l2cache_exists);
EXPORT_SYMBOL(spa_l2cache_activate);
EXPORT_SYMBOL(spa_l2cache_drop);

/* scanning */
EXPORT_SYMBOL(spa_scan);
EXPORT_SYMBOL(spa_scan_stop);

/* spa syncing */
EXPORT_SYMBOL(spa_sync); /* only for DMU use */
EXPORT_SYMBOL(spa_sync_allpools);

/* properties */
EXPORT_SYMBOL(spa_prop_set);
EXPORT_SYMBOL(spa_prop_get);
EXPORT_SYMBOL(spa_prop_clear_bootfs);

/* asynchronous event notification */
EXPORT_SYMBOL(spa_event_notify);
 9998 
/*
 * Module parameter declarations. Each ZFS_MODULE_PARAM entry names a
 * variable and ends with its human-readable description string.
 *
 * NOTE(review): the leading arguments look like (scope, name prefix, name,
 * type, access mode), with ZMOD_RW / ZMOD_RD presumably meaning read-write /
 * read-only -- confirm against the ZFS_MODULE_PARAM definition.
 */
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
        "log2 fraction of arc that can be used by inflight I/Os when "
        "verifying pool during import");
/* END CSTYLED */

ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
        "Set to traverse metadata on pool import");

ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
        "Set to traverse data on pool import");

ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
        "Print vdev tree to zfs_dbgmsg during pool import");

/* The two zio taskq parameters below are the only ZMOD_RD entries here. */
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
        "Percentage of CPUs to run an IO worker thread");

ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
        "Number of threads per IO worker taskqueue");

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
        "Allow importing pool with up to this number of missing top-level "
        "vdevs (in read-only mode)");
/* END CSTYLED */

/* Livelist condense parameters. */
ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
        ZMOD_RW, "Set the livelist condense zthr to pause");

ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
        ZMOD_RW, "Set the livelist condense synctask to pause");

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel,
        INT, ZMOD_RW,
        "Whether livelist condensing was canceled in the synctask");

ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel,
        INT, ZMOD_RW,
        "Whether livelist condensing was canceled in the zthr function");

ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
        ZMOD_RW,
        "Whether extra ALLOC blkptrs were added to a livelist entry while it "
        "was being condensed");
/* END CSTYLED */
Cache object: 5a8f7720ed483000e99cd389bf0aa085
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/contrib/openzfs/module/zfs/spa.c

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/spa.c