
FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/vdev_initialize.c


    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 
   22 /*
   23  * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
   24  */
   25 
   26 #include <sys/spa.h>
   27 #include <sys/spa_impl.h>
   28 #include <sys/txg.h>
   29 #include <sys/vdev_impl.h>
   30 #include <sys/metaslab_impl.h>
   31 #include <sys/dsl_synctask.h>
   32 #include <sys/zap.h>
   33 #include <sys/dmu_tx.h>
   34 #include <sys/vdev_initialize.h>
   35 
   36 /*
   37  * Value that is written to disk during initialization.
   38  */
   39 static uint64_t zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
   40 
   41 /* maximum number of I/Os outstanding per leaf vdev */
   42 static const int zfs_initialize_limit = 1;
   43 
   44 /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
   45 static uint64_t zfs_initialize_chunk_size = 1024 * 1024;
   46 
   47 static boolean_t
   48 vdev_initialize_should_stop(vdev_t *vd)
   49 {
   50         return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
   51             vd->vdev_detached || vd->vdev_top->vdev_removing);
   52 }
   53 
   54 static void
   55 vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
   56 {
   57         /*
   58          * We pass in the guid instead of the vdev_t since the vdev may
   59          * have been freed prior to the sync task being processed. This
   60          * happens when a vdev is detached as we call spa_config_vdev_exit(),
   61          * stop the initializing thread, schedule the sync task, and free
   62          * the vdev. Later when the scheduled sync task is invoked, it would
   63          * find that the vdev has been freed.
   64          */
   65         uint64_t guid = *(uint64_t *)arg;
   66         uint64_t txg = dmu_tx_get_txg(tx);
   67         kmem_free(arg, sizeof (uint64_t));
   68 
   69         vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
   70         if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
   71                 return;
   72 
   73         uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
   74         vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
   75 
   76         VERIFY(vd->vdev_leaf_zap != 0);
   77 
   78         objset_t *mos = vd->vdev_spa->spa_meta_objset;
   79 
   80         if (last_offset > 0) {
   81                 vd->vdev_initialize_last_offset = last_offset;
   82                 VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
   83                     VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
   84                     sizeof (last_offset), 1, &last_offset, tx));
   85         }
   86         if (vd->vdev_initialize_action_time > 0) {
   87                 uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
   88                 VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
   89                     VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
   90                     1, &val, tx));
   91         }
   92 
   93         uint64_t initialize_state = vd->vdev_initialize_state;
   94         VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
   95             VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
   96             &initialize_state, tx));
   97 }
   98 
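       /*
        * Transition a leaf vdev to a new initializing state.  The new state
        * and an action timestamp are persisted to the vdev's leaf ZAP via a
        * sync task, the transition is logged to the pool history, and
        * pool-activity waiters are notified once the vdev is no longer
        * actively initializing.
        */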
   99 static void
  100 vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
  101 {
  102         ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
  103         spa_t *spa = vd->vdev_spa;
  104 
  105         if (new_state == vd->vdev_initialize_state)
  106                 return;
  107 
  108         /*
   109          * Copy the vd's guid; it will be freed by the sync task.
  110          */
  111         uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
  112         *guid = vd->vdev_guid;
  113 
  114         /*
   115          * Record a new action time unless the previous state was suspended.
  116          */
  117         if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
  118                 vd->vdev_initialize_action_time = gethrestime_sec();
  119         }
  120 
  121         vdev_initializing_state_t old_state = vd->vdev_initialize_state;
  122         vd->vdev_initialize_state = new_state;
  123 
  124         dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
  125         VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
  126         dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
  127             guid, tx);
  128 
  129         switch (new_state) {
  130         case VDEV_INITIALIZE_ACTIVE:
  131                 spa_history_log_internal(spa, "initialize", tx,
  132                     "vdev=%s activated", vd->vdev_path);
  133                 break;
  134         case VDEV_INITIALIZE_SUSPENDED:
  135                 spa_history_log_internal(spa, "initialize", tx,
  136                     "vdev=%s suspended", vd->vdev_path);
  137                 break;
  138         case VDEV_INITIALIZE_CANCELED:
  139                 if (old_state == VDEV_INITIALIZE_ACTIVE ||
  140                     old_state == VDEV_INITIALIZE_SUSPENDED)
  141                         spa_history_log_internal(spa, "initialize", tx,
  142                             "vdev=%s canceled", vd->vdev_path);
  143                 break;
  144         case VDEV_INITIALIZE_COMPLETE:
  145                 spa_history_log_internal(spa, "initialize", tx,
  146                     "vdev=%s complete", vd->vdev_path);
  147                 break;
  148         default:
  149                 panic("invalid state %llu", (unsigned long long)new_state);
  150         }
  151 
  152         dmu_tx_commit(tx);
  153 
  154         if (new_state != VDEV_INITIALIZE_ACTIVE)
  155                 spa_notify_waiters(spa);
  156 }
  157 
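       /*
        * Completion callback for initializing writes.  If the write failed
        * because the vdev became unwritable, roll the per-txg offset back so
        * the region will be re-initialized after a restart; otherwise account
        * the error or the bytes written.  In all cases drop the inflight
        * count, wake any waiters, and release the SCL_STATE_ALL hold taken
        * when the ZIO was issued.
        */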
  158 static void
  159 vdev_initialize_cb(zio_t *zio)
  160 {
  161         vdev_t *vd = zio->io_vd;
  162         mutex_enter(&vd->vdev_initialize_io_lock);
  163         if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
  164                 /*
  165                  * The I/O failed because the vdev was unavailable; roll the
  166                  * last offset back. (This works because spa_sync waits on
  167                  * spa_txg_zio before it runs sync tasks.)
  168                  */
  169                 uint64_t *off =
  170                     &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
  171                 *off = MIN(*off, zio->io_offset);
  172         } else {
  173                 /*
  174                  * Since initializing is best-effort, we ignore I/O errors and
  175                  * rely on vdev_probe to determine if the errors are more
  176                  * critical.
  177                  */
  178                 if (zio->io_error != 0)
  179                         vd->vdev_stat.vs_initialize_errors++;
  180 
  181                 vd->vdev_initialize_bytes_done += zio->io_orig_size;
  182         }
  183         ASSERT3U(vd->vdev_initialize_inflight, >, 0);
  184         vd->vdev_initialize_inflight--;
  185         cv_broadcast(&vd->vdev_initialize_io_cv);
  186         mutex_exit(&vd->vdev_initialize_io_lock);
  187 
  188         spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
  189 }
  190 
  191 /* Takes care of physical writing and limiting # of concurrent ZIOs. */
  192 static int
  193 vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
  194 {
  195         spa_t *spa = vd->vdev_spa;
  196 
  197         /* Limit inflight initializing I/Os */
  198         mutex_enter(&vd->vdev_initialize_io_lock);
  199         while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
  200                 cv_wait(&vd->vdev_initialize_io_cv,
  201                     &vd->vdev_initialize_io_lock);
  202         }
  203         vd->vdev_initialize_inflight++;
  204         mutex_exit(&vd->vdev_initialize_io_lock);
  205 
  206         dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
  207         VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
  208         uint64_t txg = dmu_tx_get_txg(tx);
  209 
  210         spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
  211         mutex_enter(&vd->vdev_initialize_lock);
  212 
  213         if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
  214                 uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
  215                 *guid = vd->vdev_guid;
  216 
  217                 /* This is the first write of this txg. */
  218                 dsl_sync_task_nowait(spa_get_dsl(spa),
  219                     vdev_initialize_zap_update_sync, guid, tx);
  220         }
  221 
  222         /*
  223          * We know the vdev struct will still be around since all
  224          * consumers of vdev_free must stop the initialization first.
  225          */
  226         if (vdev_initialize_should_stop(vd)) {
  227                 mutex_enter(&vd->vdev_initialize_io_lock);
  228                 ASSERT3U(vd->vdev_initialize_inflight, >, 0);
  229                 vd->vdev_initialize_inflight--;
  230                 mutex_exit(&vd->vdev_initialize_io_lock);
  231                 spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
  232                 mutex_exit(&vd->vdev_initialize_lock);
  233                 dmu_tx_commit(tx);
  234                 return (SET_ERROR(EINTR));
  235         }
  236         mutex_exit(&vd->vdev_initialize_lock);
  237 
  238         vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
  239         zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
  240             size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
  241             ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
  242         /* vdev_initialize_cb releases SCL_STATE_ALL */
  243 
  244         dmu_tx_commit(tx);
  245 
  246         return (0);
  247 }
  248 
  249 /*
  250  * Callback to fill each ABD chunk with zfs_initialize_value. len must be
  251  * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
  252  * allocation will guarantee these for us.
  253  */
  254 static int
  255 vdev_initialize_block_fill(void *buf, size_t len, void *unused)
  256 {
  257         (void) unused;
  258 
  259         ASSERT0(len % sizeof (uint64_t));
  260         for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
  261                 *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
  262         }
  263         return (0);
  264 }
  265 
  266 static abd_t *
  267 vdev_initialize_block_alloc(void)
  268 {
  269         /* Allocate ABD for filler data */
  270         abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
  271 
  272         ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
  273         (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
  274             vdev_initialize_block_fill, NULL);
  275 
  276         return (data);
  277 }
  278 
  279 static void
  280 vdev_initialize_block_free(abd_t *data)
  281 {
  282         abd_free(data);
  283 }
  284 
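       /*
        * Walk the vdev's initialize range tree and issue pattern writes for
        * every segment, splitting each segment into chunks of at most
        * zfs_initialize_chunk_size bytes.  For example, with the default
        * 1 MiB chunk size a 2.5 MiB segment needs ((size - 1) / chunk) + 1 = 3
        * writes, the last of which is trimmed to the remaining 0.5 MiB by the
        * MIN() below.
        */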
  285 static int
  286 vdev_initialize_ranges(vdev_t *vd, abd_t *data)
  287 {
  288         range_tree_t *rt = vd->vdev_initialize_tree;
  289         zfs_btree_t *bt = &rt->rt_root;
  290         zfs_btree_index_t where;
  291 
  292         for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
  293             rs = zfs_btree_next(bt, &where, &where)) {
  294                 uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
  295 
  296                 /* Split range into legally-sized physical chunks */
  297                 uint64_t writes_required =
  298                     ((size - 1) / zfs_initialize_chunk_size) + 1;
  299 
  300                 for (uint64_t w = 0; w < writes_required; w++) {
  301                         int error;
  302 
  303                         error = vdev_initialize_write(vd,
  304                             VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
  305                             (w * zfs_initialize_chunk_size),
  306                             MIN(size - (w * zfs_initialize_chunk_size),
  307                             zfs_initialize_chunk_size), data);
  308                         if (error != 0)
  309                                 return (error);
  310                 }
  311         }
  312         return (0);
  313 }
  314 
  315 static void
  316 vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
  317 {
  318         uint64_t *last_rs_end = (uint64_t *)arg;
  319 
  320         if (physical_rs->rs_end > *last_rs_end)
  321                 *last_rs_end = physical_rs->rs_end;
  322 }
  323 
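       /*
        * vdev_xlate_walk() callback: account one physical sub-range of a free
        * segment.  The full size is added to the estimate, and the portion
        * already covered by vdev_initialize_last_offset is credited as done.
        */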
  324 static void
  325 vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
  326 {
  327         vdev_t *vd = (vdev_t *)arg;
  328 
  329         uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
  330         vd->vdev_initialize_bytes_est += size;
  331 
  332         if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
  333                 vd->vdev_initialize_bytes_done += size;
  334         } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
  335             vd->vdev_initialize_last_offset < physical_rs->rs_end) {
  336                 vd->vdev_initialize_bytes_done +=
  337                     vd->vdev_initialize_last_offset - physical_rs->rs_start;
  338         }
  339 }
  340 
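       /*
        * Recompute vdev_initialize_bytes_est and vdev_initialize_bytes_done
        * by walking the top-level vdev's metaslabs.  Free space in metaslabs
        * wholly beyond vdev_initialize_last_offset only adds to the estimate,
        * free space wholly behind it also counts as done, and only the
        * metaslab containing the offset is loaded so its free segments can
        * be walked for an exact figure.
        */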
  341 static void
  342 vdev_initialize_calculate_progress(vdev_t *vd)
  343 {
  344         ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
  345             spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
  346         ASSERT(vd->vdev_leaf_zap != 0);
  347 
  348         vd->vdev_initialize_bytes_est = 0;
  349         vd->vdev_initialize_bytes_done = 0;
  350 
  351         for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
  352                 metaslab_t *msp = vd->vdev_top->vdev_ms[i];
  353                 mutex_enter(&msp->ms_lock);
  354 
  355                 uint64_t ms_free = (msp->ms_size -
  356                     metaslab_allocated_space(msp)) /
  357                     vdev_get_ndisks(vd->vdev_top);
  358 
  359                 /*
  360                  * Convert the metaslab range to a physical range
  361                  * on our vdev. We use this to determine if we are
  362                  * in the middle of this metaslab range.
  363                  */
  364                 range_seg64_t logical_rs, physical_rs, remain_rs;
  365                 logical_rs.rs_start = msp->ms_start;
  366                 logical_rs.rs_end = msp->ms_start + msp->ms_size;
  367 
  368                 /* Metaslab space after this offset has not been initialized */
  369                 vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
  370                 if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
  371                         vd->vdev_initialize_bytes_est += ms_free;
  372                         mutex_exit(&msp->ms_lock);
  373                         continue;
  374                 }
  375 
  376                 /* Metaslab space before this offset has been initialized */
  377                 uint64_t last_rs_end = physical_rs.rs_end;
  378                 if (!vdev_xlate_is_empty(&remain_rs)) {
  379                         vdev_xlate_walk(vd, &remain_rs,
  380                             vdev_initialize_xlate_last_rs_end, &last_rs_end);
  381                 }
  382 
  383                 if (vd->vdev_initialize_last_offset > last_rs_end) {
  384                         vd->vdev_initialize_bytes_done += ms_free;
  385                         vd->vdev_initialize_bytes_est += ms_free;
  386                         mutex_exit(&msp->ms_lock);
  387                         continue;
  388                 }
  389 
  390                 /*
  391                  * If we get here, we're in the middle of initializing this
  392                  * metaslab. Load it and walk the free tree for more accurate
  393                  * progress estimation.
  394                  */
  395                 VERIFY0(metaslab_load(msp));
  396 
  397                 zfs_btree_index_t where;
  398                 range_tree_t *rt = msp->ms_allocatable;
  399                 for (range_seg_t *rs =
  400                     zfs_btree_first(&rt->rt_root, &where); rs;
  401                     rs = zfs_btree_next(&rt->rt_root, &where,
  402                     &where)) {
  403                         logical_rs.rs_start = rs_get_start(rs, rt);
  404                         logical_rs.rs_end = rs_get_end(rs, rt);
  405 
  406                         vdev_xlate_walk(vd, &logical_rs,
  407                             vdev_initialize_xlate_progress, vd);
  408                 }
  409                 mutex_exit(&msp->ms_lock);
  410         }
  411 }
  412 
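       /*
        * Load any previously persisted progress for this vdev.  If an
        * initialization was active or suspended, read the last written offset
        * back from the leaf ZAP, then recompute the progress counters.
        */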
  413 static int
  414 vdev_initialize_load(vdev_t *vd)
  415 {
  416         int err = 0;
  417         ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
  418             spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
  419         ASSERT(vd->vdev_leaf_zap != 0);
  420 
  421         if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
  422             vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
  423                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
  424                     vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
  425                     sizeof (vd->vdev_initialize_last_offset), 1,
  426                     &vd->vdev_initialize_last_offset);
  427                 if (err == ENOENT) {
  428                         vd->vdev_initialize_last_offset = 0;
  429                         err = 0;
  430                 }
  431         }
  432 
  433         vdev_initialize_calculate_progress(vd);
  434         return (err);
  435 }
  436 
  437 static void
  438 vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
  439 {
  440         vdev_t *vd = arg;
  441 
  442         /* Only add segments that we have not visited yet */
  443         if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
  444                 return;
  445 
  446         /* Pick up where we left off mid-range. */
  447         if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
  448                 zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
  449                     "(%llu, %llu)", vd->vdev_path,
  450                     (u_longlong_t)physical_rs->rs_start,
  451                     (u_longlong_t)physical_rs->rs_end,
  452                     (u_longlong_t)vd->vdev_initialize_last_offset,
  453                     (u_longlong_t)physical_rs->rs_end);
  454                 ASSERT3U(physical_rs->rs_end, >,
  455                     vd->vdev_initialize_last_offset);
  456                 physical_rs->rs_start = vd->vdev_initialize_last_offset;
  457         }
  458 
  459         ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
  460 
  461         range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
  462             physical_rs->rs_end - physical_rs->rs_start);
  463 }
  464 
  465 /*
  466  * Convert the logical range into a physical range and add it to our
   467  * range tree.
  468  */
  469 static void
  470 vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
  471 {
  472         vdev_t *vd = arg;
  473         range_seg64_t logical_rs;
  474         logical_rs.rs_start = start;
  475         logical_rs.rs_end = start + size;
  476 
  477         ASSERT(vd->vdev_ops->vdev_op_leaf);
  478         vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
  479 }
  480 
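       /*
        * Body of the per-vdev initializing thread.  It loads saved progress,
        * allocates a buffer filled with zfs_initialize_value, and then walks
        * the top-level vdev's metaslabs: each metaslab is disabled and
        * loaded, its allocatable space is translated into physical ranges,
        * and those ranges are overwritten with the pattern.  On the way out
        * the thread records the final state (COMPLETE, or CANCELED for a
        * faulted vdev), waits for the txg to sync, and clears
        * vdev_initialize_thread.
        */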
  481 static __attribute__((noreturn)) void
  482 vdev_initialize_thread(void *arg)
  483 {
  484         vdev_t *vd = arg;
  485         spa_t *spa = vd->vdev_spa;
  486         int error = 0;
  487         uint64_t ms_count = 0;
  488 
  489         ASSERT(vdev_is_concrete(vd));
  490         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
  491 
  492         vd->vdev_initialize_last_offset = 0;
  493         VERIFY0(vdev_initialize_load(vd));
  494 
  495         abd_t *deadbeef = vdev_initialize_block_alloc();
  496 
  497         vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
  498             0, 0);
  499 
  500         for (uint64_t i = 0; !vd->vdev_detached &&
  501             i < vd->vdev_top->vdev_ms_count; i++) {
  502                 metaslab_t *msp = vd->vdev_top->vdev_ms[i];
  503                 boolean_t unload_when_done = B_FALSE;
  504 
  505                 /*
  506                  * If we've expanded the top-level vdev or it's our
  507                  * first pass, calculate our progress.
  508                  */
  509                 if (vd->vdev_top->vdev_ms_count != ms_count) {
  510                         vdev_initialize_calculate_progress(vd);
  511                         ms_count = vd->vdev_top->vdev_ms_count;
  512                 }
  513 
  514                 spa_config_exit(spa, SCL_CONFIG, FTAG);
  515                 metaslab_disable(msp);
  516                 mutex_enter(&msp->ms_lock);
  517                 if (!msp->ms_loaded && !msp->ms_loading)
  518                         unload_when_done = B_TRUE;
  519                 VERIFY0(metaslab_load(msp));
  520 
  521                 range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
  522                     vd);
  523                 mutex_exit(&msp->ms_lock);
  524 
  525                 error = vdev_initialize_ranges(vd, deadbeef);
  526                 metaslab_enable(msp, B_TRUE, unload_when_done);
  527                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
  528 
  529                 range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
  530                 if (error != 0)
  531                         break;
  532         }
  533 
  534         spa_config_exit(spa, SCL_CONFIG, FTAG);
  535         mutex_enter(&vd->vdev_initialize_io_lock);
  536         while (vd->vdev_initialize_inflight > 0) {
  537                 cv_wait(&vd->vdev_initialize_io_cv,
  538                     &vd->vdev_initialize_io_lock);
  539         }
  540         mutex_exit(&vd->vdev_initialize_io_lock);
  541 
  542         range_tree_destroy(vd->vdev_initialize_tree);
  543         vdev_initialize_block_free(deadbeef);
  544         vd->vdev_initialize_tree = NULL;
  545 
  546         mutex_enter(&vd->vdev_initialize_lock);
  547         if (!vd->vdev_initialize_exit_wanted) {
  548                 if (vdev_writeable(vd)) {
  549                         vdev_initialize_change_state(vd,
  550                             VDEV_INITIALIZE_COMPLETE);
  551                 } else if (vd->vdev_faulted) {
  552                         vdev_initialize_change_state(vd,
  553                             VDEV_INITIALIZE_CANCELED);
  554                 }
  555         }
  556         ASSERT(vd->vdev_initialize_thread != NULL ||
  557             vd->vdev_initialize_inflight == 0);
  558 
  559         /*
  560          * Drop the vdev_initialize_lock while we sync out the
  561          * txg since it's possible that a device might be trying to
  562          * come online and must check to see if it needs to restart an
  563          * initialization. That thread will be holding the spa_config_lock
  564          * which would prevent the txg_wait_synced from completing.
  565          */
  566         mutex_exit(&vd->vdev_initialize_lock);
  567         txg_wait_synced(spa_get_dsl(spa), 0);
  568         mutex_enter(&vd->vdev_initialize_lock);
  569 
  570         vd->vdev_initialize_thread = NULL;
  571         cv_broadcast(&vd->vdev_initialize_cv);
  572         mutex_exit(&vd->vdev_initialize_lock);
  573 
  574         thread_exit();
  575 }
  576 
  577 /*
   578  * Initiates initialization of a device. Caller must hold vdev_initialize_lock.
  579  * Device must be a leaf and not already be initializing.
  580  */
  581 void
  582 vdev_initialize(vdev_t *vd)
  583 {
  584         ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
  585         ASSERT(vd->vdev_ops->vdev_op_leaf);
  586         ASSERT(vdev_is_concrete(vd));
  587         ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
  588         ASSERT(!vd->vdev_detached);
  589         ASSERT(!vd->vdev_initialize_exit_wanted);
  590         ASSERT(!vd->vdev_top->vdev_removing);
  591 
  592         vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
  593         vd->vdev_initialize_thread = thread_create(NULL, 0,
  594             vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
  595 }
  596 
  597 /*
  598  * Wait for the initialize thread to be terminated (cancelled or stopped).
  599  */
  600 static void
  601 vdev_initialize_stop_wait_impl(vdev_t *vd)
  602 {
  603         ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
  604 
  605         while (vd->vdev_initialize_thread != NULL)
  606                 cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
  607 
  608         ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
  609         vd->vdev_initialize_exit_wanted = B_FALSE;
  610 }
  611 
  612 /*
   613  * Wait for the initialize threads of the vdevs on vd_list to cleanly exit.
  614  */
  615 void
  616 vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
  617 {
  618         (void) spa;
  619         vdev_t *vd;
  620 
  621         ASSERT(MUTEX_HELD(&spa_namespace_lock));
  622 
  623         while ((vd = list_remove_head(vd_list)) != NULL) {
  624                 mutex_enter(&vd->vdev_initialize_lock);
  625                 vdev_initialize_stop_wait_impl(vd);
  626                 mutex_exit(&vd->vdev_initialize_lock);
  627         }
  628 }
  629 
  630 /*
  631  * Stop initializing a device, with the resultant initializing state being
  632  * tgt_state.  For blocking behavior pass NULL for vd_list.  Otherwise, when
  633  * a list_t is provided the stopping vdev is inserted in to the list.  Callers
  634  * are then required to call vdev_initialize_stop_wait() to block for all the
  635  * initialization threads to exit.  The caller must hold vdev_initialize_lock
  636  * and must not be writing to the spa config, as the initializing thread may
  637  * try to enter the config as a reader before exiting.
  638  */
  639 void
  640 vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
  641     list_t *vd_list)
  642 {
  643         ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
  644         ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
  645         ASSERT(vd->vdev_ops->vdev_op_leaf);
  646         ASSERT(vdev_is_concrete(vd));
  647 
  648         /*
  649          * Allow cancel requests to proceed even if the initialize thread
  650          * has stopped.
  651          */
  652         if (vd->vdev_initialize_thread == NULL &&
  653             tgt_state != VDEV_INITIALIZE_CANCELED) {
  654                 return;
  655         }
  656 
  657         vdev_initialize_change_state(vd, tgt_state);
  658         vd->vdev_initialize_exit_wanted = B_TRUE;
  659 
  660         if (vd_list == NULL) {
  661                 vdev_initialize_stop_wait_impl(vd);
  662         } else {
  663                 ASSERT(MUTEX_HELD(&spa_namespace_lock));
  664                 list_insert_tail(vd_list, vd);
  665         }
  666 }
  667 
  668 static void
  669 vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
  670     list_t *vd_list)
  671 {
  672         if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
  673                 mutex_enter(&vd->vdev_initialize_lock);
  674                 vdev_initialize_stop(vd, tgt_state, vd_list);
  675                 mutex_exit(&vd->vdev_initialize_lock);
  676                 return;
  677         }
  678 
  679         for (uint64_t i = 0; i < vd->vdev_children; i++) {
  680                 vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
  681                     vd_list);
  682         }
  683 }
  684 
  685 /*
  686  * Convenience function to stop initializing of a vdev tree and set all
  687  * initialize thread pointers to NULL.
  688  */
  689 void
  690 vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
  691 {
  692         spa_t *spa = vd->vdev_spa;
  693         list_t vd_list;
  694 
  695         ASSERT(MUTEX_HELD(&spa_namespace_lock));
  696 
  697         list_create(&vd_list, sizeof (vdev_t),
  698             offsetof(vdev_t, vdev_initialize_node));
  699 
  700         vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
  701         vdev_initialize_stop_wait(spa, &vd_list);
  702 
  703         if (vd->vdev_spa->spa_sync_on) {
  704                 /* Make sure that our state has been synced to disk */
  705                 txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
  706         }
  707 
  708         list_destroy(&vd_list);
  709 }
  710 
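       /*
        * Restore initializing state from the leaf ZAPs of a vdev tree, e.g.
        * when a pool is loaded or a device comes back online.  A vdev that
        * was actively initializing is restarted; a suspended or offline vdev
        * only has its progress loaded for reporting.
        */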
  711 void
  712 vdev_initialize_restart(vdev_t *vd)
  713 {
  714         ASSERT(MUTEX_HELD(&spa_namespace_lock));
  715         ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
  716 
  717         if (vd->vdev_leaf_zap != 0) {
  718                 mutex_enter(&vd->vdev_initialize_lock);
  719                 uint64_t initialize_state = VDEV_INITIALIZE_NONE;
  720                 int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
  721                     vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
  722                     sizeof (initialize_state), 1, &initialize_state);
  723                 ASSERT(err == 0 || err == ENOENT);
  724                 vd->vdev_initialize_state = initialize_state;
  725 
  726                 uint64_t timestamp = 0;
  727                 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
  728                     vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
  729                     sizeof (timestamp), 1, &timestamp);
  730                 ASSERT(err == 0 || err == ENOENT);
  731                 vd->vdev_initialize_action_time = timestamp;
  732 
  733                 if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
  734                     vd->vdev_offline) {
  735                         /* load progress for reporting, but don't resume */
  736                         VERIFY0(vdev_initialize_load(vd));
  737                 } else if (vd->vdev_initialize_state ==
  738                     VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
  739                     !vd->vdev_top->vdev_removing &&
  740                     vd->vdev_initialize_thread == NULL) {
  741                         vdev_initialize(vd);
  742                 }
  743 
  744                 mutex_exit(&vd->vdev_initialize_lock);
  745         }
  746 
  747         for (uint64_t i = 0; i < vd->vdev_children; i++) {
  748                 vdev_initialize_restart(vd->vdev_child[i]);
  749         }
  750 }
  751 
  752 EXPORT_SYMBOL(vdev_initialize);
  753 EXPORT_SYMBOL(vdev_initialize_stop);
  754 EXPORT_SYMBOL(vdev_initialize_stop_all);
  755 EXPORT_SYMBOL(vdev_initialize_stop_wait);
  756 EXPORT_SYMBOL(vdev_initialize_restart);
  757 
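       /*
        * The pattern value and chunk size declared at the top of this file
        * are exposed below as read/write module parameters, so they can be
        * adjusted at module load time or at runtime.  Initialization itself
        * is driven from userspace by the zpool initialize command, which can
        * also suspend (-s) or cancel (-c) an in-progress run through the stop
        * entry points above.
        */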
  758 ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, U64, ZMOD_RW,
  759         "Value written during zpool initialize");
  760 
  761 ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, U64, ZMOD_RW,
  762         "Size in bytes of writes by zpool initialize");



This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.