FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/vdev_mirror.c


    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
   23  * Use is subject to license terms.
   24  */
   25 
   26 /*
   27  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
   28  */
   29 
   30 #include <sys/zfs_context.h>
   31 #include <sys/spa.h>
   32 #include <sys/spa_impl.h>
   33 #include <sys/dsl_pool.h>
   34 #include <sys/dsl_scan.h>
   35 #include <sys/vdev_impl.h>
   36 #include <sys/vdev_draid.h>
   37 #include <sys/zio.h>
   38 #include <sys/zio_checksum.h>
   39 #include <sys/abd.h>
   40 #include <sys/fs/zfs.h>
   41 
   42 /*
   43  * Vdev mirror kstats
   44  */
   45 static kstat_t *mirror_ksp = NULL;
   46 
   47 typedef struct mirror_stats {
   48         kstat_named_t vdev_mirror_stat_rotating_linear;
   49         kstat_named_t vdev_mirror_stat_rotating_offset;
   50         kstat_named_t vdev_mirror_stat_rotating_seek;
   51         kstat_named_t vdev_mirror_stat_non_rotating_linear;
   52         kstat_named_t vdev_mirror_stat_non_rotating_seek;
   53 
   54         kstat_named_t vdev_mirror_stat_preferred_found;
   55         kstat_named_t vdev_mirror_stat_preferred_not_found;
   56 } mirror_stats_t;
   57 
   58 static mirror_stats_t mirror_stats = {
    59         /* New I/O directly follows the last I/O */
   60         { "rotating_linear",                    KSTAT_DATA_UINT64 },
   61         /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
   62         { "rotating_offset",                    KSTAT_DATA_UINT64 },
   63         /* New I/O requires random seek */
   64         { "rotating_seek",                      KSTAT_DATA_UINT64 },
    65         /* New I/O directly follows the last I/O (nonrot) */
   66         { "non_rotating_linear",                KSTAT_DATA_UINT64 },
   67         /* New I/O requires random seek (nonrot) */
   68         { "non_rotating_seek",                  KSTAT_DATA_UINT64 },
   69         /* Preferred child vdev found */
   70         { "preferred_found",                    KSTAT_DATA_UINT64 },
   71         /* Preferred child vdev not found or equal load  */
   72         { "preferred_not_found",                KSTAT_DATA_UINT64 },
   73 
   74 };
   75 
   76 #define MIRROR_STAT(stat)               (mirror_stats.stat.value.ui64)
   77 #define MIRROR_INCR(stat, val)          atomic_add_64(&MIRROR_STAT(stat), val)
   78 #define MIRROR_BUMP(stat)               MIRROR_INCR(stat, 1)
   79 
   80 void
   81 vdev_mirror_stat_init(void)
   82 {
   83         mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
   84             "misc", KSTAT_TYPE_NAMED,
   85             sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
   86         if (mirror_ksp != NULL) {
   87                 mirror_ksp->ks_data = &mirror_stats;
   88                 kstat_install(mirror_ksp);
   89         }
   90 }
   91 
   92 void
   93 vdev_mirror_stat_fini(void)
   94 {
   95         if (mirror_ksp != NULL) {
   96                 kstat_delete(mirror_ksp);
   97                 mirror_ksp = NULL;
   98         }
   99 }
  100 
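       /*
        * An illustrative sketch of inspecting these counters at run time:
        * on Linux the kstat created above is typically exported as the file
        * /proc/spl/kstat/zfs/vdev_mirror_stats, and on FreeBSD it should
        * surface as the sysctl node kstat.zfs.misc.vdev_mirror_stats, e.g.:
        *
        *      # cat /proc/spl/kstat/zfs/vdev_mirror_stats
        *      # sysctl kstat.zfs.misc.vdev_mirror_stats
        *
        * Both locations are assumptions based on how other "zfs"/"misc"
        * kstats (such as arcstats) are exposed; verify on the target system.
        */
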
  101 /*
  102  * Virtual device vector for mirroring.
  103  */
  104 typedef struct mirror_child {
  105         vdev_t          *mc_vd;
  106         abd_t           *mc_abd;
  107         uint64_t        mc_offset;
  108         int             mc_error;
  109         int             mc_load;
  110         uint8_t         mc_tried;
  111         uint8_t         mc_skipped;
  112         uint8_t         mc_speculative;
  113         uint8_t         mc_rebuilding;
  114 } mirror_child_t;
  115 
  116 typedef struct mirror_map {
  117         int             *mm_preferred;
  118         int             mm_preferred_cnt;
  119         int             mm_children;
  120         boolean_t       mm_resilvering;
  121         boolean_t       mm_rebuilding;
  122         boolean_t       mm_root;
  123         mirror_child_t  mm_child[];
  124 } mirror_map_t;
  125 
  126 static const int vdev_mirror_shift = 21;
  127 
  128 /*
  129  * The load configuration settings below are tuned by default for
  130  * the case where all devices are of the same rotational type.
  131  *
  132  * If there is a mixture of rotating and non-rotating media, setting
   133  * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results,
   134  * as it will direct more reads to the non-rotating vdevs, which are likely
   135  * to offer higher performance.
  136  */
  137 
  138 /* Rotating media load calculation configuration. */
  139 static int zfs_vdev_mirror_rotating_inc = 0;
  140 static int zfs_vdev_mirror_rotating_seek_inc = 5;
  141 static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
  142 
  143 /* Non-rotating media load calculation configuration. */
  144 static int zfs_vdev_mirror_non_rotating_inc = 0;
  145 static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
  146 
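       /*
        * A minimal sketch of applying the mixed-media tuning suggested in the
        * comment above, using the parameter names declared via
        * ZFS_MODULE_PARAM() at the bottom of this file (the exact paths are
        * assumptions and should be verified on the target platform):
        *
        *      Linux:   echo 0 > /sys/module/zfs/parameters/zfs_vdev_mirror_non_rotating_seek_inc
        *      FreeBSD: sysctl vfs.zfs.vdev.mirror.non_rotating_seek_inc=0
        *
        * This removes the seek increment for non-rotating children, biasing
        * reads toward them; the defaults above assume a homogeneous mirror.
        */
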
  147 static inline size_t
  148 vdev_mirror_map_size(int children)
  149 {
  150         return (offsetof(mirror_map_t, mm_child[children]) +
  151             sizeof (int) * children);
  152 }
  153 
  154 static inline mirror_map_t *
  155 vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
  156 {
  157         mirror_map_t *mm;
  158 
  159         mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
  160         mm->mm_children = children;
  161         mm->mm_resilvering = resilvering;
  162         mm->mm_root = root;
  163         mm->mm_preferred = (int *)((uintptr_t)mm +
  164             offsetof(mirror_map_t, mm_child[children]));
  165 
  166         return (mm);
  167 }
  168 
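       /*
        * Worked example of the single-allocation layout used above: for
        * children == 2, vdev_mirror_map_size() returns
        * offsetof(mirror_map_t, mm_child[2]) + 2 * sizeof (int), so one
        * kmem_zalloc() holds the map header, two mirror_child_t slots in the
        * flexible mm_child[] array, and the two-entry mm_preferred[] index
        * array, which vdev_mirror_map_alloc() places immediately after
        * mm_child[2].
        */
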
  169 static void
  170 vdev_mirror_map_free(zio_t *zio)
  171 {
  172         mirror_map_t *mm = zio->io_vsd;
  173 
  174         kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
  175 }
  176 
  177 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
  178         .vsd_free = vdev_mirror_map_free,
  179 };
  180 
  181 static int
  182 vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
  183 {
  184         uint64_t last_offset;
  185         int64_t offset_diff;
  186         int load;
  187 
  188         /* All DVAs have equal weight at the root. */
  189         if (mm->mm_root)
  190                 return (INT_MAX);
  191 
  192         /*
   193          * We don't return INT_MAX if the device is resilvering (i.e.
   194          * vdev_resilver_txg != 0): when tested, overall performance was
   195          * slightly worse with that penalty applied than without it.
  196          */
  197 
  198         /* Fix zio_offset for leaf vdevs */
  199         if (vd->vdev_ops->vdev_op_leaf)
  200                 zio_offset += VDEV_LABEL_START_SIZE;
  201 
  202         /* Standard load based on pending queue length. */
  203         load = vdev_queue_length(vd);
  204         last_offset = vdev_queue_last_offset(vd);
  205 
  206         if (vd->vdev_nonrot) {
  207                 /* Non-rotating media. */
  208                 if (last_offset == zio_offset) {
  209                         MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
  210                         return (load + zfs_vdev_mirror_non_rotating_inc);
  211                 }
  212 
  213                 /*
  214                  * Apply a seek penalty even for non-rotating devices as
   215                  * sequential I/Os can be aggregated into fewer operations on
  216                  * the device, thus avoiding unnecessary per-command overhead
  217                  * and boosting performance.
  218                  */
  219                 MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
  220                 return (load + zfs_vdev_mirror_non_rotating_seek_inc);
  221         }
  222 
   223         /* Rotating media I/Os which directly follow the last I/O. */
  224         if (last_offset == zio_offset) {
  225                 MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
  226                 return (load + zfs_vdev_mirror_rotating_inc);
  227         }
  228 
  229         /*
   230          * Apply half the seek increment to I/Os within the seek offset
   231          * of the last I/O issued to this vdev, as they should incur less
   232          * of a seek penalty.
  233          */
  234         offset_diff = (int64_t)(last_offset - zio_offset);
  235         if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
  236                 MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
  237                 return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
  238         }
  239 
   240         /* Apply the full seek increment to all other I/Os. */
  241         MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
  242         return (load + zfs_vdev_mirror_rotating_seek_inc);
  243 }
  244 
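       /*
        * Worked example of the load values computed above, using the default
        * increments and a child with 3 pending I/Os: a non-rotating child
        * returns 3 for a linear read (3 + 0) and 4 otherwise (3 + 1); a
        * rotating child returns 3 for a linear read, 5 (3 + 5 / 2) when the
        * new offset is within 1 MiB of the last I/O, and 8 (3 + 5) for a
        * full seek.  Lower values win in vdev_mirror_child_select().
        */
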
  245 static boolean_t
  246 vdev_mirror_rebuilding(vdev_t *vd)
  247 {
  248         if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
  249                 return (B_TRUE);
  250 
  251         for (int i = 0; i < vd->vdev_children; i++) {
  252                 if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
  253                         return (B_TRUE);
  254                 }
  255         }
  256 
  257         return (B_FALSE);
  258 }
  259 
  260 /*
  261  * Avoid inlining the function to keep vdev_mirror_io_start(), which
   262  * is this function's only caller, as small as possible on the stack.
  263  */
  264 noinline static mirror_map_t *
  265 vdev_mirror_map_init(zio_t *zio)
  266 {
  267         mirror_map_t *mm = NULL;
  268         mirror_child_t *mc;
  269         vdev_t *vd = zio->io_vd;
  270         int c;
  271 
  272         if (vd == NULL) {
  273                 dva_t *dva = zio->io_bp->blk_dva;
  274                 spa_t *spa = zio->io_spa;
  275                 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
  276                 dva_t dva_copy[SPA_DVAS_PER_BP];
  277 
  278                 /*
  279                  * The sequential scrub code sorts and issues all DVAs
  280                  * of a bp separately. Each of these IOs includes all
  281                  * original DVA copies so that repairs can be performed
  282                  * in the event of an error, but we only actually want
  283                  * to check the first DVA since the others will be
  284                  * checked by their respective sorted IOs. Only if we
  285                  * hit an error will we try all DVAs upon retrying.
  286                  *
  287                  * Note: This check is safe even if the user switches
  288                  * from a legacy scrub to a sequential one in the middle
  289                  * of processing, since scn_is_sorted isn't updated until
  290                  * all outstanding IOs from the previous scrub pass
  291                  * complete.
  292                  */
  293                 if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
  294                     !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
  295                     dsl_scan_scrubbing(spa->spa_dsl_pool) &&
  296                     scn->scn_is_sorted) {
  297                         c = 1;
  298                 } else {
  299                         c = BP_GET_NDVAS(zio->io_bp);
  300                 }
  301 
  302                 /*
  303                  * If the pool cannot be written to, then infer that some
  304                  * DVAs might be invalid or point to vdevs that do not exist.
  305                  * We skip them.
  306                  */
  307                 if (!spa_writeable(spa)) {
  308                         ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
  309                         int j = 0;
  310                         for (int i = 0; i < c; i++) {
  311                                 if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
  312                                         dva_copy[j++] = dva[i];
  313                         }
  314                         if (j == 0) {
  315                                 zio->io_vsd = NULL;
  316                                 zio->io_error = ENXIO;
  317                                 return (NULL);
  318                         }
  319                         if (j < c) {
  320                                 dva = dva_copy;
  321                                 c = j;
  322                         }
  323                 }
  324 
  325                 mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
  326                 for (c = 0; c < mm->mm_children; c++) {
  327                         mc = &mm->mm_child[c];
  328 
  329                         mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
  330                         mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
  331                         if (mc->mc_vd == NULL) {
  332                                 kmem_free(mm, vdev_mirror_map_size(
  333                                     mm->mm_children));
  334                                 zio->io_vsd = NULL;
  335                                 zio->io_error = ENXIO;
  336                                 return (NULL);
  337                         }
  338                 }
  339         } else {
  340                 /*
  341                  * If we are resilvering, then we should handle scrub reads
  342                  * differently; we shouldn't issue them to the resilvering
  343                  * device because it might not have those blocks.
  344                  *
  345                  * We are resilvering iff:
   346                  * 1) We are a replacing vdev (i.e. our name is "replacing-1" or
  347                  *    "spare-1" or something like that), and
  348                  * 2) The pool is currently being resilvered.
  349                  *
  350                  * We cannot simply check vd->vdev_resilver_txg, because it's
  351                  * not set in this path.
  352                  *
  353                  * Nor can we just check our vdev_ops; there are cases (such as
  354                  * when a user types "zpool replace pool odev spare_dev" and
  355                  * spare_dev is in the spare list, or when a spare device is
  356                  * automatically used to replace a DEGRADED device) when
  357                  * resilvering is complete but both the original vdev and the
  358                  * spare vdev remain in the pool.  That behavior is intentional.
  359                  * It helps implement the policy that a spare should be
  360                  * automatically removed from the pool after the user replaces
  361                  * the device that originally failed.
  362                  *
  363                  * If a spa load is in progress, then spa_dsl_pool may be
  364                  * uninitialized.  But we shouldn't be resilvering during a spa
  365                  * load anyway.
  366                  */
  367                 boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
  368                     vd->vdev_ops == &vdev_spare_ops) &&
  369                     spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
  370                     dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
  371                 mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
  372                     B_FALSE);
  373                 for (c = 0; c < mm->mm_children; c++) {
  374                         mc = &mm->mm_child[c];
  375                         mc->mc_vd = vd->vdev_child[c];
  376                         mc->mc_offset = zio->io_offset;
  377 
  378                         if (vdev_mirror_rebuilding(mc->mc_vd))
  379                                 mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
  380                 }
  381         }
  382 
  383         return (mm);
  384 }
  385 
  386 static int
  387 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
  388     uint64_t *logical_ashift, uint64_t *physical_ashift)
  389 {
  390         int numerrors = 0;
  391         int lasterror = 0;
  392 
  393         if (vd->vdev_children == 0) {
  394                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
  395                 return (SET_ERROR(EINVAL));
  396         }
  397 
  398         vdev_open_children(vd);
  399 
  400         for (int c = 0; c < vd->vdev_children; c++) {
  401                 vdev_t *cvd = vd->vdev_child[c];
  402 
  403                 if (cvd->vdev_open_error) {
  404                         lasterror = cvd->vdev_open_error;
  405                         numerrors++;
  406                         continue;
  407                 }
  408 
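                       /*
                        * Note: if *asize starts at 0, (*asize - 1) wraps to
                        * UINT64_MAX under unsigned arithmetic, so the first
                        * healthy child establishes the size and subsequent
                        * children can only shrink it to the smallest member.
                        */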
  409                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
  410                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
  411                 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
  412         }
  413         for (int c = 0; c < vd->vdev_children; c++) {
  414                 vdev_t *cvd = vd->vdev_child[c];
  415 
  416                 if (cvd->vdev_open_error)
  417                         continue;
  418                 *physical_ashift = vdev_best_ashift(*logical_ashift,
  419                     *physical_ashift, cvd->vdev_physical_ashift);
  420         }
  421 
  422         if (numerrors == vd->vdev_children) {
  423                 if (vdev_children_are_offline(vd))
  424                         vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
  425                 else
  426                         vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
  427                 return (lasterror);
  428         }
  429 
  430         return (0);
  431 }
  432 
  433 static void
  434 vdev_mirror_close(vdev_t *vd)
  435 {
  436         for (int c = 0; c < vd->vdev_children; c++)
  437                 vdev_close(vd->vdev_child[c]);
  438 }
  439 
  440 static void
  441 vdev_mirror_child_done(zio_t *zio)
  442 {
  443         mirror_child_t *mc = zio->io_private;
  444 
  445         mc->mc_error = zio->io_error;
  446         mc->mc_tried = 1;
  447         mc->mc_skipped = 0;
  448 }
  449 
  450 /*
  451  * Check the other, lower-index DVAs to see if they're on the same
  452  * vdev as the child we picked.  If they are, use them since they
  453  * are likely to have been allocated from the primary metaslab in
  454  * use at the time, and hence are more likely to have locality with
  455  * single-copy data.
  456  */
  457 static int
  458 vdev_mirror_dva_select(zio_t *zio, int p)
  459 {
  460         dva_t *dva = zio->io_bp->blk_dva;
  461         mirror_map_t *mm = zio->io_vsd;
  462         int preferred;
  463         int c;
  464 
  465         preferred = mm->mm_preferred[p];
  466         for (p--; p >= 0; p--) {
  467                 c = mm->mm_preferred[p];
  468                 if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
  469                         preferred = c;
  470         }
  471         return (preferred);
  472 }
  473 
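       /*
        * Worked example for the DVA selection above: if mm_preferred[] holds
        * DVA indices {0, 2} and both DVAs live on the same top-level vdev,
        * index 0 is returned even when index 2 was the randomized pick, since
        * the lower-index copy is the one more likely to share locality with
        * single-copy data.
        */
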
  474 static int
  475 vdev_mirror_preferred_child_randomize(zio_t *zio)
  476 {
  477         mirror_map_t *mm = zio->io_vsd;
  478         int p;
  479 
  480         if (mm->mm_root) {
  481                 p = random_in_range(mm->mm_preferred_cnt);
  482                 return (vdev_mirror_dva_select(zio, p));
  483         }
  484 
  485         /*
  486          * To ensure we don't always favour the first matching vdev,
   487          * which could lead to wear leveling issues on SSDs, we
   488          * use the I/O offset as a pseudo-random seed into the vdevs
  489          * which have the lowest load.
  490          */
  491         p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
  492         return (mm->mm_preferred[p]);
  493 }
  494 
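       /*
        * Worked example for the offset-based selection above: with
        * vdev_mirror_shift = 21 the offset is divided into 2 MiB bands, so an
        * I/O at offset 10 MiB yields (10 MiB >> 21) = 5 and, with two equally
        * loaded children, 5 % 2 selects mm_preferred[1].  Nearby offsets thus
        * tend to stay on one child while distant regions spread the load
        * across the mirror.
        */
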
  495 static boolean_t
  496 vdev_mirror_child_readable(mirror_child_t *mc)
  497 {
  498         vdev_t *vd = mc->mc_vd;
  499 
  500         if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
  501                 return (vdev_draid_readable(vd, mc->mc_offset));
  502         else
  503                 return (vdev_readable(vd));
  504 }
  505 
  506 static boolean_t
  507 vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
  508 {
  509         vdev_t *vd = mc->mc_vd;
  510 
  511         if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
  512                 return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
  513         else
  514                 return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
  515 }
  516 
  517 /*
   518  * Try to find a vdev whose DTL doesn't contain the block we want to read,
   519  * preferring vdevs based on the determined load. If we can't, try the read on
  520  * any vdev we haven't already tried.
  521  *
  522  * Distributed spares are an exception to the above load rule. They are
   523  * always preferred in order to detect gaps in the distributed spare, which
   524  * are created when another disk in the dRAID fails. In order to restore
   525  * redundancy, those gaps must be read to trigger the required repair IO.
  526  */
  527 static int
  528 vdev_mirror_child_select(zio_t *zio)
  529 {
  530         mirror_map_t *mm = zio->io_vsd;
  531         uint64_t txg = zio->io_txg;
  532         int c, lowest_load;
  533 
  534         ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
  535 
  536         lowest_load = INT_MAX;
  537         mm->mm_preferred_cnt = 0;
  538         for (c = 0; c < mm->mm_children; c++) {
  539                 mirror_child_t *mc;
  540 
  541                 mc = &mm->mm_child[c];
  542                 if (mc->mc_tried || mc->mc_skipped)
  543                         continue;
  544 
  545                 if (mc->mc_vd == NULL ||
  546                     !vdev_mirror_child_readable(mc)) {
  547                         mc->mc_error = SET_ERROR(ENXIO);
  548                         mc->mc_tried = 1;       /* don't even try */
  549                         mc->mc_skipped = 1;
  550                         continue;
  551                 }
  552 
  553                 if (vdev_mirror_child_missing(mc, txg, 1)) {
  554                         mc->mc_error = SET_ERROR(ESTALE);
  555                         mc->mc_skipped = 1;
  556                         mc->mc_speculative = 1;
  557                         continue;
  558                 }
  559 
  560                 if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
  561                         mm->mm_preferred[0] = c;
  562                         mm->mm_preferred_cnt = 1;
  563                         break;
  564                 }
  565 
  566                 mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
  567                 if (mc->mc_load > lowest_load)
  568                         continue;
  569 
  570                 if (mc->mc_load < lowest_load) {
  571                         lowest_load = mc->mc_load;
  572                         mm->mm_preferred_cnt = 0;
  573                 }
  574                 mm->mm_preferred[mm->mm_preferred_cnt] = c;
  575                 mm->mm_preferred_cnt++;
  576         }
  577 
  578         if (mm->mm_preferred_cnt == 1) {
  579                 MIRROR_BUMP(vdev_mirror_stat_preferred_found);
  580                 return (mm->mm_preferred[0]);
  581         }
  582 
  583         if (mm->mm_preferred_cnt > 1) {
  584                 MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
  585                 return (vdev_mirror_preferred_child_randomize(zio));
  586         }
  587 
  588         /*
  589          * Every device is either missing or has this txg in its DTL.
  590          * Look for any child we haven't already tried before giving up.
  591          */
  592         for (c = 0; c < mm->mm_children; c++) {
  593                 if (!mm->mm_child[c].mc_tried)
  594                         return (c);
  595         }
  596 
  597         /*
  598          * Every child failed.  There's no place left to look.
  599          */
  600         return (-1);
  601 }
  602 
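       /*
        * Worked example for the selection loop above: with three readable
        * children whose computed loads are {4, 2, 2}, lowest_load settles at
        * 2 and mm_preferred[] ends up holding children 1 and 2, so the tie is
        * broken by vdev_mirror_preferred_child_randomize() and the
        * preferred_not_found counter is bumped.  Had a single child held the
        * lowest load, preferred_found would be bumped and that child returned
        * directly.
        */
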
  603 static void
  604 vdev_mirror_io_start(zio_t *zio)
  605 {
  606         mirror_map_t *mm;
  607         mirror_child_t *mc;
  608         int c, children;
  609 
  610         mm = vdev_mirror_map_init(zio);
  611         zio->io_vsd = mm;
  612         zio->io_vsd_ops = &vdev_mirror_vsd_ops;
  613 
  614         if (mm == NULL) {
  615                 ASSERT(!spa_trust_config(zio->io_spa));
  616                 ASSERT(zio->io_type == ZIO_TYPE_READ);
  617                 zio_execute(zio);
  618                 return;
  619         }
  620 
  621         if (zio->io_type == ZIO_TYPE_READ) {
  622                 if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
  623                         /*
  624                          * For scrubbing reads we need to issue reads to all
   625                          * children.  One child can reuse the parent buffer, but
  626                          * for others we have to allocate separate ones to
  627                          * verify checksums if io_bp is non-NULL, or compare
  628                          * them in vdev_mirror_io_done() otherwise.
  629                          */
  630                         boolean_t first = B_TRUE;
  631                         for (c = 0; c < mm->mm_children; c++) {
  632                                 mc = &mm->mm_child[c];
  633 
  634                                 /* Don't issue ZIOs to offline children */
  635                                 if (!vdev_mirror_child_readable(mc)) {
  636                                         mc->mc_error = SET_ERROR(ENXIO);
  637                                         mc->mc_tried = 1;
  638                                         mc->mc_skipped = 1;
  639                                         continue;
  640                                 }
  641 
  642                                 mc->mc_abd = first ? zio->io_abd :
  643                                     abd_alloc_sametype(zio->io_abd,
  644                                     zio->io_size);
  645                                 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
  646                                     mc->mc_vd, mc->mc_offset, mc->mc_abd,
  647                                     zio->io_size, zio->io_type,
  648                                     zio->io_priority, 0,
  649                                     vdev_mirror_child_done, mc));
  650                                 first = B_FALSE;
  651                         }
  652                         zio_execute(zio);
  653                         return;
  654                 }
  655                 /*
  656                  * For normal reads just pick one child.
  657                  */
  658                 c = vdev_mirror_child_select(zio);
  659                 children = (c >= 0);
  660         } else {
  661                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
  662 
  663                 /*
  664                  * Writes go to all children.
  665                  */
  666                 c = 0;
  667                 children = mm->mm_children;
  668         }
  669 
  670         while (children--) {
  671                 mc = &mm->mm_child[c];
  672                 c++;
  673 
  674                 /*
  675                  * When sequentially resilvering only issue write repair
  676                  * IOs to the vdev which is being rebuilt since performance
  677                  * is limited by the slowest child.  This is an issue for
  678                  * faster replacement devices such as distributed spares.
  679                  */
  680                 if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
  681                     (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
  682                     !(zio->io_flags & ZIO_FLAG_SCRUB) &&
  683                     mm->mm_rebuilding && !mc->mc_rebuilding) {
  684                         continue;
  685                 }
  686 
  687                 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
  688                     mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
  689                     zio->io_type, zio->io_priority, 0,
  690                     vdev_mirror_child_done, mc));
  691         }
  692 
  693         zio_execute(zio);
  694 }
  695 
  696 static int
  697 vdev_mirror_worst_error(mirror_map_t *mm)
  698 {
  699         int error[2] = { 0, 0 };
  700 
  701         for (int c = 0; c < mm->mm_children; c++) {
  702                 mirror_child_t *mc = &mm->mm_child[c];
  703                 int s = mc->mc_speculative;
  704                 error[s] = zio_worst_error(error[s], mc->mc_error);
  705         }
  706 
  707         return (error[0] ? error[0] : error[1]);
  708 }
  709 
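       /*
        * Example of the two-bucket error selection above: if one child failed
        * with a speculative ESTALE (its DTL suggested the data was missing)
        * and another with a real EIO, the EIO in error[0] is reported; the
        * speculative error is only returned when no non-speculative error
        * exists.
        */
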
  710 static void
  711 vdev_mirror_io_done(zio_t *zio)
  712 {
  713         mirror_map_t *mm = zio->io_vsd;
  714         mirror_child_t *mc;
  715         int c;
  716         int good_copies = 0;
  717         int unexpected_errors = 0;
  718         int last_good_copy = -1;
  719 
  720         if (mm == NULL)
  721                 return;
  722 
  723         for (c = 0; c < mm->mm_children; c++) {
  724                 mc = &mm->mm_child[c];
  725 
  726                 if (mc->mc_error) {
  727                         if (!mc->mc_skipped)
  728                                 unexpected_errors++;
  729                 } else if (mc->mc_tried) {
  730                         last_good_copy = c;
  731                         good_copies++;
  732                 }
  733         }
  734 
  735         if (zio->io_type == ZIO_TYPE_WRITE) {
  736                 /*
  737                  * XXX -- for now, treat partial writes as success.
  738                  *
  739                  * Now that we support write reallocation, it would be better
  740                  * to treat partial failure as real failure unless there are
  741                  * no non-degraded top-level vdevs left, and not update DTLs
  742                  * if we intend to reallocate.
  743                  */
  744                 if (good_copies != mm->mm_children) {
  745                         /*
  746                          * Always require at least one good copy.
  747                          *
  748                          * For ditto blocks (io_vd == NULL), require
  749                          * all copies to be good.
  750                          *
  751                          * XXX -- for replacing vdevs, there's no great answer.
  752                          * If the old device is really dead, we may not even
  753                          * be able to access it -- so we only want to
  754                          * require good writes to the new device.  But if
  755                          * the new device turns out to be flaky, we want
  756                          * to be able to detach it -- which requires all
  757                          * writes to the old device to have succeeded.
  758                          */
  759                         if (good_copies == 0 || zio->io_vd == NULL)
  760                                 zio->io_error = vdev_mirror_worst_error(mm);
  761                 }
  762                 return;
  763         }
  764 
  765         ASSERT(zio->io_type == ZIO_TYPE_READ);
  766 
  767         /*
  768          * If we don't have a good copy yet, keep trying other children.
  769          */
  770         if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
  771                 ASSERT(c >= 0 && c < mm->mm_children);
  772                 mc = &mm->mm_child[c];
  773                 zio_vdev_io_redone(zio);
  774                 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
  775                     mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
  776                     ZIO_TYPE_READ, zio->io_priority, 0,
  777                     vdev_mirror_child_done, mc));
  778                 return;
  779         }
  780 
  781         if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) {
  782                 abd_t *best_abd = NULL;
  783                 if (last_good_copy >= 0)
  784                         best_abd = mm->mm_child[last_good_copy].mc_abd;
  785 
  786                 /*
  787                  * If we're scrubbing but don't have a BP available (because
  788                  * this vdev is under a raidz or draid vdev) then the best we
  789                  * can do is compare all of the copies read.  If they're not
  790                  * identical then return a checksum error and the most likely
  791                  * correct data.  The raidz code will issue a repair I/O if
  792                  * possible.
  793                  */
  794                 if (zio->io_bp == NULL) {
  795                         ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
  796                             zio->io_vd->vdev_ops == &vdev_spare_ops);
  797 
  798                         abd_t *pref_abd = NULL;
  799                         for (c = 0; c < last_good_copy; c++) {
  800                                 mc = &mm->mm_child[c];
  801                                 if (mc->mc_error || !mc->mc_tried)
  802                                         continue;
  803 
  804                                 if (abd_cmp(mc->mc_abd, best_abd) != 0)
  805                                         zio->io_error = SET_ERROR(ECKSUM);
  806 
  807                                 /*
   808                                  * The distributed spare is always preferred
  809                                  * by vdev_mirror_child_select() so it's
  810                                  * considered to be the best candidate.
  811                                  */
  812                                 if (pref_abd == NULL &&
  813                                     mc->mc_vd->vdev_ops ==
  814                                     &vdev_draid_spare_ops)
  815                                         pref_abd = mc->mc_abd;
  816 
  817                                 /*
  818                                  * In the absence of a preferred copy, use
  819                                  * the parent pointer to avoid a memory copy.
  820                                  */
  821                                 if (mc->mc_abd == zio->io_abd)
  822                                         best_abd = mc->mc_abd;
  823                         }
  824                         if (pref_abd)
  825                                 best_abd = pref_abd;
  826                 } else {
  827 
  828                         /*
  829                          * If we have a BP available, then checksums are
  830                          * already verified and we just need a buffer
   831                          * with valid data, preferring the parent one to
  832                          * avoid a memory copy.
  833                          */
  834                         for (c = 0; c < last_good_copy; c++) {
  835                                 mc = &mm->mm_child[c];
  836                                 if (mc->mc_error || !mc->mc_tried)
  837                                         continue;
  838                                 if (mc->mc_abd == zio->io_abd) {
  839                                         best_abd = mc->mc_abd;
  840                                         break;
  841                                 }
  842                         }
  843                 }
  844 
  845                 if (best_abd && best_abd != zio->io_abd)
  846                         abd_copy(zio->io_abd, best_abd, zio->io_size);
  847                 for (c = 0; c < mm->mm_children; c++) {
  848                         mc = &mm->mm_child[c];
  849                         if (mc->mc_abd != zio->io_abd)
  850                                 abd_free(mc->mc_abd);
  851                         mc->mc_abd = NULL;
  852                 }
  853         }
  854 
  855         if (good_copies == 0) {
  856                 zio->io_error = vdev_mirror_worst_error(mm);
  857                 ASSERT(zio->io_error != 0);
  858         }
  859 
  860         if (good_copies && spa_writeable(zio->io_spa) &&
  861             (unexpected_errors ||
  862             (zio->io_flags & ZIO_FLAG_RESILVER) ||
  863             ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
  864                 /*
  865                  * Use the good data we have in hand to repair damaged children.
  866                  */
  867                 for (c = 0; c < mm->mm_children; c++) {
  868                         /*
  869                          * Don't rewrite known good children.
  870                          * Not only is it unnecessary, it could
  871                          * actually be harmful: if the system lost
  872                          * power while rewriting the only good copy,
  873                          * there would be no good copies left!
  874                          */
  875                         mc = &mm->mm_child[c];
  876 
  877                         if (mc->mc_error == 0) {
  878                                 vdev_ops_t *ops = mc->mc_vd->vdev_ops;
  879 
  880                                 if (mc->mc_tried)
  881                                         continue;
  882                                 /*
  883                                  * We didn't try this child.  We need to
  884                                  * repair it if:
  885                                  * 1. it's a scrub (in which case we have
  886                                  * tried everything that was healthy)
  887                                  *  - or -
  888                                  * 2. it's an indirect or distributed spare
  889                                  * vdev (in which case it could point to any
  890                                  * other vdev, which might have a bad DTL)
  891                                  *  - or -
  892                                  * 3. the DTL indicates that this data is
  893                                  * missing from this vdev
  894                                  */
  895                                 if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
  896                                     ops != &vdev_indirect_ops &&
  897                                     ops != &vdev_draid_spare_ops &&
  898                                     !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
  899                                     zio->io_txg, 1))
  900                                         continue;
  901                                 mc->mc_error = SET_ERROR(ESTALE);
  902                         }
  903 
  904                         zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
  905                             mc->mc_vd, mc->mc_offset,
  906                             zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
  907                             zio->io_priority == ZIO_PRIORITY_REBUILD ?
  908                             ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
  909                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
  910                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
  911                 }
  912         }
  913 }
  914 
  915 static void
  916 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
  917 {
  918         if (faulted == vd->vdev_children) {
  919                 if (vdev_children_are_offline(vd)) {
  920                         vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
  921                             VDEV_AUX_CHILDREN_OFFLINE);
  922                 } else {
  923                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
  924                             VDEV_AUX_NO_REPLICAS);
  925                 }
  926         } else if (degraded + faulted != 0) {
  927                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
  928         } else {
  929                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
  930         }
  931 }
  932 
  933 /*
  934  * Return the maximum asize for a rebuild zio in the provided range.
  935  */
  936 static uint64_t
  937 vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
  938     uint64_t max_segment)
  939 {
  940         (void) start;
  941 
  942         uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
  943             SPA_MAXBLOCKSIZE);
  944 
  945         return (MIN(asize, vdev_psize_to_asize(vd, psize)));
  946 }
  947 
  948 vdev_ops_t vdev_mirror_ops = {
  949         .vdev_op_init = NULL,
  950         .vdev_op_fini = NULL,
  951         .vdev_op_open = vdev_mirror_open,
  952         .vdev_op_close = vdev_mirror_close,
  953         .vdev_op_asize = vdev_default_asize,
  954         .vdev_op_min_asize = vdev_default_min_asize,
  955         .vdev_op_min_alloc = NULL,
  956         .vdev_op_io_start = vdev_mirror_io_start,
  957         .vdev_op_io_done = vdev_mirror_io_done,
  958         .vdev_op_state_change = vdev_mirror_state_change,
  959         .vdev_op_need_resilver = vdev_default_need_resilver,
  960         .vdev_op_hold = NULL,
  961         .vdev_op_rele = NULL,
  962         .vdev_op_remap = NULL,
  963         .vdev_op_xlate = vdev_default_xlate,
  964         .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
  965         .vdev_op_metaslab_init = NULL,
  966         .vdev_op_config_generate = NULL,
  967         .vdev_op_nparity = NULL,
  968         .vdev_op_ndisks = NULL,
  969         .vdev_op_type = VDEV_TYPE_MIRROR,       /* name of this vdev type */
  970         .vdev_op_leaf = B_FALSE                 /* not a leaf vdev */
  971 };
  972 
  973 vdev_ops_t vdev_replacing_ops = {
  974         .vdev_op_init = NULL,
  975         .vdev_op_fini = NULL,
  976         .vdev_op_open = vdev_mirror_open,
  977         .vdev_op_close = vdev_mirror_close,
  978         .vdev_op_asize = vdev_default_asize,
  979         .vdev_op_min_asize = vdev_default_min_asize,
  980         .vdev_op_min_alloc = NULL,
  981         .vdev_op_io_start = vdev_mirror_io_start,
  982         .vdev_op_io_done = vdev_mirror_io_done,
  983         .vdev_op_state_change = vdev_mirror_state_change,
  984         .vdev_op_need_resilver = vdev_default_need_resilver,
  985         .vdev_op_hold = NULL,
  986         .vdev_op_rele = NULL,
  987         .vdev_op_remap = NULL,
  988         .vdev_op_xlate = vdev_default_xlate,
  989         .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
  990         .vdev_op_metaslab_init = NULL,
  991         .vdev_op_config_generate = NULL,
  992         .vdev_op_nparity = NULL,
  993         .vdev_op_ndisks = NULL,
  994         .vdev_op_type = VDEV_TYPE_REPLACING,    /* name of this vdev type */
  995         .vdev_op_leaf = B_FALSE                 /* not a leaf vdev */
  996 };
  997 
  998 vdev_ops_t vdev_spare_ops = {
  999         .vdev_op_init = NULL,
 1000         .vdev_op_fini = NULL,
 1001         .vdev_op_open = vdev_mirror_open,
 1002         .vdev_op_close = vdev_mirror_close,
 1003         .vdev_op_asize = vdev_default_asize,
 1004         .vdev_op_min_asize = vdev_default_min_asize,
 1005         .vdev_op_min_alloc = NULL,
 1006         .vdev_op_io_start = vdev_mirror_io_start,
 1007         .vdev_op_io_done = vdev_mirror_io_done,
 1008         .vdev_op_state_change = vdev_mirror_state_change,
 1009         .vdev_op_need_resilver = vdev_default_need_resilver,
 1010         .vdev_op_hold = NULL,
 1011         .vdev_op_rele = NULL,
 1012         .vdev_op_remap = NULL,
 1013         .vdev_op_xlate = vdev_default_xlate,
 1014         .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 1015         .vdev_op_metaslab_init = NULL,
 1016         .vdev_op_config_generate = NULL,
 1017         .vdev_op_nparity = NULL,
 1018         .vdev_op_ndisks = NULL,
 1019         .vdev_op_type = VDEV_TYPE_SPARE,        /* name of this vdev type */
 1020         .vdev_op_leaf = B_FALSE                 /* not a leaf vdev */
 1021 };
 1022 
 1023 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
 1024         "Rotating media load increment for non-seeking I/Os");
 1025 
 1026 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT,
 1027         ZMOD_RW, "Rotating media load increment for seeking I/Os");
 1028 
 1029 /* BEGIN CSTYLED */
 1030 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT,
 1031         ZMOD_RW,
 1032         "Offset in bytes from the last I/O which triggers "
 1033         "a reduced rotating media seek increment");
 1034 /* END CSTYLED */
 1035 
 1036 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT,
 1037         ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os");
 1038 
 1039 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT,
 1040         ZMOD_RW, "Non-rotating media load increment for seeking I/Os");
