FreeBSD/Linux Kernel Cross Reference
sys/geom/raid/tr_raid1e.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD$");
   31 
   32 #include <sys/param.h>
   33 #include <sys/bio.h>
   34 #include <sys/endian.h>
   35 #include <sys/kernel.h>
   36 #include <sys/kobj.h>
   37 #include <sys/limits.h>
   38 #include <sys/lock.h>
   39 #include <sys/malloc.h>
   40 #include <sys/mutex.h>
   41 #include <sys/sysctl.h>
   42 #include <sys/systm.h>
   43 #include <geom/geom.h>
   44 #include <geom/geom_dbg.h>
   45 #include "geom/raid/g_raid.h"
   46 #include "g_raid_tr_if.h"
   47 
   48 #define N       2       /* Number of copies kept of each data strip. */
   49 
   50 SYSCTL_DECL(_kern_geom_raid_raid1e);
   51 
   52 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
   53 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
   54 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
   55     &g_raid1e_rebuild_slab, 0,
   56     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
   57 
   58 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
   59 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
   60 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
   61     &g_raid1e_rebuild_fair_io, 0,
   62     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
   63 
   64 #define RAID1E_REBUILD_CLUSTER_IDLE 100
   65 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
   66 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
   67     &g_raid1e_rebuild_cluster_idle, 0,
   68     "Number of slabs to do each time we trigger a rebuild cycle");
   69 
   70 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
   71 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
   72 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
   73     &g_raid1e_rebuild_meta_update, 0,
   74     "When to update the meta data.");
   75 
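      /*
       * Editor's note (added for clarity; the names below are inferred from
       * the declarations above, not taken from the original source): these
       * knobs should surface as the sysctls
       * kern.geom.raid.raid1e.rebuild_slab_size, .rebuild_fair_io,
       * .rebuild_cluster_idle and .rebuild_meta_update.  CTLFLAG_RWTUN makes
       * each of them both a run-time sysctl and a boot-time loader tunable,
       * e.g. (hypothetical usage):
       *
       *   sysctl kern.geom.raid.raid1e.rebuild_slab_size=2097152
       */
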
   76 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
   77 
   78 #define TR_RAID1E_NONE 0
   79 #define TR_RAID1E_REBUILD 1
   80 #define TR_RAID1E_RESYNC 2
   81 
   82 #define TR_RAID1E_F_DOING_SOME  0x1
   83 #define TR_RAID1E_F_LOCKED      0x2
   84 #define TR_RAID1E_F_ABORT       0x4
   85 
   86 struct g_raid_tr_raid1e_object {
   87         struct g_raid_tr_object  trso_base;
   88         int                      trso_starting;
   89         int                      trso_stopping;
   90         int                      trso_type;
   91         int                      trso_recover_slabs; /* slabs before rest */
   92         int                      trso_fair_io;
   93         int                      trso_meta_update;
   94         int                      trso_flags;
   95         struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
   96         void                    *trso_buffer;    /* Buffer space */
   97         off_t                    trso_lock_pos; /* Locked range start. */
   98         off_t                    trso_lock_len; /* Locked range length. */
   99         struct bio               trso_bio;
  100 };
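      /*
       * Editor's note (added for clarity, not in the original source): one
       * of these objects exists per RAID1E volume.  All rebuild/resync state
       * lives here: trso_buffer is a single staging buffer of
       * rebuild_slab_size bytes and trso_bio is reused for every rebuild
       * read/write pair, so at most one rebuild transfer is in flight at a
       * time (tracked by TR_RAID1E_F_DOING_SOME).
       */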
  101 
  102 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
  103 static g_raid_tr_event_t g_raid_tr_event_raid1e;
  104 static g_raid_tr_start_t g_raid_tr_start_raid1e;
  105 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
  106 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
  107 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
  108 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
  109 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
  110 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
  111 static g_raid_tr_free_t g_raid_tr_free_raid1e;
  112 
  113 static kobj_method_t g_raid_tr_raid1e_methods[] = {
  114         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
  115         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
  116         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
  117         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
  118         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
  119         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
  120         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
  121         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
  122         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
  123         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
  124         { 0, 0 }
  125 };
  126 
  127 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
  128         "RAID1E",
  129         g_raid_tr_raid1e_methods,
  130         sizeof(struct g_raid_tr_raid1e_object),
  131         .trc_enable = 1,
  132         .trc_priority = 200,
  133         .trc_accept_unmapped = 1
  134 };
  135 
  136 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
  137 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
  138     struct g_raid_subdisk *sd);
  139 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
  140     int no, off_t off, off_t len, u_int mask);
  141 
  142 static inline void
  143 V2P(struct g_raid_volume *vol, off_t virt,
  144     int *disk, off_t *offset, off_t *start)
  145 {
  146         off_t nstrip;
  147         u_int strip_size;
  148 
  149         strip_size = vol->v_strip_size;
  150         /* Strip number. */
  151         nstrip = virt / strip_size;
  152         /* Start position in strip. */
  153         *start = virt % strip_size;
  154         /* Disk number. */
  155         *disk = (nstrip * N) % vol->v_disks_count;
  156         /* Strip start position in disk. */
  157         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
  158 }
  159 
  160 static inline void
  161 P2V(struct g_raid_volume *vol, int disk, off_t offset,
  162     off_t *virt, int *copy)
  163 {
  164         off_t nstrip, start;
  165         u_int strip_size;
  166 
  167         strip_size = vol->v_strip_size;
  168         /* Start position in strip. */
  169         start = offset % strip_size;
  170         /* Physical strip number. */
  171         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
  172         /* Number of physical strip (copy) inside virtual strip. */
  173         *copy = nstrip % N;
  174         /* Offset in virtual space. */
  175         *virt = (nstrip / N) * strip_size + start;
  176 }
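
      /*
       * Editor's note (a worked example added for clarity, not part of the
       * original source): assume a hypothetical 3-disk RAID1E volume with a
       * 64KB (65536-byte) strip size, so each virtual strip has N = 2 copies
       * spread round-robin over the 3 disks.
       *
       *   V2P(vol, virt = 3 * 65536 + 1000 = 197608):
       *     nstrip  = 197608 / 65536         = 3
       *     *start  = 197608 % 65536         = 1000
       *     *disk   = (3 * 2) % 3            = 0
       *     *offset = ((3 * 2) / 3) * 65536  = 131072
       *   Virtual strip 3 therefore starts on disk 0 at physical offset
       *   131072; its second copy is the next physical strip, which lands on
       *   disk 1 at the same offset.
       *
       *   P2V(vol, disk = 1, offset = 131072 + 1000 = 132072) inverts this:
       *     start   = 132072 % 65536           = 1000
       *     nstrip  = (132072 / 65536) * 3 + 1 = 7
       *     *copy   = 7 % 2                    = 1
       *     *virt   = (7 / 2) * 65536 + 1000   = 197608
       */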
  177 
  178 static int
  179 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
  180 {
  181         struct g_raid_tr_raid1e_object *trs;
  182 
  183         trs = (struct g_raid_tr_raid1e_object *)tr;
  184         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
  185             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
  186                 return (G_RAID_TR_TASTE_FAIL);
  187         trs->trso_starting = 1;
  188         return (G_RAID_TR_TASTE_SUCCEED);
  189 }
  190 
  191 static int
  192 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
  193 {
  194         struct g_raid_softc *sc;
  195         struct g_raid_subdisk *sd, *bestsd, *worstsd;
  196         int i, j, state, sstate;
  197 
  198         sc = vol->v_softc;
  199         state = G_RAID_VOLUME_S_OPTIMAL;
  200         for (i = 0; i < vol->v_disks_count / N; i++) {
  201                 bestsd = &vol->v_subdisks[i * N];
  202                 for (j = 1; j < N; j++) {
  203                         sd = &vol->v_subdisks[i * N + j];
  204                         if (sd->sd_state > bestsd->sd_state)
  205                                 bestsd = sd;
  206                         else if (sd->sd_state == bestsd->sd_state &&
  207                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
  208                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
  209                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
  210                                 bestsd = sd;
  211                 }
  212                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
  213                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
  214                         /* We found a reasonable candidate. */
  215                         G_RAID_DEBUG1(1, sc,
  216                             "Promote subdisk %s:%d from %s to ACTIVE.",
  217                             vol->v_name, bestsd->sd_pos,
  218                             g_raid_subdisk_state2str(bestsd->sd_state));
  219                         g_raid_change_subdisk_state(bestsd,
  220                             G_RAID_SUBDISK_S_ACTIVE);
  221                         g_raid_write_metadata(sc,
  222                             vol, bestsd, bestsd->sd_disk);
  223                 }
  224                 worstsd = &vol->v_subdisks[i * N];
  225                 for (j = 1; j < N; j++) {
  226                         sd = &vol->v_subdisks[i * N + j];
  227                         if (sd->sd_state < worstsd->sd_state)
  228                                 worstsd = sd;
  229                 }
  230                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  231                         sstate = G_RAID_VOLUME_S_OPTIMAL;
  232                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  233                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
  234                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  235                         sstate = G_RAID_VOLUME_S_DEGRADED;
  236                 else
  237                         sstate = G_RAID_VOLUME_S_BROKEN;
  238                 if (sstate < state)
  239                         state = sstate;
  240         }
  241         return (state);
  242 }
  243 
  244 static int
  245 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
  246 {
  247         struct g_raid_softc *sc;
  248         struct g_raid_subdisk *sd, *bestsd, *worstsd;
  249         int i, j, state, sstate;
  250 
  251         sc = vol->v_softc;
  252         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
  253             vol->v_disks_count)
  254                 return (G_RAID_VOLUME_S_OPTIMAL);
  255         for (i = 0; i < vol->v_disks_count; i++) {
  256                 sd = &vol->v_subdisks[i];
  257                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
  258                         /* We found a reasonable candidate. */
  259                         G_RAID_DEBUG1(1, sc,
  260                             "Promote subdisk %s:%d from %s to STALE.",
  261                             vol->v_name, sd->sd_pos,
  262                             g_raid_subdisk_state2str(sd->sd_state));
  263                         g_raid_change_subdisk_state(sd,
  264                             G_RAID_SUBDISK_S_STALE);
  265                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
  266                 }
  267         }
  268         state = G_RAID_VOLUME_S_OPTIMAL;
  269         for (i = 0; i < vol->v_disks_count; i++) {
  270                 bestsd = &vol->v_subdisks[i];
  271                 worstsd = &vol->v_subdisks[i];
  272                 for (j = 1; j < N; j++) {
  273                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
  274                         if (sd->sd_state > bestsd->sd_state)
  275                                 bestsd = sd;
  276                         else if (sd->sd_state == bestsd->sd_state &&
  277                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
  278                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
  279                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
  280                                 bestsd = sd;
  281                         if (sd->sd_state < worstsd->sd_state)
  282                                 worstsd = sd;
  283                 }
  284                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  285                         sstate = G_RAID_VOLUME_S_OPTIMAL;
  286                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  287                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
  288                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  289                         sstate = G_RAID_VOLUME_S_DEGRADED;
  290                 else
  291                         sstate = G_RAID_VOLUME_S_BROKEN;
  292                 if (sstate < state)
  293                         state = sstate;
  294         }
  295         return (state);
  296 }
  297 
  298 static int
  299 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
  300     struct g_raid_subdisk *sd)
  301 {
  302         struct g_raid_tr_raid1e_object *trs;
  303         struct g_raid_softc *sc;
  304         u_int s;
  305 
  306         sc = vol->v_softc;
  307         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
  308         if (trs->trso_stopping &&
  309             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
  310                 s = G_RAID_VOLUME_S_STOPPED;
  311         else if (trs->trso_starting)
  312                 s = G_RAID_VOLUME_S_STARTING;
  313         else {
  314                 if ((vol->v_disks_count % N) == 0)
  315                         s = g_raid_tr_update_state_raid1e_even(vol);
  316                 else
  317                         s = g_raid_tr_update_state_raid1e_odd(vol);
  318         }
  319         if (s != vol->v_state) {
  320                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
  321                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
  322                     G_RAID_EVENT_VOLUME);
  323                 g_raid_change_volume_state(vol, s);
  324                 if (!trs->trso_starting && !trs->trso_stopping)
  325                         g_raid_write_metadata(sc, vol, NULL, NULL);
  326         }
  327         if (!trs->trso_starting && !trs->trso_stopping)
  328                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
  329         return (0);
  330 }
  331 
  332 static void
  333 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
  334     struct g_raid_disk *disk)
  335 {
  336         struct g_raid_volume *vol;
  337 
  338         vol = sd->sd_volume;
  339         /*
  340          * We don't fail the last disk in the pack, since it still has decent
  341          * data on it and that's better than failing the disk if it is the root
  342          * file system.
  343          *
  344          * XXX should this be controlled via a tunable?  It makes sense for
  345          * the volume that has / on it.  I can't think of a case where we'd
  346          * want the volume to go away on this kind of event.
  347          */
  348         if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
  349              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
  350              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
  351              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
  352              vol->v_disks_count) &&
  353             (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
  354                 return;
  355         g_raid_fail_disk(sc, sd, disk);
  356 }
  357 
  358 static void
  359 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
  360 {
  361         struct g_raid_volume *vol;
  362         struct g_raid_subdisk *sd;
  363 
  364         vol = trs->trso_base.tro_volume;
  365         sd = trs->trso_failed_sd;
  366         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
  367         free(trs->trso_buffer, M_TR_RAID1E);
  368         trs->trso_buffer = NULL;
  369         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  370         trs->trso_type = TR_RAID1E_NONE;
  371         trs->trso_recover_slabs = 0;
  372         trs->trso_failed_sd = NULL;
  373         g_raid_tr_update_state_raid1e(vol, NULL);
  374 }
  375 
  376 static void
  377 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
  378 {
  379         struct g_raid_tr_raid1e_object *trs;
  380         struct g_raid_subdisk *sd;
  381 
  382         trs = (struct g_raid_tr_raid1e_object *)tr;
  383         sd = trs->trso_failed_sd;
  384         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
  385             "Subdisk %s:%d-%s rebuild completed.",
  386             sd->sd_volume->v_name, sd->sd_pos,
  387             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  388         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
  389         sd->sd_rebuild_pos = 0;
  390         g_raid_tr_raid1e_rebuild_done(trs);
  391 }
  392 
  393 static void
  394 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
  395 {
  396         struct g_raid_tr_raid1e_object *trs;
  397         struct g_raid_subdisk *sd;
  398         struct g_raid_volume *vol;
  399 
  400         vol = tr->tro_volume;
  401         trs = (struct g_raid_tr_raid1e_object *)tr;
  402         sd = trs->trso_failed_sd;
  403         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
  404                 G_RAID_DEBUG1(1, vol->v_softc,
  405                     "Subdisk %s:%d-%s rebuild is aborting.",
  406                     sd->sd_volume->v_name, sd->sd_pos,
  407                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  408                 trs->trso_flags |= TR_RAID1E_F_ABORT;
  409         } else {
  410                 G_RAID_DEBUG1(0, vol->v_softc,
  411                     "Subdisk %s:%d-%s rebuild aborted.",
  412                     sd->sd_volume->v_name, sd->sd_pos,
  413                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  414                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
  415                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
  416                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
  417                         g_raid_unlock_range(tr->tro_volume,
  418                             trs->trso_lock_pos, trs->trso_lock_len);
  419                 }
  420                 g_raid_tr_raid1e_rebuild_done(trs);
  421         }
  422 }
  423 
  424 static void
  425 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
  426 {
  427         struct g_raid_tr_raid1e_object *trs;
  428         struct g_raid_softc *sc;
  429         struct g_raid_volume *vol;
  430         struct g_raid_subdisk *sd;
  431         struct bio *bp;
  432         off_t len, virtual, vend, offset, start;
  433         int disk, copy, best;
  434 
  435         trs = (struct g_raid_tr_raid1e_object *)tr;
  436         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
  437                 return;
  438         vol = tr->tro_volume;
  439         sc = vol->v_softc;
  440         sd = trs->trso_failed_sd;
  441 
  442         while (1) {
  443                 if (sd->sd_rebuild_pos >= sd->sd_size) {
  444                         g_raid_tr_raid1e_rebuild_finish(tr);
  445                         return;
  446                 }
  447                 /* Get virtual offset from physical rebuild position. */
  448                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
  449                 /* Get physical offset back to get first stripe position. */
  450                 V2P(vol, virtual, &disk, &offset, &start);
  451                 /* Calculate contiguous data length. */
  452                 len = MIN(g_raid1e_rebuild_slab,
  453                     sd->sd_size - sd->sd_rebuild_pos);
  454                 if ((vol->v_disks_count % N) != 0)
  455                         len = MIN(len, vol->v_strip_size - start);
  456                 /* Find disk with most accurate data. */
  457                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
  458                     offset + start, len, 0);
  459                 if (best < 0) {
  460                         /* There is no valid disk. */
  461                         g_raid_tr_raid1e_rebuild_abort(tr);
  462                         return;
  463                 } else if (best != copy) {
  464                         /* Some other disk has better data. */
  465                         break;
  466                 }
  467                 /* We have the most accurate data. Skip the range. */
  468                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
  469                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
  470                 sd->sd_rebuild_pos += len;
  471         }
  472 
  473         bp = &trs->trso_bio;
  474         memset(bp, 0, sizeof(*bp));
  475         bp->bio_offset = offset + start +
  476             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
  477         bp->bio_length = len;
  478         bp->bio_data = trs->trso_buffer;
  479         bp->bio_cmd = BIO_READ;
  480         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
  481         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
  482         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
  483         /*
  484          * If we are crossing a stripe boundary, correct the affected
  485          * virtual range that we should lock.
  486          */
  487         if (start + len > vol->v_strip_size) {
  488                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
  489                 len = vend - virtual;
  490         }
  491         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
  492         trs->trso_flags |= TR_RAID1E_F_LOCKED;
  493         trs->trso_lock_pos = virtual;
  494         trs->trso_lock_len = len;
  495         /* Lock callback starts I/O */
  496         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
  497 }
  498 
  499 static void
  500 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
  501 {
  502         struct g_raid_volume *vol;
  503         struct g_raid_tr_raid1e_object *trs;
  504         struct g_raid_subdisk *sd;
  505 
  506         vol = tr->tro_volume;
  507         trs = (struct g_raid_tr_raid1e_object *)tr;
  508         if (trs->trso_failed_sd) {
  509                 G_RAID_DEBUG1(1, vol->v_softc,
  510                     "Rebuild already in progress. pos %jd\n",
  511                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
  512                 return;
  513         }
  514         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
  515         if (sd == NULL)
  516                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
  517         if (sd == NULL) {
  518                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
  519                 if (sd != NULL) {
  520                         sd->sd_rebuild_pos = 0;
  521                         g_raid_change_subdisk_state(sd,
  522                             G_RAID_SUBDISK_S_RESYNC);
  523                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
  524                 } else {
  525                         sd = g_raid_get_subdisk(vol,
  526                             G_RAID_SUBDISK_S_UNINITIALIZED);
  527                         if (sd == NULL)
  528                                 sd = g_raid_get_subdisk(vol,
  529                                     G_RAID_SUBDISK_S_NEW);
  530                         if (sd != NULL) {
  531                                 sd->sd_rebuild_pos = 0;
  532                                 g_raid_change_subdisk_state(sd,
  533                                     G_RAID_SUBDISK_S_REBUILD);
  534                                 g_raid_write_metadata(vol->v_softc,
  535                                     vol, sd, NULL);
  536                         }
  537                 }
  538         }
  539         if (sd == NULL) {
  540                 G_RAID_DEBUG1(1, vol->v_softc,
  541                     "No failed disk to rebuild.  night night.");
  542                 return;
  543         }
  544         trs->trso_failed_sd = sd;
  545         G_RAID_DEBUG1(0, vol->v_softc,
  546             "Subdisk %s:%d-%s rebuild start at %jd.",
  547             sd->sd_volume->v_name, sd->sd_pos,
  548             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
  549             trs->trso_failed_sd->sd_rebuild_pos);
  550         trs->trso_type = TR_RAID1E_REBUILD;
  551         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
  552         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
  553         g_raid_tr_raid1e_rebuild_some(tr);
  554 }
  555 
  556 static void
  557 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
  558     struct g_raid_subdisk *sd)
  559 {
  560         struct g_raid_volume *vol;
  561         struct g_raid_tr_raid1e_object *trs;
  562         int nr;
  563 
  564         vol = tr->tro_volume;
  565         trs = (struct g_raid_tr_raid1e_object *)tr;
  566         if (trs->trso_stopping)
  567                 return;
  568         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
  569             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
  570         switch(trs->trso_type) {
  571         case TR_RAID1E_NONE:
  572                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
  573                         return;
  574                 if (nr == 0) {
  575                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
  576                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
  577                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
  578                         if (nr == 0)
  579                                 return;
  580                 }
  581                 g_raid_tr_raid1e_rebuild_start(tr);
  582                 break;
  583         case TR_RAID1E_REBUILD:
  584                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
  585                     trs->trso_failed_sd == sd)
  586                         g_raid_tr_raid1e_rebuild_abort(tr);
  587                 break;
  588         case TR_RAID1E_RESYNC:
  589                 break;
  590         }
  591 }
  592 
  593 static int
  594 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
  595     struct g_raid_subdisk *sd, u_int event)
  596 {
  597 
  598         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
  599         return (0);
  600 }
  601 
  602 static int
  603 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
  604 {
  605         struct g_raid_tr_raid1e_object *trs;
  606         struct g_raid_volume *vol;
  607 
  608         trs = (struct g_raid_tr_raid1e_object *)tr;
  609         vol = tr->tro_volume;
  610         trs->trso_starting = 0;
  611         g_raid_tr_update_state_raid1e(vol, NULL);
  612         return (0);
  613 }
  614 
  615 static int
  616 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
  617 {
  618         struct g_raid_tr_raid1e_object *trs;
  619         struct g_raid_volume *vol;
  620 
  621         trs = (struct g_raid_tr_raid1e_object *)tr;
  622         vol = tr->tro_volume;
  623         trs->trso_starting = 0;
  624         trs->trso_stopping = 1;
  625         g_raid_tr_update_state_raid1e(vol, NULL);
  626         return (0);
  627 }
  628 
  629 /*
  630  * Select the disk to read from.  Take into account: subdisk state, running
  631  * error recovery, average disk load, head position and possible cache hits.
  632  */
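      /*
       * Editor's note (summary added for clarity, not in the original
       * source): every candidate copy gets a "prio" score and the lowest
       * score wins.  As implemented below, the score is the current load
       * (G_RAID_SUBDISK_LOAD), plus the copy index shifted into bits 24+ for
       * STALE copies (or RESYNC copies past their rebuild position), plus
       * the number of in-flight recovery operations shifted into bits 16-23,
       * minus one or two G_RAID_SUBDISK_LOAD_SCALE units when the disk head
       * is near or exactly at the target offset.  REBUILD copies that have
       * not yet reached the requested offset are skipped entirely.
       */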
  633 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
  634 static int
  635 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
  636     int no, off_t off, off_t len, u_int mask)
  637 {
  638         struct g_raid_subdisk *sd;
  639         off_t offset;
  640         int i, best, prio, bestprio;
  641 
  642         best = -1;
  643         bestprio = INT_MAX;
  644         for (i = 0; i < N; i++) {
  645                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
  646                 offset = off;
  647                 if (no + i >= vol->v_disks_count)
  648                         offset += vol->v_strip_size;
  649 
  650                 prio = G_RAID_SUBDISK_LOAD(sd);
  651                 if ((mask & (1 << sd->sd_pos)) != 0)
  652                         continue;
  653                 switch (sd->sd_state) {
  654                 case G_RAID_SUBDISK_S_ACTIVE:
  655                         break;
  656                 case G_RAID_SUBDISK_S_RESYNC:
  657                         if (offset + off < sd->sd_rebuild_pos)
  658                                 break;
  659                         /* FALLTHROUGH */
  660                 case G_RAID_SUBDISK_S_STALE:
  661                         prio += i << 24;
  662                         break;
  663                 case G_RAID_SUBDISK_S_REBUILD:
  664                         if (offset + off < sd->sd_rebuild_pos)
  665                                 break;
  666                         /* FALLTHROUGH */
  667                 default:
  668                         continue;
  669                 }
  670                 prio += min(sd->sd_recovery, 255) << 16;
  671                 /* If disk head is precisely in position - highly prefer it. */
  672                 if (G_RAID_SUBDISK_POS(sd) == offset)
  673                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
  674                 else
  675                 /* If disk head is close to position - prefer it. */
  676                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
  677                     G_RAID_SUBDISK_TRACK_SIZE)
  678                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
  679                 if (prio < bestprio) {
  680                         bestprio = prio;
  681                         best = i;
  682                 }
  683         }
  684         return (best);
  685 }
  686 
  687 static void
  688 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
  689 {
  690         struct g_raid_volume *vol;
  691         struct g_raid_subdisk *sd;
  692         struct bio_queue_head queue;
  693         struct bio *cbp;
  694         char *addr;
  695         off_t offset, start, length, remain;
  696         u_int no, strip_size;
  697         int best;
  698 
  699         vol = tr->tro_volume;
  700         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
  701                 addr = NULL;
  702         else
  703                 addr = bp->bio_data;
  704         strip_size = vol->v_strip_size;
  705         V2P(vol, bp->bio_offset, &no, &offset, &start);
  706         remain = bp->bio_length;
  707         bioq_init(&queue);
  708         while (remain > 0) {
  709                 length = MIN(strip_size - start, remain);
  710                 best = g_raid_tr_raid1e_select_read_disk(vol,
  711                     no, offset, length, 0);
  712                 KASSERT(best >= 0, ("No readable disk in volume %s!",
  713                     vol->v_name));
  714                 no += best;
  715                 if (no >= vol->v_disks_count) {
  716                         no -= vol->v_disks_count;
  717                         offset += strip_size;
  718                 }
  719                 cbp = g_clone_bio(bp);
  720                 if (cbp == NULL)
  721                         goto failure;
  722                 cbp->bio_offset = offset + start;
  723                 cbp->bio_length = length;
  724                 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
  725                         cbp->bio_ma_offset += (uintptr_t)addr;
  726                         cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
  727                         cbp->bio_ma_offset %= PAGE_SIZE;
  728                         cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
  729                             cbp->bio_length) / PAGE_SIZE;
  730                 } else
  731                         cbp->bio_data = addr;
  732                 cbp->bio_caller1 = &vol->v_subdisks[no];
  733                 bioq_insert_tail(&queue, cbp);
  734                 no += N - best;
  735                 if (no >= vol->v_disks_count) {
  736                         no -= vol->v_disks_count;
  737                         offset += strip_size;
  738                 }
  739                 remain -= length;
  740                 addr += length;
  741                 start = 0;
  742         }
  743         while ((cbp = bioq_takefirst(&queue)) != NULL) {
  744                 sd = cbp->bio_caller1;
  745                 cbp->bio_caller1 = NULL;
  746                 g_raid_subdisk_iostart(sd, cbp);
  747         }
  748         return;
  749 failure:
  750         while ((cbp = bioq_takefirst(&queue)) != NULL)
  751                 g_destroy_bio(cbp);
  752         if (bp->bio_error == 0)
  753                 bp->bio_error = ENOMEM;
  754         g_raid_iodone(bp, bp->bio_error);
  755 }
  756 
  757 static void
  758 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
  759 {
  760         struct g_raid_volume *vol;
  761         struct g_raid_subdisk *sd;
  762         struct bio_queue_head queue;
  763         struct bio *cbp;
  764         char *addr;
  765         off_t offset, start, length, remain;
  766         u_int no, strip_size;
  767         int i;
  768 
  769         vol = tr->tro_volume;
  770         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
  771                 addr = NULL;
  772         else
  773                 addr = bp->bio_data;
  774         strip_size = vol->v_strip_size;
  775         V2P(vol, bp->bio_offset, &no, &offset, &start);
  776         remain = bp->bio_length;
  777         bioq_init(&queue);
  778         while (remain > 0) {
  779                 length = MIN(strip_size - start, remain);
  780                 for (i = 0; i < N; i++) {
  781                         sd = &vol->v_subdisks[no];
  782                         switch (sd->sd_state) {
  783                         case G_RAID_SUBDISK_S_ACTIVE:
  784                         case G_RAID_SUBDISK_S_STALE:
  785                         case G_RAID_SUBDISK_S_RESYNC:
  786                                 break;
  787                         case G_RAID_SUBDISK_S_REBUILD:
  788                                 if (offset + start >= sd->sd_rebuild_pos)
  789                                         goto nextdisk;
  790                                 break;
  791                         default:
  792                                 goto nextdisk;
  793                         }
  794                         cbp = g_clone_bio(bp);
  795                         if (cbp == NULL)
  796                                 goto failure;
  797                         cbp->bio_offset = offset + start;
  798                         cbp->bio_length = length;
  799                         if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
  800                             bp->bio_cmd != BIO_DELETE) {
  801                                 cbp->bio_ma_offset += (uintptr_t)addr;
  802                                 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
  803                                 cbp->bio_ma_offset %= PAGE_SIZE;
  804                                 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
  805                                     cbp->bio_length) / PAGE_SIZE;
  806                         } else
  807                                 cbp->bio_data = addr;
  808                         cbp->bio_caller1 = sd;
  809                         bioq_insert_tail(&queue, cbp);
  810 nextdisk:
  811                         if (++no >= vol->v_disks_count) {
  812                                 no = 0;
  813                                 offset += strip_size;
  814                         }
  815                 }
  816                 remain -= length;
  817                 if (bp->bio_cmd != BIO_DELETE)
  818                         addr += length;
  819                 start = 0;
  820         }
  821         while ((cbp = bioq_takefirst(&queue)) != NULL) {
  822                 sd = cbp->bio_caller1;
  823                 cbp->bio_caller1 = NULL;
  824                 g_raid_subdisk_iostart(sd, cbp);
  825         }
  826         return;
  827 failure:
  828         while ((cbp = bioq_takefirst(&queue)) != NULL)
  829                 g_destroy_bio(cbp);
  830         if (bp->bio_error == 0)
  831                 bp->bio_error = ENOMEM;
  832         g_raid_iodone(bp, bp->bio_error);
  833 }
  834 
  835 static void
  836 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
  837 {
  838         struct g_raid_volume *vol;
  839         struct g_raid_tr_raid1e_object *trs;
  840 
  841         vol = tr->tro_volume;
  842         trs = (struct g_raid_tr_raid1e_object *)tr;
  843         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
  844             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
  845             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
  846                 g_raid_iodone(bp, EIO);
  847                 return;
  848         }
  849         /*
  850          * If we're rebuilding, squeeze in rebuild activity every so often,
  851          * even when the disk is busy.  Be sure to only count real I/O
  852          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
  853          * by this module.
  854          */
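              /*
               * Editor's note (added for clarity, not in the original
               * source): with the default tunables this means that while
               * foreign I/O keeps arriving, roughly one rebuild transfer of
               * up to rebuild_slab_size bytes (1MB by default) is injected
               * per rebuild_fair_io (20 by default) regular requests,
               * instead of waiting for the volume to go idle.
               */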
  855         if (trs->trso_failed_sd != NULL &&
  856             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
  857                 /* Make the current (new or already running) round short. */
  858                 trs->trso_recover_slabs = 0;
  859                 if (--trs->trso_fair_io <= 0) {
  860                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
  861                         g_raid_tr_raid1e_rebuild_some(tr);
  862                 }
  863         }
  864         switch (bp->bio_cmd) {
  865         case BIO_READ:
  866                 g_raid_tr_iostart_raid1e_read(tr, bp);
  867                 break;
  868         case BIO_WRITE:
  869         case BIO_DELETE:
  870                 g_raid_tr_iostart_raid1e_write(tr, bp);
  871                 break;
  872         case BIO_SPEEDUP:
  873         case BIO_FLUSH:
  874                 g_raid_tr_flush_common(tr, bp);
  875                 break;
  876         default:
  877                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
  878                     bp->bio_cmd, vol->v_name));
  879                 break;
  880         }
  881 }
  882 
  883 static void
  884 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
  885     struct g_raid_subdisk *sd, struct bio *bp)
  886 {
  887         struct bio *cbp;
  888         struct g_raid_subdisk *nsd;
  889         struct g_raid_volume *vol;
  890         struct bio *pbp;
  891         struct g_raid_tr_raid1e_object *trs;
  892         off_t virtual, offset, start;
  893         uintptr_t mask;
  894         int error, do_write, copy, disk, best;
  895 
  896         trs = (struct g_raid_tr_raid1e_object *)tr;
  897         vol = tr->tro_volume;
  898         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
  899                 if (trs->trso_type == TR_RAID1E_REBUILD) {
  900                         nsd = trs->trso_failed_sd;
  901                         if (bp->bio_cmd == BIO_READ) {
  902                                 /* Immediately abort rebuild, if requested. */
  903                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
  904                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  905                                         g_raid_tr_raid1e_rebuild_abort(tr);
  906                                         return;
  907                                 }
  908 
  909                                 /* On read error, skip and cross fingers. */
  910                                 if (bp->bio_error != 0) {
  911                                         G_RAID_LOGREQ(0, bp,
  912                                             "Read error during rebuild (%d), "
  913                                             "possible data loss!",
  914                                             bp->bio_error);
  915                                         goto rebuild_round_done;
  916                                 }
  917 
  918                                 /*
  919                                  * The read operation finished, queue the
  920                                  * write and get out.
  921                                  */
  922                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
  923                                     bp->bio_error);
  924                                 bp->bio_cmd = BIO_WRITE;
  925                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
  926                                 bp->bio_offset = nsd->sd_rebuild_pos;
  927                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
  928                                 g_raid_subdisk_iostart(nsd, bp);
  929                         } else {
  930                                 /*
  931                                  * The write operation just finished.  Do
  932                                  * another.  We keep cloning the master bio
  933                                  * since it has the right buffers allocated to
  934                                  * it.
  935                                  */
  936                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
  937                                     bp->bio_error);
  938                                 if (bp->bio_error != 0 ||
  939                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
  940                                         if ((trs->trso_flags &
  941                                             TR_RAID1E_F_ABORT) == 0) {
  942                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
  943                                                     nsd, nsd->sd_disk);
  944                                         }
  945                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  946                                         g_raid_tr_raid1e_rebuild_abort(tr);
  947                                         return;
  948                                 }
  949 rebuild_round_done:
  950                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
  951                                 g_raid_unlock_range(tr->tro_volume,
  952                                     trs->trso_lock_pos, trs->trso_lock_len);
  953                                 nsd->sd_rebuild_pos += bp->bio_length;
  954                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
  955                                         g_raid_tr_raid1e_rebuild_finish(tr);
  956                                         return;
  957                                 }
  958 
  959                                 /* Abort rebuild if we are stopping */
  960                                 if (trs->trso_stopping) {
  961                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  962                                         g_raid_tr_raid1e_rebuild_abort(tr);
  963                                         return;
  964                                 }
  965 
  966                                 if (--trs->trso_meta_update <= 0) {
  967                                         g_raid_write_metadata(vol->v_softc,
  968                                             vol, nsd, nsd->sd_disk);
  969                                         trs->trso_meta_update =
  970                                             g_raid1e_rebuild_meta_update;
  971                                         /* Compensate for short rebuild I/Os. */
  972                                         if ((vol->v_disks_count % N) != 0 &&
  973                                             vol->v_strip_size <
  974                                              g_raid1e_rebuild_slab) {
  975                                                 trs->trso_meta_update *=
  976                                                     g_raid1e_rebuild_slab;
  977                                                 trs->trso_meta_update /=
  978                                                     vol->v_strip_size;
  979                                         }
  980                                 }
  981                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  982                                 if (--trs->trso_recover_slabs <= 0)
  983                                         return;
  984                                 /* Run next rebuild iteration. */
  985                                 g_raid_tr_raid1e_rebuild_some(tr);
  986                         }
  987                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
  988                         /*
  989                          * read good sd, read bad sd in parallel.  when both
  990                          * done, compare the buffers.  write good to the bad
  991                          * if different.  do the next bit of work.
  992                          */
  993                         panic("Somehow, we think we're doing a resync");
  994                 }
  995                 return;
  996         }
  997         pbp = bp->bio_parent;
  998         pbp->bio_inbed++;
  999         mask = (intptr_t)bp->bio_caller2;
 1000         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
 1001                 /*
 1002                  * Read failed on first drive.  Retry the read error on
 1003                  * another disk drive, if available, before erroring out the
 1004                  * read.
 1005                  */
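                      /*
                       * Editor's note (added for clarity, not in the
                       * original source): bio_caller2 carries a small
                       * bitmask remembering which copies have already been
                       * tried for this request, and bit 31 marks that a
                       * recovery rewrite of the failing copy has been
                       * scheduled; both conventions are used by the retry
                       * and remap paths below.
                       */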
 1006                 sd->sd_disk->d_read_errs++;
 1007                 G_RAID_LOGREQ(0, bp,
 1008                     "Read error (%d), %d read errors total",
 1009                     bp->bio_error, sd->sd_disk->d_read_errs);
 1010 
 1011                 /*
 1012                  * If there are too many read errors, we move to degraded.
 1013                  * XXX Do we want to FAIL the drive (eg, make the user redo
 1014                  * everything to get it back in sync), or just degrade the
 1015                  * drive, which kicks off a resync?
 1016                  */
 1017                 do_write = 0;
 1018                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
 1019                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1020                 else if (mask == 0)
 1021                         do_write = 1;
 1022 
 1023                 /* Restore what we were doing. */
 1024                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1025                 V2P(vol, virtual, &disk, &offset, &start);
 1026 
 1027                 /* Find the other disk, and try to do the I/O to it. */
 1028                 mask |= 1 << copy;
 1029                 best = g_raid_tr_raid1e_select_read_disk(vol,
 1030                     disk, offset, start, mask);
 1031                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 1032                         disk += best;
 1033                         if (disk >= vol->v_disks_count) {
 1034                                 disk -= vol->v_disks_count;
 1035                                 offset += vol->v_strip_size;
 1036                         }
 1037                         cbp->bio_offset = offset + start;
 1038                         cbp->bio_length = bp->bio_length;
 1039                         cbp->bio_data = bp->bio_data;
 1040                         cbp->bio_ma = bp->bio_ma;
 1041                         cbp->bio_ma_offset = bp->bio_ma_offset;
 1042                         cbp->bio_ma_n = bp->bio_ma_n;
 1043                         g_destroy_bio(bp);
 1044                         nsd = &vol->v_subdisks[disk];
 1045                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
 1046                             nsd->sd_pos);
 1047                         if (do_write)
 1048                                 mask |= 1 << 31;
 1049                         if ((mask & (1U << 31)) != 0)
 1050                                 sd->sd_recovery++;
 1051                         cbp->bio_caller2 = (void *)mask;
 1052                         if (do_write) {
 1053                                 cbp->bio_caller1 = nsd;
 1054                                 /* Lock callback starts I/O */
 1055                                 g_raid_lock_range(sd->sd_volume,
 1056                                     virtual, cbp->bio_length, pbp, cbp);
 1057                         } else {
 1058                                 g_raid_subdisk_iostart(nsd, cbp);
 1059                         }
 1060                         return;
 1061                 }
 1062                 /*
 1063                  * We can't retry.  Return the original error by falling
 1064                  * through.  This will happen when there's only one good disk.
 1065                  * We don't need to fail the raid, since its actual state is
 1066                  * based on the state of the subdisks.
 1067                  */
 1068                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
 1069         }
 1070         if (bp->bio_cmd == BIO_READ &&
 1071             bp->bio_error == 0 &&
 1072             (mask & (1U << 31)) != 0) {
 1073                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
 1074 
 1075                 /* Restore what we were doing. */
 1076                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1077                 V2P(vol, virtual, &disk, &offset, &start);
 1078 
 1079                 /* Find best disk to write. */
 1080                 best = g_raid_tr_raid1e_select_read_disk(vol,
 1081                     disk, offset, start, ~mask);
 1082                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 1083                         disk += best;
 1084                         if (disk >= vol->v_disks_count) {
 1085                                 disk -= vol->v_disks_count;
 1086                                 offset += vol->v_strip_size;
 1087                         }
 1088                         cbp->bio_offset = offset + start;
 1089                         cbp->bio_cmd = BIO_WRITE;
 1090                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
 1091                         cbp->bio_caller2 = (void *)mask;
 1092                         g_destroy_bio(bp);
 1093                         G_RAID_LOGREQ(2, cbp,
 1094                             "Attempting bad sector remap on failing drive.");
 1095                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
 1096                         return;
 1097                 }
 1098         }
 1099         if ((mask & (1U << 31)) != 0) {
 1100                 /*
 1101                  * We're done with a recovery, mark the range as unlocked.
 1102                  * For any write errors, we aggressively fail the disk since
 1103                  * there was both a READ and a WRITE error at this location.
 1104                  * Both types of errors generally indicate the drive is on
 1105                  * the verge of total failure anyway.  Better to stop trusting
 1106                  * it now.  However, we need to reset error to 0 in that case
 1107                  * because we're not failing the original I/O which succeeded.
 1108                  */
 1109 
 1110                 /* Restore what we were doing. */
 1111                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1112                 V2P(vol, virtual, &disk, &offset, &start);
 1113 
 1114                 for (copy = 0; copy < N; copy++) {
 1115                         if ((mask & (1 << copy) ) != 0)
 1116                                 vol->v_subdisks[(disk + copy) %
 1117                                     vol->v_disks_count].sd_recovery--;
 1118                 }
 1119 
 1120                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
 1121                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
 1122                             "failing subdisk.");
 1123                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1124                         bp->bio_error = 0;
 1125                 }
 1126                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 1127                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
 1128         }
 1129         if (pbp->bio_cmd != BIO_READ) {
 1130                 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
 1131                         pbp->bio_error = bp->bio_error;
 1132                 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
 1133                         G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
 1134                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1135                 }
 1136                 error = pbp->bio_error;
 1137         } else
 1138                 error = bp->bio_error;
 1139         g_destroy_bio(bp);
 1140         if (pbp->bio_children == pbp->bio_inbed) {
 1141                 pbp->bio_completed = pbp->bio_length;
 1142                 g_raid_iodone(pbp, error);
 1143         }
 1144 }
 1145 
 1146 static int
 1147 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual,
 1148     off_t boffset, size_t blength)
 1149 {
 1150         struct g_raid_volume *vol;
 1151         struct g_raid_subdisk *sd;
 1152         struct bio_queue_head queue;
 1153         char *addr;
 1154         off_t offset, start, length, remain;
 1155         u_int no, strip_size;
 1156         int i, error;
 1157 
 1158         vol = tr->tro_volume;
 1159         addr = virtual;
 1160         strip_size = vol->v_strip_size;
 1161         V2P(vol, boffset, &no, &offset, &start);
 1162         remain = blength;
 1163         bioq_init(&queue);
 1164         while (remain > 0) {
 1165                 length = MIN(strip_size - start, remain);
 1166                 for (i = 0; i < N; i++) {
 1167                         sd = &vol->v_subdisks[no];
 1168                         switch (sd->sd_state) {
 1169                         case G_RAID_SUBDISK_S_ACTIVE:
 1170                         case G_RAID_SUBDISK_S_STALE:
 1171                         case G_RAID_SUBDISK_S_RESYNC:
 1172                                 break;
 1173                         case G_RAID_SUBDISK_S_REBUILD:
 1174                                 if (offset + start >= sd->sd_rebuild_pos)
 1175                                         goto nextdisk;
 1176                                 break;
 1177                         default:
 1178                                 goto nextdisk;
 1179                         }
 1180                         error = g_raid_subdisk_kerneldump(sd, addr,
 1181                             offset + start, length);
 1182                         if (error != 0)
 1183                                 return (error);
 1184 nextdisk:
 1185                         if (++no >= vol->v_disks_count) {
 1186                                 no = 0;
 1187                                 offset += strip_size;
 1188                         }
 1189                 }
 1190                 remain -= length;
 1191                 addr += length;
 1192                 start = 0;
 1193         }
 1194         return (0);
 1195 }
 1196 
 1197 static int
 1198 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
 1199 {
 1200         struct bio *bp;
 1201         struct g_raid_subdisk *sd;
 1202 
 1203         bp = (struct bio *)argp;
 1204         sd = (struct g_raid_subdisk *)bp->bio_caller1;
 1205         g_raid_subdisk_iostart(sd, bp);
 1206 
 1207         return (0);
 1208 }
 1209 
 1210 static int
 1211 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
 1212 {
 1213         struct g_raid_tr_raid1e_object *trs;
 1214         struct g_raid_volume *vol;
 1215 
 1216         vol = tr->tro_volume;
 1217         trs = (struct g_raid_tr_raid1e_object *)tr;
 1218         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
 1219         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
 1220         /* Compensate for short rebuild I/Os. */
 1221         if ((vol->v_disks_count % N) != 0 &&
 1222             vol->v_strip_size < g_raid1e_rebuild_slab) {
 1223                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
 1224                 trs->trso_recover_slabs /= vol->v_strip_size;
 1225         }
 1226         if (trs->trso_type == TR_RAID1E_REBUILD)
 1227                 g_raid_tr_raid1e_rebuild_some(tr);
 1228         return (0);
 1229 }
 1230 
 1231 static int
 1232 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
 1233 {
 1234         struct g_raid_tr_raid1e_object *trs;
 1235 
 1236         trs = (struct g_raid_tr_raid1e_object *)tr;
 1237 
 1238         if (trs->trso_buffer != NULL) {
 1239                 free(trs->trso_buffer, M_TR_RAID1E);
 1240                 trs->trso_buffer = NULL;
 1241         }
 1242         return (0);
 1243 }
 1244 
 1245 G_RAID_TR_DECLARE(raid1e, "RAID1E");
