FreeBSD/Linux Kernel Cross Reference
sys/geom/raid/tr_raid1e.c

    1 /*-
    2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD: releng/10.0/sys/geom/raid/tr_raid1e.c 248720 2013-03-26 05:42:12Z mav $");
   29 
   30 #include <sys/param.h>
   31 #include <sys/bio.h>
   32 #include <sys/endian.h>
   33 #include <sys/kernel.h>
   34 #include <sys/kobj.h>
   35 #include <sys/limits.h>
   36 #include <sys/lock.h>
   37 #include <sys/malloc.h>
   38 #include <sys/mutex.h>
   39 #include <sys/sysctl.h>
   40 #include <sys/systm.h>
   41 #include <geom/geom.h>
   42 #include "geom/raid/g_raid.h"
   43 #include "g_raid_tr_if.h"
   44 
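       /*
        * Annotation (not part of the original source): RAID1E keeps N copies
        * of every data strip; this transformation module is written for
        * exactly two copies.
        */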
   45 #define N       2
   46 
   47 SYSCTL_DECL(_kern_geom_raid_raid1e);
   48 
    49 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
   50 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
   51 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
   52     &g_raid1e_rebuild_slab);
   53 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
   54     &g_raid1e_rebuild_slab, 0,
   55     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
   56 
   57 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
   58 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
   59 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
   60     &g_raid1e_rebuild_fair_io);
   61 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
   62     &g_raid1e_rebuild_fair_io, 0,
    63     "Fraction of the I/O bandwidth to use for rebuild when the disk is busy.");
   64 
   65 #define RAID1E_REBUILD_CLUSTER_IDLE 100
   66 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
   67 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
   68     &g_raid1e_rebuild_cluster_idle);
   69 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
   70     &g_raid1e_rebuild_cluster_idle, 0,
   71     "Number of slabs to do each time we trigger a rebuild cycle");
   72 
   73 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
   74 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
   75 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
   76     &g_raid1e_rebuild_meta_update);
   77 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
   78     &g_raid1e_rebuild_meta_update, 0,
   79     "When to update the meta data.");
   80 
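       /*
        * Annotation (not part of the original source): the four knobs above
        * are loader tunables (settable from loader.conf at boot) and, since
        * the sysctls are CTLFLAG_RW, may also be changed at runtime, e.g.:
        *
        *   sysctl kern.geom.raid.raid1e.rebuild_fair_io=10
        */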
   81 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
   82 
   83 #define TR_RAID1E_NONE 0
   84 #define TR_RAID1E_REBUILD 1
   85 #define TR_RAID1E_RESYNC 2
   86 
   87 #define TR_RAID1E_F_DOING_SOME  0x1
   88 #define TR_RAID1E_F_LOCKED      0x2
   89 #define TR_RAID1E_F_ABORT       0x4
   90 
   91 struct g_raid_tr_raid1e_object {
   92         struct g_raid_tr_object  trso_base;
   93         int                      trso_starting;
   94         int                      trso_stopping;
   95         int                      trso_type;
   96         int                      trso_recover_slabs; /* slabs before rest */
   97         int                      trso_fair_io;
   98         int                      trso_meta_update;
   99         int                      trso_flags;
  100         struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
  101         void                    *trso_buffer;    /* Buffer space */
  102         off_t                    trso_lock_pos; /* Locked range start. */
  103         off_t                    trso_lock_len; /* Locked range length. */
  104         struct bio               trso_bio;
  105 };
  106 
  107 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
  108 static g_raid_tr_event_t g_raid_tr_event_raid1e;
  109 static g_raid_tr_start_t g_raid_tr_start_raid1e;
  110 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
  111 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
  112 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
  113 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
  114 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
  115 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
  116 static g_raid_tr_free_t g_raid_tr_free_raid1e;
  117 
  118 static kobj_method_t g_raid_tr_raid1e_methods[] = {
  119         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
  120         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
  121         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
  122         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
  123         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
  124         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
  125         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
  126         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
  127         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
  128         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
  129         { 0, 0 }
  130 };
  131 
  132 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
  133         "RAID1E",
  134         g_raid_tr_raid1e_methods,
  135         sizeof(struct g_raid_tr_raid1e_object),
  136         .trc_enable = 1,
  137         .trc_priority = 200
  138 };
  139 
  140 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
  141 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
  142     struct g_raid_subdisk *sd);
  143 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
  144     int no, off_t off, off_t len, u_int mask);
  145 
  146 static inline void
  147 V2P(struct g_raid_volume *vol, off_t virt,
  148     int *disk, off_t *offset, off_t *start)
  149 {
  150         off_t nstrip;
  151         u_int strip_size;
  152 
  153         strip_size = vol->v_strip_size;
  154         /* Strip number. */
  155         nstrip = virt / strip_size;
  156         /* Start position in strip. */
  157         *start = virt % strip_size;
  158         /* Disk number. */
  159         *disk = (nstrip * N) % vol->v_disks_count;
  160         /* Strip start position in disk. */
  161         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
  162 }
  163 
  164 static inline void
  165 P2V(struct g_raid_volume *vol, int disk, off_t offset,
  166     off_t *virt, int *copy)
  167 {
  168         off_t nstrip, start;
  169         u_int strip_size;
  170 
  171         strip_size = vol->v_strip_size;
  172         /* Start position in strip. */
  173         start = offset % strip_size;
  174         /* Physical strip number. */
  175         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
  176         /* Number of physical strip (copy) inside virtual strip. */
  177         *copy = nstrip % N;
  178         /* Offset in virtual space. */
  179         *virt = (nstrip / N) * strip_size + start;
  180 }
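
       /*
        * Annotation (not part of the original source), a worked example of
        * the two mappings above for a 3-disk volume with 64KB (65536-byte)
        * strips: V2P(virt = 204800) computes nstrip = 3 and start = 8192,
        * giving disk 0 and physical strip offset 131072; the second copy is
        * the next physical strip, which lands on disk 1 at the same offset.
        * P2V(disk = 0, offset = 139264) maps back to virt = 204800, copy 0.
        */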
  181 
  182 static int
  183 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
  184 {
  185         struct g_raid_tr_raid1e_object *trs;
  186 
  187         trs = (struct g_raid_tr_raid1e_object *)tr;
  188         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
  189             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
  190                 return (G_RAID_TR_TASTE_FAIL);
  191         trs->trso_starting = 1;
  192         return (G_RAID_TR_TASTE_SUCCEED);
  193 }
  194 
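       /*
        * Annotation (not part of the original source): volume state is
        * evaluated differently for even and odd disk counts.  With an even
        * count every virtual strip keeps its N copies on a fixed group of
        * subdisks, so each group is examined independently below; with an
        * odd count (see the _odd variant) the copies rotate across all
        * subdisks and every window of N neighbouring subdisks is checked.
        */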
  195 static int
  196 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
  197 {
  198         struct g_raid_softc *sc;
  199         struct g_raid_subdisk *sd, *bestsd, *worstsd;
  200         int i, j, state, sstate;
  201 
  202         sc = vol->v_softc;
  203         state = G_RAID_VOLUME_S_OPTIMAL;
  204         for (i = 0; i < vol->v_disks_count / N; i++) {
  205                 bestsd = &vol->v_subdisks[i * N];
  206                 for (j = 1; j < N; j++) {
  207                         sd = &vol->v_subdisks[i * N + j];
  208                         if (sd->sd_state > bestsd->sd_state)
  209                                 bestsd = sd;
  210                         else if (sd->sd_state == bestsd->sd_state &&
  211                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
  212                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
  213                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
  214                                 bestsd = sd;
  215                 }
  216                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
  217                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
   218                         /* We found a reasonable candidate. */
  219                         G_RAID_DEBUG1(1, sc,
  220                             "Promote subdisk %s:%d from %s to ACTIVE.",
  221                             vol->v_name, bestsd->sd_pos,
  222                             g_raid_subdisk_state2str(bestsd->sd_state));
  223                         g_raid_change_subdisk_state(bestsd,
  224                             G_RAID_SUBDISK_S_ACTIVE);
  225                         g_raid_write_metadata(sc,
  226                             vol, bestsd, bestsd->sd_disk);
  227                 }
  228                 worstsd = &vol->v_subdisks[i * N];
  229                 for (j = 1; j < N; j++) {
  230                         sd = &vol->v_subdisks[i * N + j];
  231                         if (sd->sd_state < worstsd->sd_state)
  232                                 worstsd = sd;
  233                 }
  234                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  235                         sstate = G_RAID_VOLUME_S_OPTIMAL;
  236                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  237                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
  238                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  239                         sstate = G_RAID_VOLUME_S_DEGRADED;
  240                 else
  241                         sstate = G_RAID_VOLUME_S_BROKEN;
  242                 if (sstate < state)
  243                         state = sstate;
  244         }
  245         return (state);
  246 }
  247 
  248 static int
  249 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
  250 {
  251         struct g_raid_softc *sc;
  252         struct g_raid_subdisk *sd, *bestsd, *worstsd;
  253         int i, j, state, sstate;
  254 
  255         sc = vol->v_softc;
  256         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
  257             vol->v_disks_count)
  258                 return (G_RAID_VOLUME_S_OPTIMAL);
  259         for (i = 0; i < vol->v_disks_count; i++) {
  260                 sd = &vol->v_subdisks[i];
  261                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
   262                         /* We found a reasonable candidate. */
  263                         G_RAID_DEBUG1(1, sc,
  264                             "Promote subdisk %s:%d from %s to STALE.",
  265                             vol->v_name, sd->sd_pos,
  266                             g_raid_subdisk_state2str(sd->sd_state));
  267                         g_raid_change_subdisk_state(sd,
  268                             G_RAID_SUBDISK_S_STALE);
  269                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
  270                 }
  271         }
  272         state = G_RAID_VOLUME_S_OPTIMAL;
  273         for (i = 0; i < vol->v_disks_count; i++) {
  274                 bestsd = &vol->v_subdisks[i];
  275                 worstsd = &vol->v_subdisks[i];
  276                 for (j = 1; j < N; j++) {
  277                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
  278                         if (sd->sd_state > bestsd->sd_state)
  279                                 bestsd = sd;
  280                         else if (sd->sd_state == bestsd->sd_state &&
  281                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
  282                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
  283                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
  284                                 bestsd = sd;
  285                         if (sd->sd_state < worstsd->sd_state)
  286                                 worstsd = sd;
  287                 }
  288                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  289                         sstate = G_RAID_VOLUME_S_OPTIMAL;
  290                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  291                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
  292                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  293                         sstate = G_RAID_VOLUME_S_DEGRADED;
  294                 else
  295                         sstate = G_RAID_VOLUME_S_BROKEN;
  296                 if (sstate < state)
  297                         state = sstate;
  298         }
  299         return (state);
  300 }
  301 
  302 static int
  303 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
  304     struct g_raid_subdisk *sd)
  305 {
  306         struct g_raid_tr_raid1e_object *trs;
  307         struct g_raid_softc *sc;
  308         u_int s;
  309 
  310         sc = vol->v_softc;
  311         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
  312         if (trs->trso_stopping &&
  313             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
  314                 s = G_RAID_VOLUME_S_STOPPED;
  315         else if (trs->trso_starting)
  316                 s = G_RAID_VOLUME_S_STARTING;
  317         else {
  318                 if ((vol->v_disks_count % N) == 0)
  319                         s = g_raid_tr_update_state_raid1e_even(vol);
  320                 else
  321                         s = g_raid_tr_update_state_raid1e_odd(vol);
  322         }
  323         if (s != vol->v_state) {
  324                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
  325                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
  326                     G_RAID_EVENT_VOLUME);
  327                 g_raid_change_volume_state(vol, s);
  328                 if (!trs->trso_starting && !trs->trso_stopping)
  329                         g_raid_write_metadata(sc, vol, NULL, NULL);
  330         }
  331         if (!trs->trso_starting && !trs->trso_stopping)
  332                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
  333         return (0);
  334 }
  335 
  336 static void
  337 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
  338     struct g_raid_disk *disk)
  339 {
  340         struct g_raid_volume *vol;
  341 
  342         vol = sd->sd_volume;
  343         /*
  344          * We don't fail the last disk in the pack, since it still has decent
  345          * data on it and that's better than failing the disk if it is the root
  346          * file system.
  347          *
  348          * XXX should this be controlled via a tunable?  It makes sense for
  349          * the volume that has / on it.  I can't think of a case where we'd
  350          * want the volume to go away on this kind of event.
  351          */
  352         if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
  353              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
  354              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
  355              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
  356              vol->v_disks_count) &&
  357             (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
  358                 return;
  359         g_raid_fail_disk(sc, sd, disk);
  360 }
  361 
  362 static void
  363 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
  364 {
  365         struct g_raid_volume *vol;
  366         struct g_raid_subdisk *sd;
  367 
  368         vol = trs->trso_base.tro_volume;
  369         sd = trs->trso_failed_sd;
  370         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
  371         free(trs->trso_buffer, M_TR_RAID1E);
  372         trs->trso_buffer = NULL;
  373         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  374         trs->trso_type = TR_RAID1E_NONE;
  375         trs->trso_recover_slabs = 0;
  376         trs->trso_failed_sd = NULL;
  377         g_raid_tr_update_state_raid1e(vol, NULL);
  378 }
  379 
  380 static void
  381 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
  382 {
  383         struct g_raid_tr_raid1e_object *trs;
  384         struct g_raid_subdisk *sd;
  385 
  386         trs = (struct g_raid_tr_raid1e_object *)tr;
  387         sd = trs->trso_failed_sd;
  388         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
  389             "Subdisk %s:%d-%s rebuild completed.",
  390             sd->sd_volume->v_name, sd->sd_pos,
  391             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  392         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
  393         sd->sd_rebuild_pos = 0;
  394         g_raid_tr_raid1e_rebuild_done(trs);
  395 }
  396 
  397 static void
  398 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
  399 {
  400         struct g_raid_tr_raid1e_object *trs;
  401         struct g_raid_subdisk *sd;
  402         struct g_raid_volume *vol;
  403 
  404         vol = tr->tro_volume;
  405         trs = (struct g_raid_tr_raid1e_object *)tr;
  406         sd = trs->trso_failed_sd;
  407         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
  408                 G_RAID_DEBUG1(1, vol->v_softc,
  409                     "Subdisk %s:%d-%s rebuild is aborting.",
  410                     sd->sd_volume->v_name, sd->sd_pos,
  411                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  412                 trs->trso_flags |= TR_RAID1E_F_ABORT;
  413         } else {
  414                 G_RAID_DEBUG1(0, vol->v_softc,
  415                     "Subdisk %s:%d-%s rebuild aborted.",
  416                     sd->sd_volume->v_name, sd->sd_pos,
  417                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  418                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
  419                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
  420                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
  421                         g_raid_unlock_range(tr->tro_volume,
  422                             trs->trso_lock_pos, trs->trso_lock_len);
  423                 }
  424                 g_raid_tr_raid1e_rebuild_done(trs);
  425         }
  426 }
  427 
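       /*
        * Annotation (not part of the original source): perform one rebuild
        * transaction.  The failed subdisk's rebuild position is translated
        * into virtual space (P2V) and back (V2P) to locate the copy set,
        * the best surviving copy is chosen, and an up-to-slab-sized BIO_READ
        * into trso_buffer is queued once the affected virtual range has been
        * locked; the iodone handler later rewrites that buffer to the
        * rebuilding subdisk and advances sd_rebuild_pos.
        */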
  428 static void
  429 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
  430 {
  431         struct g_raid_tr_raid1e_object *trs;
  432         struct g_raid_softc *sc;
  433         struct g_raid_volume *vol;
  434         struct g_raid_subdisk *sd;
  435         struct bio *bp;
  436         off_t len, virtual, vend, offset, start;
  437         int disk, copy, best;
  438 
  439         trs = (struct g_raid_tr_raid1e_object *)tr;
  440         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
  441                 return;
  442         vol = tr->tro_volume;
  443         sc = vol->v_softc;
  444         sd = trs->trso_failed_sd;
  445 
  446         while (1) {
  447                 if (sd->sd_rebuild_pos >= sd->sd_size) {
  448                         g_raid_tr_raid1e_rebuild_finish(tr);
  449                         return;
  450                 }
  451                 /* Get virtual offset from physical rebuild position. */
  452                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
  453                 /* Get physical offset back to get first stripe position. */
  454                 V2P(vol, virtual, &disk, &offset, &start);
   455                 /* Calculate contiguous data length. */
  456                 len = MIN(g_raid1e_rebuild_slab,
  457                     sd->sd_size - sd->sd_rebuild_pos);
  458                 if ((vol->v_disks_count % N) != 0)
  459                         len = MIN(len, vol->v_strip_size - start);
  460                 /* Find disk with most accurate data. */
  461                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
  462                     offset + start, len, 0);
  463                 if (best < 0) {
   464                         /* There is no valid disk. */
  465                         g_raid_tr_raid1e_rebuild_abort(tr);
  466                         return;
  467                 } else if (best != copy) {
  468                         /* Some other disk has better data. */
  469                         break;
  470                 }
  471                 /* We have the most accurate data. Skip the range. */
  472                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
  473                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
  474                 sd->sd_rebuild_pos += len;
  475         }
  476 
  477         bp = &trs->trso_bio;
  478         memset(bp, 0, sizeof(*bp));
  479         bp->bio_offset = offset + start +
  480             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
  481         bp->bio_length = len;
  482         bp->bio_data = trs->trso_buffer;
  483         bp->bio_cmd = BIO_READ;
  484         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
  485         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
  486         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
  487         /*
   488          * If we are crossing a stripe boundary, correct the affected
   489          * virtual range we should lock.
  490          */
  491         if (start + len > vol->v_strip_size) {
  492                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
  493                 len = vend - virtual;
  494         }
  495         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
  496         trs->trso_flags |= TR_RAID1E_F_LOCKED;
  497         trs->trso_lock_pos = virtual;
  498         trs->trso_lock_len = len;
  499         /* Lock callback starts I/O */
  500         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
  501 }
  502 
  503 static void
  504 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
  505 {
  506         struct g_raid_volume *vol;
  507         struct g_raid_tr_raid1e_object *trs;
  508         struct g_raid_subdisk *sd;
  509 
  510         vol = tr->tro_volume;
  511         trs = (struct g_raid_tr_raid1e_object *)tr;
  512         if (trs->trso_failed_sd) {
  513                 G_RAID_DEBUG1(1, vol->v_softc,
   514                     "Rebuild already in progress at rebuild start. pos %jd\n",
  515                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
  516                 return;
  517         }
  518         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
  519         if (sd == NULL)
  520                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
  521         if (sd == NULL) {
  522                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
  523                 if (sd != NULL) {
  524                         sd->sd_rebuild_pos = 0;
  525                         g_raid_change_subdisk_state(sd,
  526                             G_RAID_SUBDISK_S_RESYNC);
  527                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
  528                 } else {
  529                         sd = g_raid_get_subdisk(vol,
  530                             G_RAID_SUBDISK_S_UNINITIALIZED);
  531                         if (sd == NULL)
  532                                 sd = g_raid_get_subdisk(vol,
  533                                     G_RAID_SUBDISK_S_NEW);
  534                         if (sd != NULL) {
  535                                 sd->sd_rebuild_pos = 0;
  536                                 g_raid_change_subdisk_state(sd,
  537                                     G_RAID_SUBDISK_S_REBUILD);
  538                                 g_raid_write_metadata(vol->v_softc,
  539                                     vol, sd, NULL);
  540                         }
  541                 }
  542         }
  543         if (sd == NULL) {
  544                 G_RAID_DEBUG1(1, vol->v_softc,
  545                     "No failed disk to rebuild.  night night.");
  546                 return;
  547         }
  548         trs->trso_failed_sd = sd;
  549         G_RAID_DEBUG1(0, vol->v_softc,
  550             "Subdisk %s:%d-%s rebuild start at %jd.",
  551             sd->sd_volume->v_name, sd->sd_pos,
  552             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
  553             trs->trso_failed_sd->sd_rebuild_pos);
  554         trs->trso_type = TR_RAID1E_REBUILD;
  555         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
  556         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
  557         g_raid_tr_raid1e_rebuild_some(tr);
  558 }
  559 
  560 static void
  561 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
  562     struct g_raid_subdisk *sd)
  563 {
  564         struct g_raid_volume *vol;
  565         struct g_raid_tr_raid1e_object *trs;
  566         int nr;
  567         
  568         vol = tr->tro_volume;
  569         trs = (struct g_raid_tr_raid1e_object *)tr;
  570         if (trs->trso_stopping)
  571                 return;
  572         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
  573             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
  574         switch(trs->trso_type) {
  575         case TR_RAID1E_NONE:
  576                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
  577                         return;
  578                 if (nr == 0) {
  579                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
  580                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
  581                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
  582                         if (nr == 0)
  583                                 return;
  584                 }
  585                 g_raid_tr_raid1e_rebuild_start(tr);
  586                 break;
  587         case TR_RAID1E_REBUILD:
  588                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
  589                     trs->trso_failed_sd == sd)
  590                         g_raid_tr_raid1e_rebuild_abort(tr);
  591                 break;
  592         case TR_RAID1E_RESYNC:
  593                 break;
  594         }
  595 }
  596 
  597 static int
  598 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
  599     struct g_raid_subdisk *sd, u_int event)
  600 {
  601 
  602         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
  603         return (0);
  604 }
  605 
  606 static int
  607 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
  608 {
  609         struct g_raid_tr_raid1e_object *trs;
  610         struct g_raid_volume *vol;
  611 
  612         trs = (struct g_raid_tr_raid1e_object *)tr;
  613         vol = tr->tro_volume;
  614         trs->trso_starting = 0;
  615         g_raid_tr_update_state_raid1e(vol, NULL);
  616         return (0);
  617 }
  618 
  619 static int
  620 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
  621 {
  622         struct g_raid_tr_raid1e_object *trs;
  623         struct g_raid_volume *vol;
  624 
  625         trs = (struct g_raid_tr_raid1e_object *)tr;
  626         vol = tr->tro_volume;
  627         trs->trso_starting = 0;
  628         trs->trso_stopping = 1;
  629         g_raid_tr_update_state_raid1e(vol, NULL);
  630         return (0);
  631 }
  632 
  633 /*
  634  * Select the disk to read from.  Take into account: subdisk state, running
  635  * error recovery, average disk load, head position and possible cache hits.
  636  */
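       /*
        * Annotation (not part of the original source): a lower priority value
        * wins.  The base is the subdisk's current load; running recoveries
        * add (min(count, 255) << 16), STALE copies and RESYNC copies past
        * their rebuild position add (copy index << 24), and REBUILD copies
        * past their rebuild position are skipped entirely.  A head sitting
        * exactly at the requested offset subtracts two
        * G_RAID_SUBDISK_LOAD_SCALE units, a head within one track subtracts
        * one.
        */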
  637 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
  638 static int
  639 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
  640     int no, off_t off, off_t len, u_int mask)
  641 {
  642         struct g_raid_subdisk *sd;
  643         off_t offset;
  644         int i, best, prio, bestprio;
  645 
  646         best = -1;
  647         bestprio = INT_MAX;
  648         for (i = 0; i < N; i++) {
  649                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
  650                 offset = off;
  651                 if (no + i >= vol->v_disks_count)
  652                         offset += vol->v_strip_size;
  653 
  654                 prio = G_RAID_SUBDISK_LOAD(sd);
  655                 if ((mask & (1 << sd->sd_pos)) != 0)
  656                         continue;
  657                 switch (sd->sd_state) {
  658                 case G_RAID_SUBDISK_S_ACTIVE:
  659                         break;
  660                 case G_RAID_SUBDISK_S_RESYNC:
  661                         if (offset + off < sd->sd_rebuild_pos)
  662                                 break;
  663                         /* FALLTHROUGH */
  664                 case G_RAID_SUBDISK_S_STALE:
  665                         prio += i << 24;
  666                         break;
  667                 case G_RAID_SUBDISK_S_REBUILD:
  668                         if (offset + off < sd->sd_rebuild_pos)
  669                                 break;
  670                         /* FALLTHROUGH */
  671                 default:
  672                         continue;
  673                 }
  674                 prio += min(sd->sd_recovery, 255) << 16;
  675                 /* If disk head is precisely in position - highly prefer it. */
  676                 if (G_RAID_SUBDISK_POS(sd) == offset)
  677                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
  678                 else
  679                 /* If disk head is close to position - prefer it. */
  680                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
  681                     G_RAID_SUBDISK_TRACK_SIZE)
  682                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
  683                 if (prio < bestprio) {
  684                         bestprio = prio;
  685                         best = i;
  686                 }
  687         }
  688         return (best);
  689 }
  690 
  691 static void
  692 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
  693 {
  694         struct g_raid_volume *vol;
  695         struct g_raid_subdisk *sd;
  696         struct bio_queue_head queue;
  697         struct bio *cbp;
  698         char *addr;
  699         off_t offset, start, length, remain;
  700         u_int no, strip_size;
  701         int best;
  702 
  703         vol = tr->tro_volume;
  704         addr = bp->bio_data;
  705         strip_size = vol->v_strip_size;
  706         V2P(vol, bp->bio_offset, &no, &offset, &start);
  707         remain = bp->bio_length;
  708         bioq_init(&queue);
  709         while (remain > 0) {
  710                 length = MIN(strip_size - start, remain);
  711                 best = g_raid_tr_raid1e_select_read_disk(vol,
  712                     no, offset, length, 0);
  713                 KASSERT(best >= 0, ("No readable disk in volume %s!",
  714                     vol->v_name));
  715                 no += best;
  716                 if (no >= vol->v_disks_count) {
  717                         no -= vol->v_disks_count;
  718                         offset += strip_size;
  719                 }
  720                 cbp = g_clone_bio(bp);
  721                 if (cbp == NULL)
  722                         goto failure;
  723                 cbp->bio_offset = offset + start;
  724                 cbp->bio_data = addr;
  725                 cbp->bio_length = length;
  726                 cbp->bio_caller1 = &vol->v_subdisks[no];
  727                 bioq_insert_tail(&queue, cbp);
  728                 no += N - best;
  729                 if (no >= vol->v_disks_count) {
  730                         no -= vol->v_disks_count;
  731                         offset += strip_size;
  732                 }
  733                 remain -= length;
  734                 addr += length;
  735                 start = 0;
  736         }
  737         for (cbp = bioq_first(&queue); cbp != NULL;
  738             cbp = bioq_first(&queue)) {
  739                 bioq_remove(&queue, cbp);
  740                 sd = cbp->bio_caller1;
  741                 cbp->bio_caller1 = NULL;
  742                 g_raid_subdisk_iostart(sd, cbp);
  743         }
  744         return;
  745 failure:
  746         for (cbp = bioq_first(&queue); cbp != NULL;
  747             cbp = bioq_first(&queue)) {
  748                 bioq_remove(&queue, cbp);
  749                 g_destroy_bio(cbp);
  750         }
  751         if (bp->bio_error == 0)
  752                 bp->bio_error = ENOMEM;
  753         g_raid_iodone(bp, bp->bio_error);
  754 }
  755 
  756 static void
  757 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
  758 {
  759         struct g_raid_volume *vol;
  760         struct g_raid_subdisk *sd;
  761         struct bio_queue_head queue;
  762         struct bio *cbp;
  763         char *addr;
  764         off_t offset, start, length, remain;
  765         u_int no, strip_size;
  766         int i;
  767 
  768         vol = tr->tro_volume;
  769         addr = bp->bio_data;
  770         strip_size = vol->v_strip_size;
  771         V2P(vol, bp->bio_offset, &no, &offset, &start);
  772         remain = bp->bio_length;
  773         bioq_init(&queue);
  774         while (remain > 0) {
  775                 length = MIN(strip_size - start, remain);
  776                 for (i = 0; i < N; i++) {
  777                         sd = &vol->v_subdisks[no];
  778                         switch (sd->sd_state) {
  779                         case G_RAID_SUBDISK_S_ACTIVE:
  780                         case G_RAID_SUBDISK_S_STALE:
  781                         case G_RAID_SUBDISK_S_RESYNC:
  782                                 break;
  783                         case G_RAID_SUBDISK_S_REBUILD:
  784                                 if (offset + start >= sd->sd_rebuild_pos)
  785                                         goto nextdisk;
  786                                 break;
  787                         default:
  788                                 goto nextdisk;
  789                         }
  790                         cbp = g_clone_bio(bp);
  791                         if (cbp == NULL)
  792                                 goto failure;
  793                         cbp->bio_offset = offset + start;
  794                         cbp->bio_data = addr;
  795                         cbp->bio_length = length;
  796                         cbp->bio_caller1 = sd;
  797                         bioq_insert_tail(&queue, cbp);
  798 nextdisk:
  799                         if (++no >= vol->v_disks_count) {
  800                                 no = 0;
  801                                 offset += strip_size;
  802                         }
  803                 }
  804                 remain -= length;
  805                 if (bp->bio_cmd != BIO_DELETE)
  806                         addr += length;
  807                 start = 0;
  808         }
  809         for (cbp = bioq_first(&queue); cbp != NULL;
  810             cbp = bioq_first(&queue)) {
  811                 bioq_remove(&queue, cbp);
  812                 sd = cbp->bio_caller1;
  813                 cbp->bio_caller1 = NULL;
  814                 g_raid_subdisk_iostart(sd, cbp);
  815         }
  816         return;
  817 failure:
  818         for (cbp = bioq_first(&queue); cbp != NULL;
  819             cbp = bioq_first(&queue)) {
  820                 bioq_remove(&queue, cbp);
  821                 g_destroy_bio(cbp);
  822         }
  823         if (bp->bio_error == 0)
  824                 bp->bio_error = ENOMEM;
  825         g_raid_iodone(bp, bp->bio_error);
  826 }
  827 
  828 static void
  829 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
  830 {
  831         struct g_raid_volume *vol;
  832         struct g_raid_tr_raid1e_object *trs;
  833 
  834         vol = tr->tro_volume;
  835         trs = (struct g_raid_tr_raid1e_object *)tr;
  836         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
  837             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
  838             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
  839                 g_raid_iodone(bp, EIO);
  840                 return;
  841         }
  842         /*
  843          * If we're rebuilding, squeeze in rebuild activity every so often,
  844          * even when the disk is busy.  Be sure to only count real I/O
  845          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
  846          * by this module.
  847          */
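               /*
                * Annotation (not part of the original source): trso_fair_io
                * counts down once per regular request; when it reaches zero a
                * single rebuild transaction is injected and the counter is
                * reset, so roughly one rebuild slab is processed for every
                * g_raid1e_rebuild_fair_io regular requests while the volume
                * is busy.
                */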
  848         if (trs->trso_failed_sd != NULL &&
  849             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
   850                 /* Make this new or currently running round short. */
  851                 trs->trso_recover_slabs = 0;
  852                 if (--trs->trso_fair_io <= 0) {
  853                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
  854                         g_raid_tr_raid1e_rebuild_some(tr);
  855                 }
  856         }
  857         switch (bp->bio_cmd) {
  858         case BIO_READ:
  859                 g_raid_tr_iostart_raid1e_read(tr, bp);
  860                 break;
  861         case BIO_WRITE:
  862         case BIO_DELETE:
  863                 g_raid_tr_iostart_raid1e_write(tr, bp);
  864                 break;
  865         case BIO_FLUSH:
  866                 g_raid_tr_flush_common(tr, bp);
  867                 break;
  868         default:
  869                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
  870                     bp->bio_cmd, vol->v_name));
  871                 break;
  872         }
  873 }
  874 
  875 static void
  876 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
  877     struct g_raid_subdisk *sd, struct bio *bp)
  878 {
  879         struct bio *cbp;
  880         struct g_raid_subdisk *nsd;
  881         struct g_raid_volume *vol;
  882         struct bio *pbp;
  883         struct g_raid_tr_raid1e_object *trs;
  884         off_t virtual, offset, start;
  885         uintptr_t mask;
  886         int error, do_write, copy, disk, best;
  887 
  888         trs = (struct g_raid_tr_raid1e_object *)tr;
  889         vol = tr->tro_volume;
  890         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
  891                 if (trs->trso_type == TR_RAID1E_REBUILD) {
  892                         nsd = trs->trso_failed_sd;
  893                         if (bp->bio_cmd == BIO_READ) {
  894 
  895                                 /* Immediately abort rebuild, if requested. */
  896                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
  897                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  898                                         g_raid_tr_raid1e_rebuild_abort(tr);
  899                                         return;
  900                                 }
  901 
  902                                 /* On read error, skip and cross fingers. */
  903                                 if (bp->bio_error != 0) {
  904                                         G_RAID_LOGREQ(0, bp,
  905                                             "Read error during rebuild (%d), "
  906                                             "possible data loss!",
  907                                             bp->bio_error);
  908                                         goto rebuild_round_done;
  909                                 }
  910 
  911                                 /*
  912                                  * The read operation finished, queue the
  913                                  * write and get out.
  914                                  */
  915                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
  916                                     bp->bio_error);
  917                                 bp->bio_cmd = BIO_WRITE;
  918                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
  919                                 bp->bio_offset = nsd->sd_rebuild_pos;
  920                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
  921                                 g_raid_subdisk_iostart(nsd, bp);
  922                         } else {
  923                                 /*
  924                                  * The write operation just finished.  Do
  925                                  * another.  We keep cloning the master bio
  926                                  * since it has the right buffers allocated to
  927                                  * it.
  928                                  */
  929                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
  930                                     bp->bio_error);
  931                                 if (bp->bio_error != 0 ||
  932                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
  933                                         if ((trs->trso_flags &
  934                                             TR_RAID1E_F_ABORT) == 0) {
  935                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
  936                                                     nsd, nsd->sd_disk);
  937                                         }
  938                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  939                                         g_raid_tr_raid1e_rebuild_abort(tr);
  940                                         return;
  941                                 }
  942 rebuild_round_done:
  943                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
  944                                 g_raid_unlock_range(tr->tro_volume,
  945                                     trs->trso_lock_pos, trs->trso_lock_len);
  946                                 nsd->sd_rebuild_pos += bp->bio_length;
  947                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
  948                                         g_raid_tr_raid1e_rebuild_finish(tr);
  949                                         return;
  950                                 }
  951 
  952                                 /* Abort rebuild if we are stopping */
  953                                 if (trs->trso_stopping) {
  954                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  955                                         g_raid_tr_raid1e_rebuild_abort(tr);
  956                                         return;
  957                                 }
  958 
  959                                 if (--trs->trso_meta_update <= 0) {
  960                                         g_raid_write_metadata(vol->v_softc,
  961                                             vol, nsd, nsd->sd_disk);
  962                                         trs->trso_meta_update =
  963                                             g_raid1e_rebuild_meta_update;
   964                                         /* Compensate for short rebuild I/Os. */
  965                                         if ((vol->v_disks_count % N) != 0 &&
  966                                             vol->v_strip_size <
  967                                              g_raid1e_rebuild_slab) {
  968                                                 trs->trso_meta_update *=
  969                                                     g_raid1e_rebuild_slab;
  970                                                 trs->trso_meta_update /=
  971                                                     vol->v_strip_size;
  972                                         }
  973                                 }
  974                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  975                                 if (--trs->trso_recover_slabs <= 0)
  976                                         return;
  977                                 /* Run next rebuild iteration. */
  978                                 g_raid_tr_raid1e_rebuild_some(tr);
  979                         }
  980                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
  981                         /*
  982                          * read good sd, read bad sd in parallel.  when both
  983                          * done, compare the buffers.  write good to the bad
  984                          * if different.  do the next bit of work.
  985                          */
  986                         panic("Somehow, we think we're doing a resync");
  987                 }
  988                 return;
  989         }
  990         pbp = bp->bio_parent;
  991         pbp->bio_inbed++;
  992         mask = (intptr_t)bp->bio_caller2;
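               /*
                * Annotation (not part of the original source): bio_caller2
                * carries a bitmask of the copies already tried for this
                * request (one bit per copy index); bit 31 flags an in-flight
                * recovery (retried read followed by a rewrite of the bad
                * copy) and is used below to drop the range lock and the
                * sd_recovery accounting once that recovery completes.
                */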
  993         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
  994                 /*
  995                  * Read failed on first drive.  Retry the read error on
  996                  * another disk drive, if available, before erroring out the
  997                  * read.
  998                  */
  999                 sd->sd_disk->d_read_errs++;
 1000                 G_RAID_LOGREQ(0, bp,
 1001                     "Read error (%d), %d read errors total",
 1002                     bp->bio_error, sd->sd_disk->d_read_errs);
 1003 
 1004                 /*
 1005                  * If there are too many read errors, we move to degraded.
 1006                  * XXX Do we want to FAIL the drive (eg, make the user redo
 1007                  * everything to get it back in sync), or just degrade the
 1008                  * drive, which kicks off a resync?
 1009                  */
 1010                 do_write = 0;
 1011                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
 1012                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1013                 else if (mask == 0)
 1014                         do_write = 1;
 1015 
 1016                 /* Restore what we were doing. */
 1017                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1018                 V2P(vol, virtual, &disk, &offset, &start);
 1019 
 1020                 /* Find the other disk, and try to do the I/O to it. */
 1021                 mask |= 1 << copy;
 1022                 best = g_raid_tr_raid1e_select_read_disk(vol,
 1023                     disk, offset, start, mask);
 1024                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 1025                         disk += best;
 1026                         if (disk >= vol->v_disks_count) {
 1027                                 disk -= vol->v_disks_count;
 1028                                 offset += vol->v_strip_size;
 1029                         }
 1030                         cbp->bio_offset = offset + start;
 1031                         cbp->bio_length = bp->bio_length;
 1032                         cbp->bio_data = bp->bio_data;
 1033                         g_destroy_bio(bp);
 1034                         nsd = &vol->v_subdisks[disk];
 1035                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
 1036                             nsd->sd_pos);
 1037                         if (do_write)
 1038                                 mask |= 1 << 31;
 1039                         if ((mask & (1 << 31)) != 0)
 1040                                 sd->sd_recovery++;
 1041                         cbp->bio_caller2 = (void *)mask;
 1042                         if (do_write) {
 1043                                 cbp->bio_caller1 = nsd;
 1044                                 /* Lock callback starts I/O */
 1045                                 g_raid_lock_range(sd->sd_volume,
 1046                                     virtual, cbp->bio_length, pbp, cbp);
 1047                         } else {
 1048                                 g_raid_subdisk_iostart(nsd, cbp);
 1049                         }
 1050                         return;
 1051                 }
 1052                 /*
 1053                  * We can't retry.  Return the original error by falling
 1054                  * through.  This will happen when there's only one good disk.
 1055                  * We don't need to fail the raid, since its actual state is
 1056                  * based on the state of the subdisks.
 1057                  */
 1058                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
 1059         }
 1060         if (bp->bio_cmd == BIO_READ &&
 1061             bp->bio_error == 0 &&
 1062             (mask & (1 << 31)) != 0) {
 1063                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
 1064 
 1065                 /* Restore what we were doing. */
 1066                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1067                 V2P(vol, virtual, &disk, &offset, &start);
 1068 
 1069                 /* Find best disk to write. */
 1070                 best = g_raid_tr_raid1e_select_read_disk(vol,
 1071                     disk, offset, start, ~mask);
 1072                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 1073                         disk += best;
 1074                         if (disk >= vol->v_disks_count) {
 1075                                 disk -= vol->v_disks_count;
 1076                                 offset += vol->v_strip_size;
 1077                         }
 1078                         cbp->bio_offset = offset + start;
 1079                         cbp->bio_cmd = BIO_WRITE;
 1080                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
 1081                         cbp->bio_caller2 = (void *)mask;
 1082                         g_destroy_bio(bp);
 1083                         G_RAID_LOGREQ(2, cbp,
 1084                             "Attempting bad sector remap on failing drive.");
 1085                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
 1086                         return;
 1087                 }
 1088         }
 1089         if ((mask & (1 << 31)) != 0) {
 1090                 /*
 1091                  * We're done with a recovery, mark the range as unlocked.
  1092                  * For any write errors, we aggressively fail the disk since
  1093                  * there was both a READ and a WRITE error at this location.
  1094                  * Both types of errors generally indicate the drive is on
 1095                  * the verge of total failure anyway.  Better to stop trusting
 1096                  * it now.  However, we need to reset error to 0 in that case
 1097                  * because we're not failing the original I/O which succeeded.
 1098                  */
 1099 
 1100                 /* Restore what we were doing. */
 1101                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1102                 V2P(vol, virtual, &disk, &offset, &start);
 1103 
 1104                 for (copy = 0; copy < N; copy++) {
 1105                         if ((mask & (1 << copy) ) != 0)
 1106                                 vol->v_subdisks[(disk + copy) %
 1107                                     vol->v_disks_count].sd_recovery--;
 1108                 }
 1109 
 1110                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
 1111                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
 1112                             "failing subdisk.");
 1113                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1114                         bp->bio_error = 0;
 1115                 }
 1116                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 1117                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
 1118         }
 1119         if (pbp->bio_cmd != BIO_READ) {
 1120                 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
 1121                         pbp->bio_error = bp->bio_error;
 1122                 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
 1123                         G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
 1124                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1125                 }
 1126                 error = pbp->bio_error;
 1127         } else
 1128                 error = bp->bio_error;
 1129         g_destroy_bio(bp);
 1130         if (pbp->bio_children == pbp->bio_inbed) {
 1131                 pbp->bio_completed = pbp->bio_length;
 1132                 g_raid_iodone(pbp, error);
 1133         }
 1134 }
 1135 
 1136 static int
 1137 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
 1138     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
 1139 {
 1140         struct g_raid_volume *vol;
 1141         struct g_raid_subdisk *sd;
 1142         struct bio_queue_head queue;
 1143         char *addr;
 1144         off_t offset, start, length, remain;
 1145         u_int no, strip_size;
 1146         int i, error;
 1147 
 1148         vol = tr->tro_volume;
 1149         addr = virtual;
 1150         strip_size = vol->v_strip_size;
 1151         V2P(vol, boffset, &no, &offset, &start);
 1152         remain = blength;
 1153         bioq_init(&queue);
 1154         while (remain > 0) {
 1155                 length = MIN(strip_size - start, remain);
 1156                 for (i = 0; i < N; i++) {
 1157                         sd = &vol->v_subdisks[no];
 1158                         switch (sd->sd_state) {
 1159                         case G_RAID_SUBDISK_S_ACTIVE:
 1160                         case G_RAID_SUBDISK_S_STALE:
 1161                         case G_RAID_SUBDISK_S_RESYNC:
 1162                                 break;
 1163                         case G_RAID_SUBDISK_S_REBUILD:
 1164                                 if (offset + start >= sd->sd_rebuild_pos)
 1165                                         goto nextdisk;
 1166                                 break;
 1167                         default:
 1168                                 goto nextdisk;
 1169                         }
 1170                         error = g_raid_subdisk_kerneldump(sd,
 1171                             addr, 0, offset + start, length);
 1172                         if (error != 0)
 1173                                 return (error);
 1174 nextdisk:
 1175                         if (++no >= vol->v_disks_count) {
 1176                                 no = 0;
 1177                                 offset += strip_size;
 1178                         }
 1179                 }
 1180                 remain -= length;
 1181                 addr += length;
 1182                 start = 0;
 1183         }
 1184         return (0);
 1185 }
 1186 
 1187 static int
 1188 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
 1189 {
 1190         struct bio *bp;
 1191         struct g_raid_subdisk *sd;
 1192 
 1193         bp = (struct bio *)argp;
 1194         sd = (struct g_raid_subdisk *)bp->bio_caller1;
 1195         g_raid_subdisk_iostart(sd, bp);
 1196 
 1197         return (0);
 1198 }
 1199 
 1200 static int
 1201 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
 1202 {
 1203         struct g_raid_tr_raid1e_object *trs;
 1204         struct g_raid_volume *vol;
 1205 
 1206         vol = tr->tro_volume;
 1207         trs = (struct g_raid_tr_raid1e_object *)tr;
 1208         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
 1209         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
  1210         /* Compensate for short rebuild I/Os. */
 1211         if ((vol->v_disks_count % N) != 0 &&
 1212             vol->v_strip_size < g_raid1e_rebuild_slab) {
 1213                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
 1214                 trs->trso_recover_slabs /= vol->v_strip_size;
 1215         }
 1216         if (trs->trso_type == TR_RAID1E_REBUILD)
 1217                 g_raid_tr_raid1e_rebuild_some(tr);
 1218         return (0);
 1219 }
 1220 
 1221 static int
 1222 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
 1223 {
 1224         struct g_raid_tr_raid1e_object *trs;
 1225 
 1226         trs = (struct g_raid_tr_raid1e_object *)tr;
 1227 
 1228         if (trs->trso_buffer != NULL) {
 1229                 free(trs->trso_buffer, M_TR_RAID1E);
 1230                 trs->trso_buffer = NULL;
 1231         }
 1232         return (0);
 1233 }
 1234 
 1235 G_RAID_TR_DECLARE(raid1e, "RAID1E");
