FreeBSD/Linux Kernel Cross Reference
sys/geom/raid/tr_raid1e.c


    1 /*-
    2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD$");
   29 
   30 #include <sys/param.h>
   31 #include <sys/bio.h>
   32 #include <sys/endian.h>
   33 #include <sys/kernel.h>
   34 #include <sys/kobj.h>
   35 #include <sys/limits.h>
   36 #include <sys/lock.h>
   37 #include <sys/malloc.h>
   38 #include <sys/mutex.h>
   39 #include <sys/sysctl.h>
   40 #include <sys/systm.h>
   41 #include <geom/geom.h>
   42 #include "geom/raid/g_raid.h"
   43 #include "g_raid_tr_if.h"
   44 
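/*
 * N is the number of copies of each strip kept by this RAID1E transform:
 * the mapping code below indexes copies with "% N" and writes are issued
 * to N consecutive subdisks, so the module assumes exactly two copies.
 */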
   45 #define N       2
   46 
   47 SYSCTL_DECL(_kern_geom_raid_raid1e);
   48 
    49 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
   50 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
   51 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
   52     &g_raid1e_rebuild_slab);
   53 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
   54     &g_raid1e_rebuild_slab, 0,
   55     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
   56 
   57 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
   58 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
   59 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
   60     &g_raid1e_rebuild_fair_io);
   61 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
   62     &g_raid1e_rebuild_fair_io, 0,
    63     "Fraction of the I/O bandwidth to use for rebuild when the disk is busy.");
   64 
   65 #define RAID1E_REBUILD_CLUSTER_IDLE 100
   66 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
   67 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
   68     &g_raid1e_rebuild_cluster_idle);
   69 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
   70     &g_raid1e_rebuild_cluster_idle, 0,
    71     "Number of slabs to rebuild each time we trigger a rebuild cycle.");
   72 
   73 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
   74 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
   75 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
   76     &g_raid1e_rebuild_meta_update);
   77 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
   78     &g_raid1e_rebuild_meta_update, 0,
    79     "How often to update the metadata during rebuild.");
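
/*
 * The four knobs above are loader tunables and read-write sysctls, so they
 * can be changed without rebuilding the kernel.  A minimal sketch of how an
 * administrator might enlarge the rebuild transaction and give the rebuild
 * a larger share of a busy disk (the values are illustrative only):
 *
 *   # /boot/loader.conf
 *   kern.geom.raid.raid1e.rebuild_slab_size="2097152"
 *
 *   # at run time, via sysctl(8)
 *   sysctl kern.geom.raid.raid1e.rebuild_fair_io=10
 */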
   80 
   81 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
   82 
   83 #define TR_RAID1E_NONE 0
   84 #define TR_RAID1E_REBUILD 1
   85 #define TR_RAID1E_RESYNC 2
   86 
   87 #define TR_RAID1E_F_DOING_SOME  0x1
   88 #define TR_RAID1E_F_LOCKED      0x2
   89 #define TR_RAID1E_F_ABORT       0x4
   90 
   91 struct g_raid_tr_raid1e_object {
   92         struct g_raid_tr_object  trso_base;
   93         int                      trso_starting;
   94         int                      trso_stopping;
   95         int                      trso_type;
   96         int                      trso_recover_slabs; /* slabs before rest */
   97         int                      trso_fair_io;
   98         int                      trso_meta_update;
   99         int                      trso_flags;
  100         struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
  101         void                    *trso_buffer;    /* Buffer space */
  102         off_t                    trso_lock_pos; /* Locked range start. */
  103         off_t                    trso_lock_len; /* Locked range length. */
  104         struct bio               trso_bio;
  105 };
  106 
  107 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
  108 static g_raid_tr_event_t g_raid_tr_event_raid1e;
  109 static g_raid_tr_start_t g_raid_tr_start_raid1e;
  110 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
  111 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
  112 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
  113 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
  114 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
  115 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
  116 static g_raid_tr_free_t g_raid_tr_free_raid1e;
  117 
  118 static kobj_method_t g_raid_tr_raid1e_methods[] = {
  119         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
  120         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
  121         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
  122         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
  123         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
  124         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
  125         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
  126         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
  127         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
  128         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
  129         { 0, 0 }
  130 };
  131 
  132 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
  133         "RAID1E",
  134         g_raid_tr_raid1e_methods,
  135         sizeof(struct g_raid_tr_raid1e_object),
  136         .trc_enable = 1,
  137         .trc_priority = 200,
  138         .trc_accept_unmapped = 1
  139 };
  140 
  141 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
  142 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
  143     struct g_raid_subdisk *sd);
  144 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
  145     int no, off_t off, off_t len, u_int mask);
  146 
  147 static inline void
  148 V2P(struct g_raid_volume *vol, off_t virt,
  149     int *disk, off_t *offset, off_t *start)
  150 {
  151         off_t nstrip;
  152         u_int strip_size;
  153 
  154         strip_size = vol->v_strip_size;
  155         /* Strip number. */
  156         nstrip = virt / strip_size;
  157         /* Start position in strip. */
  158         *start = virt % strip_size;
  159         /* Disk number. */
  160         *disk = (nstrip * N) % vol->v_disks_count;
  161         /* Strip start position in disk. */
  162         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
  163 }
  164 
  165 static inline void
  166 P2V(struct g_raid_volume *vol, int disk, off_t offset,
  167     off_t *virt, int *copy)
  168 {
  169         off_t nstrip, start;
  170         u_int strip_size;
  171 
  172         strip_size = vol->v_strip_size;
  173         /* Start position in strip. */
  174         start = offset % strip_size;
  175         /* Physical strip number. */
  176         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
  177         /* Number of physical strip (copy) inside virtual strip. */
  178         *copy = nstrip % N;
  179         /* Offset in virtual space. */
  180         *virt = (nstrip / N) * strip_size + start;
  181 }
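
/*
 * A worked example of the V2P()/P2V() arithmetic above, assuming (purely
 * for illustration) a volume with 3 subdisks and a 64 KiB (65536 byte)
 * strip, with N = 2 copies of every strip:
 *
 *   virt = 200704  (= 3 * 65536 + 4096)
 *     nstrip = 200704 / 65536         = 3
 *     start  = 200704 % 65536         = 4096
 *     disk   = (3 * 2) % 3            = 0
 *     offset = ((3 * 2) / 3) * 65536  = 131072
 *
 * so the primary copy of virtual offset 200704 lives on subdisk 0 at
 * physical offset 131072 + 4096 = 135168, and the mirror copy is the next
 * physical strip: subdisk 1 at the same offset.  Running that copy back
 * through P2V(vol, 1, 135168, ...) gives
 *
 *     nstrip = (135168 / 65536) * 3 + 1 = 7
 *     copy   = 7 % 2                    = 1
 *     virt   = (7 / 2) * 65536 + 4096   = 200704
 *
 * which recovers the original virtual offset.
 */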
  182 
  183 static int
  184 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
  185 {
  186         struct g_raid_tr_raid1e_object *trs;
  187 
  188         trs = (struct g_raid_tr_raid1e_object *)tr;
  189         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
  190             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
  191                 return (G_RAID_TR_TASTE_FAIL);
  192         trs->trso_starting = 1;
  193         return (G_RAID_TR_TASTE_SUCCEED);
  194 }
  195 
  196 static int
  197 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
  198 {
  199         struct g_raid_softc *sc;
  200         struct g_raid_subdisk *sd, *bestsd, *worstsd;
  201         int i, j, state, sstate;
  202 
  203         sc = vol->v_softc;
  204         state = G_RAID_VOLUME_S_OPTIMAL;
  205         for (i = 0; i < vol->v_disks_count / N; i++) {
  206                 bestsd = &vol->v_subdisks[i * N];
  207                 for (j = 1; j < N; j++) {
  208                         sd = &vol->v_subdisks[i * N + j];
  209                         if (sd->sd_state > bestsd->sd_state)
  210                                 bestsd = sd;
  211                         else if (sd->sd_state == bestsd->sd_state &&
  212                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
  213                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
  214                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
  215                                 bestsd = sd;
  216                 }
  217                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
  218                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
   219                         /* We found a reasonable candidate. */
  220                         G_RAID_DEBUG1(1, sc,
  221                             "Promote subdisk %s:%d from %s to ACTIVE.",
  222                             vol->v_name, bestsd->sd_pos,
  223                             g_raid_subdisk_state2str(bestsd->sd_state));
  224                         g_raid_change_subdisk_state(bestsd,
  225                             G_RAID_SUBDISK_S_ACTIVE);
  226                         g_raid_write_metadata(sc,
  227                             vol, bestsd, bestsd->sd_disk);
  228                 }
  229                 worstsd = &vol->v_subdisks[i * N];
  230                 for (j = 1; j < N; j++) {
  231                         sd = &vol->v_subdisks[i * N + j];
  232                         if (sd->sd_state < worstsd->sd_state)
  233                                 worstsd = sd;
  234                 }
  235                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  236                         sstate = G_RAID_VOLUME_S_OPTIMAL;
  237                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  238                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
  239                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  240                         sstate = G_RAID_VOLUME_S_DEGRADED;
  241                 else
  242                         sstate = G_RAID_VOLUME_S_BROKEN;
  243                 if (sstate < state)
  244                         state = sstate;
  245         }
  246         return (state);
  247 }
  248 
  249 static int
  250 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
  251 {
  252         struct g_raid_softc *sc;
  253         struct g_raid_subdisk *sd, *bestsd, *worstsd;
  254         int i, j, state, sstate;
  255 
  256         sc = vol->v_softc;
  257         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
  258             vol->v_disks_count)
  259                 return (G_RAID_VOLUME_S_OPTIMAL);
  260         for (i = 0; i < vol->v_disks_count; i++) {
  261                 sd = &vol->v_subdisks[i];
  262                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
   263                         /* We found a reasonable candidate. */
  264                         G_RAID_DEBUG1(1, sc,
  265                             "Promote subdisk %s:%d from %s to STALE.",
  266                             vol->v_name, sd->sd_pos,
  267                             g_raid_subdisk_state2str(sd->sd_state));
  268                         g_raid_change_subdisk_state(sd,
  269                             G_RAID_SUBDISK_S_STALE);
  270                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
  271                 }
  272         }
  273         state = G_RAID_VOLUME_S_OPTIMAL;
  274         for (i = 0; i < vol->v_disks_count; i++) {
  275                 bestsd = &vol->v_subdisks[i];
  276                 worstsd = &vol->v_subdisks[i];
  277                 for (j = 1; j < N; j++) {
  278                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
  279                         if (sd->sd_state > bestsd->sd_state)
  280                                 bestsd = sd;
  281                         else if (sd->sd_state == bestsd->sd_state &&
  282                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
  283                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
  284                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
  285                                 bestsd = sd;
  286                         if (sd->sd_state < worstsd->sd_state)
  287                                 worstsd = sd;
  288                 }
  289                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  290                         sstate = G_RAID_VOLUME_S_OPTIMAL;
  291                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  292                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
  293                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  294                         sstate = G_RAID_VOLUME_S_DEGRADED;
  295                 else
  296                         sstate = G_RAID_VOLUME_S_BROKEN;
  297                 if (sstate < state)
  298                         state = sstate;
  299         }
  300         return (state);
  301 }
  302 
  303 static int
  304 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
  305     struct g_raid_subdisk *sd)
  306 {
  307         struct g_raid_tr_raid1e_object *trs;
  308         struct g_raid_softc *sc;
  309         u_int s;
  310 
  311         sc = vol->v_softc;
  312         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
  313         if (trs->trso_stopping &&
  314             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
  315                 s = G_RAID_VOLUME_S_STOPPED;
  316         else if (trs->trso_starting)
  317                 s = G_RAID_VOLUME_S_STARTING;
  318         else {
  319                 if ((vol->v_disks_count % N) == 0)
  320                         s = g_raid_tr_update_state_raid1e_even(vol);
  321                 else
  322                         s = g_raid_tr_update_state_raid1e_odd(vol);
  323         }
  324         if (s != vol->v_state) {
  325                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
  326                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
  327                     G_RAID_EVENT_VOLUME);
  328                 g_raid_change_volume_state(vol, s);
  329                 if (!trs->trso_starting && !trs->trso_stopping)
  330                         g_raid_write_metadata(sc, vol, NULL, NULL);
  331         }
  332         if (!trs->trso_starting && !trs->trso_stopping)
  333                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
  334         return (0);
  335 }
  336 
  337 static void
  338 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
  339     struct g_raid_disk *disk)
  340 {
  341         struct g_raid_volume *vol;
  342 
  343         vol = sd->sd_volume;
  344         /*
  345          * We don't fail the last disk in the pack, since it still has decent
  346          * data on it and that's better than failing the disk if it is the root
  347          * file system.
  348          *
  349          * XXX should this be controlled via a tunable?  It makes sense for
  350          * the volume that has / on it.  I can't think of a case where we'd
  351          * want the volume to go away on this kind of event.
  352          */
  353         if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
  354              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
  355              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
  356              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
  357              vol->v_disks_count) &&
  358             (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
  359                 return;
  360         g_raid_fail_disk(sc, sd, disk);
  361 }
  362 
  363 static void
  364 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
  365 {
  366         struct g_raid_volume *vol;
  367         struct g_raid_subdisk *sd;
  368 
  369         vol = trs->trso_base.tro_volume;
  370         sd = trs->trso_failed_sd;
  371         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
  372         free(trs->trso_buffer, M_TR_RAID1E);
  373         trs->trso_buffer = NULL;
  374         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  375         trs->trso_type = TR_RAID1E_NONE;
  376         trs->trso_recover_slabs = 0;
  377         trs->trso_failed_sd = NULL;
  378         g_raid_tr_update_state_raid1e(vol, NULL);
  379 }
  380 
  381 static void
  382 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
  383 {
  384         struct g_raid_tr_raid1e_object *trs;
  385         struct g_raid_subdisk *sd;
  386 
  387         trs = (struct g_raid_tr_raid1e_object *)tr;
  388         sd = trs->trso_failed_sd;
  389         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
  390             "Subdisk %s:%d-%s rebuild completed.",
  391             sd->sd_volume->v_name, sd->sd_pos,
  392             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  393         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
  394         sd->sd_rebuild_pos = 0;
  395         g_raid_tr_raid1e_rebuild_done(trs);
  396 }
  397 
  398 static void
  399 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
  400 {
  401         struct g_raid_tr_raid1e_object *trs;
  402         struct g_raid_subdisk *sd;
  403         struct g_raid_volume *vol;
  404 
  405         vol = tr->tro_volume;
  406         trs = (struct g_raid_tr_raid1e_object *)tr;
  407         sd = trs->trso_failed_sd;
  408         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
  409                 G_RAID_DEBUG1(1, vol->v_softc,
  410                     "Subdisk %s:%d-%s rebuild is aborting.",
  411                     sd->sd_volume->v_name, sd->sd_pos,
  412                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  413                 trs->trso_flags |= TR_RAID1E_F_ABORT;
  414         } else {
  415                 G_RAID_DEBUG1(0, vol->v_softc,
  416                     "Subdisk %s:%d-%s rebuild aborted.",
  417                     sd->sd_volume->v_name, sd->sd_pos,
  418                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  419                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
  420                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
  421                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
  422                         g_raid_unlock_range(tr->tro_volume,
  423                             trs->trso_lock_pos, trs->trso_lock_len);
  424                 }
  425                 g_raid_tr_raid1e_rebuild_done(trs);
  426         }
  427 }
  428 
  429 static void
  430 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
  431 {
  432         struct g_raid_tr_raid1e_object *trs;
  433         struct g_raid_softc *sc;
  434         struct g_raid_volume *vol;
  435         struct g_raid_subdisk *sd;
  436         struct bio *bp;
  437         off_t len, virtual, vend, offset, start;
  438         int disk, copy, best;
  439 
  440         trs = (struct g_raid_tr_raid1e_object *)tr;
  441         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
  442                 return;
  443         vol = tr->tro_volume;
  444         sc = vol->v_softc;
  445         sd = trs->trso_failed_sd;
  446 
  447         while (1) {
  448                 if (sd->sd_rebuild_pos >= sd->sd_size) {
  449                         g_raid_tr_raid1e_rebuild_finish(tr);
  450                         return;
  451                 }
  452                 /* Get virtual offset from physical rebuild position. */
  453                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
  454                 /* Get physical offset back to get first stripe position. */
  455                 V2P(vol, virtual, &disk, &offset, &start);
   456                 /* Calculate contiguous data length. */
  457                 len = MIN(g_raid1e_rebuild_slab,
  458                     sd->sd_size - sd->sd_rebuild_pos);
  459                 if ((vol->v_disks_count % N) != 0)
  460                         len = MIN(len, vol->v_strip_size - start);
  461                 /* Find disk with most accurate data. */
  462                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
  463                     offset + start, len, 0);
  464                 if (best < 0) {
   465                         /* There is no valid disk. */
  466                         g_raid_tr_raid1e_rebuild_abort(tr);
  467                         return;
  468                 } else if (best != copy) {
  469                         /* Some other disk has better data. */
  470                         break;
  471                 }
  472                 /* We have the most accurate data. Skip the range. */
  473                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
  474                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
  475                 sd->sd_rebuild_pos += len;
  476         }
  477 
  478         bp = &trs->trso_bio;
  479         memset(bp, 0, sizeof(*bp));
  480         bp->bio_offset = offset + start +
  481             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
  482         bp->bio_length = len;
  483         bp->bio_data = trs->trso_buffer;
  484         bp->bio_cmd = BIO_READ;
  485         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
  486         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
  487         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
  488         /*
   489          * If we are crossing a stripe boundary, correct the affected
   490          * virtual range we should lock.
  491          */
  492         if (start + len > vol->v_strip_size) {
  493                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
  494                 len = vend - virtual;
  495         }
  496         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
  497         trs->trso_flags |= TR_RAID1E_F_LOCKED;
  498         trs->trso_lock_pos = virtual;
  499         trs->trso_lock_len = len;
  500         /* Lock callback starts I/O */
  501         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
  502 }
  503 
  504 static void
  505 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
  506 {
  507         struct g_raid_volume *vol;
  508         struct g_raid_tr_raid1e_object *trs;
  509         struct g_raid_subdisk *sd;
  510 
  511         vol = tr->tro_volume;
  512         trs = (struct g_raid_tr_raid1e_object *)tr;
  513         if (trs->trso_failed_sd) {
  514                 G_RAID_DEBUG1(1, vol->v_softc,
   515                     "Rebuild already in progress at rebuild start. pos %jd\n",
  516                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
  517                 return;
  518         }
  519         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
  520         if (sd == NULL)
  521                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
  522         if (sd == NULL) {
  523                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
  524                 if (sd != NULL) {
  525                         sd->sd_rebuild_pos = 0;
  526                         g_raid_change_subdisk_state(sd,
  527                             G_RAID_SUBDISK_S_RESYNC);
  528                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
  529                 } else {
  530                         sd = g_raid_get_subdisk(vol,
  531                             G_RAID_SUBDISK_S_UNINITIALIZED);
  532                         if (sd == NULL)
  533                                 sd = g_raid_get_subdisk(vol,
  534                                     G_RAID_SUBDISK_S_NEW);
  535                         if (sd != NULL) {
  536                                 sd->sd_rebuild_pos = 0;
  537                                 g_raid_change_subdisk_state(sd,
  538                                     G_RAID_SUBDISK_S_REBUILD);
  539                                 g_raid_write_metadata(vol->v_softc,
  540                                     vol, sd, NULL);
  541                         }
  542                 }
  543         }
  544         if (sd == NULL) {
  545                 G_RAID_DEBUG1(1, vol->v_softc,
  546                     "No failed disk to rebuild.  night night.");
  547                 return;
  548         }
  549         trs->trso_failed_sd = sd;
  550         G_RAID_DEBUG1(0, vol->v_softc,
  551             "Subdisk %s:%d-%s rebuild start at %jd.",
  552             sd->sd_volume->v_name, sd->sd_pos,
  553             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
  554             trs->trso_failed_sd->sd_rebuild_pos);
  555         trs->trso_type = TR_RAID1E_REBUILD;
  556         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
  557         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
  558         g_raid_tr_raid1e_rebuild_some(tr);
  559 }
  560 
  561 static void
  562 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
  563     struct g_raid_subdisk *sd)
  564 {
  565         struct g_raid_volume *vol;
  566         struct g_raid_tr_raid1e_object *trs;
  567         int nr;
  568         
  569         vol = tr->tro_volume;
  570         trs = (struct g_raid_tr_raid1e_object *)tr;
  571         if (trs->trso_stopping)
  572                 return;
  573         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
  574             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
  575         switch(trs->trso_type) {
  576         case TR_RAID1E_NONE:
  577                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
  578                         return;
  579                 if (nr == 0) {
  580                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
  581                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
  582                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
  583                         if (nr == 0)
  584                                 return;
  585                 }
  586                 g_raid_tr_raid1e_rebuild_start(tr);
  587                 break;
  588         case TR_RAID1E_REBUILD:
  589                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
  590                     trs->trso_failed_sd == sd)
  591                         g_raid_tr_raid1e_rebuild_abort(tr);
  592                 break;
  593         case TR_RAID1E_RESYNC:
  594                 break;
  595         }
  596 }
  597 
  598 static int
  599 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
  600     struct g_raid_subdisk *sd, u_int event)
  601 {
  602 
  603         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
  604         return (0);
  605 }
  606 
  607 static int
  608 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
  609 {
  610         struct g_raid_tr_raid1e_object *trs;
  611         struct g_raid_volume *vol;
  612 
  613         trs = (struct g_raid_tr_raid1e_object *)tr;
  614         vol = tr->tro_volume;
  615         trs->trso_starting = 0;
  616         g_raid_tr_update_state_raid1e(vol, NULL);
  617         return (0);
  618 }
  619 
  620 static int
  621 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
  622 {
  623         struct g_raid_tr_raid1e_object *trs;
  624         struct g_raid_volume *vol;
  625 
  626         trs = (struct g_raid_tr_raid1e_object *)tr;
  627         vol = tr->tro_volume;
  628         trs->trso_starting = 0;
  629         trs->trso_stopping = 1;
  630         g_raid_tr_update_state_raid1e(vol, NULL);
  631         return (0);
  632 }
  633 
  634 /*
  635  * Select the disk to read from.  Take into account: subdisk state, running
  636  * error recovery, average disk load, head position and possible cache hits.
  637  */
  638 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
  639 static int
  640 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
  641     int no, off_t off, off_t len, u_int mask)
  642 {
  643         struct g_raid_subdisk *sd;
  644         off_t offset;
  645         int i, best, prio, bestprio;
  646 
  647         best = -1;
  648         bestprio = INT_MAX;
  649         for (i = 0; i < N; i++) {
  650                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
  651                 offset = off;
  652                 if (no + i >= vol->v_disks_count)
  653                         offset += vol->v_strip_size;
  654 
  655                 prio = G_RAID_SUBDISK_LOAD(sd);
  656                 if ((mask & (1 << sd->sd_pos)) != 0)
  657                         continue;
  658                 switch (sd->sd_state) {
  659                 case G_RAID_SUBDISK_S_ACTIVE:
  660                         break;
  661                 case G_RAID_SUBDISK_S_RESYNC:
  662                         if (offset + off < sd->sd_rebuild_pos)
  663                                 break;
  664                         /* FALLTHROUGH */
  665                 case G_RAID_SUBDISK_S_STALE:
  666                         prio += i << 24;
  667                         break;
  668                 case G_RAID_SUBDISK_S_REBUILD:
  669                         if (offset + off < sd->sd_rebuild_pos)
  670                                 break;
  671                         /* FALLTHROUGH */
  672                 default:
  673                         continue;
  674                 }
  675                 prio += min(sd->sd_recovery, 255) << 16;
  676                 /* If disk head is precisely in position - highly prefer it. */
  677                 if (G_RAID_SUBDISK_POS(sd) == offset)
  678                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
  679                 else
  680                 /* If disk head is close to position - prefer it. */
  681                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
  682                     G_RAID_SUBDISK_TRACK_SIZE)
  683                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
  684                 if (prio < bestprio) {
  685                         bestprio = prio;
  686                         best = i;
  687                 }
  688         }
  689         return (best);
  690 }
  691 
  692 static void
  693 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
  694 {
  695         struct g_raid_volume *vol;
  696         struct g_raid_subdisk *sd;
  697         struct bio_queue_head queue;
  698         struct bio *cbp;
  699         char *addr;
  700         off_t offset, start, length, remain;
  701         u_int no, strip_size;
  702         int best;
  703 
  704         vol = tr->tro_volume;
  705         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
  706                 addr = NULL;
  707         else
  708                 addr = bp->bio_data;
  709         strip_size = vol->v_strip_size;
  710         V2P(vol, bp->bio_offset, &no, &offset, &start);
  711         remain = bp->bio_length;
  712         bioq_init(&queue);
  713         while (remain > 0) {
  714                 length = MIN(strip_size - start, remain);
  715                 best = g_raid_tr_raid1e_select_read_disk(vol,
  716                     no, offset, length, 0);
  717                 KASSERT(best >= 0, ("No readable disk in volume %s!",
  718                     vol->v_name));
  719                 no += best;
  720                 if (no >= vol->v_disks_count) {
  721                         no -= vol->v_disks_count;
  722                         offset += strip_size;
  723                 }
  724                 cbp = g_clone_bio(bp);
  725                 if (cbp == NULL)
  726                         goto failure;
  727                 cbp->bio_offset = offset + start;
  728                 cbp->bio_length = length;
  729                 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
  730                         cbp->bio_ma_offset += (uintptr_t)addr;
  731                         cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
  732                         cbp->bio_ma_offset %= PAGE_SIZE;
  733                         cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
  734                             cbp->bio_length) / PAGE_SIZE;
  735                 } else
  736                         cbp->bio_data = addr;
  737                 cbp->bio_caller1 = &vol->v_subdisks[no];
  738                 bioq_insert_tail(&queue, cbp);
  739                 no += N - best;
  740                 if (no >= vol->v_disks_count) {
  741                         no -= vol->v_disks_count;
  742                         offset += strip_size;
  743                 }
  744                 remain -= length;
  745                 addr += length;
  746                 start = 0;
  747         }
  748         while ((cbp = bioq_takefirst(&queue)) != NULL) {
  749                 sd = cbp->bio_caller1;
  750                 cbp->bio_caller1 = NULL;
  751                 g_raid_subdisk_iostart(sd, cbp);
  752         }
  753         return;
  754 failure:
  755         while ((cbp = bioq_takefirst(&queue)) != NULL)
  756                 g_destroy_bio(cbp);
  757         if (bp->bio_error == 0)
  758                 bp->bio_error = ENOMEM;
  759         g_raid_iodone(bp, bp->bio_error);
  760 }
  761 
  762 static void
  763 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
  764 {
  765         struct g_raid_volume *vol;
  766         struct g_raid_subdisk *sd;
  767         struct bio_queue_head queue;
  768         struct bio *cbp;
  769         char *addr;
  770         off_t offset, start, length, remain;
  771         u_int no, strip_size;
  772         int i;
  773 
  774         vol = tr->tro_volume;
  775         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
  776                 addr = NULL;
  777         else
  778                 addr = bp->bio_data;
  779         strip_size = vol->v_strip_size;
  780         V2P(vol, bp->bio_offset, &no, &offset, &start);
  781         remain = bp->bio_length;
  782         bioq_init(&queue);
  783         while (remain > 0) {
  784                 length = MIN(strip_size - start, remain);
  785                 for (i = 0; i < N; i++) {
  786                         sd = &vol->v_subdisks[no];
  787                         switch (sd->sd_state) {
  788                         case G_RAID_SUBDISK_S_ACTIVE:
  789                         case G_RAID_SUBDISK_S_STALE:
  790                         case G_RAID_SUBDISK_S_RESYNC:
  791                                 break;
  792                         case G_RAID_SUBDISK_S_REBUILD:
  793                                 if (offset + start >= sd->sd_rebuild_pos)
  794                                         goto nextdisk;
  795                                 break;
  796                         default:
  797                                 goto nextdisk;
  798                         }
  799                         cbp = g_clone_bio(bp);
  800                         if (cbp == NULL)
  801                                 goto failure;
  802                         cbp->bio_offset = offset + start;
  803                         cbp->bio_length = length;
  804                         if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
  805                             bp->bio_cmd != BIO_DELETE) {
  806                                 cbp->bio_ma_offset += (uintptr_t)addr;
  807                                 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
  808                                 cbp->bio_ma_offset %= PAGE_SIZE;
  809                                 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
  810                                     cbp->bio_length) / PAGE_SIZE;
  811                         } else
  812                                 cbp->bio_data = addr;
  813                         cbp->bio_caller1 = sd;
  814                         bioq_insert_tail(&queue, cbp);
  815 nextdisk:
  816                         if (++no >= vol->v_disks_count) {
  817                                 no = 0;
  818                                 offset += strip_size;
  819                         }
  820                 }
  821                 remain -= length;
  822                 if (bp->bio_cmd != BIO_DELETE)
  823                         addr += length;
  824                 start = 0;
  825         }
  826         while ((cbp = bioq_takefirst(&queue)) != NULL) {
  827                 sd = cbp->bio_caller1;
  828                 cbp->bio_caller1 = NULL;
  829                 g_raid_subdisk_iostart(sd, cbp);
  830         }
  831         return;
  832 failure:
  833         while ((cbp = bioq_takefirst(&queue)) != NULL)
  834                 g_destroy_bio(cbp);
  835         if (bp->bio_error == 0)
  836                 bp->bio_error = ENOMEM;
  837         g_raid_iodone(bp, bp->bio_error);
  838 }
  839 
  840 static void
  841 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
  842 {
  843         struct g_raid_volume *vol;
  844         struct g_raid_tr_raid1e_object *trs;
  845 
  846         vol = tr->tro_volume;
  847         trs = (struct g_raid_tr_raid1e_object *)tr;
  848         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
  849             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
  850             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
  851                 g_raid_iodone(bp, EIO);
  852                 return;
  853         }
  854         /*
  855          * If we're rebuilding, squeeze in rebuild activity every so often,
  856          * even when the disk is busy.  Be sure to only count real I/O
  857          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
  858          * by this module.
  859          */
  860         if (trs->trso_failed_sd != NULL &&
  861             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
   862                 /* Make this new or now-running round short. */
  863                 trs->trso_recover_slabs = 0;
  864                 if (--trs->trso_fair_io <= 0) {
  865                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
  866                         g_raid_tr_raid1e_rebuild_some(tr);
  867                 }
  868         }
  869         switch (bp->bio_cmd) {
  870         case BIO_READ:
  871                 g_raid_tr_iostart_raid1e_read(tr, bp);
  872                 break;
  873         case BIO_WRITE:
  874         case BIO_DELETE:
  875                 g_raid_tr_iostart_raid1e_write(tr, bp);
  876                 break;
  877         case BIO_FLUSH:
  878                 g_raid_tr_flush_common(tr, bp);
  879                 break;
  880         default:
  881                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
  882                     bp->bio_cmd, vol->v_name));
  883                 break;
  884         }
  885 }
  886 
  887 static void
  888 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
  889     struct g_raid_subdisk *sd, struct bio *bp)
  890 {
  891         struct bio *cbp;
  892         struct g_raid_subdisk *nsd;
  893         struct g_raid_volume *vol;
  894         struct bio *pbp;
  895         struct g_raid_tr_raid1e_object *trs;
  896         off_t virtual, offset, start;
  897         uintptr_t mask;
  898         int error, do_write, copy, disk, best;
  899 
  900         trs = (struct g_raid_tr_raid1e_object *)tr;
  901         vol = tr->tro_volume;
  902         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
  903                 if (trs->trso_type == TR_RAID1E_REBUILD) {
  904                         nsd = trs->trso_failed_sd;
  905                         if (bp->bio_cmd == BIO_READ) {
  906 
  907                                 /* Immediately abort rebuild, if requested. */
  908                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
  909                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  910                                         g_raid_tr_raid1e_rebuild_abort(tr);
  911                                         return;
  912                                 }
  913 
  914                                 /* On read error, skip and cross fingers. */
  915                                 if (bp->bio_error != 0) {
  916                                         G_RAID_LOGREQ(0, bp,
  917                                             "Read error during rebuild (%d), "
  918                                             "possible data loss!",
  919                                             bp->bio_error);
  920                                         goto rebuild_round_done;
  921                                 }
  922 
  923                                 /*
  924                                  * The read operation finished, queue the
  925                                  * write and get out.
  926                                  */
  927                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
  928                                     bp->bio_error);
  929                                 bp->bio_cmd = BIO_WRITE;
  930                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
  931                                 bp->bio_offset = nsd->sd_rebuild_pos;
  932                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
  933                                 g_raid_subdisk_iostart(nsd, bp);
  934                         } else {
  935                                 /*
  936                                  * The write operation just finished.  Do
  937                                  * another.  We keep cloning the master bio
  938                                  * since it has the right buffers allocated to
  939                                  * it.
  940                                  */
  941                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
  942                                     bp->bio_error);
  943                                 if (bp->bio_error != 0 ||
  944                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
  945                                         if ((trs->trso_flags &
  946                                             TR_RAID1E_F_ABORT) == 0) {
  947                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
  948                                                     nsd, nsd->sd_disk);
  949                                         }
  950                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  951                                         g_raid_tr_raid1e_rebuild_abort(tr);
  952                                         return;
  953                                 }
  954 rebuild_round_done:
  955                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
  956                                 g_raid_unlock_range(tr->tro_volume,
  957                                     trs->trso_lock_pos, trs->trso_lock_len);
  958                                 nsd->sd_rebuild_pos += bp->bio_length;
  959                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
  960                                         g_raid_tr_raid1e_rebuild_finish(tr);
  961                                         return;
  962                                 }
  963 
  964                                 /* Abort rebuild if we are stopping */
  965                                 if (trs->trso_stopping) {
  966                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  967                                         g_raid_tr_raid1e_rebuild_abort(tr);
  968                                         return;
  969                                 }
  970 
  971                                 if (--trs->trso_meta_update <= 0) {
  972                                         g_raid_write_metadata(vol->v_softc,
  973                                             vol, nsd, nsd->sd_disk);
  974                                         trs->trso_meta_update =
  975                                             g_raid1e_rebuild_meta_update;
  976                                         /* Compensate short rebuild I/Os. */
  977                                         if ((vol->v_disks_count % N) != 0 &&
  978                                             vol->v_strip_size <
  979                                              g_raid1e_rebuild_slab) {
  980                                                 trs->trso_meta_update *=
  981                                                     g_raid1e_rebuild_slab;
  982                                                 trs->trso_meta_update /=
  983                                                     vol->v_strip_size;
  984                                         }
  985                                 }
  986                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  987                                 if (--trs->trso_recover_slabs <= 0)
  988                                         return;
  989                                 /* Run next rebuild iteration. */
  990                                 g_raid_tr_raid1e_rebuild_some(tr);
  991                         }
  992                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
  993                         /*
  994                          * read good sd, read bad sd in parallel.  when both
  995                          * done, compare the buffers.  write good to the bad
  996                          * if different.  do the next bit of work.
  997                          */
  998                         panic("Somehow, we think we're doing a resync");
  999                 }
 1000                 return;
 1001         }
 1002         pbp = bp->bio_parent;
 1003         pbp->bio_inbed++;
 1004         mask = (intptr_t)bp->bio_caller2;
 1005         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
 1006                 /*
 1007                  * Read failed on first drive.  Retry the read error on
 1008                  * another disk drive, if available, before erroring out the
 1009                  * read.
 1010                  */
 1011                 sd->sd_disk->d_read_errs++;
 1012                 G_RAID_LOGREQ(0, bp,
 1013                     "Read error (%d), %d read errors total",
 1014                     bp->bio_error, sd->sd_disk->d_read_errs);
 1015 
 1016                 /*
 1017                  * If there are too many read errors, we move to degraded.
 1018                  * XXX Do we want to FAIL the drive (eg, make the user redo
 1019                  * everything to get it back in sync), or just degrade the
 1020                  * drive, which kicks off a resync?
 1021                  */
 1022                 do_write = 0;
 1023                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
 1024                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1025                 else if (mask == 0)
 1026                         do_write = 1;
 1027 
 1028                 /* Restore what we were doing. */
 1029                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1030                 V2P(vol, virtual, &disk, &offset, &start);
 1031 
 1032                 /* Find the other disk, and try to do the I/O to it. */
 1033                 mask |= 1 << copy;
 1034                 best = g_raid_tr_raid1e_select_read_disk(vol,
 1035                     disk, offset, start, mask);
 1036                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 1037                         disk += best;
 1038                         if (disk >= vol->v_disks_count) {
 1039                                 disk -= vol->v_disks_count;
 1040                                 offset += vol->v_strip_size;
 1041                         }
 1042                         cbp->bio_offset = offset + start;
 1043                         cbp->bio_length = bp->bio_length;
 1044                         cbp->bio_data = bp->bio_data;
 1045                         cbp->bio_ma = bp->bio_ma;
 1046                         cbp->bio_ma_offset = bp->bio_ma_offset;
 1047                         cbp->bio_ma_n = bp->bio_ma_n;
 1048                         g_destroy_bio(bp);
 1049                         nsd = &vol->v_subdisks[disk];
 1050                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
 1051                             nsd->sd_pos);
 1052                         if (do_write)
 1053                                 mask |= 1 << 31;
 1054                         if ((mask & (1 << 31)) != 0)
 1055                                 sd->sd_recovery++;
 1056                         cbp->bio_caller2 = (void *)mask;
 1057                         if (do_write) {
 1058                                 cbp->bio_caller1 = nsd;
 1059                                 /* Lock callback starts I/O */
 1060                                 g_raid_lock_range(sd->sd_volume,
 1061                                     virtual, cbp->bio_length, pbp, cbp);
 1062                         } else {
 1063                                 g_raid_subdisk_iostart(nsd, cbp);
 1064                         }
 1065                         return;
 1066                 }
 1067                 /*
 1068                  * We can't retry.  Return the original error by falling
 1069                  * through.  This will happen when there's only one good disk.
 1070                  * We don't need to fail the raid, since its actual state is
 1071                  * based on the state of the subdisks.
 1072                  */
 1073                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
 1074         }
 1075         if (bp->bio_cmd == BIO_READ &&
 1076             bp->bio_error == 0 &&
 1077             (mask & (1 << 31)) != 0) {
 1078                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
 1079 
 1080                 /* Restore what we were doing. */
 1081                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1082                 V2P(vol, virtual, &disk, &offset, &start);
 1083 
 1084                 /* Find best disk to write. */
 1085                 best = g_raid_tr_raid1e_select_read_disk(vol,
 1086                     disk, offset, start, ~mask);
 1087                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 1088                         disk += best;
 1089                         if (disk >= vol->v_disks_count) {
 1090                                 disk -= vol->v_disks_count;
 1091                                 offset += vol->v_strip_size;
 1092                         }
 1093                         cbp->bio_offset = offset + start;
 1094                         cbp->bio_cmd = BIO_WRITE;
 1095                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
 1096                         cbp->bio_caller2 = (void *)mask;
 1097                         g_destroy_bio(bp);
 1098                         G_RAID_LOGREQ(2, cbp,
 1099                             "Attempting bad sector remap on failing drive.");
 1100                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
 1101                         return;
 1102                 }
 1103         }
 1104         if ((mask & (1 << 31)) != 0) {
 1105                 /*
 1106                  * We're done with a recovery, mark the range as unlocked.
  1107                  * For any write errors, we aggressively fail the disk since
 1108                  * there was both a READ and a WRITE error at this location.
  1109                  * Both types of errors generally indicate the drive is on
 1110                  * the verge of total failure anyway.  Better to stop trusting
 1111                  * it now.  However, we need to reset error to 0 in that case
 1112                  * because we're not failing the original I/O which succeeded.
 1113                  */
 1114 
 1115                 /* Restore what we were doing. */
 1116                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1117                 V2P(vol, virtual, &disk, &offset, &start);
 1118 
 1119                 for (copy = 0; copy < N; copy++) {
 1120                         if ((mask & (1 << copy) ) != 0)
 1121                                 vol->v_subdisks[(disk + copy) %
 1122                                     vol->v_disks_count].sd_recovery--;
 1123                 }
 1124 
 1125                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
 1126                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
 1127                             "failing subdisk.");
 1128                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1129                         bp->bio_error = 0;
 1130                 }
 1131                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 1132                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
 1133         }
 1134         if (pbp->bio_cmd != BIO_READ) {
 1135                 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
 1136                         pbp->bio_error = bp->bio_error;
 1137                 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
 1138                         G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
 1139                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1140                 }
 1141                 error = pbp->bio_error;
 1142         } else
 1143                 error = bp->bio_error;
 1144         g_destroy_bio(bp);
 1145         if (pbp->bio_children == pbp->bio_inbed) {
 1146                 pbp->bio_completed = pbp->bio_length;
 1147                 g_raid_iodone(pbp, error);
 1148         }
 1149 }
 1150 
 1151 static int
 1152 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
 1153     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
 1154 {
 1155         struct g_raid_volume *vol;
 1156         struct g_raid_subdisk *sd;
 1157         struct bio_queue_head queue;
 1158         char *addr;
 1159         off_t offset, start, length, remain;
 1160         u_int no, strip_size;
 1161         int i, error;
 1162 
 1163         vol = tr->tro_volume;
 1164         addr = virtual;
 1165         strip_size = vol->v_strip_size;
 1166         V2P(vol, boffset, &no, &offset, &start);
 1167         remain = blength;
 1168         bioq_init(&queue);
 1169         while (remain > 0) {
 1170                 length = MIN(strip_size - start, remain);
 1171                 for (i = 0; i < N; i++) {
 1172                         sd = &vol->v_subdisks[no];
 1173                         switch (sd->sd_state) {
 1174                         case G_RAID_SUBDISK_S_ACTIVE:
 1175                         case G_RAID_SUBDISK_S_STALE:
 1176                         case G_RAID_SUBDISK_S_RESYNC:
 1177                                 break;
 1178                         case G_RAID_SUBDISK_S_REBUILD:
 1179                                 if (offset + start >= sd->sd_rebuild_pos)
 1180                                         goto nextdisk;
 1181                                 break;
 1182                         default:
 1183                                 goto nextdisk;
 1184                         }
 1185                         error = g_raid_subdisk_kerneldump(sd,
 1186                             addr, 0, offset + start, length);
 1187                         if (error != 0)
 1188                                 return (error);
 1189 nextdisk:
 1190                         if (++no >= vol->v_disks_count) {
 1191                                 no = 0;
 1192                                 offset += strip_size;
 1193                         }
 1194                 }
 1195                 remain -= length;
 1196                 addr += length;
 1197                 start = 0;
 1198         }
 1199         return (0);
 1200 }
 1201 
 1202 static int
 1203 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
 1204 {
 1205         struct bio *bp;
 1206         struct g_raid_subdisk *sd;
 1207 
 1208         bp = (struct bio *)argp;
 1209         sd = (struct g_raid_subdisk *)bp->bio_caller1;
 1210         g_raid_subdisk_iostart(sd, bp);
 1211 
 1212         return (0);
 1213 }
 1214 
 1215 static int
 1216 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
 1217 {
 1218         struct g_raid_tr_raid1e_object *trs;
 1219         struct g_raid_volume *vol;
 1220 
 1221         vol = tr->tro_volume;
 1222         trs = (struct g_raid_tr_raid1e_object *)tr;
 1223         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
 1224         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
 1225         /* Compensate short rebuild I/Os. */
 1226         if ((vol->v_disks_count % N) != 0 &&
 1227             vol->v_strip_size < g_raid1e_rebuild_slab) {
 1228                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
 1229                 trs->trso_recover_slabs /= vol->v_strip_size;
 1230         }
 1231         if (trs->trso_type == TR_RAID1E_REBUILD)
 1232                 g_raid_tr_raid1e_rebuild_some(tr);
 1233         return (0);
 1234 }
 1235 
 1236 static int
 1237 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
 1238 {
 1239         struct g_raid_tr_raid1e_object *trs;
 1240 
 1241         trs = (struct g_raid_tr_raid1e_object *)tr;
 1242 
 1243         if (trs->trso_buffer != NULL) {
 1244                 free(trs->trso_buffer, M_TR_RAID1E);
 1245                 trs->trso_buffer = NULL;
 1246         }
 1247         return (0);
 1248 }
 1249 
 1250 G_RAID_TR_DECLARE(raid1e, "RAID1E");
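
/*
 * G_RAID_TR_DECLARE() registers this transform with the g_raid(4) framework
 * under the level name "RAID1E".  As a rough, illustrative sketch of use
 * from userland (the disk names and the choice of metadata format are
 * assumptions; the format must itself support RAID1E), a three-disk volume
 * could be created with graid(8) along the lines of:
 *
 *   graid label DDF data RAID1E ada0 ada1 ada2
 */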
