FreeBSD/Linux Kernel Cross Reference
sys/geom/raid/tr_raid1e.c

    1 /*-
    2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD: releng/9.0/sys/geom/raid/tr_raid1e.c 220210 2011-03-31 16:19:53Z mav $");
   29 
   30 #include <sys/param.h>
   31 #include <sys/bio.h>
   32 #include <sys/endian.h>
   33 #include <sys/kernel.h>
   34 #include <sys/kobj.h>
   35 #include <sys/limits.h>
   36 #include <sys/lock.h>
   37 #include <sys/malloc.h>
   38 #include <sys/mutex.h>
   39 #include <sys/sysctl.h>
   40 #include <sys/systm.h>
   41 #include <geom/geom.h>
   42 #include "geom/raid/g_raid.h"
   43 #include "g_raid_tr_if.h"
   44 
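       /*
        * RAID1E keeps N copies of each data strip, laid out round-robin
        * across all disks in the volume; this implementation supports
        * exactly two copies.
        */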
   45 #define N       2
   46 
   47 SYSCTL_DECL(_kern_geom_raid);
   48 SYSCTL_NODE(_kern_geom_raid, OID_AUTO, raid1e, CTLFLAG_RW, 0,
   49     "RAID1E parameters");
   50 
    51 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
   52 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
   53 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
   54     &g_raid1e_rebuild_slab);
   55 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
   56     &g_raid1e_rebuild_slab, 0,
   57     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
   58 
   59 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
   60 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
   61 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
   62     &g_raid1e_rebuild_fair_io);
   63 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
   64     &g_raid1e_rebuild_fair_io, 0,
   65     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
   66 
   67 #define RAID1E_REBUILD_CLUSTER_IDLE 100
   68 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
   69 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
   70     &g_raid1e_rebuild_cluster_idle);
   71 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
   72     &g_raid1e_rebuild_cluster_idle, 0,
   73     "Number of slabs to do each time we trigger a rebuild cycle");
   74 
    75 #define RAID1E_REBUILD_META_UPDATE 1024 /* Update metadata every 1GB or so. */
   76 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
   77 TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
   78     &g_raid1e_rebuild_meta_update);
   79 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
   80     &g_raid1e_rebuild_meta_update, 0,
   81     "When to update the meta data.");
   82 
   83 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
   84 
   85 #define TR_RAID1E_NONE 0
   86 #define TR_RAID1E_REBUILD 1
   87 #define TR_RAID1E_RESYNC 2
   88 
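       /*
        * Rebuild state flags: DOING_SOME marks a rebuild I/O round in
        * progress, LOCKED marks a range locked against regular writes,
        * and ABORT asks the current round to stop once its I/O completes.
        */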
   89 #define TR_RAID1E_F_DOING_SOME  0x1
   90 #define TR_RAID1E_F_LOCKED      0x2
   91 #define TR_RAID1E_F_ABORT       0x4
   92 
   93 struct g_raid_tr_raid1e_object {
   94         struct g_raid_tr_object  trso_base;
   95         int                      trso_starting;
   96         int                      trso_stopping;
   97         int                      trso_type;
    98         int                      trso_recover_slabs; /* Slabs before resting. */
   99         int                      trso_fair_io;
  100         int                      trso_meta_update;
  101         int                      trso_flags;
  102         struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
  103         void                    *trso_buffer;    /* Buffer space */
  104         off_t                    trso_lock_pos; /* Locked range start. */
  105         off_t                    trso_lock_len; /* Locked range length. */
  106         struct bio               trso_bio;
  107 };
  108 
  109 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
  110 static g_raid_tr_event_t g_raid_tr_event_raid1e;
  111 static g_raid_tr_start_t g_raid_tr_start_raid1e;
  112 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
  113 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
  114 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
  115 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
  116 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
  117 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
  118 static g_raid_tr_free_t g_raid_tr_free_raid1e;
  119 
  120 static kobj_method_t g_raid_tr_raid1e_methods[] = {
  121         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
  122         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
  123         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
  124         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
  125         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
  126         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
  127         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
  128         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
  129         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
  130         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
  131         { 0, 0 }
  132 };
  133 
  134 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
  135         "RAID1E",
  136         g_raid_tr_raid1e_methods,
  137         sizeof(struct g_raid_tr_raid1e_object),
  138         .trc_priority = 200
  139 };
  140 
  141 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
  142 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
  143     struct g_raid_subdisk *sd);
  144 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
  145     int no, off_t off, off_t len, u_int mask);
  146 
  147 static inline void
  148 V2P(struct g_raid_volume *vol, off_t virt,
  149     int *disk, off_t *offset, off_t *start)
  150 {
  151         off_t nstrip;
  152         u_int strip_size;
  153 
  154         strip_size = vol->v_strip_size;
  155         /* Strip number. */
  156         nstrip = virt / strip_size;
  157         /* Start position in strip. */
  158         *start = virt % strip_size;
  159         /* Disk number. */
  160         *disk = (nstrip * N) % vol->v_disks_count;
  161         /* Strip start position in disk. */
  162         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
  163 }
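
       /*
        * Worked example (illustrative numbers): with 3 disks and a 64 KiB
        * strip, virtual offset 200704 falls in strip 3 at start 4096, so
        * disk = (3 * 2) % 3 = 0 and offset = (6 / 3) * 65536 = 131072.
        */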
  164 
  165 static inline void
  166 P2V(struct g_raid_volume *vol, int disk, off_t offset,
  167     off_t *virt, int *copy)
  168 {
  169         off_t nstrip, start;
  170         u_int strip_size;
  171 
  172         strip_size = vol->v_strip_size;
  173         /* Start position in strip. */
  174         start = offset % strip_size;
  175         /* Physical strip number. */
  176         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
   177         /* Copy index of the physical strip inside the virtual strip. */
  178         *copy = nstrip % N;
  179         /* Offset in virtual space. */
  180         *virt = (nstrip / N) * strip_size + start;
  181 }
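
       /*
        * P2V inverts V2P: continuing the example above, disk 0 at physical
        * offset 131072 + 4096 yields nstrip = (131072 / 65536) * 3 + 0 = 6,
        * copy = 6 % 2 = 0 and virt = (6 / 2) * 65536 + 4096 = 200704.
        */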
  182 
  183 static int
  184 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
  185 {
  186         struct g_raid_tr_raid1e_object *trs;
  187 
  188         trs = (struct g_raid_tr_raid1e_object *)tr;
  189         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
  190             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE)
  191                 return (G_RAID_TR_TASTE_FAIL);
  192         trs->trso_starting = 1;
  193         return (G_RAID_TR_TASTE_SUCCEED);
  194 }
  195 
  196 static int
  197 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
  198 {
  199         struct g_raid_softc *sc;
  200         struct g_raid_subdisk *sd, *bestsd, *worstsd;
  201         int i, j, state, sstate;
  202 
  203         sc = vol->v_softc;
  204         state = G_RAID_VOLUME_S_OPTIMAL;
  205         for (i = 0; i < vol->v_disks_count / N; i++) {
  206                 bestsd = &vol->v_subdisks[i * N];
  207                 for (j = 1; j < N; j++) {
  208                         sd = &vol->v_subdisks[i * N + j];
  209                         if (sd->sd_state > bestsd->sd_state)
  210                                 bestsd = sd;
  211                         else if (sd->sd_state == bestsd->sd_state &&
  212                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
  213                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
  214                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
  215                                 bestsd = sd;
  216                 }
  217                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
  218                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
   219                         /* We found a reasonable candidate. */
  220                         G_RAID_DEBUG1(1, sc,
  221                             "Promote subdisk %s:%d from %s to ACTIVE.",
  222                             vol->v_name, bestsd->sd_pos,
  223                             g_raid_subdisk_state2str(bestsd->sd_state));
  224                         g_raid_change_subdisk_state(bestsd,
  225                             G_RAID_SUBDISK_S_ACTIVE);
  226                         g_raid_write_metadata(sc,
  227                             vol, bestsd, bestsd->sd_disk);
  228                 }
  229                 worstsd = &vol->v_subdisks[i * N];
  230                 for (j = 1; j < N; j++) {
  231                         sd = &vol->v_subdisks[i * N + j];
  232                         if (sd->sd_state < worstsd->sd_state)
  233                                 worstsd = sd;
  234                 }
  235                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  236                         sstate = G_RAID_VOLUME_S_OPTIMAL;
  237                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  238                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
  239                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  240                         sstate = G_RAID_VOLUME_S_DEGRADED;
  241                 else
  242                         sstate = G_RAID_VOLUME_S_BROKEN;
  243                 if (sstate < state)
  244                         state = sstate;
  245         }
  246         return (state);
  247 }
  248 
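       /*
        * With an odd number of disks the copies of a strip wrap around the
        * end of the disk ring, so subdisks cannot be grouped into fixed
        * sets of N; each disk is instead evaluated together with its N - 1
        * successors.
        */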
  249 static int
  250 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
  251 {
  252         struct g_raid_softc *sc;
  253         struct g_raid_subdisk *sd, *bestsd, *worstsd;
  254         int i, j, state, sstate;
  255 
  256         sc = vol->v_softc;
  257         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
  258             vol->v_disks_count)
  259                 return (G_RAID_VOLUME_S_OPTIMAL);
  260         for (i = 0; i < vol->v_disks_count; i++) {
  261                 sd = &vol->v_subdisks[i];
  262                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
   263                         /* We found a reasonable candidate. */
  264                         G_RAID_DEBUG1(1, sc,
  265                             "Promote subdisk %s:%d from %s to STALE.",
  266                             vol->v_name, sd->sd_pos,
  267                             g_raid_subdisk_state2str(sd->sd_state));
  268                         g_raid_change_subdisk_state(sd,
  269                             G_RAID_SUBDISK_S_STALE);
  270                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
  271                 }
  272         }
  273         state = G_RAID_VOLUME_S_OPTIMAL;
  274         for (i = 0; i < vol->v_disks_count; i++) {
  275                 bestsd = &vol->v_subdisks[i];
  276                 worstsd = &vol->v_subdisks[i];
  277                 for (j = 1; j < N; j++) {
  278                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
  279                         if (sd->sd_state > bestsd->sd_state)
  280                                 bestsd = sd;
  281                         else if (sd->sd_state == bestsd->sd_state &&
  282                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
  283                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
  284                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
  285                                 bestsd = sd;
  286                         if (sd->sd_state < worstsd->sd_state)
  287                                 worstsd = sd;
  288                 }
  289                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  290                         sstate = G_RAID_VOLUME_S_OPTIMAL;
  291                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  292                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
  293                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  294                         sstate = G_RAID_VOLUME_S_DEGRADED;
  295                 else
  296                         sstate = G_RAID_VOLUME_S_BROKEN;
  297                 if (sstate < state)
  298                         state = sstate;
  299         }
  300         return (state);
  301 }
  302 
  303 static int
  304 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
  305     struct g_raid_subdisk *sd)
  306 {
  307         struct g_raid_tr_raid1e_object *trs;
  308         struct g_raid_softc *sc;
  309         u_int s;
  310 
  311         sc = vol->v_softc;
  312         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
  313         if (trs->trso_stopping &&
  314             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
  315                 s = G_RAID_VOLUME_S_STOPPED;
  316         else if (trs->trso_starting)
  317                 s = G_RAID_VOLUME_S_STARTING;
  318         else {
  319                 if ((vol->v_disks_count % N) == 0)
  320                         s = g_raid_tr_update_state_raid1e_even(vol);
  321                 else
  322                         s = g_raid_tr_update_state_raid1e_odd(vol);
  323         }
  324         if (s != vol->v_state) {
  325                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
  326                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
  327                     G_RAID_EVENT_VOLUME);
  328                 g_raid_change_volume_state(vol, s);
  329                 if (!trs->trso_starting && !trs->trso_stopping)
  330                         g_raid_write_metadata(sc, vol, NULL, NULL);
  331         }
  332         if (!trs->trso_starting && !trs->trso_stopping)
  333                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
  334         return (0);
  335 }
  336 
  337 static void
  338 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
  339     struct g_raid_disk *disk)
  340 {
  341         /*
  342          * We don't fail the last disk in the pack, since it still has decent
  343          * data on it and that's better than failing the disk if it is the root
  344          * file system.
  345          *
  346          * XXX should this be controlled via a tunable?  It makes sense for
  347          * the volume that has / on it.  I can't think of a case where we'd
  348          * want the volume to go away on this kind of event.
  349          */
  350         if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 &&
  351             g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd)
  352                 return;
  353         g_raid_fail_disk(sc, sd, disk);
  354 }
  355 
  356 static void
  357 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
  358 {
  359         struct g_raid_volume *vol;
  360         struct g_raid_subdisk *sd;
  361 
  362         vol = trs->trso_base.tro_volume;
  363         sd = trs->trso_failed_sd;
  364         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
  365         free(trs->trso_buffer, M_TR_RAID1E);
  366         trs->trso_buffer = NULL;
  367         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  368         trs->trso_type = TR_RAID1E_NONE;
  369         trs->trso_recover_slabs = 0;
  370         trs->trso_failed_sd = NULL;
  371         g_raid_tr_update_state_raid1e(vol, NULL);
  372 }
  373 
  374 static void
  375 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
  376 {
  377         struct g_raid_tr_raid1e_object *trs;
  378         struct g_raid_subdisk *sd;
  379 
  380         trs = (struct g_raid_tr_raid1e_object *)tr;
  381         sd = trs->trso_failed_sd;
  382         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
  383             "Subdisk %s:%d-%s rebuild completed.",
  384             sd->sd_volume->v_name, sd->sd_pos,
  385             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  386         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
  387         sd->sd_rebuild_pos = 0;
  388         g_raid_tr_raid1e_rebuild_done(trs);
  389 }
  390 
  391 static void
  392 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
  393 {
  394         struct g_raid_tr_raid1e_object *trs;
  395         struct g_raid_subdisk *sd;
  396         struct g_raid_volume *vol;
  397 
  398         vol = tr->tro_volume;
  399         trs = (struct g_raid_tr_raid1e_object *)tr;
  400         sd = trs->trso_failed_sd;
  401         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
  402                 G_RAID_DEBUG1(1, vol->v_softc,
  403                     "Subdisk %s:%d-%s rebuild is aborting.",
  404                     sd->sd_volume->v_name, sd->sd_pos,
  405                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  406                 trs->trso_flags |= TR_RAID1E_F_ABORT;
  407         } else {
  408                 G_RAID_DEBUG1(0, vol->v_softc,
  409                     "Subdisk %s:%d-%s rebuild aborted.",
  410                     sd->sd_volume->v_name, sd->sd_pos,
  411                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  412                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
  413                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
  414                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
  415                         g_raid_unlock_range(tr->tro_volume,
  416                             trs->trso_lock_pos, trs->trso_lock_len);
  417                 }
  418                 g_raid_tr_raid1e_rebuild_done(trs);
  419         }
  420 }
  421 
  422 static void
  423 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
  424 {
  425         struct g_raid_tr_raid1e_object *trs;
  426         struct g_raid_softc *sc;
  427         struct g_raid_volume *vol;
  428         struct g_raid_subdisk *sd;
  429         struct bio *bp;
  430         off_t len, virtual, vend, offset, start;
  431         int disk, copy, best;
  432 
  433         trs = (struct g_raid_tr_raid1e_object *)tr;
  434         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
  435                 return;
  436         vol = tr->tro_volume;
  437         sc = vol->v_softc;
  438         sd = trs->trso_failed_sd;
  439 
  440         while (1) {
  441                 if (sd->sd_rebuild_pos >= sd->sd_size) {
  442                         g_raid_tr_raid1e_rebuild_finish(tr);
  443                         return;
  444                 }
  445                 /* Get virtual offset from physical rebuild position. */
  446                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
  447                 /* Get physical offset back to get first stripe position. */
  448                 V2P(vol, virtual, &disk, &offset, &start);
   449                 /* Calculate contiguous data length. */
  450                 len = MIN(g_raid1e_rebuild_slab,
  451                     sd->sd_size - sd->sd_rebuild_pos);
  452                 if ((vol->v_disks_count % N) != 0)
  453                         len = MIN(len, vol->v_strip_size - start);
  454                 /* Find disk with most accurate data. */
  455                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
  456                     offset + start, len, 0);
  457                 if (best < 0) {
   458                         /* There is no valid disk. */
  459                         g_raid_tr_raid1e_rebuild_abort(tr);
  460                         return;
  461                 } else if (best != copy) {
  462                         /* Some other disk has better data. */
  463                         break;
  464                 }
  465                 /* We have the most accurate data. Skip the range. */
  466                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
  467                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
  468                 sd->sd_rebuild_pos += len;
  469         }
  470 
  471         bp = &trs->trso_bio;
  472         memset(bp, 0, sizeof(*bp));
  473         bp->bio_offset = offset + start +
  474             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
  475         bp->bio_length = len;
  476         bp->bio_data = trs->trso_buffer;
  477         bp->bio_cmd = BIO_READ;
  478         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
  479         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
  480         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
  481         /*
   482          * If we are crossing a strip boundary, correct the affected
   483          * virtual range we should lock.
  484          */
  485         if (start + len > vol->v_strip_size) {
  486                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
  487                 len = vend - virtual;
  488         }
  489         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
  490         trs->trso_flags |= TR_RAID1E_F_LOCKED;
  491         trs->trso_lock_pos = virtual;
  492         trs->trso_lock_len = len;
  493         /* Lock callback starts I/O */
  494         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
  495 }
  496 
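       /*
        * A note on the rebuild cycle: g_raid_tr_raid1e_rebuild_some() locks
        * one slab-sized range against regular writes and reads the freshest
        * copy; the iodone handler then rewrites that data to the recovering
        * subdisk, advances sd_rebuild_pos and starts the next iteration.
        */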
  497 static void
  498 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
  499 {
  500         struct g_raid_volume *vol;
  501         struct g_raid_tr_raid1e_object *trs;
  502         struct g_raid_subdisk *sd;
  503 
  504         vol = tr->tro_volume;
  505         trs = (struct g_raid_tr_raid1e_object *)tr;
  506         if (trs->trso_failed_sd) {
  507                 G_RAID_DEBUG1(1, vol->v_softc,
   508                     "Rebuild already in progress. pos %jd\n",
  509                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
  510                 return;
  511         }
  512         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
  513         if (sd == NULL)
  514                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
  515         if (sd == NULL) {
  516                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
  517                 if (sd != NULL) {
  518                         sd->sd_rebuild_pos = 0;
  519                         g_raid_change_subdisk_state(sd,
  520                             G_RAID_SUBDISK_S_RESYNC);
  521                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
  522                 } else {
  523                         sd = g_raid_get_subdisk(vol,
  524                             G_RAID_SUBDISK_S_UNINITIALIZED);
  525                         if (sd == NULL)
  526                                 sd = g_raid_get_subdisk(vol,
  527                                     G_RAID_SUBDISK_S_NEW);
  528                         if (sd != NULL) {
  529                                 sd->sd_rebuild_pos = 0;
  530                                 g_raid_change_subdisk_state(sd,
  531                                     G_RAID_SUBDISK_S_REBUILD);
  532                                 g_raid_write_metadata(vol->v_softc,
  533                                     vol, sd, NULL);
  534                         }
  535                 }
  536         }
  537         if (sd == NULL) {
  538                 G_RAID_DEBUG1(1, vol->v_softc,
  539                     "No failed disk to rebuild.  night night.");
  540                 return;
  541         }
  542         trs->trso_failed_sd = sd;
  543         G_RAID_DEBUG1(0, vol->v_softc,
  544             "Subdisk %s:%d-%s rebuild start at %jd.",
  545             sd->sd_volume->v_name, sd->sd_pos,
  546             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
  547             trs->trso_failed_sd->sd_rebuild_pos);
  548         trs->trso_type = TR_RAID1E_REBUILD;
  549         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
  550         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
  551         g_raid_tr_raid1e_rebuild_some(tr);
  552 }
  553 
  554 static void
  555 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
  556     struct g_raid_subdisk *sd)
  557 {
  558         struct g_raid_volume *vol;
  559         struct g_raid_tr_raid1e_object *trs;
  560         int nr;
  561         
  562         vol = tr->tro_volume;
  563         trs = (struct g_raid_tr_raid1e_object *)tr;
  564         if (trs->trso_stopping)
  565                 return;
  566         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
  567             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
  568         switch(trs->trso_type) {
  569         case TR_RAID1E_NONE:
  570                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
  571                         return;
  572                 if (nr == 0) {
  573                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
  574                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
  575                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
  576                         if (nr == 0)
  577                                 return;
  578                 }
  579                 g_raid_tr_raid1e_rebuild_start(tr);
  580                 break;
  581         case TR_RAID1E_REBUILD:
  582                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
  583                     trs->trso_failed_sd == sd)
  584                         g_raid_tr_raid1e_rebuild_abort(tr);
  585                 break;
  586         case TR_RAID1E_RESYNC:
  587                 break;
  588         }
  589 }
  590 
  591 static int
  592 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
  593     struct g_raid_subdisk *sd, u_int event)
  594 {
  595 
  596         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
  597         return (0);
  598 }
  599 
  600 static int
  601 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
  602 {
  603         struct g_raid_tr_raid1e_object *trs;
  604         struct g_raid_volume *vol;
  605 
  606         trs = (struct g_raid_tr_raid1e_object *)tr;
  607         vol = tr->tro_volume;
  608         trs->trso_starting = 0;
  609         g_raid_tr_update_state_raid1e(vol, NULL);
  610         return (0);
  611 }
  612 
  613 static int
  614 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
  615 {
  616         struct g_raid_tr_raid1e_object *trs;
  617         struct g_raid_volume *vol;
  618 
  619         trs = (struct g_raid_tr_raid1e_object *)tr;
  620         vol = tr->tro_volume;
  621         trs->trso_starting = 0;
  622         trs->trso_stopping = 1;
  623         g_raid_tr_update_state_raid1e(vol, NULL);
  624         return (0);
  625 }
  626 
  627 /*
  628  * Select the disk to read from.  Take into account: subdisk state, running
  629  * error recovery, average disk load, head position and possible cache hits.
  630  */
  631 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
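       /*
        * The priority value packs several criteria into one int: bits 24+
        * penalize stale or not-yet-synced copies, bits 16-23 penalize
        * disks already busy with error recovery, and the low bits hold the
        * current load minus a bonus for favorable head position.
        */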
  632 static int
  633 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
  634     int no, off_t off, off_t len, u_int mask)
  635 {
  636         struct g_raid_subdisk *sd;
  637         off_t offset;
  638         int i, best, prio, bestprio;
  639 
  640         best = -1;
  641         bestprio = INT_MAX;
  642         for (i = 0; i < N; i++) {
  643                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
  644                 offset = off;
  645                 if (no + i >= vol->v_disks_count)
  646                         offset += vol->v_strip_size;
  647 
  648                 prio = G_RAID_SUBDISK_LOAD(sd);
  649                 if ((mask & (1 << sd->sd_pos)) != 0)
  650                         continue;
  651                 switch (sd->sd_state) {
  652                 case G_RAID_SUBDISK_S_ACTIVE:
  653                         break;
  654                 case G_RAID_SUBDISK_S_RESYNC:
   655                         if (offset + len < sd->sd_rebuild_pos)
  656                                 break;
  657                         /* FALLTHROUGH */
  658                 case G_RAID_SUBDISK_S_STALE:
  659                         prio += i << 24;
  660                         break;
  661                 case G_RAID_SUBDISK_S_REBUILD:
   662                         if (offset + len < sd->sd_rebuild_pos)
  663                                 break;
  664                         /* FALLTHROUGH */
  665                 default:
  666                         continue;
  667                 }
  668                 prio += min(sd->sd_recovery, 255) << 16;
  669                 /* If disk head is precisely in position - highly prefer it. */
  670                 if (G_RAID_SUBDISK_POS(sd) == offset)
  671                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
  672                 else
  673                 /* If disk head is close to position - prefer it. */
  674                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
  675                     G_RAID_SUBDISK_TRACK_SIZE)
  676                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
  677                 if (prio < bestprio) {
  678                         bestprio = prio;
  679                         best = i;
  680                 }
  681         }
  682         return (best);
  683 }
  684 
  685 static void
  686 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
  687 {
  688         struct g_raid_volume *vol;
  689         struct g_raid_subdisk *sd;
  690         struct bio_queue_head queue;
  691         struct bio *cbp;
  692         char *addr;
  693         off_t offset, start, length, remain;
  694         u_int no, strip_size;
  695         int best;
  696 
  697         vol = tr->tro_volume;
  698         addr = bp->bio_data;
  699         strip_size = vol->v_strip_size;
  700         V2P(vol, bp->bio_offset, &no, &offset, &start);
  701         remain = bp->bio_length;
  702         bioq_init(&queue);
  703         while (remain > 0) {
  704                 length = MIN(strip_size - start, remain);
  705                 best = g_raid_tr_raid1e_select_read_disk(vol,
  706                     no, offset, length, 0);
  707                 KASSERT(best >= 0, ("No readable disk in volume %s!",
  708                     vol->v_name));
  709                 no += best;
  710                 if (no >= vol->v_disks_count) {
  711                         no -= vol->v_disks_count;
  712                         offset += strip_size;
  713                 }
  714                 cbp = g_clone_bio(bp);
  715                 if (cbp == NULL)
  716                         goto failure;
  717                 cbp->bio_offset = offset + start;
  718                 cbp->bio_data = addr;
  719                 cbp->bio_length = length;
  720                 cbp->bio_caller1 = &vol->v_subdisks[no];
  721                 bioq_insert_tail(&queue, cbp);
  722                 no += N - best;
  723                 if (no >= vol->v_disks_count) {
  724                         no -= vol->v_disks_count;
  725                         offset += strip_size;
  726                 }
  727                 remain -= length;
  728                 addr += length;
  729                 start = 0;
  730         }
  731         for (cbp = bioq_first(&queue); cbp != NULL;
  732             cbp = bioq_first(&queue)) {
  733                 bioq_remove(&queue, cbp);
  734                 sd = cbp->bio_caller1;
  735                 cbp->bio_caller1 = NULL;
  736                 g_raid_subdisk_iostart(sd, cbp);
  737         }
  738         return;
  739 failure:
  740         for (cbp = bioq_first(&queue); cbp != NULL;
  741             cbp = bioq_first(&queue)) {
  742                 bioq_remove(&queue, cbp);
  743                 g_destroy_bio(cbp);
  744         }
  745         if (bp->bio_error == 0)
  746                 bp->bio_error = ENOMEM;
  747         g_raid_iodone(bp, bp->bio_error);
  748 }
  749 
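       /*
        * Writes are mirrored: each strip-sized segment is cloned to every
        * usable copy, skipping rebuilding subdisks that have not yet
        * reached the affected offset.
        */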
  750 static void
  751 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
  752 {
  753         struct g_raid_volume *vol;
  754         struct g_raid_subdisk *sd;
  755         struct bio_queue_head queue;
  756         struct bio *cbp;
  757         char *addr;
  758         off_t offset, start, length, remain;
  759         u_int no, strip_size;
  760         int i;
  761 
  762         vol = tr->tro_volume;
  763         addr = bp->bio_data;
  764         strip_size = vol->v_strip_size;
  765         V2P(vol, bp->bio_offset, &no, &offset, &start);
  766         remain = bp->bio_length;
  767         bioq_init(&queue);
  768         while (remain > 0) {
  769                 length = MIN(strip_size - start, remain);
  770                 for (i = 0; i < N; i++) {
  771                         sd = &vol->v_subdisks[no];
  772                         switch (sd->sd_state) {
  773                         case G_RAID_SUBDISK_S_ACTIVE:
  774                         case G_RAID_SUBDISK_S_STALE:
  775                         case G_RAID_SUBDISK_S_RESYNC:
  776                                 break;
  777                         case G_RAID_SUBDISK_S_REBUILD:
  778                                 if (offset + start >= sd->sd_rebuild_pos)
  779                                         goto nextdisk;
  780                                 break;
  781                         default:
  782                                 goto nextdisk;
  783                         }
  784                         cbp = g_clone_bio(bp);
  785                         if (cbp == NULL)
  786                                 goto failure;
  787                         cbp->bio_offset = offset + start;
  788                         cbp->bio_data = addr;
  789                         cbp->bio_length = length;
  790                         cbp->bio_caller1 = sd;
  791                         bioq_insert_tail(&queue, cbp);
  792 nextdisk:
  793                         if (++no >= vol->v_disks_count) {
  794                                 no = 0;
  795                                 offset += strip_size;
  796                         }
  797                 }
  798                 remain -= length;
  799                 addr += length;
  800                 start = 0;
  801         }
  802         for (cbp = bioq_first(&queue); cbp != NULL;
  803             cbp = bioq_first(&queue)) {
  804                 bioq_remove(&queue, cbp);
  805                 sd = cbp->bio_caller1;
  806                 cbp->bio_caller1 = NULL;
  807                 g_raid_subdisk_iostart(sd, cbp);
  808         }
  809         return;
  810 failure:
  811         for (cbp = bioq_first(&queue); cbp != NULL;
  812             cbp = bioq_first(&queue)) {
  813                 bioq_remove(&queue, cbp);
  814                 g_destroy_bio(cbp);
  815         }
  816         if (bp->bio_error == 0)
  817                 bp->bio_error = ENOMEM;
  818         g_raid_iodone(bp, bp->bio_error);
  819 }
  820 
  821 static void
  822 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
  823 {
  824         struct g_raid_volume *vol;
  825         struct g_raid_tr_raid1e_object *trs;
  826 
  827         vol = tr->tro_volume;
  828         trs = (struct g_raid_tr_raid1e_object *)tr;
  829         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
  830             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
  831             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
  832                 g_raid_iodone(bp, EIO);
  833                 return;
  834         }
  835         /*
  836          * If we're rebuilding, squeeze in rebuild activity every so often,
  837          * even when the disk is busy.  Be sure to only count real I/O
  838          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
  839          * by this module.
  840          */
  841         if (trs->trso_failed_sd != NULL &&
  842             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
   843                 /* Make the new or currently running rebuild round short. */
  844                 trs->trso_recover_slabs = 0;
  845                 if (--trs->trso_fair_io <= 0) {
  846                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
  847                         g_raid_tr_raid1e_rebuild_some(tr);
  848                 }
  849         }
  850         switch (bp->bio_cmd) {
  851         case BIO_READ:
  852                 g_raid_tr_iostart_raid1e_read(tr, bp);
  853                 break;
  854         case BIO_WRITE:
  855                 g_raid_tr_iostart_raid1e_write(tr, bp);
  856                 break;
  857         case BIO_DELETE:
  858                 g_raid_iodone(bp, EIO);
  859                 break;
  860         case BIO_FLUSH:
  861                 g_raid_tr_flush_common(tr, bp);
  862                 break;
  863         default:
  864                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
  865                     bp->bio_cmd, vol->v_name));
  866                 break;
  867         }
  868 }
  869 
  870 static void
  871 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
  872     struct g_raid_subdisk *sd, struct bio *bp)
  873 {
  874         struct bio *cbp;
  875         struct g_raid_subdisk *nsd;
  876         struct g_raid_volume *vol;
  877         struct bio *pbp;
  878         struct g_raid_tr_raid1e_object *trs;
  879         off_t virtual, offset, start;
  880         uintptr_t mask;
  881         int error, do_write, copy, disk, best;
  882 
  883         trs = (struct g_raid_tr_raid1e_object *)tr;
  884         vol = tr->tro_volume;
  885         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
  886                 if (trs->trso_type == TR_RAID1E_REBUILD) {
  887                         nsd = trs->trso_failed_sd;
  888                         if (bp->bio_cmd == BIO_READ) {
  889 
  890                                 /* Immediately abort rebuild, if requested. */
  891                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
  892                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  893                                         g_raid_tr_raid1e_rebuild_abort(tr);
  894                                         return;
  895                                 }
  896 
  897                                 /* On read error, skip and cross fingers. */
  898                                 if (bp->bio_error != 0) {
  899                                         G_RAID_LOGREQ(0, bp,
  900                                             "Read error during rebuild (%d), "
  901                                             "possible data loss!",
  902                                             bp->bio_error);
  903                                         goto rebuild_round_done;
  904                                 }
  905 
  906                                 /*
  907                                  * The read operation finished, queue the
  908                                  * write and get out.
  909                                  */
  910                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
  911                                     bp->bio_error);
  912                                 bp->bio_cmd = BIO_WRITE;
  913                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
  914                                 bp->bio_offset = nsd->sd_rebuild_pos;
  915                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
  916                                 g_raid_subdisk_iostart(nsd, bp);
  917                         } else {
  918                                 /*
  919                                  * The write operation just finished.  Do
  920                                  * another.  We keep cloning the master bio
  921                                  * since it has the right buffers allocated to
  922                                  * it.
  923                                  */
  924                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
  925                                     bp->bio_error);
  926                                 if (bp->bio_error != 0 ||
  927                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
  928                                         if ((trs->trso_flags &
  929                                             TR_RAID1E_F_ABORT) == 0) {
  930                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
  931                                                     nsd, nsd->sd_disk);
  932                                         }
  933                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  934                                         g_raid_tr_raid1e_rebuild_abort(tr);
  935                                         return;
  936                                 }
  937 rebuild_round_done:
  938                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
  939                                 g_raid_unlock_range(tr->tro_volume,
  940                                     trs->trso_lock_pos, trs->trso_lock_len);
  941                                 nsd->sd_rebuild_pos += bp->bio_length;
  942                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
  943                                         g_raid_tr_raid1e_rebuild_finish(tr);
  944                                         return;
  945                                 }
  946 
  947                                 /* Abort rebuild if we are stopping */
  948                                 if (trs->trso_stopping) {
  949                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  950                                         g_raid_tr_raid1e_rebuild_abort(tr);
  951                                         return;
  952                                 }
  953 
  954                                 if (--trs->trso_meta_update <= 0) {
  955                                         g_raid_write_metadata(vol->v_softc,
  956                                             vol, nsd, nsd->sd_disk);
  957                                         trs->trso_meta_update =
  958                                             g_raid1e_rebuild_meta_update;
  959                                         /* Compensate short rebuild I/Os. */
  960                                         if ((vol->v_disks_count % N) != 0 &&
  961                                             vol->v_strip_size <
  962                                              g_raid1e_rebuild_slab) {
  963                                                 trs->trso_meta_update *=
  964                                                     g_raid1e_rebuild_slab;
  965                                                 trs->trso_meta_update /=
  966                                                     vol->v_strip_size;
  967                                         }
  968                                 }
  969                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  970                                 if (--trs->trso_recover_slabs <= 0)
  971                                         return;
  972                                 /* Run next rebuild iteration. */
  973                                 g_raid_tr_raid1e_rebuild_some(tr);
  974                         }
  975                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
  976                         /*
   977                          * Read the good sd and the bad sd in parallel.  When both
   978                          * are done, compare the buffers and write the good data
   979                          * to the bad sd if they differ, then do the next chunk.
  980                          */
  981                         panic("Somehow, we think we're doing a resync");
  982                 }
  983                 return;
  984         }
  985         pbp = bp->bio_parent;
  986         pbp->bio_inbed++;
  987         mask = (intptr_t)bp->bio_caller2;
  988         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
  989                 /*
  990                  * Read failed on first drive.  Retry the read error on
  991                  * another disk drive, if available, before erroring out the
  992                  * read.
  993                  */
  994                 sd->sd_disk->d_read_errs++;
  995                 G_RAID_LOGREQ(0, bp,
  996                     "Read error (%d), %d read errors total",
  997                     bp->bio_error, sd->sd_disk->d_read_errs);
  998 
  999                 /*
 1000                  * If there are too many read errors, we move to degraded.
 1001                  * XXX Do we want to FAIL the drive (eg, make the user redo
 1002                  * everything to get it back in sync), or just degrade the
 1003                  * drive, which kicks off a resync?
 1004                  */
 1005                 do_write = 0;
 1006                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
 1007                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1008                 else if (mask == 0)
 1009                         do_write = 1;
 1010 
 1011                 /* Restore what we were doing. */
 1012                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1013                 V2P(vol, virtual, &disk, &offset, &start);
 1014 
 1015                 /* Find the other disk, and try to do the I/O to it. */
 1016                 mask |= 1 << copy;
 1017                 best = g_raid_tr_raid1e_select_read_disk(vol,
 1018                     disk, offset, start, mask);
 1019                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 1020                         disk += best;
 1021                         if (disk >= vol->v_disks_count) {
 1022                                 disk -= vol->v_disks_count;
 1023                                 offset += vol->v_strip_size;
 1024                         }
 1025                         cbp->bio_offset = offset + start;
 1026                         cbp->bio_length = bp->bio_length;
 1027                         cbp->bio_data = bp->bio_data;
 1028                         g_destroy_bio(bp);
 1029                         nsd = &vol->v_subdisks[disk];
 1030                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
 1031                             nsd->sd_pos);
 1032                         if (do_write)
 1033                                 mask |= 1 << 31;
 1034                         if ((mask & (1 << 31)) != 0)
 1035                                 sd->sd_recovery++;
 1036                         cbp->bio_caller2 = (void *)mask;
 1037                         if (do_write) {
 1038                                 cbp->bio_caller1 = nsd;
 1039                                 /* Lock callback starts I/O */
 1040                                 g_raid_lock_range(sd->sd_volume,
 1041                                     virtual, cbp->bio_length, pbp, cbp);
 1042                         } else {
 1043                                 g_raid_subdisk_iostart(nsd, cbp);
 1044                         }
 1045                         return;
 1046                 }
 1047                 /*
 1048                  * We can't retry.  Return the original error by falling
 1049                  * through.  This will happen when there's only one good disk.
 1050                  * We don't need to fail the raid, since its actual state is
 1051                  * based on the state of the subdisks.
 1052                  */
 1053                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
 1054         }
 1055         if (bp->bio_cmd == BIO_READ &&
 1056             bp->bio_error == 0 &&
 1057             (mask & (1 << 31)) != 0) {
 1058                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
 1059 
 1060                 /* Restore what we were doing. */
 1061                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1062                 V2P(vol, virtual, &disk, &offset, &start);
 1063 
 1064                 /* Find best disk to write. */
 1065                 best = g_raid_tr_raid1e_select_read_disk(vol,
 1066                     disk, offset, start, ~mask);
 1067                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 1068                         disk += best;
 1069                         if (disk >= vol->v_disks_count) {
 1070                                 disk -= vol->v_disks_count;
 1071                                 offset += vol->v_strip_size;
 1072                         }
 1073                         cbp->bio_offset = offset + start;
 1074                         cbp->bio_length = bp->bio_length;
 1075                         cbp->bio_data = bp->bio_data;
 1076                         cbp->bio_cmd = BIO_WRITE;
 1077                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
 1078                         cbp->bio_caller2 = (void *)mask;
 1079                         g_destroy_bio(bp);
 1080                         G_RAID_LOGREQ(2, cbp,
 1081                             "Attempting bad sector remap on failing drive.");
 1082                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
 1083                         return;
 1084                 }
 1085         }
 1086         if ((mask & (1 << 31)) != 0) {
 1087                 /*
 1088                  * We're done with a recovery, mark the range as unlocked.
  1089                  * For any write errors, we aggressively fail the disk since
  1090                  * there was both a READ and a WRITE error at this location.
  1091                  * Both types of errors generally indicate the drive is on
 1092                  * the verge of total failure anyway.  Better to stop trusting
 1093                  * it now.  However, we need to reset error to 0 in that case
 1094                  * because we're not failing the original I/O which succeeded.
 1095                  */
 1096 
 1097                 /* Restore what we were doing. */
 1098                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1099                 V2P(vol, virtual, &disk, &offset, &start);
 1100 
 1101                 for (copy = 0; copy < N; copy++) {
 1102                         if ((mask & (1 << copy) ) != 0)
 1103                                 vol->v_subdisks[(disk + copy) %
 1104                                     vol->v_disks_count].sd_recovery--;
 1105                 }
 1106 
 1107                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
 1108                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
 1109                             "failing subdisk.");
 1110                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1111                         bp->bio_error = 0;
 1112                 }
 1113                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 1114                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
 1115         }
 1116         error = bp->bio_error;
 1117         g_destroy_bio(bp);
 1118         if (pbp->bio_children == pbp->bio_inbed) {
 1119                 pbp->bio_completed = pbp->bio_length;
 1120                 g_raid_iodone(pbp, error);
 1121         }
 1122 }
 1123 
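       /*
        * Kernel dumps reuse the same layout walk as the write path, but
        * push the data synchronously via g_raid_subdisk_kerneldump()
        * instead of cloning bios.
        */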
 1124 static int
 1125 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
 1126     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
 1127 {
 1128         struct g_raid_volume *vol;
 1129         struct g_raid_subdisk *sd;
 1130         struct bio_queue_head queue;
 1131         char *addr;
 1132         off_t offset, start, length, remain;
 1133         u_int no, strip_size;
 1134         int i, error;
 1135 
 1136         vol = tr->tro_volume;
 1137         addr = virtual;
 1138         strip_size = vol->v_strip_size;
 1139         V2P(vol, boffset, &no, &offset, &start);
 1140         remain = blength;
 1141         bioq_init(&queue);
 1142         while (remain > 0) {
 1143                 length = MIN(strip_size - start, remain);
 1144                 for (i = 0; i < N; i++) {
 1145                         sd = &vol->v_subdisks[no];
 1146                         switch (sd->sd_state) {
 1147                         case G_RAID_SUBDISK_S_ACTIVE:
 1148                         case G_RAID_SUBDISK_S_STALE:
 1149                         case G_RAID_SUBDISK_S_RESYNC:
 1150                                 break;
 1151                         case G_RAID_SUBDISK_S_REBUILD:
 1152                                 if (offset + start >= sd->sd_rebuild_pos)
 1153                                         goto nextdisk;
 1154                                 break;
 1155                         default:
 1156                                 goto nextdisk;
 1157                         }
 1158                         error = g_raid_subdisk_kerneldump(sd,
 1159                             addr, 0, offset + start, length);
 1160                         if (error != 0)
 1161                                 return (error);
 1162 nextdisk:
 1163                         if (++no >= vol->v_disks_count) {
 1164                                 no = 0;
 1165                                 offset += strip_size;
 1166                         }
 1167                 }
 1168                 remain -= length;
 1169                 addr += length;
 1170                 start = 0;
 1171         }
 1172         return (0);
 1173 }
 1174 
 1175 static int
 1176 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
 1177 {
 1178         struct bio *bp;
 1179         struct g_raid_subdisk *sd;
 1180 
 1181         bp = (struct bio *)argp;
 1182         sd = (struct g_raid_subdisk *)bp->bio_caller1;
 1183         g_raid_subdisk_iostart(sd, bp);
 1184 
 1185         return (0);
 1186 }
 1187 
 1188 static int
 1189 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
 1190 {
 1191         struct g_raid_tr_raid1e_object *trs;
 1192         struct g_raid_volume *vol;
 1193 
 1194         vol = tr->tro_volume;
 1195         trs = (struct g_raid_tr_raid1e_object *)tr;
 1196         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
 1197         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
 1198         /* Compensate short rebuild I/Os. */
 1199         if ((vol->v_disks_count % N) != 0 &&
 1200             vol->v_strip_size < g_raid1e_rebuild_slab) {
 1201                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
 1202                 trs->trso_recover_slabs /= vol->v_strip_size;
 1203         }
 1204         if (trs->trso_type == TR_RAID1E_REBUILD)
 1205                 g_raid_tr_raid1e_rebuild_some(tr);
 1206         return (0);
 1207 }
 1208 
 1209 static int
 1210 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
 1211 {
 1212         struct g_raid_tr_raid1e_object *trs;
 1213 
 1214         trs = (struct g_raid_tr_raid1e_object *)tr;
 1215 
 1216         if (trs->trso_buffer != NULL) {
 1217                 free(trs->trso_buffer, M_TR_RAID1E);
 1218                 trs->trso_buffer = NULL;
 1219         }
 1220         return (0);
 1221 }
 1222 
 1223 G_RAID_TR_DECLARE(g_raid_tr_raid1e);
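
       /*
        * G_RAID_TR_DECLARE() registers this transformation class with the
        * g_raid core at module load; tasting (above) then binds it to
        * volumes whose metadata reports RAID1E.
        */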
