FreeBSD/Linux Kernel Cross Reference
sys/geom/raid/tr_raid1e.c

    1 /*-
    2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD: releng/11.2/sys/geom/raid/tr_raid1e.c 298808 2016-04-29 20:56:58Z pfg $");
   29 
   30 #include <sys/param.h>
   31 #include <sys/bio.h>
   32 #include <sys/endian.h>
   33 #include <sys/kernel.h>
   34 #include <sys/kobj.h>
   35 #include <sys/limits.h>
   36 #include <sys/lock.h>
   37 #include <sys/malloc.h>
   38 #include <sys/mutex.h>
   39 #include <sys/sysctl.h>
   40 #include <sys/systm.h>
   41 #include <geom/geom.h>
   42 #include "geom/raid/g_raid.h"
   43 #include "g_raid_tr_if.h"
   44 
   45 #define N       2
   46 
   47 SYSCTL_DECL(_kern_geom_raid_raid1e);
   48 
    49 #define RAID1E_REBUILD_SLAB     (1 << 20) /* One transaction in a rebuild */
   50 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
   51 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
   52     &g_raid1e_rebuild_slab, 0,
   53     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
   54 
   55 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
   56 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
   57 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
   58     &g_raid1e_rebuild_fair_io, 0,
   59     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
   60 
   61 #define RAID1E_REBUILD_CLUSTER_IDLE 100
   62 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
   63 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
   64     &g_raid1e_rebuild_cluster_idle, 0,
   65     "Number of slabs to do each time we trigger a rebuild cycle");
   66 
   67 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
   68 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
   69 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
   70     &g_raid1e_rebuild_meta_update, 0,
   71     "When to update the meta data.");
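
      /*
       * The four knobs above live under kern.geom.raid.raid1e and, being
       * CTLFLAG_RWTUN, can be adjusted at run time with sysctl(8) or preset
       * as loader tunables.  The values below are only illustrative:
       *
       *   # sysctl kern.geom.raid.raid1e.rebuild_fair_io=10
       *   # echo 'kern.geom.raid.raid1e.rebuild_slab_size="2097152"' >> /boot/loader.conf
       */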
   72 
   73 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
   74 
   75 #define TR_RAID1E_NONE 0
   76 #define TR_RAID1E_REBUILD 1
   77 #define TR_RAID1E_RESYNC 2
   78 
   79 #define TR_RAID1E_F_DOING_SOME  0x1
   80 #define TR_RAID1E_F_LOCKED      0x2
   81 #define TR_RAID1E_F_ABORT       0x4
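      /*
       * How the flags above are used below: DOING_SOME means a rebuild or
       * resync copy cycle has an I/O in flight; LOCKED means the rebuild
       * currently holds the range lock recorded in trso_lock_pos/len;
       * ABORT asks the in-flight cycle to be the last one, with the abort
       * finalized from the I/O completion path.
       */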
   82 
   83 struct g_raid_tr_raid1e_object {
   84         struct g_raid_tr_object  trso_base;
   85         int                      trso_starting;
   86         int                      trso_stopping;
   87         int                      trso_type;
   88         int                      trso_recover_slabs; /* slabs before rest */
   89         int                      trso_fair_io;
   90         int                      trso_meta_update;
   91         int                      trso_flags;
   92         struct g_raid_subdisk   *trso_failed_sd; /* like per volume */
   93         void                    *trso_buffer;    /* Buffer space */
   94         off_t                    trso_lock_pos; /* Locked range start. */
   95         off_t                    trso_lock_len; /* Locked range length. */
   96         struct bio               trso_bio;
   97 };
   98 
   99 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
  100 static g_raid_tr_event_t g_raid_tr_event_raid1e;
  101 static g_raid_tr_start_t g_raid_tr_start_raid1e;
  102 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
  103 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
  104 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
  105 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
  106 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
  107 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
  108 static g_raid_tr_free_t g_raid_tr_free_raid1e;
  109 
  110 static kobj_method_t g_raid_tr_raid1e_methods[] = {
  111         KOBJMETHOD(g_raid_tr_taste,     g_raid_tr_taste_raid1e),
  112         KOBJMETHOD(g_raid_tr_event,     g_raid_tr_event_raid1e),
  113         KOBJMETHOD(g_raid_tr_start,     g_raid_tr_start_raid1e),
  114         KOBJMETHOD(g_raid_tr_stop,      g_raid_tr_stop_raid1e),
  115         KOBJMETHOD(g_raid_tr_iostart,   g_raid_tr_iostart_raid1e),
  116         KOBJMETHOD(g_raid_tr_iodone,    g_raid_tr_iodone_raid1e),
  117         KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
  118         KOBJMETHOD(g_raid_tr_locked,    g_raid_tr_locked_raid1e),
  119         KOBJMETHOD(g_raid_tr_idle,      g_raid_tr_idle_raid1e),
  120         KOBJMETHOD(g_raid_tr_free,      g_raid_tr_free_raid1e),
  121         { 0, 0 }
  122 };
  123 
  124 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
  125         "RAID1E",
  126         g_raid_tr_raid1e_methods,
  127         sizeof(struct g_raid_tr_raid1e_object),
  128         .trc_enable = 1,
  129         .trc_priority = 200,
  130         .trc_accept_unmapped = 1
  131 };
  132 
  133 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
  134 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
  135     struct g_raid_subdisk *sd);
  136 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
  137     int no, off_t off, off_t len, u_int mask);
  138 
  139 static inline void
  140 V2P(struct g_raid_volume *vol, off_t virt,
  141     int *disk, off_t *offset, off_t *start)
  142 {
  143         off_t nstrip;
  144         u_int strip_size;
  145 
  146         strip_size = vol->v_strip_size;
  147         /* Strip number. */
  148         nstrip = virt / strip_size;
  149         /* Start position in strip. */
  150         *start = virt % strip_size;
  151         /* Disk number. */
  152         *disk = (nstrip * N) % vol->v_disks_count;
  153         /* Strip start position in disk. */
  154         *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
  155 }
  156 
  157 static inline void
  158 P2V(struct g_raid_volume *vol, int disk, off_t offset,
  159     off_t *virt, int *copy)
  160 {
  161         off_t nstrip, start;
  162         u_int strip_size;
  163 
  164         strip_size = vol->v_strip_size;
  165         /* Start position in strip. */
  166         start = offset % strip_size;
  167         /* Physical strip number. */
  168         nstrip = (offset / strip_size) * vol->v_disks_count + disk;
  169         /* Number of physical strip (copy) inside virtual strip. */
  170         *copy = nstrip % N;
  171         /* Offset in virtual space. */
  172         *virt = (nstrip / N) * strip_size + start;
  173 }
  174 
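      /*
       * A worked example of the two mappings above, for a hypothetical
       * volume (3 disks, 64 KiB strips, N = 2 copies), not any particular
       * array: virtual offset 200704 lies 4096 bytes into virtual strip 3,
       * so V2P() yields start = 4096, disk = (3 * 2) % 3 = 0 and
       * offset = ((3 * 2) / 3) * 65536 = 131072; the second copy sits on
       * the next disk in the rotation, which callers reach by adding the
       * copy index and wrapping with an extra strip_size past the last
       * disk.  P2V() inverts this: disk 0 at offset 135168 gives physical
       * strip 2 * 3 + 0 = 6, hence copy = 6 % 2 = 0 and
       * virtual = (6 / 2) * 65536 + 4096 = 200704 again.
       */
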
  175 static int
  176 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
  177 {
  178         struct g_raid_tr_raid1e_object *trs;
  179 
  180         trs = (struct g_raid_tr_raid1e_object *)tr;
  181         if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
  182             tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
  183                 return (G_RAID_TR_TASTE_FAIL);
  184         trs->trso_starting = 1;
  185         return (G_RAID_TR_TASTE_SUCCEED);
  186 }
  187 
  188 static int
  189 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
  190 {
  191         struct g_raid_softc *sc;
  192         struct g_raid_subdisk *sd, *bestsd, *worstsd;
  193         int i, j, state, sstate;
  194 
  195         sc = vol->v_softc;
  196         state = G_RAID_VOLUME_S_OPTIMAL;
  197         for (i = 0; i < vol->v_disks_count / N; i++) {
  198                 bestsd = &vol->v_subdisks[i * N];
  199                 for (j = 1; j < N; j++) {
  200                         sd = &vol->v_subdisks[i * N + j];
  201                         if (sd->sd_state > bestsd->sd_state)
  202                                 bestsd = sd;
  203                         else if (sd->sd_state == bestsd->sd_state &&
  204                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
  205                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
  206                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
  207                                 bestsd = sd;
  208                 }
  209                 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
  210                     bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
   211                         /* We found a reasonable candidate. */
  212                         G_RAID_DEBUG1(1, sc,
  213                             "Promote subdisk %s:%d from %s to ACTIVE.",
  214                             vol->v_name, bestsd->sd_pos,
  215                             g_raid_subdisk_state2str(bestsd->sd_state));
  216                         g_raid_change_subdisk_state(bestsd,
  217                             G_RAID_SUBDISK_S_ACTIVE);
  218                         g_raid_write_metadata(sc,
  219                             vol, bestsd, bestsd->sd_disk);
  220                 }
  221                 worstsd = &vol->v_subdisks[i * N];
  222                 for (j = 1; j < N; j++) {
  223                         sd = &vol->v_subdisks[i * N + j];
  224                         if (sd->sd_state < worstsd->sd_state)
  225                                 worstsd = sd;
  226                 }
  227                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  228                         sstate = G_RAID_VOLUME_S_OPTIMAL;
  229                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  230                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
  231                 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  232                         sstate = G_RAID_VOLUME_S_DEGRADED;
  233                 else
  234                         sstate = G_RAID_VOLUME_S_BROKEN;
  235                 if (sstate < state)
  236                         state = sstate;
  237         }
  238         return (state);
  239 }
  240 
  241 static int
  242 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
  243 {
  244         struct g_raid_softc *sc;
  245         struct g_raid_subdisk *sd, *bestsd, *worstsd;
  246         int i, j, state, sstate;
  247 
  248         sc = vol->v_softc;
  249         if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
  250             vol->v_disks_count)
  251                 return (G_RAID_VOLUME_S_OPTIMAL);
  252         for (i = 0; i < vol->v_disks_count; i++) {
  253                 sd = &vol->v_subdisks[i];
  254                 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
   255                         /* We found a reasonable candidate. */
  256                         G_RAID_DEBUG1(1, sc,
  257                             "Promote subdisk %s:%d from %s to STALE.",
  258                             vol->v_name, sd->sd_pos,
  259                             g_raid_subdisk_state2str(sd->sd_state));
  260                         g_raid_change_subdisk_state(sd,
  261                             G_RAID_SUBDISK_S_STALE);
  262                         g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
  263                 }
  264         }
  265         state = G_RAID_VOLUME_S_OPTIMAL;
  266         for (i = 0; i < vol->v_disks_count; i++) {
  267                 bestsd = &vol->v_subdisks[i];
  268                 worstsd = &vol->v_subdisks[i];
  269                 for (j = 1; j < N; j++) {
  270                         sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
  271                         if (sd->sd_state > bestsd->sd_state)
  272                                 bestsd = sd;
  273                         else if (sd->sd_state == bestsd->sd_state &&
  274                             (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
  275                              sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
  276                             sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
  277                                 bestsd = sd;
  278                         if (sd->sd_state < worstsd->sd_state)
  279                                 worstsd = sd;
  280                 }
  281                 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
  282                         sstate = G_RAID_VOLUME_S_OPTIMAL;
  283                 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  284                         sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
  285                 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
  286                         sstate = G_RAID_VOLUME_S_DEGRADED;
  287                 else
  288                         sstate = G_RAID_VOLUME_S_BROKEN;
  289                 if (sstate < state)
  290                         state = sstate;
  291         }
  292         return (state);
  293 }
  294 
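      /*
       * The two helpers above reflect the two RAID1E layouts: with an even
       * number of disks (v_disks_count % N == 0) the subdisks form fixed
       * mirror sets (0,1), (2,3), ... and the volume state is the worst
       * state found among those sets; with an odd number of disks the
       * copies rotate around all disks, so the state is evaluated over
       * every window of N adjacent subdisks (wrapping at the end), after
       * first promoting UNINITIALIZED members to STALE.
       */
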
  295 static int
  296 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
  297     struct g_raid_subdisk *sd)
  298 {
  299         struct g_raid_tr_raid1e_object *trs;
  300         struct g_raid_softc *sc;
  301         u_int s;
  302 
  303         sc = vol->v_softc;
  304         trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
  305         if (trs->trso_stopping &&
  306             (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
  307                 s = G_RAID_VOLUME_S_STOPPED;
  308         else if (trs->trso_starting)
  309                 s = G_RAID_VOLUME_S_STARTING;
  310         else {
  311                 if ((vol->v_disks_count % N) == 0)
  312                         s = g_raid_tr_update_state_raid1e_even(vol);
  313                 else
  314                         s = g_raid_tr_update_state_raid1e_odd(vol);
  315         }
  316         if (s != vol->v_state) {
  317                 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
  318                     G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
  319                     G_RAID_EVENT_VOLUME);
  320                 g_raid_change_volume_state(vol, s);
  321                 if (!trs->trso_starting && !trs->trso_stopping)
  322                         g_raid_write_metadata(sc, vol, NULL, NULL);
  323         }
  324         if (!trs->trso_starting && !trs->trso_stopping)
  325                 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
  326         return (0);
  327 }
  328 
  329 static void
  330 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
  331     struct g_raid_disk *disk)
  332 {
  333         struct g_raid_volume *vol;
  334 
  335         vol = sd->sd_volume;
  336         /*
  337          * We don't fail the last disk in the pack, since it still has decent
  338          * data on it and that's better than failing the disk if it is the root
  339          * file system.
  340          *
  341          * XXX should this be controlled via a tunable?  It makes sense for
  342          * the volume that has / on it.  I can't think of a case where we'd
  343          * want the volume to go away on this kind of event.
  344          */
  345         if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
  346              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
  347              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
  348              g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
  349              vol->v_disks_count) &&
  350             (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
  351                 return;
  352         g_raid_fail_disk(sc, sd, disk);
  353 }
  354 
  355 static void
  356 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
  357 {
  358         struct g_raid_volume *vol;
  359         struct g_raid_subdisk *sd;
  360 
  361         vol = trs->trso_base.tro_volume;
  362         sd = trs->trso_failed_sd;
  363         g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
  364         free(trs->trso_buffer, M_TR_RAID1E);
  365         trs->trso_buffer = NULL;
  366         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  367         trs->trso_type = TR_RAID1E_NONE;
  368         trs->trso_recover_slabs = 0;
  369         trs->trso_failed_sd = NULL;
  370         g_raid_tr_update_state_raid1e(vol, NULL);
  371 }
  372 
  373 static void
  374 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
  375 {
  376         struct g_raid_tr_raid1e_object *trs;
  377         struct g_raid_subdisk *sd;
  378 
  379         trs = (struct g_raid_tr_raid1e_object *)tr;
  380         sd = trs->trso_failed_sd;
  381         G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
  382             "Subdisk %s:%d-%s rebuild completed.",
  383             sd->sd_volume->v_name, sd->sd_pos,
  384             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  385         g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
  386         sd->sd_rebuild_pos = 0;
  387         g_raid_tr_raid1e_rebuild_done(trs);
  388 }
  389 
  390 static void
  391 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
  392 {
  393         struct g_raid_tr_raid1e_object *trs;
  394         struct g_raid_subdisk *sd;
  395         struct g_raid_volume *vol;
  396 
  397         vol = tr->tro_volume;
  398         trs = (struct g_raid_tr_raid1e_object *)tr;
  399         sd = trs->trso_failed_sd;
  400         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
  401                 G_RAID_DEBUG1(1, vol->v_softc,
  402                     "Subdisk %s:%d-%s rebuild is aborting.",
  403                     sd->sd_volume->v_name, sd->sd_pos,
  404                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  405                 trs->trso_flags |= TR_RAID1E_F_ABORT;
  406         } else {
  407                 G_RAID_DEBUG1(0, vol->v_softc,
  408                     "Subdisk %s:%d-%s rebuild aborted.",
  409                     sd->sd_volume->v_name, sd->sd_pos,
  410                     sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
  411                 trs->trso_flags &= ~TR_RAID1E_F_ABORT;
  412                 if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
  413                         trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
  414                         g_raid_unlock_range(tr->tro_volume,
  415                             trs->trso_lock_pos, trs->trso_lock_len);
  416                 }
  417                 g_raid_tr_raid1e_rebuild_done(trs);
  418         }
  419 }
  420 
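      /*
       * Overview of the rebuild cycle implemented below: rebuild_start()
       * picks the subdisk to recover and allocates the slab buffer;
       * rebuild_some() locks the affected virtual range and issues a read
       * of up to one slab from the copy with the most accurate data; the
       * iodone handler converts that read into a write to the failed
       * subdisk and, once the write completes, advances sd_rebuild_pos,
       * drops the range lock and schedules the next slab (throttled by the
       * fair_io and recover_slabs counters); rebuild_finish() and
       * rebuild_abort() tear the state down.
       */
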
  421 static void
  422 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
  423 {
  424         struct g_raid_tr_raid1e_object *trs;
  425         struct g_raid_softc *sc;
  426         struct g_raid_volume *vol;
  427         struct g_raid_subdisk *sd;
  428         struct bio *bp;
  429         off_t len, virtual, vend, offset, start;
  430         int disk, copy, best;
  431 
  432         trs = (struct g_raid_tr_raid1e_object *)tr;
  433         if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
  434                 return;
  435         vol = tr->tro_volume;
  436         sc = vol->v_softc;
  437         sd = trs->trso_failed_sd;
  438 
  439         while (1) {
  440                 if (sd->sd_rebuild_pos >= sd->sd_size) {
  441                         g_raid_tr_raid1e_rebuild_finish(tr);
  442                         return;
  443                 }
  444                 /* Get virtual offset from physical rebuild position. */
  445                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
  446                 /* Get physical offset back to get first stripe position. */
  447                 V2P(vol, virtual, &disk, &offset, &start);
   448                 /* Calculate contiguous data length. */
  449                 len = MIN(g_raid1e_rebuild_slab,
  450                     sd->sd_size - sd->sd_rebuild_pos);
  451                 if ((vol->v_disks_count % N) != 0)
  452                         len = MIN(len, vol->v_strip_size - start);
  453                 /* Find disk with most accurate data. */
  454                 best = g_raid_tr_raid1e_select_read_disk(vol, disk,
  455                     offset + start, len, 0);
  456                 if (best < 0) {
  457                         /* There is no any valid disk. */
  458                         g_raid_tr_raid1e_rebuild_abort(tr);
  459                         return;
  460                 } else if (best != copy) {
  461                         /* Some other disk has better data. */
  462                         break;
  463                 }
  464                 /* We have the most accurate data. Skip the range. */
  465                 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
  466                     sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
  467                 sd->sd_rebuild_pos += len;
  468         }
  469 
  470         bp = &trs->trso_bio;
  471         memset(bp, 0, sizeof(*bp));
  472         bp->bio_offset = offset + start +
  473             ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
  474         bp->bio_length = len;
  475         bp->bio_data = trs->trso_buffer;
  476         bp->bio_cmd = BIO_READ;
  477         bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
  478         bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
  479         G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
  480         /*
   481          * If we are crossing a stripe boundary, correct the affected virtual
   482          * range we should lock.
  483          */
  484         if (start + len > vol->v_strip_size) {
  485                 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
  486                 len = vend - virtual;
  487         }
  488         trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
  489         trs->trso_flags |= TR_RAID1E_F_LOCKED;
  490         trs->trso_lock_pos = virtual;
  491         trs->trso_lock_len = len;
  492         /* Lock callback starts I/O */
  493         g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
  494 }
  495 
  496 static void
  497 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
  498 {
  499         struct g_raid_volume *vol;
  500         struct g_raid_tr_raid1e_object *trs;
  501         struct g_raid_subdisk *sd;
  502 
  503         vol = tr->tro_volume;
  504         trs = (struct g_raid_tr_raid1e_object *)tr;
  505         if (trs->trso_failed_sd) {
  506                 G_RAID_DEBUG1(1, vol->v_softc,
   507                     "Already rebuilding in start rebuild. pos %jd\n",
  508                     (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
  509                 return;
  510         }
  511         sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
  512         if (sd == NULL)
  513                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
  514         if (sd == NULL) {
  515                 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
  516                 if (sd != NULL) {
  517                         sd->sd_rebuild_pos = 0;
  518                         g_raid_change_subdisk_state(sd,
  519                             G_RAID_SUBDISK_S_RESYNC);
  520                         g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
  521                 } else {
  522                         sd = g_raid_get_subdisk(vol,
  523                             G_RAID_SUBDISK_S_UNINITIALIZED);
  524                         if (sd == NULL)
  525                                 sd = g_raid_get_subdisk(vol,
  526                                     G_RAID_SUBDISK_S_NEW);
  527                         if (sd != NULL) {
  528                                 sd->sd_rebuild_pos = 0;
  529                                 g_raid_change_subdisk_state(sd,
  530                                     G_RAID_SUBDISK_S_REBUILD);
  531                                 g_raid_write_metadata(vol->v_softc,
  532                                     vol, sd, NULL);
  533                         }
  534                 }
  535         }
  536         if (sd == NULL) {
  537                 G_RAID_DEBUG1(1, vol->v_softc,
  538                     "No failed disk to rebuild.  night night.");
  539                 return;
  540         }
  541         trs->trso_failed_sd = sd;
  542         G_RAID_DEBUG1(0, vol->v_softc,
  543             "Subdisk %s:%d-%s rebuild start at %jd.",
  544             sd->sd_volume->v_name, sd->sd_pos,
  545             sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
  546             trs->trso_failed_sd->sd_rebuild_pos);
  547         trs->trso_type = TR_RAID1E_REBUILD;
  548         trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
  549         trs->trso_meta_update = g_raid1e_rebuild_meta_update;
  550         g_raid_tr_raid1e_rebuild_some(tr);
  551 }
  552 
  553 static void
  554 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
  555     struct g_raid_subdisk *sd)
  556 {
  557         struct g_raid_volume *vol;
  558         struct g_raid_tr_raid1e_object *trs;
  559         int nr;
  560         
  561         vol = tr->tro_volume;
  562         trs = (struct g_raid_tr_raid1e_object *)tr;
  563         if (trs->trso_stopping)
  564                 return;
  565         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
  566             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
  567         switch(trs->trso_type) {
  568         case TR_RAID1E_NONE:
  569                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
  570                         return;
  571                 if (nr == 0) {
  572                         nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
  573                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
  574                             g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
  575                         if (nr == 0)
  576                                 return;
  577                 }
  578                 g_raid_tr_raid1e_rebuild_start(tr);
  579                 break;
  580         case TR_RAID1E_REBUILD:
  581                 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
  582                     trs->trso_failed_sd == sd)
  583                         g_raid_tr_raid1e_rebuild_abort(tr);
  584                 break;
  585         case TR_RAID1E_RESYNC:
  586                 break;
  587         }
  588 }
  589 
  590 static int
  591 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
  592     struct g_raid_subdisk *sd, u_int event)
  593 {
  594 
  595         g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
  596         return (0);
  597 }
  598 
  599 static int
  600 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
  601 {
  602         struct g_raid_tr_raid1e_object *trs;
  603         struct g_raid_volume *vol;
  604 
  605         trs = (struct g_raid_tr_raid1e_object *)tr;
  606         vol = tr->tro_volume;
  607         trs->trso_starting = 0;
  608         g_raid_tr_update_state_raid1e(vol, NULL);
  609         return (0);
  610 }
  611 
  612 static int
  613 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
  614 {
  615         struct g_raid_tr_raid1e_object *trs;
  616         struct g_raid_volume *vol;
  617 
  618         trs = (struct g_raid_tr_raid1e_object *)tr;
  619         vol = tr->tro_volume;
  620         trs->trso_starting = 0;
  621         trs->trso_stopping = 1;
  622         g_raid_tr_update_state_raid1e(vol, NULL);
  623         return (0);
  624 }
  625 
  626 /*
  627  * Select the disk to read from.  Take into account: subdisk state, running
  628  * error recovery, average disk load, head position and possible cache hits.
  629  */
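      /*
       * These factors are folded into one integer that is minimized: the
       * base value is the subdisk's current load; copies excluded by the
       * caller's mask are skipped; STALE copies, and RESYNC copies that
       * have not yet covered the requested range, are pushed to the back
       * (i << 24); REBUILD copies that have not yet covered the range, and
       * copies in any other state, are not considered at all; subdisks
       * with recovery in progress are penalized (<< 16); and a copy whose
       * head is already at, or within one track of, the requested position
       * gets a bonus.
       */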
  630 #define ABS(x)          (((x) >= 0) ? (x) : (-(x)))
  631 static int
  632 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
  633     int no, off_t off, off_t len, u_int mask)
  634 {
  635         struct g_raid_subdisk *sd;
  636         off_t offset;
  637         int i, best, prio, bestprio;
  638 
  639         best = -1;
  640         bestprio = INT_MAX;
  641         for (i = 0; i < N; i++) {
  642                 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
  643                 offset = off;
  644                 if (no + i >= vol->v_disks_count)
  645                         offset += vol->v_strip_size;
  646 
  647                 prio = G_RAID_SUBDISK_LOAD(sd);
  648                 if ((mask & (1 << sd->sd_pos)) != 0)
  649                         continue;
  650                 switch (sd->sd_state) {
  651                 case G_RAID_SUBDISK_S_ACTIVE:
  652                         break;
  653                 case G_RAID_SUBDISK_S_RESYNC:
  654                         if (offset + off < sd->sd_rebuild_pos)
  655                                 break;
  656                         /* FALLTHROUGH */
  657                 case G_RAID_SUBDISK_S_STALE:
  658                         prio += i << 24;
  659                         break;
  660                 case G_RAID_SUBDISK_S_REBUILD:
  661                         if (offset + off < sd->sd_rebuild_pos)
  662                                 break;
  663                         /* FALLTHROUGH */
  664                 default:
  665                         continue;
  666                 }
  667                 prio += min(sd->sd_recovery, 255) << 16;
  668                 /* If disk head is precisely in position - highly prefer it. */
  669                 if (G_RAID_SUBDISK_POS(sd) == offset)
  670                         prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
  671                 else
  672                 /* If disk head is close to position - prefer it. */
  673                 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
  674                     G_RAID_SUBDISK_TRACK_SIZE)
  675                         prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
  676                 if (prio < bestprio) {
  677                         bestprio = prio;
  678                         best = i;
  679                 }
  680         }
  681         return (best);
  682 }
  683 
  684 static void
  685 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
  686 {
  687         struct g_raid_volume *vol;
  688         struct g_raid_subdisk *sd;
  689         struct bio_queue_head queue;
  690         struct bio *cbp;
  691         char *addr;
  692         off_t offset, start, length, remain;
  693         u_int no, strip_size;
  694         int best;
  695 
  696         vol = tr->tro_volume;
  697         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
  698                 addr = NULL;
  699         else
  700                 addr = bp->bio_data;
  701         strip_size = vol->v_strip_size;
  702         V2P(vol, bp->bio_offset, &no, &offset, &start);
  703         remain = bp->bio_length;
  704         bioq_init(&queue);
  705         while (remain > 0) {
  706                 length = MIN(strip_size - start, remain);
  707                 best = g_raid_tr_raid1e_select_read_disk(vol,
  708                     no, offset, length, 0);
  709                 KASSERT(best >= 0, ("No readable disk in volume %s!",
  710                     vol->v_name));
  711                 no += best;
  712                 if (no >= vol->v_disks_count) {
  713                         no -= vol->v_disks_count;
  714                         offset += strip_size;
  715                 }
  716                 cbp = g_clone_bio(bp);
  717                 if (cbp == NULL)
  718                         goto failure;
  719                 cbp->bio_offset = offset + start;
  720                 cbp->bio_length = length;
  721                 if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
  722                         cbp->bio_ma_offset += (uintptr_t)addr;
  723                         cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
  724                         cbp->bio_ma_offset %= PAGE_SIZE;
  725                         cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
  726                             cbp->bio_length) / PAGE_SIZE;
  727                 } else
  728                         cbp->bio_data = addr;
  729                 cbp->bio_caller1 = &vol->v_subdisks[no];
  730                 bioq_insert_tail(&queue, cbp);
  731                 no += N - best;
  732                 if (no >= vol->v_disks_count) {
  733                         no -= vol->v_disks_count;
  734                         offset += strip_size;
  735                 }
  736                 remain -= length;
  737                 addr += length;
  738                 start = 0;
  739         }
  740         while ((cbp = bioq_takefirst(&queue)) != NULL) {
  741                 sd = cbp->bio_caller1;
  742                 cbp->bio_caller1 = NULL;
  743                 g_raid_subdisk_iostart(sd, cbp);
  744         }
  745         return;
  746 failure:
  747         while ((cbp = bioq_takefirst(&queue)) != NULL)
  748                 g_destroy_bio(cbp);
  749         if (bp->bio_error == 0)
  750                 bp->bio_error = ENOMEM;
  751         g_raid_iodone(bp, bp->bio_error);
  752 }
  753 
  754 static void
  755 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
  756 {
  757         struct g_raid_volume *vol;
  758         struct g_raid_subdisk *sd;
  759         struct bio_queue_head queue;
  760         struct bio *cbp;
  761         char *addr;
  762         off_t offset, start, length, remain;
  763         u_int no, strip_size;
  764         int i;
  765 
  766         vol = tr->tro_volume;
  767         if ((bp->bio_flags & BIO_UNMAPPED) != 0)
  768                 addr = NULL;
  769         else
  770                 addr = bp->bio_data;
  771         strip_size = vol->v_strip_size;
  772         V2P(vol, bp->bio_offset, &no, &offset, &start);
  773         remain = bp->bio_length;
  774         bioq_init(&queue);
  775         while (remain > 0) {
  776                 length = MIN(strip_size - start, remain);
  777                 for (i = 0; i < N; i++) {
  778                         sd = &vol->v_subdisks[no];
  779                         switch (sd->sd_state) {
  780                         case G_RAID_SUBDISK_S_ACTIVE:
  781                         case G_RAID_SUBDISK_S_STALE:
  782                         case G_RAID_SUBDISK_S_RESYNC:
  783                                 break;
  784                         case G_RAID_SUBDISK_S_REBUILD:
  785                                 if (offset + start >= sd->sd_rebuild_pos)
  786                                         goto nextdisk;
  787                                 break;
  788                         default:
  789                                 goto nextdisk;
  790                         }
  791                         cbp = g_clone_bio(bp);
  792                         if (cbp == NULL)
  793                                 goto failure;
  794                         cbp->bio_offset = offset + start;
  795                         cbp->bio_length = length;
  796                         if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
  797                             bp->bio_cmd != BIO_DELETE) {
  798                                 cbp->bio_ma_offset += (uintptr_t)addr;
  799                                 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
  800                                 cbp->bio_ma_offset %= PAGE_SIZE;
  801                                 cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
  802                                     cbp->bio_length) / PAGE_SIZE;
  803                         } else
  804                                 cbp->bio_data = addr;
  805                         cbp->bio_caller1 = sd;
  806                         bioq_insert_tail(&queue, cbp);
  807 nextdisk:
  808                         if (++no >= vol->v_disks_count) {
  809                                 no = 0;
  810                                 offset += strip_size;
  811                         }
  812                 }
  813                 remain -= length;
  814                 if (bp->bio_cmd != BIO_DELETE)
  815                         addr += length;
  816                 start = 0;
  817         }
  818         while ((cbp = bioq_takefirst(&queue)) != NULL) {
  819                 sd = cbp->bio_caller1;
  820                 cbp->bio_caller1 = NULL;
  821                 g_raid_subdisk_iostart(sd, cbp);
  822         }
  823         return;
  824 failure:
  825         while ((cbp = bioq_takefirst(&queue)) != NULL)
  826                 g_destroy_bio(cbp);
  827         if (bp->bio_error == 0)
  828                 bp->bio_error = ENOMEM;
  829         g_raid_iodone(bp, bp->bio_error);
  830 }
  831 
  832 static void
  833 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
  834 {
  835         struct g_raid_volume *vol;
  836         struct g_raid_tr_raid1e_object *trs;
  837 
  838         vol = tr->tro_volume;
  839         trs = (struct g_raid_tr_raid1e_object *)tr;
  840         if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
  841             vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
  842             vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
  843                 g_raid_iodone(bp, EIO);
  844                 return;
  845         }
  846         /*
  847          * If we're rebuilding, squeeze in rebuild activity every so often,
  848          * even when the disk is busy.  Be sure to only count real I/O
  849          * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
  850          * by this module.
  851          */
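              /*
               * For example, with the default g_raid1e_rebuild_fair_io of
               * 20, roughly one rebuild slab is slipped in for every 20
               * regular I/Os while the volume stays busy.
               */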
  852         if (trs->trso_failed_sd != NULL &&
  853             !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
   854                 /* Make this new or currently running rebuild round short. */
  855                 trs->trso_recover_slabs = 0;
  856                 if (--trs->trso_fair_io <= 0) {
  857                         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
  858                         g_raid_tr_raid1e_rebuild_some(tr);
  859                 }
  860         }
  861         switch (bp->bio_cmd) {
  862         case BIO_READ:
  863                 g_raid_tr_iostart_raid1e_read(tr, bp);
  864                 break;
  865         case BIO_WRITE:
  866         case BIO_DELETE:
  867                 g_raid_tr_iostart_raid1e_write(tr, bp);
  868                 break;
  869         case BIO_FLUSH:
  870                 g_raid_tr_flush_common(tr, bp);
  871                 break;
  872         default:
  873                 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
  874                     bp->bio_cmd, vol->v_name));
  875                 break;
  876         }
  877 }
  878 
  879 static void
  880 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
  881     struct g_raid_subdisk *sd, struct bio *bp)
  882 {
  883         struct bio *cbp;
  884         struct g_raid_subdisk *nsd;
  885         struct g_raid_volume *vol;
  886         struct bio *pbp;
  887         struct g_raid_tr_raid1e_object *trs;
  888         off_t virtual, offset, start;
  889         uintptr_t mask;
  890         int error, do_write, copy, disk, best;
  891 
  892         trs = (struct g_raid_tr_raid1e_object *)tr;
  893         vol = tr->tro_volume;
  894         if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
  895                 if (trs->trso_type == TR_RAID1E_REBUILD) {
  896                         nsd = trs->trso_failed_sd;
  897                         if (bp->bio_cmd == BIO_READ) {
  898 
  899                                 /* Immediately abort rebuild, if requested. */
  900                                 if (trs->trso_flags & TR_RAID1E_F_ABORT) {
  901                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  902                                         g_raid_tr_raid1e_rebuild_abort(tr);
  903                                         return;
  904                                 }
  905 
  906                                 /* On read error, skip and cross fingers. */
  907                                 if (bp->bio_error != 0) {
  908                                         G_RAID_LOGREQ(0, bp,
  909                                             "Read error during rebuild (%d), "
  910                                             "possible data loss!",
  911                                             bp->bio_error);
  912                                         goto rebuild_round_done;
  913                                 }
  914 
  915                                 /*
  916                                  * The read operation finished, queue the
  917                                  * write and get out.
  918                                  */
  919                                 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
  920                                     bp->bio_error);
  921                                 bp->bio_cmd = BIO_WRITE;
  922                                 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
  923                                 bp->bio_offset = nsd->sd_rebuild_pos;
  924                                 G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
  925                                 g_raid_subdisk_iostart(nsd, bp);
  926                         } else {
  927                                 /*
  928                                  * The write operation just finished.  Do
  929                                  * another.  We keep cloning the master bio
  930                                  * since it has the right buffers allocated to
  931                                  * it.
  932                                  */
  933                                 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
  934                                     bp->bio_error);
  935                                 if (bp->bio_error != 0 ||
  936                                     trs->trso_flags & TR_RAID1E_F_ABORT) {
  937                                         if ((trs->trso_flags &
  938                                             TR_RAID1E_F_ABORT) == 0) {
  939                                                 g_raid_tr_raid1e_fail_disk(sd->sd_softc,
  940                                                     nsd, nsd->sd_disk);
  941                                         }
  942                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  943                                         g_raid_tr_raid1e_rebuild_abort(tr);
  944                                         return;
  945                                 }
  946 rebuild_round_done:
  947                                 trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
  948                                 g_raid_unlock_range(tr->tro_volume,
  949                                     trs->trso_lock_pos, trs->trso_lock_len);
  950                                 nsd->sd_rebuild_pos += bp->bio_length;
  951                                 if (nsd->sd_rebuild_pos >= nsd->sd_size) {
  952                                         g_raid_tr_raid1e_rebuild_finish(tr);
  953                                         return;
  954                                 }
  955 
  956                                 /* Abort rebuild if we are stopping */
  957                                 if (trs->trso_stopping) {
  958                                         trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  959                                         g_raid_tr_raid1e_rebuild_abort(tr);
  960                                         return;
  961                                 }
  962 
  963                                 if (--trs->trso_meta_update <= 0) {
  964                                         g_raid_write_metadata(vol->v_softc,
  965                                             vol, nsd, nsd->sd_disk);
  966                                         trs->trso_meta_update =
  967                                             g_raid1e_rebuild_meta_update;
  968                                         /* Compensate short rebuild I/Os. */
  969                                         if ((vol->v_disks_count % N) != 0 &&
  970                                             vol->v_strip_size <
  971                                              g_raid1e_rebuild_slab) {
  972                                                 trs->trso_meta_update *=
  973                                                     g_raid1e_rebuild_slab;
  974                                                 trs->trso_meta_update /=
  975                                                     vol->v_strip_size;
  976                                         }
  977                                 }
  978                                 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
  979                                 if (--trs->trso_recover_slabs <= 0)
  980                                         return;
  981                                 /* Run next rebuild iteration. */
  982                                 g_raid_tr_raid1e_rebuild_some(tr);
  983                         }
  984                 } else if (trs->trso_type == TR_RAID1E_RESYNC) {
  985                         /*
  986                          * read good sd, read bad sd in parallel.  when both
  987                          * done, compare the buffers.  write good to the bad
  988                          * if different.  do the next bit of work.
  989                          */
  990                         panic("Somehow, we think we're doing a resync");
  991                 }
  992                 return;
  993         }
  994         pbp = bp->bio_parent;
  995         pbp->bio_inbed++;
  996         mask = (intptr_t)bp->bio_caller2;
  997         if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
  998                 /*
  999                  * Read failed on first drive.  Retry the read error on
 1000                  * another disk drive, if available, before erroring out the
 1001                  * read.
 1002                  */
 1003                 sd->sd_disk->d_read_errs++;
 1004                 G_RAID_LOGREQ(0, bp,
 1005                     "Read error (%d), %d read errors total",
 1006                     bp->bio_error, sd->sd_disk->d_read_errs);
 1007 
 1008                 /*
 1009                  * If there are too many read errors, we move to degraded.
 1010                  * XXX Do we want to FAIL the drive (eg, make the user redo
 1011                  * everything to get it back in sync), or just degrade the
 1012                  * drive, which kicks off a resync?
 1013                  */
 1014                 do_write = 0;
 1015                 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
 1016                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1017                 else if (mask == 0)
 1018                         do_write = 1;
 1019 
 1020                 /* Restore what we were doing. */
 1021                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1022                 V2P(vol, virtual, &disk, &offset, &start);
 1023 
 1024                 /* Find the other disk, and try to do the I/O to it. */
 1025                 mask |= 1 << copy;
 1026                 best = g_raid_tr_raid1e_select_read_disk(vol,
 1027                     disk, offset, start, mask);
 1028                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 1029                         disk += best;
 1030                         if (disk >= vol->v_disks_count) {
 1031                                 disk -= vol->v_disks_count;
 1032                                 offset += vol->v_strip_size;
 1033                         }
 1034                         cbp->bio_offset = offset + start;
 1035                         cbp->bio_length = bp->bio_length;
 1036                         cbp->bio_data = bp->bio_data;
 1037                         cbp->bio_ma = bp->bio_ma;
 1038                         cbp->bio_ma_offset = bp->bio_ma_offset;
 1039                         cbp->bio_ma_n = bp->bio_ma_n;
 1040                         g_destroy_bio(bp);
 1041                         nsd = &vol->v_subdisks[disk];
 1042                         G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
 1043                             nsd->sd_pos);
 1044                         if (do_write)
 1045                                 mask |= 1 << 31;
 1046                         if ((mask & (1U << 31)) != 0)
 1047                                 sd->sd_recovery++;
 1048                         cbp->bio_caller2 = (void *)mask;
 1049                         if (do_write) {
 1050                                 cbp->bio_caller1 = nsd;
 1051                                 /* Lock callback starts I/O */
 1052                                 g_raid_lock_range(sd->sd_volume,
 1053                                     virtual, cbp->bio_length, pbp, cbp);
 1054                         } else {
 1055                                 g_raid_subdisk_iostart(nsd, cbp);
 1056                         }
 1057                         return;
 1058                 }
 1059                 /*
 1060                  * We can't retry.  Return the original error by falling
 1061                  * through.  This will happen when there's only one good disk.
 1062                  * We don't need to fail the raid, since its actual state is
 1063                  * based on the state of the subdisks.
 1064                  */
 1065                 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
 1066         }
 1067         if (bp->bio_cmd == BIO_READ &&
 1068             bp->bio_error == 0 &&
 1069             (mask & (1U << 31)) != 0) {
 1070                 G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
 1071 
 1072                 /* Restore what we were doing. */
 1073                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1074                 V2P(vol, virtual, &disk, &offset, &start);
 1075 
 1076                 /* Find best disk to write. */
 1077                 best = g_raid_tr_raid1e_select_read_disk(vol,
 1078                     disk, offset, start, ~mask);
 1079                 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
 1080                         disk += best;
 1081                         if (disk >= vol->v_disks_count) {
 1082                                 disk -= vol->v_disks_count;
 1083                                 offset += vol->v_strip_size;
 1084                         }
 1085                         cbp->bio_offset = offset + start;
 1086                         cbp->bio_cmd = BIO_WRITE;
 1087                         cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
 1088                         cbp->bio_caller2 = (void *)mask;
 1089                         g_destroy_bio(bp);
 1090                         G_RAID_LOGREQ(2, cbp,
 1091                             "Attempting bad sector remap on failing drive.");
 1092                         g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
 1093                         return;
 1094                 }
 1095         }
 1096         if ((mask & (1U << 31)) != 0) {
 1097                 /*
 1098                  * We're done with a recovery, mark the range as unlocked.
 1099                  * For any write errors, we aggressively fail the disk since
 1100                  * there was both a READ and a WRITE error at this location.
  1101                  * Both types of errors generally indicate the drive is on
 1102                  * the verge of total failure anyway.  Better to stop trusting
 1103                  * it now.  However, we need to reset error to 0 in that case
 1104                  * because we're not failing the original I/O which succeeded.
 1105                  */
 1106 
 1107                 /* Restore what we were doing. */
 1108                 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
 1109                 V2P(vol, virtual, &disk, &offset, &start);
 1110 
 1111                 for (copy = 0; copy < N; copy++) {
 1112                         if ((mask & (1 << copy) ) != 0)
 1113                                 vol->v_subdisks[(disk + copy) %
 1114                                     vol->v_disks_count].sd_recovery--;
 1115                 }
 1116 
 1117                 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
 1118                         G_RAID_LOGREQ(0, bp, "Remap write failed: "
 1119                             "failing subdisk.");
 1120                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1121                         bp->bio_error = 0;
 1122                 }
 1123                 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 1124                 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
 1125         }
 1126         if (pbp->bio_cmd != BIO_READ) {
 1127                 if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
 1128                         pbp->bio_error = bp->bio_error;
 1129                 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
 1130                         G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
 1131                         g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 1132                 }
 1133                 error = pbp->bio_error;
 1134         } else
 1135                 error = bp->bio_error;
 1136         g_destroy_bio(bp);
 1137         if (pbp->bio_children == pbp->bio_inbed) {
 1138                 pbp->bio_completed = pbp->bio_length;
 1139                 g_raid_iodone(pbp, error);
 1140         }
 1141 }
 1142 
 1143 static int
 1144 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
 1145     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
 1146 {
 1147         struct g_raid_volume *vol;
 1148         struct g_raid_subdisk *sd;
 1149         struct bio_queue_head queue;
 1150         char *addr;
 1151         off_t offset, start, length, remain;
 1152         u_int no, strip_size;
 1153         int i, error;
 1154 
 1155         vol = tr->tro_volume;
 1156         addr = virtual;
 1157         strip_size = vol->v_strip_size;
 1158         V2P(vol, boffset, &no, &offset, &start);
 1159         remain = blength;
 1160         bioq_init(&queue);
 1161         while (remain > 0) {
 1162                 length = MIN(strip_size - start, remain);
 1163                 for (i = 0; i < N; i++) {
 1164                         sd = &vol->v_subdisks[no];
 1165                         switch (sd->sd_state) {
 1166                         case G_RAID_SUBDISK_S_ACTIVE:
 1167                         case G_RAID_SUBDISK_S_STALE:
 1168                         case G_RAID_SUBDISK_S_RESYNC:
 1169                                 break;
 1170                         case G_RAID_SUBDISK_S_REBUILD:
 1171                                 if (offset + start >= sd->sd_rebuild_pos)
 1172                                         goto nextdisk;
 1173                                 break;
 1174                         default:
 1175                                 goto nextdisk;
 1176                         }
 1177                         error = g_raid_subdisk_kerneldump(sd,
 1178                             addr, 0, offset + start, length);
 1179                         if (error != 0)
 1180                                 return (error);
 1181 nextdisk:
 1182                         if (++no >= vol->v_disks_count) {
 1183                                 no = 0;
 1184                                 offset += strip_size;
 1185                         }
 1186                 }
 1187                 remain -= length;
 1188                 addr += length;
 1189                 start = 0;
 1190         }
 1191         return (0);
 1192 }
 1193 
 1194 static int
 1195 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
 1196 {
 1197         struct bio *bp;
 1198         struct g_raid_subdisk *sd;
 1199 
 1200         bp = (struct bio *)argp;
 1201         sd = (struct g_raid_subdisk *)bp->bio_caller1;
 1202         g_raid_subdisk_iostart(sd, bp);
 1203 
 1204         return (0);
 1205 }
 1206 
 1207 static int
 1208 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
 1209 {
 1210         struct g_raid_tr_raid1e_object *trs;
 1211         struct g_raid_volume *vol;
 1212 
 1213         vol = tr->tro_volume;
 1214         trs = (struct g_raid_tr_raid1e_object *)tr;
 1215         trs->trso_fair_io = g_raid1e_rebuild_fair_io;
 1216         trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
 1217         /* Compensate short rebuild I/Os. */
 1218         if ((vol->v_disks_count % N) != 0 &&
 1219             vol->v_strip_size < g_raid1e_rebuild_slab) {
 1220                 trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
 1221                 trs->trso_recover_slabs /= vol->v_strip_size;
 1222         }
 1223         if (trs->trso_type == TR_RAID1E_REBUILD)
 1224                 g_raid_tr_raid1e_rebuild_some(tr);
 1225         return (0);
 1226 }
 1227 
 1228 static int
 1229 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
 1230 {
 1231         struct g_raid_tr_raid1e_object *trs;
 1232 
 1233         trs = (struct g_raid_tr_raid1e_object *)tr;
 1234 
 1235         if (trs->trso_buffer != NULL) {
 1236                 free(trs->trso_buffer, M_TR_RAID1E);
 1237                 trs->trso_buffer = NULL;
 1238         }
 1239         return (0);
 1240 }
 1241 
 1242 G_RAID_TR_DECLARE(raid1e, "RAID1E");
