
FreeBSD/Linux Kernel Cross Reference
sys/dev/softraid_raid5.c


    1 /* $OpenBSD: softraid_raid5.c,v 1.32 2021/05/16 15:12:37 deraadt Exp $ */
    2 /*
    3  * Copyright (c) 2014 Joel Sing <jsing@openbsd.org>
    4  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
    5  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
    6  *
    7  * Permission to use, copy, modify, and distribute this software for any
    8  * purpose with or without fee is hereby granted, provided that the above
    9  * copyright notice and this permission notice appear in all copies.
   10  *
   11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
   13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
   14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
   15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
   18  */
   19 
   20 #include "bio.h"
   21 
   22 #include <sys/param.h>
   23 #include <sys/systm.h>
   24 #include <sys/buf.h>
   25 #include <sys/device.h>
   26 #include <sys/ioctl.h>
   27 #include <sys/malloc.h>
   28 #include <sys/kernel.h>
   29 #include <sys/disk.h>
   30 #include <sys/rwlock.h>
   31 #include <sys/queue.h>
   32 #include <sys/fcntl.h>
   33 #include <sys/mount.h>
   34 #include <sys/sensors.h>
   35 #include <sys/stat.h>
   36 #include <sys/task.h>
   37 #include <sys/pool.h>
   38 #include <sys/conf.h>
   39 #include <sys/uio.h>
   40 
   41 #include <scsi/scsi_all.h>
   42 #include <scsi/scsiconf.h>
   43 #include <scsi/scsi_disk.h>
   44 
   45 #include <dev/softraidvar.h>
   46 
   47 /* RAID 5 functions. */
   48 int     sr_raid5_create(struct sr_discipline *, struct bioc_createraid *,
   49             int, int64_t);
   50 int     sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *,
   51             int, void *);
   52 int     sr_raid5_init(struct sr_discipline *);
   53 int     sr_raid5_rw(struct sr_workunit *);
   54 int     sr_raid5_openings(struct sr_discipline *);
   55 void    sr_raid5_intr(struct buf *);
   56 int     sr_raid5_wu_done(struct sr_workunit *);
   57 void    sr_raid5_set_chunk_state(struct sr_discipline *, int, int);
   58 void    sr_raid5_set_vol_state(struct sr_discipline *);
   59 
   60 int     sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, long,
   61             void *, int, int, void *);
   62 int     sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, long,
   63             void *);
   64 int     sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int,
   65             daddr_t, long, void *, int, int);
   66 void    sr_raid5_xor(void *, void *, int);
   67 
   68 void    sr_raid5_rebuild(struct sr_discipline *);
   69 void    sr_raid5_scrub(struct sr_discipline *);
   70 
   71 /* discipline initialisation. */
   72 void
   73 sr_raid5_discipline_init(struct sr_discipline *sd)
   74 {
   75         /* Fill out discipline members. */
   76         sd->sd_type = SR_MD_RAID5;
   77         strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name));
   78         sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
   79             SR_CAP_REBUILD | SR_CAP_REDUNDANT;
   80         sd->sd_max_wu = SR_RAID5_NOWU + 2;      /* Two for scrub/rebuild. */
   81 
   82         /* Setup discipline specific function pointers. */
   83         sd->sd_assemble = sr_raid5_assemble;
   84         sd->sd_create = sr_raid5_create;
   85         sd->sd_openings = sr_raid5_openings;
   86         sd->sd_rebuild = sr_raid5_rebuild;
   87         sd->sd_scsi_rw = sr_raid5_rw;
   88         sd->sd_scsi_intr = sr_raid5_intr;
   89         sd->sd_scsi_wu_done = sr_raid5_wu_done;
   90         sd->sd_set_chunk_state = sr_raid5_set_chunk_state;
   91         sd->sd_set_vol_state = sr_raid5_set_vol_state;
   92 }
   93 
   94 int
   95 sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc,
   96     int no_chunk, int64_t coerced_size)
   97 {
   98         if (no_chunk < 3) {
   99                 sr_error(sd->sd_sc, "%s requires three or more chunks",
  100                     sd->sd_name);
  101                 return EINVAL;
  102         }
  103 
  104         /*
  105          * XXX add variable strip size later even though MAXPHYS is really
  106          * the clever value, users like to tinker with that type of stuff.
  107          */
  108         sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
  109         sd->sd_meta->ssdi.ssd_size = (coerced_size &
  110             ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
  111             DEV_BSHIFT) - 1)) * (no_chunk - 1);
  112 
  113         return sr_raid5_init(sd);
  114 }
  115 
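/*
 * A worked example of the size calculation in sr_raid5_create() above
 * (a sketch, assuming the usual OpenBSD values MAXPHYS = 64 KB and
 * DEV_BSHIFT = 9, i.e. 512-byte sectors). The strip size in sectors is
 * 65536 >> 9 = 128, so each chunk is truncated down to a strip boundary
 * and one chunk's worth of capacity is given up to parity:
 *
 *     no_chunk     = 4
 *     coerced_size = 1000000 sectors per chunk
 *     rounded      = 1000000 & ~(128 - 1)  = 999936 sectors
 *     ssd_size     = 999936 * (4 - 1)      = 2999808 sectors (~1.4 GiB)
 */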
  116 int
  117 sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
  118     int no_chunk, void *data)
  119 {
  120         return sr_raid5_init(sd);
  121 }
  122 
  123 int
  124 sr_raid5_init(struct sr_discipline *sd)
  125 {
  126         /* Initialise runtime values. */
  127         sd->mds.mdd_raid5.sr5_strip_bits =
  128             sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
  129         if (sd->mds.mdd_raid5.sr5_strip_bits == -1) {
  130                 sr_error(sd->sd_sc, "invalid strip size");
  131                 return EINVAL;
  132         }
  133 
  134         sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
  135 
  136         return 0;
  137 }
  138 
  139 int
  140 sr_raid5_openings(struct sr_discipline *sd)
  141 {
  142         /* Two work units per I/O, two for rebuild/scrub. */
  143         return ((sd->sd_max_wu - 2) >> 1);
  144 }
  145 
  146 void
  147 sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
  148 {
  149         int                     old_state, s;
  150 
  151         DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
  152             DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
  153             sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
  154 
  155         /* ok to go to splbio since this only happens in error path */
  156         s = splbio();
  157         old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
  158 
  159         /* multiple IOs to the same chunk that fail will come through here */
  160         if (old_state == new_state)
  161                 goto done;
  162 
  163         switch (old_state) {
  164         case BIOC_SDONLINE:
  165                 switch (new_state) {
  166                 case BIOC_SDOFFLINE:
  167                 case BIOC_SDSCRUB:
  168                         break;
  169                 default:
  170                         goto die;
  171                 }
  172                 break;
  173 
  174         case BIOC_SDOFFLINE:
  175                 if (new_state == BIOC_SDREBUILD) {
  176                         ;
  177                 } else
  178                         goto die;
  179                 break;
  180 
  181         case BIOC_SDSCRUB:
  182                 switch (new_state) {
  183                 case BIOC_SDONLINE:
  184                 case BIOC_SDOFFLINE:
  185                         break;
  186                 default:
  187                         goto die;
  188                 }
  189                 break;
  190 
  191         case BIOC_SDREBUILD:
  192                 switch (new_state) {
  193                 case BIOC_SDONLINE:
  194                 case BIOC_SDOFFLINE:
  195                         break;
  196                 default:
  197                         goto die;
  198                 }
  199                 break;
  200 
  201         default:
  202 die:
  203                 splx(s); /* XXX */
  204                 panic("%s: %s: %s: invalid chunk state transition %d -> %d",
  205                     DEVNAME(sd->sd_sc),
  206                     sd->sd_meta->ssd_devname,
  207                     sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
  208                     old_state, new_state);
  209                 /* NOTREACHED */
  210         }
  211 
  212         sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
  213         sd->sd_set_vol_state(sd);
  214 
  215         sd->sd_must_flush = 1;
  216         task_add(systq, &sd->sd_meta_save_task);
  217 done:
  218         splx(s);
  219 }
  220 
  221 void
  222 sr_raid5_set_vol_state(struct sr_discipline *sd)
  223 {
  224         int                     states[SR_MAX_STATES];
  225         int                     new_state, i, s, nd;
  226         int                     old_state = sd->sd_vol_status;
  227 
  228         DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
  229             DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
  230 
  231         nd = sd->sd_meta->ssdi.ssd_chunk_no;
  232 
  233         for (i = 0; i < SR_MAX_STATES; i++)
  234                 states[i] = 0;
  235 
  236         for (i = 0; i < nd; i++) {
  237                 s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
  238                 if (s >= SR_MAX_STATES)
  239                         panic("%s: %s: %s: invalid chunk state",
  240                             DEVNAME(sd->sd_sc),
  241                             sd->sd_meta->ssd_devname,
  242                             sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
  243                 states[s]++;
  244         }
  245 
  246         if (states[BIOC_SDONLINE] == nd)
  247                 new_state = BIOC_SVONLINE;
  248         else if (states[BIOC_SDONLINE] < nd - 1)
  249                 new_state = BIOC_SVOFFLINE;
  250         else if (states[BIOC_SDSCRUB] != 0)
  251                 new_state = BIOC_SVSCRUB;
  252         else if (states[BIOC_SDREBUILD] != 0)
  253                 new_state = BIOC_SVREBUILD;
  254         else if (states[BIOC_SDONLINE] == nd - 1)
  255                 new_state = BIOC_SVDEGRADED;
  256         else {
  257 #ifdef SR_DEBUG
  258                 DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
  259                     "was %d\n", DEVNAME(sd->sd_sc), old_state);
  260                 for (i = 0; i < nd; i++)
  261                         DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
  262                             DEVNAME(sd->sd_sc), i,
  263                             sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
  264 #endif
  265                 panic("invalid volume state");
  266         }
  267 
  268         DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n",
  269             DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
  270             old_state, new_state);
  271 
  272         switch (old_state) {
  273         case BIOC_SVONLINE:
  274                 switch (new_state) {
  275                 case BIOC_SVONLINE: /* can go to same state */
  276                 case BIOC_SVOFFLINE:
  277                 case BIOC_SVDEGRADED:
  278                 case BIOC_SVREBUILD: /* happens on boot */
  279                         break;
  280                 default:
  281                         goto die;
  282                 }
  283                 break;
  284 
  285         case BIOC_SVOFFLINE:
  286                 /* XXX this might be a little too much */
  287                 goto die;
  288 
  289         case BIOC_SVDEGRADED:
  290                 switch (new_state) {
  291                 case BIOC_SVOFFLINE:
  292                 case BIOC_SVREBUILD:
  293                 case BIOC_SVDEGRADED: /* can go to the same state */
  294                         break;
  295                 default:
  296                         goto die;
  297                 }
  298                 break;
  299 
  300         case BIOC_SVBUILDING:
  301                 switch (new_state) {
  302                 case BIOC_SVONLINE:
  303                 case BIOC_SVOFFLINE:
  304                 case BIOC_SVBUILDING: /* can go to the same state */
  305                         break;
  306                 default:
  307                         goto die;
  308                 }
  309                 break;
  310 
  311         case BIOC_SVSCRUB:
  312                 switch (new_state) {
  313                 case BIOC_SVONLINE:
  314                 case BIOC_SVOFFLINE:
  315                 case BIOC_SVDEGRADED:
  316                 case BIOC_SVSCRUB: /* can go to same state */
  317                         break;
  318                 default:
  319                         goto die;
  320                 }
  321                 break;
  322 
  323         case BIOC_SVREBUILD:
  324                 switch (new_state) {
  325                 case BIOC_SVONLINE:
  326                 case BIOC_SVOFFLINE:
  327                 case BIOC_SVDEGRADED:
  328                 case BIOC_SVREBUILD: /* can go to the same state */
  329                         break;
  330                 default:
  331                         goto die;
  332                 }
  333                 break;
  334 
  335         default:
  336 die:
  337                 panic("%s: %s: invalid volume state transition %d -> %d",
  338                     DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
  339                     old_state, new_state);
  340                 /* NOTREACHED */
  341         }
  342 
  343         sd->sd_vol_status = new_state;
  344 }
  345 
  346 static inline int
  347 sr_raid5_chunk_online(struct sr_discipline *sd, int chunk)
  348 {
  349         switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
  350         case BIOC_SDONLINE:
  351         case BIOC_SDSCRUB:
  352                 return 1;
  353         default:
  354                 return 0;
  355         }
  356 }
  357 
  358 static inline int
  359 sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk)
  360 {
  361         switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
  362         case BIOC_SDREBUILD:
  363                 return 1;
  364         default:
  365                 return 0;
  366         }
  367 }
  368 
  369 int
  370 sr_raid5_rw(struct sr_workunit *wu)
  371 {
  372         struct sr_workunit      *wu_r = NULL;
  373         struct sr_discipline    *sd = wu->swu_dis;
  374         struct scsi_xfer        *xs = wu->swu_xs;
  375         struct sr_chunk         *scp;
  376         daddr_t                 blkno, lba;
  377         int64_t                 chunk_offs, lbaoffs, offset, strip_offs;
  378         int64_t                 strip_bits, strip_no, strip_size;
  379         int64_t                 chunk, no_chunk;
  380         int64_t                 parity, row_size;
  381         long                    length, datalen;
  382         void                    *data;
  383         int                     s;
  384 
  385         /* blkno and scsi error will be handled by sr_validate_io */
  386         if (sr_validate_io(wu, &blkno, "sr_raid5_rw"))
  387                 goto bad;
  388 
  389         DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: blkno %lld size %d\n",
  390             DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
  391             (xs->flags & SCSI_DATA_IN) ? "read" : "write",
  392             (long long)blkno, xs->datalen);
  393 
  394         strip_size = sd->sd_meta->ssdi.ssd_strip_size;
  395         strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
  396         no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
  397         row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;
  398 
  399         data = xs->data;
  400         datalen = xs->datalen;
  401         lbaoffs = blkno << DEV_BSHIFT;
  402 
  403         if (xs->flags & SCSI_DATA_OUT) {
  404                 if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
  405                         printf("%s: %s failed to get read work unit",
  406                             DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
  407                         goto bad;
  408                 }
  409                 wu_r->swu_state = SR_WU_INPROGRESS;
  410                 wu_r->swu_flags |= SR_WUF_DISCIPLINE;
  411         }
  412 
  413         wu->swu_blk_start = 0;
  414         while (datalen != 0) {
  415                 strip_no = lbaoffs >> strip_bits;
  416                 strip_offs = lbaoffs & (strip_size - 1);
  417                 chunk_offs = (strip_no / no_chunk) << strip_bits;
  418                 offset = chunk_offs + strip_offs;
  419 
  420                 /* get size remaining in this stripe */
  421                 length = MIN(strip_size - strip_offs, datalen);
  422 
  423                 /*
  424                  * Map disk offset to data and parity chunks, using a left
  425                  * asymmetric algorithm for the parity assignment.
  426                  */
  427                 chunk = strip_no % no_chunk;
  428                 parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
  429                 if (chunk >= parity)
  430                         chunk++;
  431 
  432                 lba = offset >> DEV_BSHIFT;
  433 
  434                 /* XXX big hammer.. exclude I/O from entire stripe */
  435                 if (wu->swu_blk_start == 0)
  436                         wu->swu_blk_start = (strip_no / no_chunk) * row_size;
  437                 wu->swu_blk_end = (strip_no / no_chunk) * row_size +
  438                     (row_size - 1);
  439 
  440                 scp = sd->sd_vol.sv_chunks[chunk];
  441                 if (xs->flags & SCSI_DATA_IN) {
  442                         switch (scp->src_meta.scm_status) {
  443                         case BIOC_SDONLINE:
  444                         case BIOC_SDSCRUB:
  445                                 /*
  446                                  * Chunk is online, issue a single read
  447                                  * request.
  448                                  */
  449                                 if (sr_raid5_addio(wu, chunk, lba, length,
  450                                     data, xs->flags, 0, NULL))
  451                                         goto bad;
  452                                 break;
  453                         case BIOC_SDOFFLINE:
  454                         case BIOC_SDREBUILD:
  455                         case BIOC_SDHOTSPARE:
  456                                 if (sr_raid5_regenerate(wu, chunk, lba,
  457                                     length, data))
  458                                         goto bad;
  459                                 break;
  460                         default:
  461                                 printf("%s: is offline, can't read\n",
  462                                     DEVNAME(sd->sd_sc));
  463                                 goto bad;
  464                         }
  465                 } else {
  466                         if (sr_raid5_write(wu, wu_r, chunk, parity, lba,
  467                             length, data, xs->flags, 0))
  468                                 goto bad;
  469                 }
  470 
  471                 /* advance to next block */
  472                 lbaoffs += length;
  473                 datalen -= length;
  474                 data += length;
  475         }
  476 
  477         s = splbio();
  478         if (wu_r) {
  479                 if (wu_r->swu_io_count > 0) {
  480                         /* collide write request with reads */
  481                         wu_r->swu_blk_start = wu->swu_blk_start;
  482                         wu_r->swu_blk_end = wu->swu_blk_end;
  483 
  484                         wu->swu_state = SR_WU_DEFERRED;
  485                         wu_r->swu_collider = wu;
  486                         TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
  487 
  488                         wu = wu_r;
  489                 } else {
  490                         sr_scsi_wu_put(sd, wu_r);
  491                 }
  492         }
  493         splx(s);
  494 
  495         sr_schedule_wu(wu);
  496 
  497         return (0);
  498 
  499 bad:
  500         /* wu is unwound by sr_wu_put */
  501         if (wu_r)
  502                 sr_scsi_wu_put(sd, wu_r);
  503         return (1);
  504 }
  505 
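/*
 * The data/parity mapping in sr_raid5_rw() above is the classic "left
 * asymmetric" RAID 5 layout: parity starts on the last chunk and rotates
 * towards the first, while data strips always fill the remaining chunks
 * from left to right. A sketch of the layout for a four-chunk volume
 * (no_chunk = 3 data strips per row, Dn = data strip n, P = parity):
 *
 *             chunk 0   chunk 1   chunk 2   chunk 3
 *     row 0:    D0        D1        D2        P
 *     row 1:    D3        D4        P         D5
 *     row 2:    D6        P         D7        D8
 *     row 3:    P         D9        D10       D11
 *     row 4:    D12       D13       D14       P      (pattern repeats)
 *
 * which is what
 *
 *     chunk  = strip_no % no_chunk;
 *     parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
 *     if (chunk >= parity)
 *             chunk++;
 *
 * produces when evaluated row by row.
 */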
  506 int
  507 sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno,
  508     long len, void *data)
  509 {
  510         struct sr_discipline    *sd = wu->swu_dis;
  511         int                     i;
  512 
  513         /*
  514          * Regenerate a block on a RAID 5 volume by xoring the data and parity
  515          * from all of the remaining online chunks. This requires the parity
  516          * to already be correct.
  517          */
  518 
  519         DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, "
  520             "regenerating block %llu\n",
  521             DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno);
  522 
  523         memset(data, 0, len);
  524         for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
  525                 if (i == chunk)
  526                         continue;
  527                 if (!sr_raid5_chunk_online(sd, i))
  528                         goto bad;
  529                 if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN,
  530                     0, data))
  531                         goto bad;
  532         }
  533         return (0);
  534 
  535 bad:
  536         return (1);
  537 }
  538 
  539 int
  540 sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk,
  541     int parity, daddr_t blkno, long len, void *data, int xsflags,
  542     int ccbflags)
  543 {
  544         struct sr_discipline    *sd = wu->swu_dis;
  545         struct scsi_xfer        *xs = wu->swu_xs;
  546         void                    *xorbuf;
  547         int                     chunk_online, chunk_rebuild;
  548         int                     parity_online, parity_rebuild;
  549         int                     other_offline = 0, other_rebuild = 0;
  550         int                     i;
  551 
  552         /*
  553          * Perform a write to a RAID 5 volume. This write routine does not
   554  * require the parity to already be correct and will operate on an
  555          * uninitialised volume.
  556          *
  557          * There are four possible cases:
  558          *
  559          * 1) All data chunks and parity are online. In this case we read the
  560          *    data from all data chunks, except the one we are writing to, in
  561          *    order to calculate and write the new parity.
  562          *
  563          * 2) The parity chunk is offline. In this case we only need to write
  564          *    to the data chunk. No parity calculation is required.
  565          *
  566          * 3) The data chunk is offline. In this case we read the data from all
  567          *    online chunks in order to calculate and write the new parity.
  568          *    This is the same as (1) except we do not write the data chunk.
  569          *
  570          * 4) A different data chunk is offline. The new parity is calculated
  571          *    by taking the existing parity, xoring the original data and
  572          *    xoring in the new data. This requires that the parity already be
  573          *    correct, which it will be if any of the data chunks has
  574          *    previously been written.
  575          *
  576          * There is an additional complication introduced by a chunk that is
  577          * being rebuilt. If this is the data or parity chunk, then we want
  578          * to write to it as per normal. If it is another data chunk then we
  579          * need to presume that it has not yet been regenerated and use the
  580          * same method as detailed in (4) above.
  581          */
  582 
  583         DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i "
  584             "blkno %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
  585             chunk, parity, (unsigned long long)blkno);
  586 
  587         chunk_online = sr_raid5_chunk_online(sd, chunk);
  588         chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk);
  589         parity_online = sr_raid5_chunk_online(sd, parity);
  590         parity_rebuild = sr_raid5_chunk_rebuild(sd, parity);
  591 
  592         for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
  593                 if (i == chunk || i == parity)
  594                         continue;
  595                 if (sr_raid5_chunk_rebuild(sd, i))
  596                         other_rebuild = 1;
  597                 else if (!sr_raid5_chunk_online(sd, i))
  598                         other_offline = 1;
  599         }
  600 
  601         DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, "
  602             "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
  603             chunk_online, parity_online, other_offline);
  604 
  605         if (!parity_online && !parity_rebuild)
  606                 goto data_write;
  607 
  608         xorbuf = sr_block_get(sd, len);
  609         if (xorbuf == NULL)
  610                 goto bad;
  611         memcpy(xorbuf, data, len);
  612 
  613         if (other_offline || other_rebuild) {
  614 
  615                 /*
  616                  * XXX - If we can guarantee that this LBA has been scrubbed
  617                  * then we can also take this faster path.
  618                  */
  619 
  620                 /* Read in existing data and existing parity. */
  621                 if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL,
  622                     SCSI_DATA_IN, 0, xorbuf))
  623                         goto bad;
  624                 if (sr_raid5_addio(wu_r, parity, blkno, len, NULL,
  625                     SCSI_DATA_IN, 0, xorbuf))
  626                         goto bad;
  627 
  628         } else {
  629 
  630                 /* Read in existing data from all other chunks. */
  631                 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
  632                         if (i == chunk || i == parity)
  633                                 continue;
  634                         if (sr_raid5_addio(wu_r, i, blkno, len, NULL,
  635                             SCSI_DATA_IN, 0, xorbuf))
  636                                 goto bad;
  637                 }
  638 
  639         }
  640 
  641         /* Write new parity. */
  642         if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags,
  643             SR_CCBF_FREEBUF, NULL))
  644                 goto bad;
  645 
  646 data_write:
  647         /* Write new data. */
  648         if (chunk_online || chunk_rebuild)
  649                 if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags,
  650                     0, NULL))
  651                         goto bad;
  652 
  653         return (0);
  654 
  655 bad:
  656         return (1);
  657 }
  658 
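/*
 * A sketch of the parity arithmetic behind case (4) in sr_raid5_write()
 * above. With P = D0 ^ D1 ^ ... ^ Dn-1, rewriting a single data strip Dk
 * needs only the old data and the old parity, not the other chunks:
 *
 *     P_new = P_old ^ Dk_old ^ Dk_new
 *
 * which is what the other_offline/other_rebuild branch does: xorbuf starts
 * out holding Dk_new (the memcpy above), the two reads queued on wu_r XOR
 * Dk_old and P_old into it via ccb_opaque in sr_raid5_intr(), and the
 * result is written back as the new parity. The full-stripe branch instead
 * computes P_new = Dk_new ^ (XOR of all other data strips), which stays
 * correct even if the existing parity never was.
 */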
  659 void
  660 sr_raid5_intr(struct buf *bp)
  661 {
  662         struct sr_ccb           *ccb = (struct sr_ccb *)bp;
  663         struct sr_workunit      *wu = ccb->ccb_wu;
  664         struct sr_discipline    *sd = wu->swu_dis;
  665         int                     s;
  666 
  667         DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n",
  668             DEVNAME(sd->sd_sc), bp, wu->swu_xs);
  669 
  670         s = splbio();
  671         sr_ccb_done(ccb);
  672 
  673         /* XXX - Should this be done via the taskq? */
  674 
  675         /* XOR data to result. */
  676         if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque)
  677                 sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data,
  678                     ccb->ccb_buf.b_bcount);
  679 
  680         /* Free allocated data buffer. */
  681         if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
  682                 sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
  683                 ccb->ccb_buf.b_data = NULL;
  684         }
  685 
  686         sr_wu_done(wu);
  687         splx(s);
  688 }
  689 
  690 int
  691 sr_raid5_wu_done(struct sr_workunit *wu)
  692 {
  693         struct sr_discipline    *sd = wu->swu_dis;
  694         struct scsi_xfer        *xs = wu->swu_xs;
  695 
  696         /* XXX - we have no way of propagating errors... */
  697         if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD))
  698                 return SR_WU_OK;
  699 
  700         /* XXX - This is insufficient for RAID 5. */
  701         if (wu->swu_ios_succeeded > 0) {
  702                 xs->error = XS_NOERROR;
  703                 return SR_WU_OK;
  704         }
  705 
  706         if (xs->flags & SCSI_DATA_IN) {
  707                 printf("%s: retrying read on block %lld\n",
  708                     sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
  709                 sr_wu_release_ccbs(wu);
  710                 wu->swu_state = SR_WU_RESTART;
  711                 if (sd->sd_scsi_rw(wu) == 0)
  712                         return SR_WU_RESTART;
  713         } else {
  714                 /* XXX - retry write if we just went from online to degraded. */
  715                 printf("%s: permanently fail write on block %lld\n",
  716                     sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
  717         }
  718 
  719         wu->swu_state = SR_WU_FAILED;
  720         xs->error = XS_DRIVER_STUFFUP;
  721 
  722         return SR_WU_FAILED;
  723 }
  724 
  725 int
  726 sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
  727     long len, void *data, int xsflags, int ccbflags, void *xorbuf)
  728 {
  729         struct sr_discipline    *sd = wu->swu_dis;
  730         struct sr_ccb           *ccb;
  731 
  732         DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld "
  733             "length %ld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
  734             chunk, (long long)blkno, len, xorbuf ? "X0R" : "-");
  735 
  736         /* Allocate temporary buffer. */
  737         if (data == NULL) {
  738                 data = sr_block_get(sd, len);
  739                 if (data == NULL)
  740                         return (-1);
  741                 ccbflags |= SR_CCBF_FREEBUF;
  742         }
  743 
  744         ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
  745         if (ccb == NULL) {
  746                 if (ccbflags & SR_CCBF_FREEBUF)
  747                         sr_block_put(sd, data, len);
  748                 return (-1);
  749         }
  750         ccb->ccb_opaque = xorbuf;
  751         sr_wu_enqueue_ccb(wu, ccb);
  752 
  753         return (0);
  754 }
  755 
  756 void
  757 sr_raid5_xor(void *a, void *b, int len)
  758 {
  759         uint32_t                *xa = a, *xb = b;
  760 
  761         len >>= 2;
  762         while (len--)
  763                 *xa++ ^= *xb++;
  764 }
  765 
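/*
 * A minimal sketch of sr_raid5_xor() used as an accumulator, which is how
 * sr_raid5_regenerate() recovers a missing strip: the destination buffer
 * is zeroed and every remaining chunk's strip (data and parity alike) is
 * XORed into it as each read completes in sr_raid5_intr(). Conceptually,
 * for a four-chunk volume with chunk 1 unreadable:
 *
 *     memset(buf, 0, len);
 *     sr_raid5_xor(buf, d0, len);          <- chunk 0 data
 *     sr_raid5_xor(buf, d2, len);          <- chunk 2 data
 *     sr_raid5_xor(buf, p,  len);          <- parity strip
 *
 * leaves buf holding the missing chunk 1 data, since d1 = d0 ^ d2 ^ p
 * whenever p = d0 ^ d1 ^ d2. Note that len is processed as whole 32-bit
 * words (len >>= 2), so buffer lengths are assumed to be 4-byte multiples.
 */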
  766 void
  767 sr_raid5_rebuild(struct sr_discipline *sd)
  768 {
  769         int64_t strip_no, strip_size, strip_bits, i, restart;
  770         int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size;
  771         struct sr_workunit *wu_r, *wu_w;
  772         int s, slept, percent = 0, old_percent = -1;
  773         int rebuild_chunk = -1;
  774         void *xorbuf;
  775 
  776         /* Find the rebuild chunk. */
  777         for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
  778                 if (sr_raid5_chunk_rebuild(sd, i)) {
  779                         rebuild_chunk = i;
  780                         break;
  781                 }
  782         }
  783         if (rebuild_chunk == -1)
  784                 goto bad;
  785 
  786         strip_size = sd->sd_meta->ssdi.ssd_strip_size;
  787         strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
  788         chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1;
  789         chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count;
  790         chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits;
  791         row_size = (chunk_count << strip_bits) >> DEV_BSHIFT;
  792 
  793         DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, "
  794             "chunk count = %lld, chunk size = %lld, chunk strips = %lld, "
  795             "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
  796             sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips,
  797             row_size);
  798 
  799         restart = sd->sd_meta->ssd_rebuild / row_size;
  800         if (restart > chunk_strips) {
  801                 printf("%s: bogus rebuild restart offset, starting from 0\n",
  802                     DEVNAME(sd->sd_sc));
  803                 restart = 0;
  804         }
  805         if (restart != 0) {
  806                 percent = sr_rebuild_percent(sd);
  807                 printf("%s: resuming rebuild on %s at %d%%\n",
  808                     DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, percent);
  809         }
  810 
  811         for (strip_no = restart; strip_no < chunk_strips; strip_no++) {
  812                 chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no;
  813 
  814                 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, "
  815                     "chunk lba = %lld\n", DEVNAME(sd->sd_sc),
  816                     sd->sd_meta->ssd_devname, strip_no, chunk_lba);
  817 
  818                 wu_w = sr_scsi_wu_get(sd, 0);
  819                 wu_r = sr_scsi_wu_get(sd, 0);
  820 
  821                 xorbuf = sr_block_get(sd, strip_size);
  822                 if (xorbuf == NULL)
  823                         goto bad;
  824                 if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba,
  825                     strip_size, xorbuf))
  826                         goto bad;
  827                 if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size,
  828                     xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL))
  829                         goto bad;
  830 
  831                 /* Collide write work unit with read work unit. */
  832                 wu_r->swu_state = SR_WU_INPROGRESS;
  833                 wu_r->swu_flags |= SR_WUF_REBUILD;
  834                 wu_w->swu_state = SR_WU_DEFERRED;
  835                 wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
  836                 wu_r->swu_collider = wu_w;
  837 
  838                 /* Block I/O to this strip while we rebuild it. */
  839                 wu_r->swu_blk_start = (strip_no / chunk_count) * row_size;
  840                 wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1;
  841                 wu_w->swu_blk_start = wu_r->swu_blk_start;
  842                 wu_w->swu_blk_end = wu_r->swu_blk_end;
  843 
  844                 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, "
  845                     "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc),
  846                     sd->sd_meta->ssd_devname,
  847                     wu_r->swu_blk_start, wu_r->swu_blk_end);
  848 
  849                 s = splbio();
  850                 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
  851                 splx(s);
  852 
  853                 sr_schedule_wu(wu_r);
  854 
  855                 slept = 0;
  856                 while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
  857                         tsleep_nsec(wu_w, PRIBIO, "sr_rebuild", INFSLP);
  858                         slept = 1;
  859                 }
  860                 if (!slept) {
  861                         tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield",
  862                             MSEC_TO_NSEC(1));
  863                 }
  864 
  865                 sr_scsi_wu_put(sd, wu_r);
  866                 sr_scsi_wu_put(sd, wu_w);
  867 
  868                 sd->sd_meta->ssd_rebuild = chunk_lba * chunk_count;
  869 
  870                 percent = sr_rebuild_percent(sd);
  871                 if (percent != old_percent && strip_no != chunk_strips - 1) {
  872                         if (sr_meta_save(sd, SR_META_DIRTY))
  873                                 printf("%s: could not save metadata to %s\n",
  874                                     DEVNAME(sd->sd_sc),
  875                                     sd->sd_meta->ssd_devname);
  876                         old_percent = percent;
  877                 }
  878 
  879                 if (sd->sd_reb_abort)
  880                         goto abort;
  881         }
  882 
  883         DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc),
  884             sd->sd_meta->ssd_devname);
  885 
  886         /* all done */
  887         sd->sd_meta->ssd_rebuild = 0;
  888         for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
  889                 if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
  890                     BIOC_SDREBUILD) {
  891                         sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE);
  892                         break;
  893                 }
  894         }
  895 
  896         return;
  897 
  898 abort:
  899         if (sr_meta_save(sd, SR_META_DIRTY))
  900                 printf("%s: could not save metadata to %s\n",
  901                     DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
  902 bad:
  903         return;
  904 }
  905 
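/*
 * A sketch of the rebuild progress bookkeeping in the loop above.
 * sd_meta->ssd_rebuild records chunk_lba * chunk_count, the volume-relative
 * block offset of the row just rebuilt; on resume it is divided by
 * row_size = (chunk_count << strip_bits) >> DEV_BSHIFT to recover the
 * strip number:
 *
 *     restart = (strip_no * (strip_size >> DEV_BSHIFT) * chunk_count)
 *             / ((chunk_count * strip_size) >> DEV_BSHIFT)
 *             = strip_no
 *
 * so an interrupted rebuild continues from the last strip it recorded
 * (unless the stored offset is out of range, in which case it restarts
 * from zero).
 */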
  906 #if 0
  907 void
  908 sr_raid5_scrub(struct sr_discipline *sd)
  909 {
  910         int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits;
  911         int64_t i;
  912         struct sr_workunit *wu_r, *wu_w;
  913         int s, slept;
  914         void *xorbuf;
  915 
  916         wu_w = sr_scsi_wu_get(sd, 0);
  917         wu_r = sr_scsi_wu_get(sd, 0);
  918 
  919         no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
  920         strip_size = sd->sd_meta->ssdi.ssd_strip_size;
  921         strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
  922         max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits;
  923 
  924         for (strip_no = 0; strip_no < max_strip; strip_no++) {
  925                 parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
  926 
  927                 xorbuf = sr_block_get(sd, strip_size);
  928                 for (i = 0; i <= no_chunk; i++) {
  929                         if (i != parity)
  930                                 sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size,
  931                                     NULL, SCSI_DATA_IN, 0, xorbuf);
  932                 }
  933                 sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf,
  934                     SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL);
  935 
  936                 wu_r->swu_flags |= SR_WUF_REBUILD;
  937 
  938                 /* Collide wu_w with wu_r */
  939                 wu_w->swu_state = SR_WU_DEFERRED;
  940                 wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
  941                 wu_r->swu_collider = wu_w;
  942 
  943                 s = splbio();
  944                 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
  945                 splx(s);
  946 
  947                 wu_r->swu_state = SR_WU_INPROGRESS;
  948                 sr_schedule_wu(wu_r);
  949 
  950                 slept = 0;
  951                 while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
  952                         tsleep_nsec(wu_w, PRIBIO, "sr_scrub", INFSLP);
  953                         slept = 1;
  954                 }
  955                 if (!slept) {
  956                         tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield",
  957                             MSEC_TO_NSEC(1));
  958                 }
  959         }
  960 }
  961 #endif



This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.