The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/dev/vinum/vinumraid5.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1997, 1998
    3  *      Cybernet Corporation and Nan Yang Computer Services Limited.
    4  *      All rights reserved.
    5  *
    6  *  This software was developed as part of the NetMAX project.
    7  *
    8  *  Written by Greg Lehey
    9  *
   10  *  This software is distributed under the so-called ``Berkeley
   11  *  License'':
   12  *
   13  * Redistribution and use in source and binary forms, with or without
   14  * modification, are permitted provided that the following conditions
   15  * are met:
   16  * 1. Redistributions of source code must retain the above copyright
   17  *    notice, this list of conditions and the following disclaimer.
   18  * 2. Redistributions in binary form must reproduce the above copyright
   19  *    notice, this list of conditions and the following disclaimer in the
   20  *    documentation and/or other materials provided with the distribution.
   21  * 3. All advertising materials mentioning features or use of this software
   22  *    must display the following acknowledgement:
   23  *      This product includes software developed by Cybernet Corporation
   24  *      and Nan Yang Computer Services Limited
   25  * 4. Neither the name of the Companies nor the names of its contributors
   26  *    may be used to endorse or promote products derived from this software
   27  *    without specific prior written permission.
   28  *
   29  * This software is provided ``as is'', and any express or implied
   30  * warranties, including, but not limited to, the implied warranties of
   31  * merchantability and fitness for a particular purpose are disclaimed.
   32  * In no event shall the company or contributors be liable for any
   33  * direct, indirect, incidental, special, exemplary, or consequential
   34  * damages (including, but not limited to, procurement of substitute
   35  * goods or services; loss of use, data, or profits; or business
   36  * interruption) however caused and on any theory of liability, whether
   37  * in contract, strict liability, or tort (including negligence or
   38  * otherwise) arising in any way out of the use of this software, even if
   39  * advised of the possibility of such damage.
   40  *
   41  * $Id: vinumraid5.c,v 1.23 2003/02/08 03:32:45 grog Exp $
   42  */
   43 
   44 #include <sys/cdefs.h>
   45 __FBSDID("$FreeBSD$");
   46 #include <dev/vinum/vinumhdr.h>
   47 #include <dev/vinum/request.h>
   48 #include <sys/resourcevar.h>
   49 
   50 /*
   51  * Parameters which describe the current transfer.
   52  * These are only used for calculation, but they
   53  * need to be passed to other functions, so it's
   54  * tidier to put them in a struct
   55  */
   56 struct metrics {                                            /* per-stripe transfer parameters, filled in by bre5() */
   57     daddr_t stripebase;                                     /* plex-relative address of the start of the stripe (1st subdisk) */
   58     int stripeoffset;                                       /* offset of the transfer start from the start of the stripe */
   59     int stripesectors;                                      /* total sectors to transfer in this stripe */
   60     daddr_t sdbase;                                         /* offset in subdisk of stripe base */
   61     int sdcount;                                            /* number of data subdisks involved in this transfer; bre5() adds one when it builds a parity request */
   62     daddr_t diskstart;                                      /* remember where this transfer starts (value of *diskaddr on entry to bre5) */
   63     int psdno;                                              /* index in plex->sdnos[] of the parity subdisk */
   64     int badsdno;                                            /* index of the down subdisk, or -1 if there is none */
   65     int firstsdno;                                          /* index of the first data subdisk */
   66     /* These correspond to the fields in rqelement, sort of */
   67     int useroffset;                                         /* offset of the start of this part in the user buffer */
   68     /*
   69      * Initial offset and length values for the first
   70      * data block
   71      */
   72     int initoffset;                                         /* offset of the transfer within the first data block */
   73     short initlen;                                          /* length in sectors of data transfer in the first block; NOTE(review): short, whereas datalen is int -- confirm the value always fits */
   74     /* Define a normal operation */
   75     int dataoffset;                                         /* offset within the current data block: initoffset for the first block, 0 for later ones */
   76     int datalen;                                            /* length in sectors of data transfer in the current block */
   77     /* Define a group operation */
   78     int groupoffset;                                        /* subdisk offset of group operation (taken from dataoffset of the bad block) */
   79     int grouplen;                                           /* length in sectors of group operation */
   80     /* Define a normal write operation */
   81     int writeoffset;                                        /* subdisk offset of normal write, spanning all blocks written */
   82     int writelen;                                           /* length in sectors of write operation; with writeoffset, defines the parity block */
   83     enum xferinfo flags;                                    /* XFR_* flags: to check what we're doing */
   84     int rqcount;                                            /* number of elements in the request group */
   85 };
   86 
   87 enum requeststatus bre5(struct request *rq,                 /* build the low-level requests for one RAID-4/RAID-5 plex */
   88     int plexno,                                             /* index into PLEX[] */
   89     daddr_t * diskstart,                                    /* in/out: current address, advanced as requests are built; named diskaddr in the definition */
   90     daddr_t diskend);                                       /* requests are built while *diskstart < diskend */
   91 void complete_raid5_write(struct rqelement *);              /* NOTE(review): not referenced in the visible part of this file -- presumably defined elsewhere */
   92 enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex); /* called by bre5(); a non-zero return is reported as REQUEST_ENOMEM */
   93 void setrqebounds(struct rqelement *rqe, struct metrics *mp); /* called by bre5() to set up the bounds of the transfer for rqe from the metrics */
   94 
   95 /*
   96  * define the low-level requests needed to perform
   97  * a high-level I/O operation for a specific plex
   98  * 'plexno'.
   99  *
  100  * Return 0 if all subdisks involved in the
  101  * request are up, 1 if some subdisks are not up,
  102  * and -1 if the request is at least partially
  103  * outside the bounds of the subdisks.
  104  *
  105  * Update *diskaddr (declared as diskstart in the
  106  * prototype) to the end address.  On read, return
  107  * on the first bad subdisk, so that the caller
  108  * (build_read_request) can try alternatives.
  109  *
  110  * On entry to this routine, the prq structures
  111  * are not assigned.  The assignment is performed
  112  * by expandrq().  Strictly speaking, the elements
  113  * rqe->sdno of all entries should be set to -1,
  114  * since 0 (from bzero) is a valid subdisk number.
  115  * We avoid this problem by initializing the ones
  116  * we use, and not looking at the others (index >=
  117  * prq->requests).
  118  */
  119 enum requeststatus
  120 bre5(struct request *rq,
  121     int plexno,
  122     daddr_t * diskaddr,
  123     daddr_t diskend)
  124 {
  125     struct metrics m;                                       /* most of the information */
  126     struct sd *sd;
  127     struct plex *plex;
  128     struct buf *bp;                                         /* user's bp */
  129     struct rqgroup *rqg;                                    /* the request group that we will create */
  130     struct rqelement *rqe;                                  /* point to this request information */
  131     int rsectors;                                           /* sectors remaining in this stripe */
  132     int mysdno;                                             /* another sd index in loops */
  133     int rqno;                                               /* request number */
  134 
  135     rqg = NULL;                                             /* shut up, damn compiler */
  136     m.diskstart = *diskaddr;                                /* start of transfer */
  137     bp = rq->bp;                                            /* buffer pointer */
  138     plex = &PLEX[plexno];                                   /* point to the plex */
  139 
  140 
  141     while (*diskaddr < diskend) {                           /* until we get it all sorted out */
  142         if (*diskaddr >= plex->length)                      /* beyond the end of the plex */
  143             return REQUEST_EOF;                             /* can't continue */
  144 
  145         m.badsdno = -1;                                     /* no bad subdisk yet */
  146 
  147         /* Part A: Define the request */
  148         /*
  149          * First, calculate some sizes:
  150          * The offset of the start address from
  151          * the start of the stripe.
  152          */
  153         m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));
  154 
  155         /*
  156          * The plex-relative address of the
  157          * start of the stripe.
  158          */
  159         m.stripebase = *diskaddr - m.stripeoffset;
  160 
  161         /* subdisk containing the parity stripe */
  162         if (plex->organization == plex_raid5)
  163             m.psdno = plex->subdisks - 1
  164                 - (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
  165                 % plex->subdisks;
  166         else                                                /* RAID-4 */
  167             m.psdno = plex->subdisks - 1;
  168 
  169         /*
  170          * The number of the subdisk in which
  171          * the start is located.
  172          */
  173         m.firstsdno = m.stripeoffset / plex->stripesize;
  174         if (m.firstsdno >= m.psdno)                         /* at or past parity sd */
  175             m.firstsdno++;                                  /* increment it */
  176 
  177         /*
  178          * The offset from the beginning of
  179          * the stripe on this subdisk.
  180          */
  181         m.initoffset = m.stripeoffset % plex->stripesize;
  182 
  183         /* The offset of the stripe start relative to this subdisk */
  184         m.sdbase = m.stripebase / (plex->subdisks - 1);
  185 
  186         m.useroffset = *diskaddr - m.diskstart;             /* The offset of the start in the user buffer */
  187 
  188         /*
  189          * The number of sectors to transfer in the
  190          * current (first) subdisk.
  191          */
  192         m.initlen = min(diskend - *diskaddr,                /* the amount remaining to transfer */
  193             plex->stripesize - m.initoffset);               /* and the amount left in this block */
  194 
  195         /*
  196          * The number of sectors to transfer in this stripe
  197          * is the minimum of the amount remaining to transfer
  198          * and the amount left in this stripe.
  199          */
  200         m.stripesectors = min(diskend - *diskaddr,
  201             plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);
  202 
  203         /* The number of data subdisks involved in this request */
  204         m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;
  205 
  206         /* Part B: decide what kind of transfer this will be.
  207 
  208          * start and end addresses of the transfer in
  209          * the current block.
  210          *
  211          * There are a number of different kinds of
  212          * transfer, each of which relates to a
  213          * specific subdisk:
  214          *
  215          * 1. Normal read.  All participating subdisks
  216          *    are up, and the transfer can be made
  217          *    directly to the user buffer.  The bounds
  218          *    of the transfer are described by
  219          *    m.dataoffset and m.datalen.  We have
  220          *    already calculated m.initoffset and
  221          *    m.initlen, which define the parameters
  222          *    for the first data block.
  223          *
  224          * 2. Recovery read.  One participating
  225          *    subdisk is down.  To recover data, all
  226          *    the other subdisks, including the parity
  227          *    subdisk, must be read.  The data is
  228          *    recovered by exclusive-oring all the
  229          *    other blocks.  The bounds of the
  230          *    transfer are described by m.groupoffset
  231          *    and m.grouplen.
  232          *
  233          * 3. A read request may request reading both
  234          *    available data (normal read) and
  235          *    non-available data (recovery read).
  236          *    This can be a problem if the address
  237          *    ranges of the two reads do not coincide:
  238          *    in this case, the normal read needs to
  239          *    be extended to cover the address range
  240          *    of the recovery read, and must thus be
  241          *    performed out of malloced memory.
  242          *
  243          * 4. Normal write.  All the participating
  244          *    subdisks are up.  The bounds of the
  245          *    transfer are described by m.dataoffset
  246          *    and m.datalen.  Since these values
  247          *    differ for each block, we calculate the
  248          *    bounds for the parity block
  249          *    independently as the maximum of the
  250          *    individual blocks and store these values
  251          *    in m.writeoffset and m.writelen.  This
  252          *    write proceeds in four phases:
  253          *
  254          *    i.  Read the old contents of each block
  255          *        and the parity block.
  256          *    ii.  ``Remove'' the old contents from
  257          *         the parity block with exclusive or.
  258          *    iii. ``Insert'' the new contents of the
  259          *          block in the parity block, again
  260          *          with exclusive or.
  261          *
  262          *    iv.  Write the new contents of the data
  263          *         blocks and the parity block.  The data
  264          *         block transfers can be made directly from
  265          *         the user buffer.
  266          *
  267          * 5. Degraded write where the data block is
  268          *    not available.  The bounds of the
  269          *    transfer are described by m.groupoffset
  270          *    and m.grouplen. This requires the
  271          *    following steps:
  272          *
  273          *    i.  Read in all the other data blocks,
  274          *        excluding the parity block.
  275          *
  276          *    ii.  Recreate the parity block from the
  277          *         other data blocks and the data to be
  278          *         written.
  279          *
  280          *    iii. Write the parity block.
  281          *
  282          * 6. Parityless write, a write where the
  283          *    parity block is not available.  This is
  284          *    in fact the simplest: just write the
  285          *    data blocks.  This can proceed directly
  286          *    from the user buffer.  The bounds of the
  287          *    transfer are described by m.dataoffset
  288          *    and m.datalen.
  289          *
  290          * 7. Combination of degraded data block write
  291          *    and normal write.  In this case the
  292          *    address ranges of the reads may also
  293          *    need to be extended to cover all
  294          *    participating blocks.
  295          *
  296          * All requests in a group transfer transfer
  297          * the same address range relative to their
  298          * subdisk.  The individual transfers may
  299          * vary, but since our group of requests is
  300          * all in a single slice, we can define a
  301          * range in which they all fall.
  302          *
  303          * In the following code section, we determine
  304          * which kind of transfer we will perform.  If
  305          * there is a group transfer, we also decide
  306          * its bounds relative to the subdisks.  At
  307          * the end, we have the following values:
  308          *
  309          *  m.flags indicates the kinds of transfers
  310          *    we will perform.
  311          *  m.initoffset indicates the offset of the
  312          *    beginning of any data operation relative
  313          *    to the beginning of the stripe base.
  314          *  m.initlen specifies the length of any data
  315          *    operation.
  316          *  m.dataoffset contains the same value as
  317          *    m.initoffset.
  318          *  m.datalen contains the same value as
  319          *    m.initlen.  Initially dataoffset and
  320          *    datalen describe the parameters for the
  321          *    first data block; while building the data
  322          *    block requests, they are updated for each
  323          *    block.
  324          *  m.groupoffset indicates the offset of any
  325          *    group operation relative to the beginning
  326          *    of the stripe base.
  327          *  m.grouplen specifies the length of any
  328          *    group operation.
  329          *  m.writeoffset indicates the offset of a
  330          *    normal write relative to the beginning of
  331          *    the stripe base.  This value differs from
  332          *    m.dataoffset in that it applies to the
  333          *    entire operation, and not just the first
  334          *    block.
  335          *  m.writelen specifies the total span of a
  336          *    normal write operation.  writeoffset and
  337          *    writelen are used to define the parity
  338          *    block.
  339          */
  340         m.groupoffset = 0;                                  /* assume no group... */
  341         m.grouplen = 0;                                     /* until we know we have one */
  342         m.writeoffset = m.initoffset;                       /* start offset of transfer */
  343         m.writelen = 0;                                     /* nothing to write yet */
  344         m.flags = 0;                                        /* no flags yet */
  345         rsectors = m.stripesectors;                         /* remaining sectors to examine */
  346         m.dataoffset = m.initoffset;                        /* start at the beginning of the transfer */
  347         m.datalen = m.initlen;
  348 
  349         if (m.sdcount > 1) {
  350             plex->multiblock++;                             /* more than one block for the request */
  351             /*
  352              * If we have two transfers that don't overlap,
  353              * (one at the end of the first block, the other
  354              * at the beginning of the second block),
  355              * it's cheaper to split them.
  356              */
  357             if (rsectors < plex->stripesize) {
  358                 m.sdcount = 1;                              /* just one subdisk */
  359                 m.stripesectors = m.initlen;                /* and just this many sectors */
  360                 rsectors = m.initlen;                       /* and in the loop counter */
  361             }
  362         }
  363         if (SD[plex->sdnos[m.psdno]].state < sd_reborn)     /* is our parity subdisk down? */
  364             m.badsdno = m.psdno;                            /* note that it's down */
  365         if (bp->b_iocmd == BIO_READ) {                      /* read operation */
  366             for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
  367                 if (mysdno == m.psdno)                      /* ignore parity on read */
  368                     mysdno++;
  369                 if (mysdno == plex->subdisks)               /* wraparound */
  370                     mysdno = 0;
  371                 if (mysdno == m.psdno)                      /* parity, */
  372                     mysdno++;                               /* we've given already */
  373 
  374                 if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
  375                     if (m.badsdno >= 0)                     /* we had one already, */
  376                         return REQUEST_DOWN;                /* we can't take a second */
  377                     m.badsdno = mysdno;                     /* got the first */
  378                     m.groupoffset = m.dataoffset;           /* define the bounds */
  379                     m.grouplen = m.datalen;
  380                     m.flags |= XFR_RECOVERY_READ;           /* we need recovery */
  381                     plex->recovered_reads++;                /* count another one */
  382                 } else
  383                     m.flags |= XFR_NORMAL_READ;             /* normal read */
  384 
  385                 /* Update the pointers for the next block */
  386                 m.dataoffset = 0;                           /* back to the start of the stripe */
  387                 rsectors -= m.datalen;                      /* remaining sectors to examine */
  388                 m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
  389             }
  390         } else {                                            /* write operation */
  391             for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
  392                 if (mysdno == m.psdno)                      /* parity stripe, we've dealt with that */
  393                     mysdno++;
  394                 if (mysdno == plex->subdisks)               /* wraparound */
  395                     mysdno = 0;
  396                 if (mysdno == m.psdno)                      /* parity, */
  397                     mysdno++;                               /* we've given already */
  398 
  399                 sd = &SD[plex->sdnos[mysdno]];
  400                 if (sd->state != sd_up) {
  401                     enum requeststatus s;
  402 
  403                     s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
  404                     if (s && (m.badsdno >= 0)) {            /* second bad disk, */
  405                         int sdno;
  406                         /*
  407                          * If the parity disk is down, there's
  408                          * no recovery.  We make all involved
  409                          * subdisks stale.  Otherwise, we
  410                          * should be able to recover, but it's
  411                          * like pulling teeth.  Fix it later.
  412                          */
  413                         for (sdno = 0; sdno < m.sdcount; sdno++) {
  414                             struct sd *sd = &SD[plex->sdnos[sdno]];
  415                             if (sd->state >= sd_reborn)     /* sort of up, */
  416                                 set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
  417                         }
  418                         return s;                           /* and crap out */
  419                     }
  420                     m.badsdno = mysdno;                     /* note which one is bad */
  421                     m.flags |= XFR_DEGRADED_WRITE;          /* we need recovery */
  422                     plex->degraded_writes++;                /* count another one */
  423                     m.groupoffset = m.dataoffset;           /* define the bounds */
  424                     m.grouplen = m.datalen;
  425                 } else {
  426                     m.flags |= XFR_NORMAL_WRITE;            /* normal write operation */
  427                     if (m.writeoffset > m.dataoffset) {     /* move write operation lower */
  428                         m.writelen = max(m.writeoffset + m.writelen,
  429                             m.dataoffset + m.datalen)
  430                             - m.dataoffset;
  431                         m.writeoffset = m.dataoffset;
  432                     } else
  433                         m.writelen = max(m.writeoffset + m.writelen,
  434                             m.dataoffset + m.datalen)
  435                             - m.writeoffset;
  436                 }
  437 
  438                 /* Update the pointers for the next block */
  439                 m.dataoffset = 0;                           /* back to the start of the stripe */
  440                 rsectors -= m.datalen;                      /* remaining sectors to examine */
  441                 m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
  442             }
  443             if (m.badsdno == m.psdno) {                     /* got a bad parity block, */
  444                 struct sd *psd = &SD[plex->sdnos[m.psdno]];
  445 
  446                 if (psd->state == sd_down)
  447                     set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
  448                 else if (psd->state == sd_crashed)
  449                     set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
  450                 m.flags &= ~XFR_NORMAL_WRITE;               /* this write isn't normal, */
  451                 m.flags |= XFR_PARITYLESS_WRITE;            /* it's parityless */
  452                 plex->parityless_writes++;                  /* count another one */
  453             }
  454         }
  455 
  456         /* reset the initial transfer values */
  457         m.dataoffset = m.initoffset;                        /* start at the beginning of the transfer */
  458         m.datalen = m.initlen;
  459 
  460         /* decide how many requests we need */
  461         if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
  462             /* doing a recovery read or degraded write, */
  463             m.rqcount = plex->subdisks;                     /* all subdisks */
  464         else if (m.flags & XFR_NORMAL_WRITE)                /* normal write, */
  465             m.rqcount = m.sdcount + 1;                      /* all data blocks and the parity block */
  466         else                                                /* parityless write or normal read */
  467             m.rqcount = m.sdcount;                          /* just the data blocks */
  468 
  469         /* Part C: build the requests */
  470         rqg = allocrqg(rq, m.rqcount);                      /* get a request group */
  471         if (rqg == NULL) {                                  /* malloc failed */
  472             bp->b_error = ENOMEM;
  473             bp->b_ioflags |= BIO_ERROR;
  474             return REQUEST_ENOMEM;
  475         }
  476         rqg->plexno = plexno;
  477         rqg->flags = m.flags;
  478         rqno = 0;                                           /* index in the request group */
  479 
  480         /* 1: PARITY BLOCK */
  481         /*
  482          * Are we performing an operation which requires parity?  In that case,
  483          * work out the parameters and define the parity block.
  484          * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
  485          */
  486         if (m.flags & XFR_PARITYOP) {                       /* need parity */
  487             rqe = &rqg->rqe[rqno];                          /* point to element */
  488             sd = &SD[plex->sdnos[m.psdno]];                 /* the subdisk in question */
  489             rqe->rqg = rqg;                                 /* point back to group */
  490             rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
  491             &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE);     /* transfer flags without data op stuf */
  492             setrqebounds(rqe, &m);                          /* set up the bounds of the transfer */
  493             rqe->sdno = sd->sdno;                           /* subdisk number */
  494             rqe->driveno = sd->driveno;
  495             if (build_rq_buffer(rqe, plex))                 /* build the buffer */
  496                 return REQUEST_ENOMEM;                      /* can't do it */
  497             rqe->b.b_iocmd = BIO_READ;                      /* we must read first */
  498             m.sdcount++;                                    /* adjust the subdisk count */
  499             rqno++;                                         /* and point to the next request */
  500         }
  501         /*
  502          * 2: DATA BLOCKS
  503          * Now build up requests for the blocks required
  504          * for individual transfers
  505          */
  506         for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
  507             if (mysdno == m.psdno)                          /* parity, */
  508                 mysdno++;                                   /* we've given already */
  509             if (mysdno == plex->subdisks)                   /* got to the end, */
  510                 mysdno = 0;                                 /* wrap around */
  511             if (mysdno == m.psdno)                          /* parity, */
  512                 mysdno++;                                   /* we've given already */
  513 
  514             rqe = &rqg->rqe[rqno];                          /* point to element */
  515             sd = &SD[plex->sdnos[mysdno]];                  /* the subdisk in question */
  516             rqe->rqg = rqg;                                 /* point to group */
  517             if (m.flags & XFR_NEEDS_MALLOC)                 /* we need a malloced buffer first */
  518                 rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
  519             else
  520                 rqe->flags = m.flags | XFR_DATA_BLOCK;      /* transfer flags */
  521             if (mysdno == m.badsdno) {                      /* this is the bad subdisk */
  522                 rqg->badsdno = rqno;                        /* note which one */
  523                 rqe->flags |= XFR_BAD_SUBDISK;              /* note that it's dead */
  524                 /*
  525                  * we can't read or write from/to it,
  526                  * but we don't need to malloc
  527                  */
  528                 rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
  529             }
  530             setrqebounds(rqe, &m);                          /* set up the bounds of the transfer */
  531             rqe->useroffset = m.useroffset;                 /* offset in user buffer */
  532             rqe->sdno = sd->sdno;                           /* subdisk number */
  533             rqe->driveno = sd->driveno;
  534             if (build_rq_buffer(rqe, plex))                 /* build the buffer */
  535                 return REQUEST_ENOMEM;                      /* can't do it */
  536             if ((m.flags & XFR_PARITYOP)                    /* parity operation, */
  537             &&((m.flags & XFR_BAD_SUBDISK) == 0))           /* and not the bad subdisk, */
  538                 rqe->b.b_iocmd = BIO_READ;                  /* we must read first */
  539 
  540             /* Now update pointers for the next block */
  541             *diskaddr += m.datalen;                         /* skip past what we've done */
  542             m.stripesectors -= m.datalen;                   /* deduct from what's left */
  543             m.useroffset += m.datalen;                      /* and move on in the user buffer */
  544             m.datalen = min(m.stripesectors, plex->stripesize); /* and recalculate */
  545             m.dataoffset = 0;                               /* start at the beginning of next block */
  546         }
  547 
  548         /*
  549          * 3: REMAINING BLOCKS FOR RECOVERY
  550          * Finally, if we have a recovery operation, build
  551          * up transfers for the other subdisks.  Follow the
  552          * subdisks around until we get to where we started.
  553          * These requests use only the group parameters.
  554          */
  555         if ((rqno < m.rqcount)                              /* haven't done them all already */
  556         &&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
  557             for (; rqno < m.rqcount; rqno++, mysdno++) {
  558                 if (mysdno == m.psdno)                      /* parity, */
  559                     mysdno++;                               /* we've given already */
  560                 if (mysdno == plex->subdisks)               /* got to the end, */
  561                     mysdno = 0;                             /* wrap around */
  562                 if (mysdno == m.psdno)                      /* parity, */
  563                     mysdno++;                               /* we've given already */
  564 
  565                 rqe = &rqg->rqe[rqno];                      /* point to element */
  566                 sd = &SD[plex->sdnos[mysdno]];              /* the subdisk in question */
  567                 rqe->rqg = rqg;                             /* point to group */
  568 
  569                 rqe->sdoffset = m.sdbase + m.groupoffset;   /* start of transfer */
  570                 rqe->dataoffset = 0;                        /* for tidiness' sake */
  571                 rqe->groupoffset = 0;                       /* group starts at the beginining */
  572                 rqe->datalen = 0;
  573                 rqe->grouplen = m.grouplen;
  574                 rqe->buflen = m.grouplen;
  575                 rqe->flags = (m.flags | XFR_MALLOCED)       /* transfer flags without data op stuf */
  576                 &~XFR_DATAOP;
  577                 rqe->sdno = sd->sdno;                       /* subdisk number */
  578                 rqe->driveno = sd->driveno;
  579                 if (build_rq_buffer(rqe, plex))             /* build the buffer */
  580                     return REQUEST_ENOMEM;                  /* can't do it */
  581                 rqe->b.b_iocmd = BIO_READ;                  /* we must read first */
  582             }
  583         }
  584         /*
  585          * We need to lock the address range before
  586          * doing anything.  We don't have to be
  587          * performing a recovery operation: somebody
  588          * else could be doing so, and the results could
  589          * influence us.  Note the fact here, we'll perform
  590          * the lock in launch_requests.
  591          */
  592         rqg->lockbase = m.stripebase;
  593         if (*diskaddr < diskend)                            /* didn't finish the request on this stripe */
  594             plex->multistripe++;                            /* count another one */
  595     }
  596     return REQUEST_OK;
  597 }
  598 
  599 /*
  600  * Helper function for rqe5: adjust the bounds of
  601  * the transfers to minimize the buffer
  602  * allocation.
  603  *
  604  * Each request can handle two of three different
  605  * data ranges:
  606  *
  607  * 1.  The range described by the parameters
  608  *     dataoffset and datalen, for normal read or
  609  *     parityless write.
  610  * 2.  The range described by the parameters
  611  *     groupoffset and grouplen, for recovery read
  612  *     and degraded write.
  613  * 3.  For normal write, the range depends on the
  614  *     kind of block.  For data blocks, the range
  615  *     is defined by dataoffset and datalen.  For
  616  *     parity blocks, it is defined by writeoffset
  617  *     and writelen.
  618  *
  619  * In order not to allocate more memory than
  620  * necessary, this function adjusts the bounds
  621  * parameter for each request to cover just the
  622  * minimum necessary for the function it performs.
  623  * This will normally vary from one request to the
  624  * next.
  625  *
  626  * Things are slightly different for the parity
  627  * block.  In this case, the bounds defined by
  628  * mp->writeoffset and mp->writelen also play a
  629  * rôle.  Select this case by setting the
  630  * parameter forparity != 0.
  631  */
  632 void
  633 setrqebounds(struct rqelement *rqe, struct metrics *mp)
  634 {
  635     /* parity block of a normal write */
  636     if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK))
  637         == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) {         /* case 3 */
  638         if (rqe->flags & XFR_DEGRADED_WRITE) {              /* also degraded write */
  639             /*
  640              * With a combined normal and degraded write, we
  641              * will zero out the area of the degraded write
  642              * in the second phase, so we don't need to read
  643              * it in.  Unfortunately, we need a way to tell
  644              * build_request_buffer the size of the buffer,
  645              * and currently that's the length of the read.
  646              * As a result, we read everything, even the stuff
  647              * that we're going to nuke.
  648              * FIXME XXX
  649              */
  650             if (mp->groupoffset < mp->writeoffset) {        /* group operation starts lower */
  651                 rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
  652                 rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */
  653                 rqe->groupoffset = 0;                       /* and the group at the beginning */
  654             } else {                                        /* individual data starts first */
  655                 rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
  656                 rqe->dataoffset = 0;                        /* individual data starts at the beginning */
  657                 rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */
  658             }
  659             rqe->datalen = mp->writelen;
  660             rqe->grouplen = mp->grouplen;
  661         } else {                                            /* just normal write (case 3) */
  662             rqe->sdoffset = mp->sdbase + mp->writeoffset;   /* start of transfer */
  663             rqe->dataoffset = 0;                            /* degradation starts at the beginning */
  664             rqe->groupoffset = 0;                           /* for tidiness' sake */
  665             rqe->datalen = mp->writelen;
  666             rqe->grouplen = 0;
  667         }
  668     } else if (rqe->flags & XFR_DATAOP) {                   /* data operation (case 1 or 3) */
  669         if (rqe->flags & XFR_GROUPOP) {                     /* also a group operation (case 2) */
  670             if (mp->groupoffset < mp->dataoffset) {         /* group operation starts lower */
  671                 rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
  672                 rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */
  673                 rqe->groupoffset = 0;                       /* and the group at the beginning */
  674             } else {                                        /* individual data starts first */
  675                 rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
  676                 rqe->dataoffset = 0;                        /* individual data starts at the beginning */
  677                 rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */
  678             }
  679             rqe->datalen = mp->datalen;
  680             rqe->grouplen = mp->grouplen;
  681         } else {                                            /* just data operation (case 1) */
  682             rqe->sdoffset = mp->sdbase + mp->dataoffset;    /* start of transfer */
  683             rqe->dataoffset = 0;                            /* degradation starts at the beginning */
  684             rqe->groupoffset = 0;                           /* for tidiness' sake */
  685             rqe->datalen = mp->datalen;
  686             rqe->grouplen = 0;
  687         }
  688     } else {                                                /* just group operations (case 2) */
  689         rqe->sdoffset = mp->sdbase + mp->groupoffset;       /* start of transfer */
  690         rqe->dataoffset = 0;                                /* for tidiness' sake */
  691         rqe->groupoffset = 0;                               /* group starts at the beginining */
  692         rqe->datalen = 0;
  693         rqe->grouplen = mp->grouplen;
  694     }
  695     rqe->buflen = max(rqe->dataoffset + rqe->datalen,       /* total buffer length */
  696         rqe->groupoffset + rqe->grouplen);
  697 }
  698 /* Local Variables: */
  699 /* fill-column: 50 */
  700 /* End: */

Cache object: 681434e5281e78b9e87b05900e5278f2


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.