FreeBSD/Linux Kernel Cross Reference
sys/dev/raidframe/rf_pqdegdags.c


/*      $NetBSD: rf_pqdegdags.c,v 1.9 2004/01/10 00:56:28 oster Exp $   */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Daniel Stodolsky
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_pqdegdags.c
 * Degraded-mode DAGs for double-fault cases.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_pqdegdags.c,v 1.9 2004/01/10 00:56:28 oster Exp $");

#include "rf_archs.h"

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_dagfuncs.h"
#include "rf_dagutils.h"
#include "rf_etimer.h"
#include "rf_acctrace.h"
#include "rf_general.h"
#include "rf_pqdegdags.h"
#include "rf_pq.h"

static void
applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda,
    RF_PhysDiskAddr_t *qpda, void *bp);

/*
 * Two data drives have failed, and we are doing a read that covers one of
 * them.  We may also be reading some of the surviving drives.
 *
 *****************************************************************************************
 *
 * Creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 *                                      Hdr
 *                                       |
 *                                     Block
 *                       /         /           \         \     \   \
 *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
 *                      | \       | \         | \       | \    | \ | \
 *
 *                                 |                 |
 *                              Unblock              X
 *                                  \               /
 *                                   ------ T ------
 *
 * Each R node is a successor of the Block node.
 * One successor arc from each R node goes to the Unblock node, and the
 * other to the X node.
 * There is one Rud for each chunk of surviving user data requested by the
 * user, and one Rrd for each chunk of surviving user data _not_ being read
 * by the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data,
 * q = Q data, X = pq recovery node, T = terminate.
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * Note:  The target buffer for the XOR node is set to the actual user
 * buffer where the failed data is supposed to end up.  This buffer is
 * zeroed by the code here.  Thus, if you create a degraded-read DAG, use
 * it, and then re-use it, you have to be sure to zero the target buffer
 * prior to the re-use.
 *
 * Every buffer read is passed to the pq recovery node, whose job it is to
 * sort out what is needed for recovery and what is not.
 ****************************************************************************************/
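/*
 * A concrete instance (illustrative only, not from the original sources):
 * in a 5-data-column array with columns 2 and 3 failed, a user read that
 * touches columns 1-2 maps to one Rud (column 1, returned to the user),
 * two Rrd (surviving columns 0 and 4, read only for recovery), plus Rp and
 * Rq.  All five read buffers feed the X node, which solves for the missing
 * column-2 data directly into the user's buffer.
 */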
/*   init a disk node with 2 successors and one predecessor */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

#define DISK_NODE_PARAMS(_node_,_p_) \
  (_node_).params[0].p = _p_ ; \
  (_node_).params[1].p = (_p_)->bufPtr; \
  (_node_).params[2].v = parityStripeID; \
  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)

#define DISK_NODE_PDA(node)  ((node)->params[0].p)
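
/*
 * A minimal usage sketch for the helpers above (illustrative only; the
 * real DAG construction is delegated to rf_DoubleDegRead(), and
 * 'rudNodes', 'i', and 'pda' here are hypothetical locals, as are the
 * 'blockNode'/'unblockNode'/'recoveryNode' locals the macros expect):
 */
#if 0
        INIT_DISK_NODE(&rudNodes[i], "Rud");    /* wire Block -> Rud -> {Unblock, X} */
        DISK_NODE_PARAMS(rudNodes[i], pda);     /* attach the disk address and buffer */
#endif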

RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
        rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
            "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}

static void
applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda,
    RF_PhysDiskAddr_t *qpda, void *bp)
{
        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
        RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
        RF_SectorCount_t s0len = ppda->numSector, len;
        RF_SectorNum_t suoffset;
        unsigned coeff;
        char   *pbuf = ppda->bufPtr;
        char   *qbuf = qpda->bufPtr;
        char   *buf;
        int     delta;

        suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
        len = pda->numSector;
        /* see if pda intersects a recovery pda */
        if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
                buf = pda->bufPtr;
                coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
                coeff = (coeff % raidPtr->Layout.numDataCol);

                /*
                 * Range-restrict the pda to the overlap.  The buffer
                 * pointers advance by bytes, so the sector delta is
                 * converted with rf_RaidAddressToByte(); the original code
                 * used rf_RaidAddressToStripeUnitID() here, which yields
                 * stripe unit IDs, not byte offsets.
                 */
                if (suoffset < s0off) {
                        delta = s0off - suoffset;
                        buf += rf_RaidAddressToByte(raidPtr, delta);
                        suoffset = s0off;
                        len -= delta;
                }
                if (suoffset > s0off) {
                        delta = suoffset - s0off;
                        pbuf += rf_RaidAddressToByte(raidPtr, delta);
                        qbuf += rf_RaidAddressToByte(raidPtr, delta);
                }
                if ((suoffset + len) > (s0len + s0off))
                        len = s0len + s0off - suoffset;

                /* src, dest, len */
                rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);

                /* dest, src, len, coeff */
                rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
        }
}
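
/*
 * Worked example of the range restriction above (numbers illustrative,
 * not from the original sources): suppose the recovery pda covers
 * stripe-unit offsets [16, 48) and the data pda covers [0, 32).  The pda
 * starts before the recovery region, so buf advances by 16 sectors' worth
 * of bytes, suoffset becomes 16, and len shrinks from 32 to 16; the
 * trailing clamp leaves len at 16, so exactly the overlap [16, 32) is
 * xored into P and accumulated into Q.
 */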
/*
 * Recover data in the case of a double failure.  There can be two result
 * buffers, one for each chunk of data trying to be recovered.  The params
 * are pdas that have not been range-restricted or otherwise politely
 * massaged; that is done here.  The last params are the pdas of P and Q,
 * followed by the raidPtr.  The list can look like
 *
 *   pda, pda, ... , p pda, q pda, raidptr, asm
 *
 * or
 *
 *   pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
 *
 * depending on whether two chunks of recovery data were required.
 *
 * The second condition only arises if there are two failed buffers whose
 * lengths do not add up to a stripe unit.
 */

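/*
 * For orientation, a minimal byte-wise sketch of the two-erasure P+Q
 * arithmetic (illustrative only: it assumes the conventional RAID-6
 * coefficients c_i = 2^i over GF(2^8) mod x^8+x^4+x^3+x^2+1, which may
 * not match what rf_pq.c actually uses; the real work is done by
 * rf_IncQ() and rf_PQ_recover()).  Once all surviving data has been
 * folded out of P and Q, as applyPDA() does, we have pbar = d_a ^ d_b and
 * qbar = c_a*d_a ^ c_b*d_b, and the 2x2 system solves to
 * d_a = (c_b*pbar ^ qbar) / (c_a ^ c_b), d_b = pbar ^ d_a.
 */
#if 0
#include <stdint.h>             /* standalone userland sketch */

static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
        uint8_t r = 0;

        while (b) {             /* Russian-peasant multiply in GF(2^8) */
                if (b & 1)
                        r ^= a;
                b >>= 1;
                a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
        }
        return (r);
}

static uint8_t
gf_exp2(unsigned n)             /* 2^n in GF(2^8) */
{
        uint8_t r = 1;

        while (n--)
                r = gf_mul(r, 2);
        return (r);
}

static uint8_t
gf_inv(uint8_t a)               /* brute force; a must be nonzero */
{
        uint8_t y = 1;

        while (gf_mul(a, y) != 1)
                y++;
        return (y);
}

/* recover failed bytes d_a (column a) and d_b (column b), a != b */
static void
pq_recover_byte(uint8_t pbar, uint8_t qbar, unsigned a, unsigned b,
    uint8_t *da, uint8_t *db)
{
        uint8_t ca = gf_exp2(a), cb = gf_exp2(b);

        *da = gf_mul(gf_mul(cb, pbar) ^ qbar, gf_inv(ca ^ cb));
        *db = pbar ^ *da;
}
#endif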
int
rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
{
        int     np = node->numParams;
        RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
        RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
        int     d, i;
        unsigned coeff;
        RF_RaidAddr_t sosAddr, suoffset;
        RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
        int     two = 0;
        RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
        char   *buf;
        int     numDataCol = layoutPtr->numDataCol;
        RF_Etimer_t timer;
        RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

        RF_ETIMER_START(timer);

        if (asmap->failedPDAs[1] &&
            (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
                /* two recovery pda pairs; not yet supported, hence the assert */
                RF_ASSERT(0);
                ppda = node->params[np - 6].p;
                ppda2 = node->params[np - 5].p;
                qpda = node->params[np - 4].p;
                qpda2 = node->params[np - 3].p;
                d = (np - 6);
                two = 1;
        } else {
                ppda = node->params[np - 4].p;
                qpda = node->params[np - 3].p;
                d = (np - 4);
        }

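        /*
         * Illustrative parameter arithmetic (numbers made up): in the
         * common single-pair case the tail of params[] is (p pda, q pda,
         * raidPtr, asm), so with three data pdas np = 7 and
         * d = np - 4 = 3; d always counts just the leading data pdas
         * walked below.
         */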
        for (i = 0; i < d; i++) {
                pda = node->params[i].p;
                buf = pda->bufPtr;
                suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
                len = pda->numSector;
                coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
                /* compute the data unit offset within the column */
                coeff = (coeff % raidPtr->Layout.numDataCol);
                /* see if pda intersects a recovery pda */
                applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
                if (two)
                        /*
                         * The original code re-applied ppda/qpda here; the
                         * second recovery pair appears to be what was
                         * intended.
                         */
                        applyPDA(raidPtr, pda, ppda2, qpda2, node->dagHdr->bp);
        }

        /* We have now folded the surviving data out of the parity, so we
         * can recover.  Next, determine the coefficients of the columns
         * that need to be recovered.  Note that only a single stripe unit
         * can be recovered here. */

        if (asmap->failedPDAs[1] == NULL) {     /* only a single stripe unit
                                                 * to recover. */
                pda = asmap->failedPDAs[0];
                sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
                /* need to determine the column of the other failed disk */
                coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
                /* compute the data unit offset within the column */
                coeff = (coeff % raidPtr->Layout.numDataCol);
                for (i = 0; i < numDataCol; i++) {
                        npda.raidAddress = sosAddr + (i * secPerSU);
                        (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
                        /* stop at the dead disk that is not our own column */
                        if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status) &&
                            (i != coeff))
                                break;
                }
                RF_ASSERT(i < numDataCol);
                RF_ASSERT(two == 0);
                /* Recover the data.  Since we only want to recover one
                 * column, we overwrite the parity with the other one. */
                if (coeff < i)  /* recovering 'a' */
                        rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
                else            /* recovering 'b' */
                        rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
        } else
                RF_PANIC();

        RF_ETIMER_STOP(timer);
        RF_ETIMER_EVAL(timer);
        if (tracerec)
                tracerec->q_us += RF_ETIMER_VAL_US(timer);
        rf_GenericWakeupFunc(node, 0);
        return (0);
}

int
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
        /* The situation:
         *
         * We are doing a write that hits only one failed data unit.  The
         * other failed data unit is not being overwritten, so we need to
         * generate it.
         *
         * For the moment, we assume all the nonfailed data being written
         * is in the shadow of the failed data unit (i.e., either a single
         * data unit write or the entire failed stripe unit is being
         * overwritten).
         *
         * Recovery strategy: apply the recovery data to the parity and q.
         * Use P & Q to recover the second failed data unit in P.
         * Zero-fill Q, then apply the recovered data to p.  Then apply the
         * data being written to the failed drive.  Then walk through the
         * surviving drives, applying new data when it exists, otherwise
         * the recovery data.  Quite a mess.
         *
         * The params:
         *
         * read pda0, read pda1, ... read pda (numDataCol-3), write pda0,
         * ... , write pda (numStripeUnitAccess - numDataFailed),
         * failed pda, raidPtr, asmap */
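
        /*
         * In symbols (my notation, not from the original sources): call
         * the failed unit being written d_new (column 'coeff' below) and
         * the failed unit not being written d_b (column 'i').  After the
         * first applyPDA() pass, P and Q hold only the two missing
         * columns, so rf_PQ_recover() can extract d_b; the code below then
         * rebuilds Q = c_i*d_b ^ c_coeff*d_new and P = d_b ^ d_new, and
         * the second applyPDA() pass folds the surviving columns back in.
         */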

        int     np = node->numParams;
        RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
        RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
        int     i;
        RF_RaidAddr_t sosAddr;
        unsigned coeff;
        RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
        RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
        int     numDataCol = layoutPtr->numDataCol;
        RF_Etimer_t timer;
        RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

        RF_ASSERT(node->numResults == 2);
        RF_ASSERT(asmap->failedPDAs[1] == NULL);
        RF_ETIMER_START(timer);
        ppda = node->results[0];
        qpda = node->results[1];
        /* apply the recovery data */
        for (i = 0; i < numDataCol - 2; i++)
                applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

        /* determine the other failed data unit */
        pda = asmap->failedPDAs[0];
        sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
        /* need to determine the column of the other failed disk */
        coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
        /* compute the data unit offset within the column */
        coeff = (coeff % raidPtr->Layout.numDataCol);
        for (i = 0; i < numDataCol; i++) {
                npda.raidAddress = sosAddr + (i * secPerSU);
                (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
                /* stop at the dead disk that is not our own column */
                if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status) &&
                    (i != coeff))
                        break;
        }
        RF_ASSERT(i < numDataCol);
        /* Recover the data.  The column we want to recover we write over
         * the parity; the column we don't care about we dump in q. */
        if (coeff < i)          /* recovering 'a' */
                rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
        else                    /* recovering 'b' */
                rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

        /* The valid data is now in P.  Zero-fill Q, then accumulate the
         * recovered data into it. */
        memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
        rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

        /* now apply all the write data to the buffer */
        /* single stripe unit write case: the failed data is the only thing
         * we are writing. */
        RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
        /* dest, src, len, coeff */
        rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
        rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

        /* now apply all the recovery data */
        for (i = 0; i < numDataCol - 2; i++)
                applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

        RF_ETIMER_STOP(timer);
        RF_ETIMER_EVAL(timer);
        if (tracerec)
                tracerec->q_us += RF_ETIMER_VAL_US(timer);

        rf_GenericWakeupFunc(node, 0);
        return (0);
}
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
        RF_PANIC();
}
/*
 * Two lost data unit write case.
 *
 * There are really two cases here:
 *
 * (1) The write completely covers the two lost data units.  In that case,
 *     a reconstruct write that doesn't write the failed data units will do
 *     the correct thing.  So in this case, the DAG looks like
 *
 *          full stripe read of surviving data units (not being overwritten)
 *          write new data (ignoring failed units)   compute P&Q
 *                                                   write P&Q
 *
 * (2) The write does not completely cover both failed data units (but
 *     touches at least one of them).  Then we need to do the equivalent of
 *     a reconstruct read to recover the missing data unit from the other
 *     stripe.
 *
 *     For any data we are writing that is not in the "shadow" of the
 *     failed units, we need to do a four-cycle update.  We PANIC in this
 *     case, for now.
 */

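/*
 * Worked example of the case split in rf_PQ_200_CreateWriteDAG() below
 * (numbers illustrative only): with sectorsPerSU = 32 and both failed
 * units fully covered, sum = 32 + 32 = 2 * sectorsPerSU, so the
 * large-write path is taken; with one failed unit covered by a full-unit
 * write (nf = 1, sum = 32 >= sectorsPerSU), the simple small-write path is
 * taken; anything else panics.
 */
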
RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
{
        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
        RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
        int     sum;
        int     nf = asmap->numDataFailed;

        sum = asmap->failedPDAs[0]->numSector;
        if (nf == 2)
                sum += asmap->failedPDAs[1]->numSector;

        if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
                /* large write case */
                rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
                return;
        }
        if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
                /* small write case: all user data lies in the shadow of
                 * the failed units */
                rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
                return;
        }
        RF_PANIC();
}
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
        rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
#endif                          /* (RF_INCLUDE_DECL_PQ > 0) ||
                                 * (RF_INCLUDE_RAID6 > 0) */
