FreeBSD/Linux Kernel Cross Reference
sys/dev/raidframe/rf_dagdegwr.c


    1 /*      $NetBSD: rf_dagdegwr.c,v 1.23.2.1 2004/04/11 11:19:58 tron Exp $        */
    2 /*
    3  * Copyright (c) 1995 Carnegie-Mellon University.
    4  * All rights reserved.
    5  *
    6  * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
    7  *
    8  * Permission to use, copy, modify and distribute this software and
    9  * its documentation is hereby granted, provided that both the copyright
   10  * notice and this permission notice appear in all copies of the
   11  * software, derivative works or modified versions, and any portions
   12  * thereof, and that both notices appear in supporting documentation.
   13  *
   14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   17  *
   18  * Carnegie Mellon requests users of this software to return to
   19  *
   20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   21  *  School of Computer Science
   22  *  Carnegie Mellon University
   23  *  Pittsburgh PA 15213-3890
   24  *
   25  * any improvements or extensions that they make and grant Carnegie the
   26  * rights to redistribute these changes.
   27  */
   28 
   29 /*
   30  * rf_dagdegwr.c
   31  *
   32  * code for creating degraded write DAGs
   33  *
   34  */
   35 
   36 #include <sys/cdefs.h>
   37 __KERNEL_RCSID(0, "$NetBSD: rf_dagdegwr.c,v 1.23.2.1 2004/04/11 11:19:58 tron Exp $");
   38 
   39 #include <dev/raidframe/raidframevar.h>
   40 
   41 #include "rf_raid.h"
   42 #include "rf_dag.h"
   43 #include "rf_dagutils.h"
   44 #include "rf_dagfuncs.h"
   45 #include "rf_debugMem.h"
   46 #include "rf_general.h"
   47 #include "rf_dagdegwr.h"
   48 #include "rf_map.h"
   49 
   50 
   51 /******************************************************************************
   52  *
   53  * General comments on DAG creation:
   54  *
   55  * All DAGs in this file use roll-away error recovery.  Each DAG has a single
   56  * commit node, usually called "Cmt."  If an error occurs before the Cmt node
   57  * is reached, the execution engine will halt forward execution and work
   58  * backward through the graph, executing the undo functions.  Assuming that
   59  * each node in the graph prior to the Cmt node are undoable and atomic - or -
   60  * does not make changes to permanent state, the graph will fail atomically.
   61  * If an error occurs after the Cmt node executes, the engine will roll-forward
   62  * through the graph, blindly executing nodes until it reaches the end.
   63  * If a graph reaches the end, it is assumed to have completed successfully.
   64  *
   65  * A graph has only 1 Cmt node.
   66  *
   67  */
   68 
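       /*
        * For the degraded-write graphs built below, the commit point is the
        * "Cmt" node: an error in a block, Rrd, or Xor node rolls the graph
        * backward through the undo functions so the access fails atomically,
        * while an error after Cmt rolls forward, blindly executing the
        * remaining Wnd/Wnp (and Wnq) writes to the end of the graph.
        */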
   69 
   70 /******************************************************************************
   71  *
   72  * The following wrappers map the standard DAG creation interface to the
   73  * DAG creation routines.  Additionally, these wrappers enable experimentation
   74  * with new DAG structures by providing an extra level of indirection, allowing
   75  * the DAG creation routines to be replaced at this single point.
   76  */
   77 
   78 static 
   79 RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
   80 {
   81         rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
   82             flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
   83 }
   84 
   85 void 
   86 rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
   87                           RF_DagHeader_t *dag_h, void *bp, 
   88                           RF_RaidAccessFlags_t flags, 
   89                           RF_AllocListElem_t *allocList)
   90 {
   91 
   92         RF_ASSERT(asmap->numDataFailed == 1);
   93         dag_h->creator = "DegradedWriteDAG";
   94 
   95         /*
   96          * if the access writes only a portion of the failed unit, and also
   97          * writes some portion of at least one surviving unit, we create two
   98          * DAGs, one for the failed component and one for the non-failed
   99          * component, and do them sequentially.  Note that the fact that we're
  100          * accessing only a portion of the failed unit indicates that the
   101  * access either starts or ends in the failed unit, and hence we need to
   102  * create only two DAGs.  This is inefficient in that the same data or
  103          * parity can get read and written twice using this structure.  I need
  104          * to fix this to do the access all at once.
  105          */
  106         RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 &&
  107                     asmap->failedPDAs[0]->numSector !=
  108                         raidPtr->Layout.sectorsPerStripeUnit));
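               /*
                * i.e. either exactly one stripe unit is accessed, or the
                * failed unit is written in full; the mixed partial-write
                * case described in the comment above must not reach this
                * routine.
                */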
  109         rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
  110             allocList);
  111 }
  112 
  113 
  114 
  115 /******************************************************************************
  116  *
  117  * DAG creation code begins here
  118  */
  119 
  120 
  121 
  122 /******************************************************************************
  123  *
  124  * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
  125  * write, which is as follows
  126  *
  127  *                                        / {Wnq} --\
  128  * hdr -> blockNode ->  Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
  129  *                  \  {Rod} /            \  Wnd ---/
  130  *                                        \ {Wnd} -/
  131  *
   132  * commit node: Cmt
  133  *
  134  * IMPORTANT:
  135  * This DAG generator does not work for double-degraded archs since it does not
  136  * generate Q
  137  *
  138  * This dag is essentially identical to the large-write dag, except that the
  139  * write to the failed data unit is suppressed.
  140  *
  141  * IMPORTANT:  this dag does not work in the case where the access writes only
  142  * a portion of the failed unit, and also writes some portion of at least one
  143  * surviving SU.  this case is handled in CreateDegradedWriteDAG above.
  144  *
  145  * The block & unblock nodes are leftovers from a previous version.  They
  146  * do nothing, but I haven't deleted them because it would be a tremendous
  147  * effort to put them back in.
  148  *
   149  * This dag is used whenever one of the data units in a write has failed.
  150  * If it is the parity unit that failed, the nonredundant write dag (below)
  151  * is used.
  152  *****************************************************************************/
  153 
  154 void 
  155 rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr, 
  156                                       RF_AccessStripeMap_t *asmap,
  157                                       RF_DagHeader_t *dag_h, void *bp,
  158                                       RF_RaidAccessFlags_t flags,
  159                                       RF_AllocListElem_t *allocList,
  160                                       int nfaults,
  161                                       int (*redFunc) (RF_DagNode_t *),
  162                                       int allowBufferRecycle)
  163 {
  164         int     nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
  165                 rdnodesFaked;
  166         RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
  167         RF_DagNode_t *wndNodes, *rrdNodes, *xorNode, *commitNode;
  168         RF_DagNode_t *tmpNode, *tmpwndNode, *tmprrdNode;
  169         RF_SectorCount_t sectorsPerSU;
  170         RF_ReconUnitNum_t which_ru;
  171         char   *xorTargetBuf = NULL;    /* the target buffer for the XOR
  172                                          * operation */
  173         char   overlappingPDAs[RF_MAXCOL];/* a temporary array of flags */
  174         RF_AccessStripeMapHeader_t *new_asm_h[2];
  175         RF_PhysDiskAddr_t *pda, *parityPDA;
  176         RF_StripeNum_t parityStripeID;
  177         RF_PhysDiskAddr_t *failedPDA;
  178         RF_RaidLayout_t *layoutPtr;
  179 
  180         layoutPtr = &(raidPtr->Layout);
  181         parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
  182             &which_ru);
  183         sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
  184         /* failedPDA points to the pda within the asm that targets the failed
  185          * disk */
  186         failedPDA = asmap->failedPDAs[0];
  187 
  188 #if RF_DEBUG_DAG
  189         if (rf_dagDebug)
  190                 printf("[Creating degraded-write DAG]\n");
  191 #endif
  192 
  193         RF_ASSERT(asmap->numDataFailed == 1);
  194         dag_h->creator = "SimpleDegradedWriteDAG";
  195 
  196         /*
  197          * Generate two ASMs identifying the surviving data
  198          * we need in order to recover the lost data.
  199          */
  200         /* overlappingPDAs array must be zero'd */
  201         memset(overlappingPDAs, 0, RF_MAXCOL);
  202         rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
  203             &nXorBufs, NULL, overlappingPDAs, allocList);
  204 
  205         /* create all the nodes at once */
  206         nWndNodes = asmap->numStripeUnitsAccessed - 1;  /* no access is
  207                                                          * generated for the
  208                                                          * failed pda */
  209 
  210         nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
  211             ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
  212         /*
  213          * XXX
  214          *
  215          * There's a bug with a complete stripe overwrite- that means 0 reads
  216          * of old data, and the rest of the DAG generation code doesn't like
  217          * that. A release is coming, and I don't wanna risk breaking a critical
  218          * DAG generator, so here's what I'm gonna do- if there's no read nodes,
  219          * I'm gonna fake there being a read node, and I'm gonna swap in a
  220          * no-op node in its place (to make all the link-up code happy).
  221          * This should be fixed at some point.  --jimz
  222          */
  223         if (nRrdNodes == 0) {
  224                 nRrdNodes = 1;
  225                 rdnodesFaked = 1;
  226         } else {
  227                 rdnodesFaked = 0;
  228         }
   229         /* block, commit, unblock, term, xor, Wnd, Rrd, W(nfaults) */
  230         nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
  231 
  232         blockNode = rf_AllocDAGNode();
  233         blockNode->list_next = dag_h->nodes;
  234         dag_h->nodes = blockNode;
  235 
  236         commitNode = rf_AllocDAGNode();
  237         commitNode->list_next = dag_h->nodes;
  238         dag_h->nodes = commitNode;
  239 
  240         unblockNode = rf_AllocDAGNode();
  241         unblockNode->list_next = dag_h->nodes;
  242         dag_h->nodes = unblockNode;
  243 
  244         termNode = rf_AllocDAGNode();
  245         termNode->list_next = dag_h->nodes;
  246         dag_h->nodes = termNode;
  247 
  248         xorNode = rf_AllocDAGNode();
  249         xorNode->list_next = dag_h->nodes;
  250         dag_h->nodes = xorNode;
  251 
  252         wnpNode = rf_AllocDAGNode();
  253         wnpNode->list_next = dag_h->nodes;
  254         dag_h->nodes = wnpNode;
  255 
  256         for (i = 0; i < nWndNodes; i++) {
  257                 tmpNode = rf_AllocDAGNode();
  258                 tmpNode->list_next = dag_h->nodes;
  259                 dag_h->nodes = tmpNode;
  260         }
  261         wndNodes = dag_h->nodes;        
  262 
  263         for (i = 0; i < nRrdNodes; i++) {
  264                 tmpNode = rf_AllocDAGNode();
  265                 tmpNode->list_next = dag_h->nodes;
  266                 dag_h->nodes = tmpNode;
  267         }
  268         rrdNodes = dag_h->nodes;
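               /*
                * Each rf_AllocDAGNode() above pushes the new node onto the
                * front of dag_h->nodes, so wndNodes and rrdNodes point at
                * the head of their respective groups; the fill-in and
                * link-up loops below walk list_next from those heads for
                * nWndNodes (resp. nRrdNodes) steps to visit exactly that
                * group.
                */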
  269 
  270 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
  271         if (nfaults == 2) {
  272                 wnqNode = rf_AllocDAGNode();
  273                 wnqNode->list_next = dag_h->nodes;
  274                 dag_h->nodes = wnqNode;
  275         } else {
  276 #endif
  277                 wnqNode = NULL;
  278 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
  279         }
  280 #endif
  281         RF_ASSERT(i == nNodes);
  282 
   283         /* this dag cannot commit until all Rrd and Xor nodes have completed */
  284         dag_h->numCommitNodes = 1;
  285         dag_h->numCommits = 0;
  286         dag_h->numSuccedents = 1;
  287 
  288         RF_ASSERT(nRrdNodes > 0);
  289         rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
  290             NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
  291         rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
  292             NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
  293         rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
  294             NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
  295         rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
  296             NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
  297         rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
  298             nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);
  299 
  300         /*
  301          * Fill in the Rrd nodes. If any of the rrd buffers are the same size as
  302          * the failed buffer, save a pointer to it so we can use it as the target
  303          * of the XOR. The pdas in the rrd nodes have been range-restricted, so if
  304          * a buffer is the same size as the failed buffer, it must also be at the
  305          * same alignment within the SU.
  306          */
  307         i = 0;
  308         tmprrdNode = rrdNodes;
  309         if (new_asm_h[0]) {
  310                 for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
  311                     i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
  312                     i++, pda = pda->next) {
  313                         rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
  314                             rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
  315                         RF_ASSERT(pda);
  316                         tmprrdNode->params[0].p = pda;
  317                         tmprrdNode->params[1].p = pda->bufPtr;
  318                         tmprrdNode->params[2].v = parityStripeID;
  319                         tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
  320                         tmprrdNode = tmprrdNode->list_next;
  321                 }
  322         }
  323         /* i now equals the number of stripe units accessed in new_asm_h[0] */
   324         /* Note that tmprrdNode simply continues from where the loop above
   325            left off, so there is no need to reassign it. */
  326         if (new_asm_h[1]) {
  327                 for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
  328                     j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
  329                     j++, pda = pda->next) {
  330                         rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
  331                             rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
  332                         RF_ASSERT(pda);
  333                         tmprrdNode->params[0].p = pda;
  334                         tmprrdNode->params[1].p = pda->bufPtr;
  335                         tmprrdNode->params[2].v = parityStripeID;
  336                         tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
  337                         if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
  338                                 xorTargetBuf = pda->bufPtr;
  339                         tmprrdNode = tmprrdNode->list_next;
  340                 }
  341         }
  342         if (rdnodesFaked) {
  343                 /*
  344                  * This is where we'll init that fake noop read node
  345                  * (XXX should the wakeup func be different?)
  346                  */
   347                 /* note that rrdNodes will just be a single node... */
  348                 rf_InitNode(rrdNodes, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
  349                     NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
  350         }
  351         /*
  352          * Make a PDA for the parity unit.  The parity PDA should start at
  353          * the same offset into the SU as the failed PDA.
  354          */
  355         /* Danner comment: I don't think this copy is really necessary. We are
  356          * in one of two cases here. (1) The entire failed unit is written.
  357          * Then asmap->parityInfo will describe the entire parity. (2) We are
  358          * only writing a subset of the failed unit and nothing else. Then the
  359          * asmap->parityInfo describes the failed unit and the copy can also
  360          * be avoided. */
  361 
  362         parityPDA = rf_AllocPhysDiskAddr();
  363         parityPDA->next = dag_h->pda_cleanup_list;
  364         dag_h->pda_cleanup_list = parityPDA;
  365         parityPDA->col = asmap->parityInfo->col;
  366         parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
  367             * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
  368         parityPDA->numSector = failedPDA->numSector;
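               /*
                * Illustration with made-up numbers: if sectorsPerSU is 32,
                * asmap->parityInfo->startSector is 64, and the failed PDA
                * starts 5 sectors into its SU, then startSector above is
                * (64 / 32) * 32 + 5 = 69, i.e. the same offset into the
                * parity SU as the failed data.
                */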
  369 
  370         if (!xorTargetBuf) {
  371                 xorTargetBuf = rf_AllocBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
  372         }
  373         /* init the Wnp node */
  374         rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
  375             rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
  376         wnpNode->params[0].p = parityPDA;
  377         wnpNode->params[1].p = xorTargetBuf;
  378         wnpNode->params[2].v = parityStripeID;
  379         wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
  380 
  381 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
  382         /* fill in the Wnq Node */
  383         if (nfaults == 2) {
  384                 {
  385                         RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
  386                             (RF_PhysDiskAddr_t *), allocList);
  387                         parityPDA->col = asmap->qInfo->col;
  388                         parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
  389                             * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
  390                         parityPDA->numSector = failedPDA->numSector;
  391 
  392                         rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
  393                             rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
  394                         wnqNode->params[0].p = parityPDA;
  395                         RF_MallocAndAdd(xorNode->results[1],
  396                             rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
  397                         wnqNode->params[1].p = xorNode->results[1];
  398                         wnqNode->params[2].v = parityStripeID;
  399                         wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
  400                 }
  401         }
  402 #endif
  403         /* fill in the Wnd nodes */
  404         tmpwndNode = wndNodes;
  405         for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) {
  406                 if (pda == failedPDA) {
  407                         i--;
  408                         continue;
  409                 }
  410                 rf_InitNode(tmpwndNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
  411                     rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
  412                 RF_ASSERT(pda);
  413                 tmpwndNode->params[0].p = pda;
  414                 tmpwndNode->params[1].p = pda->bufPtr;
  415                 tmpwndNode->params[2].v = parityStripeID;
  416                 tmpwndNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
  417                 tmpwndNode = tmpwndNode->list_next;
  418         }
  419 
  420         /* fill in the results of the xor node */
  421         xorNode->results[0] = xorTargetBuf;
  422 
  423         /* fill in the params of the xor node */
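               /*
                * The params are (pda, bufPtr) pairs: one pair per Rrd node
                * (unless the reads were faked), one pair per Wnd node that
                * overlaps the failed PDA, and one pair for the failed PDA
                * itself, followed by the failed PDA and raidPtr as the
                * final two params; the assert on paramNum below checks that
                * this totals 2 * nXorBufs + 2.
                */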
  424 
  425         paramNum = 0;
  426         if (rdnodesFaked == 0) {
  427                 tmprrdNode = rrdNodes;
  428                 for (i = 0; i < nRrdNodes; i++) {
  429                         /* all the Rrd nodes need to be xored together */
  430                         xorNode->params[paramNum++] = tmprrdNode->params[0];
  431                         xorNode->params[paramNum++] = tmprrdNode->params[1];
  432                         tmprrdNode = tmprrdNode->list_next;
  433                 }
  434         }
  435         tmpwndNode = wndNodes;
  436         for (i = 0; i < nWndNodes; i++) {
  437                 /* any Wnd nodes that overlap the failed access need to be
  438                  * xored in */
  439                 if (overlappingPDAs[i]) {
  440                         pda = rf_AllocPhysDiskAddr();
  441                         memcpy((char *) pda, (char *) tmpwndNode->params[0].p, sizeof(RF_PhysDiskAddr_t));
  442                         /* add it into the pda_cleanup_list *after* the copy, TYVM */
  443                         pda->next = dag_h->pda_cleanup_list;
  444                         dag_h->pda_cleanup_list = pda;
  445                         rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
  446                         xorNode->params[paramNum++].p = pda;
  447                         xorNode->params[paramNum++].p = pda->bufPtr;
  448                 }
  449                 tmpwndNode = tmpwndNode->list_next;
  450         }
  451 
  452         /*
  453          * Install the failed PDA into the xor param list so that the
  454          * new data gets xor'd in.
  455          */
  456         xorNode->params[paramNum++].p = failedPDA;
  457         xorNode->params[paramNum++].p = failedPDA->bufPtr;
  458 
  459         /*
  460          * The last 2 params to the recovery xor node are always the failed
  461          * PDA and the raidPtr. install the failedPDA even though we have just
  462          * done so above. This allows us to use the same XOR function for both
  463          * degraded reads and degraded writes.
  464          */
  465         xorNode->params[paramNum++].p = failedPDA;
  466         xorNode->params[paramNum++].p = raidPtr;
  467         RF_ASSERT(paramNum == 2 * nXorBufs + 2);
  468 
  469         /*
  470          * Code to link nodes begins here
  471          */
  472 
  473         /* link header to block node */
  474         RF_ASSERT(blockNode->numAntecedents == 0);
  475         dag_h->succedents[0] = blockNode;
  476 
  477         /* link block node to rd nodes */
  478         RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
  479         tmprrdNode = rrdNodes;
  480         for (i = 0; i < nRrdNodes; i++) {
  481                 RF_ASSERT(tmprrdNode->numAntecedents == 1);
  482                 blockNode->succedents[i] = tmprrdNode;
  483                 tmprrdNode->antecedents[0] = blockNode;
  484                 tmprrdNode->antType[0] = rf_control;
  485                 tmprrdNode = tmprrdNode->list_next;
  486         }
  487 
  488         /* link read nodes to xor node */
  489         RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
  490         tmprrdNode = rrdNodes;
  491         for (i = 0; i < nRrdNodes; i++) {
  492                 RF_ASSERT(tmprrdNode->numSuccedents == 1);
  493                 tmprrdNode->succedents[0] = xorNode;
  494                 xorNode->antecedents[i] = tmprrdNode;
  495                 xorNode->antType[i] = rf_trueData;
  496                 tmprrdNode = tmprrdNode->list_next;
  497         }
  498 
  499         /* link xor node to commit node */
  500         RF_ASSERT(xorNode->numSuccedents == 1);
  501         RF_ASSERT(commitNode->numAntecedents == 1);
  502         xorNode->succedents[0] = commitNode;
  503         commitNode->antecedents[0] = xorNode;
  504         commitNode->antType[0] = rf_control;
  505 
  506         /* link commit node to wnd nodes */
  507         RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
  508         tmpwndNode = wndNodes;
  509         for (i = 0; i < nWndNodes; i++) {
  510                 RF_ASSERT(tmpwndNode->numAntecedents == 1);
  511                 commitNode->succedents[i] = tmpwndNode;
  512                 tmpwndNode->antecedents[0] = commitNode;
  513                 tmpwndNode->antType[0] = rf_control;
  514                 tmpwndNode = tmpwndNode->list_next;
  515         }
  516 
  517         /* link the commit node to wnp, wnq nodes */
  518         RF_ASSERT(wnpNode->numAntecedents == 1);
  519         commitNode->succedents[nWndNodes] = wnpNode;
  520         wnpNode->antecedents[0] = commitNode;
  521         wnpNode->antType[0] = rf_control;
  522 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
  523         if (nfaults == 2) {
  524                 RF_ASSERT(wnqNode->numAntecedents == 1);
  525                 commitNode->succedents[nWndNodes + 1] = wnqNode;
  526                 wnqNode->antecedents[0] = commitNode;
  527                 wnqNode->antType[0] = rf_control;
  528         }
  529 #endif
  530         /* link write new data nodes to unblock node */
  531         RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
  532         tmpwndNode = wndNodes;
  533         for (i = 0; i < nWndNodes; i++) {
  534                 RF_ASSERT(tmpwndNode->numSuccedents == 1);
  535                 tmpwndNode->succedents[0] = unblockNode;
  536                 unblockNode->antecedents[i] = tmpwndNode;
  537                 unblockNode->antType[i] = rf_control;
  538                 tmpwndNode = tmpwndNode->list_next;
  539         }
  540 
  541         /* link write new parity node to unblock node */
  542         RF_ASSERT(wnpNode->numSuccedents == 1);
  543         wnpNode->succedents[0] = unblockNode;
  544         unblockNode->antecedents[nWndNodes] = wnpNode;
  545         unblockNode->antType[nWndNodes] = rf_control;
  546 
  547 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
  548         /* link write new q node to unblock node */
  549         if (nfaults == 2) {
  550                 RF_ASSERT(wnqNode->numSuccedents == 1);
  551                 wnqNode->succedents[0] = unblockNode;
  552                 unblockNode->antecedents[nWndNodes + 1] = wnqNode;
  553                 unblockNode->antType[nWndNodes + 1] = rf_control;
  554         }
  555 #endif
  556         /* link unblock node to term node */
  557         RF_ASSERT(unblockNode->numSuccedents == 1);
  558         RF_ASSERT(termNode->numAntecedents == 1);
  559         RF_ASSERT(termNode->numSuccedents == 0);
  560         unblockNode->succedents[0] = termNode;
  561         termNode->antecedents[0] = unblockNode;
  562         termNode->antType[0] = rf_control;
  563 }
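       /*
        * CONS_PDA fills in *pda_p from the given asmap field (parityInfo or
        * qInfo), restricted to the region [start, start + num) within the
        * stripe unit, and allocates a fresh buffer for it.
        */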
  564 #define CONS_PDA(if,start,num) \
  565   pda_p->col = asmap->if->col; \
  566   pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
  567   pda_p->numSector = num; \
  568   pda_p->next = NULL; \
  569   RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList)
  570 #if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
  571 void 
  572 rf_WriteGenerateFailedAccessASMs(
  573     RF_Raid_t * raidPtr,
  574     RF_AccessStripeMap_t * asmap,
  575     RF_PhysDiskAddr_t ** pdap,
  576     int *nNodep,
  577     RF_PhysDiskAddr_t ** pqpdap,
  578     int *nPQNodep,
  579     RF_AllocListElem_t * allocList)
  580 {
  581         RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
  582         int     PDAPerDisk, i;
  583         RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
  584         int     numDataCol = layoutPtr->numDataCol;
  585         int     state;
  586         unsigned napdas;
  587         RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
  588         RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
  589         RF_PhysDiskAddr_t *pda_p;
  590         RF_RaidAddr_t sosAddr;
  591 
   592         /* determine how many pdas we will have to generate per unaccessed
   593          * stripe unit. If there is only one failed data unit, it is one; if
   594          * two, possibly two, depending on whether they overlap. */
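               /*
                * The "state" value set below selects a case in the switch
                * further down: 1 = a single failed unit, 2 = two failed
                * units whose regions together exceed an SU (treated as one
                * full-SU region), 3 = two failed units handled as two
                * separate regions per surviving disk.
                */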
  595 
  596         fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
  597         fone_end = fone_start + fone->numSector;
  598 
  599         if (asmap->numDataFailed == 1) {
  600                 PDAPerDisk = 1;
  601                 state = 1;
  602                 RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
  603                 pda_p = *pqpdap;
  604                 /* build p */
  605                 CONS_PDA(parityInfo, fone_start, fone->numSector);
  606                 pda_p->type = RF_PDA_TYPE_PARITY;
  607                 pda_p++;
  608                 /* build q */
  609                 CONS_PDA(qInfo, fone_start, fone->numSector);
  610                 pda_p->type = RF_PDA_TYPE_Q;
  611         } else {
  612                 ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
  613                 ftwo_end = ftwo_start + ftwo->numSector;
  614                 if (fone->numSector + ftwo->numSector > secPerSU) {
  615                         PDAPerDisk = 1;
  616                         state = 2;
  617                         RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
  618                         pda_p = *pqpdap;
  619                         CONS_PDA(parityInfo, 0, secPerSU);
  620                         pda_p->type = RF_PDA_TYPE_PARITY;
  621                         pda_p++;
  622                         CONS_PDA(qInfo, 0, secPerSU);
  623                         pda_p->type = RF_PDA_TYPE_Q;
  624                 } else {
  625                         PDAPerDisk = 2;
  626                         state = 3;
  627                         /* four of them, fone, then ftwo */
  628                         RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
  629                         pda_p = *pqpdap;
  630                         CONS_PDA(parityInfo, fone_start, fone->numSector);
  631                         pda_p->type = RF_PDA_TYPE_PARITY;
  632                         pda_p++;
  633                         CONS_PDA(qInfo, fone_start, fone->numSector);
  634                         pda_p->type = RF_PDA_TYPE_Q;
  635                         pda_p++;
  636                         CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
  637                         pda_p->type = RF_PDA_TYPE_PARITY;
  638                         pda_p++;
  639                         CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
  640                         pda_p->type = RF_PDA_TYPE_Q;
  641                 }
  642         }
  643         /* figure out number of nonaccessed pda */
  644         napdas = PDAPerDisk * (numDataCol - 2);
  645         *nPQNodep = PDAPerDisk;
  646 
  647         *nNodep = napdas;
  648         if (napdas == 0)
  649                 return;         /* short circuit */
  650 
  651         /* allocate up our list of pda's */
  652 
  653         RF_MallocAndAdd(pda_p, napdas * sizeof(RF_PhysDiskAddr_t), 
  654                         (RF_PhysDiskAddr_t *), allocList);
  655         *pdap = pda_p;
  656 
  657         /* linkem together */
  658         for (i = 0; i < (napdas - 1); i++)
  659                 pda_p[i].next = pda_p + (i + 1);
  660 
  661         sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
  662         for (i = 0; i < numDataCol; i++) {
  663                 if ((pda_p - (*pdap)) == napdas)
  664                         continue;
  665                 pda_p->type = RF_PDA_TYPE_DATA;
  666                 pda_p->raidAddress = sosAddr + (i * secPerSU);
  667                 (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
  668                 /* skip over dead disks */
  669                 if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status))
  670                         continue;
  671                 switch (state) {
  672                 case 1: /* fone */
  673                         pda_p->numSector = fone->numSector;
  674                         pda_p->raidAddress += fone_start;
  675                         pda_p->startSector += fone_start;
  676                         RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
  677                         break;
  678                 case 2: /* full stripe */
  679                         pda_p->numSector = secPerSU;
  680                         RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
  681                         break;
  682                 case 3: /* two slabs */
  683                         pda_p->numSector = fone->numSector;
  684                         pda_p->raidAddress += fone_start;
  685                         pda_p->startSector += fone_start;
  686                         RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
  687                         pda_p++;
  688                         pda_p->type = RF_PDA_TYPE_DATA;
  689                         pda_p->raidAddress = sosAddr + (i * secPerSU);
  690                         (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
  691                         pda_p->numSector = ftwo->numSector;
  692                         pda_p->raidAddress += ftwo_start;
  693                         pda_p->startSector += ftwo_start;
  694                         RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
  695                         break;
  696                 default:
  697                         RF_PANIC();
  698                 }
  699                 pda_p++;
  700         }
  701 
  702         RF_ASSERT(pda_p - *pdap == napdas);
  703         return;
  704 }
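       /*
        * DISK_NODE_PDA yields a disk node's pda parameter; DISK_NODE_PARAMS
        * fills in the standard four parameters of a disk I/O node: the pda,
        * its buffer, the parity stripe ID, and the priority/reconstruction-
        * unit value built by RF_CREATE_PARAM3.
        */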
  705 #define DISK_NODE_PDA(node)  ((node)->params[0].p)
  706 
  707 #define DISK_NODE_PARAMS(_node_,_p_) \
  708   (_node_).params[0].p = _p_ ; \
  709   (_node_).params[1].p = (_p_)->bufPtr; \
  710   (_node_).params[2].v = parityStripeID; \
  711   (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)
  712 
  713 void 
  714 rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
  715                        RF_DagHeader_t *dag_h, void *bp,
  716                        RF_RaidAccessFlags_t flags,
  717                        RF_AllocListElem_t *allocList,
  718                        char *redundantReadNodeName,
  719                        char *redundantWriteNodeName,
  720                        char *recoveryNodeName,
  721                        int (*recovFunc) (RF_DagNode_t *))
  722 {
  723         RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
  724         RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
  725                *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
  726         RF_PhysDiskAddr_t *pda, *pqPDAs;
  727         RF_PhysDiskAddr_t *npdas;
  728         int     nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
  729         RF_ReconUnitNum_t which_ru;
  730         int     nPQNodes;
  731         RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);
  732 
  733         /* simple small write case - First part looks like a reconstruct-read
  734          * of the failed data units. Then a write of all data units not
  735          * failed. */
  736 
  737 
   738         /* Hdr -> Block -> { Rrd ... Rrd, Rp, Rq } -> PQ (recovery)
   739          *        PQ (recovery) -> { Wud ..., Wp, Wq } -> Unblock -> T
   740          *
   741          * Rrd = read recovery data (potentially none)
   742          * Wud = write user data (not incl. failed disks)
   743          * Wp  = Write P (could be two)
   744          * Wq  = Write Q (could be two)
   745          *
   746          */
  747 
  748         rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);
  749 
  750         RF_ASSERT(asmap->numDataFailed == 1);
  751 
  752         nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
  753         nReadNodes = nRrdNodes + 2 * nPQNodes;
  754         nWriteNodes = nWudNodes + 2 * nPQNodes;
  755         nNodes = 4 + nReadNodes + nWriteNodes;
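               /* the extra 4 are the block, unblock, term, and recovery nodes */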
  756 
  757         RF_MallocAndAdd(nodes, nNodes * sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
  758         blockNode = nodes;
  759         unblockNode = blockNode + 1;
  760         termNode = unblockNode + 1;
  761         recoveryNode = termNode + 1;
  762         rrdNodes = recoveryNode + 1;
  763         rpNodes = rrdNodes + nRrdNodes;
  764         rqNodes = rpNodes + nPQNodes;
  765         wudNodes = rqNodes + nPQNodes;
  766         wpNodes = wudNodes + nWudNodes;
  767         wqNodes = wpNodes + nPQNodes;
  768 
  769         dag_h->creator = "PQ_DDSimpleSmallWrite";
  770         dag_h->numSuccedents = 1;
  771         dag_h->succedents[0] = blockNode;
  772         rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
  773         termNode->antecedents[0] = unblockNode;
  774         termNode->antType[0] = rf_control;
  775 
  776         /* init the block and unblock nodes */
  777         /* The block node has all the read nodes as successors */
  778         rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
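               /*
                * rrdNodes, rpNodes, and rqNodes are laid out contiguously in
                * the nodes array, so rrdNodes + i for i < nReadNodes covers
                * all of the read nodes; the unblock loop below relies on the
                * same contiguity for wudNodes, wpNodes, and wqNodes.
                */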
  779         for (i = 0; i < nReadNodes; i++)
  780                 blockNode->succedents[i] = rrdNodes + i;
  781 
  782         /* The unblock node has all the writes as successors */
  783         rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
  784         for (i = 0; i < nWriteNodes; i++) {
  785                 unblockNode->antecedents[i] = wudNodes + i;
  786                 unblockNode->antType[i] = rf_control;
  787         }
  788         unblockNode->succedents[0] = termNode;
  789 
  790 #define INIT_READ_NODE(node,name) \
  791   rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
  792   (node)->succedents[0] = recoveryNode; \
  793   (node)->antecedents[0] = blockNode; \
  794   (node)->antType[0] = rf_control;
  795 
  796         /* build the read nodes */
  797         pda = npdas;
  798         for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
  799                 INIT_READ_NODE(rrdNodes + i, "rrd");
  800                 DISK_NODE_PARAMS(rrdNodes[i], pda);
  801         }
  802 
  803         /* read redundancy pdas */
  804         pda = pqPDAs;
  805         INIT_READ_NODE(rpNodes, "Rp");
  806         RF_ASSERT(pda);
  807         DISK_NODE_PARAMS(rpNodes[0], pda);
  808         pda++;
  809         INIT_READ_NODE(rqNodes, redundantReadNodeName);
  810         RF_ASSERT(pda);
  811         DISK_NODE_PARAMS(rqNodes[0], pda);
  812         if (nPQNodes == 2) {
  813                 pda++;
  814                 INIT_READ_NODE(rpNodes + 1, "Rp");
  815                 RF_ASSERT(pda);
  816                 DISK_NODE_PARAMS(rpNodes[1], pda);
  817                 pda++;
  818                 INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
  819                 RF_ASSERT(pda);
  820                 DISK_NODE_PARAMS(rqNodes[1], pda);
  821         }
   822         /* the recovery node has all reads as predecessors and all writes as
  823          * successors. It generates a result for every write P or write Q
  824          * node. As parameters, it takes a pda per read and a pda per stripe
  825          * of user data written. It also takes as the last params the raidPtr
  826          * and asm. For results, it takes PDA for P & Q. */
  827 
  828 
  829         rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
   830             nWriteNodes,        /* successors */
  831             nReadNodes,         /* preds */
  832             nReadNodes + nWudNodes + 3, /* params */
  833             2 * nPQNodes,       /* results */
  834             dag_h, recoveryNodeName, allocList);
  835 
  836 
  837 
  838         for (i = 0; i < nReadNodes; i++) {
  839                 recoveryNode->antecedents[i] = rrdNodes + i;
  840                 recoveryNode->antType[i] = rf_control;
  841                 recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
  842         }
  843         for (i = 0; i < nWudNodes; i++) {
  844                 recoveryNode->succedents[i] = wudNodes + i;
  845         }
  846         recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
  847         recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
  848         recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;
  849 
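               /*
                * The Wp/Wq nodes follow wudNodes contiguously, so continuing
                * the same index fills in the remaining write successors.
                */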
  850         for (; i < nWriteNodes; i++)
  851                 recoveryNode->succedents[i] = wudNodes + i;
  852 
  853         pda = pqPDAs;
  854         recoveryNode->results[0] = pda;
  855         pda++;
  856         recoveryNode->results[1] = pda;
  857         if (nPQNodes == 2) {
  858                 pda++;
  859                 recoveryNode->results[2] = pda;
  860                 pda++;
  861                 recoveryNode->results[3] = pda;
  862         }
  863         /* fill writes */
  864 #define INIT_WRITE_NODE(node,name) \
  865   rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
  866     (node)->succedents[0] = unblockNode; \
  867     (node)->antecedents[0] = recoveryNode; \
  868     (node)->antType[0] = rf_control;
  869 
  870         pda = asmap->physInfo;
  871         for (i = 0; i < nWudNodes; i++) {
  872                 INIT_WRITE_NODE(wudNodes + i, "Wd");
  873                 DISK_NODE_PARAMS(wudNodes[i], pda);
  874                 recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i);
  875                 pda = pda->next;
  876         }
  877         /* write redundancy pdas */
  878         pda = pqPDAs;
  879         INIT_WRITE_NODE(wpNodes, "Wp");
  880         RF_ASSERT(pda);
  881         DISK_NODE_PARAMS(wpNodes[0], pda);
  882         pda++;
  883         INIT_WRITE_NODE(wqNodes, "Wq");
  884         RF_ASSERT(pda);
  885         DISK_NODE_PARAMS(wqNodes[0], pda);
  886         if (nPQNodes == 2) {
  887                 pda++;
  888                 INIT_WRITE_NODE(wpNodes + 1, "Wp");
  889                 RF_ASSERT(pda);
  890                 DISK_NODE_PARAMS(wpNodes[1], pda);
  891                 pda++;
  892                 INIT_WRITE_NODE(wqNodes + 1, "Wq");
  893                 RF_ASSERT(pda);
  894                 DISK_NODE_PARAMS(wqNodes[1], pda);
  895         }
  896 }
  897 #endif   /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */
