1 /* $NetBSD: rf_dagdegwr.c,v 1.23.2.1 2004/04/11 11:19:58 tron Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /*
30 * rf_dagdegwr.c
31 *
32 * code for creating degraded write DAGs
33 *
34 */
35
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rf_dagdegwr.c,v 1.23.2.1 2004/04/11 11:19:58 tron Exp $");
38
39 #include <dev/raidframe/raidframevar.h>
40
41 #include "rf_raid.h"
42 #include "rf_dag.h"
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_debugMem.h"
46 #include "rf_general.h"
47 #include "rf_dagdegwr.h"
48 #include "rf_map.h"
49
50
51 /******************************************************************************
52 *
53 * General comments on DAG creation:
54 *
55 * All DAGs in this file use roll-away error recovery. Each DAG has a single
56 * commit node, usually called "Cmt." If an error occurs before the Cmt node
57 * is reached, the execution engine will halt forward execution and work
58 * backward through the graph, executing the undo functions. Assuming that
59 * each node in the graph prior to the Cmt node are undoable and atomic - or -
60 * does not make changes to permanent state, the graph will fail atomically.
61 * If an error occurs after the Cmt node executes, the engine will roll-forward
62 * through the graph, blindly executing nodes until it reaches the end.
63 * If a graph reaches the end, it is assumed to have completed successfully.
64 *
65 * A graph has only 1 Cmt node.
66 *
67 */
68
69
70 /******************************************************************************
71 *
72 * The following wrappers map the standard DAG creation interface to the
73 * DAG creation routines. Additionally, these wrappers enable experimentation
74 * with new DAG structures by providing an extra level of indirection, allowing
75 * the DAG creation routines to be replaced at this single point.
76 */
77
/*
 * Wrapper mapping the standard DAG-creation interface onto the common
 * degraded-write generator for the single-fault (XOR parity) case:
 * nfaults = 1, recovery function = rf_RecoveryXorFunc, and buffer
 * recycling allowed (a read buffer of the right size may be reused as
 * the XOR target).
 */
static
RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
{
	rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
	    flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
}
84
85 void
86 rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
87 RF_DagHeader_t *dag_h, void *bp,
88 RF_RaidAccessFlags_t flags,
89 RF_AllocListElem_t *allocList)
90 {
91
92 RF_ASSERT(asmap->numDataFailed == 1);
93 dag_h->creator = "DegradedWriteDAG";
94
95 /*
96 * if the access writes only a portion of the failed unit, and also
97 * writes some portion of at least one surviving unit, we create two
98 * DAGs, one for the failed component and one for the non-failed
99 * component, and do them sequentially. Note that the fact that we're
100 * accessing only a portion of the failed unit indicates that the
101 * access either starts or ends in the failed unit, and hence we need
102 * create only two dags. This is inefficient in that the same data or
103 * parity can get read and written twice using this structure. I need
104 * to fix this to do the access all at once.
105 */
106 RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 &&
107 asmap->failedPDAs[0]->numSector !=
108 raidPtr->Layout.sectorsPerStripeUnit));
109 rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
110 allocList);
111 }
112
113
114
115 /******************************************************************************
116 *
117 * DAG creation code begins here
118 */
119
120
121
122 /******************************************************************************
123 *
124 * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
125 * write, which is as follows
126 *
127 * / {Wnq} --\
128 * hdr -> blockNode -> Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
129 * \ {Rod} / \ Wnd ---/
130 * \ {Wnd} -/
131 *
132 * commit nodes: Xor, Wnd
133 *
134 * IMPORTANT:
135 * This DAG generator does not work for double-degraded archs since it does not
136 * generate Q
137 *
138 * This dag is essentially identical to the large-write dag, except that the
139 * write to the failed data unit is suppressed.
140 *
141 * IMPORTANT: this dag does not work in the case where the access writes only
142 * a portion of the failed unit, and also writes some portion of at least one
143 * surviving SU. this case is handled in CreateDegradedWriteDAG above.
144 *
145 * The block & unblock nodes are leftovers from a previous version. They
146 * do nothing, but I haven't deleted them because it would be a tremendous
147 * effort to put them back in.
148 *
149 * This dag is used whenever a one of the data units in a write has failed.
150 * If it is the parity unit that failed, the nonredundant write dag (below)
151 * is used.
152 *****************************************************************************/
153
154 void
155 rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr,
156 RF_AccessStripeMap_t *asmap,
157 RF_DagHeader_t *dag_h, void *bp,
158 RF_RaidAccessFlags_t flags,
159 RF_AllocListElem_t *allocList,
160 int nfaults,
161 int (*redFunc) (RF_DagNode_t *),
162 int allowBufferRecycle)
163 {
164 int nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
165 rdnodesFaked;
166 RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
167 RF_DagNode_t *wndNodes, *rrdNodes, *xorNode, *commitNode;
168 RF_DagNode_t *tmpNode, *tmpwndNode, *tmprrdNode;
169 RF_SectorCount_t sectorsPerSU;
170 RF_ReconUnitNum_t which_ru;
171 char *xorTargetBuf = NULL; /* the target buffer for the XOR
172 * operation */
173 char overlappingPDAs[RF_MAXCOL];/* a temporary array of flags */
174 RF_AccessStripeMapHeader_t *new_asm_h[2];
175 RF_PhysDiskAddr_t *pda, *parityPDA;
176 RF_StripeNum_t parityStripeID;
177 RF_PhysDiskAddr_t *failedPDA;
178 RF_RaidLayout_t *layoutPtr;
179
180 layoutPtr = &(raidPtr->Layout);
181 parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
182 &which_ru);
183 sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
184 /* failedPDA points to the pda within the asm that targets the failed
185 * disk */
186 failedPDA = asmap->failedPDAs[0];
187
188 #if RF_DEBUG_DAG
189 if (rf_dagDebug)
190 printf("[Creating degraded-write DAG]\n");
191 #endif
192
193 RF_ASSERT(asmap->numDataFailed == 1);
194 dag_h->creator = "SimpleDegradedWriteDAG";
195
196 /*
197 * Generate two ASMs identifying the surviving data
198 * we need in order to recover the lost data.
199 */
200 /* overlappingPDAs array must be zero'd */
201 memset(overlappingPDAs, 0, RF_MAXCOL);
202 rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
203 &nXorBufs, NULL, overlappingPDAs, allocList);
204
205 /* create all the nodes at once */
206 nWndNodes = asmap->numStripeUnitsAccessed - 1; /* no access is
207 * generated for the
208 * failed pda */
209
210 nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
211 ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
212 /*
213 * XXX
214 *
215 * There's a bug with a complete stripe overwrite- that means 0 reads
216 * of old data, and the rest of the DAG generation code doesn't like
217 * that. A release is coming, and I don't wanna risk breaking a critical
218 * DAG generator, so here's what I'm gonna do- if there's no read nodes,
219 * I'm gonna fake there being a read node, and I'm gonna swap in a
220 * no-op node in its place (to make all the link-up code happy).
221 * This should be fixed at some point. --jimz
222 */
223 if (nRrdNodes == 0) {
224 nRrdNodes = 1;
225 rdnodesFaked = 1;
226 } else {
227 rdnodesFaked = 0;
228 }
229 /* lock, unlock, xor, Wnd, Rrd, W(nfaults) */
230 nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
231
232 blockNode = rf_AllocDAGNode();
233 blockNode->list_next = dag_h->nodes;
234 dag_h->nodes = blockNode;
235
236 commitNode = rf_AllocDAGNode();
237 commitNode->list_next = dag_h->nodes;
238 dag_h->nodes = commitNode;
239
240 unblockNode = rf_AllocDAGNode();
241 unblockNode->list_next = dag_h->nodes;
242 dag_h->nodes = unblockNode;
243
244 termNode = rf_AllocDAGNode();
245 termNode->list_next = dag_h->nodes;
246 dag_h->nodes = termNode;
247
248 xorNode = rf_AllocDAGNode();
249 xorNode->list_next = dag_h->nodes;
250 dag_h->nodes = xorNode;
251
252 wnpNode = rf_AllocDAGNode();
253 wnpNode->list_next = dag_h->nodes;
254 dag_h->nodes = wnpNode;
255
256 for (i = 0; i < nWndNodes; i++) {
257 tmpNode = rf_AllocDAGNode();
258 tmpNode->list_next = dag_h->nodes;
259 dag_h->nodes = tmpNode;
260 }
261 wndNodes = dag_h->nodes;
262
263 for (i = 0; i < nRrdNodes; i++) {
264 tmpNode = rf_AllocDAGNode();
265 tmpNode->list_next = dag_h->nodes;
266 dag_h->nodes = tmpNode;
267 }
268 rrdNodes = dag_h->nodes;
269
270 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
271 if (nfaults == 2) {
272 wnqNode = rf_AllocDAGNode();
273 wnqNode->list_next = dag_h->nodes;
274 dag_h->nodes = wnqNode;
275 } else {
276 #endif
277 wnqNode = NULL;
278 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
279 }
280 #endif
281 RF_ASSERT(i == nNodes);
282
283 /* this dag can not commit until all rrd and xor Nodes have completed */
284 dag_h->numCommitNodes = 1;
285 dag_h->numCommits = 0;
286 dag_h->numSuccedents = 1;
287
288 RF_ASSERT(nRrdNodes > 0);
289 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
290 NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
291 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
292 NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
293 rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
294 NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
295 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
296 NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
297 rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
298 nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);
299
300 /*
301 * Fill in the Rrd nodes. If any of the rrd buffers are the same size as
302 * the failed buffer, save a pointer to it so we can use it as the target
303 * of the XOR. The pdas in the rrd nodes have been range-restricted, so if
304 * a buffer is the same size as the failed buffer, it must also be at the
305 * same alignment within the SU.
306 */
307 i = 0;
308 tmprrdNode = rrdNodes;
309 if (new_asm_h[0]) {
310 for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
311 i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
312 i++, pda = pda->next) {
313 rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
314 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
315 RF_ASSERT(pda);
316 tmprrdNode->params[0].p = pda;
317 tmprrdNode->params[1].p = pda->bufPtr;
318 tmprrdNode->params[2].v = parityStripeID;
319 tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
320 tmprrdNode = tmprrdNode->list_next;
321 }
322 }
323 /* i now equals the number of stripe units accessed in new_asm_h[0] */
324 /* Note that for tmprrdNode, this means a continuation from above, so no need to
325 assign it anything.. */
326 if (new_asm_h[1]) {
327 for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
328 j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
329 j++, pda = pda->next) {
330 rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
331 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
332 RF_ASSERT(pda);
333 tmprrdNode->params[0].p = pda;
334 tmprrdNode->params[1].p = pda->bufPtr;
335 tmprrdNode->params[2].v = parityStripeID;
336 tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
337 if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
338 xorTargetBuf = pda->bufPtr;
339 tmprrdNode = tmprrdNode->list_next;
340 }
341 }
342 if (rdnodesFaked) {
343 /*
344 * This is where we'll init that fake noop read node
345 * (XXX should the wakeup func be different?)
346 */
347 /* node that rrdNodes will just be a single node... */
348 rf_InitNode(rrdNodes, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
349 NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
350 }
351 /*
352 * Make a PDA for the parity unit. The parity PDA should start at
353 * the same offset into the SU as the failed PDA.
354 */
355 /* Danner comment: I don't think this copy is really necessary. We are
356 * in one of two cases here. (1) The entire failed unit is written.
357 * Then asmap->parityInfo will describe the entire parity. (2) We are
358 * only writing a subset of the failed unit and nothing else. Then the
359 * asmap->parityInfo describes the failed unit and the copy can also
360 * be avoided. */
361
362 parityPDA = rf_AllocPhysDiskAddr();
363 parityPDA->next = dag_h->pda_cleanup_list;
364 dag_h->pda_cleanup_list = parityPDA;
365 parityPDA->col = asmap->parityInfo->col;
366 parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
367 * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
368 parityPDA->numSector = failedPDA->numSector;
369
370 if (!xorTargetBuf) {
371 xorTargetBuf = rf_AllocBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
372 }
373 /* init the Wnp node */
374 rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
375 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
376 wnpNode->params[0].p = parityPDA;
377 wnpNode->params[1].p = xorTargetBuf;
378 wnpNode->params[2].v = parityStripeID;
379 wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
380
381 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
382 /* fill in the Wnq Node */
383 if (nfaults == 2) {
384 {
385 RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
386 (RF_PhysDiskAddr_t *), allocList);
387 parityPDA->col = asmap->qInfo->col;
388 parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
389 * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
390 parityPDA->numSector = failedPDA->numSector;
391
392 rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
393 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
394 wnqNode->params[0].p = parityPDA;
395 RF_MallocAndAdd(xorNode->results[1],
396 rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
397 wnqNode->params[1].p = xorNode->results[1];
398 wnqNode->params[2].v = parityStripeID;
399 wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
400 }
401 }
402 #endif
403 /* fill in the Wnd nodes */
404 tmpwndNode = wndNodes;
405 for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) {
406 if (pda == failedPDA) {
407 i--;
408 continue;
409 }
410 rf_InitNode(tmpwndNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
411 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
412 RF_ASSERT(pda);
413 tmpwndNode->params[0].p = pda;
414 tmpwndNode->params[1].p = pda->bufPtr;
415 tmpwndNode->params[2].v = parityStripeID;
416 tmpwndNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
417 tmpwndNode = tmpwndNode->list_next;
418 }
419
420 /* fill in the results of the xor node */
421 xorNode->results[0] = xorTargetBuf;
422
423 /* fill in the params of the xor node */
424
425 paramNum = 0;
426 if (rdnodesFaked == 0) {
427 tmprrdNode = rrdNodes;
428 for (i = 0; i < nRrdNodes; i++) {
429 /* all the Rrd nodes need to be xored together */
430 xorNode->params[paramNum++] = tmprrdNode->params[0];
431 xorNode->params[paramNum++] = tmprrdNode->params[1];
432 tmprrdNode = tmprrdNode->list_next;
433 }
434 }
435 tmpwndNode = wndNodes;
436 for (i = 0; i < nWndNodes; i++) {
437 /* any Wnd nodes that overlap the failed access need to be
438 * xored in */
439 if (overlappingPDAs[i]) {
440 pda = rf_AllocPhysDiskAddr();
441 memcpy((char *) pda, (char *) tmpwndNode->params[0].p, sizeof(RF_PhysDiskAddr_t));
442 /* add it into the pda_cleanup_list *after* the copy, TYVM */
443 pda->next = dag_h->pda_cleanup_list;
444 dag_h->pda_cleanup_list = pda;
445 rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
446 xorNode->params[paramNum++].p = pda;
447 xorNode->params[paramNum++].p = pda->bufPtr;
448 }
449 tmpwndNode = tmpwndNode->list_next;
450 }
451
452 /*
453 * Install the failed PDA into the xor param list so that the
454 * new data gets xor'd in.
455 */
456 xorNode->params[paramNum++].p = failedPDA;
457 xorNode->params[paramNum++].p = failedPDA->bufPtr;
458
459 /*
460 * The last 2 params to the recovery xor node are always the failed
461 * PDA and the raidPtr. install the failedPDA even though we have just
462 * done so above. This allows us to use the same XOR function for both
463 * degraded reads and degraded writes.
464 */
465 xorNode->params[paramNum++].p = failedPDA;
466 xorNode->params[paramNum++].p = raidPtr;
467 RF_ASSERT(paramNum == 2 * nXorBufs + 2);
468
469 /*
470 * Code to link nodes begins here
471 */
472
473 /* link header to block node */
474 RF_ASSERT(blockNode->numAntecedents == 0);
475 dag_h->succedents[0] = blockNode;
476
477 /* link block node to rd nodes */
478 RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
479 tmprrdNode = rrdNodes;
480 for (i = 0; i < nRrdNodes; i++) {
481 RF_ASSERT(tmprrdNode->numAntecedents == 1);
482 blockNode->succedents[i] = tmprrdNode;
483 tmprrdNode->antecedents[0] = blockNode;
484 tmprrdNode->antType[0] = rf_control;
485 tmprrdNode = tmprrdNode->list_next;
486 }
487
488 /* link read nodes to xor node */
489 RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
490 tmprrdNode = rrdNodes;
491 for (i = 0; i < nRrdNodes; i++) {
492 RF_ASSERT(tmprrdNode->numSuccedents == 1);
493 tmprrdNode->succedents[0] = xorNode;
494 xorNode->antecedents[i] = tmprrdNode;
495 xorNode->antType[i] = rf_trueData;
496 tmprrdNode = tmprrdNode->list_next;
497 }
498
499 /* link xor node to commit node */
500 RF_ASSERT(xorNode->numSuccedents == 1);
501 RF_ASSERT(commitNode->numAntecedents == 1);
502 xorNode->succedents[0] = commitNode;
503 commitNode->antecedents[0] = xorNode;
504 commitNode->antType[0] = rf_control;
505
506 /* link commit node to wnd nodes */
507 RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
508 tmpwndNode = wndNodes;
509 for (i = 0; i < nWndNodes; i++) {
510 RF_ASSERT(tmpwndNode->numAntecedents == 1);
511 commitNode->succedents[i] = tmpwndNode;
512 tmpwndNode->antecedents[0] = commitNode;
513 tmpwndNode->antType[0] = rf_control;
514 tmpwndNode = tmpwndNode->list_next;
515 }
516
517 /* link the commit node to wnp, wnq nodes */
518 RF_ASSERT(wnpNode->numAntecedents == 1);
519 commitNode->succedents[nWndNodes] = wnpNode;
520 wnpNode->antecedents[0] = commitNode;
521 wnpNode->antType[0] = rf_control;
522 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
523 if (nfaults == 2) {
524 RF_ASSERT(wnqNode->numAntecedents == 1);
525 commitNode->succedents[nWndNodes + 1] = wnqNode;
526 wnqNode->antecedents[0] = commitNode;
527 wnqNode->antType[0] = rf_control;
528 }
529 #endif
530 /* link write new data nodes to unblock node */
531 RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
532 tmpwndNode = wndNodes;
533 for (i = 0; i < nWndNodes; i++) {
534 RF_ASSERT(tmpwndNode->numSuccedents == 1);
535 tmpwndNode->succedents[0] = unblockNode;
536 unblockNode->antecedents[i] = tmpwndNode;
537 unblockNode->antType[i] = rf_control;
538 tmpwndNode = tmpwndNode->list_next;
539 }
540
541 /* link write new parity node to unblock node */
542 RF_ASSERT(wnpNode->numSuccedents == 1);
543 wnpNode->succedents[0] = unblockNode;
544 unblockNode->antecedents[nWndNodes] = wnpNode;
545 unblockNode->antType[nWndNodes] = rf_control;
546
547 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
548 /* link write new q node to unblock node */
549 if (nfaults == 2) {
550 RF_ASSERT(wnqNode->numSuccedents == 1);
551 wnqNode->succedents[0] = unblockNode;
552 unblockNode->antecedents[nWndNodes + 1] = wnqNode;
553 unblockNode->antType[nWndNodes + 1] = rf_control;
554 }
555 #endif
556 /* link unblock node to term node */
557 RF_ASSERT(unblockNode->numSuccedents == 1);
558 RF_ASSERT(termNode->numAntecedents == 1);
559 RF_ASSERT(termNode->numSuccedents == 0);
560 unblockNode->succedents[0] = termNode;
561 termNode->antecedents[0] = unblockNode;
562 termNode->antType[0] = rf_control;
563 }
/*
 * CONS_PDA(field, start, num): initialize the PDA that pda_p points to
 * from asmap->field (parityInfo or qInfo), positioning it `start' sectors
 * into the stripe unit with length `num', and allocate its buffer.
 * Relies on pda_p, asmap, secPerSU, raidPtr, and allocList being in scope
 * at the expansion site.
 *
 * Wrapped in do { } while (0) so it behaves as a single statement (safe
 * under an unbraced if/else); the parameter formerly named `if' (a C
 * keyword, legal only because the preprocessor treats keywords as plain
 * identifiers) is renamed `field' for clarity and portability.
 */
#define CONS_PDA(field,start,num) \
do { \
	pda_p->col = asmap->field->col; \
	pda_p->startSector = ((asmap->field->startSector / secPerSU) * secPerSU) + (start); \
	pda_p->numSector = (num); \
	pda_p->next = NULL; \
	RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,(num)),(char *), allocList); \
} while (0)
570 #if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
/*
 * rf_WriteGenerateFailedAccessASMs
 *
 * Helper for the double-degraded small-write DAG.  Builds:
 *   *pdap     - linked list of PDAs (with freshly allocated buffers) for
 *               the surviving, non-accessed data units that must be read
 *               to recompute redundancy; left unset when *nNodep is 0.
 *   *nNodep   - number of PDAs in that list.
 *   *pqpdap   - array of P/Q redundancy PDAs: {P,Q} when one data unit
 *               failed or the two failed regions together cover a full
 *               SU, else {P,Q,P,Q} (one pair per failed region).
 *   *nPQNodep - PDAs per redundancy disk (1 or 2).
 */
void
rf_WriteGenerateFailedAccessASMs(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_PhysDiskAddr_t ** pdap,
    int *nNodep,
    RF_PhysDiskAddr_t ** pqpdap,
    int *nPQNodep,
    RF_AllocListElem_t * allocList)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int PDAPerDisk, i;
	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	int numDataCol = layoutPtr->numDataCol;
	/* state selects which PDA shape the fill loop below builds:
	 * 1 = one failed unit, 2 = two failures covering a full SU,
	 * 3 = two disjoint failed regions (two slabs per disk) */
	int state;
	unsigned napdas;
	/* NOTE(review): ftwo_end is assigned but never read afterwards --
	 * apparently kept only for symmetry with fone_end. */
	RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
	RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
	RF_PhysDiskAddr_t *pda_p;
	RF_RaidAddr_t sosAddr;

	/* determine how many pda's we will have to generate per unaccess
	 * stripe. If there is only one failed data unit, it is one; if two,
	 * possibly two, depending wether they overlap. */

	fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
	fone_end = fone_start + fone->numSector;

	if (asmap->numDataFailed == 1) {
		PDAPerDisk = 1;
		state = 1;
		RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
		pda_p = *pqpdap;
		/* build p */
		CONS_PDA(parityInfo, fone_start, fone->numSector);
		pda_p->type = RF_PDA_TYPE_PARITY;
		pda_p++;
		/* build q */
		CONS_PDA(qInfo, fone_start, fone->numSector);
		pda_p->type = RF_PDA_TYPE_Q;
	} else {
		ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
		ftwo_end = ftwo_start + ftwo->numSector;
		if (fone->numSector + ftwo->numSector > secPerSU) {
			/* the two failed regions together span the whole SU:
			 * one full-SU P and Q pair suffices */
			PDAPerDisk = 1;
			state = 2;
			RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
			pda_p = *pqpdap;
			CONS_PDA(parityInfo, 0, secPerSU);
			pda_p->type = RF_PDA_TYPE_PARITY;
			pda_p++;
			CONS_PDA(qInfo, 0, secPerSU);
			pda_p->type = RF_PDA_TYPE_Q;
		} else {
			PDAPerDisk = 2;
			state = 3;
			/* four of them, fone, then ftwo */
			RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
			pda_p = *pqpdap;
			CONS_PDA(parityInfo, fone_start, fone->numSector);
			pda_p->type = RF_PDA_TYPE_PARITY;
			pda_p++;
			CONS_PDA(qInfo, fone_start, fone->numSector);
			pda_p->type = RF_PDA_TYPE_Q;
			pda_p++;
			CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
			pda_p->type = RF_PDA_TYPE_PARITY;
			pda_p++;
			CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
			pda_p->type = RF_PDA_TYPE_Q;
		}
	}
	/* figure out number of nonaccessed pda */
	/* numDataCol - 2 presumably excludes the two failed data columns --
	 * TODO confirm against callers' failure assumptions */
	napdas = PDAPerDisk * (numDataCol - 2);
	*nPQNodep = PDAPerDisk;

	*nNodep = napdas;
	if (napdas == 0)
		return;		/* short circuit */

	/* allocate up our list of pda's */

	RF_MallocAndAdd(pda_p, napdas * sizeof(RF_PhysDiskAddr_t),
			(RF_PhysDiskAddr_t *), allocList);
	*pdap = pda_p;

	/* linkem together */
	for (i = 0; i < (napdas - 1); i++)
		pda_p[i].next = pda_p + (i + 1);

	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	for (i = 0; i < numDataCol; i++) {
		/* all napdas slots filled: the remaining iterations do
		 * nothing (continue here acts like a break) */
		if ((pda_p - (*pdap)) == napdas)
			continue;
		pda_p->type = RF_PDA_TYPE_DATA;
		pda_p->raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
		/* skip over dead disks; pda_p is not advanced, so this
		 * slot is re-initialized for the next surviving column */
		if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status))
			continue;
		switch (state) {
		case 1:	/* fone */
			pda_p->numSector = fone->numSector;
			pda_p->raidAddress += fone_start;
			pda_p->startSector += fone_start;
			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
			break;
		case 2:	/* full stripe */
			pda_p->numSector = secPerSU;
			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
			break;
		case 3:	/* two slabs */
			pda_p->numSector = fone->numSector;
			pda_p->raidAddress += fone_start;
			pda_p->startSector += fone_start;
			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
			pda_p++;
			pda_p->type = RF_PDA_TYPE_DATA;
			pda_p->raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
			pda_p->numSector = ftwo->numSector;
			pda_p->raidAddress += ftwo_start;
			pda_p->startSector += ftwo_start;
			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
			break;
		default:
			RF_PANIC();
		}
		pda_p++;
	}

	RF_ASSERT(pda_p - *pdap == napdas);
	return;
}
/* Extract the PDA stored as param 0 of a disk node. */
#define DISK_NODE_PDA(node) ((node)->params[0].p)

/*
 * Fill in the four standard params of a disk read/write node: PDA,
 * buffer, parity stripe ID, and priority/RU word.  Relies on
 * parityStripeID and which_ru being in scope at the expansion site.
 * Wrapped in do { } while (0) so the multi-statement body behaves as a
 * single statement (safe under an unbraced if/else).
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
do { \
	(_node_).params[0].p = (_p_); \
	(_node_).params[1].p = (_p_)->bufPtr; \
	(_node_).params[2].v = parityStripeID; \
	(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); \
} while (0)
712
/*
 * rf_DoubleDegSmallWrite
 *
 * Build a small-write DAG for a doubly-degraded (P+Q style) array with
 * exactly one failed data unit: reconstruct-read the surviving data and
 * the old P/Q, run the recovery function, then write the user data (for
 * non-failed disks) and the new P/Q.
 *
 * Node name strings and the recovery function are passed in so the same
 * generator serves different redundancy schemes.
 *
 * IMPORTANT: the node sub-arrays are carved out of one contiguous
 * allocation (rrd, rp, rq, wud, wp, wq in that order); several linkage
 * loops below index past one sub-array into the next and depend on this
 * layout.
 */
void
rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		       RF_DagHeader_t *dag_h, void *bp,
		       RF_RaidAccessFlags_t flags,
		       RF_AllocListElem_t *allocList,
		       char *redundantReadNodeName,
		       char *redundantWriteNodeName,
		       char *recoveryNodeName,
		       int (*recovFunc) (RF_DagNode_t *))
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
	        *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
	RF_PhysDiskAddr_t *pda, *pqPDAs;
	RF_PhysDiskAddr_t *npdas;
	int     nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
	RF_ReconUnitNum_t which_ru;
	int     nPQNodes;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);

	/* simple small write case - First part looks like a reconstruct-read
	 * of the failed data units. Then a write of all data units not
	 * failed. */


	/* Hdr | ------Block- /  /     \   Rrd  Rrd ...  Rrd  Rp Rq \  \
	 * /  -------PQ----- /   \   \ Wud   Wp  WQ	 \    |   /
	 * --Unblock- | T
	 *
	 * Rrd = read recovery data  (potentially none) Wud = write user data
	 * (not incl. failed disks) Wp = Write P (could be two) Wq = Write Q
	 * (could be two)
	 *
	 */

	rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);

	RF_ASSERT(asmap->numDataFailed == 1);

	nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
	nReadNodes = nRrdNodes + 2 * nPQNodes;
	nWriteNodes = nWudNodes + 2 * nPQNodes;
	nNodes = 4 + nReadNodes + nWriteNodes;

	/* one contiguous allocation; the pointers below carve it into
	 * fixed-order sub-arrays (see layout note in the header comment) */
	RF_MallocAndAdd(nodes, nNodes * sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
	blockNode = nodes;
	unblockNode = blockNode + 1;
	termNode = unblockNode + 1;
	recoveryNode = termNode + 1;
	rrdNodes = recoveryNode + 1;
	rpNodes = rrdNodes + nRrdNodes;
	rqNodes = rpNodes + nPQNodes;
	wudNodes = rqNodes + nPQNodes;
	wpNodes = wudNodes + nWudNodes;
	wqNodes = wpNodes + nPQNodes;

	dag_h->creator = "PQ_DDSimpleSmallWrite";
	dag_h->numSuccedents = 1;
	dag_h->succedents[0] = blockNode;
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;

	/* init the block and unblock nodes */
	/* The block node has all the read nodes as successors */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
	/* rrdNodes + i walks past the rrd sub-array into rp/rq -- valid
	 * only because the sub-arrays are contiguous */
	for (i = 0; i < nReadNodes; i++)
		blockNode->succedents[i] = rrdNodes + i;

	/* The unblock node has all the writes as successors */
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
	/* likewise, wudNodes + i spans wud, then wp, then wq */
	for (i = 0; i < nWriteNodes; i++) {
		unblockNode->antecedents[i] = wudNodes + i;
		unblockNode->antType[i] = rf_control;
	}
	unblockNode->succedents[0] = termNode;

/* Initialize a disk-read node and link it between blockNode and
 * recoveryNode. */
#define INIT_READ_NODE(node,name) \
	rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
	(node)->succedents[0] = recoveryNode; \
	(node)->antecedents[0] = blockNode; \
	(node)->antType[0] = rf_control;

	/* build the read nodes */
	pda = npdas;
	for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
		INIT_READ_NODE(rrdNodes + i, "rrd");
		DISK_NODE_PARAMS(rrdNodes[i], pda);
	}

	/* read redundancy pdas */
	pda = pqPDAs;
	INIT_READ_NODE(rpNodes, "Rp");
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(rpNodes[0], pda);
	pda++;
	INIT_READ_NODE(rqNodes, redundantReadNodeName);
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(rqNodes[0], pda);
	if (nPQNodes == 2) {
		pda++;
		INIT_READ_NODE(rpNodes + 1, "Rp");
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(rpNodes[1], pda);
		pda++;
		INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(rqNodes[1], pda);
	}
	/* the recovery node has all reads as precedessors and all writes as
	 * successors. It generates a result for every write P or write Q
	 * node. As parameters, it takes a pda per read and a pda per stripe
	 * of user data written. It also takes as the last params the raidPtr
	 * and asm. For results, it takes PDA for P & Q. */


	rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
	    nWriteNodes,	/* succesors */
	    nReadNodes,		/* preds */
	    nReadNodes + nWudNodes + 3,	/* params */
	    2 * nPQNodes,	/* results */
	    dag_h, recoveryNodeName, allocList);



	for (i = 0; i < nReadNodes; i++) {
		recoveryNode->antecedents[i] = rrdNodes + i;
		recoveryNode->antType[i] = rf_control;
		recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
	}
	for (i = 0; i < nWudNodes; i++) {
		recoveryNode->succedents[i] = wudNodes + i;
	}
	recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
	recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
	recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;

	/* i continues from nWudNodes: wudNodes + i now addresses the
	 * wp/wq nodes (contiguity again) */
	for (; i < nWriteNodes; i++)
		recoveryNode->succedents[i] = wudNodes + i;

	pda = pqPDAs;
	recoveryNode->results[0] = pda;
	pda++;
	recoveryNode->results[1] = pda;
	if (nPQNodes == 2) {
		pda++;
		recoveryNode->results[2] = pda;
		pda++;
		recoveryNode->results[3] = pda;
	}
	/* fill writes */
/* Initialize a disk-write node and link it between recoveryNode and
 * unblockNode. */
#define INIT_WRITE_NODE(node,name) \
	rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
	(node)->succedents[0] = unblockNode; \
	(node)->antecedents[0] = recoveryNode; \
	(node)->antType[0] = rf_control;

	pda = asmap->physInfo;
	for (i = 0; i < nWudNodes; i++) {
		INIT_WRITE_NODE(wudNodes + i, "Wd");
		DISK_NODE_PARAMS(wudNodes[i], pda);
		recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i);
		pda = pda->next;
	}
	/* write redundancy pdas */
	pda = pqPDAs;
	INIT_WRITE_NODE(wpNodes, "Wp");
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(wpNodes[0], pda);
	pda++;
	INIT_WRITE_NODE(wqNodes, "Wq");
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(wqNodes[0], pda);
	if (nPQNodes == 2) {
		pda++;
		INIT_WRITE_NODE(wpNodes + 1, "Wp");
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(wpNodes[1], pda);
		pda++;
		INIT_WRITE_NODE(wqNodes + 1, "Wq");
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(wqNodes[1], pda);
	}
}
897 #endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */