1 /* $NetBSD: rf_pqdegdags.c,v 1.9 2004/01/10 00:56:28 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Daniel Stodolsky
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /*
30 * rf_pqdegdags.c
31 * Degraded mode dags for double fault cases.
32 */
33
34
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: rf_pqdegdags.c,v 1.9 2004/01/10 00:56:28 oster Exp $");
37
38 #include "rf_archs.h"
39
40 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
41
42 #include <dev/raidframe/raidframevar.h>
43
44 #include "rf_raid.h"
45 #include "rf_dag.h"
46 #include "rf_dagdegrd.h"
47 #include "rf_dagdegwr.h"
48 #include "rf_dagfuncs.h"
49 #include "rf_dagutils.h"
50 #include "rf_etimer.h"
51 #include "rf_acctrace.h"
52 #include "rf_general.h"
53 #include "rf_pqdegdags.h"
54 #include "rf_pq.h"
55
56 static void
57 applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
58 RF_PhysDiskAddr_t * qpda, void *bp);
59
60 /*
61 Two data drives have failed, and we are doing a read that covers one of them.
62 We may also be reading some of the surviving drives.
63
64
65 *****************************************************************************************
66 *
67 * creates a DAG to perform a degraded-mode read of data within one stripe.
68 * This DAG is as follows:
69 *
70 * Hdr
71 * |
72 * Block
73 * / / \ \ \ \
74 * Rud ... Rud Rrd ... Rrd Rp Rq
75 * | \ | \ | \ | \ | \ | \
76 *
77 * | |
78 * Unblock X
79 * \ /
80 * ------ T ------
81 *
82 * Each R node is a successor of the L node
83 * One successor arc from each R node goes to U, and the other to X
84 * There is one Rud for each chunk of surviving user data requested by the user,
85 * and one Rrd for each chunk of surviving user data _not_ being read by the user
86 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
87 * X = pq recovery node, T = terminate
88 *
89 * The block & unblock nodes are leftovers from a previous version. They
90 * do nothing, but I haven't deleted them because it would be a tremendous
91 * effort to put them back in.
92 *
93 * Note: The target buffer for the XOR node is set to the actual user buffer where the
94 * failed data is supposed to end up. This buffer is zero'd by the code here. Thus,
95 * if you create a degraded read dag, use it, and then re-use, you have to be sure to
96 * zero the target buffer prior to the re-use.
97 *
 * Every buffer read is passed to the pq recovery node, whose job it is to
 * sort out what's needed and what's not.
100 ****************************************************************************************/
/* init a disk node with 2 successors and one predecessor */
/*
 * NOTE: these macros expand in the body of a DAG-creation function and
 * rely on names in the caller's scope: dag_h, allocList, unblockNode,
 * recoveryNode, blockNode, parityStripeID and which_ru.
 */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

/* fill in the four standard disk-node params: pda, buffer, psid, priority */
#define DISK_NODE_PARAMS(_node_,_p_) \
(_node_).params[0].p = _p_ ; \
(_node_).params[1].p = (_p_)->bufPtr; \
(_node_).params[2].v = parityStripeID; \
(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)

/* retrieve the pda stashed in params[0] by DISK_NODE_PARAMS */
#define DISK_NODE_PDA(node) ((node)->params[0].p)
/*
 * Create a double-degraded read DAG for a P+Q array: delegates to the
 * generic rf_DoubleDegRead with the PQ-specific read-Q node name,
 * recovery node name, and recovery function (rf_PQDoubleRecoveryFunc).
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}
122
123 static void
124 applyPDA(raidPtr, pda, ppda, qpda, bp)
125 RF_Raid_t *raidPtr;
126 RF_PhysDiskAddr_t *pda;
127 RF_PhysDiskAddr_t *ppda;
128 RF_PhysDiskAddr_t *qpda;
129 void *bp;
130 {
131 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
132 RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
133 RF_SectorCount_t s0len = ppda->numSector, len;
134 RF_SectorNum_t suoffset;
135 unsigned coeff;
136 char *pbuf = ppda->bufPtr;
137 char *qbuf = qpda->bufPtr;
138 char *buf;
139 int delta;
140
141 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
142 len = pda->numSector;
143 /* see if pda intersects a recovery pda */
144 if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
145 buf = pda->bufPtr;
146 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
147 coeff = (coeff % raidPtr->Layout.numDataCol);
148
149 if (suoffset < s0off) {
150 delta = s0off - suoffset;
151 buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
152 suoffset = s0off;
153 len -= delta;
154 }
155 if (suoffset > s0off) {
156 delta = suoffset - s0off;
157 pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
158 qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
159 }
160 if ((suoffset + len) > (s0len + s0off))
161 len = s0len + s0off - suoffset;
162
163 /* src, dest, len */
164 rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
165
166 /* dest, src, len, coeff */
167 rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
168 }
169 }
170 /*
171 Recover data in the case of a double failure. There can be two
172 result buffers, one for each chunk of data trying to be recovered.
173 The params are pda's that have not been range restricted or otherwise
174 politely massaged - this should be done here. The last params are the
175 pdas of P and Q, followed by the raidPtr. The list can look like
176
177 pda, pda, ... , p pda, q pda, raidptr, asm
178
179 or
180
181 pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
182
   depending on whether two chunks of recovery data were required.
184
   The second condition only arises if there are two failed buffers
   whose lengths do not add up to a stripe unit.
187 */
188
189
190 int
191 rf_PQDoubleRecoveryFunc(node)
192 RF_DagNode_t *node;
193 {
194 int np = node->numParams;
195 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
196 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
197 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
198 int d, i;
199 unsigned coeff;
200 RF_RaidAddr_t sosAddr, suoffset;
201 RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
202 int two = 0;
203 RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
204 char *buf;
205 int numDataCol = layoutPtr->numDataCol;
206 RF_Etimer_t timer;
207 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
208
209 RF_ETIMER_START(timer);
210
211 if (asmap->failedPDAs[1] &&
212 (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
213 RF_ASSERT(0);
214 ppda = node->params[np - 6].p;
215 ppda2 = node->params[np - 5].p;
216 qpda = node->params[np - 4].p;
217 qpda2 = node->params[np - 3].p;
218 d = (np - 6);
219 two = 1;
220 } else {
221 ppda = node->params[np - 4].p;
222 qpda = node->params[np - 3].p;
223 d = (np - 4);
224 }
225
226 for (i = 0; i < d; i++) {
227 pda = node->params[i].p;
228 buf = pda->bufPtr;
229 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
230 len = pda->numSector;
231 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
232 /* compute the data unit offset within the column */
233 coeff = (coeff % raidPtr->Layout.numDataCol);
234 /* see if pda intersects a recovery pda */
235 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
236 if (two)
237 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
238 }
239
240 /* ok, we got the parity back to the point where we can recover. We
241 * now need to determine the coeff of the columns that need to be
242 * recovered. We can also only need to recover a single stripe unit. */
243
244 if (asmap->failedPDAs[1] == NULL) { /* only a single stripe unit
245 * to recover. */
246 pda = asmap->failedPDAs[0];
247 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
248 /* need to determine the column of the other failed disk */
249 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
250 /* compute the data unit offset within the column */
251 coeff = (coeff % raidPtr->Layout.numDataCol);
252 for (i = 0; i < numDataCol; i++) {
253 npda.raidAddress = sosAddr + (i * secPerSU);
254 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
255 /* skip over dead disks */
256 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
257 if (i != coeff)
258 break;
259 }
260 RF_ASSERT(i < numDataCol);
261 RF_ASSERT(two == 0);
262 /* recover the data. Since we need only want to recover one
263 * column, we overwrite the parity with the other one. */
264 if (coeff < i) /* recovering 'a' */
265 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
266 else /* recovering 'b' */
267 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
268 } else
269 RF_PANIC();
270
271 RF_ETIMER_STOP(timer);
272 RF_ETIMER_EVAL(timer);
273 if (tracerec)
274 tracerec->q_us += RF_ETIMER_VAL_US(timer);
275 rf_GenericWakeupFunc(node, 0);
276 return (0);
277 }
278
/*
 * rf_PQWriteDoubleRecoveryFunc: DAG-node execute function for the
 * double-degraded small-write path (used via rf_DoubleDegSmallWrite from
 * rf_PQ_DDSimpleSmallWrite below).  Regenerates the failed data unit that
 * is NOT being overwritten, then builds new P and Q in node->results[0]
 * and node->results[1].  Returns 0 and wakes successors via
 * rf_GenericWakeupFunc.
 *
 * Left byte-identical except for comments: the recover / zero-Q / re-apply
 * sequence is order-sensitive.
 */
int
rf_PQWriteDoubleRecoveryFunc(node)
	RF_DagNode_t *node;
{
	/* The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e,, either a single data
	 * unit write or the entire failed stripe unit is being overwritten. )
	 *
	 * Recovery strategy: apply the recovery data to the parity and q. Use P
	 * & Q to recover the second failed data unit in P. Zero fill Q, then
	 * apply the recovered data to p. Then apply the data being written to
	 * the failed drive. Then walk through the surviving drives, applying
	 * new data when it exists, otherwise the recovery data. Quite a mess.
	 *
	 *
	 * The params
	 *
	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
	 * raidPtr, asmap */

	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	/* this path handles exactly one failed data unit and produces P & Q */
	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	/* determine the other failed data unit */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	/* need to determine the column of the other failed disk */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* compute the data unit offset within the column */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	/* walk the stripe looking for the other dead disk's column (i) */
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* skip over dead disks */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/* recover the data. The column we want to recover we write over the
	 * parity. The column we don't care about we dump in q. */
	if (coeff < i)		/* recovering 'a' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* recovering 'b' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
	memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	/* seed Q from the recovered column (column i) now sitting in P */
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* now apply all the write data to the buffer */
	/* single stripe unit write case: the failed data is only thing we are
	 * writing. */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* dest, src, len, coeff */
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* now apply all the recovery data */
	/*
	 * NOTE(review): this is the same loop as the one before the recover
	 * step above -- the recovery data is applied to P/Q twice.  Whether
	 * the second pass is intentional (re-folding after P/Q were rebuilt)
	 * should be confirmed against the RAIDframe design docs.
	 */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
/*
 * Large-write (reconstruct-write) DAG for the two-lost-data-unit case.
 * Selected by rf_PQ_200_CreateWriteDAG when the write fully covers both
 * failed units; not implemented -- panics unconditionally.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
379 /*
380 Two lost data unit write case.
381
382 There are really two cases here:
383
384 (1) The write completely covers the two lost data units.
385 In that case, a reconstruct write that doesn't write the
386 failed data units will do the correct thing. So in this case,
387 the dag looks like
388
389 full stripe read of surviving data units (not being overwriten)
390 write new data (ignoring failed units) compute P&Q
391 write P&Q
392
393
394 (2) The write does not completely cover both failed data units
395 (but touches at least one of them). Then we need to do the
396 equivalent of a reconstruct read to recover the missing data
397 unit from the other stripe.
398
399 For any data we are writing that is not in the "shadow"
400 of the failed units, we need to do a four cycle update.
401 PANIC on this case. for now
402
403 */
404
405 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
406 {
407 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
408 RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
409 int sum;
410 int nf = asmap->numDataFailed;
411
412 sum = asmap->failedPDAs[0]->numSector;
413 if (nf == 2)
414 sum += asmap->failedPDAs[1]->numSector;
415
416 if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
417 /* large write case */
418 rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
419 return;
420 }
421 if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
422 /* small write case, no user data not in shadow */
423 rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
424 return;
425 }
426 RF_PANIC();
427 }
/*
 * Simple small-write DAG for the double-degraded case: delegates to the
 * generic rf_DoubleDegSmallWrite with PQ-specific node names and the
 * PQ write-recovery function (rf_PQWriteDoubleRecoveryFunc above).
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
432 #endif /* (RF_INCLUDE_DECL_PQ > 0) ||
433 * (RF_INCLUDE_RAID6 > 0) */
/* Cache object: 813dd3d986f6ba97a4238bc322ee0690 */