/*-
 * Copyright (c) 1997, 1998
 *	Cybernet Corporation and Nan Yang Computer Services Limited.
 *	All rights reserved.
 *
 * This software was developed as part of the NetMAX project.
 *
 * Written by Greg Lehey
 *
 * This software is distributed under the so-called ``Berkeley
 * License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Cybernet Corporation
 *	and Nan Yang Computer Services Limited
 * 4. Neither the name of the Companies nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinumraid5.c,v 1.21 2001/01/09 04:21:27 grog Exp grog $
 * $FreeBSD: releng/5.0/sys/dev/vinum/vinumraid5.c 70866 2001-01-10 05:07:14Z grog $
 */
#include <dev/vinum/vinumhdr.h>
#include <dev/vinum/request.h>
#include <sys/resourcevar.h>

/*
 * Parameters which describe the current transfer.
 * These are only used for calculation, but they
 * need to be passed to other functions, so it's
 * tidier to put them in a struct
 */
struct metrics {
    daddr_t stripebase; /* base address of stripe (1st subdisk) */
    int stripeoffset; /* offset in stripe */
    int stripesectors; /* total sectors to transfer in this stripe */
    daddr_t sdbase; /* offset in subdisk of stripe base */
    int sdcount; /* number of disks involved in this transfer */
    daddr_t diskstart; /* remember where this transfer starts */
    int psdno; /* number of parity subdisk */
    int badsdno; /* number of down subdisk, if there is one */
    int firstsdno; /* first data subdisk number */
    /* These correspond to the fields in rqelement, sort of */
    int useroffset;
    /*
     * Initial offset and length values for the first
     * data block
     */
    int initoffset; /* start address of block to transfer */
    short initlen; /* length in sectors of data transfer */
    /* Define a normal operation */
    int dataoffset; /* start address of block to transfer */
    int datalen; /* length in sectors of data transfer */
    /* Define a group operation */
    int groupoffset; /* subdisk offset of group operation */
    int grouplen; /* length in sectors of group operation */
    /* Define a normal write operation */
    int writeoffset; /* subdisk offset of normal write */
    int writelen; /* length in sectors of write operation */
    enum xferinfo flags; /* to check what we're doing */
    int rqcount; /* number of elements in request */
};

enum requeststatus bre5(struct request *rq,
    int plexno,
    daddr_t * diskstart,
    daddr_t diskend);
void complete_raid5_write(struct rqelement *);
enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
void setrqebounds(struct rqelement *rqe, struct metrics *mp);

/*
 * define the low-level requests needed to perform
 * a high-level I/O operation for a specific plex
 * 'plexno'.
 *
 * Return 0 if all subdisks involved in the
 * request are up, 1 if some subdisks are not up,
 * and -1 if the request is at least partially
 * outside the bounds of the subdisks.
 *
 * Modify the pointer *diskstart to point to the
 * end address. On read, return on the first bad
 * subdisk, so that the caller
 * (build_read_request) can try alternatives.
 *
 * On entry to this routine, the prq structures
 * are not assigned. The assignment is performed
 * by expandrq(). Strictly speaking, the elements
 * rqe->sdno of all entries should be set to -1,
 * since 0 (from bzero) is a valid subdisk number.
 * We avoid this problem by initializing the ones
 * we use, and not looking at the others (index >=
 * prq->requests).
 */
enum requeststatus
bre5(struct request *rq,
    int plexno,
    daddr_t * diskaddr,
    daddr_t diskend)
{
    struct metrics m; /* most of the information */
    struct sd *sd;
    struct plex *plex;
    struct buf *bp; /* user's bp */
    struct rqgroup *rqg; /* the request group that we will create */
    struct rqelement *rqe; /* point to this request information */
    int rsectors; /* sectors remaining in this stripe */
    int mysdno; /* another sd index in loops */
    int rqno; /* request number */

    rqg = NULL; /* shut up, damn compiler */
    m.diskstart = *diskaddr; /* start of transfer */
    bp = rq->bp; /* buffer pointer */
    plex = &PLEX[plexno]; /* point to the plex */


    while (*diskaddr < diskend) { /* until we get it all sorted out */
        if (*diskaddr >= plex->length) /* beyond the end of the plex */
            return REQUEST_EOF; /* can't continue */

        m.badsdno = -1; /* no bad subdisk yet */

        /* Part A: Define the request */
        /*
         * First, calculate some sizes:
         * The offset of the start address from
         * the start of the stripe.
         */
        m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));

        /*
         * The plex-relative address of the
         * start of the stripe.
         */
        m.stripebase = *diskaddr - m.stripeoffset;

        /* subdisk containing the parity stripe */
        if (plex->organization == plex_raid5)
            m.psdno = plex->subdisks - 1
                - (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
                % plex->subdisks;
        else /* RAID-4 */
            m.psdno = plex->subdisks - 1;

        /*
         * The number of the subdisk in which
         * the start is located.
         */
        m.firstsdno = m.stripeoffset / plex->stripesize;
        if (m.firstsdno >= m.psdno) /* at or past parity sd */
            m.firstsdno++; /* increment it */

        /*
         * The offset from the beginning of
         * the stripe on this subdisk.
         */
        m.initoffset = m.stripeoffset % plex->stripesize;

        /* The offset of the stripe start relative to this subdisk */
        m.sdbase = m.stripebase / (plex->subdisks - 1);

        m.useroffset = *diskaddr - m.diskstart; /* The offset of the start in the user buffer */

        /*
         * The number of sectors to transfer in the
         * current (first) subdisk.
         */
        m.initlen = min(diskend - *diskaddr, /* the amount remaining to transfer */
            plex->stripesize - m.initoffset); /* and the amount left in this block */

        /*
         * The number of sectors to transfer in this stripe
         * is the minimum of the amount remaining to transfer
         * and the amount left in this stripe.
         */
        m.stripesectors = min(diskend - *diskaddr,
            plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);

        /* The number of data subdisks involved in this request */
        m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;
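
        /*
         * For illustration only (hypothetical numbers, not part of
         * the original driver): consider a RAID-5 plex with 5
         * subdisks and a stripesize of 128 sectors, so each stripe
         * holds 4 * 128 = 512 data sectors.  For a transfer starting
         * at plex address 600 with 200 sectors still to go:
         *
         *   stripeoffset  = 600 % 512 = 88
         *   stripebase    = 600 - 88 = 512
         *   psdno         = 4 - (600 / 512) % 5 = 3
         *   firstsdno     = 88 / 128 = 0 (below psdno, so not bumped)
         *   initoffset    = 88 % 128 = 88
         *   sdbase        = 512 / 4 = 128
         *   initlen       = min(200, 128 - 88) = 40
         *   stripesectors = min(200, 512 - 88) = 200
         *   sdcount       = (200 + 88 + 127) / 128 = 3
         *
         * so the transfer touches data subdisks 0, 1 and 2 (40, 128
         * and 32 sectors respectively), and the parity for this
         * stripe lives on subdisk 3.
         */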

        /*
         * Part B: decide what kind of transfer this will be.
         *
         * start and end addresses of the transfer in
         * the current block.
         *
         * There are a number of different kinds of
         * transfer, each of which relates to a
         * specific subdisk:
         *
         * 1. Normal read. All participating subdisks
         *    are up, and the transfer can be made
         *    directly to the user buffer. The bounds
         *    of the transfer are described by
         *    m.dataoffset and m.datalen. We have
         *    already calculated m.initoffset and
         *    m.initlen, which define the parameters
         *    for the first data block.
         *
         * 2. Recovery read. One participating
         *    subdisk is down. To recover data, all
         *    the other subdisks, including the parity
         *    subdisk, must be read. The data is
         *    recovered by exclusive-oring all the
         *    other blocks. The bounds of the
         *    transfer are described by m.groupoffset
         *    and m.grouplen.
         *
         * 3. A read request may request reading both
         *    available data (normal read) and
         *    non-available data (recovery read).
         *    This can be a problem if the address
         *    ranges of the two reads do not coincide:
         *    in this case, the normal read needs to
         *    be extended to cover the address range
         *    of the recovery read, and must thus be
         *    performed out of malloced memory.
         *
         * 4. Normal write. All the participating
         *    subdisks are up. The bounds of the
         *    transfer are described by m.dataoffset
         *    and m.datalen. Since these values
         *    differ for each block, we calculate the
         *    bounds for the parity block
         *    independently as the maximum of the
         *    individual blocks and store these values
         *    in m.writeoffset and m.writelen. This
         *    write proceeds in four phases:
         *
         *    i.   Read the old contents of each block
         *         and the parity block.
         *    ii.  ``Remove'' the old contents from
         *         the parity block with exclusive or.
         *    iii. ``Insert'' the new contents of the
         *         block in the parity block, again
         *         with exclusive or.
         *    iv.  Write the new contents of the data
         *         blocks and the parity block. The data
         *         block transfers can be made directly from
         *         the user buffer.
         *
         * 5. Degraded write where the data block is
         *    not available. The bounds of the
         *    transfer are described by m.groupoffset
         *    and m.grouplen. This requires the
         *    following steps:
         *
         *    i.   Read in all the other data blocks,
         *         excluding the parity block.
         *
         *    ii.  Recreate the parity block from the
         *         other data blocks and the data to be
         *         written.
         *
         *    iii. Write the parity block.
         *
         * 6. Parityless write, a write where the
         *    parity block is not available. This is
         *    in fact the simplest: just write the
         *    data blocks. This can proceed directly
         *    from the user buffer. The bounds of the
         *    transfer are described by m.dataoffset
         *    and m.datalen.
         *
         * 7. Combination of degraded data block write
         *    and normal write. In this case the
         *    address ranges of the reads may also
         *    need to be extended to cover all
         *    participating blocks.
         *
         * All requests in a group operation transfer
         * the same address range relative to their
         * subdisk. The individual transfers may
         * vary, but since our group of requests is
         * all in a single slice, we can define a
         * range in which they all fall.
         *
         * In the following code section, we determine
         * which kind of transfer we will perform. If
         * there is a group transfer, we also decide
         * its bounds relative to the subdisks. At
         * the end, we have the following values:
         *
         *  m.flags indicates the kinds of transfers
         *    we will perform.
         *  m.initoffset indicates the offset of the
         *    beginning of any data operation relative
         *    to the beginning of the stripe base.
         *  m.initlen specifies the length of any data
         *    operation.
         *  m.dataoffset contains the same value as
         *    m.initoffset.
         *  m.datalen contains the same value as
         *    m.initlen. Initially dataoffset and
         *    datalen describe the parameters for the
         *    first data block; while building the data
         *    block requests, they are updated for each
         *    block.
         *  m.groupoffset indicates the offset of any
         *    group operation relative to the beginning
         *    of the stripe base.
         *  m.grouplen specifies the length of any
         *    group operation.
         *  m.writeoffset indicates the offset of a
         *    normal write relative to the beginning of
         *    the stripe base. This value differs from
         *    m.dataoffset in that it applies to the
         *    entire operation, and not just the first
         *    block.
         *  m.writelen specifies the total span of a
         *    normal write operation. writeoffset and
         *    writelen are used to define the parity
         *    block.
         */
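
        /*
         * For illustration only (standard RAID-5 arithmetic,
         * restated here rather than taken from this file): phases
         * ii and iii of a normal write amount to
         *
         *     new parity = old parity ^ old data ^ new data
         *
         * and a recovery read or degraded write reconstructs a
         * missing data block as the exclusive or of the parity
         * block and all the surviving data blocks in the stripe.
         */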
        m.groupoffset = 0; /* assume no group... */
        m.grouplen = 0; /* until we know we have one */
        m.writeoffset = m.initoffset; /* start offset of transfer */
        m.writelen = 0; /* nothing to write yet */
        m.flags = 0; /* no flags yet */
        rsectors = m.stripesectors; /* remaining sectors to examine */
        m.dataoffset = m.initoffset; /* start at the beginning of the transfer */
        m.datalen = m.initlen;

        if (m.sdcount > 1) {
            plex->multiblock++; /* more than one block for the request */
            /*
             * If we have two transfers that don't overlap,
             * (one at the end of the first block, the other
             * at the beginning of the second block),
             * it's cheaper to split them.
             */
            if (rsectors < plex->stripesize) {
                m.sdcount = 1; /* just one subdisk */
                m.stripesectors = m.initlen; /* and just this many sectors */
                rsectors = m.initlen; /* and in the loop counter */
            }
        }
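        /*
         * For illustration only (hypothetical numbers): with a
         * stripesize of 128 and a transfer that starts 120 sectors
         * into a block and is 16 sectors long, sdcount is 2 but
         * stripesectors is only 16.  The code above trims the
         * request to the 8 sectors left in the first block; the
         * outer while loop picks up the remaining 8 sectors on the
         * next pass as a separate single-subdisk request.
         */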
        if (SD[plex->sdnos[m.psdno]].state < sd_reborn) /* is our parity subdisk down? */
            m.badsdno = m.psdno; /* note that it's down */
        if (bp->b_iocmd == BIO_READ) { /* read operation */
            for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
                if (mysdno == m.psdno) /* ignore parity on read */
                    mysdno++;
                if (mysdno == plex->subdisks) /* wraparound */
                    mysdno = 0;
                if (mysdno == m.psdno) /* parity, */
                    mysdno++; /* we've given already */

                if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
                    if (m.badsdno >= 0) /* we had one already, */
                        return REQUEST_DOWN; /* we can't take a second */
                    m.badsdno = mysdno; /* got the first */
                    m.groupoffset = m.dataoffset; /* define the bounds */
                    m.grouplen = m.datalen;
                    m.flags |= XFR_RECOVERY_READ; /* we need recovery */
                    plex->recovered_reads++; /* count another one */
                } else
                    m.flags |= XFR_NORMAL_READ; /* normal read */

                /* Update the pointers for the next block */
                m.dataoffset = 0; /* back to the start of the stripe */
                rsectors -= m.datalen; /* remaining sectors to examine */
                m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
            }
        } else { /* write operation */
            for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
                if (mysdno == m.psdno) /* parity stripe, we've dealt with that */
                    mysdno++;
                if (mysdno == plex->subdisks) /* wraparound */
                    mysdno = 0;
                if (mysdno == m.psdno) /* parity, */
                    mysdno++; /* we've given already */

                sd = &SD[plex->sdnos[mysdno]];
                if (sd->state != sd_up) {
                    enum requeststatus s;

                    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
                    if (s && (m.badsdno >= 0)) { /* second bad disk, */
                        int sdno;
                        /*
                         * If the parity disk is down, there's
                         * no recovery. We make all involved
                         * subdisks stale. Otherwise, we
                         * should be able to recover, but it's
                         * like pulling teeth. Fix it later.
                         */
                        for (sdno = 0; sdno < m.sdcount; sdno++) {
                            struct sd *sd = &SD[plex->sdnos[sdno]];
                            if (sd->state >= sd_reborn) /* sort of up, */
                                set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
                        }
                        return s; /* and crap out */
                    }
                    m.badsdno = mysdno; /* note which one is bad */
                    m.flags |= XFR_DEGRADED_WRITE; /* we need recovery */
                    plex->degraded_writes++; /* count another one */
                    m.groupoffset = m.dataoffset; /* define the bounds */
                    m.grouplen = m.datalen;
                } else {
                    m.flags |= XFR_NORMAL_WRITE; /* normal write operation */
                    if (m.writeoffset > m.dataoffset) { /* move write operation lower */
                        m.writelen = max(m.writeoffset + m.writelen,
                            m.dataoffset + m.datalen)
                            - m.dataoffset;
                        m.writeoffset = m.dataoffset;
                    } else
                        m.writelen = max(m.writeoffset + m.writelen,
                            m.dataoffset + m.datalen)
                            - m.writeoffset;
                }
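                /*
                 * For illustration only (hypothetical numbers): with
                 * a stripesize of 128, initoffset 100 and 156 sectors
                 * in this stripe, the first block covers offsets
                 * 100..127 (dataoffset 100, datalen 28) and the
                 * second block covers 0..127 (dataoffset 0, datalen
                 * 128).  After the first block, writeoffset = 100 and
                 * writelen = 28; after the second, writeoffset = 0
                 * and writelen = 128.  The parity request built below
                 * therefore spans the bounding interval of all the
                 * data blocks, relative to the stripe base on each
                 * subdisk.
                 */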

                /* Update the pointers for the next block */
                m.dataoffset = 0; /* back to the start of the stripe */
                rsectors -= m.datalen; /* remaining sectors to examine */
                m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
            }
            if (m.badsdno == m.psdno) { /* got a bad parity block, */
                struct sd *psd = &SD[plex->sdnos[m.psdno]];

                if (psd->state == sd_down)
                    set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
                else if (psd->state == sd_crashed)
                    set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
                m.flags &= ~XFR_NORMAL_WRITE; /* this write isn't normal, */
                m.flags |= XFR_PARITYLESS_WRITE; /* it's parityless */
                plex->parityless_writes++; /* count another one */
            }
        }

        /* reset the initial transfer values */
        m.dataoffset = m.initoffset; /* start at the beginning of the transfer */
        m.datalen = m.initlen;

        /* decide how many requests we need */
        if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
            /* doing a recovery read or degraded write, */
            m.rqcount = plex->subdisks; /* all subdisks */
        else if (m.flags & XFR_NORMAL_WRITE) /* normal write, */
            m.rqcount = m.sdcount + 1; /* all data blocks and the parity block */
        else /* parityless write or normal read */
            m.rqcount = m.sdcount; /* just the data blocks */
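
        /*
         * For illustration only (continuing the hypothetical numbers
         * used above): on a 5-subdisk plex, a normal write touching
         * 3 data blocks needs m.rqcount = 3 + 1 = 4 requests (the
         * data blocks plus the parity block), while a recovery read
         * or degraded write always needs one request per subdisk,
         * here 5.
         */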

        /* Part C: build the requests */
        rqg = allocrqg(rq, m.rqcount); /* get a request group */
        if (rqg == NULL) { /* malloc failed */
            bp->b_error = ENOMEM;
            bp->b_ioflags |= BIO_ERROR;
            return REQUEST_ENOMEM;
        }
        rqg->plexno = plexno;
        rqg->flags = m.flags;
        rqno = 0; /* index in the request group */

        /* 1: PARITY BLOCK */
        /*
         * Are we performing an operation which requires parity? In that case,
         * work out the parameters and define the parity block.
         * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
         */
        if (m.flags & XFR_PARITYOP) { /* need parity */
            rqe = &rqg->rqe[rqno]; /* point to element */
            sd = &SD[plex->sdnos[m.psdno]]; /* the subdisk in question */
            rqe->rqg = rqg; /* point back to group */
            rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
                &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE); /* transfer flags without data op stuff */
            setrqebounds(rqe, &m); /* set up the bounds of the transfer */
            rqe->sdno = sd->sdno; /* subdisk number */
            rqe->driveno = sd->driveno;
            if (build_rq_buffer(rqe, plex)) /* build the buffer */
                return REQUEST_ENOMEM; /* can't do it */
            rqe->b.b_iocmd = BIO_READ; /* we must read first */
            m.sdcount++; /* adjust the subdisk count */
            rqno++; /* and point to the next request */
        }
        /*
         * 2: DATA BLOCKS
         * Now build up requests for the blocks required
         * for individual transfers
         */
        for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
            if (mysdno == m.psdno) /* parity, */
                mysdno++; /* we've given already */
            if (mysdno == plex->subdisks) /* got to the end, */
                mysdno = 0; /* wrap around */
            if (mysdno == m.psdno) /* parity, */
                mysdno++; /* we've given already */

            rqe = &rqg->rqe[rqno]; /* point to element */
            sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */
            rqe->rqg = rqg; /* point to group */
            if (m.flags & XFR_NEEDS_MALLOC) /* we need a malloced buffer first */
                rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
            else
                rqe->flags = m.flags | XFR_DATA_BLOCK; /* transfer flags */
            if (mysdno == m.badsdno) { /* this is the bad subdisk */
                rqg->badsdno = rqno; /* note which one */
                rqe->flags |= XFR_BAD_SUBDISK; /* note that it's dead */
                /*
                 * we can't read or write from/to it,
                 * but we don't need to malloc
                 */
                rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
            }
            setrqebounds(rqe, &m); /* set up the bounds of the transfer */
            rqe->useroffset = m.useroffset; /* offset in user buffer */
            rqe->sdno = sd->sdno; /* subdisk number */
            rqe->driveno = sd->driveno;
            if (build_rq_buffer(rqe, plex)) /* build the buffer */
                return REQUEST_ENOMEM; /* can't do it */
            if ((m.flags & XFR_PARITYOP) /* parity operation, */
                && ((m.flags & XFR_BAD_SUBDISK) == 0)) /* and not the bad subdisk, */
                rqe->b.b_iocmd = BIO_READ; /* we must read first */

            /* Now update pointers for the next block */
            *diskaddr += m.datalen; /* skip past what we've done */
            m.stripesectors -= m.datalen; /* deduct from what's left */
            m.useroffset += m.datalen; /* and move on in the user buffer */
            m.datalen = min(m.stripesectors, plex->stripesize); /* and recalculate */
            m.dataoffset = 0; /* start at the beginning of next block */
        }

        /*
         * 3: REMAINING BLOCKS FOR RECOVERY
         * Finally, if we have a recovery operation, build
         * up transfers for the other subdisks. Follow the
         * subdisks around until we get to where we started.
         * These requests use only the group parameters.
         */
        if ((rqno < m.rqcount) /* haven't done them all already */
            && (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
            for (; rqno < m.rqcount; rqno++, mysdno++) {
                if (mysdno == m.psdno) /* parity, */
                    mysdno++; /* we've given already */
                if (mysdno == plex->subdisks) /* got to the end, */
                    mysdno = 0; /* wrap around */
                if (mysdno == m.psdno) /* parity, */
                    mysdno++; /* we've given already */

                rqe = &rqg->rqe[rqno]; /* point to element */
                sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */
                rqe->rqg = rqg; /* point to group */

                rqe->sdoffset = m.sdbase + m.groupoffset; /* start of transfer */
                rqe->dataoffset = 0; /* for tidiness' sake */
                rqe->groupoffset = 0; /* group starts at the beginning */
                rqe->datalen = 0;
                rqe->grouplen = m.grouplen;
                rqe->buflen = m.grouplen;
                rqe->flags = (m.flags | XFR_MALLOCED) /* transfer flags without data op stuff */
                    &~XFR_DATAOP;
                rqe->sdno = sd->sdno; /* subdisk number */
                rqe->driveno = sd->driveno;
                if (build_rq_buffer(rqe, plex)) /* build the buffer */
                    return REQUEST_ENOMEM; /* can't do it */
                rqe->b.b_iocmd = BIO_READ; /* we must read first */
            }
        }
        /*
         * We need to lock the address range before
         * doing anything. We don't have to be
         * performing a recovery operation: somebody
         * else could be doing so, and the results could
         * influence us. Note the fact here, we'll perform
         * the lock in launch_requests.
         */
        rqg->lockbase = m.stripebase;
        if (*diskaddr < diskend) /* didn't finish the request on this stripe */
            plex->multistripe++; /* count another one */
    }
    return REQUEST_OK;
}

/*
 * Helper function for bre5: adjust the bounds of
 * the transfers to minimize the buffer
 * allocation.
 *
 * Each request can handle two of three different
 * data ranges:
 *
 * 1. The range described by the parameters
 *    dataoffset and datalen, for normal read or
 *    parityless write.
 * 2. The range described by the parameters
 *    groupoffset and grouplen, for recovery read
 *    and degraded write.
 * 3. For normal write, the range depends on the
 *    kind of block. For data blocks, the range
 *    is defined by dataoffset and datalen. For
 *    parity blocks, it is defined by writeoffset
 *    and writelen.
 *
 * In order not to allocate more memory than
 * necessary, this function adjusts the bounds
 * parameter for each request to cover just the
 * minimum necessary for the function it performs.
 * This will normally vary from one request to the
 * next.
 *
 * Things are slightly different for the parity
 * block. In this case, the bounds defined by
 * mp->writeoffset and mp->writelen also play a
 * rôle. This case applies when rqe->flags has
 * both XFR_NORMAL_WRITE and XFR_PARITY_BLOCK set.
 */
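/*
 * For illustration only (hypothetical numbers): for a request that
 * combines a normal read with a recovery read (case 1 plus case 2),
 * suppose mp->sdbase = 128, mp->dataoffset = 40, mp->datalen = 24,
 * mp->groupoffset = 0 and mp->grouplen = 64.  The group range starts
 * lower, so the transfer starts at subdisk offset 128, the data part
 * begins 40 sectors into the buffer, and the buffer length is
 * max(40 + 24, 0 + 64) = 64 sectors, just enough to cover both ranges.
 */
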
void
setrqebounds(struct rqelement *rqe, struct metrics *mp)
{
    /* parity block of a normal write */
    if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK))
        == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) { /* case 3 */
        if (rqe->flags & XFR_DEGRADED_WRITE) { /* also degraded write */
            /*
             * With a combined normal and degraded write, we
             * will zero out the area of the degraded write
             * in the second phase, so we don't need to read
             * it in. Unfortunately, we need a way to tell
             * build_request_buffer the size of the buffer,
             * and currently that's the length of the read.
             * As a result, we read everything, even the stuff
             * that we're going to nuke.
             * FIXME XXX
             */
            if (mp->groupoffset < mp->writeoffset) { /* group operation starts lower */
                rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
                rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */
                rqe->groupoffset = 0; /* and the group at the beginning */
            } else { /* individual data starts first */
                rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
                rqe->dataoffset = 0; /* individual data starts at the beginning */
                rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */
            }
            rqe->datalen = mp->writelen;
            rqe->grouplen = mp->grouplen;
        } else { /* just normal write (case 3) */
            rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
            rqe->dataoffset = 0; /* data starts at the beginning */
            rqe->groupoffset = 0; /* for tidiness' sake */
            rqe->datalen = mp->writelen;
            rqe->grouplen = 0;
        }
    } else if (rqe->flags & XFR_DATAOP) { /* data operation (case 1 or 3) */
        if (rqe->flags & XFR_GROUPOP) { /* also a group operation (case 2) */
            if (mp->groupoffset < mp->dataoffset) { /* group operation starts lower */
                rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
                rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */
                rqe->groupoffset = 0; /* and the group at the beginning */
            } else { /* individual data starts first */
                rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
                rqe->dataoffset = 0; /* individual data starts at the beginning */
                rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */
            }
            rqe->datalen = mp->datalen;
            rqe->grouplen = mp->grouplen;
        } else { /* just data operation (case 1) */
            rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
            rqe->dataoffset = 0; /* data starts at the beginning */
            rqe->groupoffset = 0; /* for tidiness' sake */
            rqe->datalen = mp->datalen;
            rqe->grouplen = 0;
        }
    } else { /* just group operations (case 2) */
        rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
        rqe->dataoffset = 0; /* for tidiness' sake */
        rqe->groupoffset = 0; /* group starts at the beginning */
        rqe->datalen = 0;
        rqe->grouplen = mp->grouplen;
    }
    rqe->buflen = max(rqe->dataoffset + rqe->datalen, /* total buffer length */
        rqe->groupoffset + rqe->grouplen);
}
/* Local Variables: */
/* fill-column: 50 */
/* End: */