1 /*-
2 * Copyright (c) 2004, 2007 Lukas Ertl
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: releng/10.0/sys/geom/vinum/geom_vinum_raid5.c 191856 2009-05-06 19:34:32Z lulf $");
29
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/lock.h>
33 #include <sys/malloc.h>
34 #include <sys/systm.h>
35
36 #include <geom/geom.h>
37 #include <geom/vinum/geom_vinum_var.h>
38 #include <geom/vinum/geom_vinum_raid5.h>
39 #include <geom/vinum/geom_vinum.h>
40
41 static int gv_raid5_offset(struct gv_plex *, off_t, off_t,
42 off_t *, off_t *, int *, int *, int);
43 static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *,
44 struct gv_raid5_packet *, caddr_t, int);
45 static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
46 struct bio *, caddr_t, off_t, off_t, int *);
47 static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
48 struct bio *, caddr_t, off_t, off_t);
49 static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
50 struct bio *, caddr_t, off_t, off_t);
51
52 struct gv_raid5_packet *
53 gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
54 off_t bcount)
55 {
56 struct bio *cbp;
57 struct gv_raid5_packet *wp, *wp2;
58 struct gv_bioq *bq, *bq2;
59 int err, delay;
60
61 delay = 0;
62 wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
63 wp->bio = bp;
64 wp->waiting = NULL;
65 wp->parity = NULL;
66 TAILQ_INIT(&wp->bits);
67
68 if (bp->bio_pflags & GV_BIO_REBUILD)
69 err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
70 else if (bp->bio_pflags & GV_BIO_CHECK)
71 err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
72 else
73 err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);
74
75 /* Means we have a delayed request. */
76 if (delay) {
77 g_free(wp);
78 return (NULL);
79 }
80
81 /*
82 * Building the sub-request failed, we probably need to clean up a lot.
83 */
84 if (err) {
85 G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
86 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
87 TAILQ_REMOVE(&wp->bits, bq, queue);
88 g_free(bq);
89 }
90 if (wp->waiting != NULL) {
91 if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
92 g_free(wp->waiting->bio_data);
93 g_destroy_bio(wp->waiting);
94 }
95 if (wp->parity != NULL) {
96 if (wp->parity->bio_cflags & GV_BIO_MALLOC)
97 g_free(wp->parity->bio_data);
98 g_destroy_bio(wp->parity);
99 }
100 g_free(wp);
101
102 TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
103 if (wp->bio != bp)
104 continue;
105
106 TAILQ_REMOVE(&p->packets, wp, list);
107 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
108 TAILQ_REMOVE(&wp->bits, bq, queue);
109 g_free(bq);
110 }
111 g_free(wp);
112 }
113
114 cbp = bioq_takefirst(p->bqueue);
115 while (cbp != NULL) {
116 if (cbp->bio_cflags & GV_BIO_MALLOC)
117 g_free(cbp->bio_data);
118 g_destroy_bio(cbp);
119 cbp = bioq_takefirst(p->bqueue);
120 }
121
122 /* If internal, stop and reset state. */
123 if (bp->bio_pflags & GV_BIO_INTERNAL) {
124 if (bp->bio_pflags & GV_BIO_MALLOC)
125 g_free(bp->bio_data);
126 g_destroy_bio(bp);
127 /* Reset flags. */
128 p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
129 GV_PLEX_GROWING);
130 return (NULL);
131 }
132 g_io_deliver(bp, err);
133 return (NULL);
134 }
135
136 return (wp);
137 }
138
139 /*
140 * Check if the stripe that the work packet wants is already being used by
141 * some other work packet.
142 */
143 int
144 gv_stripe_active(struct gv_plex *p, struct bio *bp)
145 {
146 struct gv_raid5_packet *wp, *owp;
147 int overlap;
148
149 wp = bp->bio_caller2;
150 if (wp->lockbase == -1)
151 return (0);
152
153 overlap = 0;
154 TAILQ_FOREACH(owp, &p->packets, list) {
155 if (owp == wp)
156 break;
157 if ((wp->lockbase >= owp->lockbase) &&
158 (wp->lockbase <= owp->lockbase + owp->length)) {
159 overlap++;
160 break;
161 }
162 if ((wp->lockbase <= owp->lockbase) &&
163 (wp->lockbase + wp->length >= owp->lockbase)) {
164 overlap++;
165 break;
166 }
167 }
168
169 return (overlap);
170 }
171
172 static int
173 gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
174 caddr_t addr, off_t boff, off_t bcount)
175 {
176 struct gv_sd *parity, *s;
177 struct gv_bioq *bq;
178 struct bio *cbp;
179 int i, psdno;
180 off_t real_len, real_off;
181
182 if (p == NULL || LIST_EMPTY(&p->subdisks))
183 return (ENXIO);
184
185 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);
186
187 /* Find the right subdisk. */
188 parity = NULL;
189 i = 0;
190 LIST_FOREACH(s, &p->subdisks, in_plex) {
191 if (i == psdno) {
192 parity = s;
193 break;
194 }
195 i++;
196 }
197
198 /* Parity stripe not found. */
199 if (parity == NULL)
200 return (ENXIO);
201
202 if (parity->state != GV_SD_UP)
203 return (ENXIO);
204
205 wp->length = real_len;
206 wp->data = addr;
207 wp->lockbase = real_off;
208
209 /* Read all subdisks. */
210 LIST_FOREACH(s, &p->subdisks, in_plex) {
211 /* Skip the parity subdisk. */
212 if (s == parity)
213 continue;
214 /* Skip growing subdisks. */
215 if (s->flags & GV_SD_GROW)
216 continue;
217
218 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
219 if (cbp == NULL)
220 return (ENOMEM);
221 cbp->bio_cmd = BIO_READ;
222
223 bioq_insert_tail(p->bqueue, cbp);
224
225 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
226 bq->bp = cbp;
227 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
228 }
229
230 /* Read the parity data. */
231 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
232 if (cbp == NULL)
233 return (ENOMEM);
234 cbp->bio_cmd = BIO_READ;
235 wp->waiting = cbp;
236
237 /*
238 * In case we want to rebuild the parity, create an extra BIO to write
239 * it out. It also acts as buffer for the XOR operations.
240 */
241 cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
242 if (cbp == NULL)
243 return (ENOMEM);
244 wp->parity = cbp;
245
246 return (0);
247 }
248
249 /* Rebuild a degraded RAID5 plex. */
250 static int
251 gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
252 caddr_t addr, off_t boff, off_t bcount)
253 {
254 struct gv_sd *broken, *s;
255 struct gv_bioq *bq;
256 struct bio *cbp;
257 off_t real_len, real_off;
258
259 if (p == NULL || LIST_EMPTY(&p->subdisks))
260 return (ENXIO);
261
262 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);
263
264 /* Find the right subdisk. */
265 broken = NULL;
266 LIST_FOREACH(s, &p->subdisks, in_plex) {
267 if (s->state != GV_SD_UP)
268 broken = s;
269 }
270
271 /* Broken stripe not found. */
272 if (broken == NULL)
273 return (ENXIO);
274
275 switch (broken->state) {
276 case GV_SD_UP:
277 return (EINVAL);
278
279 case GV_SD_STALE:
280 if (!(bp->bio_pflags & GV_BIO_REBUILD))
281 return (ENXIO);
282
283 G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
284 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
285 /* Set this bit now, but should be set at end. */
286 broken->flags |= GV_SD_CANGOUP;
287 break;
288
289 case GV_SD_REVIVING:
290 break;
291
292 default:
293 /* All other subdisk states mean it's not accessible. */
294 return (ENXIO);
295 }
296
297 wp->length = real_len;
298 wp->data = addr;
299 wp->lockbase = real_off;
300
301 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
302
303 /* Read all subdisks. */
304 LIST_FOREACH(s, &p->subdisks, in_plex) {
305 /* Skip the broken subdisk. */
306 if (s == broken)
307 continue;
308
309 /* Skip growing subdisks. */
310 if (s->flags & GV_SD_GROW)
311 continue;
312
313 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
314 if (cbp == NULL)
315 return (ENOMEM);
316 cbp->bio_cmd = BIO_READ;
317
318 bioq_insert_tail(p->bqueue, cbp);
319
320 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
321 bq->bp = cbp;
322 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
323 }
324
325 /* Write the parity data. */
326 cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
327 if (cbp == NULL)
328 return (ENOMEM);
329 wp->parity = cbp;
330
331 p->synced = boff;
332
333 /* Post notification that we're finished. */
334 return (0);
335 }
336
337 /* Build a request group to perform (part of) a RAID5 request. */
338 static int
339 gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
340 struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
341 {
342 struct g_geom *gp;
343 struct gv_sd *broken, *original, *parity, *s;
344 struct gv_bioq *bq;
345 struct bio *cbp;
346 int i, psdno, sdno, type, grow;
347 off_t real_len, real_off;
348
349 gp = bp->bio_to->geom;
350
351 if (p == NULL || LIST_EMPTY(&p->subdisks))
352 return (ENXIO);
353
354 /* We are optimistic and assume that this request will be OK. */
355 #define REQ_TYPE_NORMAL 0
356 #define REQ_TYPE_DEGRADED 1
357 #define REQ_TYPE_NOPARITY 2
358
359 type = REQ_TYPE_NORMAL;
360 original = parity = broken = NULL;
361
362 /* XXX: The resize won't crash with rebuild or sync, but we should still
363 * be aware of it. Also this should perhaps be done on rebuild/check as
364 * well?
365 */
366 /* If we're over, we must use the old. */
367 if (boff >= p->synced) {
368 grow = 1;
369 /* Or if over the resized offset, we use all drives. */
370 } else if (boff + bcount <= p->synced) {
371 grow = 0;
372 /* Else, we're in the middle, and must wait a bit. */
373 } else {
374 bioq_disksort(p->rqueue, bp);
375 *delay = 1;
376 return (0);
377 }
378 gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
379 &sdno, &psdno, grow);
380
381 /* Find the right subdisks. */
382 i = 0;
383 LIST_FOREACH(s, &p->subdisks, in_plex) {
384 if (i == sdno)
385 original = s;
386 if (i == psdno)
387 parity = s;
388 if (s->state != GV_SD_UP)
389 broken = s;
390 i++;
391 }
392
393 if ((original == NULL) || (parity == NULL))
394 return (ENXIO);
395
396 /* Our data stripe is missing. */
397 if (original->state != GV_SD_UP)
398 type = REQ_TYPE_DEGRADED;
399
400 /* If synchronizing request, just write it if disks are stale. */
401 if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
402 bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
403 type = REQ_TYPE_NORMAL;
404 /* Our parity stripe is missing. */
405 } else if (parity->state != GV_SD_UP) {
406 /* We cannot take another failure if we're already degraded. */
407 if (type != REQ_TYPE_NORMAL)
408 return (ENXIO);
409 else
410 type = REQ_TYPE_NOPARITY;
411 }
412
413 wp->length = real_len;
414 wp->data = addr;
415 wp->lockbase = real_off;
416
417 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
418
419 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
420 type = REQ_TYPE_NORMAL;
421
422 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
423 bioq_disksort(p->rqueue, bp);
424 *delay = 1;
425 return (0);
426 }
427
428 switch (bp->bio_cmd) {
429 case BIO_READ:
430 /*
431 * For a degraded read we need to read in all stripes except
432 * the broken one plus the parity stripe and then recalculate
433 * the desired data.
434 */
435 if (type == REQ_TYPE_DEGRADED) {
436 bzero(wp->data, wp->length);
437 LIST_FOREACH(s, &p->subdisks, in_plex) {
438 /* Skip the broken subdisk. */
439 if (s == broken)
440 continue;
441 /* Skip growing if within offset. */
442 if (grow && s->flags & GV_SD_GROW)
443 continue;
444 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
445 if (cbp == NULL)
446 return (ENOMEM);
447
448 bioq_insert_tail(p->bqueue, cbp);
449
450 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
451 bq->bp = cbp;
452 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
453 }
454
455 /* A normal read can be fulfilled with the original subdisk. */
456 } else {
457 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
458 if (cbp == NULL)
459 return (ENOMEM);
460
461 bioq_insert_tail(p->bqueue, cbp);
462 }
463 wp->lockbase = -1;
464
465 break;
466
467 case BIO_WRITE:
468 /*
469 * A degraded write means we cannot write to the original data
470 * subdisk. Thus we need to read in all valid stripes,
471 * recalculate the parity from the original data, and then
472 * write the parity stripe back out.
473 */
474 if (type == REQ_TYPE_DEGRADED) {
475 /* Read all subdisks. */
476 LIST_FOREACH(s, &p->subdisks, in_plex) {
477 /* Skip the broken and the parity subdisk. */
478 if ((s == broken) || (s == parity))
479 continue;
480 /* Skip growing if within offset. */
481 if (grow && s->flags & GV_SD_GROW)
482 continue;
483
484 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
485 if (cbp == NULL)
486 return (ENOMEM);
487 cbp->bio_cmd = BIO_READ;
488
489 bioq_insert_tail(p->bqueue, cbp);
490
491 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
492 bq->bp = cbp;
493 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
494 }
495
496 /* Write the parity data. */
497 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
498 if (cbp == NULL)
499 return (ENOMEM);
500 bcopy(addr, cbp->bio_data, wp->length);
501 wp->parity = cbp;
502
503 /*
504 * When the parity stripe is missing we just write out the data.
505 */
506 } else if (type == REQ_TYPE_NOPARITY) {
507 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
508 if (cbp == NULL)
509 return (ENOMEM);
510
511 bioq_insert_tail(p->bqueue, cbp);
512
513 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
514 bq->bp = cbp;
515 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
516
517 /*
518 * A normal write request goes to the original subdisk, then we
519 * read in all other stripes, recalculate the parity and write
520 * out the parity again.
521 */
522 } else {
523 /* Read old parity. */
524 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
525 if (cbp == NULL)
526 return (ENOMEM);
527 cbp->bio_cmd = BIO_READ;
528
529 bioq_insert_tail(p->bqueue, cbp);
530
531 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
532 bq->bp = cbp;
533 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
534
535 /* Read old data. */
536 cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
537 if (cbp == NULL)
538 return (ENOMEM);
539 cbp->bio_cmd = BIO_READ;
540
541 bioq_insert_tail(p->bqueue, cbp);
542
543 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
544 bq->bp = cbp;
545 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
546
547 /* Write new data. */
548 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
549 if (cbp == NULL)
550 return (ENOMEM);
551
552 /*
553 * We must not write the new data until the old data
554 * was read, so hold this BIO back until we're ready
555 * for it.
556 */
557 wp->waiting = cbp;
558
559 /* The final bio for the parity. */
560 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
561 if (cbp == NULL)
562 return (ENOMEM);
563
564 /* Remember that this is the BIO for the parity data. */
565 wp->parity = cbp;
566 }
567 break;
568
569 default:
570 return (EINVAL);
571 }
572
573 return (0);
574 }
575
576 /*
577 * Calculate the offsets in the various subdisks for a RAID5 request. Also take
578 * care of new subdisks in an expanded RAID5 array.
579 * XXX: This assumes that the new subdisks are inserted after the others (which
580 * is okay as long as plex_offset is larger). If subdisks are inserted into the
581 * plexlist before, we get problems.
582 */
583 static int
584 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
585 off_t *real_len, int *sdno, int *psdno, int growing)
586 {
587 struct gv_sd *s;
588 int sd, psd, sdcount;
589 off_t len_left, stripeend, stripeoff, stripestart;
590
591 sdcount = p->sdcount;
592 if (growing) {
593 LIST_FOREACH(s, &p->subdisks, in_plex) {
594 if (s->flags & GV_SD_GROW)
595 sdcount--;
596 }
597 }
598
599 /* The number of the subdisk containing the parity stripe. */
600 psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
601 sdcount;
602 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
603
604 /* Offset of the start address from the start of the stripe. */
605 stripeoff = boff % (p->stripesize * (sdcount - 1));
606 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
607
608 /* The number of the subdisk where the stripe resides. */
609 sd = stripeoff / p->stripesize;
610 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
611
612 /* At or past parity subdisk. */
613 if (sd >= psd)
614 sd++;
615
616 /* The offset of the stripe on this subdisk. */
617 stripestart = (boff - stripeoff) / (sdcount - 1);
618 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
619
620 stripeoff %= p->stripesize;
621
622 /* The offset of the request on this subdisk. */
623 *real_off = stripestart + stripeoff;
624
625 stripeend = stripestart + p->stripesize;
626 len_left = stripeend - *real_off;
627 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
628
629 *real_len = (bcount <= len_left) ? bcount : len_left;
630
631 if (sdno != NULL)
632 *sdno = sd;
633 if (psdno != NULL)
634 *psdno = psd;
635
636 return (0);
637 }
638
639 static struct bio *
640 gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
641 caddr_t addr, int use_wp)
642 {
643 struct bio *cbp;
644
645 cbp = g_clone_bio(bp);
646 if (cbp == NULL)
647 return (NULL);
648 if (addr == NULL) {
649 cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
650 cbp->bio_cflags |= GV_BIO_MALLOC;
651 } else
652 cbp->bio_data = addr;
653 cbp->bio_offset = wp->lockbase + s->drive_offset;
654 cbp->bio_length = wp->length;
655 cbp->bio_done = gv_done;
656 cbp->bio_caller1 = s;
657 if (use_wp)
658 cbp->bio_caller2 = wp;
659
660 return (cbp);
661 }
Cache object: 962aa32aaff7aec9d4fe7d9fe8611b02
|