1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2004, 2007 Lukas Ertl
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/bio.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/systm.h>
37
38 #include <geom/geom.h>
39 #include <geom/geom_dbg.h>
40 #include <geom/vinum/geom_vinum_var.h>
41 #include <geom/vinum/geom_vinum_raid5.h>
42 #include <geom/vinum/geom_vinum.h>
43
44 static int gv_raid5_offset(struct gv_plex *, off_t, off_t,
45 off_t *, off_t *, int *, int *, int);
46 static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *,
47 struct gv_raid5_packet *, caddr_t, int);
48 static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
49 struct bio *, caddr_t, off_t, off_t, int *);
50 static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *,
51 struct bio *, caddr_t, off_t, off_t);
52 static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *,
53 struct bio *, caddr_t, off_t, off_t);
54
55 struct gv_raid5_packet *
56 gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff,
57 off_t bcount)
58 {
59 struct bio *cbp;
60 struct gv_raid5_packet *wp, *wp2;
61 struct gv_bioq *bq, *bq2;
62 int err, delay;
63
64 delay = 0;
65 wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
66 wp->bio = bp;
67 wp->waiting = NULL;
68 wp->parity = NULL;
69 TAILQ_INIT(&wp->bits);
70
71 if (bp->bio_pflags & GV_BIO_REBUILD)
72 err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount);
73 else if (bp->bio_pflags & GV_BIO_CHECK)
74 err = gv_raid5_check(p, wp, bp, addr, boff, bcount);
75 else
76 err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay);
77
78 /* Means we have a delayed request. */
79 if (delay) {
80 g_free(wp);
81 return (NULL);
82 }
83
84 /*
85 * Building the sub-request failed, we probably need to clean up a lot.
86 */
87 if (err) {
88 G_VINUM_LOGREQ(0, bp, "raid5 plex request failed.");
89 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
90 TAILQ_REMOVE(&wp->bits, bq, queue);
91 g_free(bq);
92 }
93 if (wp->waiting != NULL) {
94 if (wp->waiting->bio_cflags & GV_BIO_MALLOC)
95 g_free(wp->waiting->bio_data);
96 gv_drive_done(wp->waiting->bio_caller1);
97 g_destroy_bio(wp->waiting);
98 }
99 if (wp->parity != NULL) {
100 if (wp->parity->bio_cflags & GV_BIO_MALLOC)
101 g_free(wp->parity->bio_data);
102 gv_drive_done(wp->parity->bio_caller1);
103 g_destroy_bio(wp->parity);
104 }
105 g_free(wp);
106
107 TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
108 if (wp->bio != bp)
109 continue;
110
111 TAILQ_REMOVE(&p->packets, wp, list);
112 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
113 TAILQ_REMOVE(&wp->bits, bq, queue);
114 g_free(bq);
115 }
116 g_free(wp);
117 }
118
119 cbp = bioq_takefirst(p->bqueue);
120 while (cbp != NULL) {
121 if (cbp->bio_cflags & GV_BIO_MALLOC)
122 g_free(cbp->bio_data);
123 gv_drive_done(cbp->bio_caller1);
124 g_destroy_bio(cbp);
125 cbp = bioq_takefirst(p->bqueue);
126 }
127
128 /* If internal, stop and reset state. */
129 if (bp->bio_pflags & GV_BIO_INTERNAL) {
130 if (bp->bio_pflags & GV_BIO_MALLOC)
131 g_free(bp->bio_data);
132 g_destroy_bio(bp);
133 /* Reset flags. */
134 p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
135 GV_PLEX_GROWING);
136 return (NULL);
137 }
138 g_io_deliver(bp, err);
139 return (NULL);
140 }
141
142 return (wp);
143 }
144
145 /*
146 * Check if the stripe that the work packet wants is already being used by
147 * some other work packet.
148 */
149 int
150 gv_stripe_active(struct gv_plex *p, struct bio *bp)
151 {
152 struct gv_raid5_packet *wp, *owp;
153 int overlap;
154
155 wp = bp->bio_caller2;
156 if (wp->lockbase == -1)
157 return (0);
158
159 overlap = 0;
160 TAILQ_FOREACH(owp, &p->packets, list) {
161 if (owp == wp)
162 break;
163 if ((wp->lockbase >= owp->lockbase) &&
164 (wp->lockbase <= owp->lockbase + owp->length)) {
165 overlap++;
166 break;
167 }
168 if ((wp->lockbase <= owp->lockbase) &&
169 (wp->lockbase + wp->length >= owp->lockbase)) {
170 overlap++;
171 break;
172 }
173 }
174
175 return (overlap);
176 }
177
178 static int
179 gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
180 caddr_t addr, off_t boff, off_t bcount)
181 {
182 struct gv_sd *parity, *s;
183 struct gv_bioq *bq;
184 struct bio *cbp;
185 int i, psdno;
186 off_t real_len, real_off;
187
188 if (p == NULL || LIST_EMPTY(&p->subdisks))
189 return (ENXIO);
190
191 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1);
192
193 /* Find the right subdisk. */
194 parity = NULL;
195 i = 0;
196 LIST_FOREACH(s, &p->subdisks, in_plex) {
197 if (i == psdno) {
198 parity = s;
199 break;
200 }
201 i++;
202 }
203
204 /* Parity stripe not found. */
205 if (parity == NULL)
206 return (ENXIO);
207
208 if (parity->state != GV_SD_UP)
209 return (ENXIO);
210
211 wp->length = real_len;
212 wp->data = addr;
213 wp->lockbase = real_off;
214
215 /* Read all subdisks. */
216 LIST_FOREACH(s, &p->subdisks, in_plex) {
217 /* Skip the parity subdisk. */
218 if (s == parity)
219 continue;
220 /* Skip growing subdisks. */
221 if (s->flags & GV_SD_GROW)
222 continue;
223
224 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
225 if (cbp == NULL)
226 return (ENOMEM);
227 cbp->bio_cmd = BIO_READ;
228
229 bioq_insert_tail(p->bqueue, cbp);
230
231 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
232 bq->bp = cbp;
233 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
234 }
235
236 /* Read the parity data. */
237 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
238 if (cbp == NULL)
239 return (ENOMEM);
240 cbp->bio_cmd = BIO_READ;
241 wp->waiting = cbp;
242
243 /*
244 * In case we want to rebuild the parity, create an extra BIO to write
245 * it out. It also acts as buffer for the XOR operations.
246 */
247 cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1);
248 if (cbp == NULL)
249 return (ENOMEM);
250 wp->parity = cbp;
251
252 return (0);
253 }
254
255 /* Rebuild a degraded RAID5 plex. */
256 static int
257 gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
258 caddr_t addr, off_t boff, off_t bcount)
259 {
260 struct gv_sd *broken, *s;
261 struct gv_bioq *bq;
262 struct bio *cbp;
263 off_t real_len, real_off;
264
265 if (p == NULL || LIST_EMPTY(&p->subdisks))
266 return (ENXIO);
267
268 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1);
269
270 /* Find the right subdisk. */
271 broken = NULL;
272 LIST_FOREACH(s, &p->subdisks, in_plex) {
273 if (s->state != GV_SD_UP)
274 broken = s;
275 }
276
277 /* Broken stripe not found. */
278 if (broken == NULL)
279 return (ENXIO);
280
281 switch (broken->state) {
282 case GV_SD_UP:
283 return (EINVAL);
284
285 case GV_SD_STALE:
286 if (!(bp->bio_pflags & GV_BIO_REBUILD))
287 return (ENXIO);
288
289 G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
290 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
291 /* Set this bit now, but should be set at end. */
292 broken->flags |= GV_SD_CANGOUP;
293 break;
294
295 case GV_SD_REVIVING:
296 break;
297
298 default:
299 /* All other subdisk states mean it's not accessible. */
300 return (ENXIO);
301 }
302
303 wp->length = real_len;
304 wp->data = addr;
305 wp->lockbase = real_off;
306
307 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
308
309 /* Read all subdisks. */
310 LIST_FOREACH(s, &p->subdisks, in_plex) {
311 /* Skip the broken subdisk. */
312 if (s == broken)
313 continue;
314
315 /* Skip growing subdisks. */
316 if (s->flags & GV_SD_GROW)
317 continue;
318
319 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
320 if (cbp == NULL)
321 return (ENOMEM);
322 cbp->bio_cmd = BIO_READ;
323
324 bioq_insert_tail(p->bqueue, cbp);
325
326 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
327 bq->bp = cbp;
328 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
329 }
330
331 /* Write the parity data. */
332 cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1);
333 if (cbp == NULL)
334 return (ENOMEM);
335 wp->parity = cbp;
336
337 p->synced = boff;
338
339 /* Post notification that we're finished. */
340 return (0);
341 }
342
343 /* Build a request group to perform (part of) a RAID5 request. */
344 static int
345 gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp,
346 struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay)
347 {
348 struct gv_sd *broken, *original, *parity, *s;
349 struct gv_bioq *bq;
350 struct bio *cbp;
351 int i, psdno, sdno, type, grow;
352 off_t real_len, real_off;
353
354 if (p == NULL || LIST_EMPTY(&p->subdisks))
355 return (ENXIO);
356
357 /* We are optimistic and assume that this request will be OK. */
358 #define REQ_TYPE_NORMAL 0
359 #define REQ_TYPE_DEGRADED 1
360 #define REQ_TYPE_NOPARITY 2
361
362 type = REQ_TYPE_NORMAL;
363 original = parity = broken = NULL;
364
365 /* XXX: The resize won't crash with rebuild or sync, but we should still
366 * be aware of it. Also this should perhaps be done on rebuild/check as
367 * well?
368 */
369 /* If we're over, we must use the old. */
370 if (boff >= p->synced) {
371 grow = 1;
372 /* Or if over the resized offset, we use all drives. */
373 } else if (boff + bcount <= p->synced) {
374 grow = 0;
375 /* Else, we're in the middle, and must wait a bit. */
376 } else {
377 bioq_disksort(p->rqueue, bp);
378 *delay = 1;
379 return (0);
380 }
381 gv_raid5_offset(p, boff, bcount, &real_off, &real_len,
382 &sdno, &psdno, grow);
383
384 /* Find the right subdisks. */
385 i = 0;
386 LIST_FOREACH(s, &p->subdisks, in_plex) {
387 if (i == sdno)
388 original = s;
389 if (i == psdno)
390 parity = s;
391 if (s->state != GV_SD_UP)
392 broken = s;
393 i++;
394 }
395
396 if ((original == NULL) || (parity == NULL))
397 return (ENXIO);
398
399 /* Our data stripe is missing. */
400 if (original->state != GV_SD_UP)
401 type = REQ_TYPE_DEGRADED;
402
403 /* If synchronizing request, just write it if disks are stale. */
404 if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE &&
405 bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) {
406 type = REQ_TYPE_NORMAL;
407 /* Our parity stripe is missing. */
408 } else if (parity->state != GV_SD_UP) {
409 /* We cannot take another failure if we're already degraded. */
410 if (type != REQ_TYPE_NORMAL)
411 return (ENXIO);
412 else
413 type = REQ_TYPE_NOPARITY;
414 }
415
416 wp->length = real_len;
417 wp->data = addr;
418 wp->lockbase = real_off;
419
420 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
421
422 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced))
423 type = REQ_TYPE_NORMAL;
424
425 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) {
426 bioq_disksort(p->rqueue, bp);
427 *delay = 1;
428 return (0);
429 }
430
431 switch (bp->bio_cmd) {
432 case BIO_READ:
433 /*
434 * For a degraded read we need to read in all stripes except
435 * the broken one plus the parity stripe and then recalculate
436 * the desired data.
437 */
438 if (type == REQ_TYPE_DEGRADED) {
439 bzero(wp->data, wp->length);
440 LIST_FOREACH(s, &p->subdisks, in_plex) {
441 /* Skip the broken subdisk. */
442 if (s == broken)
443 continue;
444 /* Skip growing if within offset. */
445 if (grow && s->flags & GV_SD_GROW)
446 continue;
447 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
448 if (cbp == NULL)
449 return (ENOMEM);
450
451 bioq_insert_tail(p->bqueue, cbp);
452
453 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
454 bq->bp = cbp;
455 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
456 }
457
458 /* A normal read can be fulfilled with the original subdisk. */
459 } else {
460 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0);
461 if (cbp == NULL)
462 return (ENOMEM);
463
464 bioq_insert_tail(p->bqueue, cbp);
465 }
466 wp->lockbase = -1;
467
468 break;
469
470 case BIO_WRITE:
471 /*
472 * A degraded write means we cannot write to the original data
473 * subdisk. Thus we need to read in all valid stripes,
474 * recalculate the parity from the original data, and then
475 * write the parity stripe back out.
476 */
477 if (type == REQ_TYPE_DEGRADED) {
478 /* Read all subdisks. */
479 LIST_FOREACH(s, &p->subdisks, in_plex) {
480 /* Skip the broken and the parity subdisk. */
481 if ((s == broken) || (s == parity))
482 continue;
483 /* Skip growing if within offset. */
484 if (grow && s->flags & GV_SD_GROW)
485 continue;
486
487 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1);
488 if (cbp == NULL)
489 return (ENOMEM);
490 cbp->bio_cmd = BIO_READ;
491
492 bioq_insert_tail(p->bqueue, cbp);
493
494 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
495 bq->bp = cbp;
496 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
497 }
498
499 /* Write the parity data. */
500 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
501 if (cbp == NULL)
502 return (ENOMEM);
503 bcopy(addr, cbp->bio_data, wp->length);
504 wp->parity = cbp;
505
506 /*
507 * When the parity stripe is missing we just write out the data.
508 */
509 } else if (type == REQ_TYPE_NOPARITY) {
510 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
511 if (cbp == NULL)
512 return (ENOMEM);
513
514 bioq_insert_tail(p->bqueue, cbp);
515
516 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
517 bq->bp = cbp;
518 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
519
520 /*
521 * A normal write request goes to the original subdisk, then we
522 * read in all other stripes, recalculate the parity and write
523 * out the parity again.
524 */
525 } else {
526 /* Read old parity. */
527 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
528 if (cbp == NULL)
529 return (ENOMEM);
530 cbp->bio_cmd = BIO_READ;
531
532 bioq_insert_tail(p->bqueue, cbp);
533
534 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
535 bq->bp = cbp;
536 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
537
538 /* Read old data. */
539 cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1);
540 if (cbp == NULL)
541 return (ENOMEM);
542 cbp->bio_cmd = BIO_READ;
543
544 bioq_insert_tail(p->bqueue, cbp);
545
546 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
547 bq->bp = cbp;
548 TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
549
550 /* Write new data. */
551 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1);
552 if (cbp == NULL)
553 return (ENOMEM);
554
555 /*
556 * We must not write the new data until the old data
557 * was read, so hold this BIO back until we're ready
558 * for it.
559 */
560 wp->waiting = cbp;
561
562 /* The final bio for the parity. */
563 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1);
564 if (cbp == NULL)
565 return (ENOMEM);
566
567 /* Remember that this is the BIO for the parity data. */
568 wp->parity = cbp;
569 }
570 break;
571
572 default:
573 return (EINVAL);
574 }
575
576 return (0);
577 }
578
579 /*
580 * Calculate the offsets in the various subdisks for a RAID5 request. Also take
581 * care of new subdisks in an expanded RAID5 array.
582 * XXX: This assumes that the new subdisks are inserted after the others (which
583 * is okay as long as plex_offset is larger). If subdisks are inserted into the
584 * plexlist before, we get problems.
585 */
586 static int
587 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
588 off_t *real_len, int *sdno, int *psdno, int growing)
589 {
590 struct gv_sd *s;
591 int sd, psd, sdcount;
592 off_t len_left, stripeend, stripeoff, stripestart;
593
594 sdcount = p->sdcount;
595 if (growing) {
596 LIST_FOREACH(s, &p->subdisks, in_plex) {
597 if (s->flags & GV_SD_GROW)
598 sdcount--;
599 }
600 }
601
602 /* The number of the subdisk containing the parity stripe. */
603 psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
604 sdcount;
605 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
606
607 /* Offset of the start address from the start of the stripe. */
608 stripeoff = boff % (p->stripesize * (sdcount - 1));
609 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
610
611 /* The number of the subdisk where the stripe resides. */
612 sd = stripeoff / p->stripesize;
613 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
614
615 /* At or past parity subdisk. */
616 if (sd >= psd)
617 sd++;
618
619 /* The offset of the stripe on this subdisk. */
620 stripestart = (boff - stripeoff) / (sdcount - 1);
621 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
622
623 stripeoff %= p->stripesize;
624
625 /* The offset of the request on this subdisk. */
626 *real_off = stripestart + stripeoff;
627
628 stripeend = stripestart + p->stripesize;
629 len_left = stripeend - *real_off;
630 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
631
632 *real_len = (bcount <= len_left) ? bcount : len_left;
633
634 if (sdno != NULL)
635 *sdno = sd;
636 if (psdno != NULL)
637 *psdno = psd;
638
639 return (0);
640 }
641
642 static struct bio *
643 gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp,
644 caddr_t addr, int use_wp)
645 {
646 struct bio *cbp;
647
648 cbp = g_clone_bio(bp);
649 if (cbp == NULL)
650 return (NULL);
651 if (addr == NULL) {
652 cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO);
653 cbp->bio_cflags |= GV_BIO_MALLOC;
654 } else
655 cbp->bio_data = addr;
656 cbp->bio_offset = wp->lockbase + s->drive_offset;
657 cbp->bio_length = wp->length;
658 cbp->bio_done = gv_done;
659 cbp->bio_caller1 = s;
660 s->drive_sc->active++;
661 if (use_wp)
662 cbp->bio_caller2 = wp;
663
664 return (cbp);
665 }
Cache object: 65567ec86dd8eb3fd248842fdeb7bcd6
|