1 /*-
2 * Copyright (c) 2004 Lukas Ertl
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/conf.h>
33 #include <sys/errno.h>
34 #include <sys/kernel.h>
35 #include <sys/kthread.h>
36 #include <sys/libkern.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/systm.h>
41
42 #include <geom/geom.h>
43 #include <geom/vinum/geom_vinum_var.h>
44 #include <geom/vinum/geom_vinum_raid5.h>
45 #include <geom/vinum/geom_vinum.h>
46
47 int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
48 int *, int *);
49
50 /*
51 * Check if the stripe that the work packet wants is already being used by
52 * some other work packet.
53 */
54 int
55 gv_stripe_active(struct gv_plex *p, struct bio *bp)
56 {
57 struct gv_raid5_packet *wp, *owp;
58 int overlap;
59
60 wp = bp->bio_driver1;
61 if (wp->lockbase == -1)
62 return (0);
63
64 overlap = 0;
65 TAILQ_FOREACH(owp, &p->packets, list) {
66 if (owp == wp)
67 break;
68 if ((wp->lockbase >= owp->lockbase) &&
69 (wp->lockbase <= owp->lockbase + owp->length)) {
70 overlap++;
71 break;
72 }
73 if ((wp->lockbase <= owp->lockbase) &&
74 (wp->lockbase + wp->length >= owp->lockbase)) {
75 overlap++;
76 break;
77 }
78 }
79
80 return (overlap);
81 }
82
/*
 * Set up the BIOs needed to check (and, via wp->parity, rebuild) the parity
 * data of one RAID5 stripe: every data subdisk and the parity subdisk are
 * read, and an extra BIO is prepared that serves as the XOR buffer and as
 * the parity write-back.  Returns 0 on success or an errno value (ENXIO if
 * the plex or parity subdisk is unusable, ENOMEM on clone failure).
 */
int
gv_check_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *parity, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	int i, psdno;
	off_t real_len, real_off;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* Map the plex-relative offset/length to subdisk-relative values. */
	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno);

	/* Find the right subdisk. */
	parity = NULL;
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == psdno) {
			parity = s;
			break;
		}
		i++;
	}

	/* Parity stripe not found. */
	if (parity == NULL)
		return (ENXIO);

	/* Checking parity requires a readable parity subdisk. */
	if (parity->state != GV_SD_UP)
		return (ENXIO);

	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	/* Read all subdisks. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the parity subdisk. */
		if (s == parity)
			continue;

		/*
		 * NOTE(review): on a mid-loop ENOMEM the clones created so
		 * far are not freed here -- presumably the caller tears the
		 * work packet down; confirm against the plex worker.
		 */
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;
		cbp->bio_data = g_malloc(real_len, M_WAITOK);
		/* Mark the buffer so completion knows to free it. */
		cbp->bio_cflags |= GV_BIO_MALLOC;
		cbp->bio_offset = real_off;
		cbp->bio_length = real_len;
		cbp->bio_done = gv_plex_done;
		cbp->bio_caller2 = s->consumer;
		cbp->bio_driver1 = wp;

		GV_ENQUEUE(bp, cbp, pbp);

		/* Track this BIO as an outstanding member of the packet. */
		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/* Read the parity data. */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_cmd = BIO_READ;
	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
	cbp->bio_cflags |= GV_BIO_MALLOC;
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_done = gv_plex_done;
	cbp->bio_caller2 = parity->consumer;
	cbp->bio_driver1 = wp;
	/* Not enqueued here; held in the packet until the worker issues it. */
	wp->waiting = cbp;

	/*
	 * In case we want to rebuild the parity, create an extra BIO to write
	 * it out. It also acts as buffer for the XOR operations.
	 */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_data = addr;
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_done = gv_plex_done;
	cbp->bio_caller2 = parity->consumer;
	cbp->bio_driver1 = wp;
	wp->parity = cbp;

	return (0);
}
176
/*
 * Rebuild a degraded RAID5 plex.
 *
 * Reads the stripe from all healthy subdisks and prepares a write of the
 * reconstructed (XORed) data to the broken subdisk.  A stale subdisk is
 * only revived when the request carries GV_BIO_REBUILD.  Returns 0 on
 * success or an errno value.
 */
int
gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *broken, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	off_t real_len, real_off;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* Map the plex-relative offset/length to subdisk-relative values. */
	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL);

	/* Find the right subdisk (the last one that is not up). */
	broken = NULL;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (s->state != GV_SD_UP)
			broken = s;
	}

	/* Broken stripe not found. */
	if (broken == NULL)
		return (ENXIO);

	switch (broken->state) {
	case GV_SD_UP:
		/* Unreachable: broken is only set for non-UP subdisks. */
		return (EINVAL);

	case GV_SD_STALE:
		/* Only a rebuild request may touch a stale subdisk. */
		if (!(bp->bio_cflags & GV_BIO_REBUILD))
			return (ENXIO);

		printf("GEOM_VINUM: sd %s is reviving\n", broken->name);
		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_REVIVING:
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));

	/* Read all subdisks. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the broken subdisk. */
		if (s == broken)
			continue;

		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;
		cbp->bio_data = g_malloc(real_len, M_WAITOK);
		/* Mark the buffer so completion knows to free it. */
		cbp->bio_cflags |= GV_BIO_MALLOC;
		cbp->bio_offset = real_off;
		cbp->bio_length = real_len;
		cbp->bio_done = gv_plex_done;
		cbp->bio_caller2 = s->consumer;
		cbp->bio_driver1 = wp;

		GV_ENQUEUE(bp, cbp, pbp);

		/* Track this BIO as an outstanding member of the packet. */
		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/* Write the parity data. */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	/* Zero-filled buffer; serves as the XOR accumulator for the reads. */
	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
	cbp->bio_cflags |= GV_BIO_MALLOC;
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_done = gv_plex_done;
	cbp->bio_caller2 = broken->consumer;
	cbp->bio_driver1 = wp;
	cbp->bio_cflags |= GV_BIO_REBUILD;
	wp->parity = cbp;

	/* Remember how far the rebuild has progressed. */
	p->synced = boff;

	return (0);
}
272
/*
 * Build a request group to perform (part of) a RAID5 request.
 *
 * Depending on the state of the data and parity subdisks the request is
 * classified as NORMAL, DEGRADED (data subdisk down) or NOPARITY (parity
 * subdisk down), and the corresponding set of child BIOs is attached to
 * the work packet wp.  Returns 0 on success or an errno value.
 */
int
gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
    struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
	struct g_geom *gp;
	struct gv_sd *broken, *original, *parity, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	int i, psdno, sdno, type;
	off_t real_len, real_off;

	gp = bp->bio_to->geom;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* We are optimistic and assume that this request will be OK. */
#define	REQ_TYPE_NORMAL		0
#define	REQ_TYPE_DEGRADED	1
#define	REQ_TYPE_NOPARITY	2

	type = REQ_TYPE_NORMAL;
	original = parity = broken = NULL;

	/* Map the plex-relative offset/length to subdisk-relative values. */
	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno);

	/* Find the right subdisks. */
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == sdno)
			original = s;
		if (i == psdno)
			parity = s;
		if (s->state != GV_SD_UP)
			broken = s;
		i++;
	}

	if ((original == NULL) || (parity == NULL))
		return (ENXIO);

	/* Our data stripe is missing. */
	if (original->state != GV_SD_UP)
		type = REQ_TYPE_DEGRADED;
	/* Our parity stripe is missing. */
	if (parity->state != GV_SD_UP) {
		/* We cannot take another failure if we're already degraded. */
		if (type != REQ_TYPE_NORMAL)
			return (ENXIO);
		else
			type = REQ_TYPE_NOPARITY;
	}

	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	/*
	 * A stripe below the sync mark of a syncing plex is already
	 * consistent, so it can be treated as a normal request.
	 */
	if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
		type = REQ_TYPE_NORMAL;

	switch (bp->bio_cmd) {
	case BIO_READ:
		/*
		 * For a degraded read we need to read in all stripes except
		 * the broken one plus the parity stripe and then recalculate
		 * the desired data.
		 */
		if (type == REQ_TYPE_DEGRADED) {
			/* Clear the XOR accumulator before folding reads in. */
			bzero(wp->data, wp->length);
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;
				cbp = g_clone_bio(bp);
				if (cbp == NULL)
					return (ENOMEM);
				cbp->bio_data = g_malloc(real_len, M_WAITOK);
				/* Buffer is freed on completion. */
				cbp->bio_cflags |= GV_BIO_MALLOC;
				cbp->bio_offset = real_off;
				cbp->bio_length = real_len;
				cbp->bio_done = gv_plex_done;
				cbp->bio_caller2 = s->consumer;
				cbp->bio_driver1 = wp;

				GV_ENQUEUE(bp, cbp, pbp);

				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
				bq->bp = cbp;
				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
			}

		/* A normal read can be fulfilled with the original subdisk. */
		} else {
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_data = addr;
			/* Plain pass-through; no packet post-processing. */
			cbp->bio_done = g_std_done;
			cbp->bio_caller2 = original->consumer;

			GV_ENQUEUE(bp, cbp, pbp);
		}
		/* Reads do not need stripe locking. */
		wp->lockbase = -1;

		break;

	case BIO_WRITE:
		/*
		 * A degraded write means we cannot write to the original data
		 * subdisk. Thus we need to read in all valid stripes,
		 * recalculate the parity from the original data, and then
		 * write the parity stripe back out.
		 */
		if (type == REQ_TYPE_DEGRADED) {
			/* Read all subdisks. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken and the parity subdisk. */
				if ((s == broken) || (s == parity))
					continue;

				cbp = g_clone_bio(bp);
				if (cbp == NULL)
					return (ENOMEM);
				cbp->bio_cmd = BIO_READ;
				cbp->bio_data = g_malloc(real_len, M_WAITOK);
				cbp->bio_cflags |= GV_BIO_MALLOC;
				cbp->bio_offset = real_off;
				cbp->bio_length = real_len;
				cbp->bio_done = gv_plex_done;
				cbp->bio_caller2 = s->consumer;
				cbp->bio_driver1 = wp;

				GV_ENQUEUE(bp, cbp, pbp);

				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
				bq->bp = cbp;
				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
			}

			/* Write the parity data. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			/* Seed the parity buffer with the new data. */
			bcopy(addr, cbp->bio_data, real_len);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;
			wp->parity = cbp;

		/*
		 * When the parity stripe is missing we just write out the data.
		 */
		} else if (type == REQ_TYPE_NOPARITY) {
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_data = addr;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

		/*
		 * A normal write request goes to the original subdisk, then we
		 * read in all other stripes, recalculate the parity and write
		 * out the parity again.
		 */
		} else {
			/* Read old parity. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_cmd = BIO_READ;
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

			/* Read old data. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_cmd = BIO_READ;
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

			/* Write new data. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_data = addr;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;

			cbp->bio_driver1 = wp;

			/*
			 * We must not write the new data until the old data
			 * was read, so hold this BIO back until we're ready
			 * for it.
			 */
			wp->waiting = cbp;

			/* The final bio for the parity. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;

			/* Remember that this is the BIO for the parity data. */
			wp->parity = cbp;
		}
		break;

	default:
		return (EINVAL);
	}

	return (0);
}
537
538 /* Calculate the offsets in the various subdisks for a RAID5 request. */
539 int
540 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
541 off_t *real_len, int *sdno, int *psdno)
542 {
543 int sd, psd;
544 off_t len_left, stripeend, stripeoff, stripestart;
545
546 /* The number of the subdisk containing the parity stripe. */
547 psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
548 p->sdcount;
549 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
550
551 /* Offset of the start address from the start of the stripe. */
552 stripeoff = boff % (p->stripesize * (p->sdcount - 1));
553 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
554
555 /* The number of the subdisk where the stripe resides. */
556 sd = stripeoff / p->stripesize;
557 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
558
559 /* At or past parity subdisk. */
560 if (sd >= psd)
561 sd++;
562
563 /* The offset of the stripe on this subdisk. */
564 stripestart = (boff - stripeoff) / (p->sdcount - 1);
565 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
566
567 stripeoff %= p->stripesize;
568
569 /* The offset of the request on this subdisk. */
570 *real_off = stripestart + stripeoff;
571
572 stripeend = stripestart + p->stripesize;
573 len_left = stripeend - *real_off;
574 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
575
576 *real_len = (bcount <= len_left) ? bcount : len_left;
577
578 if (sdno != NULL)
579 *sdno = sd;
580 if (psdno != NULL)
581 *psdno = psd;
582
583 return (0);
584 }
Cache object: fb210f612af02872a0a18d6d24414b72
|