1 /*-
2 * Copyright (c) 1997, 1998, 1999
3 * Nan Yang Computer Services Limited. All rights reserved.
4 *
5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
6 *
7 * Written by Greg Lehey
8 *
9 * This software is distributed under the so-called ``Berkeley
10 * License'':
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by Nan Yang Computer
23 * Services Limited.
24 * 4. Neither the name of the Company nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided ``as is'', and any express or implied
29 * warranties, including, but not limited to, the implied warranties of
30 * merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall the company or contributors be liable for any
32 * direct, indirect, incidental, special, exemplary, or consequential
33 * damages (including, but not limited to, procurement of substitute
34 * goods or services; loss of use, data, or profits; or business
35 * interruption) however caused and on any theory of liability, whether
36 * in contract, strict liability, or tort (including negligence or
37 * otherwise) arising in any way out of the use of this software, even if
38 * advised of the possibility of such damage.
39 *
40 * $Id: vinumrevive.c,v 1.14 2000/12/21 01:55:11 grog Exp grog $
41 * $FreeBSD$
42 */
43
44 #include <dev/vinum/vinumhdr.h>
45 #include <dev/vinum/request.h>
46
47 /*
48 * Revive a block of a subdisk. Return an error
49 * indication. EAGAIN means successful copy, but
50 * that more blocks remain to be copied. EINVAL
51 * means that the subdisk isn't associated with a
52 * plex (which means a programming error if we get
53 * here at all; FIXME).
54 */
55
56 int
57 revive_block(int sdno)
58 {
59 int s; /* priority level */
60 struct sd *sd;
61 struct plex *plex;
62 struct volume *vol;
63 struct buf *bp;
64 int error = EAGAIN;
65 int size; /* size of revive block, bytes */
66 daddr_t plexblkno; /* lblkno in plex */
67 int psd; /* parity subdisk number */
68 u_int64_t stripe; /* stripe number */
69 int paritysd = 0; /* set if this is the parity stripe */
70 struct rangelock *lock; /* for locking */
71 daddr_t stripeoffset; /* offset in stripe */
72
73 plexblkno = 0; /* to keep the compiler happy */
74 sd = &SD[sdno];
75 lock = NULL;
76 if (sd->plexno < 0) /* no plex? */
77 return EINVAL;
78 plex = &PLEX[sd->plexno]; /* point to plex */
79 if (plex->volno >= 0)
80 vol = &VOL[plex->volno];
81 else
82 vol = NULL;
83
84 if ((sd->revive_blocksize == 0) /* no block size */
85 ||(sd->revive_blocksize & ((1 << DEV_BSHIFT) - 1))) /* or invalid block size */
86 sd->revive_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
87 else if (sd->revive_blocksize > MAX_REVIVE_BLOCKSIZE)
88 sd->revive_blocksize = MAX_REVIVE_BLOCKSIZE;
89 size = min(sd->revive_blocksize >> DEV_BSHIFT, sd->sectors - sd->revived) << DEV_BSHIFT;
90 sd->reviver = curproc->p_pid; /* note who last had a bash at it */
91
92 /* Now decide where to read from */
93 switch (plex->organization) {
94 case plex_concat:
95 plexblkno = sd->revived + sd->plexoffset; /* corresponding address in plex */
96 break;
97
98 case plex_striped:
99 stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
100 if (stripeoffset + (size >> DEV_BSHIFT) > plex->stripesize)
101 size = (plex->stripesize - stripeoffset) << DEV_BSHIFT;
102 plexblkno = sd->plexoffset /* base */
103 + (sd->revived - stripeoffset) * plex->subdisks /* offset to beginning of stripe */
104 + stripeoffset; /* offset from beginning of stripe */
105 break;
106
107 case plex_raid4:
108 case plex_raid5:
109 stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
110 plexblkno = sd->plexoffset /* base */
111 + (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */
112 +stripeoffset; /* offset from beginning of stripe */
113 stripe = (sd->revived / plex->stripesize); /* stripe number */
114
115 /* Make sure we don't go beyond the end of the band. */
116 size = min(size, (plex->stripesize - stripeoffset) << DEV_BSHIFT);
117 if (plex->organization == plex_raid4)
118 psd = plex->subdisks - 1; /* parity subdisk for this stripe */
119 else
120 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
121 paritysd = plex->sdnos[psd] == sdno; /* note if it's the parity subdisk */
122
123 /*
124 * Now adjust for the strangenesses
125 * in RAID-4 and RAID-5 striping.
126 */
127 if (sd->plexsdno > psd) /* beyond the parity stripe, */
128 plexblkno -= plex->stripesize; /* one stripe less */
129 else if (paritysd)
130 plexblkno -= plex->stripesize * sd->plexsdno; /* go back to the beginning of the band */
131 break;
132
133 case plex_disorg: /* to keep the compiler happy */
134 }
135
136 if (paritysd) { /* we're reviving a parity block, */
137 bp = parityrebuild(plex, sd->revived, size, rebuildparity, &lock, NULL); /* do the grunt work */
138 if (bp == NULL) /* no buffer space */
139 return ENOMEM; /* chicken out */
140 } else { /* data block */
141 s = splbio();
142 bp = geteblk(size); /* Get a buffer */
143 splx(s);
144 if (bp == NULL)
145 return ENOMEM;
146
147 /*
148 * Amount to transfer: block size, unless it
149 * would overlap the end.
150 */
151 bp->b_bcount = size;
152 bp->b_resid = bp->b_bcount;
153 bp->b_blkno = plexblkno; /* start here */
154 if (isstriped(plex)) /* we need to lock striped plexes */
155 lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */
156 if (vol != NULL) /* it's part of a volume, */
157 /*
158 * First, read the data from the volume. We
159 * don't care which plex, that's bre's job.
160 */
161 bp->b_dev = VINUMDEV(plex->volno, 0, 0, VINUM_VOLUME_TYPE); /* create the device number */
162 else /* it's an unattached plex */
163 bp->b_dev = VINUM_PLEX(sd->plexno); /* create the device number */
164
165 bp->b_flags = B_READ; /* either way, read it */
166 vinumstart(bp, 1);
167 biowait(bp);
168 }
169
170 if (bp->b_flags & B_ERROR)
171 error = bp->b_error;
172 else
173 /* Now write to the subdisk */
174 {
175 bp->b_dev = VINUM_SD(sdno); /* create the device number */
176 bp->b_flags = B_ORDERED | B_WRITE; /* and make this an ordered write */
177 bp->b_resid = bp->b_bcount;
178 bp->b_blkno = sd->revived; /* write it to here */
179 sdio(bp); /* perform the I/O */
180 biowait(bp);
181 if (bp->b_flags & B_ERROR)
182 error = bp->b_error;
183 else {
184 sd->revived += bp->b_bcount >> DEV_BSHIFT; /* moved this much further down */
185 if (sd->revived >= sd->sectors) { /* finished */
186 sd->revived = 0;
187 set_sd_state(sdno, sd_up, setstate_force); /* bring the sd up */
188 log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
189 save_config(); /* and save the updated configuration */
190 error = 0; /* we're done */
191 }
192 }
193 if (lock) /* we took a lock, */
194 unlockrange(sd->plexno, lock); /* give it back */
195 while (sd->waitlist) { /* we have waiting requests */
196 #if VINUMDEBUG
197 struct request *rq = sd->waitlist;
198
199 if (debug & DEBUG_REVIVECONFLICT)
200 log(LOG_DEBUG,
201 "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%x, length %ld\n",
202 rq->sdno,
203 rq,
204 rq->bp->b_flags & B_READ ? "Read" : "Write",
205 major(rq->bp->b_dev),
206 minor(rq->bp->b_dev),
207 rq->bp->b_blkno,
208 rq->bp->b_bcount);
209 #endif
210 launch_requests(sd->waitlist, 1); /* do them now */
211 sd->waitlist = sd->waitlist->next; /* and move on to the next */
212 }
213 }
214 if (bp->b_qindex == 0) { /* not on a queue, */
215 bp->b_flags |= B_INVAL;
216 bp->b_flags &= ~B_ERROR;
217 brelse(bp); /* is this kosher? */
218 }
219 return error;
220 }
221
222 /*
223 * Check or rebuild the parity blocks of a RAID-4
224 * or RAID-5 plex.
225 *
226 * The variables plex->checkblock and
227 * plex->rebuildblock represent the
228 * subdisk-relative address of the stripe we're
229 * looking at, not the plex-relative address. We
230 * store it in the plex and not as a local
231 * variable because this function could be
232 * stopped, and we don't want to repeat the part
233 * we've already done. This is also the reason
234 * why we don't initialize it here except at the
235 * end. It gets initialized with the plex on
236 * creation.
237 *
238 * Each call to this function processes at most
239 * one stripe. We can't loop in this function,
240 * because we're unstoppable, so we have to be
241 * called repeatedly from userland.
242 */
243 void
244 parityops(struct vinum_ioctl_msg *data)
245 {
246 int plexno;
247 struct plex *plex;
248 int size; /* I/O transfer size, bytes */
249 int stripe; /* stripe number in plex */
250 int psd; /* parity subdisk number */
251 struct rangelock *lock; /* lock on stripe */
252 struct _ioctl_reply *reply;
253 off_t pstripe; /* pointer to our stripe counter */
254 struct buf *pbp;
255 off_t errorloc; /* offset of parity error */
256 enum parityop op; /* operation to perform */
257
258 plexno = data->index;
259 op = data->op;
260 pbp = NULL;
261 reply = (struct _ioctl_reply *) data;
262 reply->error = EAGAIN; /* expect to repeat this call */
263 plex = &PLEX[plexno];
264 if (!isparity(plex)) { /* not RAID-4 or RAID-5 */
265 reply->error = EINVAL;
266 return;
267 } else if (plex->state < plex_flaky) {
268 reply->error = EIO;
269 strcpy(reply->msg, "Plex is not completely accessible\n");
270 return;
271 }
272 pstripe = data->offset;
273 stripe = pstripe / plex->stripesize; /* stripe number */
274 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
275 size = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
276 plex->stripesize << DEV_BSHIFT);
277
278 pbp = parityrebuild(plex, pstripe, size, op, &lock, &errorloc); /* do the grunt work */
279 if (pbp == NULL) { /* no buffer space */
280 reply->error = ENOMEM;
281 return; /* chicken out */
282 }
283 /*
284 * Now we have a result in the data buffer of
285 * the parity buffer header, which we have kept.
286 * Decide what to do with it.
287 */
288 reply->msg[0] = '\0'; /* until shown otherwise */
289 if ((pbp->b_flags & B_ERROR) == 0) { /* no error */
290 if ((op == rebuildparity)
291 || (op == rebuildandcheckparity)) {
292 pbp->b_flags &= ~B_READ;
293 pbp->b_resid = pbp->b_bcount;
294 sdio(pbp); /* write the parity block */
295 biowait(pbp);
296 }
297 if (((op == checkparity)
298 || (op == rebuildandcheckparity))
299 && (errorloc != -1)) {
300 if (op == checkparity)
301 reply->error = EIO;
302 sprintf(reply->msg,
303 "Parity incorrect at offset 0x%llx\n",
304 errorloc);
305 }
306 if (reply->error == EAGAIN) { /* still OK, */
307 plex->checkblock = pstripe + (pbp->b_bcount >> DEV_BSHIFT); /* moved this much further down */
308 if (plex->checkblock >= SD[plex->sdnos[0]].sectors) { /* finished */
309 plex->checkblock = 0;
310 reply->error = 0;
311 }
312 }
313 }
314 if (pbp->b_flags & B_ERROR)
315 reply->error = pbp->b_error;
316 pbp->b_flags |= B_INVAL;
317 pbp->b_flags &= ~B_ERROR;
318 brelse(pbp);
319 unlockrange(plexno, lock);
320 }
321
322 /*
323 * Rebuild a parity stripe. Return pointer to
324 * parity bp. On return,
325 *
326 * 1. The band is locked. The caller must unlock
327 * the band and release the buffer header.
328 *
329 * 2. All buffer headers except php have been
330 * released. The caller must release pbp.
331 *
332 * 3. For checkparity and rebuildandcheckparity,
333 * the parity is compared with the current
334 * parity block. If it's different, the
335 * offset of the error is returned to
336 * errorloc. The caller can set the value of
337 * the pointer to NULL if this is called for
338 * rebuilding parity.
339 *
340 * pstripe is the subdisk-relative base address of
341 * the data to be reconstructed, size is the size
342 * of the transfer in bytes.
343 */
344 struct buf *
345 parityrebuild(struct plex *plex,
346 u_int64_t pstripe,
347 int size,
348 enum parityop op,
349 struct rangelock **lockp,
350 off_t * errorloc)
351 {
352 int error;
353 int s;
354 int sdno;
355 u_int64_t stripe; /* stripe number */
356 int *parity_buf; /* buffer address for current parity block */
357 int *newparity_buf; /* and for new parity block */
358 int mysize; /* I/O transfer size for this transfer */
359 int isize; /* mysize in ints */
360 int i;
361 int psd; /* parity subdisk number */
362 int newpsd; /* and "subdisk number" of new parity */
363 struct buf **bpp; /* pointers to our bps */
364 struct buf *pbp; /* buffer header for parity stripe */
365 int *sbuf;
366 int bufcount; /* number of buffers we need */
367
368 stripe = pstripe / plex->stripesize; /* stripe number */
369 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
370 parity_buf = NULL; /* to keep the compiler happy */
371 error = 0;
372
373 /*
374 * It's possible that the default transfer size
375 * we chose is not a factor of the stripe size.
376 * We *must* limit this operation to a single
377 * stripe, at least for RAID-5 rebuild, since
378 * the parity subdisk changes between stripes,
379 * so in this case we need to perform a short
380 * transfer. Set variable mysize to reflect
381 * this.
382 */
383 mysize = min(size, (plex->stripesize * (stripe + 1) - pstripe) << DEV_BSHIFT);
384 isize = mysize / (sizeof(int)); /* number of ints in the buffer */
385 bufcount = plex->subdisks + 1; /* sd buffers plus result buffer */
386 newpsd = plex->subdisks;
387 bpp = (struct buf **) Malloc(bufcount * sizeof(struct buf *)); /* array of pointers to bps */
388
389 /* First, build requests for all subdisks */
390 for (sdno = 0; sdno < bufcount; sdno++) { /* for each subdisk */
391 if ((sdno != psd) || (op != rebuildparity)) {
392 /* Get a buffer header and initialize it. */
393 s = splbio();
394 bpp[sdno] = geteblk(mysize); /* Get a buffer */
395 if (bpp[sdno] == NULL) {
396 while (sdno-- > 0) { /* release the ones we got */
397 bpp[sdno]->b_flags |= B_INVAL;
398 brelse(bpp[sdno]); /* give back our resources */
399 }
400 splx(s);
401 printf("vinum: can't allocate buffer space for parity op.\n");
402 return NULL; /* no bpps */
403 }
404 splx(s);
405 if (sdno == psd)
406 parity_buf = (int *) bpp[sdno]->b_data;
407 if (sdno == newpsd) /* the new one? */
408 bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[psd]); /* write back to the parity SD */
409 else
410 bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[sdno]); /* device number */
411 bpp[sdno]->b_flags = B_READ; /* either way, read it */
412 bpp[sdno]->b_bcount = mysize;
413 bpp[sdno]->b_resid = bpp[sdno]->b_bcount;
414 bpp[sdno]->b_blkno = pstripe; /* transfer from here */
415 }
416 }
417
418 /* Initialize result buffer */
419 pbp = bpp[newpsd];
420 newparity_buf = (int *) bpp[newpsd]->b_data;
421 bzero(newparity_buf, mysize);
422
423 /*
424 * Now lock the stripe with the first non-parity
425 * bp as locking bp.
426 */
427 *lockp = lockrange(pstripe * plex->stripesize * (plex->subdisks - 1),
428 bpp[psd ? 0 : 1],
429 plex);
430
431 /*
432 * Then issue requests for all subdisks in
433 * parallel. Don't transfer the parity stripe
434 * if we're rebuilding parity, unless we also
435 * want to check it.
436 */
437 for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each real subdisk */
438 if ((sdno != psd) || (op != rebuildparity)) {
439 sdio(bpp[sdno]);
440 }
441 }
442
443 /*
444 * Next, wait for the requests to complete.
445 * We wait in the order in which they were
446 * issued, which isn't necessarily the order in
447 * which they complete, but we don't have a
448 * convenient way of doing the latter, and the
449 * delay is minimal.
450 */
451 for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each subdisk */
452 if ((sdno != psd) || (op != rebuildparity)) {
453 biowait(bpp[sdno]);
454 if (bpp[sdno]->b_flags & B_ERROR) /* can't read, */
455 error = bpp[sdno]->b_error;
456 else if (sdno != psd) { /* update parity */
457 sbuf = (int *) bpp[sdno]->b_data;
458 for (i = 0; i < isize; i++)
459 ((int *) newparity_buf)[i] ^= sbuf[i]; /* xor in the buffer */
460 }
461 }
462 if (sdno != psd) { /* release all bps except parity */
463 bpp[sdno]->b_flags |= B_INVAL;
464 brelse(bpp[sdno]); /* give back our resources */
465 }
466 }
467
468 /*
469 * If we're checking, compare the calculated
470 * and the read parity block. If they're
471 * different, return the plex-relative offset;
472 * otherwise return -1.
473 */
474 if ((op == checkparity)
475 || (op == rebuildandcheckparity)) {
476 *errorloc = -1; /* no error yet */
477 for (i = 0; i < isize; i++) {
478 if (parity_buf[i] != newparity_buf[i]) {
479 *errorloc = (off_t) (pstripe << DEV_BSHIFT) * (plex->subdisks - 1)
480 + i * sizeof(int);
481 break;
482 }
483 }
484 bpp[psd]->b_flags |= B_INVAL;
485 brelse(bpp[psd]); /* give back our resources */
486 }
487 /* release our resources */
488 Free(bpp);
489 if (error) {
490 pbp->b_flags |= B_ERROR;
491 pbp->b_error = error;
492 }
493 return pbp;
494 }
495
496 /*
497 * Initialize a subdisk by writing zeroes to the
498 * complete address space. If verify is set,
499 * check each transfer for correctness.
500 *
501 * Each call to this function writes (and maybe
502 * checks) a single block.
503 */
504 int
505 initsd(int sdno, int verify)
506 {
507 int s; /* priority level */
508 struct sd *sd;
509 struct plex *plex;
510 struct volume *vol;
511 struct buf *bp;
512 int error;
513 int size; /* size of init block, bytes */
514 daddr_t plexblkno; /* lblkno in plex */
515 int verified; /* set when we're happy with what we wrote */
516
517 error = 0;
518 plexblkno = 0; /* to keep the compiler happy */
519 sd = &SD[sdno];
520 if (sd->plexno < 0) /* no plex? */
521 return EINVAL;
522 plex = &PLEX[sd->plexno]; /* point to plex */
523 if (plex->volno >= 0)
524 vol = &VOL[plex->volno];
525 else
526 vol = NULL;
527
528 if (sd->init_blocksize == 0) {
529 if (plex->stripesize != 0) /* we're striped, don't init more than */
530 sd->init_blocksize = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
531 plex->stripesize << DEV_BSHIFT);
532 else
533 sd->init_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
534 } else if (sd->init_blocksize > MAX_REVIVE_BLOCKSIZE)
535 sd->init_blocksize = MAX_REVIVE_BLOCKSIZE;
536
537 size = min(sd->init_blocksize >> DEV_BSHIFT, sd->sectors - sd->initialized) << DEV_BSHIFT;
538
539 verified = 0;
540 while (!verified) { /* until we're happy with it, */
541 s = splbio();
542 bp = geteblk(size); /* Get a buffer */
543 splx(s);
544 if (bp == NULL)
545 return ENOMEM;
546
547 bp->b_bcount = size;
548 bp->b_resid = bp->b_bcount;
549 bp->b_blkno = sd->initialized; /* write it to here */
550 bzero(bp->b_data, bp->b_bcount);
551 bp->b_dev = VINUM_SD(sdno); /* create the device number */
552 bp->b_flags &= ~B_READ;
553 sdio(bp); /* perform the I/O */
554 biowait(bp);
555 if (bp->b_flags & B_ERROR)
556 error = bp->b_error;
557 if (bp->b_qindex == 0) { /* not on a queue, */
558 bp->b_flags |= B_INVAL;
559 bp->b_flags &= ~B_ERROR;
560 brelse(bp); /* is this kosher? */
561 }
562 if ((error == 0) && verify) { /* check that it got there */
563 s = splbio();
564 bp = geteblk(size); /* get a buffer */
565 if (bp == NULL) {
566 splx(s);
567 error = ENOMEM;
568 } else {
569 bp->b_bcount = size;
570 bp->b_resid = bp->b_bcount;
571 bp->b_blkno = sd->initialized; /* read from here */
572 bp->b_dev = VINUM_SD(sdno); /* create the device number */
573 bp->b_flags |= B_READ; /* read it back */
574 splx(s);
575 sdio(bp);
576 biowait(bp);
577 /*
578 * XXX Bug fix code. This is hopefully no
579 * longer needed (21 February 2000).
580 */
581 if (bp->b_flags & B_ERROR)
582 error = bp->b_error;
583 else if ((*bp->b_data != 0) /* first word spammed */
584 ||(bcmp(bp->b_data, &bp->b_data[1], bp->b_bcount - 1))) { /* or one of the others */
585 printf("vinum: init error on %s, offset 0x%llx sectors\n",
586 sd->name,
587 (long long) sd->initialized);
588 verified = 0;
589 } else
590 verified = 1;
591 if (bp->b_qindex == 0) { /* not on a queue, */
592 bp->b_flags |= B_INVAL;
593 bp->b_flags &= ~B_ERROR;
594 brelse(bp); /* is this kosher? */
595 }
596 }
597 } else
598 verified = 1;
599 }
600 if (error == 0) { /* did it, */
601 sd->initialized += size >> DEV_BSHIFT; /* moved this much further down */
602 if (sd->initialized >= sd->sectors) { /* finished */
603 sd->initialized = 0;
604 set_sd_state(sdno, sd_initialized, setstate_force); /* bring the sd up */
605 log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
606 save_config(); /* and save the updated configuration */
607 } else /* more to go, */
608 error = EAGAIN; /* ya'll come back, see? */
609 }
610 return error;
611 }
612
613 /* Local Variables: */
614 /* fill-column: 50 */
615 /* End: */
Cache object: 51e1613b23e14bd5a801a32a41b5e9b9
|