/*-
 * Copyright (c) 1997, 1998, 1999
 * Nan Yang Computer Services Limited. All rights reserved.
 *
 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
 *
 * Written by Greg Lehey
 *
 * This software is distributed under the so-called ``Berkeley
 * License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Nan Yang Computer
 *      Services Limited.
 * 4. Neither the name of the Company nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinumrequest.c,v 1.1.1.1 2003/10/10 03:09:14 grog Exp $
 * $FreeBSD$
 */

#include <dev/vinum/vinumhdr.h>
#include <dev/vinum/request.h>
#include <sys/resourcevar.h>

enum requeststatus bre(struct request *rq,
    int plexno,
    daddr_t * diskstart,
    daddr_t diskend);
enum requeststatus bre5(struct request *rq,
    int plexno,
    daddr_t * diskstart,
    daddr_t diskend);
enum requeststatus build_read_request(struct request *rq, int volplexno);
enum requeststatus build_write_request(struct request *rq);
enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
int find_alternate_sd(struct request *rq);
int check_range_covered(struct request *);
void complete_rqe(struct buf *bp);
void complete_raid5_write(struct rqelement *);
int abortrequest(struct request *rq, int error);
void sdio_done(struct buf *bp);
int vinum_bounds_check(struct buf *bp, struct volume *vol);
caddr_t allocdatabuf(struct rqelement *rqe);
void freedatabuf(struct rqelement *rqe);

#ifdef VINUMDEBUG
struct rqinfo rqinfo[RQINFO_SIZE];
struct rqinfo *rqip = rqinfo;

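/*
 * Log a request in the trace buffer. rqinfo is a circular buffer
 * of RQINFO_SIZE entries; rqip points at the next slot to use and
 * wraps around at the end, so the buffer always holds the most
 * recent requests. The splhigh() protection stops an interrupt
 * from logging into the same slot halfway through an entry.
 */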
void
logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp)
{
    int s = splhigh();

    microtime(&rqip->timestamp); /* when did this happen? */
    rqip->type = type;
    rqip->bp = ubp; /* user buffer */
    switch (type) {
    case loginfo_user_bp:
    case loginfo_user_bpl:
    case loginfo_sdio: /* subdisk I/O */
    case loginfo_sdiol: /* subdisk I/O launch */
    case loginfo_sdiodone: /* subdisk I/O complete */
        bcopy(info.bp, &rqip->info.b, sizeof(struct buf));
        rqip->devmajor = major(info.bp->b_dev);
        rqip->devminor = minor(info.bp->b_dev);
        break;

    case loginfo_iodone:
    case loginfo_rqe:
    case loginfo_raid5_data:
    case loginfo_raid5_parity:
        bcopy(info.rqe, &rqip->info.rqe, sizeof(struct rqelement));
        rqip->devmajor = major(info.rqe->b.b_dev);
        rqip->devminor = minor(info.rqe->b.b_dev);
        break;

    case loginfo_lockwait:
    case loginfo_lock:
    case loginfo_unlock:
        bcopy(info.lockinfo, &rqip->info.lockinfo, sizeof(struct rangelock));
        break;

    case loginfo_unused:
        break;
    }
    rqip++;
    if (rqip >= &rqinfo[RQINFO_SIZE]) /* wrap around */
        rqip = rqinfo;
    splx(s);
}

#endif

void
vinumstrategy(struct buf *bp)
{
    int volno;
    struct volume *vol = NULL;

    switch (DEVTYPE(bp->b_dev)) {
    case VINUM_SD_TYPE:
    case VINUM_SD2_TYPE:
        sdio(bp);
        return;

    default:
        bp->b_error = EIO; /* I/O error */
        bp->b_flags |= B_ERROR;
        biodone(bp);
        return;

    case VINUM_VOLUME_TYPE: /* volume I/O */
        volno = Volno(bp->b_dev);
        vol = &VOL[volno];
        if (vol->state != volume_up) { /* can't access this volume */
            bp->b_error = EIO; /* I/O error */
            bp->b_flags |= B_ERROR;
            biodone(bp);
            return;
        }
        if (vinum_bounds_check(bp, vol) <= 0) { /* don't like them bounds */
            biodone(bp);
            return;
        }
        /* FALLTHROUGH */
        /*
         * Plex I/O is pretty much the same as volume I/O
         * for a single plex. vinumstart() tells the two
         * apart by the device type and uses a NULL volume
         * pointer to indicate single plex I/O.
         */
    case VINUM_PLEX_TYPE:
        bp->b_resid = bp->b_bcount; /* transfer everything */
        vinumstart(bp, 0);
        return;
    }
}

/*
 * Start a transfer. Return -1 on error, 0 if OK,
 * 1 if we need to retry. Parameter reviveok is
 * set when doing transfers for revives: it allows
 * transfers to be started immediately when a
 * revive is in progress. During revive, normal
 * transfers are queued if they share address
 * space with a currently active revive operation.
 */
int
vinumstart(struct buf *bp, int reviveok)
{
    int plexno;
    int maxplex; /* maximum number of plexes to handle */
    struct volume *vol;
    struct request *rq; /* build up our request here */
    enum requeststatus status;

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_user_bp, (union rqinfou) bp, bp);
#endif

    if ((bp->b_bcount % DEV_BSIZE) != 0) { /* bad length */
        bp->b_error = EINVAL; /* invalid size */
        bp->b_flags |= B_ERROR;
        biodone(bp);
        return -1;
    }
    rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */
    if (rq == NULL) { /* can't do it */
        bp->b_error = ENOMEM; /* can't get memory */
        bp->b_flags |= B_ERROR;
        biodone(bp);
        return -1;
    }
    bzero(rq, sizeof(struct request));

    /*
     * Note the user buffer. The volume pointer set below
     * can be NULL, which the request building functions
     * use as an indication of single plex I/O.
     */
    rq->bp = bp; /* user buffer struct */

    if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) { /* it's a volume, */
        rq->volplex.volno = Volno(bp->b_dev); /* get the volume number */
        vol = &VOL[rq->volplex.volno]; /* and point to it */
        vol->active++; /* one more active request */
        maxplex = vol->plexes; /* consider all its plexes */
    } else {
        vol = NULL; /* no volume */
        rq->volplex.plexno = Plexno(bp->b_dev); /* point to the plex */
        rq->isplex = 1; /* note that it's a plex */
        maxplex = 1; /* just the one plex */
    }

    if (bp->b_flags & B_READ) {
        /*
         * This is a read request. Decide
         * which plex to read from.
         *
         * There's a potential race condition here,
         * since we're not locked, and we could end
         * up multiply incrementing the round-robin
         * counter. This doesn't have any serious
         * effects, however.
         */
        if (vol != NULL) {
            plexno = vol->preferred_plex; /* get the plex to use */
            if (plexno < 0) { /* round robin */
                plexno = vol->last_plex_read;
                vol->last_plex_read++;
                if (vol->last_plex_read >= vol->plexes) /* got to the end? */
                    vol->last_plex_read = 0; /* wrap around */
            }
            status = build_read_request(rq, plexno); /* build a request */
        } else {
            daddr_t diskaddr = bp->b_blkno; /* start offset of transfer */
            status = bre(rq, /* build a request list */
                rq->volplex.plexno,
                &diskaddr,
                diskaddr + (bp->b_bcount / DEV_BSIZE));
        }

        if (status > REQUEST_RECOVERED) { /* can't satisfy it */
            if (status == REQUEST_DOWN) { /* not enough subdisks */
                bp->b_error = EIO; /* I/O error */
                bp->b_flags |= B_ERROR;
            }
            biodone(bp);
            freerq(rq);
            return -1;
        }
        return launch_requests(rq, reviveok); /* now start the requests if we can */
    } else
        /*
         * This is a write operation. We write to all plexes. If this is
         * a RAID-4 or RAID-5 plex, we must also update the parity stripe.
         */
    {
        if (vol != NULL) {
            if ((vol->plexes > 0) /* multiple plex */
                ||(isparity((&PLEX[vol->plex[0]])))) { /* or RAID-[45], */
                rq->save_data = bp->b_data; /* save the data buffer address */
                bp->b_data = Malloc(bp->b_bcount);
                bcopy(rq->save_data, bp->b_data, bp->b_bcount); /* make a copy */
                rq->flags |= XFR_COPYBUF; /* and note that we did it */
            }
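            /*
             * Presumably the copy is made because the plex
             * (and parity) writes complete at different
             * times: copying keeps the data stable for all
             * of them even if the caller recycles the buffer
             * as soon as the first write completes.
             */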
            status = build_write_request(rq);
        } else { /* plex I/O */
            daddr_t diskstart;

            diskstart = bp->b_blkno; /* start offset of transfer */
            status = bre(rq,
                Plexno(bp->b_dev),
                &diskstart,
                bp->b_blkno + (bp->b_bcount / DEV_BSIZE)); /* build requests for the plex */
        }
        if (status > REQUEST_RECOVERED) { /* can't satisfy it */
            if (status == REQUEST_DOWN) { /* not enough subdisks */
                bp->b_error = EIO; /* I/O error */
                bp->b_flags |= B_ERROR;
            }
            if (rq->flags & XFR_COPYBUF) {
                Free(bp->b_data);
                bp->b_data = rq->save_data;
            }
            biodone(bp);
            freerq(rq);
            return -1;
        }
        return launch_requests(rq, reviveok); /* now start the requests if we can */
    }
}

/*
 * Call the low-level strategy routines to
 * perform the requests in a struct request
 */
int
launch_requests(struct request *rq, int reviveok)
{
    struct rqgroup *rqg;
    int rqno; /* loop index */
    struct rqelement *rqe; /* current element */
    struct drive *drive;
    int rcount; /* request count */
    const struct bdevsw *bdev;

    /*
     * First find out whether we're reviving and whether
     * the request contains a conflict. If so, we hang
     * the request off plex->waitlist of the first plex
     * we find which is reviving.
     */

    if ((rq->flags & XFR_REVIVECONFLICT) /* possible revive conflict */
        &&(!reviveok)) { /* and we don't want to do it now, */
        struct sd *sd;
        struct request *waitlist; /* point to the waitlist */

        sd = &SD[rq->sdno];
        if (sd->waitlist != NULL) { /* something there already, */
            waitlist = sd->waitlist;
            while (waitlist->next != NULL) /* find the end */
                waitlist = waitlist->next;
            waitlist->next = rq; /* hook our request there */
        } else
            sd->waitlist = rq; /* hook our request at the front */
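        /*
         * The request now sits on the subdisk's waitlist;
         * presumably the revive code launches it again once
         * the conflicting revive operation has finished with
         * this address range.
         */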

#ifdef VINUMDEBUG
        if (debug & DEBUG_REVIVECONFLICT)
            log(LOG_DEBUG,
                "Revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%llx, length %ld\n",
                rq->sdno,
                rq,
                rq->bp->b_flags & B_READ ? "Read" : "Write",
                major(rq->bp->b_dev),
                minor(rq->bp->b_dev),
                rq->bp->b_blkno,
                rq->bp->b_bcount);
#endif
        return 0; /* and get out of here */
    }
    rq->active = 0; /* nothing yet */
#ifdef VINUMDEBUG
    /* XXX This is probably due to a bug */
    if (rq->rqg == NULL) { /* no request */
        log(LOG_ERR, "vinum: null rqg\n");
        abortrequest(rq, EINVAL);
        return -1;
    }
#endif
#ifdef VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
        log(LOG_DEBUG,
            "Request: %p\n%s dev %d.%d, offset 0x%llx, length %ld\n",
            rq,
            rq->bp->b_flags & B_READ ? "Read" : "Write",
            major(rq->bp->b_dev),
            minor(rq->bp->b_dev),
            rq->bp->b_blkno,
            rq->bp->b_bcount);
    vinum_conf.lastrq = rq;
    vinum_conf.lastbuf = rq->bp;
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp);
#endif

    /*
     * We used to have an splbio() here anyway, out
     * of superstition. With the division of labour
     * below (first count the requests, then issue
     * them), it looks as if we don't need this
     * splbio() protection. In fact, as dillon
     * points out, there's a race condition
     * incrementing and decrementing rq->active and
     * rqg->active. This splbio() didn't help
     * there, because the device strategy routine
     * can sleep. Solve this by putting shorter
     * duration locks on the code.
     */
    /*
     * This loop happens without any participation
     * of the bottom half, so it requires no
     * protection.
     */
    for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) { /* through the whole request chain */
        rqg->active = rqg->count; /* they're all active */
        for (rqno = 0; rqno < rqg->count; rqno++) {
            rqe = &rqg->rqe[rqno];
            if (rqe->flags & XFR_BAD_SUBDISK) /* this subdisk is bad, */
                rqg->active--; /* one less active request */
        }
        if (rqg->active) /* we have at least one active request, */
            rq->active++; /* one more active request group */
    }

    /*
     * Now fire off the requests. In this loop the
     * bottom half could be completing requests
     * before we finish. We avoid splbio()
     * protection by ensuring we don't tread in the
     * same places that the bottom half does.
     */
    for (rqg = rq->rqg; rqg != NULL;) { /* through the whole request chain */
        if (rqg->lockbase >= 0) /* this rqg needs a lock first */
            rqg->lock = lockrange(rqg->lockbase, rqg->rq->bp, &PLEX[rqg->plexno]);
        rcount = rqg->count;
        for (rqno = 0; rqno < rcount;) {
            rqe = &rqg->rqe[rqno];

            /*
             * Point to next rqg before the bottom half
             * changes the structures.
             */
            if (++rqno >= rcount)
                rqg = rqg->next;
            if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* this subdisk is good, */
                drive = &DRIVE[rqe->driveno]; /* look at drive */
                drive->active++;
                if (drive->active >= drive->maxactive)
                    drive->maxactive = drive->active;
                vinum_conf.active++;
                if (vinum_conf.active >= vinum_conf.maxactive)
                    vinum_conf.maxactive = vinum_conf.active;

#ifdef VINUMDEBUG
                if (debug & DEBUG_ADDRESSES)
                    log(LOG_DEBUG,
                        " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
                        rqe->b.b_flags & B_READ ? "Read" : "Write",
                        major(rqe->b.b_dev),
                        minor(rqe->b.b_dev),
                        rqe->sdno,
                        (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
                        rqe->b.b_blkno,
                        rqe->b.b_bcount);
                if (debug & DEBUG_LASTREQS) {
                    microtime(&rqe->launchtime); /* time we launched this request */
                    logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp);
                }
#endif
                /* fire off the request */
                bdev = bdevsw_lookup(rqe->b.b_dev);
                if (bdev == NULL)
                    panic("no bdev"); /* FIXME XXX */
                (*bdev->d_strategy) (&rqe->b);
            }
        }
    }
    return 0;
}

/*
 * Define the low-level requests needed to perform a
 * high-level I/O operation for a specific plex 'plexno'.
 *
 * Return REQUEST_OK if all subdisks involved in the request are up,
 * REQUEST_DOWN if some subdisks are not up, and REQUEST_EOF if the
 * request is at least partially outside the bounds of the subdisks.
 *
 * Modify the pointer *diskstart to point to the end address. On
 * read, return on the first bad subdisk, so that the caller
 * (build_read_request) can try alternatives.
 *
 * On entry to this routine, the rqg structures are not assigned. The
 * assignment is performed by expandrq(). Strictly speaking, the
 * elements rqe->sdno of all entries should be set to -1, since 0
 * (from bzero) is a valid subdisk number. We avoid this problem by
 * initializing the ones we use, and not looking at the others (index
 * >= rqg->requests).
 */
enum requeststatus
bre(struct request *rq,
    int plexno,
    daddr_t * diskaddr,
    daddr_t diskend)
{
    int sdno;
    struct sd *sd;
    struct rqgroup *rqg;
    struct buf *bp; /* user's bp */
    struct plex *plex;
    enum requeststatus status; /* return value */
    daddr_t plexoffset; /* offset of transfer in plex */
    daddr_t stripebase; /* base address of stripe (1st subdisk) */
    daddr_t stripeoffset; /* offset in stripe */
    daddr_t blockoffset; /* offset in stripe on subdisk */
    struct rqelement *rqe; /* point to this request information */
    daddr_t diskstart = *diskaddr; /* remember where this transfer starts */
    enum requeststatus s; /* temp return value */

    bp = rq->bp; /* buffer pointer */
    status = REQUEST_OK; /* return value: OK until proven otherwise */
    plex = &PLEX[plexno]; /* point to the plex */

    switch (plex->organization) {
    case plex_concat:
        sd = NULL; /* (keep compiler quiet) */
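        /*
         * Walk the subdisks in plex order until we find the one
         * containing the start address. Worked example with
         * hypothetical sizes: if sd0 covers plex sectors 0-99
         * and sd1 covers 100-299, a transfer starting at plex
         * sector 150 lands in sd1 with sdoffset = 150 - 100 = 50.
         */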
        for (sdno = 0; sdno < plex->subdisks; sdno++) {
            sd = &SD[plex->sdnos[sdno]];
            if (*diskaddr < sd->plexoffset) /* we must have a hole, */
                status = REQUEST_DEGRADED; /* note the fact */
            if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */
                rqg = allocrqg(rq, 1); /* space for the request */
                if (rqg == NULL) { /* malloc failed */
                    bp->b_error = ENOMEM;
                    bp->b_flags |= B_ERROR;
                    return REQUEST_ENOMEM;
                }
                rqg->plexno = plexno;

                rqe = &rqg->rqe[0]; /* point to the element */
                rqe->rqg = rqg; /* group */
                rqe->sdno = sd->sdno; /* put in the subdisk number */
                plexoffset = *diskaddr; /* start offset in plex */
                rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */
                rqe->useroffset = plexoffset - diskstart; /* start offset in user buffer */
                rqe->dataoffset = 0;
                rqe->datalen = min(diskend - *diskaddr, /* number of sectors to transfer in this sd */
                    sd->sectors - rqe->sdoffset);
                rqe->groupoffset = 0; /* no groups for concatenated plexes */
                rqe->grouplen = 0;
                rqe->buflen = rqe->datalen; /* buffer length is data buffer length */
                rqe->flags = 0;
                rqe->driveno = sd->driveno;
                if (sd->state != sd_up) { /* *now* we find the sd is down */
                    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
                    if (s == REQUEST_DOWN) { /* down? */
                        rqe->flags = XFR_BAD_SUBDISK; /* yup */
                        if (rq->bp->b_flags & B_READ) /* read request, */
                            return REQUEST_DEGRADED; /* give up here */
                        /*
                         * If we're writing, don't give up
                         * because of a bad subdisk. Go
                         * through to the bitter end, but note
                         * which ones we can't access.
                         */
                        status = REQUEST_DEGRADED; /* can't do it all */
                    }
                }
                *diskaddr += rqe->datalen; /* bump the address */
                if (build_rq_buffer(rqe, plex)) { /* build the buffer */
                    deallocrqg(rqg);
                    bp->b_error = ENOMEM;
                    bp->b_flags |= B_ERROR;
                    return REQUEST_ENOMEM; /* can't do it */
                }
            }
            if (*diskaddr == diskend) /* we're finished, */
                break; /* get out of here */
        }
        /*
         * We've got to the end of the plex. Have we got to the end of
         * the transfer? It would seem that having an offset beyond the
         * end of the subdisk is an error, but in fact it can happen if
         * the volume has another plex of different size. There's a valid
         * question as to why you would want to do this, but currently
         * it's allowed.
         *
         * In a previous version, I returned REQUEST_DOWN here. I think
         * REQUEST_EOF is more appropriate now.
         */
        if (diskend > sd->sectors + sd->plexoffset) /* pointing beyond EOF? */
            status = REQUEST_EOF;
        break;

    case plex_striped:
        {
            while (*diskaddr < diskend) { /* until we get it all sorted out */
                if (*diskaddr >= plex->length) /* beyond the end of the plex */
                    return REQUEST_EOF; /* can't continue */

                /* The offset of the start address from the start of the stripe. */
                stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks);

                /* The plex-relative address of the start of the stripe. */
                stripebase = *diskaddr - stripeoffset;

                /* The number of the subdisk in which the start is located. */
                sdno = stripeoffset / plex->stripesize;

                /* The offset from the beginning of the stripe on this subdisk. */
                blockoffset = stripeoffset % plex->stripesize;
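                /*
                 * Worked example with hypothetical numbers: 3
                 * subdisks, stripesize 128 sectors, *diskaddr
                 * 1000. Then stripeoffset = 1000 % 384 = 232,
                 * stripebase = 768, sdno = 232 / 128 = 1 and
                 * blockoffset = 232 % 128 = 104, giving
                 * sdoffset = 768 / 3 + 104 = 360 below.
                 */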

                sd = &SD[plex->sdnos[sdno]]; /* the subdisk in question */
                rqg = allocrqg(rq, 1); /* space for the request */
                if (rqg == NULL) { /* malloc failed */
                    bp->b_error = ENOMEM;
                    bp->b_flags |= B_ERROR;
                    return REQUEST_ENOMEM;
                }
                rqg->plexno = plexno;

                rqe = &rqg->rqe[0]; /* point to the element */
                rqe->rqg = rqg;
                rqe->sdoffset = stripebase / plex->subdisks + blockoffset; /* start offset in this subdisk */
                rqe->useroffset = *diskaddr - diskstart; /* The offset of the start in the user buffer */
                rqe->dataoffset = 0;
                rqe->datalen = min(diskend - *diskaddr, /* the amount remaining to transfer */
                    plex->stripesize - blockoffset); /* and the amount left in this stripe */
                rqe->groupoffset = 0; /* no groups for striped plexes */
                rqe->grouplen = 0;
                rqe->buflen = rqe->datalen; /* buffer length is data buffer length */
                rqe->flags = 0;
                rqe->sdno = sd->sdno; /* put in the subdisk number */
                rqe->driveno = sd->driveno;

                if (sd->state != sd_up) { /* *now* we find the sd is down */
                    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
                    if (s == REQUEST_DOWN) { /* down? */
                        rqe->flags = XFR_BAD_SUBDISK; /* yup */
                        if (rq->bp->b_flags & B_READ) /* read request, */
                            return REQUEST_DEGRADED; /* give up here */
                        /*
                         * If we're writing, don't give up
                         * because of a bad subdisk. Go through
                         * to the bitter end, but note which
                         * ones we can't access.
                         */
                        status = REQUEST_DEGRADED; /* can't do it all */
                    }
                }
                /*
                 * It would seem that having an offset
                 * beyond the end of the subdisk is an
                 * error, but in fact it can happen if the
                 * volume has another plex of different
                 * size. There's a valid question as to why
                 * you would want to do this, but currently
                 * it's allowed.
                 */
                if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? */
                    rqe->datalen = sd->sectors - rqe->sdoffset; /* truncate */
#ifdef VINUMDEBUG
                    if (debug & DEBUG_EOFINFO) { /* tell on the request */
                        log(LOG_DEBUG,
                            "vinum: EOF on plex %s, sd %s offset %x (user offset 0x%llx)\n",
                            plex->name,
                            sd->name,
                            (u_int) sd->sectors,
                            bp->b_blkno);
                        log(LOG_DEBUG,
                            "vinum: stripebase 0x%llx, stripeoffset 0x%llx, blockoffset 0x%llx\n",
                            stripebase,
                            stripeoffset,
                            blockoffset);
                    }
#endif
                }
                if (build_rq_buffer(rqe, plex)) { /* build the buffer */
                    deallocrqg(rqg);
                    bp->b_error = ENOMEM;
                    bp->b_flags |= B_ERROR;
                    return REQUEST_ENOMEM; /* can't do it */
                }
                *diskaddr += rqe->datalen; /* look at the remainder */
                if ((*diskaddr < diskend) /* didn't finish the request on this stripe */
                    &&(*diskaddr < plex->length)) { /* and there's more to come */
                    plex->multiblock++; /* count another one */
                    if (sdno == plex->subdisks - 1) /* last subdisk, */
                        plex->multistripe++; /* another stripe as well */
                }
            }
        }
        break;

        /*
         * RAID-4 and RAID-5 are complicated enough to have their own
         * function.
         */
    case plex_raid4:
    case plex_raid5:
        status = bre5(rq, plexno, diskaddr, diskend);
        break;

    default:
        log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization);
        status = REQUEST_DOWN; /* can't access it */
    }

    return status;
}

/*
 * Build up a request structure for reading volumes.
 * This function is not needed for plex reads, since there's
 * no recovery if a plex read can't be satisfied.
 */
enum requeststatus
build_read_request(struct request *rq, /* request */
    int plexindex)
{ /* index in the volume's plex table */
    struct buf *bp;
    daddr_t startaddr; /* offset of previous part of transfer */
    daddr_t diskaddr; /* offset of current part of transfer */
    daddr_t diskend; /* and end offset of transfer */
    int plexno; /* plex index in vinum_conf */
    struct rqgroup *rqg; /* point to the request we're working on */
    struct volume *vol; /* volume in question */
    int recovered = 0; /* set if we recover a read */
    enum requeststatus status = REQUEST_OK;
    int plexmask; /* bit mask of plexes, for recovery */

    bp = rq->bp; /* buffer pointer */
    diskaddr = bp->b_blkno; /* start offset of transfer */
    diskend = diskaddr + (bp->b_bcount / DEV_BSIZE); /* and end offset of transfer */
    rqg = &rq->rqg[plexindex]; /* plex request */
    vol = &VOL[rq->volplex.volno]; /* point to volume */

    while (diskaddr < diskend) { /* build up request components */
        startaddr = diskaddr;
        status = bre(rq, vol->plex[plexindex], &diskaddr, diskend); /* build up a request */
        switch (status) {
        case REQUEST_OK:
            continue;

        case REQUEST_RECOVERED:
            /*
             * XXX FIXME if we have more than one plex, and we can
             * satisfy the request from another, don't use the
             * recovered request, since it's more expensive.
             */
            recovered = 1;
            break;

        case REQUEST_ENOMEM:
            return status;
            /*
             * If we get here, our request is not complete. Try
             * to fill in the missing parts from another plex.
             * This can happen multiple times in this function,
             * and we reinitialize the plex mask each time, since
             * we could have a hole in our plexes.
             */
        case REQUEST_EOF:
        case REQUEST_DOWN: /* can't access the plex */
        case REQUEST_DEGRADED: /* can't access the plex */
            plexmask = ((1 << vol->plexes) - 1) /* all plexes in the volume */
                &~(1 << plexindex); /* except for the one we were looking at */
            for (plexno = 0; plexno < vol->plexes; plexno++) {
                if (plexmask == 0) /* no plexes left to try */
                    return REQUEST_DOWN; /* failed */
                diskaddr = startaddr; /* start at the beginning again */
                if (plexmask & (1 << plexno)) { /* we haven't tried this plex yet */
                    bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */
                    if (diskaddr > startaddr) { /* we satisfied another part */
                        recovered = 1; /* we recovered from the problem */
                        status = REQUEST_OK; /* don't complain about it */
                        break;
                    }
                }
            }
            if (diskaddr == startaddr) /* didn't get any further, */
                return status;
        }
        if (recovered)
            vol->recovered_reads += recovered; /* adjust our recovery count */
    }
    return status;
}

/*
 * Build up a request structure for writes.
 * Return the least serious status of the per-plex
 * requests: REQUEST_OK if any plex could satisfy the
 * whole request, otherwise the best of the failure
 * statuses returned by bre().
 */
enum requeststatus
build_write_request(struct request *rq)
{ /* request */
    struct buf *bp;
    daddr_t diskstart; /* offset of current part of transfer */
    daddr_t diskend; /* and end offset of transfer */
    int plexno; /* plex index in vinum_conf */
    struct volume *vol; /* volume in question */
    enum requeststatus status;

    bp = rq->bp; /* buffer pointer */
    vol = &VOL[rq->volplex.volno]; /* point to volume */
    diskend = bp->b_blkno + (bp->b_bcount / DEV_BSIZE); /* end offset of transfer */
    status = REQUEST_DOWN; /* assume the worst */
    for (plexno = 0; plexno < vol->plexes; plexno++) {
        diskstart = bp->b_blkno; /* start offset of transfer */
        /*
         * Build requests for the plex.
         * We take the best possible result here (min,
         * not max): we're happy if we can write at all
         */
        status = min(status, bre(rq,
                vol->plex[plexno],
                &diskstart,
                diskend));
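        /*
         * min() works because lower enum requeststatus values
         * are less serious: REQUEST_OK is 0, and the code
         * elsewhere relies on the same ordering (see the
         * "status > REQUEST_RECOVERED" tests in vinumstart()).
         */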
    }
    return status;
}

/* Fill in the struct buf part of a request element. */
enum requeststatus
build_rq_buffer(struct rqelement *rqe, struct plex *plex)
{
    struct sd *sd; /* point to subdisk */
    struct volume *vol;
    struct buf *bp;
    struct buf *ubp; /* user (high level) buffer header */

    vol = &VOL[rqe->rqg->rq->volplex.volno];
    sd = &SD[rqe->sdno]; /* point to subdisk */
    bp = &rqe->b;
    ubp = rqe->rqg->rq->bp; /* pointer to user buffer header */

    /* Initialize the buf struct */
    /* copy these flags from user bp */
    bp->b_flags = ubp->b_flags & (B_NOCACHE | B_READ | B_ASYNC);
    bp->b_flags |= B_CALL | B_BUSY; /* inform us when it's done */
    bp->b_iodone = complete_rqe;
    /*
     * You'd think that we wouldn't need to even
     * build the request buffer for a dead subdisk,
     * but in some cases we need information like
     * the user buffer address. Err on the side of
     * generosity and supply what we can. That
     * obviously doesn't include drive information
     * when the drive is dead.
     */
    if ((rqe->flags & XFR_BAD_SUBDISK) == 0) /* subdisk is accessible, */
        bp->b_dev = DRIVE[rqe->driveno].dev; /* drive device */
    bp->b_blkno = rqe->sdoffset + sd->driveoffset; /* start address */
    bp->b_bcount = rqe->buflen << DEV_BSHIFT; /* number of bytes to transfer */
    bp->b_resid = bp->b_bcount; /* and it's still all waiting */
    bp->b_bufsize = bp->b_bcount; /* and buffer size */

    if (rqe->flags & XFR_MALLOCED) { /* this operation requires a malloced buffer */
        bp->b_data = Malloc(bp->b_bcount); /* get a buffer to put it in */
        if (bp->b_data == NULL) { /* failed */
            abortrequest(rqe->rqg->rq, ENOMEM);
            return REQUEST_ENOMEM; /* no memory */
        }
    } else
        /*
         * Point directly to user buffer data. This means
         * that we don't need to do anything when we have
         * finished the transfer
         */
        bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE;
    /*
     * On a recovery read, we perform an XOR of
     * all blocks to the user buffer. To make
     * this work, we first clean out the buffer
     */
    if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK))
        == (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) { /* bad subdisk of a recovery read */
        int length = rqe->grouplen << DEV_BSHIFT; /* and count involved */
        char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */

        bzero(data, length); /* clean it out */
    }
    return REQUEST_OK; /* success */
}

/*
 * Abort a request: free resources and complete the
 * user request with the specified error
 */
int
abortrequest(struct request *rq, int error)
{
    struct buf *bp = rq->bp; /* user buffer */

    bp->b_error = error;
    freerq(rq); /* free everything we're doing */
    bp->b_flags |= B_ERROR;
    return error; /* and give up */
}

/*
 * Check that our transfer will cover the
 * complete address space of the user request.
 *
 * Return 1 if it can, otherwise 0. Currently this
 * is a stub which always claims success.
 */
int
check_range_covered(struct request *rq)
{
    return 1;
}

/* Perform I/O on a subdisk */
void
sdio(struct buf *bp)
{
    int s; /* spl */
    struct sd *sd;
    struct sdbuf *sbp;
    daddr_t endoffset;
    struct drive *drive;
    const struct bdevsw *bdev;

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_sdio, (union rqinfou) bp, bp);
#endif
    sd = &SD[Sdno(bp->b_dev)]; /* point to the subdisk */
    drive = &DRIVE[sd->driveno];

    if (drive->state != drive_up) {
        if (sd->state >= sd_crashed) {
            if ((bp->b_flags & B_READ) == 0) /* writing, */
                set_sd_state(sd->sdno, sd_stale, setstate_force);
            else
                set_sd_state(sd->sdno, sd_crashed, setstate_force);
        }
        bp->b_error = EIO;
        bp->b_flags |= B_ERROR;
        biodone(bp);
        return;
    }
    /*
     * We allow access to any kind of subdisk as long as we can expect
     * to get the I/O performed.
     */
    if (sd->state < sd_empty) { /* nothing to talk to, */
        bp->b_error = EIO;
        bp->b_flags |= B_ERROR;
        biodone(bp);
        return;
    }
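    /*
     * struct sdbuf evidently starts with a struct buf (b)
     * followed by bookkeeping fields (bp, sdno, driveno), so
     * &sbp->b can be handed straight to the driver and
     * sdio_done() can get back to the original request.
     */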
    /* Get a buffer */
    sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
    if (sbp == NULL) {
        bp->b_error = ENOMEM;
        bp->b_flags |= B_ERROR;
        biodone(bp);
        return;
    }
    bzero(sbp, sizeof(struct sdbuf)); /* start with nothing */
    sbp->b.b_flags = bp->b_flags | B_CALL | B_BUSY; /* inform us when it's done */
    sbp->b.b_bufsize = bp->b_bcount; /* buffer size */
    sbp->b.b_bcount = bp->b_bcount; /* number of bytes to transfer */
    sbp->b.b_resid = bp->b_resid; /* and amount waiting */
    sbp->b.b_dev = DRIVE[sd->driveno].dev; /* device */
    sbp->b.b_data = bp->b_data; /* data buffer */
    sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;
    sbp->b.b_iodone = sdio_done; /* come here on completion */
    sbp->bp = bp; /* note the address of the original header */
    sbp->sdno = sd->sdno; /* note for statistics */
    sbp->driveno = sd->driveno;
    endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE; /* final sector offset */
    if (endoffset > sd->sectors) { /* beyond the end */
        sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
        if (sbp->b.b_bcount <= 0) { /* nothing to transfer */
            bp->b_resid = bp->b_bcount; /* nothing transferred */
            biodone(bp);
            Free(sbp);
            return;
        }
    }
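    /*
     * Trimming example with hypothetical numbers: for a subdisk
     * of 1000 sectors, a 20 sector transfer starting at sector
     * 990 gives endoffset = 1010, so b_bcount is cut by
     * 10 * DEV_BSIZE and only the 10 sectors inside the subdisk
     * are transferred.
     */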
#ifdef VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
        log(LOG_DEBUG,
            " %s dev %d.%d, sd %d, offset 0x%llx, devoffset 0x%llx, length %ld\n",
            sbp->b.b_flags & B_READ ? "Read" : "Write",
            major(sbp->b.b_dev),
            minor(sbp->b.b_dev),
            sbp->sdno,
            sbp->b.b_blkno - SD[sbp->sdno].driveoffset,
            sbp->b.b_blkno,
            sbp->b.b_bcount);
#endif
    s = splbio();
#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
#endif
    bdev = bdevsw_lookup(sbp->b.b_dev);
    if (bdev == NULL)
        panic("sdio: no bdev"); /* XXX FIXME */
    (*bdev->d_strategy) (&sbp->b);
    splx(s);
}

/*
 * Simplified version of bounds_check_with_label
 * Determine the size of the transfer, and make sure it is
 * within the boundaries of the partition. Adjust transfer
 * if needed, and signal errors or early completion.
 * Return 1 if the transfer is OK, 0 to signal EOF (the
 * caller just completes the request) and -1 on error.
 *
 * Volumes are simpler than disk slices: they only contain
 * one component (though we call them a, b and c to make
 * system utilities happy), and they always take up the
 * complete space of the "partition".
 *
 * I'm still not happy with this: why should the label be
 * protected? If it weren't so damned difficult to write
 * one in the first place (because it's protected), it wouldn't
 * be a problem.
 */
int
vinum_bounds_check(struct buf *bp, struct volume *vol)
{
    int maxsize = vol->size; /* size of the partition (sectors) */
    int size = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* size of this request (sectors) */

    /* Would this transfer overwrite the disk label? */
    if (bp->b_blkno <= LABELSECTOR /* starts before or at the label */
#if LABELSECTOR != 0
        && bp->b_blkno + size > LABELSECTOR /* and finishes after */
#endif
        && (!(vol->flags & VF_RAW)) /* and it's not raw */
        &&major(bp->b_dev) == VINUM_BDEV_MAJOR /* and it's the block device */
        && ((bp->b_flags & B_READ) == 0) /* and it's a write */
        &&((vol->flags & (VF_WLABEL | VF_LABELLING)) == 0)) { /* and we're not allowed to write the label */
        bp->b_error = EROFS; /* read-only */
        bp->b_flags |= B_ERROR;
        return -1;
    }
    if (size == 0) /* no transfer specified, */
        return 0; /* treat as EOF */
    /* beyond partition? */
    if (bp->b_blkno < 0 /* negative start */
        || bp->b_blkno + size > maxsize) { /* or goes beyond the end of the partition */
        /* if exactly at end of disk, return an EOF */
        if (bp->b_blkno == maxsize) {
            bp->b_resid = bp->b_bcount;
            return 0;
        }
        /* or truncate if part of it fits */
        size = maxsize - bp->b_blkno;
        if (size <= 0) { /* nothing to transfer */
            bp->b_error = EINVAL;
            bp->b_flags |= B_ERROR;
            return -1;
        }
        bp->b_bcount = size << DEV_BSHIFT;
    }
    return 1;
}

/*
 * Allocate a request group and hook
 * it onto the list for rq
 */
struct rqgroup *
allocrqg(struct request *rq, int elements)
{
    struct rqgroup *rqg; /* the one we're going to allocate */
    int size = sizeof(struct rqgroup) + elements * sizeof(struct rqelement);

    rqg = (struct rqgroup *) Malloc(size);
    if (rqg != NULL) { /* malloc OK, */
        if (rq->rqg) /* we already have requests */
            rq->lrqg->next = rqg; /* hang it off the end */
        else /* first request */
            rq->rqg = rqg; /* at the start */
        rq->lrqg = rqg; /* this one is the last in the list */

        bzero(rqg, size); /* no old junk */
        rqg->rq = rq; /* point back to the parent request */
        rqg->count = elements; /* number of requests in the group */
        rqg->lockbase = -1; /* no lock required yet */
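        /*
         * launch_requests() takes out a range lock for any
         * group whose lockbase has been set to a
         * non-negative value.
         */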
    }
    return rqg;
}

/*
 * Deallocate a request group out of a chain. We do
 * this by linear search: the chain is short, this
 * almost never happens, and currently it can only
 * happen to the first member of the chain.
 */
void
deallocrqg(struct rqgroup *rqg)
{
    struct rqgroup *rqgc = rqg->rq->rqg; /* point to the request chain */

    if (rqg->lock) /* got a lock? */
        unlockrange(rqg->plexno, rqg->lock); /* yes, free it */
    if (rqgc == rqg) /* we're first in line */
        rqg->rq->rqg = rqg->next; /* unhook ourselves */
    else {
        while ((rqgc->next != NULL) /* find the group */
            &&(rqgc->next != rqg))
            rqgc = rqgc->next;
        if (rqgc->next == NULL)
            log(LOG_ERR,
                "vinum deallocrqg: rqg %p not found in request %p\n",
                rqg,
                rqg->rq);
        else
            rqgc->next = rqg->next; /* make the chain jump over us */
    }
    Free(rqg);
}

/* Character device interface */
int
vinumread(dev_t dev, struct uio *uio, int ioflag)
{
    return (physio(vinumstrategy, NULL, dev, B_READ, minphys, uio));
}

int
vinumwrite(dev_t dev, struct uio *uio, int ioflag)
{
    return (physio(vinumstrategy, NULL, dev, B_WRITE, minphys, uio));
}
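
/*
 * physio() maps the user's buffer and calls vinumstrategy()
 * directly, so raw reads and writes bypass the buffer cache;
 * passing a NULL bp makes physio allocate its own buffer header.
 */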

/* Local Variables: */
/* fill-column: 50 */
/* End: */