/* vinuminterrupt.c: bottom half of the driver */

/*-
 * Copyright (c) 1997, 1998, 1999
 *      Nan Yang Computer Services Limited.  All rights reserved.
 *
 *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
 *
 *  Written by Greg Lehey
 *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Nan Yang Computer
 *      Services Limited.
 * 4. Neither the name of the Company nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinuminterrupt.c,v 1.14 2001/05/23 23:03:37 grog Exp grog $
 * $FreeBSD: releng/5.0/sys/dev/vinum/vinuminterrupt.c 106583 2002-11-07 21:52:51Z jhb $
 */

#include <dev/vinum/vinumhdr.h>
#include <dev/vinum/request.h>
#include <sys/resourcevar.h>

void complete_raid5_write(struct rqelement *);
void complete_rqe(struct buf *bp);
void sdio_done(struct buf *bp);

/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The bp parameter is in fact a struct rqelement, which
 * includes a couple of extras at the end.
 */
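/*
 * (A note on the cast below: it is assumed to be safe
 * because the struct buf is the first member of
 * struct rqelement, so a pointer to the embedded
 * buffer is also a pointer to the element.)
 */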
void
complete_rqe(struct buf *bp)
{
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct buf *ubp;                                        /* user buffer */
    struct drive *drive;
    struct sd *sd;
    char *gravity;                                          /* for error messages */

    rqe = (struct rqelement *) bp;                          /* point to the element that completed */
    rqg = rqe->rqg;                                         /* and the request group */
    rq = rqg->rq;                                           /* and the complete request */
    ubp = rq->bp;                                           /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_iodone, (union rqinfou) rqe, ubp);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--;                                        /* one less outstanding I/O on this drive */
    vinum_conf.active--;                                    /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1))            /* we were at the drive limit */
    ||(vinum_conf.active == VINUM_MAXACTIVE))               /* or the global limit */
        wakeup(&launch_requests);                           /* let another one at it */
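    /*
     * A sketch of the throttling handshake assumed
     * here: the code that launches requests is taken
     * to sleep on &launch_requests when the per-drive
     * limit (DRIVE_MAXACTIVE) or the global limit
     * (VINUM_MAXACTIVE) is reached, so we only need to
     * wake it when one of the counts drops back across
     * its limit.
     */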
    if ((bp->b_io.bio_flags & BIO_ERROR) != 0) {            /* transfer in error */
        gravity = "";
        sd = &SD[rqe->sdno];

        if (bp->b_error != 0)                               /* did it return a number? */
            rq->error = bp->b_error;                        /* yes, put it in. */
        else if (rq->error == 0)                            /* no: do we have one already? */
            rq->error = EIO;                                /* no: catchall "I/O error" */
        sd->lasterror = rq->error;
        if (bp->b_iocmd == BIO_READ) {                      /* read operation */
            if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
                gravity = " fatal";
                set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
            }
            log(LOG_ERR,
                "%s:%s read error, block %lld for %ld bytes\n",
                gravity,
                sd->name,
                (long long) bp->b_blkno,
                bp->b_bcount);
        } else {                                            /* write operation */
            if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
                gravity = " fatal";
                set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
            }
            log(LOG_ERR,
                "%s:%s write error, block %lld for %ld bytes\n",
                gravity,
                sd->name,
                (long long) bp->b_blkno,
                bp->b_bcount);
        }
        log(LOG_ERR,
            "%s: user buffer block %lld for %ld bytes\n",
            sd->name,
            (long long) ubp->b_blkno,
            ubp->b_bcount);
        if (rq->error == ENXIO) {                           /* the drive's down too */
            log(LOG_ERR,
                "%s: fatal drive I/O error, block %lld for %ld bytes\n",
                DRIVE[rqe->driveno].label.name,
                (long long) bp->b_blkno,
                bp->b_bcount);
            DRIVE[rqe->driveno].lasterror = rq->error;
            set_drive_state(rqe->driveno,                   /* take the drive down */
                drive_down,
                setstate_force);
        }
    }
    /* Now update the statistics */
    if (bp->b_iocmd == BIO_READ) {                          /* read operation */
        DRIVE[rqe->driveno].reads++;
        DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
        SD[rqe->sdno].reads++;
        SD[rqe->sdno].bytes_read += bp->b_bcount;
        PLEX[rqe->rqg->plexno].reads++;
        PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
        if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
            VOL[PLEX[rqe->rqg->plexno].volno].reads++;
            VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
        }
    } else {                                                /* write operation */
        DRIVE[rqe->driveno].writes++;
        DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
        SD[rqe->sdno].writes++;
        SD[rqe->sdno].bytes_written += bp->b_bcount;
        PLEX[rqe->rqg->plexno].writes++;
        PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
        if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
            VOL[PLEX[rqe->rqg->plexno].volno].writes++;
            VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
        }
    }
    if (rqg->flags & XFR_RECOVERY_READ) {                   /* recovery read, */
        int *sdata;                                         /* source */
        int *data;                                          /* and group data */
        int length;                                         /* and count involved */
        int count;                                          /* loop counter */
        struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */

        /* XOR destination is the user data */
        sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
        data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
        length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

        for (count = 0; count < length; count++)
            data[count] ^= sdata[count];
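        /*
         * Worked example (a sketch, not driver code):
         * with data blocks d0, d1 and d2 and parity
         * p = d0 ^ d1 ^ d2, a lost d1 is rebuilt as
         * p ^ d0 ^ d2, since x ^ x == 0.  Each
         * completing read XORs its contribution into
         * the bad subdisk's buffer here, so after the
         * last one that buffer holds the reconstructed
         * data.
         */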

        /*
         * In a normal read, we will normally read directly
         * into the user buffer.  This doesn't work if
         * we're also doing a recovery, so we have to
         * copy it.
         */
        if (rqe->flags & XFR_NORMAL_READ) {                 /* normal read as well, */
            char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
            char *dst;

            dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
            length = rqe->datalen << DEV_BSHIFT;            /* and count involved */
            bcopy(src, dst, length);                        /* move it */
        }
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID-4/5 group write operation */
    &&(rqg->active == 1))                                   /* and this is the last active request */
        complete_raid5_write(rqe);
    /*
     * This is the earliest place where we can be
     * sure that the request has really finished,
     * since complete_raid5_write can issue new
     * requests.
     */
    rqg->active--;                                          /* this request now finished */
    if (rqg->active == 0) {                                 /* request group finished, */
        rq->active--;                                       /* one less */
        if (rqg->lock) {                                    /* got a lock? */
            unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
            rqg->lock = 0;
        }
    }
    if (rq->active == 0) {                                  /* request finished, */
#ifdef VINUMDEBUG
        if (debug & DEBUG_RESID) {
            if (ubp->b_resid != 0)                          /* still something to transfer? */
                Debugger("resid");
        }
#endif

        if (rq->error) {                                    /* did we have an error? */
            if (rq->isplex) {                               /* plex operation, */
                ubp->b_io.bio_flags |= BIO_ERROR;           /* yes, propagate to user */
                ubp->b_error = rq->error;
            } else                                          /* try to recover */
                queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
        } else {
            ubp->b_resid = 0;                               /* completed our transfer */
            if (rq->isplex == 0)                            /* volume request, */
                VOL[rq->volplex.volno].active--;            /* another request finished */
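            /*
             * XFR_COPYBUF is assumed to mark requests
             * whose data was bounced through a scratch
             * buffer allocated when the request was
             * launched; restore the user's original
             * data pointer before completing the buffer.
             */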
            if (rq->flags & XFR_COPYBUF) {
                Free(ubp->b_data);
                ubp->b_data = rq->save_data;
            }
            bufdone(ubp);                                   /* top level buffer completed */
            freerq(rq);                                     /* return the request storage */
        }
    }
}

/* Free a request block and anything hanging off it */
void
freerq(struct request *rq)
{
    struct rqgroup *rqg;
    struct rqgroup *nrqg;                                   /* next in chain */
    int rqno;

    for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) {          /* through the whole request chain */
        if (rqg->lock)                                      /* got a lock? */
            unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
        for (rqno = 0; rqno < rqg->count; rqno++) {
            if ((rqg->rqe[rqno].flags & XFR_MALLOCED)       /* data buffer was malloced, */
            &&rqg->rqe[rqno].b.b_data)                      /* and the allocation succeeded */
                Free(rqg->rqe[rqno].b.b_data);              /* free it */
            if (rqg->rqe[rqno].flags & XFR_BUFLOCKED) {     /* locked this buffer, */
                BUF_UNLOCK(&rqg->rqe[rqno].b);              /* unlock it again */
                BUF_LOCKFREE(&rqg->rqe[rqno].b);
            }
        }
        nrqg = rqg->next;                                   /* note the next one */
        Free(rqg);                                          /* and free this one */
    }
    Free(rq);                                               /* free the request itself */
}

/* I/O on subdisk completed */
void
sdio_done(struct buf *bp)
{
    struct sdbuf *sbp;

    sbp = (struct sdbuf *) bp;
    if (sbp->b.b_io.bio_flags & BIO_ERROR) {                /* had an error */
        sbp->bp->b_io.bio_flags |= BIO_ERROR;               /* propagate upwards */
        sbp->bp->b_error = sbp->b.b_error;
    }
#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_sdiodone, (union rqinfou) bp, bp);
#endif
    sbp->bp->b_resid = sbp->b.b_resid;                      /* copy the resid field */
    /* Now update the statistics */
    if (bp->b_iocmd == BIO_READ) {                          /* read operation */
        DRIVE[sbp->driveno].reads++;
        DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
        SD[sbp->sdno].reads++;
        SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
    } else {                                                /* write operation */
        DRIVE[sbp->driveno].writes++;
        DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
        SD[sbp->sdno].writes++;
        SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
    }
    bufdone(sbp->bp);                                       /* complete the caller's I/O */
    BUF_UNLOCK(&sbp->b);
    BUF_LOCKFREE(&sbp->b);
    Free(sbp);
}

/* Start the second phase of a RAID-4 or RAID-5 group write operation. */
void
complete_raid5_write(struct rqelement *rqe)
{
    int *sdata;                                             /* source */
    int *pdata;                                             /* and parity block data */
    int length;                                             /* and count involved */
    int count;                                              /* loop counter */
    int rqno;                                               /* request index */
    int rqoffset;                                           /* offset of request data from parity data */
    struct buf *ubp;                                        /* user buffer header */
    struct request *rq;                                     /* pointer to our request */
    struct rqgroup *rqg;                                    /* and to the request group */
    struct rqelement *prqe;                                 /* point to the parity block */
    struct drive *drive;                                    /* drive to access */

    rqg = rqe->rqg;                                         /* point to our request group */
    rq = rqg->rq;                                           /* and to our request */
    ubp = rq->bp;                                           /* user's buffer header */
    prqe = &rqg->rqe[0];                                    /* point to the parity block */

    /*
     * If we get to this function, we have normal or
     * degraded writes, or a combination of both.  We do
     * the same thing in each case: we exclusive-OR the
     * data into the parity block.  The only difference
     * is the origin of the data and the address range.
     */
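    /*
     * A sketch of the two cases: a degraded write
     * cannot read the old data or parity from the
     * failed subdisk, so the first branch below
     * rebuilds the parity from scratch, XORing
     * together the blocks it holds in the group
     * buffers.  A normal write updates the parity in
     * place instead: XOR out the old data and XOR in
     * the new.
     */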
    if (rqe->flags & XFR_DEGRADED_WRITE) {                  /* do the degraded write stuff */
        pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
        bzero(pdata, prqe->grouplen << DEV_BSHIFT);         /* start with nothing in the parity block */

        /* Now get what data we need from each block */
        for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
            rqe = &rqg->rqe[rqno];                          /* this request */
            sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
            length = rqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

            /*
             * Add the data block to the parity block.  Before
             * we started the request, we zeroed the parity
             * block, so the result of adding all the other
             * blocks and the block we want to write will be
             * the correct parity block.
             */
            for (count = 0; count < length; count++)
                pdata[count] ^= sdata[count];
            if ((rqe->flags & XFR_MALLOCED)                 /* the buffer was malloced, */
            &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {     /* and we have no normal write, */
                Free(rqe->b.b_data);                        /* free it now */
                rqe->flags &= ~XFR_MALLOCED;
            }
        }
    }
    if (rqg->flags & XFR_NORMAL_WRITE) {                    /* do normal write stuff */
        /* Get what data we need from each block */
        for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
            rqe = &rqg->rqe[rqno];                          /* this request */
            if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
                == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
                sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
                rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
                pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
                length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

                /*
                 * "Remove" the old data block
                 * from the parity block.
                 */
                if ((pdata < ((int *) prqe->b.b_data))
                    || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
                    || (sdata < ((int *) rqe->b.b_data))
                    || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
                    panic("complete_raid5_write: bounds overflow");
                for (count = 0; count < length; count++)
                    pdata[count] ^= sdata[count];

                /* "Add" the new data block. */
                sdata = (int *) (&ubp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
                if ((sdata < ((int *) ubp->b_data))
                    || (&sdata[length] > ((int *) (ubp->b_data + ubp->b_bcount))))
                    panic("complete_raid5_write: bounds overflow");
                for (count = 0; count < length; count++)
                    pdata[count] ^= sdata[count];
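                /*
                 * The two loops above are the classical
                 * read-modify-write parity update:
                 * p_new = p_old ^ d_old ^ d_new.  The
                 * parity buffer holds p_old from the
                 * first-phase read, so it now contains
                 * the parity which the writes below
                 * put on disk.
                 */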

                /* Free the malloced buffer */
                if (rqe->flags & XFR_MALLOCED) {            /* the buffer was malloced, */
                    Free(rqe->b.b_data);                    /* free it */
                    rqe->flags &= ~XFR_MALLOCED;
                } else
                    panic("complete_raid5_write: malloc conflict");

                if ((rqe->b.b_iocmd == BIO_READ)            /* this was a read */
                &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
                    rqe->b.b_flags &= ~B_DONE;              /* start a new request */
                    rqe->b.b_iocmd = BIO_WRITE;             /* we're writing now */
                    rqe->b.b_iodone = complete_rqe;         /* call us here when done */
                    rqe->flags &= ~XFR_PARITYOP;            /* reset flags that brought us here */
                    rqe->b.b_data = &ubp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
                    rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
                    rqe->b.b_bufsize = rqe->b.b_bcount;     /* don't claim more */
                    rqe->b.b_resid = rqe->b.b_bcount;       /* nothing transferred */
                    rqe->b.b_blkno += rqe->dataoffset;      /* point to the correct block */
                    rqg->active++;                          /* another active request */
                    drive = &DRIVE[rqe->driveno];           /* drive to access */

                    /* We can't sleep here, so we just increment the counters. */
                    drive->active++;
                    if (drive->active >= drive->maxactive)
                        drive->maxactive = drive->active;
                    vinum_conf.active++;
                    if (vinum_conf.active >= vinum_conf.maxactive)
                        vinum_conf.maxactive = vinum_conf.active;
#ifdef VINUMDEBUG
                    if (debug & DEBUG_ADDRESSES)
                        log(LOG_DEBUG,
                            "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
                            rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
                            major(rqe->b.b_dev),
                            minor(rqe->b.b_dev),
                            rqe->sdno,
                            (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
                            (long long) rqe->b.b_blkno,
                            rqe->b.b_bcount);
                    if (debug & DEBUG_LASTREQS)
                        logrq(loginfo_raid5_data, (union rqinfou) rqe, ubp);
#endif
                    DEV_STRATEGY(&rqe->b, 0);
                }
            }
        }
    }
    /* Finally, write the parity block */
    rqe = &rqg->rqe[0];
    rqe->b.b_flags &= ~B_DONE;                              /* we're not done */
    rqe->b.b_iocmd = BIO_WRITE;                             /* we're writing now */
    rqe->b.b_iodone = complete_rqe;                         /* call us here when done */
    rqg->flags &= ~XFR_PARITYOP;                            /* reset flags that brought us here */
    rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;            /* length to write */
    rqe->b.b_bufsize = rqe->b.b_bcount;                     /* don't claim we have more */
    rqe->b.b_resid = rqe->b.b_bcount;                       /* nothing transferred */
    rqg->active++;                                          /* another active request */
    drive = &DRIVE[rqe->driveno];                           /* drive to access */

    /* We can't sleep here, so we just increment the counters. */
    drive->active++;
    if (drive->active >= drive->maxactive)
        drive->maxactive = drive->active;
    vinum_conf.active++;
    if (vinum_conf.active >= vinum_conf.maxactive)
        vinum_conf.maxactive = vinum_conf.active;

#ifdef VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
        log(LOG_DEBUG,
            "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
            rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
            major(rqe->b.b_dev),
            minor(rqe->b.b_dev),
            rqe->sdno,
            (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
            (long long) rqe->b.b_blkno,
            rqe->b.b_bcount);
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubp);
#endif
    DEV_STRATEGY(&rqe->b, 0);
}

/* Local Variables: */
/* fill-column: 50 */
/* End: */