/* vinuminterrupt.c: bottom half of the driver */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*-
 * Copyright (c) 1997, 1998, 1999
 *    Nan Yang Computer Services Limited.  All rights reserved.
 *
 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
 *
 * Written by Greg Lehey
 *
 * This software is distributed under the so-called ``Berkeley
 * License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Nan Yang Computer
 *      Services Limited.
 * 4. Neither the name of the Company nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinuminterrupt.c,v 1.41 2003/08/24 17:55:56 obrien Exp $
 */

#include <dev/vinum/vinumhdr.h>
#include <dev/vinum/request.h>
#include <sys/resourcevar.h>

void complete_raid5_write(struct rqelement *);
void complete_rqe(struct buf *bp);
void sdio_done(struct buf *bp);

/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The bp parameter is in fact a struct rqelement, which
 * includes a couple of extras at the end.
 */
void
complete_rqe(struct buf *bp)
{
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct buf *ubp; /* user buffer */
    struct drive *drive;
    struct sd *sd;
    char *gravity; /* for error messages */

    rqe = (struct rqelement *) bp; /* point to the element that completed */
    rqg = rqe->rqg; /* and the request group */
    rq = rqg->rq; /* and the complete request */
    ubp = rq->bp; /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_iodone, (union rqinfou) rqe, ubp);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--; /* one less outstanding I/O on this drive */
    vinum_conf.active--; /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1)) /* we were at the drive limit */
        || (vinum_conf.active == VINUM_MAXACTIVE)) /* or the global limit */
        wakeup(&launch_requests); /* let another one at it */
    if ((bp->b_io.bio_flags & BIO_ERROR) != 0) { /* transfer in error */
        gravity = "";
        sd = &SD[rqe->sdno];

        if (bp->b_error != 0) /* did it return a number? */
            rq->error = bp->b_error; /* yes, put it in. */
        else if (rq->error == 0) /* no: do we have one already? */
            rq->error = EIO; /* no: catchall "I/O error" */
        sd->lasterror = rq->error;
        if (bp->b_iocmd == BIO_READ) { /* read operation */
            if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
                gravity = " fatal";
                set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
            }
            log(LOG_ERR,
                "%s:%s read error, block %lld for %ld bytes\n",
                gravity,
                sd->name,
                (long long)bp->b_blkno,
                bp->b_bcount);
        } else { /* write operation */
            if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
                gravity = " fatal";
                set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
            }
            log(LOG_ERR,
                "%s:%s write error, block %lld for %ld bytes\n",
                gravity,
                sd->name,
                (long long)bp->b_blkno,
                bp->b_bcount);
        }
        log(LOG_ERR,
            "%s: user buffer block %lld for %ld bytes\n",
            sd->name,
            (long long)ubp->b_blkno,
            ubp->b_bcount);
        if (rq->error == ENXIO) { /* the drive's down too */
            log(LOG_ERR,
                "%s: fatal drive I/O error, block %lld for %ld bytes\n",
                DRIVE[rqe->driveno].label.name,
                (long long)bp->b_blkno,
                bp->b_bcount);
            DRIVE[rqe->driveno].lasterror = rq->error;
            set_drive_state(rqe->driveno, /* take the drive down */
                drive_down,
                setstate_force);
        }
    }
    /* Now update the statistics */
    if (bp->b_iocmd == BIO_READ) { /* read operation */
        DRIVE[rqe->driveno].reads++;
        DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
        SD[rqe->sdno].reads++;
        SD[rqe->sdno].bytes_read += bp->b_bcount;
        PLEX[rqe->rqg->plexno].reads++;
        PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
        if (PLEX[rqe->rqg->plexno].volno >= 0) { /* volume I/O, not plex */
            VOL[PLEX[rqe->rqg->plexno].volno].reads++;
            VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
        }
    } else { /* write operation */
        DRIVE[rqe->driveno].writes++;
        DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
        SD[rqe->sdno].writes++;
        SD[rqe->sdno].bytes_written += bp->b_bcount;
        PLEX[rqe->rqg->plexno].writes++;
        PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
        if (PLEX[rqe->rqg->plexno].volno >= 0) { /* volume I/O, not plex */
            VOL[PLEX[rqe->rqg->plexno].volno].writes++;
            VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
        }
    }
    if (rqg->flags & XFR_RECOVERY_READ) { /* recovery read, */
        int *sdata; /* source */
        int *data; /* and group data */
        int length; /* and count involved */
        int count; /* loop counter */
        struct rqelement *urqe = &rqg->rqe[rqg->badsdno]; /* rqe of the bad subdisk */

        /* XOR destination is the user data */
        sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
        data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
        length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
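
        /*
         * Reconstruct the contents of the dead subdisk: the missing
         * block is the XOR of the corresponding blocks on all the
         * surviving members of the stripe.  Each subrequest that
         * completes folds its portion into the bad subdisk's buffer,
         * so after the last one arrives the buffer holds the
         * rebuilt data.
         */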
        for (count = 0; count < length; count++)
            data[count] ^= sdata[count];

        /*
         * In a normal read, we will normally read directly
         * into the user buffer.  This doesn't work if
         * we're also doing a recovery, so we have to
         * copy it.
         */
        if (rqe->flags & XFR_NORMAL_READ) { /* normal read as well, */
            char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
            char *dst;

            dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
            length = rqe->datalen << DEV_BSHIFT; /* and count involved */
            bcopy(src, dst, length); /* move it */
        }
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation */
        && (rqg->active == 1)) /* and this is the last active request */
        complete_raid5_write(rqe);
    /*
     * This is the earliest place where we can be
     * sure that the request has really finished,
     * since complete_raid5_write can issue new
     * requests.
     */
    rqg->active--; /* this request now finished */
    if (rqg->active == 0) { /* request group finished, */
        rq->active--; /* one less */
        if (rqg->lock) { /* got a lock? */
            unlockrange(rqg->plexno, rqg->lock); /* yes, free it */
            rqg->lock = 0;
        }
    }
    if (rq->active == 0) { /* request finished, */
#ifdef VINUMDEBUG
        if (debug & DEBUG_RESID) {
            if (ubp->b_resid != 0) /* still something to transfer? */
                kdb_enter("resid");
        }
#endif

        if (rq->error) { /* did we have an error? */
            if (rq->isplex) { /* plex operation, */
                ubp->b_io.bio_flags |= BIO_ERROR; /* yes, propagate to user */
                ubp->b_error = rq->error;
            } else /* try to recover */
                queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
        } else {
            ubp->b_resid = 0; /* completed our transfer */
            if (rq->isplex == 0) /* volume request, */
                VOL[rq->volplex.volno].active--; /* another request finished */
            if (rq->flags & XFR_COPYBUF) {
                Free(ubp->b_data);
                ubp->b_data = rq->save_data;
            }
            bufdone(ubp); /* top level buffer completed */
            freerq(rq); /* return the request storage */
        }
    }
}

/* Free a request block and anything hanging off it */
void
freerq(struct request *rq)
{
    struct rqgroup *rqg;
    struct rqgroup *nrqg; /* next in chain */
    int rqno;

    for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) { /* through the whole request chain */
        if (rqg->lock) /* got a lock? */
            unlockrange(rqg->plexno, rqg->lock); /* yes, free it */
        for (rqno = 0; rqno < rqg->count; rqno++) {
            if ((rqg->rqe[rqno].flags & XFR_MALLOCED) /* data buffer was malloced, */
                && rqg->rqe[rqno].b.b_data) /* and the allocation succeeded */
                Free(rqg->rqe[rqno].b.b_data); /* free it */
            if (rqg->rqe[rqno].flags & XFR_BUFLOCKED) { /* locked this buffer, */
                BUF_UNLOCK(&rqg->rqe[rqno].b); /* unlock it again */
                BUF_LOCKFREE(&rqg->rqe[rqno].b);
            }
        }
        nrqg = rqg->next; /* note the next one */
        Free(rqg); /* and free this one */
    }
    Free(rq); /* free the request itself */
}

/* I/O on subdisk completed */
void
sdio_done(struct buf *bp)
{
    struct sdbuf *sbp;

    sbp = (struct sdbuf *) bp;
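    /*
     * The cast above relies on struct sdbuf starting with our own
     * struct buf (b); the remaining fields give us the caller's
     * buffer (bp) and the drive and subdisk indices used below for
     * error propagation and statistics.
     */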
    if (sbp->b.b_io.bio_flags & BIO_ERROR) { /* had an error */
        sbp->bp->b_io.bio_flags |= BIO_ERROR; /* propagate upwards */
        sbp->bp->b_error = sbp->b.b_error;
    }
#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_sdiodone, (union rqinfou) bp, bp);
#endif
    sbp->bp->b_resid = sbp->b.b_resid; /* copy the resid field */
    /* Now update the statistics */
    if (bp->b_iocmd == BIO_READ) { /* read operation */
        DRIVE[sbp->driveno].reads++;
        DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
        SD[sbp->sdno].reads++;
        SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
    } else { /* write operation */
        DRIVE[sbp->driveno].writes++;
        DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
        SD[sbp->sdno].writes++;
        SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
    }
    bufdone(sbp->bp); /* complete the caller's I/O */
    BUF_UNLOCK(&sbp->b);
    BUF_LOCKFREE(&sbp->b);
    Free(sbp);
}

/* Start the second phase of a RAID-4 or RAID-5 group write operation. */
void
complete_raid5_write(struct rqelement *rqe)
{
    int *sdata; /* source */
    int *pdata; /* and parity block data */
    int length; /* and count involved */
    int count; /* loop counter */
    int rqno; /* request index */
    int rqoffset; /* offset of request data from parity data */
    struct buf *ubp; /* user buffer header */
    struct request *rq; /* pointer to our request */
    struct rqgroup *rqg; /* and to the request group */
    struct rqelement *prqe; /* point to the parity block */
    struct drive *drive; /* drive to access */

    rqg = rqe->rqg; /* and to our request group */
    rq = rqg->rq; /* point to our request */
    ubp = rq->bp; /* user's buffer header */
    prqe = &rqg->rqe[0]; /* point to the parity block */
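    /*
     * By construction of the request group, rqe[0] addresses the
     * parity stripe; the data blocks follow from index 1, which is
     * why the loops below start at rqno = 1.
     */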

    /*
     * If we get to this function, we have normal or
     * degraded writes, or a combination of both.  We do
     * the same thing in each case: we perform an
     * exclusive or to the parity block.  The only
     * difference is the origin of the data and the
     * address range.
     */
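    /*
     * Two parity calculations appear below:
     *
     * - degraded write: the parity block is rebuilt from scratch as
     *   the XOR of all the data blocks in the stripe;
     *
     * - normal write: read-modify-write, where the new parity is the
     *   old parity XOR the old data XOR the new data.
     */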
    if (rqe->flags & XFR_DEGRADED_WRITE) { /* do the degraded write stuff */
        pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
        bzero(pdata, prqe->grouplen << DEV_BSHIFT); /* start with nothing in the parity block */

        /* Now get what data we need from each block */
        for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */
            rqe = &rqg->rqe[rqno]; /* this request */
            sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
            length = rqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */

            /*
             * Add the data block to the parity block.  Before
             * we started the request, we zeroed the parity
             * block, so the result of adding all the other
             * blocks and the block we want to write will be
             * the correct parity block.
             */
            for (count = 0; count < length; count++)
                pdata[count] ^= sdata[count];
            if ((rqe->flags & XFR_MALLOCED) /* the buffer was malloced, */
                && ((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */
                Free(rqe->b.b_data); /* free it now */
                rqe->flags &= ~XFR_MALLOCED;
            }
        }
    }
    if (rqg->flags & XFR_NORMAL_WRITE) { /* do normal write stuff */
        /* Get what data we need from each block */
        for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */
            rqe = &rqg->rqe[rqno]; /* this request */
            if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
                == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) { /* good data block to write */
                sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
                rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
                pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
                length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

                /*
                 * "remove" the old data block
                 * from the parity block
                 */
                if ((pdata < ((int *) prqe->b.b_data))
                    || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
                    || (sdata < ((int *) rqe->b.b_data))
                    || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
                    panic("complete_raid5_write: bounds overflow");
                for (count = 0; count < length; count++)
                    pdata[count] ^= sdata[count];

                /* "add" the new data block */
                sdata = (int *) (&ubp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
                if ((sdata < ((int *) ubp->b_data))
                    || (&sdata[length] > ((int *) (ubp->b_data + ubp->b_bcount))))
                    panic("complete_raid5_write: bounds overflow");
                for (count = 0; count < length; count++)
                    pdata[count] ^= sdata[count];

                /* Free the malloced buffer */
                if (rqe->flags & XFR_MALLOCED) { /* the buffer was malloced, */
                    Free(rqe->b.b_data); /* free it */
                    rqe->flags &= ~XFR_MALLOCED;
                } else
                    panic("complete_raid5_write: malloc conflict");

                if ((rqe->b.b_iocmd == BIO_READ) /* this was a read */
                    && ((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */
                    rqe->b.b_flags &= ~B_DONE; /* start a new request */
                    rqe->b.b_iocmd = BIO_WRITE; /* we're writing now */
                    rqe->b.b_iodone = complete_rqe; /* call us here when done */
                    rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */
                    rqe->b.b_data = &ubp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
                    rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
                    rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim more */
                    rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */
                    rqe->b.b_blkno += rqe->dataoffset; /* point to the correct block */
                    rqe->b.b_offset = rqe->b.b_blkno << DEV_BSHIFT;
                    rqe->b.b_iooffset = rqe->b.b_offset;
                    rqg->active++; /* another active request */
                    drive = &DRIVE[rqe->driveno]; /* drive to access */

                    /* We can't sleep here, so we just increment the counters. */
                    drive->active++;
                    if (drive->active >= drive->maxactive)
                        drive->maxactive = drive->active;
                    vinum_conf.active++;
                    if (vinum_conf.active >= vinum_conf.maxactive)
                        vinum_conf.maxactive = vinum_conf.active;
#ifdef VINUMDEBUG
                    if (debug & DEBUG_ADDRESSES)
                        log(LOG_DEBUG,
                            " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
                            rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
                            major(rqe->b.b_dev),
                            minor(rqe->b.b_dev),
                            rqe->sdno,
                            (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
                            (long long)rqe->b.b_blkno,
                            rqe->b.b_bcount);
                    if (debug & DEBUG_LASTREQS)
                        logrq(loginfo_raid5_data, (union rqinfou) rqe, ubp);
#endif
                    DEV_STRATEGY(&rqe->b);
                }
            }
        }
    }
    /* Finally, write the parity block */
    rqe = &rqg->rqe[0];
    rqe->b.b_flags &= ~B_DONE; /* we're not done */
    rqe->b.b_iocmd = BIO_WRITE; /* we're writing now */
    rqe->b.b_iodone = complete_rqe; /* call us here when done */
    rqg->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */
    rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT; /* length to write */
    rqe->b.b_offset = rqe->b.b_blkno << DEV_BSHIFT;
    rqe->b.b_iooffset = rqe->b.b_offset;
    rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim we have more */
    rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */
    rqg->active++; /* another active request */
    drive = &DRIVE[rqe->driveno]; /* drive to access */

    /* We can't sleep here, so we just increment the counters. */
    drive->active++;
    if (drive->active >= drive->maxactive)
        drive->maxactive = drive->active;
    vinum_conf.active++;
    if (vinum_conf.active >= vinum_conf.maxactive)
        vinum_conf.maxactive = vinum_conf.active;

#ifdef VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
        log(LOG_DEBUG,
            " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
            rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
            major(rqe->b.b_dev),
            minor(rqe->b.b_dev),
            rqe->sdno,
            (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
            (long long)rqe->b.b_blkno,
            rqe->b.b_bcount);
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubp);
#endif
    DEV_STRATEGY(&rqe->b);
}

/* Local Variables: */
/* fill-column: 50 */
/* End: */