vinuminterrupt.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /* vinuminterrupt.c: bottom half of the driver */
    2 
    3 /*-
    4  * Copyright (c) 1997, 1998, 1999
    5  *      Nan Yang Computer Services Limited.  All rights reserved.
    6  *
    7  *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
    8  *
    9  *  Written by Greg Lehey
   10  *
   11  *  This software is distributed under the so-called ``Berkeley
   12  *  License'':
   13  *
   14  * Redistribution and use in source and binary forms, with or without
   15  * modification, are permitted provided that the following conditions
   16  * are met:
   17  * 1. Redistributions of source code must retain the above copyright
   18  *    notice, this list of conditions and the following disclaimer.
   19  * 2. Redistributions in binary form must reproduce the above copyright
   20  *    notice, this list of conditions and the following disclaimer in the
   21  *    documentation and/or other materials provided with the distribution.
   22  * 3. All advertising materials mentioning features or use of this software
   23  *    must display the following acknowledgement:
   24  *      This product includes software developed by Nan Yang Computer
   25  *      Services Limited.
   26  * 4. Neither the name of the Company nor the names of its contributors
   27  *    may be used to endorse or promote products derived from this software
   28  *    without specific prior written permission.
   29  *
   30  * This software is provided ``as is'', and any express or implied
   31  * warranties, including, but not limited to, the implied warranties of
   32  * merchantability and fitness for a particular purpose are disclaimed.
   33  * In no event shall the company or contributors be liable for any
   34  * direct, indirect, incidental, special, exemplary, or consequential
   35  * damages (including, but not limited to, procurement of substitute
   36  * goods or services; loss of use, data, or profits; or business
   37  * interruption) however caused and on any theory of liability, whether
   38  * in contract, strict liability, or tort (including negligence or
   39  * otherwise) arising in any way out of the use of this software, even if
   40  * advised of the possibility of such damage.
   41  *
   42  * $Id: vinuminterrupt.c,v 1.9 2000/02/16 01:59:02 grog Exp grog $
   43  * $FreeBSD$
   44  */
   45 
   46 #include <dev/vinum/vinumhdr.h>
   47 #include <dev/vinum/request.h>
   48 #include <sys/resourcevar.h>
   49 
   50 void complete_raid5_write(struct rqelement *);
   51 void complete_rqe(struct buf *bp);
   52 void sdio_done(struct buf *bp);
   53 
   54 /*
   55  * Take a completed buffer, transfer the data back if
   56  * it's a read, and complete the high-level request
   57  * if this is the last subrequest.
   58  *
   59  * The bp parameter is in fact a struct rqelement, which
   60  * includes a couple of extras at the end.
   61  */
   62 void
   63 complete_rqe(struct buf *bp)
   64 {
   65     struct rqelement *rqe;
   66     struct request *rq;
   67     struct rqgroup *rqg;
   68     struct buf *ubp;                                        /* user buffer */
   69     struct drive *drive;
   70 
   71     rqe = (struct rqelement *) bp;                          /* point to the element element that completed */
   72     rqg = rqe->rqg;                                         /* and the request group */
   73     rq = rqg->rq;                                           /* and the complete request */
   74     ubp = rq->bp;                                           /* user buffer */
   75 
   76 #ifdef VINUMDEBUG
   77     if (debug & DEBUG_LASTREQS)
   78         logrq(loginfo_iodone, (union rqinfou) rqe, ubp);
   79 #endif
   80     drive = &DRIVE[rqe->driveno];
   81     drive->active--;                                        /* one less outstanding I/O on this drive */
   82     vinum_conf.active--;                                    /* one less outstanding I/O globally */
   83     if ((drive->active == (DRIVE_MAXACTIVE - 1))            /* we were at the drive limit */
   84     ||(vinum_conf.active == VINUM_MAXACTIVE))               /* or the global limit */
   85         wakeup(&launch_requests);                           /* let another one at it */
   86     if ((bp->b_flags & B_ERROR) != 0) {                     /* transfer in error */
   87         if (bp->b_error != 0)                               /* did it return a number? */
   88             rq->error = bp->b_error;                        /* yes, put it in. */
   89         else if (rq->error == 0)                            /* no: do we have one already? */
   90             rq->error = EIO;                                /* no: catchall "I/O error" */
   91         SD[rqe->sdno].lasterror = rq->error;
   92         if (bp->b_flags & B_READ) {
   93             log(LOG_ERR, "%s: fatal read I/O error\n", SD[rqe->sdno].name);
   94             set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
   95         } else {                                            /* write operation */
   96             log(LOG_ERR, "%s: fatal write I/O error\n", SD[rqe->sdno].name);
   97             set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
   98         }
   99         if (rq->error == ENXIO) {                           /* the drive's down too */
  100             log(LOG_ERR, "%s: fatal drive I/O error\n", DRIVE[rqe->driveno].label.name);
  101             DRIVE[rqe->driveno].lasterror = rq->error;
  102             set_drive_state(rqe->driveno,                   /* take the drive down */
  103                 drive_down,
  104                 setstate_force);
  105         }
  106     }
  107     /* Now update the statistics */
  108     if (bp->b_flags & B_READ) {                             /* read operation */
  109         DRIVE[rqe->driveno].reads++;
  110         DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
  111         SD[rqe->sdno].reads++;
  112         SD[rqe->sdno].bytes_read += bp->b_bcount;
  113         PLEX[rqe->rqg->plexno].reads++;
  114         PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
  115         if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
  116             VOL[PLEX[rqe->rqg->plexno].volno].reads++;
  117             VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
  118         }
  119     } else {                                                /* write operation */
  120         DRIVE[rqe->driveno].writes++;
  121         DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
  122         SD[rqe->sdno].writes++;
  123         SD[rqe->sdno].bytes_written += bp->b_bcount;
  124         PLEX[rqe->rqg->plexno].writes++;
  125         PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
  126         if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
  127             VOL[PLEX[rqe->rqg->plexno].volno].writes++;
  128             VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
  129         }
  130     }
  131     if (rqg->flags & XFR_RECOVERY_READ) {                   /* recovery read, */
  132         int *sdata;                                         /* source */
  133         int *data;                                          /* and group data */
  134         int length;                                         /* and count involved */
  135         int count;                                          /* loop counter */
  136         struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */
  137 
  138         /* XOR destination is the user data */
  139         sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
  140         data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
  141         length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
  142 
  143         for (count = 0; count < length; count++)
  144             data[count] ^= sdata[count];
  145 
  146         /*
  147          * In a normal read, we will normally read directly
  148          * into the user buffer.  This doesn't work if
  149          * we're also doing a recovery, so we have to
  150          * copy it
  151          */
  152         if (rqe->flags & XFR_NORMAL_READ) {                 /* normal read as well, */
  153             char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
  154             char *dst;
  155 
  156             dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
  157             length = rqe->datalen << DEV_BSHIFT;            /* and count involved */
  158             bcopy(src, dst, length);                        /* move it */
  159         }
  160     } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation  */
  161     &&(rqg->active == 1))                                   /* and this is the last active request */
  162         complete_raid5_write(rqe);
  163     /*
  164      * This is the earliest place where we can be
  165      * sure that the request has really finished,
  166      * since complete_raid5_write can issue new
  167      * requests.
  168      */
  169     rqg->active--;                                          /* this request now finished */
  170     if (rqg->active == 0) {                                 /* request group finished, */
  171         rq->active--;                                       /* one less */
  172         if (rqg->lock) {                                    /* got a lock? */
  173             unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
  174             rqg->lock = 0;
  175         }
  176     }
  177     if (rq->active == 0) {                                  /* request finished, */
  178 #if VINUMDEBUG
  179         if (debug & DEBUG_RESID) {
  180             if (ubp->b_resid != 0)                          /* still something to transfer? */
  181                 Debugger("resid");
  182         }
  183 #endif
  184 
  185         if (rq->error) {                                    /* did we have an error? */
  186             if (rq->isplex) {                               /* plex operation, */
  187                 ubp->b_flags |= B_ERROR;                    /* yes, propagate to user */
  188                 ubp->b_error = rq->error;
  189             } else                                          /* try to recover */
  190                 queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
  191         } else {
  192             ubp->b_resid = 0;                               /* completed our transfer */
  193             if (rq->isplex == 0)                            /* volume request, */
  194                 VOL[rq->volplex.volno].active--;            /* another request finished */
  195             biodone(ubp);                                   /* top level buffer completed */
  196             freerq(rq);                                     /* return the request storage */
  197         }
  198     }
  199 }
  200 
  201 /* Free a request block and anything hanging off it */
  202 void
  203 freerq(struct request *rq)
  204 {
  205     struct rqgroup *rqg;
  206     struct rqgroup *nrqg;                                   /* next in chain */
  207     int rqno;
  208 
  209     for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) {          /* through the whole request chain */
  210         if (rqg->lock)                                      /* got a lock? */
  211             unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
  212         for (rqno = 0; rqno < rqg->count; rqno++)
  213             if ((rqg->rqe[rqno].flags & XFR_MALLOCED)       /* data buffer was malloced, */
  214             &&rqg->rqe[rqno].b.b_data)                      /* and the allocation succeeded */
  215                 Free(rqg->rqe[rqno].b.b_data);              /* free it */
  216         nrqg = rqg->next;                                   /* note the next one */
  217         Free(rqg);                                          /* and free this one */
  218     }
  219     Free(rq);                                               /* free the request itself */
  220 }
  221 
  222 /* I/O on subdisk completed */
  223 void
  224 sdio_done(struct buf *bp)
  225 {
  226     struct sdbuf *sbp;
  227 
  228     sbp = (struct sdbuf *) bp;
  229     if (sbp->b.b_flags & B_ERROR) {                         /* had an error */
  230         sbp->bp->b_flags |= B_ERROR;                        /* propagate upwards */
  231         sbp->bp->b_error = sbp->b.b_error;
  232     }
  233 #ifdef VINUMDEBUG
  234     if (debug & DEBUG_LASTREQS)
  235         logrq(loginfo_sdiodone, (union rqinfou) bp, bp);
  236 #endif
  237     sbp->bp->b_resid = sbp->b.b_resid;                      /* copy the resid field */
  238     /* Now update the statistics */
  239     if (bp->b_flags & B_READ) {                             /* read operation */
  240         DRIVE[sbp->driveno].reads++;
  241         DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
  242         SD[sbp->sdno].reads++;
  243         SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
  244     } else {                                                /* write operation */
  245         DRIVE[sbp->driveno].writes++;
  246         DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
  247         SD[sbp->sdno].writes++;
  248         SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
  249     }
  250     biodone(sbp->bp);                                       /* complete the caller's I/O */
  251     Free(sbp);
  252 }
  253 
  254 /* Start the second phase of a RAID-4 or RAID-5 group write operation. */
  255 void
  256 complete_raid5_write(struct rqelement *rqe)
  257 {
  258     int *sdata;                                             /* source */
  259     int *pdata;                                             /* and parity block data */
  260     int length;                                             /* and count involved */
  261     int count;                                              /* loop counter */
  262     int rqno;                                               /* request index */
  263     int rqoffset;                                           /* offset of request data from parity data */
  264     struct buf *ubp;                                        /* user buffer header */
  265     struct request *rq;                                     /* pointer to our request */
  266     struct rqgroup *rqg;                                    /* and to the request group */
  267     struct rqelement *prqe;                                 /* point to the parity block */
  268     struct drive *drive;                                    /* drive to access */
  269 
  270     rqg = rqe->rqg;                                         /* and to our request group */
  271     rq = rqg->rq;                                           /* point to our request */
  272     ubp = rq->bp;                                           /* user's buffer header */
  273     prqe = &rqg->rqe[0];                                    /* point to the parity block */
  274 
  275     /*
  276      * If we get to this function, we have normal or
  277      * degraded writes, or a combination of both.  We do
  278      * the same thing in each case: we perform an
  279      * exclusive or to the parity block.  The only
  280      * difference is the origin of the data and the
  281      * address range.
  282      */
  283     if (rqe->flags & XFR_DEGRADED_WRITE) {                  /* do the degraded write stuff */
  284         pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
  285         bzero(pdata, prqe->grouplen << DEV_BSHIFT);         /* start with nothing in the parity block */
  286 
  287         /* Now get what data we need from each block */
  288         for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
  289             rqe = &rqg->rqe[rqno];                          /* this request */
  290             sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
  291             length = rqe->grouplen << (DEV_BSHIFT - 2);     /* and count involved */
  292 
  293             /*
  294              * Add the data block to the parity block.  Before
  295              * we started the request, we zeroed the parity
  296              * block, so the result of adding all the other
  297              * blocks and the block we want to write will be
  298              * the correct parity block.
  299              */
  300             for (count = 0; count < length; count++)
  301                 pdata[count] ^= sdata[count];
  302             if ((rqe->flags & XFR_MALLOCED)                 /* the buffer was malloced, */
  303             &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {     /* and we have no normal write, */
  304                 Free(rqe->b.b_data);                        /* free it now */
  305                 rqe->flags &= ~XFR_MALLOCED;
  306             }
  307         }
  308     }
  309     if (rqg->flags & XFR_NORMAL_WRITE) {                    /* do normal write stuff */
  310         /* Get what data we need from each block */
  311         for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
  312             rqe = &rqg->rqe[rqno];                          /* this request */
  313             if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
  314                 == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
  315                 sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
  316                 rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
  317                 pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
  318                 length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
  319 
  320                 /*
  321                  * "remove" the old data block
  322                  * from the parity block
  323                  */
  324                 if ((pdata < ((int *) prqe->b.b_data))
  325                     || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
  326                     || (sdata < ((int *) rqe->b.b_data))
  327                     || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
  328                     panic("complete_raid5_write: bounds overflow");
  329                 for (count = 0; count < length; count++)
  330                     pdata[count] ^= sdata[count];
  331 
  332                 /* "add" the new data block */
  333                 sdata = (int *) (&ubp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
  334                 if ((sdata < ((int *) ubp->b_data))
  335                     || (&sdata[length] > ((int *) (ubp->b_data + ubp->b_bcount))))
  336                     panic("complete_raid5_write: bounds overflow");
  337                 for (count = 0; count < length; count++)
  338                     pdata[count] ^= sdata[count];
  339 
  340                 /* Free the malloced buffer */
  341                 if (rqe->flags & XFR_MALLOCED) {            /* the buffer was malloced, */
  342                     Free(rqe->b.b_data);                    /* free it */
  343                     rqe->flags &= ~XFR_MALLOCED;
  344                 } else
  345                     panic("complete_raid5_write: malloc conflict");
  346 
  347                 if ((rqe->b.b_flags & B_READ)               /* this was a read */
  348                 &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
  349                     rqe->b.b_flags &= ~(B_READ | B_DONE);   /* we're writing now */
  350                     rqe->b.b_flags |= B_CALL;               /* call us when you're done */
  351                     rqe->b.b_iodone = complete_rqe;         /* by calling us here */
  352                     rqe->flags &= ~XFR_PARITYOP;            /* reset flags that brought us here */
  353                     rqe->b.b_data = &ubp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
  354                     rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
  355                     rqe->b.b_bufsize = rqe->b.b_bcount;     /* don't claim more */
  356                     rqe->b.b_resid = rqe->b.b_bcount;       /* nothing transferred */
  357                     rqe->b.b_blkno += rqe->dataoffset;      /* point to the correct block */
  358                     rqg->active++;                          /* another active request */
  359                     drive = &DRIVE[rqe->driveno];           /* drive to access */
  360 
  361                                                             /* We can't sleep here, so we just increment the counters. */
  362                     drive->active++;
  363                     if (drive->active >= drive->maxactive)
  364                         drive->maxactive = drive->active;
  365                     vinum_conf.active++;
  366                     if (vinum_conf.active >= vinum_conf.maxactive)
  367                         vinum_conf.maxactive = vinum_conf.active;
  368 #if VINUMDEBUG
  369                     if (debug & DEBUG_ADDRESSES)
  370                         log(LOG_DEBUG,
  371                             "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
  372                             rqe->b.b_flags & B_READ ? "Read" : "Write",
  373                             major(rqe->b.b_dev),
  374                             minor(rqe->b.b_dev),
  375                             rqe->sdno,
  376                             (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
  377                             rqe->b.b_blkno,
  378                             rqe->b.b_bcount);
  379                     if (debug & DEBUG_LASTREQS)
  380                         logrq(loginfo_raid5_data, (union rqinfou) rqe, ubp);
  381 #endif
  382                     (*bdevsw[major(rqe->b.b_dev)]->d_strategy) (&rqe->b);
  383                 }
  384             }
  385         }
  386     }
  387     /* Finally, write the parity block */
  388     rqe = &rqg->rqe[0];
  389     rqe->b.b_flags &= ~(B_READ | B_DONE);                   /* we're writing now */
  390     rqe->b.b_flags |= B_CALL;                               /* tell us when you're done */
  391     rqe->b.b_iodone = complete_rqe;                         /* by calling us here */
  392     rqg->flags &= ~XFR_PARITYOP;                            /* reset flags that brought us here */
  393     rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;            /* length to write */
  394     rqe->b.b_bufsize = rqe->b.b_bcount;                     /* don't claim we have more */
  395     rqe->b.b_resid = rqe->b.b_bcount;                       /* nothing transferred */
  396     rqg->active++;                                          /* another active request */
  397     drive = &DRIVE[rqe->driveno];                           /* drive to access */
  398 
  399     /* We can't sleep here, so we just increment the counters. */
  400     drive->active++;
  401     if (drive->active >= drive->maxactive)
  402         drive->maxactive = drive->active;
  403     vinum_conf.active++;
  404     if (vinum_conf.active >= vinum_conf.maxactive)
  405         vinum_conf.maxactive = vinum_conf.active;
  406 
  407 #if VINUMDEBUG
  408     if (debug & DEBUG_ADDRESSES)
  409         log(LOG_DEBUG,
  410             "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
  411             rqe->b.b_flags & B_READ ? "Read" : "Write",
  412             major(rqe->b.b_dev),
  413             minor(rqe->b.b_dev),
  414             rqe->sdno,
  415             (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
  416             rqe->b.b_blkno,
  417             rqe->b.b_bcount);
  418     if (debug & DEBUG_LASTREQS)
  419         logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubp);
  420 #endif
  421     (*bdevsw[major(rqe->b.b_dev)]->d_strategy) (&rqe->b);
  422 }
  423 
  424 /* Local Variables: */
  425 /* fill-column: 50 */
  426 /* End: */
Cache object: f99cbfa1354678f4db3a7faab7740499
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/dev/vinum/vinuminterrupt.c

FreeBSD/Linux Kernel Cross Reference
sys/dev/vinum/vinuminterrupt.c