subr_disk.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*      $NetBSD: subr_disk.c,v 1.60 2004/03/09 12:23:07 yamt Exp $      */
    2 
    3 /*-
    4  * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
    9  * NASA Ames Research Center.
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  * 1. Redistributions of source code must retain the above copyright
   15  *    notice, this list of conditions and the following disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  * 3. All advertising materials mentioning features or use of this software
   20  *    must display the following acknowledgement:
   21  *      This product includes software developed by the NetBSD
   22  *      Foundation, Inc. and its contributors.
   23  * 4. Neither the name of The NetBSD Foundation nor the names of its
   24  *    contributors may be used to endorse or promote products derived
   25  *    from this software without specific prior written permission.
   26  *
   27  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   29  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   30  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   31  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   37  * POSSIBILITY OF SUCH DAMAGE.
   38  */
   39 
   40 /*
   41  * Copyright (c) 1982, 1986, 1988, 1993
   42  *      The Regents of the University of California.  All rights reserved.
   43  * (c) UNIX System Laboratories, Inc.
   44  * All or some portions of this file are derived from material licensed
   45  * to the University of California by American Telephone and Telegraph
   46  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   47  * the permission of UNIX System Laboratories, Inc.
   48  *
   49  * Redistribution and use in source and binary forms, with or without
   50  * modification, are permitted provided that the following conditions
   51  * are met:
   52  * 1. Redistributions of source code must retain the above copyright
   53  *    notice, this list of conditions and the following disclaimer.
   54  * 2. Redistributions in binary form must reproduce the above copyright
   55  *    notice, this list of conditions and the following disclaimer in the
   56  *    documentation and/or other materials provided with the distribution.
   57  * 3. Neither the name of the University nor the names of its contributors
   58  *    may be used to endorse or promote products derived from this software
   59  *    without specific prior written permission.
   60  *
   61  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   62  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   63  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   64  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   65  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   66  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   67  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   68  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   69  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   70  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   71  * SUCH DAMAGE.
   72  *
   73  *      @(#)ufs_disksubr.c      8.5 (Berkeley) 1/21/94
   74  */
   75 
   76 #include <sys/cdefs.h>
   77 __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.60 2004/03/09 12:23:07 yamt Exp $");
   78 
   79 #include "opt_compat_netbsd.h"
   80 #include "opt_bufq.h"
   81 
   82 #include <sys/param.h>
   83 #include <sys/kernel.h>
   84 #include <sys/malloc.h>
   85 #include <sys/buf.h>
   86 #include <sys/syslog.h>
   87 #include <sys/disklabel.h>
   88 #include <sys/disk.h>
   89 #include <sys/sysctl.h>
   90 #include <lib/libkern/libkern.h>
   91 
   92 /*
   93  * A global list of all disks attached to the system.  May grow or
   94  * shrink over time.
   95  */
   96 struct  disklist_head disklist; /* TAILQ_HEAD */
   97 int     disk_count;             /* number of drives in global disklist */
   98 struct simplelock disklist_slock = SIMPLELOCK_INITIALIZER;
   99 
  100 #ifdef NEW_BUFQ_STRATEGY
  101 int bufq_disk_default_strat = BUFQ_READ_PRIO;
  102 #else /* NEW_BUFQ_STRATEGY */
  103 int bufq_disk_default_strat = BUFQ_DISKSORT;
  104 #endif /* NEW_BUFQ_STRATEGY */
  105 
  106 /*
  107  * Compute checksum for disk label.
  108  */
  109 u_int
  110 dkcksum(struct disklabel *lp)
  111 {
  112         u_short *start, *end;
  113         u_short sum = 0;
  114 
  115         start = (u_short *)lp;
  116         end = (u_short *)&lp->d_partitions[lp->d_npartitions];
  117         while (start < end)
  118                 sum ^= *start++;
  119         return (sum);
  120 }
  121 
  122 /*
  123  * Disk error is the preface to plaintive error messages
  124  * about failing disk transfers.  It prints messages of the form
  125 
  126 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
  127 
  128  * if the offset of the error in the transfer and a disk label
  129  * are both available.  blkdone should be -1 if the position of the error
  130  * is unknown; the disklabel pointer may be null from drivers that have not
  131  * been converted to use them.  The message is printed with printf
  132  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
  133  * The message should be completed (with at least a newline) with printf
  134  * or addlog, respectively.  There is no trailing space.
  135  */
  136 #ifndef PRIdaddr
  137 #define PRIdaddr PRId64
  138 #endif
  139 void
  140 diskerr(const struct buf *bp, const char *dname, const char *what, int pri,
  141     int blkdone, const struct disklabel *lp)
  142 {
  143         int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
  144         void (*pr)(const char *, ...);
  145         char partname = 'a' + part;
  146         daddr_t sn;
  147 
  148         if (/*CONSTCOND*/0)
  149                 /* Compiler will error this is the format is wrong... */
  150                 printf("%" PRIdaddr, bp->b_blkno);
  151 
  152         if (pri != LOG_PRINTF) {
  153                 static const char fmt[] = "";
  154                 log(pri, fmt);
  155                 pr = addlog;
  156         } else
  157                 pr = printf;
  158         (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
  159             bp->b_flags & B_READ ? "read" : "writ");
  160         sn = bp->b_blkno;
  161         if (bp->b_bcount <= DEV_BSIZE)
  162                 (*pr)("%" PRIdaddr, sn);
  163         else {
  164                 if (blkdone >= 0) {
  165                         sn += blkdone;
  166                         (*pr)("%" PRIdaddr " of ", sn);
  167                 }
  168                 (*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno,
  169                     bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
  170         }
  171         if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
  172                 sn += lp->d_partitions[part].p_offset;
  173                 (*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "",
  174                     dname, unit, sn, sn / lp->d_secpercyl);
  175                 sn %= lp->d_secpercyl;
  176                 (*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")",
  177                     sn / lp->d_nsectors, sn % lp->d_nsectors);
  178         }
  179 }
  180 
  181 /*
  182  * Initialize the disklist.  Called by main() before autoconfiguration.
  183  */
  184 void
  185 disk_init(void)
  186 {
  187 
  188         TAILQ_INIT(&disklist);
  189         disk_count = 0;
  190 }
  191 
  192 /*
  193  * Searches the disklist for the disk corresponding to the
  194  * name provided.
  195  */
  196 struct disk *
  197 disk_find(char *name)
  198 {
  199         struct disk *diskp;
  200 
  201         if ((name == NULL) || (disk_count <= 0))
  202                 return (NULL);
  203 
  204         simple_lock(&disklist_slock);
  205         for (diskp = TAILQ_FIRST(&disklist); diskp != NULL;
  206             diskp = TAILQ_NEXT(diskp, dk_link))
  207                 if (strcmp(diskp->dk_name, name) == 0) {
  208                         simple_unlock(&disklist_slock);
  209                         return (diskp);
  210                 }
  211         simple_unlock(&disklist_slock);
  212 
  213         return (NULL);
  214 }
  215 
  216 /*
  217  * Attach a disk.
  218  */
  219 void
  220 disk_attach(struct disk *diskp)
  221 {
  222         int s;
  223 
  224         /*
  225          * Allocate and initialize the disklabel structures.  Note that
  226          * it's not safe to sleep here, since we're probably going to be
  227          * called during autoconfiguration.
  228          */
  229         diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT);
  230         diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF,
  231             M_NOWAIT);
  232         if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL))
  233                 panic("disk_attach: can't allocate storage for disklabel");
  234 
  235         memset(diskp->dk_label, 0, sizeof(struct disklabel));
  236         memset(diskp->dk_cpulabel, 0, sizeof(struct cpu_disklabel));
  237 
  238         /*
  239          * Set the attached timestamp.
  240          */
  241         s = splclock();
  242         diskp->dk_attachtime = mono_time;
  243         splx(s);
  244 
  245         /*
  246          * Link into the disklist.
  247          */
  248         simple_lock(&disklist_slock);
  249         TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
  250         simple_unlock(&disklist_slock);
  251         ++disk_count;
  252 }
  253 
  254 /*
  255  * Detach a disk.
  256  */
  257 void
  258 disk_detach(struct disk *diskp)
  259 {
  260 
  261         /*
  262          * Remove from the disklist.
  263          */
  264         if (--disk_count < 0)
  265                 panic("disk_detach: disk_count < 0");
  266         simple_lock(&disklist_slock);
  267         TAILQ_REMOVE(&disklist, diskp, dk_link);
  268         simple_unlock(&disklist_slock);
  269 
  270         /*
  271          * Free the space used by the disklabel structures.
  272          */
  273         free(diskp->dk_label, M_DEVBUF);
  274         free(diskp->dk_cpulabel, M_DEVBUF);
  275 }
  276 
  277 /*
  278  * Increment a disk's busy counter.  If the counter is going from
  279  * 0 to 1, set the timestamp.
  280  */
  281 void
  282 disk_busy(struct disk *diskp)
  283 {
  284         int s;
  285 
  286         /*
  287          * XXX We'd like to use something as accurate as microtime(),
  288          * but that doesn't depend on the system TOD clock.
  289          */
  290         if (diskp->dk_busy++ == 0) {
  291                 s = splclock();
  292                 diskp->dk_timestamp = mono_time;
  293                 splx(s);
  294         }
  295 }
  296 
  297 /*
  298  * Decrement a disk's busy counter, increment the byte count, total busy
  299  * time, and reset the timestamp.
  300  */
  301 void
  302 disk_unbusy(struct disk *diskp, long bcount, int read)
  303 {
  304         int s;
  305         struct timeval dv_time, diff_time;
  306 
  307         if (diskp->dk_busy-- == 0) {
  308                 printf("%s: dk_busy < 0\n", diskp->dk_name);
  309                 panic("disk_unbusy");
  310         }
  311 
  312         s = splclock();
  313         dv_time = mono_time;
  314         splx(s);
  315 
  316         timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
  317         timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
  318 
  319         diskp->dk_timestamp = dv_time;
  320         if (bcount > 0) {
  321                 if (read) {
  322                         diskp->dk_rbytes += bcount;
  323                         diskp->dk_rxfer++;
  324                 } else {
  325                         diskp->dk_wbytes += bcount;
  326                         diskp->dk_wxfer++;
  327                 }
  328         }
  329 }
  330 
  331 /*
  332  * Reset the metrics counters on the given disk.  Note that we cannot
  333  * reset the busy counter, as it may case a panic in disk_unbusy().
  334  * We also must avoid playing with the timestamp information, as it
  335  * may skew any pending transfer results.
  336  */
  337 void
  338 disk_resetstat(struct disk *diskp)
  339 {
  340         int s = splbio(), t;
  341 
  342         diskp->dk_rxfer = 0;
  343         diskp->dk_rbytes = 0;
  344         diskp->dk_wxfer = 0;
  345         diskp->dk_wbytes = 0;
  346 
  347         t = splclock();
  348         diskp->dk_attachtime = mono_time;
  349         splx(t);
  350 
  351         timerclear(&diskp->dk_time);
  352 
  353         splx(s);
  354 }
  355 
  356 int
  357 sysctl_hw_disknames(SYSCTLFN_ARGS)
  358 {
  359         char buf[DK_DISKNAMELEN + 1];
  360         char *where = oldp;
  361         struct disk *diskp;
  362         size_t needed, left, slen;
  363         int error, first;
  364 
  365         if (newp != NULL)
  366                 return (EPERM);
  367         if (namelen != 0)
  368                 return (EINVAL);
  369 
  370         first = 1;
  371         error = 0;
  372         needed = 0;
  373         left = *oldlenp;
  374 
  375         simple_lock(&disklist_slock);
  376         for (diskp = TAILQ_FIRST(&disklist); diskp != NULL;
  377             diskp = TAILQ_NEXT(diskp, dk_link)) {
  378                 if (where == NULL)
  379                         needed += strlen(diskp->dk_name) + 1;
  380                 else {
  381                         memset(buf, 0, sizeof(buf));
  382                         if (first) {
  383                                 strncpy(buf, diskp->dk_name, sizeof(buf));
  384                                 first = 0;
  385                         } else {
  386                                 buf[0] = ' ';
  387                                 strncpy(buf + 1, diskp->dk_name,
  388                                     sizeof(buf) - 1);
  389                         }
  390                         buf[DK_DISKNAMELEN] = '\0';
  391                         slen = strlen(buf);
  392                         if (left < slen + 1)
  393                                 break;
  394                         /* +1 to copy out the trailing NUL byte */
  395                         error = copyout(buf, where, slen + 1);
  396                         if (error)
  397                                 break;
  398                         where += slen;
  399                         needed += slen;
  400                         left -= slen;
  401                 }
  402         }
  403         simple_unlock(&disklist_slock);
  404         *oldlenp = needed;
  405         return (error);
  406 }
  407 
  408 int
  409 sysctl_hw_diskstats(SYSCTLFN_ARGS)
  410 {
  411         struct disk_sysctl sdisk;
  412         struct disk *diskp;
  413         char *where = oldp;
  414         size_t tocopy, left;
  415         int error;
  416 
  417         if (newp != NULL)
  418                 return (EPERM);
  419 
  420         /*
  421          * The original hw.diskstats call was broken and did not require
  422          * the userland to pass in it's size of struct disk_sysctl.  This
  423          * was fixed after NetBSD 1.6 was released, and any applications
  424          * that do not pass in the size are given an error only, unless
  425          * we care about 1.6 compatibility.
  426          */
  427         if (namelen == 0)
  428 #ifdef COMPAT_16
  429                 tocopy = offsetof(struct disk_sysctl, dk_rxfer);
  430 #else
  431                 return (EINVAL);
  432 #endif
  433         else
  434                 tocopy = name[0];
  435 
  436         if (where == NULL) {
  437                 *oldlenp = disk_count * tocopy;
  438                 return (0);
  439         }
  440 
  441         error = 0;
  442         left = *oldlenp;
  443         memset(&sdisk, 0, sizeof(sdisk));
  444         *oldlenp = 0;
  445 
  446         simple_lock(&disklist_slock);
  447         TAILQ_FOREACH(diskp, &disklist, dk_link) {
  448                 if (left < tocopy)
  449                         break;
  450                 strncpy(sdisk.dk_name, diskp->dk_name, sizeof(sdisk.dk_name));
  451                 sdisk.dk_xfer = diskp->dk_rxfer + diskp->dk_wxfer;
  452                 sdisk.dk_rxfer = diskp->dk_rxfer;
  453                 sdisk.dk_wxfer = diskp->dk_wxfer;
  454                 sdisk.dk_seek = diskp->dk_seek;
  455                 sdisk.dk_bytes = diskp->dk_rbytes + diskp->dk_wbytes;
  456                 sdisk.dk_rbytes = diskp->dk_rbytes;
  457                 sdisk.dk_wbytes = diskp->dk_wbytes;
  458                 sdisk.dk_attachtime_sec = diskp->dk_attachtime.tv_sec;
  459                 sdisk.dk_attachtime_usec = diskp->dk_attachtime.tv_usec;
  460                 sdisk.dk_timestamp_sec = diskp->dk_timestamp.tv_sec;
  461                 sdisk.dk_timestamp_usec = diskp->dk_timestamp.tv_usec;
  462                 sdisk.dk_time_sec = diskp->dk_time.tv_sec;
  463                 sdisk.dk_time_usec = diskp->dk_time.tv_usec;
  464                 sdisk.dk_busy = diskp->dk_busy;
  465 
  466                 error = copyout(&sdisk, where, min(tocopy, sizeof(sdisk)));
  467                 if (error)
  468                         break;
  469                 where += tocopy;
  470                 *oldlenp += tocopy;
  471                 left -= tocopy;
  472         }
  473         simple_unlock(&disklist_slock);
  474         return (error);
  475 }
  476 
  477 struct bufq_fcfs {
  478         TAILQ_HEAD(, buf) bq_head;      /* actual list of buffers */
  479 };
  480 
  481 struct bufq_disksort {
  482         TAILQ_HEAD(, buf) bq_head;      /* actual list of buffers */
  483 };
  484 
  485 #define PRIO_READ_BURST         48
  486 #define PRIO_WRITE_REQ          16
  487 
  488 struct bufq_prio {
  489         TAILQ_HEAD(, buf) bq_read, bq_write; /* actual list of buffers */
  490         struct buf *bq_write_next;      /* next request in bq_write */
  491         struct buf *bq_next;            /* current request */
  492         int bq_read_burst;              /* # of consecutive reads */
  493 };
  494 
  495 
  496 static __inline int buf_inorder(const struct buf *, const struct buf *, int);
  497 
  498 /*
  499  * Check if two buf's are in ascending order.
  500  */
  501 static __inline int
  502 buf_inorder(const struct buf *bp, const struct buf *bq, int sortby)
  503 {
  504 
  505         if (bp == NULL || bq == NULL)
  506                 return (bq == NULL);
  507 
  508         if (sortby == BUFQ_SORT_CYLINDER) {
  509                 if (bp->b_cylinder != bq->b_cylinder)
  510                         return bp->b_cylinder < bq->b_cylinder;
  511                 else
  512                         return bp->b_rawblkno < bq->b_rawblkno;
  513         } else
  514                 return bp->b_rawblkno < bq->b_rawblkno;
  515 }
  516 
  517 
  518 /*
  519  * First-come first-served sort for disks.
  520  *
  521  * Requests are appended to the queue without any reordering.
  522  */
  523 static void
  524 bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp)
  525 {
  526         struct bufq_fcfs *fcfs = bufq->bq_private;
  527 
  528         TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq);
  529 }
  530 
  531 static struct buf *
  532 bufq_fcfs_get(struct bufq_state *bufq, int remove)
  533 {
  534         struct bufq_fcfs *fcfs = bufq->bq_private;
  535         struct buf *bp;
  536 
  537         bp = TAILQ_FIRST(&fcfs->bq_head);
  538 
  539         if (bp != NULL && remove)
  540                 TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq);
  541 
  542         return (bp);
  543 }
  544 
  545 
  546 /*
  547  * Seek sort for disks.
  548  *
  549  * There are actually two queues, sorted in ascendening order.  The first
  550  * queue holds those requests which are positioned after the current block;
  551  * the second holds requests which came in after their position was passed.
  552  * Thus we implement a one-way scan, retracting after reaching the end of
  553  * the drive to the first request on the second queue, at which time it
  554  * becomes the first queue.
  555  *
  556  * A one-way scan is natural because of the way UNIX read-ahead blocks are
  557  * allocated.
  558  */
  559 static void
  560 bufq_disksort_put(struct bufq_state *bufq, struct buf *bp)
  561 {
  562         struct bufq_disksort *disksort = bufq->bq_private;
  563         struct buf *bq, *nbq;
  564         int sortby;
  565 
  566         sortby = bufq->bq_flags & BUFQ_SORT_MASK;
  567 
  568         bq = TAILQ_FIRST(&disksort->bq_head);
  569 
  570         /*
  571          * If the queue is empty it's easy; we just go on the end.
  572          */
  573         if (bq == NULL) {
  574                 TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq);
  575                 return;
  576         }
  577 
  578         /*
  579          * If we lie before the currently active request, then we
  580          * must locate the second request list and add ourselves to it.
  581          */
  582         if (buf_inorder(bp, bq, sortby)) {
  583                 while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
  584                         /*
  585                          * Check for an ``inversion'' in the normally ascending
  586                          * block numbers, indicating the start of the second
  587                          * request list.
  588                          */
  589                         if (buf_inorder(nbq, bq, sortby)) {
  590                                 /*
  591                                  * Search the second request list for the first
  592                                  * request at a larger block number.  We go
  593                                  * after that; if there is no such request, we
  594                                  * go at the end.
  595                                  */
  596                                 do {
  597                                         if (buf_inorder(bp, nbq, sortby))
  598                                                 goto insert;
  599                                         bq = nbq;
  600                                 } while ((nbq =
  601                                     TAILQ_NEXT(bq, b_actq)) != NULL);
  602                                 goto insert;            /* after last */
  603                         }
  604                         bq = nbq;
  605                 }
  606                 /*
  607                  * No inversions... we will go after the last, and
  608                  * be the first request in the second request list.
  609                  */
  610                 goto insert;
  611         }
  612         /*
  613          * Request is at/after the current request...
  614          * sort in the first request list.
  615          */
  616         while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
  617                 /*
  618                  * We want to go after the current request if there is an
  619                  * inversion after it (i.e. it is the end of the first
  620                  * request list), or if the next request is a larger cylinder
  621                  * than our request.
  622                  */
  623                 if (buf_inorder(nbq, bq, sortby) ||
  624                     buf_inorder(bp, nbq, sortby))
  625                         goto insert;
  626                 bq = nbq;
  627         }
  628         /*
  629          * Neither a second list nor a larger request... we go at the end of
  630          * the first list, which is the same as the end of the whole schebang.
  631          */
  632 insert: TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq);
  633 }
  634 
  635 static struct buf *
  636 bufq_disksort_get(struct bufq_state *bufq, int remove)
  637 {
  638         struct bufq_disksort *disksort = bufq->bq_private;
  639         struct buf *bp;
  640 
  641         bp = TAILQ_FIRST(&disksort->bq_head);
  642 
  643         if (bp != NULL && remove)
  644                 TAILQ_REMOVE(&disksort->bq_head, bp, b_actq);
  645 
  646         return (bp);
  647 }
  648 
  649 
  650 /*
  651  * Seek sort for disks.
  652  *
  653  * There are two queues.  The first queue holds read requests; the second
  654  * holds write requests.  The read queue is first-come first-served; the
  655  * write queue is sorted in ascendening block order.
  656  * The read queue is processed first.  After PRIO_READ_BURST consecutive
  657  * read requests with non-empty write queue PRIO_WRITE_REQ requests from
  658  * the write queue will be processed.
  659  */
  660 static void
  661 bufq_prio_put(struct bufq_state *bufq, struct buf *bp)
  662 {
  663         struct bufq_prio *prio = bufq->bq_private;
  664         struct buf *bq;
  665         int sortby;
  666 
  667         sortby = bufq->bq_flags & BUFQ_SORT_MASK;
  668 
  669         /*
  670          * If it's a read request append it to the list.
  671          */
  672         if ((bp->b_flags & B_READ) == B_READ) {
  673                 TAILQ_INSERT_TAIL(&prio->bq_read, bp, b_actq);
  674                 return;
  675         }
  676 
  677         bq = TAILQ_FIRST(&prio->bq_write);
  678 
  679         /*
  680          * If the write list is empty, simply append it to the list.
  681          */
  682         if (bq == NULL) {
  683                 TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq);
  684                 prio->bq_write_next = bp;
  685                 return;
  686         }
  687 
  688         /*
  689          * If we lie after the next request, insert after this request.
  690          */
  691         if (buf_inorder(prio->bq_write_next, bp, sortby))
  692                 bq = prio->bq_write_next;
  693 
  694         /*
  695          * Search for the first request at a larger block number.
  696          * We go before this request if it exists.
  697          */
  698         while (bq != NULL && buf_inorder(bq, bp, sortby))
  699                 bq = TAILQ_NEXT(bq, b_actq);
  700 
  701         if (bq != NULL)
  702                 TAILQ_INSERT_BEFORE(bq, bp, b_actq);
  703         else
  704                 TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq);
  705 }
  706 
  707 static struct buf *
  708 bufq_prio_get(struct bufq_state *bufq, int remove)
  709 {
  710         struct bufq_prio *prio = bufq->bq_private;
  711         struct buf *bp;
  712 
  713         /*
  714          * If no current request, get next from the lists.
  715          */
  716         if (prio->bq_next == NULL) {
  717                 /*
  718                  * If at least one list is empty, select the other.
  719                  */
  720                 if (TAILQ_FIRST(&prio->bq_read) == NULL) {
  721                         prio->bq_next = prio->bq_write_next;
  722                         prio->bq_read_burst = 0;
  723                 } else if (prio->bq_write_next == NULL) {
  724                         prio->bq_next = TAILQ_FIRST(&prio->bq_read);
  725                         prio->bq_read_burst = 0;
  726                 } else {
  727                         /*
  728                          * Both list have requests.  Select the read list up
  729                          * to PRIO_READ_BURST times, then select the write
  730                          * list PRIO_WRITE_REQ times.
  731                          */
  732                         if (prio->bq_read_burst++ < PRIO_READ_BURST)
  733                                 prio->bq_next = TAILQ_FIRST(&prio->bq_read);
  734                         else if (prio->bq_read_burst <
  735                             PRIO_READ_BURST + PRIO_WRITE_REQ)
  736                                 prio->bq_next = prio->bq_write_next;
  737                         else {
  738                                 prio->bq_next = TAILQ_FIRST(&prio->bq_read);
  739                                 prio->bq_read_burst = 0;
  740                         }
  741                 }
  742         }
  743 
  744         bp = prio->bq_next;
  745 
  746         if (bp != NULL && remove) {
  747                 if ((bp->b_flags & B_READ) == B_READ)
  748                         TAILQ_REMOVE(&prio->bq_read, bp, b_actq);
  749                 else {
  750                         /*
  751                          * Advance the write pointer before removing
  752                          * bp since it is actually prio->bq_write_next.
  753                          */
  754                         prio->bq_write_next =
  755                             TAILQ_NEXT(prio->bq_write_next, b_actq);
  756                         TAILQ_REMOVE(&prio->bq_write, bp, b_actq);
  757                         if (prio->bq_write_next == NULL)
  758                                 prio->bq_write_next =
  759                                     TAILQ_FIRST(&prio->bq_write);
  760                 }
  761 
  762                 prio->bq_next = NULL;
  763         }
  764 
  765         return (bp);
  766 }
  767 
  768 
  769 /*
  770  * Cyclical scan (CSCAN)
  771  */
  772 TAILQ_HEAD(bqhead, buf);
  773 struct cscan_queue {
  774         struct bqhead cq_head[2];       /* actual lists of buffers */
  775         int cq_idx;                     /* current list index */
  776         int cq_lastcylinder;            /* b_cylinder of the last request */
  777         daddr_t cq_lastrawblkno;        /* b_rawblkno of the last request */
  778 };
  779 
  780 static int __inline cscan_empty(const struct cscan_queue *);
  781 static void cscan_put(struct cscan_queue *, struct buf *, int);
  782 static struct buf *cscan_get(struct cscan_queue *, int);
  783 static void cscan_init(struct cscan_queue *);
  784 
  785 static __inline int
  786 cscan_empty(const struct cscan_queue *q)
  787 {
  788 
  789         return TAILQ_EMPTY(&q->cq_head[0]) && TAILQ_EMPTY(&q->cq_head[1]);
  790 }
  791 
  792 static void
  793 cscan_put(struct cscan_queue *q, struct buf *bp, int sortby)
  794 {
  795         struct buf tmp;
  796         struct buf *it;
  797         struct bqhead *bqh;
  798         int idx;
  799 
  800         tmp.b_cylinder = q->cq_lastcylinder;
  801         tmp.b_rawblkno = q->cq_lastrawblkno;
  802 
  803         if (buf_inorder(bp, &tmp, sortby))
  804                 idx = 1 - q->cq_idx;
  805         else
  806                 idx = q->cq_idx;
  807 
  808         bqh = &q->cq_head[idx];
  809 
  810         TAILQ_FOREACH(it, bqh, b_actq)
  811                 if (buf_inorder(bp, it, sortby))
  812                         break;
  813 
  814         if (it != NULL)
  815                 TAILQ_INSERT_BEFORE(it, bp, b_actq);
  816         else
  817                 TAILQ_INSERT_TAIL(bqh, bp, b_actq);
  818 }
  819 
  820 static struct buf *
  821 cscan_get(struct cscan_queue *q, int remove)
  822 {
  823         int idx = q->cq_idx;
  824         struct bqhead *bqh;
  825         struct buf *bp;
  826 
  827         bqh = &q->cq_head[idx];
  828         bp = TAILQ_FIRST(bqh);
  829 
  830         if (bp == NULL) {
  831                 /* switch queue */
  832                 idx = 1 - idx;
  833                 bqh = &q->cq_head[idx];
  834                 bp = TAILQ_FIRST(bqh);
  835         }
  836 
  837         KDASSERT((bp != NULL && !cscan_empty(q)) ||
  838                  (bp == NULL && cscan_empty(q)));
  839 
  840         if (bp != NULL && remove) {
  841                 q->cq_idx = idx;
  842                 TAILQ_REMOVE(bqh, bp, b_actq);
  843 
  844                 q->cq_lastcylinder = bp->b_cylinder;
  845                 q->cq_lastrawblkno =
  846                     bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
  847         }
  848 
  849         return (bp);
  850 }
  851 
  852 static void
  853 cscan_init(struct cscan_queue *q)
  854 {
  855 
  856         TAILQ_INIT(&q->cq_head[0]);
  857         TAILQ_INIT(&q->cq_head[1]);
  858 }
  859 
  860 
  861 /*
  862  * Per-prioritiy CSCAN.
  863  *
  864  * XXX probably we should have a way to raise
  865  * priority of the on-queue requests.
  866  */
  867 #define PRIOCSCAN_NQUEUE        3
  868 
  869 struct priocscan_queue {
  870         struct cscan_queue q_queue;
  871         int q_burst;
  872 };
  873 
  874 struct bufq_priocscan {
  875         struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE];
  876 
  877 #if 0
  878         /*
  879          * XXX using "global" head position can reduce positioning time
  880          * when switching between queues.
  881          * although it might affect against fairness.
  882          */
  883         daddr_t bq_lastrawblkno;
  884         int bq_lastcylinder;
  885 #endif
  886 };
  887 
  888 /*
  889  * how many requests to serve when having pending requests on other queues.
  890  *
  891  * XXX tune
  892  */
  893 const int priocscan_burst[] = {
  894         64, 16, 4
  895 };
  896 
  897 static void bufq_priocscan_put(struct bufq_state *, struct buf *);
  898 static struct buf *bufq_priocscan_get(struct bufq_state *, int);
  899 static void bufq_priocscan_init(struct bufq_state *);
  900 static __inline struct cscan_queue *bufq_priocscan_selectqueue(
  901     struct bufq_priocscan *, const struct buf *);
  902 
  903 static __inline struct cscan_queue *
  904 bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp)
  905 {
  906         static const int priocscan_priomap[] = {
  907                 [BPRIO_TIMENONCRITICAL] = 2,
  908                 [BPRIO_TIMELIMITED] = 1,
  909                 [BPRIO_TIMECRITICAL] = 0
  910         };
  911 
  912         return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue;
  913 }
  914 
  915 static void
  916 bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp)
  917 {
  918         struct bufq_priocscan *q = bufq->bq_private;
  919         struct cscan_queue *cq;
  920         const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
  921 
  922         cq = bufq_priocscan_selectqueue(q, bp);
  923         cscan_put(cq, bp, sortby);
  924 }
  925 
  926 static struct buf *
  927 bufq_priocscan_get(struct bufq_state *bufq, int remove)
  928 {
  929         struct bufq_priocscan *q = bufq->bq_private;
  930         struct priocscan_queue *pq, *npq;
  931         struct priocscan_queue *first; /* first non-empty queue */
  932         const struct priocscan_queue *epq;
  933         const struct cscan_queue *cq;
  934         struct buf *bp;
  935         boolean_t single; /* true if there's only one non-empty queue */
  936 
  937         pq = &q->bq_queue[0];
  938         epq = pq + PRIOCSCAN_NQUEUE;
  939         for (; pq < epq; pq++) {
  940                 cq = &pq->q_queue;
  941                 if (!cscan_empty(cq))
  942                         break;
  943         }
  944         if (pq == epq) {
  945                 /* there's no requests */
  946                 return NULL;
  947         }
  948 
  949         first = pq;
  950         single = TRUE;
  951         for (npq = first + 1; npq < epq; npq++) {
  952                 cq = &npq->q_queue;
  953                 if (!cscan_empty(cq)) {
  954                         single = FALSE;
  955                         if (pq->q_burst > 0)
  956                                 break;
  957                         pq = npq;
  958                 }
  959         }
  960         if (single) {
  961                 /*
  962                  * there's only a non-empty queue.  just serve it.
  963                  */
  964                 pq = first;
  965         } else if (pq->q_burst > 0) {
  966                 /*
  967                  * XXX account only by number of requests.  is it good enough?
  968                  */
  969                 pq->q_burst--;
  970         } else {
  971                 /*
  972                  * no queue was selected due to burst counts
  973                  */
  974                 int i;
  975 #ifdef DEBUG
  976                 for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
  977                         pq = &q->bq_queue[i];
  978                         cq = &pq->q_queue;
  979                         if (!cscan_empty(cq) && pq->q_burst)
  980                                 panic("%s: inconsist", __func__);
  981                 }
  982 #endif /* DEBUG */
  983 
  984                 /*
  985                  * reset burst counts
  986                  */
  987                 for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
  988                         pq = &q->bq_queue[i];
  989                         pq->q_burst = priocscan_burst[i];
  990                 }
  991 
  992                 /*
  993                  * serve first non-empty queue.
  994                  */
  995                 pq = first;
  996         }
  997 
  998         KDASSERT(!cscan_empty(&pq->q_queue));
  999         bp = cscan_get(&pq->q_queue, remove);
 1000         KDASSERT(bp != NULL);
 1001         KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));
 1002 
 1003         return bp;
 1004 }
 1005 
 1006 static void
 1007 bufq_priocscan_init(struct bufq_state *bufq)
 1008 {
 1009         struct bufq_priocscan *q;
 1010         int i;
 1011 
 1012         bufq->bq_get = bufq_priocscan_get;
 1013         bufq->bq_put = bufq_priocscan_put;
 1014         bufq->bq_private = malloc(sizeof(struct bufq_priocscan),
 1015             M_DEVBUF, M_ZERO);
 1016 
 1017         q = bufq->bq_private;
 1018         for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
 1019                 struct cscan_queue *cq = &q->bq_queue[i].q_queue;
 1020 
 1021                 cscan_init(cq);
 1022         }
 1023 }
 1024 
 1025 
 1026 /*
 1027  * Create a device buffer queue.
 1028  */
 1029 void
 1030 bufq_alloc(struct bufq_state *bufq, int flags)
 1031 {
 1032         struct bufq_fcfs *fcfs;
 1033         struct bufq_disksort *disksort;
 1034         struct bufq_prio *prio;
 1035 
 1036         bufq->bq_flags = flags;
 1037 
 1038         switch (flags & BUFQ_SORT_MASK) {
 1039         case BUFQ_SORT_RAWBLOCK:
 1040         case BUFQ_SORT_CYLINDER:
 1041                 break;
 1042         case 0:
 1043                 if ((flags & BUFQ_METHOD_MASK) == BUFQ_FCFS)
 1044                         break;
 1045                 /* FALLTHROUGH */
 1046         default:
 1047                 panic("bufq_alloc: sort out of range");
 1048         }
 1049 
 1050         switch (flags & BUFQ_METHOD_MASK) {
 1051         case BUFQ_FCFS:
 1052                 bufq->bq_get = bufq_fcfs_get;
 1053                 bufq->bq_put = bufq_fcfs_put;
 1054                 MALLOC(bufq->bq_private, struct bufq_fcfs *,
 1055                     sizeof(struct bufq_fcfs), M_DEVBUF, M_ZERO);
 1056                 fcfs = (struct bufq_fcfs *)bufq->bq_private;
 1057                 TAILQ_INIT(&fcfs->bq_head);
 1058                 break;
 1059         case BUFQ_DISKSORT:
 1060                 bufq->bq_get = bufq_disksort_get;
 1061                 bufq->bq_put = bufq_disksort_put;
 1062                 MALLOC(bufq->bq_private, struct bufq_disksort *,
 1063                     sizeof(struct bufq_disksort), M_DEVBUF, M_ZERO);
 1064                 disksort = (struct bufq_disksort *)bufq->bq_private;
 1065                 TAILQ_INIT(&disksort->bq_head);
 1066                 break;
 1067         case BUFQ_READ_PRIO:
 1068                 bufq->bq_get = bufq_prio_get;
 1069                 bufq->bq_put = bufq_prio_put;
 1070                 MALLOC(bufq->bq_private, struct bufq_prio *,
 1071                     sizeof(struct bufq_prio), M_DEVBUF, M_ZERO);
 1072                 prio = (struct bufq_prio *)bufq->bq_private;
 1073                 TAILQ_INIT(&prio->bq_read);
 1074                 TAILQ_INIT(&prio->bq_write);
 1075                 break;
 1076         case BUFQ_PRIOCSCAN:
 1077                 bufq_priocscan_init(bufq);
 1078                 break;
 1079         default:
 1080                 panic("bufq_alloc: method out of range");
 1081         }
 1082 }
 1083 
 1084 /*
 1085  * Destroy a device buffer queue.
 1086  */
 1087 void
 1088 bufq_free(struct bufq_state *bufq)
 1089 {
 1090 
 1091         KASSERT(bufq->bq_private != NULL);
 1092         KASSERT(BUFQ_PEEK(bufq) == NULL);
 1093 
 1094         FREE(bufq->bq_private, M_DEVBUF);
 1095         bufq->bq_get = NULL;
 1096         bufq->bq_put = NULL;
 1097 }
 1098 
 1099 /*
 1100  * Bounds checking against the media size, used for the raw partition.
 1101  * The sector size passed in should currently always be DEV_BSIZE,
 1102  * and the media size the size of the device in DEV_BSIZE sectors.
 1103  */
 1104 int
 1105 bounds_check_with_mediasize(struct buf *bp, int secsize, u_int64_t mediasize)
 1106 {
 1107         int sz;
 1108 
 1109         sz = howmany(bp->b_bcount, secsize);
 1110 
 1111         if (bp->b_blkno + sz > mediasize) {
 1112                 sz = mediasize - bp->b_blkno;
 1113                 if (sz == 0) {
 1114                         /* If exactly at end of disk, return EOF. */
 1115                         bp->b_resid = bp->b_bcount;
 1116                         goto done;
 1117                 }
 1118                 if (sz < 0) {
 1119                         /* If past end of disk, return EINVAL. */
 1120                         bp->b_error = EINVAL;
 1121                         goto bad;
 1122                 }
 1123                 /* Otherwise, truncate request. */
 1124                 bp->b_bcount = sz << DEV_BSHIFT;
 1125         }
 1126 
 1127         return 1;
 1128 
 1129 bad:
 1130         bp->b_flags |= B_ERROR;
 1131 done:
 1132         return 0;
 1133 }
Cache object: 5a239ec5acb57a36f004c2642fa0bf66
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/kern/subr_disk.c

FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_disk.c