The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_disk.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
    3  *
    4  * This code is derived from software contributed to The DragonFly Project
    5  * by Matthew Dillon <dillon@backplane.com>
    6  * and Alex Hornung <ahornung@gmail.com>
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  *
   12  * 1. Redistributions of source code must retain the above copyright
   13  *    notice, this list of conditions and the following disclaimer.
   14  * 2. Redistributions in binary form must reproduce the above copyright
   15  *    notice, this list of conditions and the following disclaimer in
   16  *    the documentation and/or other materials provided with the
   17  *    distribution.
   18  * 3. Neither the name of The DragonFly Project nor the names of its
   19  *    contributors may be used to endorse or promote products derived
   20  *    from this software without specific, prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
   26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   33  * SUCH DAMAGE.
   34  *
   35  * ----------------------------------------------------------------------------
   36  * "THE BEER-WARE LICENSE" (Revision 42):
   37  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
   38  * can do whatever you want with this stuff. If we meet some day, and you think
   39  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
   40  * ----------------------------------------------------------------------------
   41  *
   42  * Copyright (c) 1982, 1986, 1988, 1993
   43  *      The Regents of the University of California.  All rights reserved.
   44  * (c) UNIX System Laboratories, Inc.
   45  * All or some portions of this file are derived from material licensed
   46  * to the University of California by American Telephone and Telegraph
   47  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   48  * the permission of UNIX System Laboratories, Inc.
   49  *
   50  * Redistribution and use in source and binary forms, with or without
   51  * modification, are permitted provided that the following conditions
   52  * are met:
   53  * 1. Redistributions of source code must retain the above copyright
   54  *    notice, this list of conditions and the following disclaimer.
   55  * 2. Redistributions in binary form must reproduce the above copyright
   56  *    notice, this list of conditions and the following disclaimer in the
   57  *    documentation and/or other materials provided with the distribution.
   58  * 3. Neither the name of the University nor the names of its contributors
   59  *    may be used to endorse or promote products derived from this software
   60  *    without specific prior written permission.
   61  *
   62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   72  * SUCH DAMAGE.
   73  *
   74  *      @(#)ufs_disksubr.c      8.5 (Berkeley) 1/21/94
   75  * $FreeBSD: src/sys/kern/subr_disk.c,v 1.20.2.6 2001/10/05 07:14:57 peter Exp $
   76  * $FreeBSD: src/sys/ufs/ufs/ufs_disksubr.c,v 1.44.2.3 2001/03/05 05:42:19 obrien Exp $
   77  */
   78 
   79 #include <sys/param.h>
   80 #include <sys/systm.h>
   81 #include <sys/kernel.h>
   82 #include <sys/proc.h>
   83 #include <sys/sysctl.h>
   84 #include <sys/buf.h>
   85 #include <sys/conf.h>
   86 #include <sys/disklabel.h>
   87 #include <sys/disklabel32.h>
   88 #include <sys/disklabel64.h>
   89 #include <sys/diskslice.h>
   90 #include <sys/diskmbr.h>
   91 #include <sys/disk.h>
   92 #include <sys/kerneldump.h>
   93 #include <sys/malloc.h>
   94 #include <machine/md_var.h>
   95 #include <sys/ctype.h>
   96 #include <sys/syslog.h>
   97 #include <sys/device.h>
   98 #include <sys/msgport.h>
   99 #include <sys/devfs.h>
  100 #include <sys/thread.h>
  101 #include <sys/dsched.h>
  102 #include <sys/queue.h>
  103 #include <sys/lock.h>
  104 #include <sys/udev.h>
  105 #include <sys/uuid.h>
  106 
  107 #include <sys/buf2.h>
  108 #include <sys/mplock2.h>
  109 #include <sys/msgport2.h>
  110 #include <sys/thread2.h>
  111 
  112 static MALLOC_DEFINE(M_DISK, "disk", "disk data");
  113 static int disk_debug_enable = 0;
  114 
  115 static void disk_msg_autofree_reply(lwkt_port_t, lwkt_msg_t);
  116 static void disk_msg_core(void *);
  117 static int disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe);
  118 static void disk_probe(struct disk *dp, int reprobe);
  119 static void _setdiskinfo(struct disk *disk, struct disk_info *info);
  120 static void bioqwritereorder(struct bio_queue_head *bioq);
  121 static void disk_cleanserial(char *serno);
  122 static int disk_debug(int, char *, ...) __printflike(2, 3);
  123 static cdev_t _disk_create_named(const char *name, int unit, struct disk *dp,
  124     struct dev_ops *raw_ops, int clone);
  125 
  126 static d_open_t diskopen;
  127 static d_close_t diskclose;
  128 static d_ioctl_t diskioctl;
  129 static d_strategy_t diskstrategy;
  130 static d_psize_t diskpsize;
  131 static d_dump_t diskdump;
  132 
  133 static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);
  134 static struct lwkt_token disklist_token;
  135 
  136 static struct dev_ops disk_ops = {
  137         { "disk", 0, D_DISK | D_MPSAFE | D_TRACKCLOSE },
  138         .d_open = diskopen,
  139         .d_close = diskclose,
  140         .d_read = physread,
  141         .d_write = physwrite,
  142         .d_ioctl = diskioctl,
  143         .d_strategy = diskstrategy,
  144         .d_dump = diskdump,
  145         .d_psize = diskpsize,
  146 };
  147 
  148 static struct objcache  *disk_msg_cache;
  149 
  150 struct objcache_malloc_args disk_msg_malloc_args = {
  151         sizeof(struct disk_msg), M_DISK };
  152 
  153 static struct lwkt_port disk_dispose_port;
  154 static struct lwkt_port disk_msg_port;
  155 
  156 static int
  157 disk_debug(int level, char *fmt, ...)
  158 {
  159         __va_list ap;
  160 
  161         __va_start(ap, fmt);
  162         if (level <= disk_debug_enable)
  163                 kvprintf(fmt, ap);
  164         __va_end(ap);
  165 
  166         return 0;
  167 }
  168 
  169 static int
  170 disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe)
  171 {
  172         struct disk_info *info = &dp->d_info;
  173         struct diskslice *sp = &dp->d_slice->dss_slices[slice];
  174         disklabel_ops_t ops;
  175         struct partinfo part;
  176         const char *msg;
  177         char uuid_buf[128];
  178         cdev_t ndev;
  179         int sno;
  180         u_int i;
  181 
  182         disk_debug(2, "disk_probe_slice (begin): %s (%s)\n",
  183                    dev->si_name, dp->d_cdev->si_name);
  184 
  185         sno = slice ? slice - 1 : 0;
  186 
  187         ops = &disklabel32_ops;
  188         msg = ops->op_readdisklabel(dev, sp, &sp->ds_label, info);
  189         if (msg && !strcmp(msg, "no disk label")) {
  190                 ops = &disklabel64_ops;
  191                 msg = ops->op_readdisklabel(dev, sp, &sp->ds_label, info);
  192         }
  193 
  194         if (msg == NULL) {
  195                 if (slice != WHOLE_DISK_SLICE)
  196                         ops->op_adjust_label_reserved(dp->d_slice, slice, sp);
  197                 else
  198                         sp->ds_reserved = 0;
  199 
  200                 sp->ds_ops = ops;
  201                 for (i = 0; i < ops->op_getnumparts(sp->ds_label); i++) {
  202                         ops->op_loadpartinfo(sp->ds_label, i, &part);
  203                         if (part.fstype) {
  204                                 if (reprobe &&
  205                                     (ndev = devfs_find_device_by_name("%s%c",
  206                                                 dev->si_name, 'a' + i))
  207                                 ) {
  208                                         /*
  209                                          * Device already exists and
  210                                          * is still valid.
  211                                          */
  212                                         ndev->si_flags |= SI_REPROBE_TEST;
  213 
  214                                         /*
  215                                          * Destroy old UUID alias
  216                                          */
  217                                         destroy_dev_alias(ndev, "part-by-uuid/*");
  218 
  219                                         /* Create UUID alias */
  220                                         if (!kuuid_is_nil(&part.storage_uuid)) {
  221                                                 snprintf_uuid(uuid_buf,
  222                                                     sizeof(uuid_buf),
  223                                                     &part.storage_uuid);
  224                                                 make_dev_alias(ndev,
  225                                                     "part-by-uuid/%s",
  226                                                     uuid_buf);
  227                                                 udev_dict_set_cstr(ndev, "uuid", uuid_buf);
  228                                         }
  229                                 } else {
  230                                         ndev = make_dev_covering(&disk_ops, dp->d_rawdev->si_ops,
  231                                                 dkmakeminor(dkunit(dp->d_cdev),
  232                                                             slice, i),
  233                                                 UID_ROOT, GID_OPERATOR, 0640,
  234                                                 "%s%c", dev->si_name, 'a'+ i);
  235                                         ndev->si_parent = dev;
  236                                         ndev->si_iosize_max = dev->si_iosize_max;
  237                                         ndev->si_disk = dp;
  238                                         udev_dict_set_cstr(ndev, "subsystem", "disk");
  239                                         /* Inherit parent's disk type */
  240                                         if (dp->d_disktype) {
  241                                                 udev_dict_set_cstr(ndev, "disk-type",
  242                                                     __DECONST(char *, dp->d_disktype));
  243                                         }
  244 
  245                                         /* Create serno alias */
  246                                         if (dp->d_info.d_serialno) {
  247                                                 make_dev_alias(ndev,
  248                                                     "serno/%s.s%d%c",
  249                                                     dp->d_info.d_serialno,
  250                                                     sno, 'a' + i);
  251                                         }
  252 
  253                                         /* Create UUID alias */
  254                                         if (!kuuid_is_nil(&part.storage_uuid)) {
  255                                                 snprintf_uuid(uuid_buf,
  256                                                     sizeof(uuid_buf),
  257                                                     &part.storage_uuid);
  258                                                 make_dev_alias(ndev,
  259                                                     "part-by-uuid/%s",
  260                                                     uuid_buf);
  261                                                 udev_dict_set_cstr(ndev, "uuid", uuid_buf);
  262                                         }
  263                                         ndev->si_flags |= SI_REPROBE_TEST;
  264                                 }
  265                         }
  266                 }
  267         } else if (info->d_dsflags & DSO_COMPATLABEL) {
  268                 msg = NULL;
  269                 if (sp->ds_size >= 0x100000000ULL)
  270                         ops = &disklabel64_ops;
  271                 else
  272                         ops = &disklabel32_ops;
  273                 sp->ds_label = ops->op_clone_label(info, sp);
  274         } else {
  275                 if (sp->ds_type == DOSPTYP_386BSD || /* XXX */
  276                     sp->ds_type == DOSPTYP_NETBSD ||
  277                     sp->ds_type == DOSPTYP_OPENBSD) {
  278                         log(LOG_WARNING, "%s: cannot find label (%s)\n",
  279                             dev->si_name, msg);
  280                 }
  281 
  282                 if (sp->ds_label.opaque != NULL && sp->ds_ops != NULL) {
  283                         /* Clear out old label - it's not around anymore */
  284                         disk_debug(2,
  285                             "disk_probe_slice: clear out old diskabel on %s\n",
  286                             dev->si_name);
  287 
  288                         sp->ds_ops->op_freedisklabel(&sp->ds_label);
  289                         sp->ds_ops = NULL;
  290                 }
  291         }
  292 
  293         if (msg == NULL) {
  294                 sp->ds_wlabel = FALSE;
  295         }
  296 
  297         return (msg ? EINVAL : 0);
  298 }
  299 
  300 /*
  301  * This routine is only called for newly minted drives or to reprobe
  302  * a drive with no open slices.  disk_probe_slice() is called directly
  303  * when reprobing partition changes within slices.
  304  */
  305 static void
  306 disk_probe(struct disk *dp, int reprobe)
  307 {
  308         struct disk_info *info = &dp->d_info;
  309         cdev_t dev = dp->d_cdev;
  310         cdev_t ndev;
  311         int error, i, sno;
  312         struct diskslices *osp;
  313         struct diskslice *sp;
  314         char uuid_buf[128];
  315 
  316         KKASSERT (info->d_media_blksize != 0);
  317 
  318         osp = dp->d_slice;
  319         dp->d_slice = dsmakeslicestruct(BASE_SLICE, info);
  320         disk_debug(1, "disk_probe (begin): %s\n", dp->d_cdev->si_name);
  321 
  322         error = mbrinit(dev, info, &(dp->d_slice));
  323         if (error) {
  324                 dsgone(&osp);
  325                 return;
  326         }
  327 
  328         for (i = 0; i < dp->d_slice->dss_nslices; i++) {
  329                 /*
  330                  * Ignore the whole-disk slice, it has already been created.
  331                  */
  332                 if (i == WHOLE_DISK_SLICE)
  333                         continue;
  334 
  335 #if 1
  336                 /*
  337                  * Ignore the compatibility slice s0 if it's a device mapper
  338                  * volume.
  339                  */
  340                 if ((i == COMPATIBILITY_SLICE) &&
  341                     (info->d_dsflags & DSO_DEVICEMAPPER))
  342                         continue;
  343 #endif
  344 
  345                 sp = &dp->d_slice->dss_slices[i];
  346 
  347                 /*
  348                  * Handle s0.  s0 is a compatibility slice if there are no
  349                  * other slices and it has not otherwise been set up, else
  350                  * we ignore it.
  351                  */
  352                 if (i == COMPATIBILITY_SLICE) {
  353                         sno = 0;
  354                         if (sp->ds_type == 0 &&
  355                             dp->d_slice->dss_nslices == BASE_SLICE) {
  356                                 sp->ds_size = info->d_media_blocks;
  357                                 sp->ds_reserved = 0;
  358                         }
  359                 } else {
  360                         sno = i - 1;
  361                         sp->ds_reserved = 0;
  362                 }
  363 
  364                 /*
  365                  * Ignore 0-length slices
  366                  */
  367                 if (sp->ds_size == 0)
  368                         continue;
  369 
  370                 if (reprobe &&
  371                     (ndev = devfs_find_device_by_name("%ss%d",
  372                                                       dev->si_name, sno))) {
  373                         /*
  374                          * Device already exists and is still valid
  375                          */
  376                         ndev->si_flags |= SI_REPROBE_TEST;
  377 
  378                         /*
  379                          * Destroy old UUID alias
  380                          */
  381                         destroy_dev_alias(ndev, "slice-by-uuid/*");
  382 
  383                         /* Create UUID alias */
  384                         if (!kuuid_is_nil(&sp->ds_stor_uuid)) {
  385                                 snprintf_uuid(uuid_buf, sizeof(uuid_buf),
  386                                     &sp->ds_stor_uuid);
  387                                 make_dev_alias(ndev, "slice-by-uuid/%s",
  388                                     uuid_buf);
  389                         }
  390                 } else {
  391                         /*
  392                          * Else create new device
  393                          */
  394                         ndev = make_dev_covering(&disk_ops, dp->d_rawdev->si_ops,
  395                                         dkmakewholeslice(dkunit(dev), i),
  396                                         UID_ROOT, GID_OPERATOR, 0640,
  397                                         (info->d_dsflags & DSO_DEVICEMAPPER)?
  398                                         "%s.s%d" : "%ss%d", dev->si_name, sno);
  399                         ndev->si_parent = dev;
  400                         ndev->si_iosize_max = dev->si_iosize_max;
  401                         udev_dict_set_cstr(ndev, "subsystem", "disk");
  402                         /* Inherit parent's disk type */
  403                         if (dp->d_disktype) {
  404                                 udev_dict_set_cstr(ndev, "disk-type",
  405                                     __DECONST(char *, dp->d_disktype));
  406                         }
  407 
  408                         /* Create serno alias */
  409                         if (dp->d_info.d_serialno) {
  410                                 make_dev_alias(ndev, "serno/%s.s%d",
  411                                                dp->d_info.d_serialno, sno);
  412                         }
  413 
  414                         /* Create UUID alias */
  415                         if (!kuuid_is_nil(&sp->ds_stor_uuid)) {
  416                                 snprintf_uuid(uuid_buf, sizeof(uuid_buf),
  417                                     &sp->ds_stor_uuid);
  418                                 make_dev_alias(ndev, "slice-by-uuid/%s",
  419                                     uuid_buf);
  420                         }
  421 
  422                         ndev->si_disk = dp;
  423                         ndev->si_flags |= SI_REPROBE_TEST;
  424                 }
  425                 sp->ds_dev = ndev;
  426 
  427                 /*
  428                  * Probe appropriate slices for a disklabel
  429                  *
  430                  * XXX slice type 1 used by our gpt probe code.
  431                  * XXX slice type 0 used by mbr compat slice.
  432                  */
  433                 if (sp->ds_type == DOSPTYP_386BSD ||
  434                     sp->ds_type == DOSPTYP_NETBSD ||
  435                     sp->ds_type == DOSPTYP_OPENBSD ||
  436                     sp->ds_type == 0 ||
  437                     sp->ds_type == 1) {
  438                         if (dp->d_slice->dss_first_bsd_slice == 0)
  439                                 dp->d_slice->dss_first_bsd_slice = i;
  440                         disk_probe_slice(dp, ndev, i, reprobe);
  441                 }
  442         }
  443         dsgone(&osp);
  444         disk_debug(1, "disk_probe (end): %s\n", dp->d_cdev->si_name);
  445 }
  446 
  447 
  448 static void
  449 disk_msg_core(void *arg)
  450 {
  451         struct disk     *dp;
  452         struct diskslice *sp;
  453         disk_msg_t msg;
  454         int run;
  455 
  456         lwkt_gettoken(&disklist_token);
  457         lwkt_initport_thread(&disk_msg_port, curthread);
  458         wakeup(curthread);      /* synchronous startup */
  459         lwkt_reltoken(&disklist_token);
  460 
  461         get_mplock();   /* not mpsafe yet? */
  462         run = 1;
  463 
  464         while (run) {
  465                 msg = (disk_msg_t)lwkt_waitport(&disk_msg_port, 0);
  466 
  467                 switch (msg->hdr.u.ms_result) {
  468                 case DISK_DISK_PROBE:
  469                         dp = (struct disk *)msg->load;
  470                         disk_debug(1,
  471                                     "DISK_DISK_PROBE: %s\n",
  472                                         dp->d_cdev->si_name);
  473                         disk_iocom_update(dp);
  474                         disk_probe(dp, 0);
  475                         break;
  476                 case DISK_DISK_DESTROY:
  477                         dp = (struct disk *)msg->load;
  478                         disk_debug(1,
  479                                     "DISK_DISK_DESTROY: %s\n",
  480                                         dp->d_cdev->si_name);
  481                         disk_iocom_uninit(dp);
  482 
  483                         /*
  484                          * Interlock against struct disk enumerations.
  485                          * Wait for enumerations to complete then remove
  486                          * the dp from the list before tearing it down.
  487                          *
  488                          * This avoids races against e.g.
  489                          * dsched_thread_io_alloc().
  490                          */
  491                         lwkt_gettoken(&disklist_token);
  492                         while (dp->d_refs)
  493                                 tsleep(&dp->d_refs, 0, "diskdel", hz / 10);
  494                         LIST_REMOVE(dp, d_list);
  495 
  496                         dsched_disk_destroy_callback(dp);
  497                         devfs_destroy_related(dp->d_cdev);
  498                         destroy_dev(dp->d_cdev);
  499                         destroy_only_dev(dp->d_rawdev);
  500 
  501                         lwkt_reltoken(&disklist_token);
  502 
  503                         if (dp->d_info.d_serialno) {
  504                                 kfree(dp->d_info.d_serialno, M_TEMP);
  505                                 dp->d_info.d_serialno = NULL;
  506                         }
  507                         break;
  508                 case DISK_UNPROBE:
  509                         dp = (struct disk *)msg->load;
  510                         disk_debug(1,
  511                                     "DISK_DISK_UNPROBE: %s\n",
  512                                         dp->d_cdev->si_name);
  513                         devfs_destroy_related(dp->d_cdev);
  514                         break;
  515                 case DISK_SLICE_REPROBE:
  516                         dp = (struct disk *)msg->load;
  517                         sp = (struct diskslice *)msg->load2;
  518                         devfs_clr_related_flag(sp->ds_dev,
  519                                                 SI_REPROBE_TEST);
  520                         disk_debug(1,
  521                                     "DISK_SLICE_REPROBE: %s\n",
  522                                     sp->ds_dev->si_name);
  523                         disk_probe_slice(dp, sp->ds_dev,
  524                                          dkslice(sp->ds_dev), 1);
  525                         devfs_destroy_related_without_flag(
  526                                         sp->ds_dev, SI_REPROBE_TEST);
  527                         break;
  528                 case DISK_DISK_REPROBE:
  529                         dp = (struct disk *)msg->load;
  530                         devfs_clr_related_flag(dp->d_cdev, SI_REPROBE_TEST);
  531                         disk_debug(1,
  532                                     "DISK_DISK_REPROBE: %s\n",
  533                                     dp->d_cdev->si_name);
  534                         disk_probe(dp, 1);
  535                         devfs_destroy_related_without_flag(
  536                                         dp->d_cdev, SI_REPROBE_TEST);
  537                         break;
  538                 case DISK_SYNC:
  539                         disk_debug(1, "DISK_SYNC\n");
  540                         break;
  541                 default:
  542                         devfs_debug(DEVFS_DEBUG_WARNING,
  543                                     "disk_msg_core: unknown message "
  544                                     "received at core\n");
  545                         break;
  546                 }
  547                 lwkt_replymsg(&msg->hdr, 0);
  548         }
  549         lwkt_exit();
  550 }
  551 
  552 
  553 /*
  554  * Acts as a message drain. Any message that is replied to here gets
  555  * destroyed and the memory freed.
  556  */
  557 static void
  558 disk_msg_autofree_reply(lwkt_port_t port, lwkt_msg_t msg)
  559 {
  560         objcache_put(disk_msg_cache, msg);
  561 }
  562 
  563 
  564 void
  565 disk_msg_send(uint32_t cmd, void *load, void *load2)
  566 {
  567         disk_msg_t disk_msg;
  568         lwkt_port_t port = &disk_msg_port;
  569 
  570         disk_msg = objcache_get(disk_msg_cache, M_WAITOK);
  571 
  572         lwkt_initmsg(&disk_msg->hdr, &disk_dispose_port, 0);
  573 
  574         disk_msg->hdr.u.ms_result = cmd;
  575         disk_msg->load = load;
  576         disk_msg->load2 = load2;
  577         KKASSERT(port);
  578         lwkt_sendmsg(port, &disk_msg->hdr);
  579 }
  580 
  581 void
  582 disk_msg_send_sync(uint32_t cmd, void *load, void *load2)
  583 {
  584         struct lwkt_port rep_port;
  585         disk_msg_t disk_msg;
  586         lwkt_port_t port;
  587 
  588         disk_msg = objcache_get(disk_msg_cache, M_WAITOK);
  589         port = &disk_msg_port;
  590 
  591         /* XXX could probably use curthread's built-in msgport */
  592         lwkt_initport_thread(&rep_port, curthread);
  593         lwkt_initmsg(&disk_msg->hdr, &rep_port, 0);
  594 
  595         disk_msg->hdr.u.ms_result = cmd;
  596         disk_msg->load = load;
  597         disk_msg->load2 = load2;
  598 
  599         lwkt_sendmsg(port, &disk_msg->hdr);
  600         lwkt_waitmsg(&disk_msg->hdr, 0);
  601         objcache_put(disk_msg_cache, disk_msg);
  602 }
  603 
  604 /*
  605  * Create a raw device for the dev_ops template (which is returned).  Also
  606  * create a slice and unit managed disk and overload the user visible
  607  * device space with it.
  608  *
  609  * NOTE: The returned raw device is NOT a slice and unit managed device.
  610  * It is an actual raw device representing the raw disk as specified by
  611  * the passed dev_ops.  The disk layer not only returns such a raw device,
  612  * it also uses it internally when passing (modified) commands through.
  613  */
  614 cdev_t
  615 disk_create(int unit, struct disk *dp, struct dev_ops *raw_ops)
  616 {
  617         return _disk_create_named(NULL, unit, dp, raw_ops, 0);
  618 }
  619 
  620 cdev_t
  621 disk_create_clone(int unit, struct disk *dp,
  622                   struct dev_ops *raw_ops)
  623 {
  624         return _disk_create_named(NULL, unit, dp, raw_ops, 1);
  625 }
  626 
  627 cdev_t
  628 disk_create_named(const char *name, int unit, struct disk *dp,
  629                   struct dev_ops *raw_ops)
  630 {
  631         return _disk_create_named(name, unit, dp, raw_ops, 0);
  632 }
  633 
  634 cdev_t
  635 disk_create_named_clone(const char *name, int unit, struct disk *dp,
  636                         struct dev_ops *raw_ops)
  637 {
  638         return _disk_create_named(name, unit, dp, raw_ops, 1);
  639 }
  640 
  641 static cdev_t
  642 _disk_create_named(const char *name, int unit, struct disk *dp,
  643                    struct dev_ops *raw_ops, int clone)
  644 {
  645         cdev_t rawdev;
  646 
  647         disk_debug(1, "disk_create (begin): %s%d\n", name, unit);
  648 
  649         if (name) {
  650                 rawdev = make_only_dev(raw_ops, dkmakewholedisk(unit),
  651                     UID_ROOT, GID_OPERATOR, 0640, "%s", name);
  652         } else {
  653                 rawdev = make_only_dev(raw_ops, dkmakewholedisk(unit),
  654                     UID_ROOT, GID_OPERATOR, 0640,
  655                     "%s%d", raw_ops->head.name, unit);
  656         }
  657 
  658         bzero(dp, sizeof(*dp));
  659 
  660         dp->d_rawdev = rawdev;
  661         dp->d_raw_ops = raw_ops;
  662         dp->d_dev_ops = &disk_ops;
  663 
  664         if (name) {
  665                 if (clone) {
  666                         dp->d_cdev = make_only_dev_covering(
  667                                         &disk_ops, dp->d_rawdev->si_ops,
  668                                         dkmakewholedisk(unit),
  669                                         UID_ROOT, GID_OPERATOR, 0640,
  670                                         "%s", name);
  671                 } else {
  672                         dp->d_cdev = make_dev_covering(
  673                                         &disk_ops, dp->d_rawdev->si_ops,
  674                                         dkmakewholedisk(unit),
  675                                         UID_ROOT, GID_OPERATOR, 0640,
  676                                         "%s", name);
  677                 }
  678         } else {
  679                 if (clone) {
  680                         dp->d_cdev = make_only_dev_covering(
  681                                         &disk_ops, dp->d_rawdev->si_ops,
  682                                         dkmakewholedisk(unit),
  683                                         UID_ROOT, GID_OPERATOR, 0640,
  684                                         "%s%d", raw_ops->head.name, unit);
  685                 } else {
  686                         dp->d_cdev = make_dev_covering(
  687                                         &disk_ops, dp->d_rawdev->si_ops,
  688                                         dkmakewholedisk(unit),
  689                                         UID_ROOT, GID_OPERATOR, 0640,
  690                                         "%s%d", raw_ops->head.name, unit);
  691                 }
  692         }
  693 
  694         udev_dict_set_cstr(dp->d_cdev, "subsystem", "disk");
  695         dp->d_cdev->si_disk = dp;
  696 
  697         if (name)
  698                 dsched_disk_create_callback(dp, name, unit);
  699         else
  700                 dsched_disk_create_callback(dp, raw_ops->head.name, unit);
  701 
  702         lwkt_gettoken(&disklist_token);
  703         LIST_INSERT_HEAD(&disklist, dp, d_list);
  704         lwkt_reltoken(&disklist_token);
  705 
  706         disk_iocom_init(dp);
  707 
  708         disk_debug(1, "disk_create (end): %s%d\n",
  709                    (name != NULL)?(name):(raw_ops->head.name), unit);
  710 
  711         return (dp->d_rawdev);
  712 }
  713 
  714 int
  715 disk_setdisktype(struct disk *disk, const char *type)
  716 {
  717         int error;
  718 
  719         KKASSERT(disk != NULL);
  720 
  721         disk->d_disktype = type;
  722         error = udev_dict_set_cstr(disk->d_cdev, "disk-type",
  723                                    __DECONST(char *, type));
  724         return error;
  725 }
  726 
  727 int
  728 disk_getopencount(struct disk *disk)
  729 {
  730         return disk->d_opencount;
  731 }
  732 
  733 static void
  734 _setdiskinfo(struct disk *disk, struct disk_info *info)
  735 {
  736         char *oldserialno;
  737 
  738         oldserialno = disk->d_info.d_serialno;
  739         bcopy(info, &disk->d_info, sizeof(disk->d_info));
  740         info = &disk->d_info;
  741 
  742         disk_debug(1, "_setdiskinfo: %s\n", disk->d_cdev->si_name);
  743 
  744         /*
  745          * The serial number is duplicated so the caller can throw
  746          * their copy away.
  747          */
  748         if (info->d_serialno && info->d_serialno[0] &&
  749             (info->d_serialno[0] != ' ' || strlen(info->d_serialno) > 1)) {
  750                 info->d_serialno = kstrdup(info->d_serialno, M_TEMP);
  751                 disk_cleanserial(info->d_serialno);
  752                 if (disk->d_cdev) {
  753                         make_dev_alias(disk->d_cdev, "serno/%s",
  754                                        info->d_serialno);
  755                 }
  756         } else {
  757                 info->d_serialno = NULL;
  758         }
  759         if (oldserialno)
  760                 kfree(oldserialno, M_TEMP);
  761 
  762         dsched_disk_update_callback(disk, info);
  763 
  764         /*
  765          * The caller may set d_media_size or d_media_blocks and we
  766          * calculate the other.
  767          */
  768         KKASSERT(info->d_media_size == 0 || info->d_media_blocks == 0);
  769         if (info->d_media_size == 0 && info->d_media_blocks) {
  770                 info->d_media_size = (u_int64_t)info->d_media_blocks *
  771                                      info->d_media_blksize;
  772         } else if (info->d_media_size && info->d_media_blocks == 0 &&
  773                    info->d_media_blksize) {
  774                 info->d_media_blocks = info->d_media_size /
  775                                        info->d_media_blksize;
  776         }
  777 
  778         /*
  779          * The si_* fields for rawdev are not set until after the
  780          * disk_create() call, so someone using the cooked version
  781          * of the raw device (i.e. da0s0) will not get the right
  782          * si_iosize_max unless we fix it up here.
  783          */
  784         if (disk->d_cdev && disk->d_rawdev &&
  785             disk->d_cdev->si_iosize_max == 0) {
  786                 disk->d_cdev->si_iosize_max = disk->d_rawdev->si_iosize_max;
  787                 disk->d_cdev->si_bsize_phys = disk->d_rawdev->si_bsize_phys;
  788                 disk->d_cdev->si_bsize_best = disk->d_rawdev->si_bsize_best;
  789         }
  790 
  791         /* Add the serial number to the udev_dictionary */
  792         if (info->d_serialno)
  793                 udev_dict_set_cstr(disk->d_cdev, "serno", info->d_serialno);
  794 }
  795 
  796 /*
  797  * Disk drivers must call this routine when media parameters are available
  798  * or have changed.
  799  */
  800 void
  801 disk_setdiskinfo(struct disk *disk, struct disk_info *info)
  802 {
  803         _setdiskinfo(disk, info);
  804         disk_msg_send(DISK_DISK_PROBE, disk, NULL);
  805         disk_debug(1, "disk_setdiskinfo: sent probe for %s\n",
  806                    disk->d_cdev->si_name);
  807 }
  808 
  809 void
  810 disk_setdiskinfo_sync(struct disk *disk, struct disk_info *info)
  811 {
  812         _setdiskinfo(disk, info);
  813         disk_msg_send_sync(DISK_DISK_PROBE, disk, NULL);
  814         disk_debug(1, "disk_setdiskinfo_sync: sent probe for %s\n",
  815                    disk->d_cdev->si_name);
  816 }
  817 
  818 /*
  819  * This routine is called when an adapter detaches.  The higher level
  820  * managed disk device is destroyed while the lower level raw device is
  821  * released.
  822  */
  823 void
  824 disk_destroy(struct disk *disk)
  825 {
  826         disk_msg_send_sync(DISK_DISK_DESTROY, disk, NULL);
  827         return;
  828 }
  829 
  830 int
  831 disk_dumpcheck(cdev_t dev, u_int64_t *size,
  832                u_int64_t *blkno, u_int32_t *secsize)
  833 {
  834         struct partinfo pinfo;
  835         int error;
  836 
  837         bzero(&pinfo, sizeof(pinfo));
  838         error = dev_dioctl(dev, DIOCGPART, (void *)&pinfo, 0,
  839                            proc0.p_ucred, NULL);
  840         if (error)
  841                 return (error);
  842 
  843         if (pinfo.media_blksize == 0)
  844                 return (ENXIO);
  845 
  846         if (blkno) /* XXX: make sure this reserved stuff is right */
  847                 *blkno = pinfo.reserved_blocks +
  848                         pinfo.media_offset / pinfo.media_blksize;
  849         if (secsize)
  850                 *secsize = pinfo.media_blksize;
  851         if (size)
  852                 *size = (pinfo.media_blocks - pinfo.reserved_blocks);
  853 
  854         return (0);
  855 }
  856 
  857 int
  858 disk_dumpconf(cdev_t dev, u_int onoff)
  859 {
  860         struct dumperinfo di;
  861         u_int64_t       size, blkno;
  862         u_int32_t       secsize;
  863         int error;
  864 
  865         if (!onoff)
  866                 return set_dumper(NULL);
  867 
  868         error = disk_dumpcheck(dev, &size, &blkno, &secsize);
  869 
  870         if (error)
  871                 return ENXIO;
  872 
  873         bzero(&di, sizeof(struct dumperinfo));
  874         di.dumper = diskdump;
  875         di.priv = dev;
  876         di.blocksize = secsize;
  877         di.maxiosize = dev->si_iosize_max;
  878         di.mediaoffset = blkno * DEV_BSIZE;
  879         di.mediasize = size * DEV_BSIZE;
  880 
  881         return set_dumper(&di);
  882 }
  883 
  884 void
  885 disk_unprobe(struct disk *disk)
  886 {
  887         if (disk == NULL)
  888                 return;
  889 
  890         disk_msg_send_sync(DISK_UNPROBE, disk, NULL);
  891 }
  892 
  893 void
  894 disk_invalidate (struct disk *disk)
  895 {
  896         dsgone(&disk->d_slice);
  897 }
  898 
  899 /*
  900  * Enumerate disks, pass a marker and an initial NULL dp to initialize,
  901  * then loop with the previously returned dp.
  902  *
  903  * The returned dp will be referenced, preventing its destruction.  When
  904  * you pass the returned dp back into the loop the ref is dropped.
  905  *
  906  * WARNING: If terminating your loop early you must call
  907  *          disk_enumerate_stop().
  908  */
  909 struct disk *
  910 disk_enumerate(struct disk *marker, struct disk *dp)
  911 {
  912         lwkt_gettoken(&disklist_token);
  913         if (dp) {
  914                 --dp->d_refs;
  915                 dp = LIST_NEXT(marker, d_list);
  916                 LIST_REMOVE(marker, d_list);
  917         } else {
  918                 bzero(marker, sizeof(*marker));
  919                 marker->d_flags = DISKFLAG_MARKER;
  920                 dp = LIST_FIRST(&disklist);
  921         }
  922         while (dp) {
  923                 if ((dp->d_flags & DISKFLAG_MARKER) == 0)
  924                         break;
  925                 dp = LIST_NEXT(dp, d_list);
  926         }
  927         if (dp) {
  928                 ++dp->d_refs;
  929                 LIST_INSERT_AFTER(dp, marker, d_list);
  930         }
  931         lwkt_reltoken(&disklist_token);
  932         return (dp);
  933 }
  934 
  935 /*
  936  * Terminate an enumeration early.  Do not call this function if the
  937  * enumeration ended normally.  dp can be NULL, indicating that you
  938  * wish to retain the ref count on dp.
  939  *
  940  * This function removes the marker.
  941  */
  942 void
  943 disk_enumerate_stop(struct disk *marker, struct disk *dp)
  944 {
  945         lwkt_gettoken(&disklist_token);
  946         LIST_REMOVE(marker, d_list);
  947         if (dp)
  948                 --dp->d_refs;
  949         lwkt_reltoken(&disklist_token);
  950 }
  951 
  952 static
  953 int
  954 sysctl_disks(SYSCTL_HANDLER_ARGS)
  955 {
  956         struct disk marker;
  957         struct disk *dp;
  958         int error, first;
  959 
  960         first = 1;
  961         error = 0;
  962         dp = NULL;
  963 
  964         while ((dp = disk_enumerate(&marker, dp))) {
  965                 if (!first) {
  966                         error = SYSCTL_OUT(req, " ", 1);
  967                         if (error) {
  968                                 disk_enumerate_stop(&marker, dp);
  969                                 break;
  970                         }
  971                 } else {
  972                         first = 0;
  973                 }
  974                 error = SYSCTL_OUT(req, dp->d_rawdev->si_name,
  975                                    strlen(dp->d_rawdev->si_name));
  976                 if (error) {
  977                         disk_enumerate_stop(&marker, dp);
  978                         break;
  979                 }
  980         }
  981         if (error == 0)
  982                 error = SYSCTL_OUT(req, "", 1);
  983         return error;
  984 }
  985 
  986 SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
  987     sysctl_disks, "A", "names of available disks");
  988 
  989 /*
  990  * Open a disk device or partition.
  991  */
  992 static
  993 int
  994 diskopen(struct dev_open_args *ap)
  995 {
  996         cdev_t dev = ap->a_head.a_dev;
  997         struct disk *dp;
  998         int error;
  999 
 1000         /*
 1001          * dp can't be NULL here XXX.
 1002          *
 1003          * d_slice will be NULL if setdiskinfo() has not been called yet.
 1004          * setdiskinfo() is typically called whether the disk is present
 1005          * or not (e.g. CD), but the base disk device is created first
 1006          * and there may be a race.
 1007          */
 1008         dp = dev->si_disk;
 1009         if (dp == NULL || dp->d_slice == NULL)
 1010                 return (ENXIO);
 1011         error = 0;
 1012 
 1013         /*
 1014          * Deal with open races
 1015          */
 1016         get_mplock();
 1017         while (dp->d_flags & DISKFLAG_LOCK) {
 1018                 dp->d_flags |= DISKFLAG_WANTED;
 1019                 error = tsleep(dp, PCATCH, "diskopen", hz);
 1020                 if (error) {
 1021                         rel_mplock();
 1022                         return (error);
 1023                 }
 1024         }
 1025         dp->d_flags |= DISKFLAG_LOCK;
 1026 
 1027         /*
 1028          * Open the underlying raw device.
 1029          */
 1030         if (!dsisopen(dp->d_slice)) {
 1031 #if 0
 1032                 if (!pdev->si_iosize_max)
 1033                         pdev->si_iosize_max = dev->si_iosize_max;
 1034 #endif
 1035                 error = dev_dopen(dp->d_rawdev, ap->a_oflags,
 1036                                   ap->a_devtype, ap->a_cred);
 1037         }
 1038 
 1039         if (error)
 1040                 goto out;
 1041         error = dsopen(dev, ap->a_devtype, dp->d_info.d_dsflags,
 1042                        &dp->d_slice, &dp->d_info);
 1043         if (!dsisopen(dp->d_slice)) {
 1044                 dev_dclose(dp->d_rawdev, ap->a_oflags, ap->a_devtype);
 1045         }
 1046 out:
 1047         dp->d_flags &= ~DISKFLAG_LOCK;
 1048         if (dp->d_flags & DISKFLAG_WANTED) {
 1049                 dp->d_flags &= ~DISKFLAG_WANTED;
 1050                 wakeup(dp);
 1051         }
 1052         rel_mplock();
 1053 
 1054         KKASSERT(dp->d_opencount >= 0);
 1055         /* If the open was successful, bump open count */
 1056         if (error == 0)
 1057                 atomic_add_int(&dp->d_opencount, 1);
 1058 
 1059         return(error);
 1060 }
 1061 
 1062 /*
 1063  * Close a disk device or partition
 1064  */
 1065 static
 1066 int
 1067 diskclose(struct dev_close_args *ap)
 1068 {
 1069         cdev_t dev = ap->a_head.a_dev;
 1070         struct disk *dp;
 1071         int error;
 1072         int lcount;
 1073 
 1074         error = 0;
 1075         dp = dev->si_disk;
 1076 
 1077         /*
 1078          * The cdev_t represents the disk/slice/part.  The shared
 1079          * dp structure governs all cdevs associated with the disk.
 1080          *
 1081          * As a safety only close the underlying raw device on the last
 1082          * close the disk device if our tracking of the slices/partitions
 1083          * also indicates nothing is open.
 1084          */
 1085         KKASSERT(dp->d_opencount >= 1);
 1086         lcount = atomic_fetchadd_int(&dp->d_opencount, -1);
 1087 
 1088         get_mplock();
 1089         dsclose(dev, ap->a_devtype, dp->d_slice);
 1090         if (lcount <= 1 && !dsisopen(dp->d_slice)) {
 1091                 error = dev_dclose(dp->d_rawdev, ap->a_fflag, ap->a_devtype);
 1092         }
 1093         rel_mplock();
 1094         return (error);
 1095 }
 1096 
 1097 /*
 1098  * First execute the ioctl on the disk device, and if it isn't supported
 1099  * try running it on the backing device.
 1100  */
 1101 static
 1102 int
 1103 diskioctl(struct dev_ioctl_args *ap)
 1104 {
 1105         cdev_t dev = ap->a_head.a_dev;
 1106         struct disk *dp;
 1107         int error;
 1108         u_int u;
 1109 
 1110         dp = dev->si_disk;
 1111         if (dp == NULL)
 1112                 return (ENXIO);
 1113 
 1114         devfs_debug(DEVFS_DEBUG_DEBUG,
 1115                     "diskioctl: cmd is: %lx (name: %s)\n",
 1116                     ap->a_cmd, dev->si_name);
 1117         devfs_debug(DEVFS_DEBUG_DEBUG,
 1118                     "diskioctl: &dp->d_slice is: %p, %p\n",
 1119                     &dp->d_slice, dp->d_slice);
 1120 
 1121         if (ap->a_cmd == DIOCGKERNELDUMP) {
 1122                 u = *(u_int *)ap->a_data;
 1123                 return disk_dumpconf(dev, u);
 1124         }
 1125 
 1126         if (ap->a_cmd == DIOCRECLUSTER && dev == dp->d_cdev) {
 1127                 error = disk_iocom_ioctl(dp, ap->a_cmd, ap->a_data);
 1128                 return error;
 1129         }
 1130 
 1131         if (&dp->d_slice == NULL || dp->d_slice == NULL ||
 1132             ((dp->d_info.d_dsflags & DSO_DEVICEMAPPER) &&
 1133              dkslice(dev) == WHOLE_DISK_SLICE)) {
 1134                 error = ENOIOCTL;
 1135         } else {
 1136                 get_mplock();
 1137                 error = dsioctl(dev, ap->a_cmd, ap->a_data, ap->a_fflag,
 1138                                 &dp->d_slice, &dp->d_info);
 1139                 rel_mplock();
 1140         }
 1141 
 1142         if (error == ENOIOCTL) {
 1143                 error = dev_dioctl(dp->d_rawdev, ap->a_cmd, ap->a_data,
 1144                                    ap->a_fflag, ap->a_cred, NULL);
 1145         }
 1146         return (error);
 1147 }
 1148 
 1149 /*
 1150  * Execute strategy routine
 1151  */
 1152 static
 1153 int
 1154 diskstrategy(struct dev_strategy_args *ap)
 1155 {
 1156         cdev_t dev = ap->a_head.a_dev;
 1157         struct bio *bio = ap->a_bio;
 1158         struct bio *nbio;
 1159         struct disk *dp;
 1160 
 1161         dp = dev->si_disk;
 1162 
 1163         if (dp == NULL) {
 1164                 bio->bio_buf->b_error = ENXIO;
 1165                 bio->bio_buf->b_flags |= B_ERROR;
 1166                 biodone(bio);
 1167                 return(0);
 1168         }
 1169         KKASSERT(dev->si_disk == dp);
 1170 
 1171         /*
 1172          * The dscheck() function will also transform the slice relative
 1173          * block number i.e. bio->bio_offset into a block number that can be
 1174          * passed directly to the underlying raw device.  If dscheck()
 1175          * returns NULL it will have handled the bio for us (e.g. EOF
 1176          * or error due to being beyond the device size).
 1177          */
 1178         if ((nbio = dscheck(dev, bio, dp->d_slice)) != NULL) {
 1179                 dsched_queue(dp, nbio);
 1180         } else {
 1181                 biodone(bio);
 1182         }
 1183         return(0);
 1184 }
 1185 
 1186 /*
 1187  * Return the partition size in ?blocks?
 1188  */
 1189 static
 1190 int
 1191 diskpsize(struct dev_psize_args *ap)
 1192 {
 1193         cdev_t dev = ap->a_head.a_dev;
 1194         struct disk *dp;
 1195 
 1196         dp = dev->si_disk;
 1197         if (dp == NULL)
 1198                 return(ENODEV);
 1199 
 1200         ap->a_result = dssize(dev, &dp->d_slice);
 1201 
 1202         if ((ap->a_result == -1) &&
 1203            (dp->d_info.d_dsflags & DSO_RAWPSIZE)) {
 1204                 ap->a_head.a_dev = dp->d_rawdev;
 1205                 return dev_doperate(&ap->a_head);
 1206         }
 1207         return(0);
 1208 }
 1209 
 1210 int
 1211 diskdump(struct dev_dump_args *ap)
 1212 {
 1213         cdev_t dev = ap->a_head.a_dev;
 1214         struct disk *dp = dev->si_disk;
 1215         u_int64_t size, offset;
 1216         int error;
 1217 
 1218         error = disk_dumpcheck(dev, &size, &ap->a_blkno, &ap->a_secsize);
 1219         /* XXX: this should probably go in disk_dumpcheck somehow */
 1220         if (ap->a_length != 0) {
 1221                 size *= DEV_BSIZE;
 1222                 offset = ap->a_blkno * DEV_BSIZE;
 1223                 if ((ap->a_offset < offset) ||
 1224                     (ap->a_offset + ap->a_length - offset > size)) {
 1225                         kprintf("Attempt to write outside dump "
 1226                                 "device boundaries.\n");
 1227                         error = ENOSPC;
 1228                 }
 1229         }
 1230 
 1231         if (error == 0) {
 1232                 ap->a_head.a_dev = dp->d_rawdev;
 1233                 error = dev_doperate(&ap->a_head);
 1234         }
 1235 
 1236         return(error);
 1237 }
 1238 
 1239 
 1240 SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD,
 1241            0, sizeof(struct diskslices), "sizeof(struct diskslices)");
 1242 
 1243 SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
 1244            0, sizeof(struct disk), "sizeof(struct disk)");
 1245 
 1246 /*
 1247  * Reorder interval for burst write allowance and minor write
 1248  * allowance.
 1249  *
 1250  * We always want to trickle some writes in to make use of the
 1251  * disk's zone cache.  Bursting occurs on a longer interval and only
 1252  * runningbufspace is well over the hirunningspace limit.
 1253  */
 1254 int bioq_reorder_burst_interval = 60;   /* should be multiple of minor */
 1255 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_interval,
 1256            CTLFLAG_RW, &bioq_reorder_burst_interval, 0, "");
 1257 int bioq_reorder_minor_interval = 5;
 1258 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_interval,
 1259            CTLFLAG_RW, &bioq_reorder_minor_interval, 0, "");
 1260 
 1261 int bioq_reorder_burst_bytes = 3000000;
 1262 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_bytes,
 1263            CTLFLAG_RW, &bioq_reorder_burst_bytes, 0, "");
 1264 int bioq_reorder_minor_bytes = 262144;
 1265 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_bytes,
 1266            CTLFLAG_RW, &bioq_reorder_minor_bytes, 0, "");
 1267 
 1268 
 1269 /*
 1270  * Order I/Os.  Generally speaking this code is designed to make better
 1271  * use of drive zone caches.  A drive zone cache can typically track linear
 1272  * reads or writes for around 16 zones simultaniously.
 1273  *
 1274  * Read prioritization issues:  It is possible for hundreds of megabytes worth
 1275  * of writes to be queued asynchronously.  This creates a huge bottleneck
 1276  * for reads which reduce read bandwidth to a trickle.
 1277  *
 1278  * To solve this problem we generally reorder reads before writes.
 1279  *
 1280  * However, a large number of random reads can also starve writes and
 1281  * make poor use of the drive zone cache so we allow writes to trickle
 1282  * in every N reads.
 1283  */
 1284 void
 1285 bioqdisksort(struct bio_queue_head *bioq, struct bio *bio)
 1286 {
 1287         /*
 1288          * The BIO wants to be ordered.  Adding to the tail also
 1289          * causes transition to be set to NULL, forcing the ordering
 1290          * of all prior I/O's.
 1291          */
 1292         if (bio->bio_buf->b_flags & B_ORDERED) {
 1293                 bioq_insert_tail(bioq, bio);
 1294                 return;
 1295         }
 1296 
 1297         switch(bio->bio_buf->b_cmd) {
 1298         case BUF_CMD_READ:
 1299                 if (bioq->transition) {
 1300                         /*
 1301                          * Insert before the first write.  Bleedover writes
 1302                          * based on reorder intervals to prevent starvation.
 1303                          */
 1304                         TAILQ_INSERT_BEFORE(bioq->transition, bio, bio_act);
 1305                         ++bioq->reorder;
 1306                         if (bioq->reorder % bioq_reorder_minor_interval == 0) {
 1307                                 bioqwritereorder(bioq);
 1308                                 if (bioq->reorder >=
 1309                                     bioq_reorder_burst_interval) {
 1310                                         bioq->reorder = 0;
 1311                                 }
 1312                         }
 1313                 } else {
 1314                         /*
 1315                          * No writes queued (or ordering was forced),
 1316                          * insert at tail.
 1317                          */
 1318                         TAILQ_INSERT_TAIL(&bioq->queue, bio, bio_act);
 1319                 }
 1320                 break;
 1321         case BUF_CMD_WRITE:
 1322                 /*
 1323                  * Writes are always appended.  If no writes were previously
 1324                  * queued or an ordered tail insertion occured the transition
 1325                  * field will be NULL.
 1326                  */
 1327                 TAILQ_INSERT_TAIL(&bioq->queue, bio, bio_act);
 1328                 if (bioq->transition == NULL)
 1329                         bioq->transition = bio;
 1330                 break;
 1331         default:
 1332                 /*
 1333                  * All other request types are forced to be ordered.
 1334                  */
 1335                 bioq_insert_tail(bioq, bio);
 1336                 break;
 1337         }
 1338 }
 1339 
 1340 /*
 1341  * Move the read-write transition point to prevent reads from
 1342  * completely starving our writes.  This brings a number of writes into
 1343  * the fold every N reads.
 1344  *
 1345  * We bring a few linear writes into the fold on a minor interval
 1346  * and we bring a non-linear burst of writes into the fold on a major
 1347  * interval.  Bursting only occurs if runningbufspace is really high
 1348  * (typically from syncs, fsyncs, or HAMMER flushes).
 1349  */
 1350 static
 1351 void
 1352 bioqwritereorder(struct bio_queue_head *bioq)
 1353 {
 1354         struct bio *bio;
 1355         off_t next_offset;
 1356         size_t left;
 1357         size_t n;
 1358         int check_off;
 1359 
 1360         if (bioq->reorder < bioq_reorder_burst_interval ||
 1361             !buf_runningbufspace_severe()) {
 1362                 left = (size_t)bioq_reorder_minor_bytes;
 1363                 check_off = 1;
 1364         } else {
 1365                 left = (size_t)bioq_reorder_burst_bytes;
 1366                 check_off = 0;
 1367         }
 1368 
 1369         next_offset = bioq->transition->bio_offset;
 1370         while ((bio = bioq->transition) != NULL &&
 1371                (check_off == 0 || next_offset == bio->bio_offset)
 1372         ) {
 1373                 n = bio->bio_buf->b_bcount;
 1374                 next_offset = bio->bio_offset + n;
 1375                 bioq->transition = TAILQ_NEXT(bio, bio_act);
 1376                 if (left < n)
 1377                         break;
 1378                 left -= n;
 1379         }
 1380 }
 1381 
 1382 /*
 1383  * Bounds checking against the media size, used for the raw partition.
 1384  * secsize, mediasize and b_blkno must all be the same units.
 1385  * Possibly this has to be DEV_BSIZE (512).
 1386  */
 1387 int
 1388 bounds_check_with_mediasize(struct bio *bio, int secsize, uint64_t mediasize)
 1389 {
 1390         struct buf *bp = bio->bio_buf;
 1391         int64_t sz;
 1392 
 1393         sz = howmany(bp->b_bcount, secsize);
 1394 
 1395         if (bio->bio_offset/DEV_BSIZE + sz > mediasize) {
 1396                 sz = mediasize - bio->bio_offset/DEV_BSIZE;
 1397                 if (sz == 0) {
 1398                         /* If exactly at end of disk, return EOF. */
 1399                         bp->b_resid = bp->b_bcount;
 1400                         return 0;
 1401                 }
 1402                 if (sz < 0) {
 1403                         /* If past end of disk, return EINVAL. */
 1404                         bp->b_error = EINVAL;
 1405                         return 0;
 1406                 }
 1407                 /* Otherwise, truncate request. */
 1408                 bp->b_bcount = sz * secsize;
 1409         }
 1410 
 1411         return 1;
 1412 }
 1413 
 1414 /*
 1415  * Disk error is the preface to plaintive error messages
 1416  * about failing disk transfers.  It prints messages of the form
 1417 
 1418 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
 1419 
 1420  * if the offset of the error in the transfer and a disk label
 1421  * are both available.  blkdone should be -1 if the position of the error
 1422  * is unknown; the disklabel pointer may be null from drivers that have not
 1423  * been converted to use them.  The message is printed with kprintf
 1424  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
 1425  * The message should be completed (with at least a newline) with kprintf
 1426  * or log(-1, ...), respectively.  There is no trailing space.
 1427  */
 1428 void
 1429 diskerr(struct bio *bio, cdev_t dev, const char *what, int pri, int donecnt)
 1430 {
 1431         struct buf *bp = bio->bio_buf;
 1432         const char *term;
 1433 
 1434         switch(bp->b_cmd) {
 1435         case BUF_CMD_READ:
 1436                 term = "read";
 1437                 break;
 1438         case BUF_CMD_WRITE:
 1439                 term = "write";
 1440                 break;
 1441         default:
 1442                 term = "access";
 1443                 break;
 1444         }
 1445         kprintf("%s: %s %sing ", dev->si_name, what, term);
 1446         kprintf("offset %012llx for %d",
 1447                 (long long)bio->bio_offset,
 1448                 bp->b_bcount);
 1449 
 1450         if (donecnt)
 1451                 kprintf(" (%d bytes completed)", donecnt);
 1452 }
 1453 
 1454 /*
 1455  * Locate a disk device
 1456  */
 1457 cdev_t
 1458 disk_locate(const char *devname)
 1459 {
 1460         return devfs_find_device_by_name("%s", devname);
 1461 }
 1462 
 1463 void
 1464 disk_config(void *arg)
 1465 {
 1466         disk_msg_send_sync(DISK_SYNC, NULL, NULL);
 1467 }
 1468 
 1469 static void
 1470 disk_init(void)
 1471 {
 1472         struct thread* td_core;
 1473 
 1474         disk_msg_cache = objcache_create("disk-msg-cache", 0, 0,
 1475                                          NULL, NULL, NULL,
 1476                                          objcache_malloc_alloc,
 1477                                          objcache_malloc_free,
 1478                                          &disk_msg_malloc_args);
 1479 
 1480         lwkt_token_init(&disklist_token, "disks");
 1481 
 1482         /*
 1483          * Initialize the reply-only port which acts as a message drain
 1484          */
 1485         lwkt_initport_replyonly(&disk_dispose_port, disk_msg_autofree_reply);
 1486 
 1487         lwkt_gettoken(&disklist_token);
 1488         lwkt_create(disk_msg_core, /*args*/NULL, &td_core, NULL,
 1489                     0, -1, "disk_msg_core");
 1490         tsleep(td_core, 0, "diskcore", 0);
 1491         lwkt_reltoken(&disklist_token);
 1492 }
 1493 
 1494 static void
 1495 disk_uninit(void)
 1496 {
 1497         objcache_destroy(disk_msg_cache);
 1498 }
 1499 
 1500 /*
 1501  * Clean out illegal characters in serial numbers.
 1502  */
 1503 static void
 1504 disk_cleanserial(char *serno)
 1505 {
 1506         char c;
 1507 
 1508         while ((c = *serno) != 0) {
 1509                 if (c >= 'a' && c <= 'z')
 1510                         ;
 1511                 else if (c >= 'A' && c <= 'Z')
 1512                         ;
 1513                 else if (c >= '' && c <= '9')
 1514                         ;
 1515                 else if (c == '-' || c == '@' || c == '+' || c == '.')
 1516                         ;
 1517                 else
 1518                         c = '_';
 1519                 *serno++= c;
 1520         }
 1521 }
 1522 
 1523 TUNABLE_INT("kern.disk_debug", &disk_debug_enable);
 1524 SYSCTL_INT(_kern, OID_AUTO, disk_debug, CTLFLAG_RW, &disk_debug_enable,
 1525            0, "Enable subr_disk debugging");
 1526 
 1527 SYSINIT(disk_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, disk_init, NULL);
 1528 SYSUNINIT(disk_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, disk_uninit, NULL);

Cache object: d3280a90086db463ddd43dae5cbd934c


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.