FreeBSD/Linux Kernel Cross Reference
sys/dev/md/md.c


/*-
 * SPDX-License-Identifier: (Beerware AND BSD-3-Clause)
 *
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD$
 *
 */

/*-
 * The following functions are based on the vn(4) driver: mdstart_swap(),
 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
 * and as such under the following copyright:
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *      The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah Hdr: vn.c 1.13 94/04/02
 *
 *      from: @(#)vn.c  8.6 (Berkeley) 4/1/94
 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
 */

#include "opt_rootdevname.h"
#include "opt_geom.h"
#include "opt_md.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mdioctl.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/disk.h>

#include <geom/geom.h>
#include <geom/geom_int.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>

#include <machine/bus.h>

#define MD_MODVER 1

#define MD_SHUTDOWN     0x10000         /* Tell worker thread to terminate. */
#define MD_EXITING      0x20000         /* Worker thread is exiting. */
#define MD_PROVIDERGONE 0x40000         /* Safe to free the softc */

#ifndef MD_NSECT
#define MD_NSECT (10000 * 2)
#endif

struct md_req {
        unsigned        md_unit;        /* unit number */
        enum md_types   md_type;        /* type of disk */
        off_t           md_mediasize;   /* size of disk in bytes */
        unsigned        md_sectorsize;  /* sectorsize */
        unsigned        md_options;     /* options */
        int             md_fwheads;     /* firmware heads */
        int             md_fwsectors;   /* firmware sectors */
        char            *md_file;       /* pathname of file to mount */
        enum uio_seg    md_file_seg;    /* location of md_file */
        char            *md_label;      /* label of the device (userspace) */
        int             *md_units;      /* pointer to units array (kernel) */
        size_t          md_units_nitems; /* items in md_units array */
};

#ifdef COMPAT_FREEBSD32
struct md_ioctl32 {
        unsigned        md_version;
        unsigned        md_unit;
        enum md_types   md_type;
        uint32_t        md_file;
        off_t           md_mediasize;
        unsigned        md_sectorsize;
        unsigned        md_options;
        uint64_t        md_base;
        int             md_fwheads;
        int             md_fwsectors;
        uint32_t        md_label;
        int             md_pad[MDNPAD];
} __attribute__((__packed__));
CTASSERT((sizeof(struct md_ioctl32)) == 436);

#define MDIOCATTACH_32  _IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32)
#define MDIOCDETACH_32  _IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32)
#define MDIOCQUERY_32   _IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32)
#define MDIOCRESIZE_32  _IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32)
#endif /* COMPAT_FREEBSD32 */

static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");

static int md_debug;
SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
    "Enable md(4) debug messages");
static int md_malloc_wait;
SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
    "Allow malloc to wait for memory allocations");

#if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
#define MD_ROOT_FSTYPE  "ufs"
#endif

#if defined(MD_ROOT)
/*
 * Preloaded image gets put here.
 */
#if defined(MD_ROOT_SIZE)
/*
 * We put the mfs_root symbol into the oldmfs section of the kernel object file.
 * Applications that patch the object with the image can determine
 * the size by looking at the oldmfs section size within the kernel.
 */
u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
const int mfs_root_size = sizeof(mfs_root);
#elif defined(MD_ROOT_MEM)
/* MD region already mapped in the memory */
u_char *mfs_root;
int mfs_root_size;
#else
extern volatile u_char __weak_symbol mfs_root;
extern volatile u_char __weak_symbol mfs_root_end;
#define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
#endif
#endif

static g_init_t g_md_init;
static g_fini_t g_md_fini;
static g_start_t g_md_start;
static g_access_t g_md_access;
static void g_md_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);
static g_provgone_t g_md_providergone;

static struct cdev *status_dev = NULL;
static struct sx md_sx;
static struct unrhdr *md_uh;

static d_ioctl_t mdctlioctl;

static struct cdevsw mdctl_cdevsw = {
        .d_version =    D_VERSION,
        .d_ioctl =      mdctlioctl,
        .d_name =       MD_NAME,
};

struct g_class g_md_class = {
        .name = "MD",
        .version = G_VERSION,
        .init = g_md_init,
        .fini = g_md_fini,
        .start = g_md_start,
        .access = g_md_access,
        .dumpconf = g_md_dumpconf,
        .providergone = g_md_providergone,
};

DECLARE_GEOM_CLASS(g_md_class, g_md);

static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);

#define NINDIR  (PAGE_SIZE / sizeof(uintptr_t))
#define NMASK   (NINDIR-1)
static int nshift;
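
/*
 * A worked example (an assumption for illustration, not computed in this
 * file): with 4 KB pages and 64-bit pointers, NINDIR is 4096 / 8 == 512
 * entries per indir node, NMASK is 0x1ff, and nshift, which is set up at
 * initialization time, comes out to 9, so each level of the tree decodes
 * nine bits of the sector number.
 */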

static uma_zone_t md_pbuf_zone;

struct indir {
        uintptr_t       *array;
        u_int           total;
        u_int           used;
        u_int           shift;
};

struct md_s {
        int unit;
        LIST_ENTRY(md_s) list;
        struct bio_queue_head bio_queue;
        struct mtx queue_mtx;
        struct cdev *dev;
        enum md_types type;
        off_t mediasize;
        unsigned sectorsize;
        unsigned opencount;
        unsigned fwheads;
        unsigned fwsectors;
        char ident[32];
        unsigned flags;
        char name[20];
        struct proc *procp;
        struct g_geom *gp;
        struct g_provider *pp;
        int (*start)(struct md_s *sc, struct bio *bp);
        struct devstat *devstat;

        /* MD_MALLOC related fields */
        struct indir *indir;
        uma_zone_t uma;

        /* MD_PRELOAD related fields */
        u_char *pl_ptr;
        size_t pl_len;

        /* MD_VNODE related fields */
        struct vnode *vnode;
        char file[PATH_MAX];
        char label[PATH_MAX];
        struct ucred *cred;

        /* MD_SWAP related fields */
        vm_object_t object;
};
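
/*
 * Each backing-store type is serviced through the sc->start hook, which
 * md_kthread() below invokes for every dequeued bio.  The hook is wired
 * up when the device is attached (outside this excerpt); the pairing is
 * MD_MALLOC -> mdstart_malloc(), MD_PRELOAD -> mdstart_preload(),
 * MD_VNODE -> mdstart_vnode(), MD_SWAP -> mdstart_swap() and
 * MD_NULL -> mdstart_null().
 */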

static struct indir *
new_indir(u_int shift)
{
        struct indir *ip;

        ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
            | M_ZERO);
        if (ip == NULL)
                return (NULL);
        ip->array = malloc(sizeof(uintptr_t) * NINDIR,
            M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
        if (ip->array == NULL) {
                free(ip, M_MD);
                return (NULL);
        }
        ip->total = NINDIR;
        ip->shift = shift;
        return (ip);
}

static void
del_indir(struct indir *ip)
{

        free(ip->array, M_MDSECT);
        free(ip, M_MD);
}

static void
destroy_indir(struct md_s *sc, struct indir *ip)
{
        int i;

        for (i = 0; i < NINDIR; i++) {
                if (!ip->array[i])
                        continue;
                if (ip->shift)
                        destroy_indir(sc, (struct indir*)(ip->array[i]));
                else if (ip->array[i] > 255)
                        uma_zfree(sc->uma, (void *)(ip->array[i]));
        }
        del_indir(ip);
}

/*
 * This function does the math and allocates the top level "indir" structure
 * for a device of "size" sectors.
 */
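/*
 * For example, assuming NINDIR == 512 (4 KB pages, 64-bit pointers), a
 * device of 1000000 sectors needs two divisions before rcnt drops to or
 * below NINDIR (1000000 -> 1953 -> 3), so layer ends up as 2 and the root
 * node gets shift == 2 * 9 == 18: it indexes bits 18..26 of the sector
 * number, with two further levels below it.
 */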

static struct indir *
dimension(off_t size)
{
        off_t rcnt;
        struct indir *ip;
        int layer;

        rcnt = size;
        layer = 0;
        while (rcnt > NINDIR) {
                rcnt /= NINDIR;
                layer++;
        }

        /*
         * XXX: the top layer is probably not fully populated, so we allocate
         * too much space for ip->array in here.
         */
        ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
        ip->array = malloc(sizeof(uintptr_t) * NINDIR,
            M_MDSECT, M_WAITOK | M_ZERO);
        ip->total = NINDIR;
        ip->shift = layer * nshift;
        return (ip);
}

/*
 * Read a given sector
 */

static uintptr_t
s_read(struct indir *ip, off_t offset)
{
        struct indir *cip;
        int idx;
        uintptr_t up;

        if (md_debug > 1)
                printf("s_read(%jd)\n", (intmax_t)offset);
        up = 0;
        for (cip = ip; cip != NULL;) {
                if (cip->shift) {
                        idx = (offset >> cip->shift) & NMASK;
                        up = cip->array[idx];
                        cip = (struct indir *)up;
                        continue;
                }
                idx = offset & NMASK;
                return (cip->array[idx]);
        }
        return (0);
}

/*
 * Write a given sector, prune the tree if the value is 0
 */
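/*
 * Leaf slots carry a small encoding, visible in mdstart_malloc() below:
 * 0 is an unallocated (reads-as-zero) sector, 1..255 is the fill byte of
 * a sector whose bytes are all identical (the MD_COMPRESS case), and any
 * larger value is a pointer to a uma-allocated sector buffer.  Writing 0
 * clears the slot (the caller frees any old buffer), and the loop at the
 * bottom of s_write() prunes indir nodes that empty out as a result.
 */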

static int
s_write(struct indir *ip, off_t offset, uintptr_t ptr)
{
        struct indir *cip, *lip[10];
        int idx, li;
        uintptr_t up;

        if (md_debug > 1)
                printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
        up = 0;
        li = 0;
        cip = ip;
        for (;;) {
                lip[li++] = cip;
                if (cip->shift) {
                        idx = (offset >> cip->shift) & NMASK;
                        up = cip->array[idx];
                        if (up != 0) {
                                cip = (struct indir *)up;
                                continue;
                        }
                        /* Allocate branch */
                        cip->array[idx] =
                            (uintptr_t)new_indir(cip->shift - nshift);
                        if (cip->array[idx] == 0)
                                return (ENOSPC);
                        cip->used++;
                        up = cip->array[idx];
                        cip = (struct indir *)up;
                        continue;
                }
                /* leafnode */
                idx = offset & NMASK;
                up = cip->array[idx];
                if (up != 0)
                        cip->used--;
                cip->array[idx] = ptr;
                if (ptr != 0)
                        cip->used++;
                break;
        }
        if (cip->used != 0 || li == 1)
                return (0);
        li--;
        while (cip->used == 0 && cip != ip) {
                li--;
                idx = (offset >> lip[li]->shift) & NMASK;
                up = lip[li]->array[idx];
                KASSERT(up == (uintptr_t)cip, ("md screwed up"));
                del_indir(cip);
                lip[li]->array[idx] = 0;
                lip[li]->used--;
                cip = lip[li];
        }
        return (0);
}

static int
g_md_access(struct g_provider *pp, int r, int w, int e)
{
        struct md_s *sc;

        sc = pp->geom->softc;
        if (sc == NULL) {
                if (r <= 0 && w <= 0 && e <= 0)
                        return (0);
                return (ENXIO);
        }
        r += pp->acr;
        w += pp->acw;
        e += pp->ace;
        if ((sc->flags & MD_READONLY) != 0 && w > 0)
                return (EROFS);
        if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
                sc->opencount = 1;
        } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
                sc->opencount = 0;
        }
        return (0);
}

static void
g_md_start(struct bio *bp)
{
        struct md_s *sc;

        sc = bp->bio_to->geom->softc;
        if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
                devstat_start_transaction_bio(sc->devstat, bp);
        }
        mtx_lock(&sc->queue_mtx);
        bioq_disksort(&sc->bio_queue, bp);
        wakeup(sc);
        mtx_unlock(&sc->queue_mtx);
}

#define MD_MALLOC_MOVE_ZERO     1
#define MD_MALLOC_MOVE_FILL     2
#define MD_MALLOC_MOVE_READ     3
#define MD_MALLOC_MOVE_WRITE    4
#define MD_MALLOC_MOVE_CMP      5
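
/*
 * These opcodes let md_malloc_move_ma() and md_malloc_move_vlist() apply
 * the same five sector operations to unmapped vm_page arrays and to
 * bus_dma segment lists, respectively: zero the bio pages, fill them with
 * a stored byte, copy a sector out to them (BIO_READ), copy them into a
 * sector (BIO_WRITE), or test whether every byte is identical (the
 * MD_COMPRESS check).
 */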

static int
md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
    void *ptr, u_char fill, int op)
{
        struct sf_buf *sf;
        vm_page_t m, *mp1;
        char *p, first;
        off_t *uc;
        unsigned n;
        int error, i, ma_offs1, sz, first_read;

        m = NULL;
        error = 0;
        sf = NULL;
        /* if (op == MD_MALLOC_MOVE_CMP) { gcc */
                first = 0;
                first_read = 0;
                uc = ptr;
                mp1 = *mp;
                ma_offs1 = *ma_offs;
        /* } */
        sched_pin();
        for (n = sectorsize; n != 0; n -= sz) {
                sz = imin(PAGE_SIZE - *ma_offs, n);
                if (m != **mp) {
                        if (sf != NULL)
                                sf_buf_free(sf);
                        m = **mp;
                        sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
                            (md_malloc_wait ? 0 : SFB_NOWAIT));
                        if (sf == NULL) {
                                error = ENOMEM;
                                break;
                        }
                }
                p = (char *)sf_buf_kva(sf) + *ma_offs;
                switch (op) {
                case MD_MALLOC_MOVE_ZERO:
                        bzero(p, sz);
                        break;
                case MD_MALLOC_MOVE_FILL:
                        memset(p, fill, sz);
                        break;
                case MD_MALLOC_MOVE_READ:
                        bcopy(ptr, p, sz);
                        cpu_flush_dcache(p, sz);
                        break;
                case MD_MALLOC_MOVE_WRITE:
                        bcopy(p, ptr, sz);
                        break;
                case MD_MALLOC_MOVE_CMP:
                        for (i = 0; i < sz; i++, p++) {
                                if (!first_read) {
                                        *uc = (u_char)*p;
                                        first = *p;
                                        first_read = 1;
                                } else if (*p != first) {
                                        error = EDOOFUS;
                                        break;
                                }
                        }
                        break;
                default:
                        KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
                        break;
                }
                if (error != 0)
                        break;
                *ma_offs += sz;
                *ma_offs %= PAGE_SIZE;
                if (*ma_offs == 0)
                        (*mp)++;
                ptr = (char *)ptr + sz;
        }

        if (sf != NULL)
                sf_buf_free(sf);
        sched_unpin();
        if (op == MD_MALLOC_MOVE_CMP && error != 0) {
                *mp = mp1;
                *ma_offs = ma_offs1;
        }
        return (error);
}

static int
md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
    unsigned len, void *ptr, u_char fill, int op)
{
        bus_dma_segment_t *vlist;
        uint8_t *p, *end, first;
        off_t *uc;
        int ma_offs, seg_len;

        vlist = *pvlist;
        ma_offs = *pma_offs;
        uc = ptr;

        for (; len != 0; len -= seg_len) {
                seg_len = imin(vlist->ds_len - ma_offs, len);
                p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
                switch (op) {
                case MD_MALLOC_MOVE_ZERO:
                        bzero(p, seg_len);
                        break;
                case MD_MALLOC_MOVE_FILL:
                        memset(p, fill, seg_len);
                        break;
                case MD_MALLOC_MOVE_READ:
                        bcopy(ptr, p, seg_len);
                        cpu_flush_dcache(p, seg_len);
                        break;
                case MD_MALLOC_MOVE_WRITE:
                        bcopy(p, ptr, seg_len);
                        break;
                case MD_MALLOC_MOVE_CMP:
                        end = p + seg_len;
                        first = *uc = *p;
                        /* Confirm all following bytes match the first */
                        while (++p < end) {
                                if (*p != first)
                                        return (EDOOFUS);
                        }
                        break;
                default:
                        KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
                        break;
                }

                ma_offs += seg_len;
                if (ma_offs == vlist->ds_len) {
                        ma_offs = 0;
                        vlist++;
                }
                ptr = (uint8_t *)ptr + seg_len;
        }
        *pvlist = vlist;
        *pma_offs = ma_offs;

        return (0);
}

static int
mdstart_malloc(struct md_s *sc, struct bio *bp)
{
        u_char *dst;
        vm_page_t *m;
        bus_dma_segment_t *vlist;
        int i, error, error1, ma_offs, notmapped;
        off_t secno, nsec, uc;
        uintptr_t sp, osp;

        switch (bp->bio_cmd) {
        case BIO_READ:
        case BIO_WRITE:
        case BIO_DELETE:
                break;
        default:
                return (EOPNOTSUPP);
        }

        notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
        vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
            (bus_dma_segment_t *)bp->bio_data : NULL;
        if (notmapped) {
                m = bp->bio_ma;
                ma_offs = bp->bio_ma_offset;
                dst = NULL;
                KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
        } else if (vlist != NULL) {
                ma_offs = bp->bio_ma_offset;
                dst = NULL;
        } else {
                dst = bp->bio_data;
        }

        nsec = bp->bio_length / sc->sectorsize;
        secno = bp->bio_offset / sc->sectorsize;
        error = 0;
        while (nsec--) {
                osp = s_read(sc->indir, secno);
                if (bp->bio_cmd == BIO_DELETE) {
                        if (osp != 0)
                                error = s_write(sc->indir, secno, 0);
                } else if (bp->bio_cmd == BIO_READ) {
                        if (osp == 0) {
                                if (notmapped) {
                                        error = md_malloc_move_ma(&m, &ma_offs,
                                            sc->sectorsize, NULL, 0,
                                            MD_MALLOC_MOVE_ZERO);
                                } else if (vlist != NULL) {
                                        error = md_malloc_move_vlist(&vlist,
                                            &ma_offs, sc->sectorsize, NULL, 0,
                                            MD_MALLOC_MOVE_ZERO);
                                } else
                                        bzero(dst, sc->sectorsize);
                        } else if (osp <= 255) {
                                if (notmapped) {
                                        error = md_malloc_move_ma(&m, &ma_offs,
                                            sc->sectorsize, NULL, osp,
                                            MD_MALLOC_MOVE_FILL);
                                } else if (vlist != NULL) {
                                        error = md_malloc_move_vlist(&vlist,
                                            &ma_offs, sc->sectorsize, NULL, osp,
                                            MD_MALLOC_MOVE_FILL);
                                } else
                                        memset(dst, osp, sc->sectorsize);
                        } else {
                                if (notmapped) {
                                        error = md_malloc_move_ma(&m, &ma_offs,
                                            sc->sectorsize, (void *)osp, 0,
                                            MD_MALLOC_MOVE_READ);
                                } else if (vlist != NULL) {
                                        error = md_malloc_move_vlist(&vlist,
                                            &ma_offs, sc->sectorsize,
                                            (void *)osp, 0,
                                            MD_MALLOC_MOVE_READ);
                                } else {
                                        bcopy((void *)osp, dst, sc->sectorsize);
                                        cpu_flush_dcache(dst, sc->sectorsize);
                                }
                        }
                        osp = 0;
                } else if (bp->bio_cmd == BIO_WRITE) {
                        if (sc->flags & MD_COMPRESS) {
                                if (notmapped) {
                                        error1 = md_malloc_move_ma(&m, &ma_offs,
                                            sc->sectorsize, &uc, 0,
                                            MD_MALLOC_MOVE_CMP);
                                        i = error1 == 0 ? sc->sectorsize : 0;
                                } else if (vlist != NULL) {
                                        error1 = md_malloc_move_vlist(&vlist,
                                            &ma_offs, sc->sectorsize, &uc, 0,
                                            MD_MALLOC_MOVE_CMP);
                                        i = error1 == 0 ? sc->sectorsize : 0;
                                } else {
                                        uc = dst[0];
                                        for (i = 1; i < sc->sectorsize; i++) {
                                                if (dst[i] != uc)
                                                        break;
                                        }
                                }
                        } else {
                                i = 0;
                                uc = 0;
                        }
                        if (i == sc->sectorsize) {
                                if (osp != uc)
                                        error = s_write(sc->indir, secno, uc);
                        } else {
                                if (osp <= 255) {
                                        sp = (uintptr_t)uma_zalloc(sc->uma,
                                            md_malloc_wait ? M_WAITOK :
                                            M_NOWAIT);
                                        if (sp == 0) {
                                                error = ENOSPC;
                                                break;
                                        }
                                        if (notmapped) {
                                                error = md_malloc_move_ma(&m,
                                                    &ma_offs, sc->sectorsize,
                                                    (void *)sp, 0,
                                                    MD_MALLOC_MOVE_WRITE);
                                        } else if (vlist != NULL) {
                                                error = md_malloc_move_vlist(
                                                    &vlist, &ma_offs,
                                                    sc->sectorsize, (void *)sp,
                                                    0, MD_MALLOC_MOVE_WRITE);
                                        } else {
                                                bcopy(dst, (void *)sp,
                                                    sc->sectorsize);
                                        }
                                        error = s_write(sc->indir, secno, sp);
                                } else {
                                        if (notmapped) {
                                                error = md_malloc_move_ma(&m,
                                                    &ma_offs, sc->sectorsize,
                                                    (void *)osp, 0,
                                                    MD_MALLOC_MOVE_WRITE);
                                        } else if (vlist != NULL) {
                                                error = md_malloc_move_vlist(
                                                    &vlist, &ma_offs,
                                                    sc->sectorsize, (void *)osp,
                                                    0, MD_MALLOC_MOVE_WRITE);
                                        } else {
                                                bcopy(dst, (void *)osp,
                                                    sc->sectorsize);
                                        }
                                        osp = 0;
                                }
                        }
                } else {
                        error = EOPNOTSUPP;
                }
                if (osp > 255)
                        uma_zfree(sc->uma, (void*)osp);
                if (error != 0)
                        break;
                secno++;
                if (!notmapped && vlist == NULL)
                        dst += sc->sectorsize;
        }
        bp->bio_resid = 0;
        return (error);
}

static void
mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
{
        off_t seg_len;

        while (offset >= vlist->ds_len) {
                offset -= vlist->ds_len;
                vlist++;
        }

        while (len != 0) {
                seg_len = omin(len, vlist->ds_len - offset);
                bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset),
                    seg_len);
                offset = 0;
                src = (uint8_t *)src + seg_len;
                len -= seg_len;
                vlist++;
        }
}

static void
mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
{
        off_t seg_len;

        while (offset >= vlist->ds_len) {
                offset -= vlist->ds_len;
                vlist++;
        }

        while (len != 0) {
                seg_len = omin(len, vlist->ds_len - offset);
                bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst,
                    seg_len);
                offset = 0;
                dst = (uint8_t *)dst + seg_len;
                len -= seg_len;
                vlist++;
        }
}

static int
mdstart_preload(struct md_s *sc, struct bio *bp)
{
        uint8_t *p;

        p = sc->pl_ptr + bp->bio_offset;
        switch (bp->bio_cmd) {
        case BIO_READ:
                if ((bp->bio_flags & BIO_VLIST) != 0) {
                        mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data,
                            bp->bio_ma_offset, bp->bio_length);
                } else {
                        bcopy(p, bp->bio_data, bp->bio_length);
                }
                cpu_flush_dcache(bp->bio_data, bp->bio_length);
                break;
        case BIO_WRITE:
                if ((bp->bio_flags & BIO_VLIST) != 0) {
                        mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
                            bp->bio_ma_offset, p, bp->bio_length);
                } else {
                        bcopy(bp->bio_data, p, bp->bio_length);
                }
                break;
        }
        bp->bio_resid = 0;
        return (0);
}

static int
mdstart_vnode(struct md_s *sc, struct bio *bp)
{
        int error;
        struct uio auio;
        struct iovec aiov;
        struct iovec *piov;
        struct mount *mp;
        struct vnode *vp;
        struct buf *pb;
        bus_dma_segment_t *vlist;
        struct thread *td;
        off_t iolen, iostart, len, zerosize;
        int ma_offs, npages;

        switch (bp->bio_cmd) {
        case BIO_READ:
                auio.uio_rw = UIO_READ;
                break;
        case BIO_WRITE:
        case BIO_DELETE:
                auio.uio_rw = UIO_WRITE;
                break;
        case BIO_FLUSH:
                break;
        default:
                return (EOPNOTSUPP);
        }

        td = curthread;
        vp = sc->vnode;
        pb = NULL;
        piov = NULL;
        ma_offs = bp->bio_ma_offset;
        len = bp->bio_length;

        /*
         * VNODE I/O
         *
         * If an error occurs, we set BIO_ERROR but we do not set
         * B_INVAL because (for a write anyway), the buffer is
         * still valid.
         */

        if (bp->bio_cmd == BIO_FLUSH) {
                (void) vn_start_write(vp, &mp, V_WAIT);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                error = VOP_FSYNC(vp, MNT_WAIT, td);
                VOP_UNLOCK(vp);
                vn_finished_write(mp);
                return (error);
        }

        auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
        auio.uio_resid = bp->bio_length;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = td;

        if (bp->bio_cmd == BIO_DELETE) {
                /*
                 * Emulate BIO_DELETE by writing zeros.
                 */
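                /*
                 * Worked example (assuming ZERO_REGION_SIZE is 2 MB, as
                 * on amd64, and 512-byte sectors): zerosize rounds down
                 * to exactly 2 MB, so a 5 MB delete is described by
                 * howmany(5 MB, 2 MB) == 3 iovecs, two of zerosize bytes
                 * and a final one of 1 MB.
                 */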
                zerosize = ZERO_REGION_SIZE -
                    (ZERO_REGION_SIZE % sc->sectorsize);
                auio.uio_iovcnt = howmany(bp->bio_length, zerosize);
                piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK);
                auio.uio_iov = piov;
                while (len > 0) {
                        piov->iov_base = __DECONST(void *, zero_region);
                        piov->iov_len = len;
                        if (len > zerosize)
                                piov->iov_len = zerosize;
                        len -= piov->iov_len;
                        piov++;
                }
                piov = auio.uio_iov;
        } else if ((bp->bio_flags & BIO_VLIST) != 0) {
                piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
                auio.uio_iov = piov;
                vlist = (bus_dma_segment_t *)bp->bio_data;
                while (len > 0) {
                        piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
                            ma_offs);
                        piov->iov_len = vlist->ds_len - ma_offs;
                        if (piov->iov_len > len)
                                piov->iov_len = len;
                        len -= piov->iov_len;
                        ma_offs = 0;
                        vlist++;
                        piov++;
                }
                auio.uio_iovcnt = piov - auio.uio_iov;
                piov = auio.uio_iov;
        } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
                pb = uma_zalloc(md_pbuf_zone, M_WAITOK);
                MPASS((pb->b_flags & B_MAXPHYS) != 0);
                bp->bio_resid = len;
unmapped_step:
                npages = atop(min(maxphys, round_page(len + (ma_offs &
                    PAGE_MASK))));
                iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
                KASSERT(iolen > 0, ("zero iolen"));
                pmap_qenter((vm_offset_t)pb->b_data,
                    &bp->bio_ma[atop(ma_offs)], npages);
                aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
                    (ma_offs & PAGE_MASK));
                aiov.iov_len = iolen;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iolen;
        } else {
                aiov.iov_base = bp->bio_data;
                aiov.iov_len = bp->bio_length;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
        }
        iostart = auio.uio_offset;
        if (auio.uio_rw == UIO_READ) {
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                error = VOP_READ(vp, &auio, 0, sc->cred);
                VOP_UNLOCK(vp);
        } else {
                (void) vn_start_write(vp, &mp, V_WAIT);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
                    sc->cred);
                VOP_UNLOCK(vp);
                vn_finished_write(mp);
                if (error == 0)
                        sc->flags &= ~MD_VERIFY;
        }

        /* When MD_CACHE is set, try to avoid double-caching the data. */
        if (error == 0 && (sc->flags & MD_CACHE) == 0)
                VOP_ADVISE(vp, iostart, auio.uio_offset - 1,
                    POSIX_FADV_DONTNEED);

        if (pb != NULL) {
                pmap_qremove((vm_offset_t)pb->b_data, npages);
                if (error == 0) {
                        len -= iolen;
                        bp->bio_resid -= iolen;
                        ma_offs += iolen;
                        if (len > 0)
                                goto unmapped_step;
                }
                uma_zfree(md_pbuf_zone, pb);
        } else {
                bp->bio_resid = auio.uio_resid;
        }

        free(piov, M_MD);
        return (error);
}

static int
mdstart_swap(struct md_s *sc, struct bio *bp)
{
        vm_page_t m;
        u_char *p;
        vm_pindex_t i, lastp;
        bus_dma_segment_t *vlist;
        int rv, ma_offs, offs, len, lastend;

        switch (bp->bio_cmd) {
        case BIO_READ:
        case BIO_WRITE:
        case BIO_DELETE:
                break;
        default:
                return (EOPNOTSUPP);
        }

        p = bp->bio_data;
        ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
            bp->bio_ma_offset : 0;
        vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
            (bus_dma_segment_t *)bp->bio_data : NULL;

        /*
         * offs is the offset at which to start operating on the
         * next (ie, first) page.  lastp is the last page on
         * which we're going to operate.  lastend is the ending
         * position within that last page (ie, PAGE_SIZE if
         * we're operating on complete aligned pages).
         */
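        /*
         * Worked example with 4 KB pages: for bio_offset == 6144 and
         * bio_length == 4096, the loop starts at page 1 with offs == 2048,
         * lastp == (6144 + 4096 - 1) / 4096 == 2, and lastend == 2048,
         * i.e. the second half of page 1 plus the first half of page 2.
         */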
 1052         offs = bp->bio_offset % PAGE_SIZE;
 1053         lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
 1054         lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
 1055 
 1056         rv = VM_PAGER_OK;
 1057         vm_object_pip_add(sc->object, 1);
 1058         for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
 1059                 len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
 1060                 m = vm_page_grab_unlocked(sc->object, i, VM_ALLOC_SYSTEM);
 1061                 if (bp->bio_cmd == BIO_READ) {
 1062                         if (vm_page_all_valid(m))
 1063                                 rv = VM_PAGER_OK;
 1064                         else
 1065                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1066                                     NULL, NULL);
 1067                         if (rv == VM_PAGER_ERROR) {
 1068                                 VM_OBJECT_WLOCK(sc->object);
 1069                                 vm_page_free(m);
 1070                                 VM_OBJECT_WUNLOCK(sc->object);
 1071                                 break;
 1072                         } else if (rv == VM_PAGER_FAIL) {
 1073                                 /*
 1074                                  * Pager does not have the page.  Zero
 1075                                  * the allocated page, and mark it as
 1076                                  * valid. Do not set dirty, the page
 1077                                  * can be recreated if thrown out.
 1078                                  */
 1079                                 pmap_zero_page(m);
 1080                                 vm_page_valid(m);
 1081                         }
 1082                         if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 1083                                 pmap_copy_pages(&m, offs, bp->bio_ma,
 1084                                     ma_offs, len);
 1085                         } else if ((bp->bio_flags & BIO_VLIST) != 0) {
 1086                                 physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
 1087                                     vlist, ma_offs, len);
 1088                                 cpu_flush_dcache(p, len);
 1089                         } else {
 1090                                 physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
 1091                                 cpu_flush_dcache(p, len);
 1092                         }
 1093                 } else if (bp->bio_cmd == BIO_WRITE) {
 1094                         if (len == PAGE_SIZE || vm_page_all_valid(m))
 1095                                 rv = VM_PAGER_OK;
 1096                         else
 1097                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1098                                     NULL, NULL);
 1099                         if (rv == VM_PAGER_ERROR) {
 1100                                 VM_OBJECT_WLOCK(sc->object);
 1101                                 vm_page_free(m);
 1102                                 VM_OBJECT_WUNLOCK(sc->object);
 1103                                 break;
 1104                         } else if (rv == VM_PAGER_FAIL)
 1105                                 pmap_zero_page(m);
 1106 
 1107                         if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 1108                                 pmap_copy_pages(bp->bio_ma, ma_offs, &m,
 1109                                     offs, len);
 1110                         } else if ((bp->bio_flags & BIO_VLIST) != 0) {
 1111                                 physcopyin_vlist(vlist, ma_offs,
 1112                                     VM_PAGE_TO_PHYS(m) + offs, len);
 1113                         } else {
 1114                                 physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
 1115                         }
 1116 
 1117                         vm_page_valid(m);
 1118                         vm_page_set_dirty(m);
 1119                 } else if (bp->bio_cmd == BIO_DELETE) {
 1120                         if (len == PAGE_SIZE || vm_page_all_valid(m))
 1121                                 rv = VM_PAGER_OK;
 1122                         else
 1123                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1124                                     NULL, NULL);
 1125                         VM_OBJECT_WLOCK(sc->object);
 1126                         if (rv == VM_PAGER_ERROR) {
 1127                                 vm_page_free(m);
 1128                                 VM_OBJECT_WUNLOCK(sc->object);
 1129                                 break;
 1130                         } else if (rv == VM_PAGER_FAIL) {
 1131                                 vm_page_free(m);
 1132                                 m = NULL;
 1133                         } else {
 1134                                 /* Page is valid. */
 1135                                 if (len != PAGE_SIZE) {
 1136                                         pmap_zero_page_area(m, offs, len);
 1137                                         vm_page_set_dirty(m);
 1138                                 } else {
 1139                                         vm_pager_page_unswapped(m);
 1140                                         vm_page_free(m);
 1141                                         m = NULL;
 1142                                 }
 1143                         }
 1144                         VM_OBJECT_WUNLOCK(sc->object);
 1145                 }
 1146                 if (m != NULL) {
 1147                         /*
 1148                          * The page may be deactivated prior to setting
 1149                          * PGA_REFERENCED, but in this case it will be
 1150                          * reactivated by the page daemon.
 1151                          */
 1152                         if (vm_page_active(m))
 1153                                 vm_page_reference(m);
 1154                         else
 1155                                 vm_page_activate(m);
 1156                         vm_page_xunbusy(m);
 1157                 }
 1158 
 1159                 /* Actions on further pages start at offset 0 */
 1160                 p += PAGE_SIZE - offs;
 1161                 offs = 0;
 1162                 ma_offs += len;
 1163         }
 1164         vm_object_pip_wakeup(sc->object);
 1165         return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
 1166 }
 1167 
 1168 static int
 1169 mdstart_null(struct md_s *sc, struct bio *bp)
 1170 {
 1171 
 1172         switch (bp->bio_cmd) {
 1173         case BIO_READ:
 1174                 bzero(bp->bio_data, bp->bio_length);
 1175                 cpu_flush_dcache(bp->bio_data, bp->bio_length);
 1176                 break;
 1177         case BIO_WRITE:
 1178                 break;
 1179         }
 1180         bp->bio_resid = 0;
 1181         return (0);
 1182 }
 1183 
 1184 static void
 1185 md_handleattr(struct md_s *sc, struct bio *bp)
 1186 {
 1187         if (sc->fwsectors && sc->fwheads &&
 1188             (g_handleattr_int(bp, "GEOM::fwsectors", sc->fwsectors) != 0 ||
 1189             g_handleattr_int(bp, "GEOM::fwheads", sc->fwheads) != 0))
 1190                 return;
 1191         if (g_handleattr_int(bp, "GEOM::candelete", 1) != 0)
 1192                 return;
 1193         if (sc->ident[0] != '\0' &&
 1194             g_handleattr_str(bp, "GEOM::ident", sc->ident) != 0)
 1195                 return;
 1196         if (g_handleattr_int(bp, "MNT::verified", (sc->flags & MD_VERIFY) != 0))
 1197                 return;
 1198         g_io_deliver(bp, EOPNOTSUPP);
 1199 }
 1200 
 1201 static void
 1202 md_kthread(void *arg)
 1203 {
 1204         struct md_s *sc;
 1205         struct bio *bp;
 1206         int error;
 1207 
 1208         sc = arg;
 1209         thread_lock(curthread);
 1210         sched_prio(curthread, PRIBIO);
 1211         thread_unlock(curthread);
 1212         if (sc->type == MD_VNODE)
 1213                 curthread->td_pflags |= TDP_NORUNNINGBUF;
 1214 
 1215         for (;;) {
 1216                 mtx_lock(&sc->queue_mtx);
 1217                 if (sc->flags & MD_SHUTDOWN) {
 1218                         sc->flags |= MD_EXITING;
 1219                         mtx_unlock(&sc->queue_mtx);
 1220                         kproc_exit(0);
 1221                 }
 1222                 bp = bioq_takefirst(&sc->bio_queue);
 1223                 if (!bp) {
 1224                         msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
 1225                         continue;
 1226                 }
 1227                 mtx_unlock(&sc->queue_mtx);
 1228                 if (bp->bio_cmd == BIO_GETATTR) {
 1229                         md_handleattr(sc, bp);
 1230                 } else {
 1231                         error = sc->start(sc, bp);
 1232                         if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
 1233                                 /*
 1234                                  * Devstat uses (bio_bcount, bio_resid) for
 1235                                  * determining the length of the completed part
 1236                                  * of the i/o.  g_io_deliver() will translate
 1237                                  * from bio_completed to that, but it also
 1238                                  * destroys the bio so we must do our own
 1239                                  * translation.
 1240                                  */
 1241                                 bp->bio_bcount = bp->bio_length;
 1242                                 devstat_end_transaction_bio(sc->devstat, bp);
 1243                         }
 1244                         bp->bio_completed = bp->bio_length - bp->bio_resid;
 1245                         g_io_deliver(bp, error);
 1246                 }
 1247         }
 1248 }
 1249 
 1250 static struct md_s *
 1251 mdfind(int unit)
 1252 {
 1253         struct md_s *sc;
 1254 
 1255         LIST_FOREACH(sc, &md_softc_list, list) {
 1256                 if (sc->unit == unit)
 1257                         break;
 1258         }
 1259         return (sc);
 1260 }
 1261 
 1262 static struct md_s *
 1263 mdnew(int unit, int *errp, enum md_types type)
 1264 {
 1265         struct md_s *sc;
 1266         int error;
 1267 
 1268         *errp = 0;
 1269         if (unit == -1)
 1270                 unit = alloc_unr(md_uh);
 1271         else
 1272                 unit = alloc_unr_specific(md_uh, unit);
 1273 
 1274         if (unit == -1) {
 1275                 *errp = EBUSY;
 1276                 return (NULL);
 1277         }
 1278 
 1279         sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
 1280         sc->type = type;
 1281         bioq_init(&sc->bio_queue);
 1282         mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
 1283         sc->unit = unit;
 1284         sprintf(sc->name, "md%d", unit);
 1285         LIST_INSERT_HEAD(&md_softc_list, sc, list);
 1286         error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
 1287         if (error == 0)
 1288                 return (sc);
 1289         LIST_REMOVE(sc, list);
 1290         mtx_destroy(&sc->queue_mtx);
 1291         free_unr(md_uh, sc->unit);
 1292         free(sc, M_MD);
 1293         *errp = error;
 1294         return (NULL);
 1295 }
 1296 
 1297 static void
 1298 mdinit(struct md_s *sc)
 1299 {
 1300         struct g_geom *gp;
 1301         struct g_provider *pp;
 1302 
 1303         g_topology_lock();
 1304         gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
 1305         gp->softc = sc;
 1306         pp = g_new_providerf(gp, "md%d", sc->unit);
 1307         devstat_remove_entry(pp->stat);
 1308         pp->stat = NULL;
 1309         pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 1310         pp->mediasize = sc->mediasize;
 1311         pp->sectorsize = sc->sectorsize;
 1312         switch (sc->type) {
 1313         case MD_MALLOC:
 1314         case MD_VNODE:
 1315         case MD_SWAP:
 1316                 pp->flags |= G_PF_ACCEPT_UNMAPPED;
 1317                 break;
 1318         case MD_PRELOAD:
 1319         case MD_NULL:
 1320                 break;
 1321         }
 1322         sc->gp = gp;
 1323         sc->pp = pp;
 1324         sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
 1325             DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 1326         sc->devstat->id = pp;
 1327         g_error_provider(pp, 0);
 1328         g_topology_unlock();
 1329 }
 1330 
 1331 static int
 1332 mdcreate_malloc(struct md_s *sc, struct md_req *mdr)
 1333 {
 1334         uintptr_t sp;
 1335         int error;
 1336         off_t u;
 1337 
 1338         error = 0;
 1339         if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
 1340                 return (EINVAL);
 1341         if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize))
 1342                 return (EINVAL);
 1343         /* Compression doesn't make sense if we have reserved space */
 1344         if (mdr->md_options & MD_RESERVE)
 1345                 mdr->md_options &= ~MD_COMPRESS;
 1346         if (mdr->md_fwsectors != 0)
 1347                 sc->fwsectors = mdr->md_fwsectors;
 1348         if (mdr->md_fwheads != 0)
 1349                 sc->fwheads = mdr->md_fwheads;
 1350         sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE);
 1351         sc->indir = dimension(sc->mediasize / sc->sectorsize);
 1352         sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
 1353             0x1ff, 0);
 1354         if (mdr->md_options & MD_RESERVE) {
 1355                 off_t nsectors;
 1356 
 1357                 nsectors = sc->mediasize / sc->sectorsize;
 1358                 for (u = 0; u < nsectors; u++) {
 1359                         sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
 1360                             M_WAITOK : M_NOWAIT) | M_ZERO);
 1361                         if (sp != 0)
 1362                                 error = s_write(sc->indir, u, sp);
 1363                         else
 1364                                 error = ENOMEM;
 1365                         if (error != 0)
 1366                                 break;
 1367                 }
 1368         }
 1369         return (error);
 1370 }
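
      /*
       * [Editor's note] Malloc-backed sectors are carved from the dedicated
       * uma(9) zone created above; the 0x1ff align argument is a mask
       * requesting 512-byte alignment for each sector.  With MD_RESERVE the
       * loop above allocates and zero-fills every sector up front, so later
       * writes cannot fail with ENOMEM; without it, sectors are allocated
       * lazily on first write.
       */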
 1371 
 1372 static int
 1373 mdsetcred(struct md_s *sc, struct ucred *cred)
 1374 {
 1375         char *tmpbuf;
 1376         int error = 0;
 1377 
 1378         /*
 1379          * Set credentials in our softc
 1380          */
 1381 
 1382         if (sc->cred)
 1383                 crfree(sc->cred);
 1384         sc->cred = crhold(cred);
 1385 
 1386         /*
 1387          * Horrible kludge to establish credentials for NFS  XXX: the
                * throwaway one-sector VOP_READ() below exercises the backing
                * vnode with the new credentials once (presumably to prime NFS
                * state) before real I/O is issued with them.
 1388          */
 1389 
 1390         if (sc->vnode) {
 1391                 struct uio auio;
 1392                 struct iovec aiov;
 1393 
 1394                 tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
 1395                 bzero(&auio, sizeof(auio));
 1396 
 1397                 aiov.iov_base = tmpbuf;
 1398                 aiov.iov_len = sc->sectorsize;
 1399                 auio.uio_iov = &aiov;
 1400                 auio.uio_iovcnt = 1;
 1401                 auio.uio_offset = 0;
 1402                 auio.uio_rw = UIO_READ;
 1403                 auio.uio_segflg = UIO_SYSSPACE;
 1404                 auio.uio_resid = aiov.iov_len;
 1405                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 1406                 error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
 1407                 VOP_UNLOCK(sc->vnode);
 1408                 free(tmpbuf, M_TEMP);
 1409         }
 1410         return (error);
 1411 }
 1412 
 1413 static int
 1414 mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1415 {
 1416         struct vattr vattr;
 1417         struct nameidata nd;
 1418         char *fname;
 1419         int error, flags;
 1420 
 1421         fname = mdr->md_file;
 1422         if (mdr->md_file_seg == UIO_USERSPACE) {
 1423                 error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
 1424                 if (error != 0)
 1425                         return (error);
 1426         } else if (mdr->md_file_seg == UIO_SYSSPACE)
 1427                 strlcpy(sc->file, fname, sizeof(sc->file));
 1428         else
 1429                 return (EDOOFUS);
 1430 
 1431         /*
 1432          * If the user specified that this is a read-only device, don't
 1433          * set the FWRITE mask before trying to open the backing store.
 1434          */
 1435         flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) |
 1436             ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0);
 1437         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
 1438         error = vn_open(&nd, &flags, 0, NULL);
 1439         if (error != 0)
 1440                 return (error);
 1441         NDFREE(&nd, NDF_ONLY_PNBUF);
 1442         if (nd.ni_vp->v_type != VREG) {
 1443                 error = EINVAL;
 1444                 goto bad;
 1445         }
 1446         error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
 1447         if (error != 0)
 1448                 goto bad;
 1449         if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
 1450                 vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
 1451                 if (VN_IS_DOOMED(nd.ni_vp)) {
 1452                         /* Forced unmount. */
 1453                         error = EBADF;
 1454                         goto bad;
 1455                 }
 1456         }
 1457         nd.ni_vp->v_vflag |= VV_MD;
 1458         VOP_UNLOCK(nd.ni_vp);
 1459 
 1460         if (mdr->md_fwsectors != 0)
 1461                 sc->fwsectors = mdr->md_fwsectors;
 1462         if (mdr->md_fwheads != 0)
 1463                 sc->fwheads = mdr->md_fwheads;
 1464         snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju",
 1465             (uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid);
 1466         sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE |
 1467             MD_VERIFY);
 1468         if (!(flags & FWRITE))
 1469                 sc->flags |= MD_READONLY;
 1470         sc->vnode = nd.ni_vp;
 1471 
 1472         error = mdsetcred(sc, td->td_ucred);
 1473         if (error != 0) {
 1474                 sc->vnode = NULL;
 1475                 vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
 1476                 nd.ni_vp->v_vflag &= ~VV_MD;
 1477                 goto bad;
 1478         }
 1479         return (0);
 1480 bad:
 1481         VOP_UNLOCK(nd.ni_vp);
 1482         (void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
 1483         return (error);
 1484 }
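
      /*
       * [Editor's sketch] A vnode-backed unit is what "mdconfig -a -t vnode
       * -f /path/img" creates.  A hedged userland equivalent through the
       * control device; struct md_ioctl, MDIOCATTACH, MDIOVERSION and
       * MDCTL_NAME come from <sys/mdioctl.h>, and "file_size" is a
       * placeholder the caller must fill in (the attach path rejects sizes
       * below one sector, so mdconfig(8) stats the file first):
       */
      #if 0
              struct md_ioctl mdio;
              int fd;

              memset(&mdio, 0, sizeof(mdio));
              mdio.md_version = MDIOVERSION;
              mdio.md_type = MD_VNODE;
              mdio.md_options = MD_AUTOUNIT;
              mdio.md_file = "/path/img";
              mdio.md_mediasize = file_size;  /* e.g. from stat(2) */
              fd = open("/dev/" MDCTL_NAME, O_RDWR);
              ioctl(fd, MDIOCATTACH, &mdio);  /* unit comes back in md_unit */
      #endif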
 1485 
 1486 static void
 1487 g_md_providergone(struct g_provider *pp)
 1488 {
 1489         struct md_s *sc = pp->geom->softc;
 1490 
 1491         mtx_lock(&sc->queue_mtx);
 1492         sc->flags |= MD_PROVIDERGONE;
 1493         wakeup(&sc->flags);
 1494         mtx_unlock(&sc->queue_mtx);
 1495 }
 1496 
 1497 static int
 1498 mddestroy(struct md_s *sc, struct thread *td)
 1499 {
 1500 
 1501         if (sc->gp) {
 1502                 g_topology_lock();
 1503                 g_wither_geom(sc->gp, ENXIO);
 1504                 g_topology_unlock();
 1505 
 1506                 mtx_lock(&sc->queue_mtx);
 1507                 while (!(sc->flags & MD_PROVIDERGONE))
 1508                         msleep(&sc->flags, &sc->queue_mtx, PRIBIO, "mddestroy", 0);
 1509                 mtx_unlock(&sc->queue_mtx);
 1510         }
 1511         if (sc->devstat) {
 1512                 devstat_remove_entry(sc->devstat);
 1513                 sc->devstat = NULL;
 1514         }
 1515         mtx_lock(&sc->queue_mtx);
 1516         sc->flags |= MD_SHUTDOWN;
 1517         wakeup(sc);
 1518         while (!(sc->flags & MD_EXITING))
 1519                 msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
 1520         mtx_unlock(&sc->queue_mtx);
 1521         mtx_destroy(&sc->queue_mtx);
 1522         if (sc->vnode != NULL) {
 1523                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 1524                 sc->vnode->v_vflag &= ~VV_MD;
 1525                 VOP_UNLOCK(sc->vnode);
 1526                 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
 1527                     FREAD : (FREAD|FWRITE), sc->cred, td);
 1528         }
 1529         if (sc->cred != NULL)
 1530                 crfree(sc->cred);
 1531         if (sc->object != NULL)
 1532                 vm_object_deallocate(sc->object);
 1533         if (sc->indir)
 1534                 destroy_indir(sc, sc->indir);
 1535         if (sc->uma)
 1536                 uma_zdestroy(sc->uma);
 1537 
 1538         LIST_REMOVE(sc, list);
 1539         free_unr(md_uh, sc->unit);
 1540         free(sc, M_MD);
 1541         return (0);
 1542 }
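
      /*
       * [Editor's note] Teardown is two handshakes: mddestroy() first
       * withers the geom and waits for GEOM to call g_md_providergone()
       * (MD_PROVIDERGONE), guaranteeing no new bios can arrive; it then
       * sets MD_SHUTDOWN and waits for the per-unit kthread to acknowledge
       * with MD_EXITING before the queue mutex and softc are torn down.
       */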
 1543 
 1544 static int
 1545 mdresize(struct md_s *sc, struct md_req *mdr)
 1546 {
 1547         int error, res;
 1548         vm_pindex_t oldpages, newpages;
 1549 
 1550         switch (sc->type) {
 1551         case MD_VNODE:
 1552         case MD_NULL:
 1553                 break;
 1554         case MD_SWAP:
 1555                 if (mdr->md_mediasize <= 0 ||
 1556                     (mdr->md_mediasize % PAGE_SIZE) != 0)
 1557                         return (EDOM);
 1558                 oldpages = OFF_TO_IDX(sc->mediasize);
 1559                 newpages = OFF_TO_IDX(mdr->md_mediasize);
 1560                 if (newpages < oldpages) {
 1561                         VM_OBJECT_WLOCK(sc->object);
 1562                         vm_object_page_remove(sc->object, newpages, 0, 0);
 1563                         swap_release_by_cred(IDX_TO_OFF(oldpages -
 1564                             newpages), sc->cred);
 1565                         sc->object->charge = IDX_TO_OFF(newpages);
 1566                         sc->object->size = newpages;
 1567                         VM_OBJECT_WUNLOCK(sc->object);
 1568                 } else if (newpages > oldpages) {
 1569                         res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
 1570                             oldpages), sc->cred);
 1571                         if (!res)
 1572                                 return (ENOMEM);
 1573                         if ((mdr->md_options & MD_RESERVE) ||
 1574                             (sc->flags & MD_RESERVE)) {
 1575                                 error = swap_pager_reserve(sc->object,
 1576                                     oldpages, newpages - oldpages);
 1577                                 if (error < 0) {
 1578                                         swap_release_by_cred(
 1579                                             IDX_TO_OFF(newpages - oldpages),
 1580                                             sc->cred);
 1581                                         return (EDOM);
 1582                                 }
 1583                         }
 1584                         VM_OBJECT_WLOCK(sc->object);
 1585                         sc->object->charge = IDX_TO_OFF(newpages);
 1586                         sc->object->size = newpages;
 1587                         VM_OBJECT_WUNLOCK(sc->object);
 1588                 }
 1589                 break;
 1590         default:
 1591                 return (EOPNOTSUPP);
 1592         }
 1593 
 1594         sc->mediasize = mdr->md_mediasize;
 1595 
 1596         g_topology_lock();
 1597         g_resize_provider(sc->pp, sc->mediasize);
 1598         g_topology_unlock();
 1599         return (0);
 1600 }
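
      /*
       * [Editor's note] For swap-backed units, growing must first charge
       * the new pages to the owner via swap_reserve_by_cred(), and
       * shrinking credits them back; OFF_TO_IDX()/IDX_TO_OFF() convert
       * between byte offsets and page indices.  A tiny sketch, assuming
       * 4 KB pages:
       */
      #if 0
              vm_pindex_t idx = OFF_TO_IDX(1 << 20);  /* 1 MiB -> 256 pages */
              off_t bytes = IDX_TO_OFF(idx);          /* back to 1 MiB */
      #endif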
 1601 
 1602 static int
 1603 mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1604 {
 1605         vm_ooffset_t npage;
 1606         int error;
 1607 
 1608         /*
 1609          * Range check.  Disallow non-positive sizes and sizes that are
 1610          * not a multiple of the page size.
 1611          */
 1612         if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 1613                 return (EDOM);
 1614 
 1615         /*
 1616          * Allocate an OBJT_SWAP object.
 1617          *
 1618          * Note the truncation in npage = md_mediasize / PAGE_SIZE below.
 1619          */
 1620 
 1621         if ((mdr->md_options & MD_VERIFY) != 0)
 1622                 return (EINVAL);
 1623         npage = mdr->md_mediasize / PAGE_SIZE;
 1624         if (mdr->md_fwsectors != 0)
 1625                 sc->fwsectors = mdr->md_fwsectors;
 1626         if (mdr->md_fwheads != 0)
 1627                 sc->fwheads = mdr->md_fwheads;
 1628         sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
 1629             VM_PROT_DEFAULT, 0, td->td_ucred);
 1630         if (sc->object == NULL)
 1631                 return (ENOMEM);
 1632         sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE);
 1633         if (mdr->md_options & MD_RESERVE) {
 1634                 if (swap_pager_reserve(sc->object, 0, npage) < 0) {
 1635                         error = EDOM;
 1636                         goto finish;
 1637                 }
 1638         }
 1639         error = mdsetcred(sc, td->td_ucred);
 1640  finish:
 1641         if (error != 0) {
 1642                 vm_object_deallocate(sc->object);
 1643                 sc->object = NULL;
 1644         }
 1645         return (error);
 1646 }
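
      /*
       * [Editor's note] Without MD_RESERVE the swap object is populated
       * lazily as sectors are first written; with it, swap_pager_reserve()
       * pre-reserves all npage pages so later writes cannot run out of
       * swap.  From userland this is the difference between, e.g.,
       * "mdconfig -a -t swap -s 1g" and the same command with "-o reserve"
       * (option spelling hedged; see mdconfig(8)).
       */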
 1647 
 1648 static int
 1649 mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1650 {
 1651 
 1652         /*
 1653          * Range check.  Disallow non-positive sizes and sizes that are
 1654          * not a multiple of the page size.
 1655          */
 1656         if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 1657                 return (EDOM);
 1658 
 1659         return (0);
 1660 }
 1661 
 1662 static int
 1663 kern_mdattach_locked(struct thread *td, struct md_req *mdr)
 1664 {
 1665         struct md_s *sc;
 1666         unsigned sectsize;
 1667         int error, i;
 1668 
 1669         sx_assert(&md_sx, SA_XLOCKED);
 1670 
 1671         switch (mdr->md_type) {
 1672         case MD_MALLOC:
 1673         case MD_PRELOAD:
 1674         case MD_VNODE:
 1675         case MD_SWAP:
 1676         case MD_NULL:
 1677                 break;
 1678         default:
 1679                 return (EINVAL);
 1680         }
 1681         if (mdr->md_sectorsize == 0)
 1682                 sectsize = DEV_BSIZE;
 1683         else
 1684                 sectsize = mdr->md_sectorsize;
 1685         if (sectsize > maxphys || mdr->md_mediasize < sectsize)
 1686                 return (EINVAL);
 1687         if (mdr->md_options & MD_AUTOUNIT)
 1688                 sc = mdnew(-1, &error, mdr->md_type);
 1689         else {
 1690                 if (mdr->md_unit > INT_MAX)
 1691                         return (EINVAL);
 1692                 sc = mdnew(mdr->md_unit, &error, mdr->md_type);
 1693         }
 1694         if (sc == NULL)
 1695                 return (error);
 1696         if (mdr->md_label != NULL)
 1697                 error = copyinstr(mdr->md_label, sc->label,
 1698                     sizeof(sc->label), NULL);
 1699         if (error != 0)
 1700                 goto err_after_new;
 1701         if (mdr->md_options & MD_AUTOUNIT)
 1702                 mdr->md_unit = sc->unit;
 1703         sc->mediasize = mdr->md_mediasize;
 1704         sc->sectorsize = sectsize;
 1705         error = EDOOFUS;
 1706         switch (sc->type) {
 1707         case MD_MALLOC:
 1708                 sc->start = mdstart_malloc;
 1709                 error = mdcreate_malloc(sc, mdr);
 1710                 break;
 1711         case MD_PRELOAD:
 1712                 /*
 1713                  * We disallow attaching preloaded memory disks via
 1714                  * ioctl. Preloaded memory disks are automatically
 1715                  * attached in g_md_init().
 1716                  */
 1717                 error = EOPNOTSUPP;
 1718                 break;
 1719         case MD_VNODE:
 1720                 sc->start = mdstart_vnode;
 1721                 error = mdcreate_vnode(sc, mdr, td);
 1722                 break;
 1723         case MD_SWAP:
 1724                 sc->start = mdstart_swap;
 1725                 error = mdcreate_swap(sc, mdr, td);
 1726                 break;
 1727         case MD_NULL:
 1728                 sc->start = mdstart_null;
 1729                 error = mdcreate_null(sc, mdr, td);
 1730                 break;
 1731         }
 1732 err_after_new:
 1733         if (error != 0) {
 1734                 mddestroy(sc, td);
 1735                 return (error);
 1736         }
 1737 
 1738         /* Prune off any residual fractional sector */
 1739         i = sc->mediasize % sc->sectorsize;
 1740         sc->mediasize -= i;
 1741 
 1742         mdinit(sc);
 1743         return (0);
 1744 }
 1745 
 1746 static int
 1747 kern_mdattach(struct thread *td, struct md_req *mdr)
 1748 {
 1749         int error;
 1750 
 1751         sx_xlock(&md_sx);
 1752         error = kern_mdattach_locked(td, mdr);
 1753         sx_xunlock(&md_sx);
 1754         return (error);
 1755 }
 1756 
 1757 static int
 1758 kern_mddetach_locked(struct thread *td, struct md_req *mdr)
 1759 {
 1760         struct md_s *sc;
 1761 
 1762         sx_assert(&md_sx, SA_XLOCKED);
 1763 
 1764         if (mdr->md_mediasize != 0 ||
 1765             (mdr->md_options & ~MD_FORCE) != 0)
 1766                 return (EINVAL);
 1767 
 1768         sc = mdfind(mdr->md_unit);
 1769         if (sc == NULL)
 1770                 return (ENOENT);
 1771         if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
 1772             !(mdr->md_options & MD_FORCE))
 1773                 return (EBUSY);
 1774         return (mddestroy(sc, td));
 1775 }
 1776 
 1777 static int
 1778 kern_mddetach(struct thread *td, struct md_req *mdr)
 1779 {
 1780         int error;
 1781 
 1782         sx_xlock(&md_sx);
 1783         error = kern_mddetach_locked(td, mdr);
 1784         sx_xunlock(&md_sx);
 1785         return (error);
 1786 }
 1787 
 1788 static int
 1789 kern_mdresize_locked(struct md_req *mdr)
 1790 {
 1791         struct md_s *sc;
 1792 
 1793         sx_assert(&md_sx, SA_XLOCKED);
 1794 
 1795         if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
 1796                 return (EINVAL);
 1797 
 1798         sc = mdfind(mdr->md_unit);
 1799         if (sc == NULL)
 1800                 return (ENOENT);
 1801         if (mdr->md_mediasize < sc->sectorsize)
 1802                 return (EINVAL);
 1803         mdr->md_mediasize -= mdr->md_mediasize % sc->sectorsize;
 1804         if (mdr->md_mediasize < sc->mediasize &&
 1805             !(sc->flags & MD_FORCE) &&
 1806             !(mdr->md_options & MD_FORCE))
 1807                 return (EBUSY);
 1808         return (mdresize(sc, mdr));
 1809 }
 1810 
 1811 static int
 1812 kern_mdresize(struct md_req *mdr)
 1813 {
 1814         int error;
 1815 
 1816         sx_xlock(&md_sx);
 1817         error = kern_mdresize_locked(mdr);
 1818         sx_xunlock(&md_sx);
 1819         return (error);
 1820 }
 1821 
 1822 static int
 1823 kern_mdquery_locked(struct md_req *mdr)
 1824 {
 1825         struct md_s *sc;
 1826         int error;
 1827 
 1828         sx_assert(&md_sx, SA_XLOCKED);
 1829 
 1830         sc = mdfind(mdr->md_unit);
 1831         if (sc == NULL)
 1832                 return (ENOENT);
 1833         mdr->md_type = sc->type;
 1834         mdr->md_options = sc->flags;
 1835         mdr->md_mediasize = sc->mediasize;
 1836         mdr->md_sectorsize = sc->sectorsize;
 1837         error = 0;
 1838         if (mdr->md_label != NULL) {
 1839                 error = copyout(sc->label, mdr->md_label,
 1840                     strlen(sc->label) + 1);
 1841                 if (error != 0)
 1842                         return (error);
 1843         }
 1844         if (sc->type == MD_VNODE ||
 1845             (sc->type == MD_PRELOAD && mdr->md_file != NULL))
 1846                 error = copyout(sc->file, mdr->md_file,
 1847                     strlen(sc->file) + 1);
 1848         return (error);
 1849 }
 1850 
 1851 static int
 1852 kern_mdquery(struct md_req *mdr)
 1853 {
 1854         int error;
 1855 
 1856         sx_xlock(&md_sx);
 1857         error = kern_mdquery_locked(mdr);
 1858         sx_xunlock(&md_sx);
 1859         return (error);
 1860 }
 1861 
 1862 /* Copy members that are not userspace pointers. */
 1863 #define MD_IOCTL2REQ(mdio, mdr) do {                                    \
 1864         (mdr)->md_unit = (mdio)->md_unit;                               \
 1865         (mdr)->md_type = (mdio)->md_type;                               \
 1866         (mdr)->md_mediasize = (mdio)->md_mediasize;                     \
 1867         (mdr)->md_sectorsize = (mdio)->md_sectorsize;                   \
 1868         (mdr)->md_options = (mdio)->md_options;                         \
 1869         (mdr)->md_fwheads = (mdio)->md_fwheads;                         \
 1870         (mdr)->md_fwsectors = (mdio)->md_fwsectors;                     \
 1871         (mdr)->md_units = &(mdio)->md_pad[0];                           \
 1872         (mdr)->md_units_nitems = nitems((mdio)->md_pad);                \
 1873 } while (0)
 1874 
 1875 /* Copy members that might have been updated. */
 1876 #define MD_REQ2IOCTL(mdr, mdio) do {                                    \
 1877         (mdio)->md_unit = (mdr)->md_unit;                               \
 1878         (mdio)->md_type = (mdr)->md_type;                               \
 1879         (mdio)->md_mediasize = (mdr)->md_mediasize;                     \
 1880         (mdio)->md_sectorsize = (mdr)->md_sectorsize;                   \
 1881         (mdio)->md_options = (mdr)->md_options;                         \
 1882         (mdio)->md_fwheads = (mdr)->md_fwheads;                         \
 1883         (mdio)->md_fwsectors = (mdr)->md_fwsectors;                     \
 1884 } while (0)
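
      /*
       * [Editor's note] The two macros above name only struct members, so a
       * single body serves both struct md_ioctl and the COMPAT_FREEBSD32
       * struct md_ioctl32.  Only the pointer-valued members (md_file,
       * md_label) differ per ABI; mdctlioctl() below copies those by hand,
       * widening the 32-bit values through uintptr_t.
       */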
 1885 
 1886 static int
 1887 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
 1888     struct thread *td)
 1889 {
 1890         struct md_req mdr;
 1891         int error;
 1892 
 1893         if (md_debug)
 1894                 printf("mdctlioctl(%s %lx %p %x %p)\n",
 1895                         devtoname(dev), cmd, addr, flags, td);
 1896 
 1897         bzero(&mdr, sizeof(mdr));
 1898         switch (cmd) {
 1899         case MDIOCATTACH:
 1900         case MDIOCDETACH:
 1901         case MDIOCRESIZE:
 1902         case MDIOCQUERY: {
 1903                 struct md_ioctl *mdio = (struct md_ioctl *)addr;
 1904                 if (mdio->md_version != MDIOVERSION)
 1905                         return (EINVAL);
 1906                 MD_IOCTL2REQ(mdio, &mdr);
 1907                 mdr.md_file = mdio->md_file;
 1908                 mdr.md_file_seg = UIO_USERSPACE;
 1909                 /* If the file is adjacent to the md_ioctl, it is in kernel space. */
 1910                 if ((void *)mdio->md_file == (void *)(mdio + 1))
 1911                         mdr.md_file_seg = UIO_SYSSPACE;
 1912                 mdr.md_label = mdio->md_label;
 1913                 break;
 1914         }
 1915 #ifdef COMPAT_FREEBSD32
 1916         case MDIOCATTACH_32:
 1917         case MDIOCDETACH_32:
 1918         case MDIOCRESIZE_32:
 1919         case MDIOCQUERY_32: {
 1920                 struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 1921                 if (mdio->md_version != MDIOVERSION)
 1922                         return (EINVAL);
 1923                 MD_IOCTL2REQ(mdio, &mdr);
 1924                 mdr.md_file = (void *)(uintptr_t)mdio->md_file;
 1925                 mdr.md_file_seg = UIO_USERSPACE;
 1926                 mdr.md_label = (void *)(uintptr_t)mdio->md_label;
 1927                 break;
 1928         }
 1929 #endif
 1930         default:
 1931                 /* Fall through to handler switch. */
 1932                 break;
 1933         }
 1934 
 1935         error = 0;
 1936         switch (cmd) {
 1937         case MDIOCATTACH:
 1938 #ifdef COMPAT_FREEBSD32
 1939         case MDIOCATTACH_32:
 1940 #endif
 1941                 error = kern_mdattach(td, &mdr);
 1942                 break;
 1943         case MDIOCDETACH:
 1944 #ifdef COMPAT_FREEBSD32
 1945         case MDIOCDETACH_32:
 1946 #endif
 1947                 error = kern_mddetach(td, &mdr);
 1948                 break;
 1949         case MDIOCRESIZE:
 1950 #ifdef COMPAT_FREEBSD32
 1951         case MDIOCRESIZE_32:
 1952 #endif
 1953                 error = kern_mdresize(&mdr);
 1954                 break;
 1955         case MDIOCQUERY:
 1956 #ifdef COMPAT_FREEBSD32
 1957         case MDIOCQUERY_32:
 1958 #endif
 1959                 error = kern_mdquery(&mdr);
 1960                 break;
 1961         default:
 1962                 error = ENOIOCTL;
 1963         }
 1964 
 1965         switch (cmd) {
 1966         case MDIOCATTACH:
 1967         case MDIOCQUERY: {
 1968                 struct md_ioctl *mdio = (struct md_ioctl *)addr;
 1969                 MD_REQ2IOCTL(&mdr, mdio);
 1970                 break;
 1971         }
 1972 #ifdef COMPAT_FREEBSD32
 1973         case MDIOCATTACH_32:
 1974         case MDIOCQUERY_32: {
 1975                 struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 1976                 MD_REQ2IOCTL(&mdr, mdio);
 1977                 break;
 1978         }
 1979 #endif
 1980         default:
 1981                 /* Other commands do not alter mdr. */
 1982                 break;
 1983         }
 1984 
 1985         return (error);
 1986 }
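
      /*
       * [Editor's sketch] Userland reaches mdctlioctl() through the
       * MDCTL_NAME control device.  A minimal standalone query of unit 0,
       * using only <sys/mdioctl.h> symbols exercised above (this program is
       * illustrative and not part of this file; error reporting elided):
       */
      #if 0
      #include <sys/ioctl.h>
      #include <sys/mdioctl.h>
      #include <fcntl.h>
      #include <limits.h>
      #include <stdint.h>
      #include <stdio.h>
      #include <string.h>

      int
      main(void)
      {
              struct md_ioctl mdio;
              char fname[PATH_MAX];
              int fd;

              memset(&mdio, 0, sizeof(mdio));
              memset(fname, 0, sizeof(fname));
              mdio.md_version = MDIOVERSION;
              mdio.md_unit = 0;
              mdio.md_file = fname;   /* filled in for vnode-backed units */
              fd = open("/dev/" MDCTL_NAME, O_RDWR);
              if (fd == -1 || ioctl(fd, MDIOCQUERY, &mdio) == -1)
                      return (1);
              printf("md%u: type %d, %jd bytes, %u-byte sectors %s\n",
                  mdio.md_unit, (int)mdio.md_type, (intmax_t)mdio.md_mediasize,
                  mdio.md_sectorsize, fname);
              return (0);
      }
      #endif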
 1987 
 1988 static void
 1989 md_preloaded(u_char *image, size_t length, const char *name)
 1990 {
 1991         struct md_s *sc;
 1992         int error;
 1993 
 1994         sc = mdnew(-1, &error, MD_PRELOAD);
 1995         if (sc == NULL)
 1996                 return;
 1997         sc->mediasize = length;
 1998         sc->sectorsize = DEV_BSIZE;
 1999         sc->pl_ptr = image;
 2000         sc->pl_len = length;
 2001         sc->start = mdstart_preload;
 2002         if (name != NULL)
 2003                 strlcpy(sc->file, name, sizeof(sc->file));
 2004 #ifdef MD_ROOT
 2005         if (sc->unit == 0) {
 2006 #ifndef ROOTDEVNAME
 2007                 rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
 2008 #endif
 2009 #ifdef MD_ROOT_READONLY
 2010                 sc->flags |= MD_READONLY;
 2011 #endif
 2012         }
 2013 #endif
 2014         mdinit(sc);
 2015         if (name != NULL) {
 2016                 printf("%s%d: Preloaded image <%s> %zu bytes at %p\n",
 2017                     MD_NAME, sc->unit, name, length, image);
 2018         } else {
 2019                 printf("%s%d: Embedded image %zu bytes at %p\n",
 2020                     MD_NAME, sc->unit, length, image);
 2021         }
 2022 }
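
      /*
       * [Editor's note] Preloaded images come from loader(8).  The
       * MODINFO_TYPE strings matched in g_md_init() below ("md_image" and
       * "mfs_root") correspond to the classic loader.conf recipe (names
       * hedged; see loader.conf(5)):
       *
       *      mfsroot_load="YES"
       *      mfsroot_type="mfs_root"
       *      mfsroot_name="/boot/mfsroot"
       */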
 2023 
 2024 static void
 2025 g_md_init(struct g_class *mp __unused)
 2026 {
 2027         caddr_t mod;
 2028         u_char *ptr, *name, *type;
 2029         unsigned len;
 2030         int i;
 2031 
 2032         /* figure out log2(NINDIR) */
 2033         for (i = NINDIR, nshift = -1; i; nshift++)
 2034                 i >>= 1;
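              /* E.g. if NINDIR were 512, this leaves nshift == 9 (512 == 1 << 9). */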
 2035 
 2036         mod = NULL;
 2037         sx_init(&md_sx, "MD config lock");
 2038         g_topology_unlock();
 2039         md_uh = new_unrhdr(0, INT_MAX, NULL);
 2040 #ifdef MD_ROOT
 2041         if (mfs_root_size != 0) {
 2042                 sx_xlock(&md_sx);
 2043 #ifdef MD_ROOT_MEM
 2044                 md_preloaded(mfs_root, mfs_root_size, NULL);
 2045 #else
 2046                 md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
 2047                     NULL);
 2048 #endif
 2049                 sx_xunlock(&md_sx);
 2050         }
 2051 #endif
 2052         /* XXX: are preload_* static or do they need Giant ? */
 2053         while ((mod = preload_search_next_name(mod)) != NULL) {
 2054                 name = (char *)preload_search_info(mod, MODINFO_NAME);
 2055                 if (name == NULL)
 2056                         continue;
 2057                 type = (char *)preload_search_info(mod, MODINFO_TYPE);
 2058                 if (type == NULL)
 2059                         continue;
 2060                 if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
 2061                         continue;
 2062                 ptr = preload_fetch_addr(mod);
 2063                 len = preload_fetch_size(mod);
 2064                 if (ptr != NULL && len != 0) {
 2065                         sx_xlock(&md_sx);
 2066                         md_preloaded(ptr, len, name);
 2067                         sx_xunlock(&md_sx);
 2068                 }
 2069         }
 2070         md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10);
 2071         status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
 2072             0600, MDCTL_NAME);
 2073         g_topology_lock();
 2074 }
 2075 
 2076 static void
 2077 g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
 2078     struct g_consumer *cp __unused, struct g_provider *pp)
 2079 {
 2080         struct md_s *mp;
 2081         char *type;
 2082 
 2083         mp = gp->softc;
 2084         if (mp == NULL)
 2085                 return;
 2086 
 2087         switch (mp->type) {
 2088         case MD_MALLOC:
 2089                 type = "malloc";
 2090                 break;
 2091         case MD_PRELOAD:
 2092                 type = "preload";
 2093                 break;
 2094         case MD_VNODE:
 2095                 type = "vnode";
 2096                 break;
 2097         case MD_SWAP:
 2098                 type = "swap";
 2099                 break;
 2100         case MD_NULL:
 2101                 type = "null";
 2102                 break;
 2103         default:
 2104                 type = "unknown";
 2105                 break;
 2106         }
 2107 
 2108         if (pp != NULL) {
 2109                 if (indent == NULL) {
 2110                         sbuf_printf(sb, " u %d", mp->unit);
 2111                         sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
 2112                         sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
 2113                         sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
 2114                         sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
 2115                         sbuf_printf(sb, " t %s", type);
 2116                         if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 2117                             (mp->type == MD_PRELOAD && mp->file[0] != '\0'))
 2118                                 sbuf_printf(sb, " file %s", mp->file);
 2119                         sbuf_printf(sb, " label %s", mp->label);
 2120                 } else {
 2121                         sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
 2122                             mp->unit);
 2123                         sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
 2124                             indent, (uintmax_t) mp->sectorsize);
 2125                         sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
 2126                             indent, (uintmax_t) mp->fwheads);
 2127                         sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
 2128                             indent, (uintmax_t) mp->fwsectors);
 2129                         if (mp->ident[0] != '\0') {
 2130                                 sbuf_printf(sb, "%s<ident>", indent);
 2131                                 g_conf_printf_escaped(sb, "%s", mp->ident);
 2132                                 sbuf_printf(sb, "</ident>\n");
 2133                         }
 2134                         sbuf_printf(sb, "%s<length>%ju</length>\n",
 2135                             indent, (uintmax_t) mp->mediasize);
 2136                         sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
 2137                             (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
 2138                         sbuf_printf(sb, "%s<access>%s</access>\n", indent,
 2139                             (mp->flags & MD_READONLY) == 0 ? "read-write":
 2140                             "read-only");
 2141                         sbuf_printf(sb, "%s<type>%s</type>\n", indent,
 2142                             type);
 2143                         if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 2144                             (mp->type == MD_PRELOAD && mp->file[0] != '\0')) {
 2145                                 sbuf_printf(sb, "%s<file>", indent);
 2146                                 g_conf_printf_escaped(sb, "%s", mp->file);
 2147                                 sbuf_printf(sb, "</file>\n");
 2148                         }
 2149                         if (mp->type == MD_VNODE)
 2150                                 sbuf_printf(sb, "%s<cache>%s</cache>\n", indent,
 2151                                     (mp->flags & MD_CACHE) == 0 ? "off": "on");
 2152                         sbuf_printf(sb, "%s<label>", indent);
 2153                         g_conf_printf_escaped(sb, "%s", mp->label);
 2154                         sbuf_printf(sb, "</label>\n");
 2155                 }
 2156         }
 2157 }
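
      /*
       * [Editor's note] With a non-NULL indent the code above emits the XML
       * fragment visible through the kern.geom.confxml sysctl; values below
       * are illustrative for a vnode-backed unit:
       *
       *      <unit>0</unit>
       *      <sectorsize>512</sectorsize>
       *      <fwheads>0</fwheads>
       *      <fwsectors>0</fwsectors>
       *      <length>1048576</length>
       *      <compression>off</compression>
       *      <access>read-write</access>
       *      <type>vnode</type>
       *      <file>/tmp/img</file>
       *      <cache>off</cache>
       *      <label></label>
       */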
 2158 
 2159 static void
 2160 g_md_fini(struct g_class *mp __unused)
 2161 {
 2162 
 2163         sx_destroy(&md_sx);
 2164         if (status_dev != NULL)
 2165                 destroy_dev(status_dev);
 2166         uma_zdestroy(md_pbuf_zone);
 2167         delete_unrhdr(md_uh);
 2168 }
