FreeBSD/Linux Kernel Cross Reference
sys/dev/md/md.c

    1 /*-
    2  * SPDX-License-Identifier: (Beerware AND BSD-3-Clause)
    3  *
    4  * ----------------------------------------------------------------------------
    5  * "THE BEER-WARE LICENSE" (Revision 42):
    6  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
    7  * can do whatever you want with this stuff. If we meet some day, and you think
    8  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
    9  * ----------------------------------------------------------------------------
   10  *
   11  * $FreeBSD$
   12  *
   13  */
   14 
   15 /*-
   16  * The following functions are based on the vn(4) driver: mdstart_swap(),
   17  * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
   18  * and as such under the following copyright:
   19  *
   20  * Copyright (c) 1988 University of Utah.
   21  * Copyright (c) 1990, 1993
   22  *      The Regents of the University of California.  All rights reserved.
   23  * Copyright (c) 2013 The FreeBSD Foundation
   24  * All rights reserved.
   25  *
   26  * This code is derived from software contributed to Berkeley by
   27  * the Systems Programming Group of the University of Utah Computer
   28  * Science Department.
   29  *
   30  * Portions of this software were developed by Konstantin Belousov
   31  * under sponsorship from the FreeBSD Foundation.
   32  *
   33  * Redistribution and use in source and binary forms, with or without
   34  * modification, are permitted provided that the following conditions
   35  * are met:
   36  * 1. Redistributions of source code must retain the above copyright
   37  *    notice, this list of conditions and the following disclaimer.
   38  * 2. Redistributions in binary form must reproduce the above copyright
   39  *    notice, this list of conditions and the following disclaimer in the
   40  *    documentation and/or other materials provided with the distribution.
   41  * 3. Neither the name of the University nor the names of its contributors
   42  *    may be used to endorse or promote products derived from this software
   43  *    without specific prior written permission.
   44  *
   45  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   55  * SUCH DAMAGE.
   56  *
   57  * from: Utah Hdr: vn.c 1.13 94/04/02
   58  *
   59  *      from: @(#)vn.c  8.6 (Berkeley) 4/1/94
   60  * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
   61  */
   62 
   63 #include "opt_rootdevname.h"
   64 #include "opt_geom.h"
   65 #include "opt_md.h"
   66 
   67 #include <sys/param.h>
   68 #include <sys/systm.h>
   69 #include <sys/bio.h>
   70 #include <sys/buf.h>
   71 #include <sys/conf.h>
   72 #include <sys/devicestat.h>
   73 #include <sys/fcntl.h>
   74 #include <sys/kernel.h>
   75 #include <sys/kthread.h>
   76 #include <sys/limits.h>
   77 #include <sys/linker.h>
   78 #include <sys/lock.h>
   79 #include <sys/malloc.h>
   80 #include <sys/mdioctl.h>
   81 #include <sys/mount.h>
   82 #include <sys/mutex.h>
   83 #include <sys/sx.h>
   84 #include <sys/namei.h>
   85 #include <sys/proc.h>
   86 #include <sys/queue.h>
   87 #include <sys/rwlock.h>
   88 #include <sys/sbuf.h>
   89 #include <sys/sched.h>
   90 #include <sys/sf_buf.h>
   91 #include <sys/sysctl.h>
   92 #include <sys/uio.h>
   93 #include <sys/vnode.h>
   94 #include <sys/disk.h>
   95 
   96 #include <geom/geom.h>
   97 #include <geom/geom_int.h>
   98 
   99 #include <vm/vm.h>
  100 #include <vm/vm_param.h>
  101 #include <vm/vm_object.h>
  102 #include <vm/vm_page.h>
  103 #include <vm/vm_pager.h>
  104 #include <vm/swap_pager.h>
  105 #include <vm/uma.h>
  106 
  107 #include <machine/bus.h>
  108 
  109 #define MD_MODVER 1
  110 
  111 #define MD_SHUTDOWN     0x10000         /* Tell worker thread to terminate. */
  112 #define MD_EXITING      0x20000         /* Worker thread is exiting. */
  113 #define MD_PROVIDERGONE 0x40000         /* Safe to free the softc */
  114 
  115 #ifndef MD_NSECT
  116 #define MD_NSECT (10000 * 2)
  117 #endif
  118 
  119 struct md_req {
  120         unsigned        md_unit;        /* unit number */
  121         enum md_types   md_type;        /* type of disk */
  122         off_t           md_mediasize;   /* size of disk in bytes */
  123         unsigned        md_sectorsize;  /* sectorsize */
  124         unsigned        md_options;     /* options */
  125         int             md_fwheads;     /* firmware heads */
  126         int             md_fwsectors;   /* firmware sectors */
  127         char            *md_file;       /* pathname of file to mount */
  128         enum uio_seg    md_file_seg;    /* location of md_file */
  129         char            *md_label;      /* label of the device (userspace) */
  130         int             *md_units;      /* pointer to units array (kernel) */
  131         size_t          md_units_nitems; /* items in md_units array */
  132 };
  133 
  134 #ifdef COMPAT_FREEBSD32
  135 struct md_ioctl32 {
  136         unsigned        md_version;
  137         unsigned        md_unit;
  138         enum md_types   md_type;
  139         uint32_t        md_file;
  140         off_t           md_mediasize;
  141         unsigned        md_sectorsize;
  142         unsigned        md_options;
  143         uint64_t        md_base;
  144         int             md_fwheads;
  145         int             md_fwsectors;
  146         uint32_t        md_label;
  147         int             md_pad[MDNPAD];
  148 } __attribute__((__packed__));
  149 CTASSERT((sizeof(struct md_ioctl32)) == 436);
  150 
  151 #define MDIOCATTACH_32  _IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32)
  152 #define MDIOCDETACH_32  _IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32)
  153 #define MDIOCQUERY_32   _IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32)
  154 #define MDIOCRESIZE_32  _IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32)
  155 #endif /* COMPAT_FREEBSD32 */
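
       /*
        * For 32-bit ioctl callers on a 64-bit kernel, struct md_ioctl has a
        * different size and layout (pointer fields shrink to uint32_t), so
        * the *_32 commands above re-encode the same ioctl numbers against the
        * packed 32-bit structure; the CTASSERT pins its size at 436 bytes.
        */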
  156 
  157 static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
  158 static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");
  159 
  160 static int md_debug;
  161 SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
  162     "Enable md(4) debug messages");
  163 static int md_malloc_wait;
  164 SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
  165     "Allow malloc to wait for memory allocations");
  166 
  167 #if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
  168 #define MD_ROOT_FSTYPE  "ufs"
  169 #endif
  170 
  171 #if defined(MD_ROOT)
  172 /*
  173  * Preloaded image gets put here.
  174  */
  175 #if defined(MD_ROOT_SIZE)
  176 /*
  177  * We put the mfs_root symbol into the oldmfs section of the kernel object file.
  178  * Applications that patch the object with the image can determine
   179  * the size by looking at the oldmfs section size within the kernel.
  180  */
  181 u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
  182 const int mfs_root_size = sizeof(mfs_root);
  183 #elif defined(MD_ROOT_MEM)
  184 /* MD region already mapped in the memory */
  185 u_char *mfs_root;
  186 int mfs_root_size;
  187 #else
  188 extern volatile u_char __weak_symbol mfs_root;
  189 extern volatile u_char __weak_symbol mfs_root_end;
  190 __GLOBL(mfs_root);
  191 __GLOBL(mfs_root_end);
  192 #define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
  193 #endif
  194 #endif
  195 
  196 static g_init_t g_md_init;
  197 static g_fini_t g_md_fini;
  198 static g_start_t g_md_start;
  199 static g_access_t g_md_access;
  200 static void g_md_dumpconf(struct sbuf *sb, const char *indent,
  201     struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);
  202 static g_provgone_t g_md_providergone;
  203 
  204 static struct cdev *status_dev = NULL;
  205 static struct sx md_sx;
  206 static struct unrhdr *md_uh;
  207 
  208 static d_ioctl_t mdctlioctl;
  209 
  210 static struct cdevsw mdctl_cdevsw = {
  211         .d_version =    D_VERSION,
  212         .d_ioctl =      mdctlioctl,
  213         .d_name =       MD_NAME,
  214 };
  215 
  216 struct g_class g_md_class = {
  217         .name = "MD",
  218         .version = G_VERSION,
  219         .init = g_md_init,
  220         .fini = g_md_fini,
  221         .start = g_md_start,
  222         .access = g_md_access,
  223         .dumpconf = g_md_dumpconf,
  224         .providergone = g_md_providergone,
  225 };
  226 
  227 DECLARE_GEOM_CLASS(g_md_class, g_md);
  228 
  229 static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);
  230 
  231 #define NINDIR  (PAGE_SIZE / sizeof(uintptr_t))
  232 #define NMASK   (NINDIR-1)
  233 static int nshift;
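
       /*
        * Sector numbers are resolved through a radix tree of "indir" nodes:
        * each node holds NINDIR slots, i.e. one page worth of uintptr_t
        * entries (512 on a 64-bit machine with 4 KiB pages, so NMASK == 511).
        * nshift is set at init time to log2(NINDIR), so each level of the
        * tree consumes nshift bits of the sector number.
        */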
  234 
  235 static uma_zone_t md_pbuf_zone;
  236 
  237 struct indir {
  238         uintptr_t       *array;
  239         u_int           total;
  240         u_int           used;
  241         u_int           shift;
  242 };
  243 
  244 struct md_s {
  245         int unit;
  246         LIST_ENTRY(md_s) list;
  247         struct bio_queue_head bio_queue;
  248         struct mtx queue_mtx;
  249         struct cdev *dev;
  250         enum md_types type;
  251         off_t mediasize;
  252         unsigned sectorsize;
  253         unsigned opencount;
  254         unsigned fwheads;
  255         unsigned fwsectors;
  256         char ident[32];
  257         unsigned flags;
  258         char name[20];
  259         struct proc *procp;
  260         struct g_geom *gp;
  261         struct g_provider *pp;
  262         int (*start)(struct md_s *sc, struct bio *bp);
  263         struct devstat *devstat;
  264 
  265         /* MD_MALLOC related fields */
  266         struct indir *indir;
  267         uma_zone_t uma;
  268 
  269         /* MD_PRELOAD related fields */
  270         u_char *pl_ptr;
  271         size_t pl_len;
  272 
  273         /* MD_VNODE related fields */
  274         struct vnode *vnode;
  275         char file[PATH_MAX];
  276         char label[PATH_MAX];
  277         struct ucred *cred;
  278 
  279         /* MD_SWAP related fields */
  280         vm_object_t object;
  281 };
  282 
  283 static struct indir *
  284 new_indir(u_int shift)
  285 {
  286         struct indir *ip;
  287 
  288         ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
  289             | M_ZERO);
  290         if (ip == NULL)
  291                 return (NULL);
  292         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  293             M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
  294         if (ip->array == NULL) {
  295                 free(ip, M_MD);
  296                 return (NULL);
  297         }
  298         ip->total = NINDIR;
  299         ip->shift = shift;
  300         return (ip);
  301 }
  302 
  303 static void
  304 del_indir(struct indir *ip)
  305 {
  306 
  307         free(ip->array, M_MDSECT);
  308         free(ip, M_MD);
  309 }
  310 
  311 static void
  312 destroy_indir(struct md_s *sc, struct indir *ip)
  313 {
  314         int i;
  315 
  316         for (i = 0; i < NINDIR; i++) {
  317                 if (!ip->array[i])
  318                         continue;
  319                 if (ip->shift)
  320                         destroy_indir(sc, (struct indir*)(ip->array[i]));
  321                 else if (ip->array[i] > 255)
  322                         uma_zfree(sc->uma, (void *)(ip->array[i]));
  323         }
  324         del_indir(ip);
  325 }
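
       /*
        * Leaf entries in the tree use a small encoding trick: 0 means the
        * sector was never written (it reads back as zeros), values 1..255
        * mean the whole sector is filled with that single byte (MD_COMPRESS),
        * and anything larger is a pointer to a UMA-allocated sector buffer.
        * That is why destroy_indir() only frees entries greater than 255.
        */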
  326 
  327 /*
  328  * This function does the math and allocates the top level "indir" structure
  329  * for a device of "size" sectors.
  330  */
  331 
  332 static struct indir *
  333 dimension(off_t size)
  334 {
  335         off_t rcnt;
  336         struct indir *ip;
  337         int layer;
  338 
  339         rcnt = size;
  340         layer = 0;
  341         while (rcnt > NINDIR) {
  342                 rcnt /= NINDIR;
  343                 layer++;
  344         }
  345 
  346         /*
  347          * XXX: the top layer is probably not fully populated, so we allocate
  348          * too much space for ip->array in here.
  349          */
  350         ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
  351         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  352             M_MDSECT, M_WAITOK | M_ZERO);
  353         ip->total = NINDIR;
  354         ip->shift = layer * nshift;
  355         return (ip);
  356 }
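
       /*
        * For example, a 1 GiB device with 512-byte sectors has 2097152
        * sectors.  Assuming NINDIR == 512 (nshift == 9): 2097152 / 512 =
        * 4096, / 512 again = 8, so layer == 2 and the top node gets shift
        * 18; each of its (8 used) slots covers 2^18 sectors through two
        * further levels of indirection.
        */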
  357 
  358 /*
  359  * Read a given sector
  360  */
  361 
  362 static uintptr_t
  363 s_read(struct indir *ip, off_t offset)
  364 {
  365         struct indir *cip;
  366         int idx;
  367         uintptr_t up;
  368 
  369         if (md_debug > 1)
  370                 printf("s_read(%jd)\n", (intmax_t)offset);
  371         up = 0;
  372         for (cip = ip; cip != NULL;) {
  373                 if (cip->shift) {
  374                         idx = (offset >> cip->shift) & NMASK;
  375                         up = cip->array[idx];
  376                         cip = (struct indir *)up;
  377                         continue;
  378                 }
  379                 idx = offset & NMASK;
  380                 return (cip->array[idx]);
  381         }
  382         return (0);
  383 }
  384 
  385 /*
  386  * Write a given sector, prune the tree if the value is 0
  387  */
  388 
  389 static int
  390 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
  391 {
  392         struct indir *cip, *lip[10];
  393         int idx, li;
  394         uintptr_t up;
  395 
  396         if (md_debug > 1)
  397                 printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
  398         up = 0;
  399         li = 0;
  400         cip = ip;
  401         for (;;) {
  402                 lip[li++] = cip;
  403                 if (cip->shift) {
  404                         idx = (offset >> cip->shift) & NMASK;
  405                         up = cip->array[idx];
  406                         if (up != 0) {
  407                                 cip = (struct indir *)up;
  408                                 continue;
  409                         }
  410                         /* Allocate branch */
  411                         cip->array[idx] =
  412                             (uintptr_t)new_indir(cip->shift - nshift);
  413                         if (cip->array[idx] == 0)
  414                                 return (ENOSPC);
  415                         cip->used++;
  416                         up = cip->array[idx];
  417                         cip = (struct indir *)up;
  418                         continue;
  419                 }
  420                 /* leafnode */
  421                 idx = offset & NMASK;
  422                 up = cip->array[idx];
  423                 if (up != 0)
  424                         cip->used--;
  425                 cip->array[idx] = ptr;
  426                 if (ptr != 0)
  427                         cip->used++;
  428                 break;
  429         }
  430         if (cip->used != 0 || li == 1)
  431                 return (0);
  432         li--;
  433         while (cip->used == 0 && cip != ip) {
  434                 li--;
  435                 idx = (offset >> lip[li]->shift) & NMASK;
  436                 up = lip[li]->array[idx];
  437                 KASSERT(up == (uintptr_t)cip, ("md screwed up"));
  438                 del_indir(cip);
  439                 lip[li]->array[idx] = 0;
  440                 lip[li]->used--;
  441                 cip = lip[li];
  442         }
  443         return (0);
  444 }
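
       /*
        * The write path above also garbage-collects: after storing a zero in
        * a leaf, the lip[] stack is walked back toward the root and any
        * intermediate node whose "used" count has dropped to zero is freed,
        * so fully-deleted regions consume no tree memory.
        */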
  445 
  446 static int
  447 g_md_access(struct g_provider *pp, int r, int w, int e)
  448 {
  449         struct md_s *sc;
  450 
  451         sc = pp->geom->softc;
  452         if (sc == NULL) {
  453                 if (r <= 0 && w <= 0 && e <= 0)
  454                         return (0);
  455                 return (ENXIO);
  456         }
  457         r += pp->acr;
  458         w += pp->acw;
  459         e += pp->ace;
  460         if ((sc->flags & MD_READONLY) != 0 && w > 0)
  461                 return (EROFS);
  462         if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
  463                 sc->opencount = 1;
  464         } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
  465                 sc->opencount = 0;
  466         }
  467         return (0);
  468 }
  469 
  470 static void
  471 g_md_start(struct bio *bp)
  472 {
  473         struct md_s *sc;
  474 
  475         sc = bp->bio_to->geom->softc;
  476         if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
  477                 devstat_start_transaction_bio(sc->devstat, bp);
  478         }
  479         mtx_lock(&sc->queue_mtx);
  480         bioq_disksort(&sc->bio_queue, bp);
  481         wakeup(sc);
  482         mtx_unlock(&sc->queue_mtx);
  483 }
  484 
  485 #define MD_MALLOC_MOVE_ZERO     1
  486 #define MD_MALLOC_MOVE_FILL     2
  487 #define MD_MALLOC_MOVE_READ     3
  488 #define MD_MALLOC_MOVE_WRITE    4
  489 #define MD_MALLOC_MOVE_CMP      5
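
       /*
        * Operations for md_malloc_move_ma()/md_malloc_move_vlist(), which
        * access bio data that is not mapped into the kernel: ZERO and FILL
        * store constant bytes into the bio pages, READ copies a stored
        * sector out to the bio, WRITE copies bio data into sector storage,
        * and CMP scans the data for a uniform fill byte (for MD_COMPRESS),
        * failing with EDOOFUS when the bytes differ.
        */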
  490 
  491 static int
  492 md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
  493     void *ptr, u_char fill, int op)
  494 {
  495         struct sf_buf *sf;
  496         vm_page_t m, *mp1;
  497         char *p, first;
  498         off_t *uc;
  499         unsigned n;
  500         int error, i, ma_offs1, sz, first_read;
  501 
  502         m = NULL;
  503         error = 0;
  504         sf = NULL;
  505         /* if (op == MD_MALLOC_MOVE_CMP) { gcc */
  506                 first = 0;
  507                 first_read = 0;
  508                 uc = ptr;
  509                 mp1 = *mp;
  510                 ma_offs1 = *ma_offs;
  511         /* } */
  512         sched_pin();
  513         for (n = sectorsize; n != 0; n -= sz) {
  514                 sz = imin(PAGE_SIZE - *ma_offs, n);
  515                 if (m != **mp) {
  516                         if (sf != NULL)
  517                                 sf_buf_free(sf);
  518                         m = **mp;
  519                         sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
  520                             (md_malloc_wait ? 0 : SFB_NOWAIT));
  521                         if (sf == NULL) {
  522                                 error = ENOMEM;
  523                                 break;
  524                         }
  525                 }
  526                 p = (char *)sf_buf_kva(sf) + *ma_offs;
  527                 switch (op) {
  528                 case MD_MALLOC_MOVE_ZERO:
  529                         bzero(p, sz);
  530                         break;
  531                 case MD_MALLOC_MOVE_FILL:
  532                         memset(p, fill, sz);
  533                         break;
  534                 case MD_MALLOC_MOVE_READ:
  535                         bcopy(ptr, p, sz);
  536                         cpu_flush_dcache(p, sz);
  537                         break;
  538                 case MD_MALLOC_MOVE_WRITE:
  539                         bcopy(p, ptr, sz);
  540                         break;
  541                 case MD_MALLOC_MOVE_CMP:
  542                         for (i = 0; i < sz; i++, p++) {
  543                                 if (!first_read) {
  544                                         *uc = (u_char)*p;
  545                                         first = *p;
  546                                         first_read = 1;
  547                                 } else if (*p != first) {
  548                                         error = EDOOFUS;
  549                                         break;
  550                                 }
  551                         }
  552                         break;
  553                 default:
  554                         KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
  555                         break;
  556                 }
  557                 if (error != 0)
  558                         break;
  559                 *ma_offs += sz;
  560                 *ma_offs %= PAGE_SIZE;
  561                 if (*ma_offs == 0)
  562                         (*mp)++;
  563                 ptr = (char *)ptr + sz;
  564         }
  565 
  566         if (sf != NULL)
  567                 sf_buf_free(sf);
  568         sched_unpin();
  569         if (op == MD_MALLOC_MOVE_CMP && error != 0) {
  570                 *mp = mp1;
  571                 *ma_offs = ma_offs1;
  572         }
  573         return (error);
  574 }
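
       /*
        * Note that the loop above touches unmapped pages through transient
        * sf_buf KVA mappings; sched_pin() keeps the thread on one CPU so the
        * cheaper SFB_CPUPRIVATE mappings remain valid for its duration.
        */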
  575 
  576 static int
  577 md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
  578     unsigned len, void *ptr, u_char fill, int op)
  579 {
  580         bus_dma_segment_t *vlist;
  581         uint8_t *p, *end, first;
  582         off_t *uc;
  583         int ma_offs, seg_len;
  584 
  585         vlist = *pvlist;
  586         ma_offs = *pma_offs;
  587         uc = ptr;
  588 
  589         for (; len != 0; len -= seg_len) {
  590                 seg_len = imin(vlist->ds_len - ma_offs, len);
  591                 p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
  592                 switch (op) {
  593                 case MD_MALLOC_MOVE_ZERO:
  594                         bzero(p, seg_len);
  595                         break;
  596                 case MD_MALLOC_MOVE_FILL:
  597                         memset(p, fill, seg_len);
  598                         break;
  599                 case MD_MALLOC_MOVE_READ:
  600                         bcopy(ptr, p, seg_len);
  601                         cpu_flush_dcache(p, seg_len);
  602                         break;
  603                 case MD_MALLOC_MOVE_WRITE:
  604                         bcopy(p, ptr, seg_len);
  605                         break;
  606                 case MD_MALLOC_MOVE_CMP:
  607                         end = p + seg_len;
  608                         first = *uc = *p;
  609                         /* Confirm all following bytes match the first */
  610                         while (++p < end) {
  611                                 if (*p != first)
  612                                         return (EDOOFUS);
  613                         }
  614                         break;
  615                 default:
  616                         KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
  617                         break;
  618                 }
  619 
  620                 ma_offs += seg_len;
  621                 if (ma_offs == vlist->ds_len) {
  622                         ma_offs = 0;
  623                         vlist++;
  624                 }
  625                 ptr = (uint8_t *)ptr + seg_len;
  626         }
  627         *pvlist = vlist;
  628         *pma_offs = ma_offs;
  629 
  630         return (0);
  631 }
  632 
  633 static int
  634 mdstart_malloc(struct md_s *sc, struct bio *bp)
  635 {
  636         u_char *dst;
  637         vm_page_t *m;
  638         bus_dma_segment_t *vlist;
  639         int i, error, error1, ma_offs, notmapped;
  640         off_t secno, nsec, uc;
  641         uintptr_t sp, osp;
  642 
  643         switch (bp->bio_cmd) {
  644         case BIO_READ:
  645         case BIO_WRITE:
  646         case BIO_DELETE:
  647                 break;
  648         default:
  649                 return (EOPNOTSUPP);
  650         }
  651 
  652         notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
  653         vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
  654             (bus_dma_segment_t *)bp->bio_data : NULL;
  655         if (notmapped) {
  656                 m = bp->bio_ma;
  657                 ma_offs = bp->bio_ma_offset;
  658                 dst = NULL;
  659                 KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
  660         } else if (vlist != NULL) {
  661                 ma_offs = bp->bio_ma_offset;
  662                 dst = NULL;
  663         } else {
  664                 dst = bp->bio_data;
  665         }
  666 
  667         nsec = bp->bio_length / sc->sectorsize;
  668         secno = bp->bio_offset / sc->sectorsize;
  669         error = 0;
  670         while (nsec--) {
  671                 osp = s_read(sc->indir, secno);
  672                 if (bp->bio_cmd == BIO_DELETE) {
  673                         if (osp != 0)
  674                                 error = s_write(sc->indir, secno, 0);
  675                 } else if (bp->bio_cmd == BIO_READ) {
  676                         if (osp == 0) {
  677                                 if (notmapped) {
  678                                         error = md_malloc_move_ma(&m, &ma_offs,
  679                                             sc->sectorsize, NULL, 0,
  680                                             MD_MALLOC_MOVE_ZERO);
  681                                 } else if (vlist != NULL) {
  682                                         error = md_malloc_move_vlist(&vlist,
  683                                             &ma_offs, sc->sectorsize, NULL, 0,
  684                                             MD_MALLOC_MOVE_ZERO);
  685                                 } else
  686                                         bzero(dst, sc->sectorsize);
  687                         } else if (osp <= 255) {
  688                                 if (notmapped) {
  689                                         error = md_malloc_move_ma(&m, &ma_offs,
  690                                             sc->sectorsize, NULL, osp,
  691                                             MD_MALLOC_MOVE_FILL);
  692                                 } else if (vlist != NULL) {
  693                                         error = md_malloc_move_vlist(&vlist,
  694                                             &ma_offs, sc->sectorsize, NULL, osp,
  695                                             MD_MALLOC_MOVE_FILL);
  696                                 } else
  697                                         memset(dst, osp, sc->sectorsize);
  698                         } else {
  699                                 if (notmapped) {
  700                                         error = md_malloc_move_ma(&m, &ma_offs,
  701                                             sc->sectorsize, (void *)osp, 0,
  702                                             MD_MALLOC_MOVE_READ);
  703                                 } else if (vlist != NULL) {
  704                                         error = md_malloc_move_vlist(&vlist,
  705                                             &ma_offs, sc->sectorsize,
  706                                             (void *)osp, 0,
  707                                             MD_MALLOC_MOVE_READ);
  708                                 } else {
  709                                         bcopy((void *)osp, dst, sc->sectorsize);
  710                                         cpu_flush_dcache(dst, sc->sectorsize);
  711                                 }
  712                         }
  713                         osp = 0;
  714                 } else if (bp->bio_cmd == BIO_WRITE) {
  715                         if (sc->flags & MD_COMPRESS) {
  716                                 if (notmapped) {
  717                                         error1 = md_malloc_move_ma(&m, &ma_offs,
  718                                             sc->sectorsize, &uc, 0,
  719                                             MD_MALLOC_MOVE_CMP);
  720                                         i = error1 == 0 ? sc->sectorsize : 0;
  721                                 } else if (vlist != NULL) {
  722                                         error1 = md_malloc_move_vlist(&vlist,
  723                                             &ma_offs, sc->sectorsize, &uc, 0,
  724                                             MD_MALLOC_MOVE_CMP);
  725                                         i = error1 == 0 ? sc->sectorsize : 0;
  726                                 } else {
  727                                         uc = dst[0];
  728                                         for (i = 1; i < sc->sectorsize; i++) {
  729                                                 if (dst[i] != uc)
  730                                                         break;
  731                                         }
  732                                 }
  733                         } else {
  734                                 i = 0;
  735                                 uc = 0;
  736                         }
  737                         if (i == sc->sectorsize) {
  738                                 if (osp != uc)
  739                                         error = s_write(sc->indir, secno, uc);
  740                         } else {
  741                                 if (osp <= 255) {
  742                                         sp = (uintptr_t)uma_zalloc(sc->uma,
  743                                             md_malloc_wait ? M_WAITOK :
  744                                             M_NOWAIT);
  745                                         if (sp == 0) {
  746                                                 error = ENOSPC;
  747                                                 break;
  748                                         }
  749                                         if (notmapped) {
  750                                                 error = md_malloc_move_ma(&m,
  751                                                     &ma_offs, sc->sectorsize,
  752                                                     (void *)sp, 0,
  753                                                     MD_MALLOC_MOVE_WRITE);
  754                                         } else if (vlist != NULL) {
  755                                                 error = md_malloc_move_vlist(
  756                                                     &vlist, &ma_offs,
  757                                                     sc->sectorsize, (void *)sp,
  758                                                     0, MD_MALLOC_MOVE_WRITE);
  759                                         } else {
  760                                                 bcopy(dst, (void *)sp,
  761                                                     sc->sectorsize);
  762                                         }
  763                                         error = s_write(sc->indir, secno, sp);
  764                                 } else {
  765                                         if (notmapped) {
  766                                                 error = md_malloc_move_ma(&m,
  767                                                     &ma_offs, sc->sectorsize,
  768                                                     (void *)osp, 0,
  769                                                     MD_MALLOC_MOVE_WRITE);
  770                                         } else if (vlist != NULL) {
  771                                                 error = md_malloc_move_vlist(
  772                                                     &vlist, &ma_offs,
  773                                                     sc->sectorsize, (void *)osp,
  774                                                     0, MD_MALLOC_MOVE_WRITE);
  775                                         } else {
  776                                                 bcopy(dst, (void *)osp,
  777                                                     sc->sectorsize);
  778                                         }
  779                                         osp = 0;
  780                                 }
  781                         }
  782                 } else {
  783                         error = EOPNOTSUPP;
  784                 }
  785                 if (osp > 255)
  786                         uma_zfree(sc->uma, (void*)osp);
  787                 if (error != 0)
  788                         break;
  789                 secno++;
  790                 if (!notmapped && vlist == NULL)
  791                         dst += sc->sectorsize;
  792         }
  793         bp->bio_resid = 0;
  794         return (error);
  795 }
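
       /*
        * Roughly, the write path above gives MD_COMPRESS devices per-sector
        * run-length compression: writing a sector of all 0x00 stores the
        * bare value 0 (freeing any old buffer), all 0x5a stores 0x5a
        * directly in the tree, and only mixed content costs a real UMA
        * sector allocation.
        */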
  796 
  797 static void
  798 mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
  799 {
  800         off_t seg_len;
  801 
  802         while (offset >= vlist->ds_len) {
  803                 offset -= vlist->ds_len;
  804                 vlist++;
  805         }
  806 
  807         while (len != 0) {
  808                 seg_len = omin(len, vlist->ds_len - offset);
  809                 bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset),
  810                     seg_len);
  811                 offset = 0;
  812                 src = (uint8_t *)src + seg_len;
  813                 len -= seg_len;
  814                 vlist++;
  815         }
  816 }
  817 
  818 static void
  819 mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
  820 {
  821         off_t seg_len;
  822 
  823         while (offset >= vlist->ds_len) {
  824                 offset -= vlist->ds_len;
  825                 vlist++;
  826         }
  827 
  828         while (len != 0) {
  829                 seg_len = omin(len, vlist->ds_len - offset);
  830                 bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst,
  831                     seg_len);
  832                 offset = 0;
  833                 dst = (uint8_t *)dst + seg_len;
  834                 len -= seg_len;
  835                 vlist++;
  836         }
  837 }
  838 
  839 static int
  840 mdstart_preload(struct md_s *sc, struct bio *bp)
  841 {
  842         uint8_t *p;
  843 
  844         p = sc->pl_ptr + bp->bio_offset;
  845         switch (bp->bio_cmd) {
  846         case BIO_READ:
  847                 if ((bp->bio_flags & BIO_VLIST) != 0) {
  848                         mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data,
  849                             bp->bio_ma_offset, bp->bio_length);
  850                 } else {
  851                         bcopy(p, bp->bio_data, bp->bio_length);
  852                 }
  853                 cpu_flush_dcache(bp->bio_data, bp->bio_length);
  854                 break;
  855         case BIO_WRITE:
  856                 if ((bp->bio_flags & BIO_VLIST) != 0) {
  857                         mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
  858                             bp->bio_ma_offset, p, bp->bio_length);
  859                 } else {
  860                         bcopy(bp->bio_data, p, bp->bio_length);
  861                 }
  862                 break;
  863         }
  864         bp->bio_resid = 0;
  865         return (0);
  866 }
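
       /*
        * Preloaded devices are just a window onto a contiguous in-memory
        * image, so I/O degenerates to bcopy() at pl_ptr + offset; commands
        * other than read and write (e.g. BIO_DELETE) fall through the switch
        * and complete as successful no-ops.
        */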
  867 
  868 static int
  869 mdstart_vnode(struct md_s *sc, struct bio *bp)
  870 {
  871         int error;
  872         struct uio auio;
  873         struct iovec aiov;
  874         struct iovec *piov;
  875         struct mount *mp;
  876         struct vnode *vp;
  877         struct buf *pb;
  878         bus_dma_segment_t *vlist;
  879         struct thread *td;
  880         off_t iolen, iostart, len, zerosize;
  881         int ma_offs, npages;
  882 
  883         switch (bp->bio_cmd) {
  884         case BIO_READ:
  885                 auio.uio_rw = UIO_READ;
  886                 break;
  887         case BIO_WRITE:
  888         case BIO_DELETE:
  889                 auio.uio_rw = UIO_WRITE;
  890                 break;
  891         case BIO_FLUSH:
  892                 break;
  893         default:
  894                 return (EOPNOTSUPP);
  895         }
  896 
  897         td = curthread;
  898         vp = sc->vnode;
  899         pb = NULL;
  900         piov = NULL;
  901         ma_offs = bp->bio_ma_offset;
  902         len = bp->bio_length;
  903 
  904         /*
  905          * VNODE I/O
  906          *
  907          * If an error occurs, we set BIO_ERROR but we do not set
  908          * B_INVAL because (for a write anyway), the buffer is
  909          * still valid.
  910          */
  911 
  912         if (bp->bio_cmd == BIO_FLUSH) {
  913                 (void) vn_start_write(vp, &mp, V_WAIT);
  914                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  915                 error = VOP_FSYNC(vp, MNT_WAIT, td);
  916                 VOP_UNLOCK(vp);
  917                 vn_finished_write(mp);
  918                 return (error);
  919         }
  920 
  921         auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
  922         auio.uio_resid = bp->bio_length;
  923         auio.uio_segflg = UIO_SYSSPACE;
  924         auio.uio_td = td;
  925 
  926         if (bp->bio_cmd == BIO_DELETE) {
  927                 /*
  928                  * Emulate BIO_DELETE by writing zeros.
  929                  */
  930                 zerosize = ZERO_REGION_SIZE -
  931                     (ZERO_REGION_SIZE % sc->sectorsize);
  932                 auio.uio_iovcnt = howmany(bp->bio_length, zerosize);
  933                 piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK);
  934                 auio.uio_iov = piov;
  935                 while (len > 0) {
  936                         piov->iov_base = __DECONST(void *, zero_region);
  937                         piov->iov_len = len;
  938                         if (len > zerosize)
  939                                 piov->iov_len = zerosize;
  940                         len -= piov->iov_len;
  941                         piov++;
  942                 }
  943                 piov = auio.uio_iov;
  944         } else if ((bp->bio_flags & BIO_VLIST) != 0) {
  945                 piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
  946                 auio.uio_iov = piov;
  947                 vlist = (bus_dma_segment_t *)bp->bio_data;
  948                 while (len > 0) {
  949                         piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
  950                             ma_offs);
  951                         piov->iov_len = vlist->ds_len - ma_offs;
  952                         if (piov->iov_len > len)
  953                                 piov->iov_len = len;
  954                         len -= piov->iov_len;
  955                         ma_offs = 0;
  956                         vlist++;
  957                         piov++;
  958                 }
  959                 auio.uio_iovcnt = piov - auio.uio_iov;
  960                 piov = auio.uio_iov;
  961         } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
  962                 pb = uma_zalloc(md_pbuf_zone, M_WAITOK);
  963                 MPASS((pb->b_flags & B_MAXPHYS) != 0);
  964                 bp->bio_resid = len;
  965 unmapped_step:
  966                 npages = atop(min(maxphys, round_page(len + (ma_offs &
  967                     PAGE_MASK))));
  968                 iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
  969                 KASSERT(iolen > 0, ("zero iolen"));
  970                 pmap_qenter((vm_offset_t)pb->b_data,
  971                     &bp->bio_ma[atop(ma_offs)], npages);
  972                 aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
  973                     (ma_offs & PAGE_MASK));
  974                 aiov.iov_len = iolen;
  975                 auio.uio_iov = &aiov;
  976                 auio.uio_iovcnt = 1;
  977                 auio.uio_resid = iolen;
  978         } else {
  979                 aiov.iov_base = bp->bio_data;
  980                 aiov.iov_len = bp->bio_length;
  981                 auio.uio_iov = &aiov;
  982                 auio.uio_iovcnt = 1;
  983         }
  984         iostart = auio.uio_offset;
  985         if (auio.uio_rw == UIO_READ) {
  986                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  987                 error = VOP_READ(vp, &auio, 0, sc->cred);
  988                 VOP_UNLOCK(vp);
  989         } else {
  990                 (void) vn_start_write(vp, &mp, V_WAIT);
  991                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  992                 error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
  993                     sc->cred);
  994                 VOP_UNLOCK(vp);
  995                 vn_finished_write(mp);
  996                 if (error == 0)
  997                         sc->flags &= ~MD_VERIFY;
  998         }
  999 
 1000         /* When MD_CACHE is set, try to avoid double-caching the data. */
 1001         if (error == 0 && (sc->flags & MD_CACHE) == 0)
 1002                 VOP_ADVISE(vp, iostart, auio.uio_offset - 1,
 1003                     POSIX_FADV_DONTNEED);
 1004 
 1005         if (pb != NULL) {
 1006                 pmap_qremove((vm_offset_t)pb->b_data, npages);
 1007                 if (error == 0) {
 1008                         len -= iolen;
 1009                         bp->bio_resid -= iolen;
 1010                         ma_offs += iolen;
 1011                         if (len > 0)
 1012                                 goto unmapped_step;
 1013                 }
 1014                 uma_zfree(md_pbuf_zone, pb);
 1015         } else {
 1016                 bp->bio_resid = auio.uio_resid;
 1017         }
 1018 
 1019         free(piov, M_MD);
 1020         return (error);
 1021 }
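
       /*
        * For unmapped bios the vnode path works in chunks: at most maxphys
        * bytes of bp->bio_ma are entered into the pbuf's KVA with
        * pmap_qenter(), one VOP_READ()/VOP_WRITE() is issued, the mapping is
        * torn down, and the loop repeats from "unmapped_step" until the
        * request is exhausted or an error stops it.
        */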
 1022 
 1023 static int
 1024 mdstart_swap(struct md_s *sc, struct bio *bp)
 1025 {
 1026         vm_page_t m;
 1027         u_char *p;
 1028         vm_pindex_t i, lastp;
 1029         bus_dma_segment_t *vlist;
 1030         int rv, ma_offs, offs, len, lastend;
 1031 
 1032         switch (bp->bio_cmd) {
 1033         case BIO_READ:
 1034         case BIO_WRITE:
 1035         case BIO_DELETE:
 1036                 break;
 1037         default:
 1038                 return (EOPNOTSUPP);
 1039         }
 1040 
 1041         p = bp->bio_data;
 1042         ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
 1043             bp->bio_ma_offset : 0;
 1044         vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
 1045             (bus_dma_segment_t *)bp->bio_data : NULL;
 1046 
 1047         /*
 1048          * offs is the offset at which to start operating on the
 1049          * next (ie, first) page.  lastp is the last page on
 1050          * which we're going to operate.  lastend is the ending
 1051          * position within that last page (ie, PAGE_SIZE if
 1052          * we're operating on complete aligned pages).
 1053          */
 1054         offs = bp->bio_offset % PAGE_SIZE;
 1055         lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
 1056         lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
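
               /*
                * Example, assuming PAGE_SIZE == 4096: a 6144-byte bio at
                * offset 6144 yields offs == 2048, lastp == 2 and
                * lastend == 4096, so the loop below handles the last 2048
                * bytes of page 1 and all 4096 bytes of page 2.
                */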
 1057 
 1058         rv = VM_PAGER_OK;
 1059         vm_object_pip_add(sc->object, 1);
 1060         for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
 1061                 len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
 1062                 m = vm_page_grab_unlocked(sc->object, i, VM_ALLOC_SYSTEM);
 1063                 if (bp->bio_cmd == BIO_READ) {
 1064                         if (vm_page_all_valid(m))
 1065                                 rv = VM_PAGER_OK;
 1066                         else
 1067                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1068                                     NULL, NULL);
 1069                         if (rv == VM_PAGER_ERROR) {
 1070                                 VM_OBJECT_WLOCK(sc->object);
 1071                                 vm_page_free(m);
 1072                                 VM_OBJECT_WUNLOCK(sc->object);
 1073                                 break;
 1074                         } else if (rv == VM_PAGER_FAIL) {
 1075                                 /*
 1076                                  * Pager does not have the page.  Zero
 1077                                  * the allocated page, and mark it as
 1078                                  * valid. Do not set dirty, the page
 1079                                  * can be recreated if thrown out.
 1080                                  */
 1081                                 pmap_zero_page(m);
 1082                                 vm_page_valid(m);
 1083                         }
 1084                         if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 1085                                 pmap_copy_pages(&m, offs, bp->bio_ma,
 1086                                     ma_offs, len);
 1087                         } else if ((bp->bio_flags & BIO_VLIST) != 0) {
 1088                                 physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
 1089                                     vlist, ma_offs, len);
 1090                                 cpu_flush_dcache(p, len);
 1091                         } else {
 1092                                 physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
 1093                                 cpu_flush_dcache(p, len);
 1094                         }
 1095                 } else if (bp->bio_cmd == BIO_WRITE) {
 1096                         if (len == PAGE_SIZE || vm_page_all_valid(m))
 1097                                 rv = VM_PAGER_OK;
 1098                         else
 1099                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1100                                     NULL, NULL);
 1101                         if (rv == VM_PAGER_ERROR) {
 1102                                 VM_OBJECT_WLOCK(sc->object);
 1103                                 vm_page_free(m);
 1104                                 VM_OBJECT_WUNLOCK(sc->object);
 1105                                 break;
 1106                         } else if (rv == VM_PAGER_FAIL)
 1107                                 pmap_zero_page(m);
 1108 
 1109                         if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 1110                                 pmap_copy_pages(bp->bio_ma, ma_offs, &m,
 1111                                     offs, len);
 1112                         } else if ((bp->bio_flags & BIO_VLIST) != 0) {
 1113                                 physcopyin_vlist(vlist, ma_offs,
 1114                                     VM_PAGE_TO_PHYS(m) + offs, len);
 1115                         } else {
 1116                                 physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
 1117                         }
 1118 
 1119                         vm_page_valid(m);
 1120                         vm_page_set_dirty(m);
 1121                 } else if (bp->bio_cmd == BIO_DELETE) {
 1122                         if (len == PAGE_SIZE || vm_page_all_valid(m))
 1123                                 rv = VM_PAGER_OK;
 1124                         else
 1125                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1126                                     NULL, NULL);
 1127                         VM_OBJECT_WLOCK(sc->object);
 1128                         if (rv == VM_PAGER_ERROR) {
 1129                                 vm_page_free(m);
 1130                                 VM_OBJECT_WUNLOCK(sc->object);
 1131                                 break;
 1132                         } else if (rv == VM_PAGER_FAIL) {
 1133                                 vm_page_free(m);
 1134                                 m = NULL;
 1135                         } else {
 1136                                 /* Page is valid. */
 1137                                 if (len != PAGE_SIZE) {
 1138                                         pmap_zero_page_area(m, offs, len);
 1139                                         vm_page_set_dirty(m);
 1140                                 } else {
 1141                                         vm_pager_page_unswapped(m);
 1142                                         vm_page_free(m);
 1143                                         m = NULL;
 1144                                 }
 1145                         }
 1146                         VM_OBJECT_WUNLOCK(sc->object);
 1147                 }
 1148                 if (m != NULL) {
 1149                         /*
 1150                          * The page may be deactivated prior to setting
 1151                          * PGA_REFERENCED, but in this case it will be
 1152                          * reactivated by the page daemon.
 1153                          */
 1154                         if (vm_page_active(m))
 1155                                 vm_page_reference(m);
 1156                         else
 1157                                 vm_page_activate(m);
 1158                         vm_page_xunbusy(m);
 1159                 }
 1160 
 1161                 /* Actions on further pages start at offset 0 */
 1162                 p += PAGE_SIZE - offs;
 1163                 offs = 0;
 1164                 ma_offs += len;
 1165         }
 1166         vm_object_pip_wakeup(sc->object);
 1167         return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
 1168 }
 1169 
 1170 static int
 1171 mdstart_null(struct md_s *sc, struct bio *bp)
 1172 {
 1173 
 1174         switch (bp->bio_cmd) {
 1175         case BIO_READ:
 1176                 bzero(bp->bio_data, bp->bio_length);
 1177                 cpu_flush_dcache(bp->bio_data, bp->bio_length);
 1178                 break;
 1179         case BIO_WRITE:
 1180                 break;
 1181         }
 1182         bp->bio_resid = 0;
 1183         return (0);
 1184 }
 1185 
 1186 static void
 1187 md_handleattr(struct md_s *sc, struct bio *bp)
 1188 {
 1189         if (sc->fwsectors && sc->fwheads &&
 1190             (g_handleattr_int(bp, "GEOM::fwsectors", sc->fwsectors) != 0 ||
 1191             g_handleattr_int(bp, "GEOM::fwheads", sc->fwheads) != 0))
 1192                 return;
 1193         if (g_handleattr_int(bp, "GEOM::candelete", 1) != 0)
 1194                 return;
 1195         if (sc->ident[0] != '\0' &&
 1196             g_handleattr_str(bp, "GEOM::ident", sc->ident) != 0)
 1197                 return;
 1198         if (g_handleattr_int(bp, "MNT::verified", (sc->flags & MD_VERIFY) != 0))
 1199                 return;
 1200         g_io_deliver(bp, EOPNOTSUPP);
 1201 }
 1202 
 1203 static void
 1204 md_kthread(void *arg)
 1205 {
 1206         struct md_s *sc;
 1207         struct bio *bp;
 1208         int error;
 1209 
 1210         sc = arg;
 1211         thread_lock(curthread);
 1212         sched_prio(curthread, PRIBIO);
 1213         thread_unlock(curthread);
 1214         if (sc->type == MD_VNODE)
 1215                 curthread->td_pflags |= TDP_NORUNNINGBUF;
 1216 
 1217         for (;;) {
 1218                 mtx_lock(&sc->queue_mtx);
 1219                 if (sc->flags & MD_SHUTDOWN) {
 1220                         sc->flags |= MD_EXITING;
 1221                         mtx_unlock(&sc->queue_mtx);
 1222                         kproc_exit(0);
 1223                 }
 1224                 bp = bioq_takefirst(&sc->bio_queue);
 1225                 if (!bp) {
 1226                         msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
 1227                         continue;
 1228                 }
 1229                 mtx_unlock(&sc->queue_mtx);
 1230                 if (bp->bio_cmd == BIO_GETATTR) {
 1231                         md_handleattr(sc, bp);
 1232                 } else {
 1233                         error = sc->start(sc, bp);
 1234                         if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
 1235                                 /*
 1236                                  * Devstat uses (bio_bcount, bio_resid) for
 1237                                  * determining the length of the completed part
 1238                                  * of the i/o.  g_io_deliver() will translate
 1239                                  * from bio_completed to that, but it also
 1240                                  * destroys the bio so we must do our own
 1241                                  * translation.
 1242                                  */
 1243                                 bp->bio_bcount = bp->bio_length;
 1244                                 devstat_end_transaction_bio(sc->devstat, bp);
 1245                         }
 1246                         bp->bio_completed = bp->bio_length - bp->bio_resid;
 1247                         g_io_deliver(bp, error);
 1248                 }
 1249         }
 1250 }
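
       /*
        * Each md unit gets one such worker: mdnew() creates the kthread,
        * g_md_start() queues bios and wakes it up, and the loop above
        * services them through sc->start (one of the mdstart_* handlers),
        * handling BIO_GETATTR inline.
        */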
 1251 
 1252 static struct md_s *
 1253 mdfind(int unit)
 1254 {
 1255         struct md_s *sc;
 1256 
 1257         LIST_FOREACH(sc, &md_softc_list, list) {
 1258                 if (sc->unit == unit)
 1259                         break;
 1260         }
 1261         return (sc);
 1262 }
 1263 
 1264 static struct md_s *
 1265 mdnew(int unit, int *errp, enum md_types type)
 1266 {
 1267         struct md_s *sc;
 1268         int error;
 1269 
 1270         *errp = 0;
 1271         if (unit == -1)
 1272                 unit = alloc_unr(md_uh);
 1273         else
 1274                 unit = alloc_unr_specific(md_uh, unit);
 1275 
 1276         if (unit == -1) {
 1277                 *errp = EBUSY;
 1278                 return (NULL);
 1279         }
 1280 
 1281         sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
 1282         sc->type = type;
 1283         bioq_init(&sc->bio_queue);
 1284         mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
 1285         sc->unit = unit;
 1286         sprintf(sc->name, "md%d", unit);
 1287         LIST_INSERT_HEAD(&md_softc_list, sc, list);
  1288         error = kproc_create(md_kthread, sc, &sc->procp, 0, 0, "%s", sc->name);
 1289         if (error == 0)
 1290                 return (sc);
 1291         LIST_REMOVE(sc, list);
 1292         mtx_destroy(&sc->queue_mtx);
 1293         free_unr(md_uh, sc->unit);
 1294         free(sc, M_MD);
 1295         *errp = error;
 1296         return (NULL);
 1297 }
 1298 
 1299 static void
 1300 mdinit(struct md_s *sc)
 1301 {
 1302         struct g_geom *gp;
 1303         struct g_provider *pp;
 1304 
 1305         g_topology_lock();
 1306         gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
 1307         gp->softc = sc;
 1308         pp = g_new_providerf(gp, "md%d", sc->unit);
 1309         devstat_remove_entry(pp->stat);
 1310         pp->stat = NULL;
 1311         pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 1312         pp->mediasize = sc->mediasize;
 1313         pp->sectorsize = sc->sectorsize;
 1314         switch (sc->type) {
 1315         case MD_MALLOC:
 1316         case MD_VNODE:
 1317         case MD_SWAP:
 1318                 pp->flags |= G_PF_ACCEPT_UNMAPPED;
 1319                 break;
 1320         case MD_PRELOAD:
 1321         case MD_NULL:
 1322                 break;
 1323         }
 1324         sc->gp = gp;
 1325         sc->pp = pp;
 1326         sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
 1327             DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 1328         sc->devstat->id = pp;
 1329         g_error_provider(pp, 0);
 1330         g_topology_unlock();
 1331 }
 1332 
 1333 static int
 1334 mdcreate_malloc(struct md_s *sc, struct md_req *mdr)
 1335 {
 1336         uintptr_t sp;
 1337         int error;
 1338         off_t u;
 1339 
 1340         error = 0;
 1341         if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
 1342                 return (EINVAL);
 1343         if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize))
 1344                 return (EINVAL);
 1345         /* Compression doesn't make sense if we have reserved space */
 1346         if (mdr->md_options & MD_RESERVE)
 1347                 mdr->md_options &= ~MD_COMPRESS;
 1348         if (mdr->md_fwsectors != 0)
 1349                 sc->fwsectors = mdr->md_fwsectors;
 1350         if (mdr->md_fwheads != 0)
 1351                 sc->fwheads = mdr->md_fwheads;
 1352         sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE);
 1353         sc->indir = dimension(sc->mediasize / sc->sectorsize);
 1354         sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
 1355             0x1ff, 0);
 1356         if (mdr->md_options & MD_RESERVE) {
 1357                 off_t nsectors;
 1358 
 1359                 nsectors = sc->mediasize / sc->sectorsize;
 1360                 for (u = 0; u < nsectors; u++) {
 1361                         sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
 1362                             M_WAITOK : M_NOWAIT) | M_ZERO);
 1363                         if (sp != 0)
 1364                                 error = s_write(sc->indir, u, sp);
 1365                         else
 1366                                 error = ENOMEM;
 1367                         if (error != 0)
 1368                                 break;
 1369                 }
 1370         }
 1371         return (error);
 1372 }
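
      /*
       * Each malloc-backed sector is an element of a per-device UMA
       * zone sized to the sector size.  With MD_RESERVE every sector is
       * allocated and zeroed up front, so later writes cannot fail with
       * ENOMEM; that is also why MD_COMPRESS is cleared above, since
       * preallocation defeats the free-the-all-zero-sectors trick that
       * compression relies on.
       */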
 1373 
 1374 static int
 1375 mdsetcred(struct md_s *sc, struct ucred *cred)
 1376 {
 1377         char *tmpbuf;
 1378         int error = 0;
 1379 
 1380         /*
 1381          * Set the credentials in our softc.
 1382          */
 1383 
 1384         if (sc->cred)
 1385                 crfree(sc->cred);
 1386         sc->cred = crhold(cred);
 1387 
 1388         /*
 1389          * XXX: Horrible kludge to establish credentials for NFS.
 1390          */
 1391 
 1392         if (sc->vnode) {
 1393                 struct uio auio;
 1394                 struct iovec aiov;
 1395 
 1396                 tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
 1397                 bzero(&auio, sizeof(auio));
 1398 
 1399                 aiov.iov_base = tmpbuf;
 1400                 aiov.iov_len = sc->sectorsize;
 1401                 auio.uio_iov = &aiov;
 1402                 auio.uio_iovcnt = 1;
 1403                 auio.uio_offset = 0;
 1404                 auio.uio_rw = UIO_READ;
 1405                 auio.uio_segflg = UIO_SYSSPACE;
 1406                 auio.uio_resid = aiov.iov_len;
 1407                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 1408                 error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
 1409                 VOP_UNLOCK(sc->vnode);
 1410                 free(tmpbuf, M_TEMP);
 1411         }
 1412         return (error);
 1413 }
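
      /*
       * The single-sector VOP_READ above deliberately exercises the new
       * credentials against the backing store once, so a credential
       * problem (NFS in particular, per the kludge note) shows up here
       * at configuration time instead of deep inside the I/O path.
       */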
 1414 
 1415 static int
 1416 mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1417 {
 1418         struct vattr vattr;
 1419         struct nameidata nd;
 1420         char *fname;
 1421         int error, flags;
 1422 
 1423         fname = mdr->md_file;
 1424         if (mdr->md_file_seg == UIO_USERSPACE) {
 1425                 error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
 1426                 if (error != 0)
 1427                         return (error);
 1428         } else if (mdr->md_file_seg == UIO_SYSSPACE)
 1429                 strlcpy(sc->file, fname, sizeof(sc->file));
 1430         else
 1431                 return (EDOOFUS);
 1432 
 1433         /*
 1434          * If the user specified that this is a read-only device, don't
 1435          * set the FWRITE mask before trying to open the backing store.
 1436          */
 1437         flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) |
 1438             ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0);
 1439         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
 1440         error = vn_open(&nd, &flags, 0, NULL);
 1441         if (error != 0)
 1442                 return (error);
 1443         NDFREE(&nd, NDF_ONLY_PNBUF);
 1444         if (nd.ni_vp->v_type != VREG) {
 1445                 error = EINVAL;
 1446                 goto bad;
 1447         }
 1448         error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
 1449         if (error != 0)
 1450                 goto bad;
 1451         if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
 1452                 vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
 1453                 if (VN_IS_DOOMED(nd.ni_vp)) {
 1454                         /* Forced unmount. */
 1455                         error = EBADF;
 1456                         goto bad;
 1457                 }
 1458         }
 1459         nd.ni_vp->v_vflag |= VV_MD;
 1460         VOP_UNLOCK(nd.ni_vp);
 1461 
 1462         if (mdr->md_fwsectors != 0)
 1463                 sc->fwsectors = mdr->md_fwsectors;
 1464         if (mdr->md_fwheads != 0)
 1465                 sc->fwheads = mdr->md_fwheads;
 1466         snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju",
 1467             (uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid);
 1468         sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE |
 1469             MD_VERIFY);
 1470         if (!(flags & FWRITE))
 1471                 sc->flags |= MD_READONLY;
 1472         sc->vnode = nd.ni_vp;
 1473 
 1474         error = mdsetcred(sc, td->td_ucred);
 1475         if (error != 0) {
 1476                 sc->vnode = NULL;
 1477                 vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
 1478                 nd.ni_vp->v_vflag &= ~VV_MD;
 1479                 goto bad;
 1480         }
 1481         return (0);
 1482 bad:
 1483         VOP_UNLOCK(nd.ni_vp);
 1484         (void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
 1485         return (error);
 1486 }
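
      /*
       * Open-flag summary: FREAD is always requested, FWRITE unless
       * MD_READONLY was given, and O_VERIFY when MD_VERIFY was.  Note
       * that MD_READONLY is recorded from the final open flags rather
       * than from the option mask, only regular files (VREG) are
       * accepted, and the vnode is tagged VV_MD to mark it as backing
       * a memory disk.
       */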
 1487 
 1488 static void
 1489 g_md_providergone(struct g_provider *pp)
 1490 {
 1491         struct md_s *sc = pp->geom->softc;
 1492 
 1493         mtx_lock(&sc->queue_mtx);
 1494         sc->flags |= MD_PROVIDERGONE;
 1495         wakeup(&sc->flags);
 1496         mtx_unlock(&sc->queue_mtx);
 1497 }
 1498 
 1499 static int
 1500 mddestroy(struct md_s *sc, struct thread *td)
 1501 {
 1502 
 1503         if (sc->gp) {
 1504                 g_topology_lock();
 1505                 g_wither_geom(sc->gp, ENXIO);
 1506                 g_topology_unlock();
 1507 
 1508                 mtx_lock(&sc->queue_mtx);
 1509                 while (!(sc->flags & MD_PROVIDERGONE))
 1510                         msleep(&sc->flags, &sc->queue_mtx, PRIBIO, "mddestroy", 0);
 1511                 mtx_unlock(&sc->queue_mtx);
 1512         }
 1513         if (sc->devstat) {
 1514                 devstat_remove_entry(sc->devstat);
 1515                 sc->devstat = NULL;
 1516         }
 1517         mtx_lock(&sc->queue_mtx);
 1518         sc->flags |= MD_SHUTDOWN;
 1519         wakeup(sc);
 1520         while (!(sc->flags & MD_EXITING))
 1521                 msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
 1522         mtx_unlock(&sc->queue_mtx);
 1523         mtx_destroy(&sc->queue_mtx);
 1524         if (sc->vnode != NULL) {
 1525                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 1526                 sc->vnode->v_vflag &= ~VV_MD;
 1527                 VOP_UNLOCK(sc->vnode);
 1528                 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
 1529                     FREAD : (FREAD|FWRITE), sc->cred, td);
 1530         }
 1531         if (sc->cred != NULL)
 1532                 crfree(sc->cred);
 1533         if (sc->object != NULL)
 1534                 vm_object_deallocate(sc->object);
 1535         if (sc->indir)
 1536                 destroy_indir(sc, sc->indir);
 1537         if (sc->uma)
 1538                 uma_zdestroy(sc->uma);
 1539 
 1540         LIST_REMOVE(sc, list);
 1541         free_unr(md_uh, sc->unit);
 1542         free(sc, M_MD);
 1543         return (0);
 1544 }
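
      /*
       * Teardown order matters: wither the geom and sleep until
       * g_md_providergone() confirms no new bios can arrive, retire the
       * devstat entry, tell the worker thread to exit via MD_SHUTDOWN
       * and wait for MD_EXITING, and only then release the backing
       * store (vnode, swap object, indirect tables, UMA zone) and
       * recycle the unit number.
       */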
 1545 
 1546 static int
 1547 mdresize(struct md_s *sc, struct md_req *mdr)
 1548 {
 1549         int error, res;
 1550         vm_pindex_t oldpages, newpages;
 1551 
 1552         switch (sc->type) {
 1553         case MD_VNODE:
 1554         case MD_NULL:
 1555                 break;
 1556         case MD_SWAP:
 1557                 if (mdr->md_mediasize <= 0 ||
 1558                     (mdr->md_mediasize % PAGE_SIZE) != 0)
 1559                         return (EDOM);
 1560                 oldpages = OFF_TO_IDX(sc->mediasize);
 1561                 newpages = OFF_TO_IDX(mdr->md_mediasize);
 1562                 if (newpages < oldpages) {
 1563                         VM_OBJECT_WLOCK(sc->object);
 1564                         vm_object_page_remove(sc->object, newpages, 0, 0);
 1565                         swap_release_by_cred(IDX_TO_OFF(oldpages -
 1566                             newpages), sc->cred);
 1567                         sc->object->charge = IDX_TO_OFF(newpages);
 1568                         sc->object->size = newpages;
 1569                         VM_OBJECT_WUNLOCK(sc->object);
 1570                 } else if (newpages > oldpages) {
 1571                         res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
 1572                             oldpages), sc->cred);
 1573                         if (!res)
 1574                                 return (ENOMEM);
 1575                         if ((mdr->md_options & MD_RESERVE) ||
 1576                             (sc->flags & MD_RESERVE)) {
 1577                                 error = swap_pager_reserve(sc->object,
 1578                                     oldpages, newpages - oldpages);
 1579                                 if (error < 0) {
 1580                                         swap_release_by_cred(
 1581                                             IDX_TO_OFF(newpages - oldpages),
 1582                                             sc->cred);
 1583                                         return (EDOM);
 1584                                 }
 1585                         }
 1586                         VM_OBJECT_WLOCK(sc->object);
 1587                         sc->object->charge = IDX_TO_OFF(newpages);
 1588                         sc->object->size = newpages;
 1589                         VM_OBJECT_WUNLOCK(sc->object);
 1590                 }
 1591                 break;
 1592         default:
 1593                 return (EOPNOTSUPP);
 1594         }
 1595 
 1596         sc->mediasize = mdr->md_mediasize;
 1597         g_topology_lock();
 1598         g_resize_provider(sc->pp, sc->mediasize);
 1599         g_topology_unlock();
 1600         return (0);
 1601 }
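
      /*
       * For swap-backed devices mdresize() keeps the swap accounting
       * honest: shrinking removes the now-unreachable pages and returns
       * their charge to the owning credential, while growing reserves
       * the additional charge first and, when MD_RESERVE is in effect,
       * preallocates the new swap space as well.
       */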
 1602 
 1603 static int
 1604 mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1605 {
 1606         vm_ooffset_t npage;
 1607         int error;
 1608 
 1609         /*
 1610          * Range check.  Disallow negative sizes and sizes that are
 1611          * not a multiple of the page size.
 1612          */
 1613         if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 1614                 return (EDOM);
 1615 
 1616         /*
 1617          * Allocate an OBJT_SWAP object.
 1618          *
 1619          * Note that the size is truncated to a whole number of pages.
 1620          */
 1621 
 1622         if ((mdr->md_options & MD_VERIFY) != 0)
 1623                 return (EINVAL);
 1624         npage = mdr->md_mediasize / PAGE_SIZE;
 1625         if (mdr->md_fwsectors != 0)
 1626                 sc->fwsectors = mdr->md_fwsectors;
 1627         if (mdr->md_fwheads != 0)
 1628                 sc->fwheads = mdr->md_fwheads;
 1629         sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
 1630             VM_PROT_DEFAULT, 0, td->td_ucred);
 1631         if (sc->object == NULL)
 1632                 return (ENOMEM);
 1633         sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE);
 1634         if (mdr->md_options & MD_RESERVE) {
 1635                 if (swap_pager_reserve(sc->object, 0, npage) < 0) {
 1636                         error = EDOM;
 1637                         goto finish;
 1638                 }
 1639         }
 1640         error = mdsetcred(sc, td->td_ucred);
 1641  finish:
 1642         if (error != 0) {
 1643                 vm_object_deallocate(sc->object);
 1644                 sc->object = NULL;
 1645         }
 1646         return (error);
 1647 }
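
      /*
       * The swap object is charged against the creating credential via
       * vm_pager_allocate(..., td->td_ucred); with MD_RESERVE the
       * backing swap is preallocated up front so the device cannot fail
       * later for want of swap space.
       */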
 1648 
 1649 static int
 1650 mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1651 {
 1652 
 1653         /*
 1654          * Range check.  Disallow negative sizes and sizes that are
 1655          * not a multiple of the page size.
 1656          */
 1657         if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 1658                 return (EDOM);
 1659 
 1660         return (0);
 1661 }
 1662 
 1663 static int
 1664 kern_mdattach_locked(struct thread *td, struct md_req *mdr)
 1665 {
 1666         struct md_s *sc;
 1667         unsigned sectsize;
 1668         int error, i;
 1669 
 1670         sx_assert(&md_sx, SA_XLOCKED);
 1671 
 1672         switch (mdr->md_type) {
 1673         case MD_MALLOC:
 1674         case MD_PRELOAD:
 1675         case MD_VNODE:
 1676         case MD_SWAP:
 1677         case MD_NULL:
 1678                 break;
 1679         default:
 1680                 return (EINVAL);
 1681         }
 1682         if (mdr->md_sectorsize == 0)
 1683                 sectsize = DEV_BSIZE;
 1684         else
 1685                 sectsize = mdr->md_sectorsize;
 1686         if (sectsize > maxphys || mdr->md_mediasize < sectsize)
 1687                 return (EINVAL);
 1688         if (mdr->md_options & MD_AUTOUNIT)
 1689                 sc = mdnew(-1, &error, mdr->md_type);
 1690         else {
 1691                 if (mdr->md_unit > INT_MAX)
 1692                         return (EINVAL);
 1693                 sc = mdnew(mdr->md_unit, &error, mdr->md_type);
 1694         }
 1695         if (sc == NULL)
 1696                 return (error);
 1697         if (mdr->md_label != NULL)
 1698                 error = copyinstr(mdr->md_label, sc->label,
 1699                     sizeof(sc->label), NULL);
 1700         if (error != 0)
 1701                 goto err_after_new;
 1702         if (mdr->md_options & MD_AUTOUNIT)
 1703                 mdr->md_unit = sc->unit;
 1704         sc->mediasize = mdr->md_mediasize;
 1705         sc->sectorsize = sectsize;
 1706         error = EDOOFUS;
 1707         switch (sc->type) {
 1708         case MD_MALLOC:
 1709                 sc->start = mdstart_malloc;
 1710                 error = mdcreate_malloc(sc, mdr);
 1711                 break;
 1712         case MD_PRELOAD:
 1713                 /*
 1714                  * We disallow attaching preloaded memory disks via
 1715                  * ioctl. Preloaded memory disks are automatically
 1716                  * attached in g_md_init().
 1717                  */
 1718                 error = EOPNOTSUPP;
 1719                 break;
 1720         case MD_VNODE:
 1721                 sc->start = mdstart_vnode;
 1722                 error = mdcreate_vnode(sc, mdr, td);
 1723                 break;
 1724         case MD_SWAP:
 1725                 sc->start = mdstart_swap;
 1726                 error = mdcreate_swap(sc, mdr, td);
 1727                 break;
 1728         case MD_NULL:
 1729                 sc->start = mdstart_null;
 1730                 error = mdcreate_null(sc, mdr, td);
 1731                 break;
 1732         }
 1733 err_after_new:
 1734         if (error != 0) {
 1735                 mddestroy(sc, td);
 1736                 return (error);
 1737         }
 1738 
 1739         /* Prune off any residual fractional sector */
 1740         i = sc->mediasize % sc->sectorsize;
 1741         sc->mediasize -= i;
 1742 
 1743         mdinit(sc);
 1744         return (0);
 1745 }
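
      /*
       * Attach flow: validate the type, default the sector size to
       * DEV_BSIZE, allocate the softc (honouring MD_AUTOUNIT), dispatch
       * to the per-type create routine, trim any trailing fractional
       * sector, and publish the provider via mdinit().  EDOOFUS is
       * preloaded into error so an unhandled type in the switch cannot
       * silently succeed.
       */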
 1746 
 1747 static int
 1748 kern_mdattach(struct thread *td, struct md_req *mdr)
 1749 {
 1750         int error;
 1751 
 1752         sx_xlock(&md_sx);
 1753         error = kern_mdattach_locked(td, mdr);
 1754         sx_xunlock(&md_sx);
 1755         return (error);
 1756 }
 1757 
 1758 static int
 1759 kern_mddetach_locked(struct thread *td, struct md_req *mdr)
 1760 {
 1761         struct md_s *sc;
 1762 
 1763         sx_assert(&md_sx, SA_XLOCKED);
 1764 
 1765         if (mdr->md_mediasize != 0 ||
 1766             (mdr->md_options & ~MD_FORCE) != 0)
 1767                 return (EINVAL);
 1768 
 1769         sc = mdfind(mdr->md_unit);
 1770         if (sc == NULL)
 1771                 return (ENOENT);
 1772         if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
 1773             !(mdr->md_options & MD_FORCE))
 1774                 return (EBUSY);
 1775         return (mddestroy(sc, td));
 1776 }
 1777 
 1778 static int
 1779 kern_mddetach(struct thread *td, struct md_req *mdr)
 1780 {
 1781         int error;
 1782 
 1783         sx_xlock(&md_sx);
 1784         error = kern_mddetach_locked(td, mdr);
 1785         sx_xunlock(&md_sx);
 1786         return (error);
 1787 }
 1788 
 1789 static int
 1790 kern_mdresize_locked(struct md_req *mdr)
 1791 {
 1792         struct md_s *sc;
 1793 
 1794         sx_assert(&md_sx, SA_XLOCKED);
 1795 
 1796         if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
 1797                 return (EINVAL);
 1798 
 1799         sc = mdfind(mdr->md_unit);
 1800         if (sc == NULL)
 1801                 return (ENOENT);
 1802         if (mdr->md_mediasize < sc->sectorsize)
 1803                 return (EINVAL);
 1804         if (mdr->md_mediasize < sc->mediasize &&
 1805             !(sc->flags & MD_FORCE) &&
 1806             !(mdr->md_options & MD_FORCE))
 1807                 return (EBUSY);
 1808         return (mdresize(sc, mdr));
 1809 }
 1810 
 1811 static int
 1812 kern_mdresize(struct md_req *mdr)
 1813 {
 1814         int error;
 1815 
 1816         sx_xlock(&md_sx);
 1817         error = kern_mdresize_locked(mdr);
 1818         sx_xunlock(&md_sx);
 1819         return (error);
 1820 }
 1821 
 1822 static int
 1823 kern_mdquery_locked(struct md_req *mdr)
 1824 {
 1825         struct md_s *sc;
 1826         int error;
 1827 
 1828         sx_assert(&md_sx, SA_XLOCKED);
 1829 
 1830         sc = mdfind(mdr->md_unit);
 1831         if (sc == NULL)
 1832                 return (ENOENT);
 1833         mdr->md_type = sc->type;
 1834         mdr->md_options = sc->flags;
 1835         mdr->md_mediasize = sc->mediasize;
 1836         mdr->md_sectorsize = sc->sectorsize;
 1837         error = 0;
 1838         if (mdr->md_label != NULL) {
 1839                 error = copyout(sc->label, mdr->md_label,
 1840                     strlen(sc->label) + 1);
 1841                 if (error != 0)
 1842                         return (error);
 1843         }
 1844         if (sc->type == MD_VNODE ||
 1845             (sc->type == MD_PRELOAD && mdr->md_file != NULL))
 1846                 error = copyout(sc->file, mdr->md_file,
 1847                     strlen(sc->file) + 1);
 1848         return (error);
 1849 }
 1850 
 1851 static int
 1852 kern_mdquery(struct md_req *mdr)
 1853 {
 1854         int error;
 1855 
 1856         sx_xlock(&md_sx);
 1857         error = kern_mdquery_locked(mdr);
 1858         sx_xunlock(&md_sx);
 1859         return (error);
 1860 }
 1861 
 1862 /* Copy members that are not userspace pointers. */
 1863 #define MD_IOCTL2REQ(mdio, mdr) do {                                    \
 1864         (mdr)->md_unit = (mdio)->md_unit;                               \
 1865         (mdr)->md_type = (mdio)->md_type;                               \
 1866         (mdr)->md_mediasize = (mdio)->md_mediasize;                     \
 1867         (mdr)->md_sectorsize = (mdio)->md_sectorsize;                   \
 1868         (mdr)->md_options = (mdio)->md_options;                         \
 1869         (mdr)->md_fwheads = (mdio)->md_fwheads;                         \
 1870         (mdr)->md_fwsectors = (mdio)->md_fwsectors;                     \
 1871         (mdr)->md_units = &(mdio)->md_pad[0];                           \
 1872         (mdr)->md_units_nitems = nitems((mdio)->md_pad);                \
 1873 } while (0)
 1874 
 1875 /* Copy members that might have been updated. */
 1876 #define MD_REQ2IOCTL(mdr, mdio) do {                                    \
 1877         (mdio)->md_unit = (mdr)->md_unit;                               \
 1878         (mdio)->md_type = (mdr)->md_type;                               \
 1879         (mdio)->md_mediasize = (mdr)->md_mediasize;                     \
 1880         (mdio)->md_sectorsize = (mdr)->md_sectorsize;                   \
 1881         (mdio)->md_options = (mdr)->md_options;                         \
 1882         (mdio)->md_fwheads = (mdr)->md_fwheads;                         \
 1883         (mdio)->md_fwsectors = (mdr)->md_fwsectors;                     \
 1884 } while (0)
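
      /*
       * Two macros because the same field-by-field copies are needed
       * for both the native struct md_ioctl and, under COMPAT_FREEBSD32,
       * the 32-bit struct md_ioctl32; only the pointer members differ,
       * and those are handled explicitly per command below.
       */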
 1885 
 1886 static int
 1887 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
 1888     struct thread *td)
 1889 {
 1890         struct md_req mdr;
 1891         int error;
 1892 
 1893         if (md_debug)
 1894                 printf("mdctlioctl(%s %lx %p %x %p)\n",
 1895                         devtoname(dev), cmd, addr, flags, td);
 1896 
 1897         bzero(&mdr, sizeof(mdr));
 1898         switch (cmd) {
 1899         case MDIOCATTACH:
 1900         case MDIOCDETACH:
 1901         case MDIOCRESIZE:
 1902         case MDIOCQUERY: {
 1903                 struct md_ioctl *mdio = (struct md_ioctl *)addr;
 1904                 if (mdio->md_version != MDIOVERSION)
 1905                         return (EINVAL);
 1906                 MD_IOCTL2REQ(mdio, &mdr);
 1907                 mdr.md_file = mdio->md_file;
 1908                 mdr.md_file_seg = UIO_USERSPACE;
 1909                 /* If the file is adjacent to the md_ioctl, it is in kernel space. */
 1910                 if ((void *)mdio->md_file == (void *)(mdio + 1))
 1911                         mdr.md_file_seg = UIO_SYSSPACE;
 1912                 mdr.md_label = mdio->md_label;
 1913                 break;
 1914         }
 1915 #ifdef COMPAT_FREEBSD32
 1916         case MDIOCATTACH_32:
 1917         case MDIOCDETACH_32:
 1918         case MDIOCRESIZE_32:
 1919         case MDIOCQUERY_32: {
 1920                 struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 1921                 if (mdio->md_version != MDIOVERSION)
 1922                         return (EINVAL);
 1923                 MD_IOCTL2REQ(mdio, &mdr);
 1924                 mdr.md_file = (void *)(uintptr_t)mdio->md_file;
 1925                 mdr.md_file_seg = UIO_USERSPACE;
 1926                 mdr.md_label = (void *)(uintptr_t)mdio->md_label;
 1927                 break;
 1928         }
 1929 #endif
 1930         default:
 1931                 /* Fall through to handler switch. */
 1932                 break;
 1933         }
 1934 
 1935         error = 0;
 1936         switch (cmd) {
 1937         case MDIOCATTACH:
 1938 #ifdef COMPAT_FREEBSD32
 1939         case MDIOCATTACH_32:
 1940 #endif
 1941                 error = kern_mdattach(td, &mdr);
 1942                 break;
 1943         case MDIOCDETACH:
 1944 #ifdef COMPAT_FREEBSD32
 1945         case MDIOCDETACH_32:
 1946 #endif
 1947                 error = kern_mddetach(td, &mdr);
 1948                 break;
 1949         case MDIOCRESIZE:
 1950 #ifdef COMPAT_FREEBSD32
 1951         case MDIOCRESIZE_32:
 1952 #endif
 1953                 error = kern_mdresize(&mdr);
 1954                 break;
 1955         case MDIOCQUERY:
 1956 #ifdef COMPAT_FREEBSD32
 1957         case MDIOCQUERY_32:
 1958 #endif
 1959                 error = kern_mdquery(&mdr);
 1960                 break;
 1961         default:
 1962                 error = ENOIOCTL;
 1963         }
 1964 
 1965         switch (cmd) {
 1966         case MDIOCATTACH:
 1967         case MDIOCQUERY: {
 1968                 struct md_ioctl *mdio = (struct md_ioctl *)addr;
 1969                 MD_REQ2IOCTL(&mdr, mdio);
 1970                 break;
 1971         }
 1972 #ifdef COMPAT_FREEBSD32
 1973         case MDIOCATTACH_32:
 1974         case MDIOCQUERY_32: {
 1975                 struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 1976                 MD_REQ2IOCTL(&mdr, mdio);
 1977                 break;
 1978         }
 1979 #endif
 1980         default:
 1981                 /* Other commands do not alter mdr. */
 1982                 break;
 1983         }
 1984 
 1985         return (error);
 1986 }
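
      /*
       * From userland the usual entry point is mdconfig(8), but the raw
       * ioctl interface is small.  A minimal sketch (error handling
       * elided) that attaches a 64 MB swap-backed disk on an
       * automatically chosen unit:
       *
       *      #include <sys/param.h>
       *      #include <sys/ioctl.h>
       *      #include <sys/mdioctl.h>
       *      #include <fcntl.h>
       *      #include <stdio.h>
       *      #include <string.h>
       *
       *      struct md_ioctl mdio;
       *      int fd;
       *
       *      memset(&mdio, 0, sizeof(mdio));
       *      mdio.md_version = MDIOVERSION;
       *      mdio.md_type = MD_SWAP;
       *      mdio.md_mediasize = 64 * 1024 * 1024;
       *      mdio.md_options = MD_AUTOUNIT;
       *      fd = open("/dev/" MDCTL_NAME, O_RDWR);
       *      if (ioctl(fd, MDIOCATTACH, &mdio) == 0)
       *              printf("attached md%d\n", mdio.md_unit);
       */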
 1987 
 1988 static void
 1989 md_preloaded(u_char *image, size_t length, const char *name)
 1990 {
 1991         struct md_s *sc;
 1992         int error;
 1993 
 1994         sc = mdnew(-1, &error, MD_PRELOAD);
 1995         if (sc == NULL)
 1996                 return;
 1997         sc->mediasize = length;
 1998         sc->sectorsize = DEV_BSIZE;
 1999         sc->pl_ptr = image;
 2000         sc->pl_len = length;
 2001         sc->start = mdstart_preload;
 2002         if (name != NULL)
 2003                 strlcpy(sc->file, name, sizeof(sc->file));
 2004 #ifdef MD_ROOT
 2005         if (sc->unit == 0) {
 2006 #ifndef ROOTDEVNAME
 2007                 rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
 2008 #endif
 2009 #ifdef MD_ROOT_READONLY
 2010                 sc->flags |= MD_READONLY;
 2011 #endif
 2012         }
 2013 #endif
 2014         mdinit(sc);
 2015         if (name != NULL) {
 2016                 printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
 2017                     MD_NAME, sc->unit, name, length, image);
 2018         } else {
 2019                 printf("%s%d: Embedded image %zd bytes at %p\n",
 2020                     MD_NAME, sc->unit, length, image);
 2021         }
 2022 }
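
      /*
       * Preloaded images are handed the lowest free units in discovery
       * order.  With options MD_ROOT the image on unit 0 additionally
       * becomes the root filesystem candidate (unless ROOTDEVNAME is
       * hard-coded into the kernel) and, with MD_ROOT_READONLY, is
       * attached read-only.
       */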
 2023 
 2024 static void
 2025 g_md_init(struct g_class *mp __unused)
 2026 {
 2027         caddr_t mod;
 2028         u_char *ptr, *name, *type;
 2029         unsigned len;
 2030         int i;
 2031 
 2032         /* Figure out log2(NINDIR). */
 2033         for (i = NINDIR, nshift = -1; i; nshift++)
 2034                 i >>= 1;
 2035 
 2036         mod = NULL;
 2037         sx_init(&md_sx, "MD config lock");
 2038         g_topology_unlock();
 2039         md_uh = new_unrhdr(0, INT_MAX, NULL);
 2040 #ifdef MD_ROOT
 2041         if (mfs_root_size != 0) {
 2042                 sx_xlock(&md_sx);
 2043 #ifdef MD_ROOT_MEM
 2044                 md_preloaded(mfs_root, mfs_root_size, NULL);
 2045 #else
 2046                 md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
 2047                     NULL);
 2048 #endif
 2049                 sx_xunlock(&md_sx);
 2050         }
 2051 #endif
 2052         /* XXX: are preload_* static or do they need Giant? */
 2053         while ((mod = preload_search_next_name(mod)) != NULL) {
 2054                 name = (char *)preload_search_info(mod, MODINFO_NAME);
 2055                 if (name == NULL)
 2056                         continue;
 2057                 type = (char *)preload_search_info(mod, MODINFO_TYPE);
 2058                 if (type == NULL)
 2059                         continue;
 2060                 if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
 2061                         continue;
 2062                 ptr = preload_fetch_addr(mod);
 2063                 len = preload_fetch_size(mod);
 2064                 if (ptr != NULL && len != 0) {
 2065                         sx_xlock(&md_sx);
 2066                         md_preloaded(ptr, len, name);
 2067                         sx_xunlock(&md_sx);
 2068                 }
 2069         }
 2070         md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10);
 2071         status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
 2072             0600, MDCTL_NAME);
 2073         g_topology_lock();
 2074 }
 2075 
 2076 static void
 2077 g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
 2078     struct g_consumer *cp __unused, struct g_provider *pp)
 2079 {
 2080         struct md_s *mp;
 2081         char *type;
 2082 
 2083         mp = gp->softc;
 2084         if (mp == NULL)
 2085                 return;
 2086 
 2087         switch (mp->type) {
 2088         case MD_MALLOC:
 2089                 type = "malloc";
 2090                 break;
 2091         case MD_PRELOAD:
 2092                 type = "preload";
 2093                 break;
 2094         case MD_VNODE:
 2095                 type = "vnode";
 2096                 break;
 2097         case MD_SWAP:
 2098                 type = "swap";
 2099                 break;
 2100         case MD_NULL:
 2101                 type = "null";
 2102                 break;
 2103         default:
 2104                 type = "unknown";
 2105                 break;
 2106         }
 2107 
 2108         if (pp != NULL) {
 2109                 if (indent == NULL) {
 2110                         sbuf_printf(sb, " u %d", mp->unit);
 2111                         sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
 2112                         sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
 2113                         sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
 2114                         sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
 2115                         sbuf_printf(sb, " t %s", type);
 2116                         if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 2117                             (mp->type == MD_PRELOAD && mp->file[0] != '\0'))
 2118                                 sbuf_printf(sb, " file %s", mp->file);
 2119                         sbuf_printf(sb, " label %s", mp->label);
 2120                 } else {
 2121                         sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
 2122                             mp->unit);
 2123                         sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
 2124                             indent, (uintmax_t) mp->sectorsize);
 2125                         sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
 2126                             indent, (uintmax_t) mp->fwheads);
 2127                         sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
 2128                             indent, (uintmax_t) mp->fwsectors);
 2129                         if (mp->ident[0] != '\0') {
 2130                                 sbuf_printf(sb, "%s<ident>", indent);
 2131                                 g_conf_printf_escaped(sb, "%s", mp->ident);
 2132                                 sbuf_printf(sb, "</ident>\n");
 2133                         }
 2134                         sbuf_printf(sb, "%s<length>%ju</length>\n",
 2135                             indent, (uintmax_t) mp->mediasize);
 2136                         sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
 2137                             (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
 2138                         sbuf_printf(sb, "%s<access>%s</access>\n", indent,
 2139                             (mp->flags & MD_READONLY) == 0 ? "read-write":
 2140                             "read-only");
 2141                         sbuf_printf(sb, "%s<type>%s</type>\n", indent,
 2142                             type);
 2143                         if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 2144                             (mp->type == MD_PRELOAD && mp->file[0] != '\0')) {
 2145                                 sbuf_printf(sb, "%s<file>", indent);
 2146                                 g_conf_printf_escaped(sb, "%s", mp->file);
 2147                                 sbuf_printf(sb, "</file>\n");
 2148                         }
 2149                         if (mp->type == MD_VNODE)
 2150                                 sbuf_printf(sb, "%s<cache>%s</cache>\n", indent,
 2151                                     (mp->flags & MD_CACHE) == 0 ? "off": "on");
 2152                         sbuf_printf(sb, "%s<label>", indent);
 2153                         g_conf_printf_escaped(sb, "%s", mp->label);
 2154                         sbuf_printf(sb, "</label>\n");
 2155                 }
 2156         }
 2157 }
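
      /*
       * The two output shapes above follow the usual GEOM convention:
       * a NULL indent requests the terse one-line conftxt form, while a
       * non-NULL indent produces the indented XML that appears in the
       * confxml tree consumed by tools such as mdconfig(8) and geom(8).
       */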
 2158 
 2159 static void
 2160 g_md_fini(struct g_class *mp __unused)
 2161 {
 2162 
 2163         sx_destroy(&md_sx);
 2164         if (status_dev != NULL)
 2165                 destroy_dev(status_dev);
 2166         uma_zdestroy(md_pbuf_zone);
 2167         delete_unrhdr(md_uh);
 2168 }
