FreeBSD/Linux Kernel Cross Reference
sys/dev/md/md.c


    1 /*-
    2  * SPDX-License-Identifier: (Beerware AND BSD-3-Clause)
    3  *
    4  * ----------------------------------------------------------------------------
    5  * "THE BEER-WARE LICENSE" (Revision 42):
    6  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
    7  * can do whatever you want with this stuff. If we meet some day, and you think
    8  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
    9  * ----------------------------------------------------------------------------
   10  *
   11  * $FreeBSD$
   12  *
   13  */
   14 
   15 /*-
   16  * The following functions are based on the vn(4) driver: mdstart_swap(),
   17  * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
   18  * and as such under the following copyright:
   19  *
   20  * Copyright (c) 1988 University of Utah.
   21  * Copyright (c) 1990, 1993
   22  *      The Regents of the University of California.  All rights reserved.
   23  * Copyright (c) 2013 The FreeBSD Foundation
   24  * All rights reserved.
   25  *
   26  * This code is derived from software contributed to Berkeley by
   27  * the Systems Programming Group of the University of Utah Computer
   28  * Science Department.
   29  *
   30  * Portions of this software were developed by Konstantin Belousov
   31  * under sponsorship from the FreeBSD Foundation.
   32  *
   33  * Redistribution and use in source and binary forms, with or without
   34  * modification, are permitted provided that the following conditions
   35  * are met:
   36  * 1. Redistributions of source code must retain the above copyright
   37  *    notice, this list of conditions and the following disclaimer.
   38  * 2. Redistributions in binary form must reproduce the above copyright
   39  *    notice, this list of conditions and the following disclaimer in the
   40  *    documentation and/or other materials provided with the distribution.
   41  * 3. Neither the name of the University nor the names of its contributors
   42  *    may be used to endorse or promote products derived from this software
   43  *    without specific prior written permission.
   44  *
   45  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   55  * SUCH DAMAGE.
   56  *
   57  * from: Utah Hdr: vn.c 1.13 94/04/02
   58  *
   59  *      from: @(#)vn.c  8.6 (Berkeley) 4/1/94
   60  * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
   61  */
   62 
   63 #include "opt_rootdevname.h"
   64 #include "opt_geom.h"
   65 #include "opt_md.h"
   66 
   67 #include <sys/param.h>
   68 #include <sys/systm.h>
   69 #include <sys/bio.h>
   70 #include <sys/buf.h>
   71 #include <sys/conf.h>
   72 #include <sys/devicestat.h>
   73 #include <sys/fcntl.h>
   74 #include <sys/kernel.h>
   75 #include <sys/kthread.h>
   76 #include <sys/limits.h>
   77 #include <sys/linker.h>
   78 #include <sys/lock.h>
   79 #include <sys/malloc.h>
   80 #include <sys/mdioctl.h>
   81 #include <sys/mount.h>
   82 #include <sys/mutex.h>
   83 #include <sys/sx.h>
   84 #include <sys/namei.h>
   85 #include <sys/proc.h>
   86 #include <sys/queue.h>
   87 #include <sys/rwlock.h>
   88 #include <sys/sbuf.h>
   89 #include <sys/sched.h>
   90 #include <sys/sf_buf.h>
   91 #include <sys/sysctl.h>
   92 #include <sys/uio.h>
   93 #include <sys/unistd.h>
   94 #include <sys/vnode.h>
   95 #include <sys/disk.h>
   96 
   97 #include <geom/geom.h>
   98 #include <geom/geom_int.h>
   99 
  100 #include <vm/vm.h>
  101 #include <vm/vm_param.h>
  102 #include <vm/vm_object.h>
  103 #include <vm/vm_page.h>
  104 #include <vm/vm_pager.h>
  105 #include <vm/swap_pager.h>
  106 #include <vm/uma.h>
  107 
  108 #include <machine/bus.h>
  109 
  110 #define MD_MODVER 1
  111 
  112 #define MD_SHUTDOWN     0x10000         /* Tell worker thread to terminate. */
  113 #define MD_EXITING      0x20000         /* Worker thread is exiting. */
  114 #define MD_PROVIDERGONE 0x40000         /* Safe to free the softc */
  115 
  116 #ifndef MD_NSECT
  117 #define MD_NSECT (10000 * 2)
  118 #endif
  119 
  120 struct md_req {
  121         unsigned        md_unit;        /* unit number */
  122         enum md_types   md_type;        /* type of disk */
  123         off_t           md_mediasize;   /* size of disk in bytes */
  124         unsigned        md_sectorsize;  /* sectorsize */
  125         unsigned        md_options;     /* options */
  126         int             md_fwheads;     /* firmware heads */
  127         int             md_fwsectors;   /* firmware sectors */
  128         char            *md_file;       /* pathname of file to mount */
  129         enum uio_seg    md_file_seg;    /* location of md_file */
  130         char            *md_label;      /* label of the device (userspace) */
  131         int             *md_units;      /* pointer to units array (kernel) */
  132         size_t          md_units_nitems; /* items in md_units array */
  133 };
  134 
  135 #ifdef COMPAT_FREEBSD32
  136 struct md_ioctl32 {
  137         unsigned        md_version;
  138         unsigned        md_unit;
  139         enum md_types   md_type;
  140         uint32_t        md_file;
  141         off_t           md_mediasize;
  142         unsigned        md_sectorsize;
  143         unsigned        md_options;
  144         uint64_t        md_base;
  145         int             md_fwheads;
  146         int             md_fwsectors;
  147         uint32_t        md_label;
  148         int             md_pad[MDNPAD];
  149 } __attribute__((__packed__));
  150 CTASSERT((sizeof(struct md_ioctl32)) == 436);
  151 
  152 #define MDIOCATTACH_32  _IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32)
  153 #define MDIOCDETACH_32  _IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32)
  154 #define MDIOCQUERY_32   _IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32)
  155 #define MDIOCRESIZE_32  _IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32)
  156 #endif /* COMPAT_FREEBSD32 */
  157 
  158 static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
  159 static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");
  160 
  161 static int md_debug;
  162 SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
  163     "Enable md(4) debug messages");
  164 static int md_malloc_wait;
  165 SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
  166     "Allow malloc to wait for memory allocations");
  167 
  168 #if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
  169 #define MD_ROOT_FSTYPE  "ufs"
  170 #endif
  171 
  172 #if defined(MD_ROOT)
  173 /*
  174  * Preloaded image gets put here.
  175  */
  176 #if defined(MD_ROOT_SIZE)
  177 /*
  178  * We put the mfs_root symbol into the oldmfs section of the kernel object file.
  179  * Applications that patch the object with the image can determine
   180  * the size by looking at the oldmfs section size within the kernel.
  181  */
  182 u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
  183 const int mfs_root_size = sizeof(mfs_root);
  184 #elif defined(MD_ROOT_MEM)
   185 /* MD region already mapped into memory */
  186 u_char *mfs_root;
  187 int mfs_root_size;
  188 #else
  189 extern volatile u_char __weak_symbol mfs_root;
  190 extern volatile u_char __weak_symbol mfs_root_end;
  191 #define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
  192 #endif
  193 #endif
  194 
  195 static g_init_t g_md_init;
  196 static g_fini_t g_md_fini;
  197 static g_start_t g_md_start;
  198 static g_access_t g_md_access;
  199 static void g_md_dumpconf(struct sbuf *sb, const char *indent,
  200     struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);
  201 static g_provgone_t g_md_providergone;
  202 
  203 static struct cdev *status_dev = NULL;
  204 static struct sx md_sx;
  205 static struct unrhdr *md_uh;
  206 
  207 static d_ioctl_t mdctlioctl;
  208 
  209 static struct cdevsw mdctl_cdevsw = {
  210         .d_version =    D_VERSION,
  211         .d_ioctl =      mdctlioctl,
  212         .d_name =       MD_NAME,
  213 };
  214 
  215 struct g_class g_md_class = {
  216         .name = "MD",
  217         .version = G_VERSION,
  218         .init = g_md_init,
  219         .fini = g_md_fini,
  220         .start = g_md_start,
  221         .access = g_md_access,
  222         .dumpconf = g_md_dumpconf,
  223         .providergone = g_md_providergone,
  224 };
  225 
  226 DECLARE_GEOM_CLASS(g_md_class, g_md);
  227 MODULE_VERSION(geom_md, 0);
  228 
  229 static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);
  230 
  231 #define NINDIR  (PAGE_SIZE / sizeof(uintptr_t))
  232 #define NMASK   (NINDIR-1)
  233 static int nshift;
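
/*
 * Editor's annotation (not part of the original source): NINDIR is the
 * fan-out of the indirection tree used by MD_MALLOC devices.  On a
 * typical LP64 machine with 4 KB pages, NINDIR is 4096 / 8 = 512 and
 * NMASK is 0x1ff.  Judging from its use below (dimension(), s_write()),
 * nshift holds log2(NINDIR), i.e. 9 in that configuration, so each tree
 * level consumes nine bits of the sector number.
 */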
  234 
  235 static uma_zone_t md_pbuf_zone;
  236 
  237 struct indir {
  238         uintptr_t       *array;
  239         u_int           total;
  240         u_int           used;
  241         u_int           shift;
  242 };
  243 
  244 struct md_s {
  245         int unit;
  246         LIST_ENTRY(md_s) list;
  247         struct bio_queue_head bio_queue;
  248         struct mtx queue_mtx;
  249         struct cdev *dev;
  250         enum md_types type;
  251         off_t mediasize;
  252         unsigned sectorsize;
  253         unsigned opencount;
  254         unsigned fwheads;
  255         unsigned fwsectors;
  256         char ident[32];
  257         unsigned flags;
  258         char name[20];
  259         struct proc *procp;
  260         struct g_geom *gp;
  261         struct g_provider *pp;
  262         int (*start)(struct md_s *sc, struct bio *bp);
  263         struct devstat *devstat;
  264         bool candelete;
  265 
  266         /* MD_MALLOC related fields */
  267         struct indir *indir;
  268         uma_zone_t uma;
  269 
  270         /* MD_PRELOAD related fields */
  271         u_char *pl_ptr;
  272         size_t pl_len;
  273 
  274         /* MD_VNODE related fields */
  275         struct vnode *vnode;
  276         char file[PATH_MAX];
  277         char label[PATH_MAX];
  278         struct ucred *cred;
  279 
  280         /* MD_SWAP related fields */
  281         vm_object_t object;
  282 };
  283 
  284 static struct indir *
  285 new_indir(u_int shift)
  286 {
  287         struct indir *ip;
  288 
  289         ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
  290             | M_ZERO);
  291         if (ip == NULL)
  292                 return (NULL);
  293         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  294             M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
  295         if (ip->array == NULL) {
  296                 free(ip, M_MD);
  297                 return (NULL);
  298         }
  299         ip->total = NINDIR;
  300         ip->shift = shift;
  301         return (ip);
  302 }
  303 
  304 static void
  305 del_indir(struct indir *ip)
  306 {
  307 
  308         free(ip->array, M_MDSECT);
  309         free(ip, M_MD);
  310 }
  311 
  312 static void
  313 destroy_indir(struct md_s *sc, struct indir *ip)
  314 {
  315         int i;
  316 
  317         for (i = 0; i < NINDIR; i++) {
  318                 if (!ip->array[i])
  319                         continue;
  320                 if (ip->shift)
  321                         destroy_indir(sc, (struct indir*)(ip->array[i]));
  322                 else if (ip->array[i] > 255)
  323                         uma_zfree(sc->uma, (void *)(ip->array[i]));
  324         }
  325         del_indir(ip);
  326 }
  327 
  328 /*
  329  * This function does the math and allocates the top level "indir" structure
  330  * for a device of "size" sectors.
  331  */
  332 
  333 static struct indir *
  334 dimension(off_t size)
  335 {
  336         off_t rcnt;
  337         struct indir *ip;
  338         int layer;
  339 
  340         rcnt = size;
  341         layer = 0;
  342         while (rcnt > NINDIR) {
  343                 rcnt /= NINDIR;
  344                 layer++;
  345         }
  346 
  347         /*
   348          * XXX: the top layer is probably not fully populated, so we allocate
   349          * more space than needed for ip->array here.
  350          */
  351         ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
  352         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  353             M_MDSECT, M_WAITOK | M_ZERO);
  354         ip->total = NINDIR;
  355         ip->shift = layer * nshift;
  356         return (ip);
  357 }
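
/*
 * Editor's worked example (annotation, not in the original source):
 * for a 1 GB (2^30-byte) device with 512-byte sectors, size = 2097152
 * sectors.  With NINDIR = 512 the loop runs twice (2097152 -> 4096 -> 8),
 * so layer = 2 and the top node gets shift = 2 * 9 = 18.  Only
 * 2097152 >> 18 = 8 of its 512 slots can ever be used, which is the
 * over-allocation the XXX comment above refers to.
 */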
  358 
  359 /*
  360  * Read a given sector
  361  */
  362 
  363 static uintptr_t
  364 s_read(struct indir *ip, off_t offset)
  365 {
  366         struct indir *cip;
  367         int idx;
  368         uintptr_t up;
  369 
  370         if (md_debug > 1)
  371                 printf("s_read(%jd)\n", (intmax_t)offset);
  372         up = 0;
  373         for (cip = ip; cip != NULL;) {
  374                 if (cip->shift) {
  375                         idx = (offset >> cip->shift) & NMASK;
  376                         up = cip->array[idx];
  377                         cip = (struct indir *)up;
  378                         continue;
  379                 }
  380                 idx = offset & NMASK;
  381                 return (cip->array[idx]);
  382         }
  383         return (0);
  384 }
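
/*
 * Editor's annotation (not in the original source): each interior node
 * consumes cip->shift bits of the sector number.  Continuing the example
 * above (nshift = 9, two interior levels), sector 1000000 resolves as
 * top index (1000000 >> 18) & 511 = 3, middle index
 * (1000000 >> 9) & 511 = 417, and leaf index 1000000 & 511 = 64.  A NULL
 * pointer anywhere along the path means the sector was never written,
 * so it reads back as 0.
 */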
  385 
  386 /*
  387  * Write a given sector, prune the tree if the value is 0
  388  */
  389 
  390 static int
  391 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
  392 {
  393         struct indir *cip, *lip[10];
  394         int idx, li;
  395         uintptr_t up;
  396 
  397         if (md_debug > 1)
  398                 printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
  399         up = 0;
  400         li = 0;
  401         cip = ip;
  402         for (;;) {
  403                 lip[li++] = cip;
  404                 if (cip->shift) {
  405                         idx = (offset >> cip->shift) & NMASK;
  406                         up = cip->array[idx];
  407                         if (up != 0) {
  408                                 cip = (struct indir *)up;
  409                                 continue;
  410                         }
  411                         /* Allocate branch */
  412                         cip->array[idx] =
  413                             (uintptr_t)new_indir(cip->shift - nshift);
  414                         if (cip->array[idx] == 0)
  415                                 return (ENOSPC);
  416                         cip->used++;
  417                         up = cip->array[idx];
  418                         cip = (struct indir *)up;
  419                         continue;
  420                 }
  421                 /* leafnode */
  422                 idx = offset & NMASK;
  423                 up = cip->array[idx];
  424                 if (up != 0)
  425                         cip->used--;
  426                 cip->array[idx] = ptr;
  427                 if (ptr != 0)
  428                         cip->used++;
  429                 break;
  430         }
  431         if (cip->used != 0 || li == 1)
  432                 return (0);
  433         li--;
  434         while (cip->used == 0 && cip != ip) {
  435                 li--;
  436                 idx = (offset >> lip[li]->shift) & NMASK;
  437                 up = lip[li]->array[idx];
  438                 KASSERT(up == (uintptr_t)cip, ("md screwed up"));
  439                 del_indir(cip);
  440                 lip[li]->array[idx] = 0;
  441                 lip[li]->used--;
  442                 cip = lip[li];
  443         }
  444         return (0);
  445 }
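
/*
 * Editor's annotation (not in the original source): lip[] records the
 * path from the root so that, after a zero is stored, any node whose
 * used count drops to zero can be freed while walking back toward the
 * root.  Ten path slots are ample: with NINDIR = 512, ten levels would
 * address 512^10 sectors, far beyond any real device.
 */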
  446 
  447 static int
  448 g_md_access(struct g_provider *pp, int r, int w, int e)
  449 {
  450         struct md_s *sc;
  451 
  452         sc = pp->geom->softc;
  453         if (sc == NULL) {
  454                 if (r <= 0 && w <= 0 && e <= 0)
  455                         return (0);
  456                 return (ENXIO);
  457         }
  458         r += pp->acr;
  459         w += pp->acw;
  460         e += pp->ace;
  461         if ((sc->flags & MD_READONLY) != 0 && w > 0)
  462                 return (EROFS);
  463         if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
  464                 sc->opencount = 1;
  465         } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
  466                 sc->opencount = 0;
  467         }
  468         return (0);
  469 }
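
/*
 * Editor's annotation (not in the original source): in GEOM, r, w and e
 * are deltas to the provider's current read, write and exclusive
 * reference counts (pp->acr, pp->acw, pp->ace); the sums computed above
 * are the counts that will be in effect if the request is granted.
 * Write access to an MD_READONLY device is refused with EROFS, and
 * opencount merely tracks the transition between "no references" and
 * "some references".
 */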
  470 
  471 static void
  472 g_md_start(struct bio *bp)
  473 {
  474         struct md_s *sc;
  475 
  476         sc = bp->bio_to->geom->softc;
  477         if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
  478                 devstat_start_transaction_bio(sc->devstat, bp);
  479         }
  480         mtx_lock(&sc->queue_mtx);
  481         bioq_disksort(&sc->bio_queue, bp);
  482         wakeup(sc);
  483         mtx_unlock(&sc->queue_mtx);
  484 }
  485 
  486 #define MD_MALLOC_MOVE_ZERO     1
  487 #define MD_MALLOC_MOVE_FILL     2
  488 #define MD_MALLOC_MOVE_READ     3
  489 #define MD_MALLOC_MOVE_WRITE    4
  490 #define MD_MALLOC_MOVE_CMP      5
  491 
  492 static int
  493 md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
  494     void *ptr, u_char fill, int op)
  495 {
  496         struct sf_buf *sf;
  497         vm_page_t m, *mp1;
  498         char *p, first;
  499         off_t *uc;
  500         unsigned n;
  501         int error, i, ma_offs1, sz, first_read;
  502 
  503         m = NULL;
  504         error = 0;
  505         sf = NULL;
  506         /* if (op == MD_MALLOC_MOVE_CMP) { gcc */
  507                 first = 0;
  508                 first_read = 0;
  509                 uc = ptr;
  510                 mp1 = *mp;
  511                 ma_offs1 = *ma_offs;
  512         /* } */
  513         sched_pin();
  514         for (n = sectorsize; n != 0; n -= sz) {
  515                 sz = imin(PAGE_SIZE - *ma_offs, n);
  516                 if (m != **mp) {
  517                         if (sf != NULL)
  518                                 sf_buf_free(sf);
  519                         m = **mp;
  520                         sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
  521                             (md_malloc_wait ? 0 : SFB_NOWAIT));
  522                         if (sf == NULL) {
  523                                 error = ENOMEM;
  524                                 break;
  525                         }
  526                 }
  527                 p = (char *)sf_buf_kva(sf) + *ma_offs;
  528                 switch (op) {
  529                 case MD_MALLOC_MOVE_ZERO:
  530                         bzero(p, sz);
  531                         break;
  532                 case MD_MALLOC_MOVE_FILL:
  533                         memset(p, fill, sz);
  534                         break;
  535                 case MD_MALLOC_MOVE_READ:
  536                         bcopy(ptr, p, sz);
  537                         cpu_flush_dcache(p, sz);
  538                         break;
  539                 case MD_MALLOC_MOVE_WRITE:
  540                         bcopy(p, ptr, sz);
  541                         break;
  542                 case MD_MALLOC_MOVE_CMP:
  543                         for (i = 0; i < sz; i++, p++) {
  544                                 if (!first_read) {
  545                                         *uc = (u_char)*p;
  546                                         first = *p;
  547                                         first_read = 1;
  548                                 } else if (*p != first) {
  549                                         error = EDOOFUS;
  550                                         break;
  551                                 }
  552                         }
  553                         break;
  554                 default:
  555                         KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
  556                         break;
  557                 }
  558                 if (error != 0)
  559                         break;
  560                 *ma_offs += sz;
  561                 *ma_offs %= PAGE_SIZE;
  562                 if (*ma_offs == 0)
  563                         (*mp)++;
  564                 ptr = (char *)ptr + sz;
  565         }
  566 
  567         if (sf != NULL)
  568                 sf_buf_free(sf);
  569         sched_unpin();
  570         if (op == MD_MALLOC_MOVE_CMP && error != 0) {
  571                 *mp = mp1;
  572                 *ma_offs = ma_offs1;
  573         }
  574         return (error);
  575 }
  576 
  577 static int
  578 md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
  579     unsigned len, void *ptr, u_char fill, int op)
  580 {
  581         bus_dma_segment_t *vlist;
  582         uint8_t *p, *end, first;
  583         off_t *uc;
  584         int ma_offs, seg_len;
  585 
  586         vlist = *pvlist;
  587         ma_offs = *pma_offs;
  588         uc = ptr;
  589 
  590         for (; len != 0; len -= seg_len) {
  591                 seg_len = imin(vlist->ds_len - ma_offs, len);
  592                 p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
  593                 switch (op) {
  594                 case MD_MALLOC_MOVE_ZERO:
  595                         bzero(p, seg_len);
  596                         break;
  597                 case MD_MALLOC_MOVE_FILL:
  598                         memset(p, fill, seg_len);
  599                         break;
  600                 case MD_MALLOC_MOVE_READ:
  601                         bcopy(ptr, p, seg_len);
  602                         cpu_flush_dcache(p, seg_len);
  603                         break;
  604                 case MD_MALLOC_MOVE_WRITE:
  605                         bcopy(p, ptr, seg_len);
  606                         break;
  607                 case MD_MALLOC_MOVE_CMP:
  608                         end = p + seg_len;
  609                         first = *uc = *p;
  610                         /* Confirm all following bytes match the first */
  611                         while (++p < end) {
  612                                 if (*p != first)
  613                                         return (EDOOFUS);
  614                         }
  615                         break;
  616                 default:
  617                         KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
  618                         break;
  619                 }
  620 
  621                 ma_offs += seg_len;
  622                 if (ma_offs == vlist->ds_len) {
  623                         ma_offs = 0;
  624                         vlist++;
  625                 }
  626                 ptr = (uint8_t *)ptr + seg_len;
  627         }
  628         *pvlist = vlist;
  629         *pma_offs = ma_offs;
  630 
  631         return (0);
  632 }
  633 
  634 static int
  635 mdstart_malloc(struct md_s *sc, struct bio *bp)
  636 {
  637         u_char *dst;
  638         vm_page_t *m;
  639         bus_dma_segment_t *vlist;
  640         int i, error, error1, ma_offs, notmapped;
  641         off_t secno, nsec, uc;
  642         uintptr_t sp, osp;
  643 
  644         switch (bp->bio_cmd) {
  645         case BIO_READ:
  646         case BIO_WRITE:
  647         case BIO_DELETE:
  648                 break;
  649         case BIO_FLUSH:
  650                 return (0);
  651         default:
  652                 return (EOPNOTSUPP);
  653         }
  654 
  655         notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
  656         vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
  657             (bus_dma_segment_t *)bp->bio_data : NULL;
  658         if (notmapped) {
  659                 m = bp->bio_ma;
  660                 ma_offs = bp->bio_ma_offset;
  661                 dst = NULL;
  662                 KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
  663         } else if (vlist != NULL) {
  664                 ma_offs = bp->bio_ma_offset;
  665                 dst = NULL;
  666         } else {
  667                 dst = bp->bio_data;
  668         }
  669 
  670         nsec = bp->bio_length / sc->sectorsize;
  671         secno = bp->bio_offset / sc->sectorsize;
  672         error = 0;
  673         while (nsec--) {
  674                 osp = s_read(sc->indir, secno);
  675                 if (bp->bio_cmd == BIO_DELETE) {
  676                         if (osp != 0)
  677                                 error = s_write(sc->indir, secno, 0);
  678                 } else if (bp->bio_cmd == BIO_READ) {
  679                         if (osp == 0) {
  680                                 if (notmapped) {
  681                                         error = md_malloc_move_ma(&m, &ma_offs,
  682                                             sc->sectorsize, NULL, 0,
  683                                             MD_MALLOC_MOVE_ZERO);
  684                                 } else if (vlist != NULL) {
  685                                         error = md_malloc_move_vlist(&vlist,
  686                                             &ma_offs, sc->sectorsize, NULL, 0,
  687                                             MD_MALLOC_MOVE_ZERO);
  688                                 } else
  689                                         bzero(dst, sc->sectorsize);
  690                         } else if (osp <= 255) {
  691                                 if (notmapped) {
  692                                         error = md_malloc_move_ma(&m, &ma_offs,
  693                                             sc->sectorsize, NULL, osp,
  694                                             MD_MALLOC_MOVE_FILL);
  695                                 } else if (vlist != NULL) {
  696                                         error = md_malloc_move_vlist(&vlist,
  697                                             &ma_offs, sc->sectorsize, NULL, osp,
  698                                             MD_MALLOC_MOVE_FILL);
  699                                 } else
  700                                         memset(dst, osp, sc->sectorsize);
  701                         } else {
  702                                 if (notmapped) {
  703                                         error = md_malloc_move_ma(&m, &ma_offs,
  704                                             sc->sectorsize, (void *)osp, 0,
  705                                             MD_MALLOC_MOVE_READ);
  706                                 } else if (vlist != NULL) {
  707                                         error = md_malloc_move_vlist(&vlist,
  708                                             &ma_offs, sc->sectorsize,
  709                                             (void *)osp, 0,
  710                                             MD_MALLOC_MOVE_READ);
  711                                 } else {
  712                                         bcopy((void *)osp, dst, sc->sectorsize);
  713                                         cpu_flush_dcache(dst, sc->sectorsize);
  714                                 }
  715                         }
  716                         osp = 0;
  717                 } else if (bp->bio_cmd == BIO_WRITE) {
  718                         if (sc->flags & MD_COMPRESS) {
  719                                 if (notmapped) {
  720                                         error1 = md_malloc_move_ma(&m, &ma_offs,
  721                                             sc->sectorsize, &uc, 0,
  722                                             MD_MALLOC_MOVE_CMP);
  723                                         i = error1 == 0 ? sc->sectorsize : 0;
  724                                 } else if (vlist != NULL) {
  725                                         error1 = md_malloc_move_vlist(&vlist,
  726                                             &ma_offs, sc->sectorsize, &uc, 0,
  727                                             MD_MALLOC_MOVE_CMP);
  728                                         i = error1 == 0 ? sc->sectorsize : 0;
  729                                 } else {
  730                                         uc = dst[0];
  731                                         for (i = 1; i < sc->sectorsize; i++) {
  732                                                 if (dst[i] != uc)
  733                                                         break;
  734                                         }
  735                                 }
  736                         } else {
  737                                 i = 0;
  738                                 uc = 0;
  739                         }
  740                         if (i == sc->sectorsize) {
  741                                 if (osp != uc)
  742                                         error = s_write(sc->indir, secno, uc);
  743                         } else {
  744                                 if (osp <= 255) {
  745                                         sp = (uintptr_t)uma_zalloc(sc->uma,
  746                                             md_malloc_wait ? M_WAITOK :
  747                                             M_NOWAIT);
  748                                         if (sp == 0) {
  749                                                 error = ENOSPC;
  750                                                 break;
  751                                         }
  752                                         if (notmapped) {
  753                                                 error = md_malloc_move_ma(&m,
  754                                                     &ma_offs, sc->sectorsize,
  755                                                     (void *)sp, 0,
  756                                                     MD_MALLOC_MOVE_WRITE);
  757                                         } else if (vlist != NULL) {
  758                                                 error = md_malloc_move_vlist(
  759                                                     &vlist, &ma_offs,
  760                                                     sc->sectorsize, (void *)sp,
  761                                                     0, MD_MALLOC_MOVE_WRITE);
  762                                         } else {
  763                                                 bcopy(dst, (void *)sp,
  764                                                     sc->sectorsize);
  765                                         }
  766                                         error = s_write(sc->indir, secno, sp);
  767                                 } else {
  768                                         if (notmapped) {
  769                                                 error = md_malloc_move_ma(&m,
  770                                                     &ma_offs, sc->sectorsize,
  771                                                     (void *)osp, 0,
  772                                                     MD_MALLOC_MOVE_WRITE);
  773                                         } else if (vlist != NULL) {
  774                                                 error = md_malloc_move_vlist(
  775                                                     &vlist, &ma_offs,
  776                                                     sc->sectorsize, (void *)osp,
  777                                                     0, MD_MALLOC_MOVE_WRITE);
  778                                         } else {
  779                                                 bcopy(dst, (void *)osp,
  780                                                     sc->sectorsize);
  781                                         }
  782                                         osp = 0;
  783                                 }
  784                         }
  785                 } else {
  786                         error = EOPNOTSUPP;
  787                 }
  788                 if (osp > 255)
  789                         uma_zfree(sc->uma, (void*)osp);
  790                 if (error != 0)
  791                         break;
  792                 secno++;
  793                 if (!notmapped && vlist == NULL)
  794                         dst += sc->sectorsize;
  795         }
  796         bp->bio_resid = 0;
  797         return (error);
  798 }
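
/*
 * Editor's annotation (not in the original source): the MD_COMPRESS
 * trick above relies on real sector allocations always being kernel
 * pointers greater than 255.  A sector whose bytes all equal some value
 * v is stored as the bare value v in its indir slot: 0 for all-zero
 * sectors (never-written space stays free), 1..255 for other uniform
 * fills.  Reads rebuild such sectors with bzero()/memset(), the
 * MD_MALLOC_MOVE_CMP pass detects uniform sectors on write, and the
 * "osp > 255" check releases the old backing chunk once it is no
 * longer referenced.
 */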
  799 
  800 static void
  801 mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
  802 {
  803         off_t seg_len;
  804 
  805         while (offset >= vlist->ds_len) {
  806                 offset -= vlist->ds_len;
  807                 vlist++;
  808         }
  809 
  810         while (len != 0) {
  811                 seg_len = omin(len, vlist->ds_len - offset);
  812                 bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset),
  813                     seg_len);
  814                 offset = 0;
  815                 src = (uint8_t *)src + seg_len;
  816                 len -= seg_len;
  817                 vlist++;
  818         }
  819 }
  820 
  821 static void
  822 mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
  823 {
  824         off_t seg_len;
  825 
  826         while (offset >= vlist->ds_len) {
  827                 offset -= vlist->ds_len;
  828                 vlist++;
  829         }
  830 
  831         while (len != 0) {
  832                 seg_len = omin(len, vlist->ds_len - offset);
  833                 bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst,
  834                     seg_len);
  835                 offset = 0;
  836                 dst = (uint8_t *)dst + seg_len;
  837                 len -= seg_len;
  838                 vlist++;
  839         }
  840 }
  841 
  842 static int
  843 mdstart_preload(struct md_s *sc, struct bio *bp)
  844 {
  845         uint8_t *p;
  846 
  847         p = sc->pl_ptr + bp->bio_offset;
  848         switch (bp->bio_cmd) {
  849         case BIO_READ:
  850                 if ((bp->bio_flags & BIO_VLIST) != 0) {
  851                         mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data,
  852                             bp->bio_ma_offset, bp->bio_length);
  853                 } else {
  854                         bcopy(p, bp->bio_data, bp->bio_length);
  855                 }
  856                 cpu_flush_dcache(bp->bio_data, bp->bio_length);
  857                 break;
  858         case BIO_WRITE:
  859                 if ((bp->bio_flags & BIO_VLIST) != 0) {
  860                         mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
  861                             bp->bio_ma_offset, p, bp->bio_length);
  862                 } else {
  863                         bcopy(bp->bio_data, p, bp->bio_length);
  864                 }
  865                 break;
  866         }
  867         bp->bio_resid = 0;
  868         return (0);
  869 }
  870 
  871 static int
  872 mdstart_vnode(struct md_s *sc, struct bio *bp)
  873 {
  874         int error;
  875         struct uio auio;
  876         struct iovec aiov;
  877         struct iovec *piov;
  878         struct mount *mp;
  879         struct vnode *vp;
  880         struct buf *pb;
  881         bus_dma_segment_t *vlist;
  882         struct thread *td;
  883         off_t iolen, iostart, off, len;
  884         int ma_offs, npages;
  885 
  886         switch (bp->bio_cmd) {
  887         case BIO_READ:
  888                 auio.uio_rw = UIO_READ;
  889                 break;
  890         case BIO_WRITE:
  891                 auio.uio_rw = UIO_WRITE;
  892                 break;
  893         case BIO_FLUSH:
  894                 break;
  895         case BIO_DELETE:
  896                 if (sc->candelete)
  897                         break;
  898                 /* FALLTHROUGH */
  899         default:
  900                 return (EOPNOTSUPP);
  901         }
  902 
  903         td = curthread;
  904         vp = sc->vnode;
  905         pb = NULL;
  906         piov = NULL;
  907         ma_offs = bp->bio_ma_offset;
  908         off = bp->bio_offset;
  909         len = bp->bio_length;
  910 
  911         /*
  912          * VNODE I/O
  913          *
  914          * If an error occurs, we set BIO_ERROR but we do not set
  915          * B_INVAL because (for a write anyway), the buffer is
  916          * still valid.
  917          */
  918 
  919         if (bp->bio_cmd == BIO_FLUSH) {
  920                 do {
  921                         (void)vn_start_write(vp, &mp, V_WAIT);
  922                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  923                         error = VOP_FSYNC(vp, MNT_WAIT, td);
  924                         VOP_UNLOCK(vp);
  925                         vn_finished_write(mp);
  926                 } while (error == ERELOOKUP);
  927                 return (error);
  928         } else if (bp->bio_cmd == BIO_DELETE) {
  929                 error = vn_deallocate(vp, &off, &len, 0,
  930                     sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred, NOCRED);
  931                 bp->bio_resid = len;
  932                 return (error);
  933         }
  934 
  935         auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
  936         auio.uio_resid = bp->bio_length;
  937         auio.uio_segflg = UIO_SYSSPACE;
  938         auio.uio_td = td;
  939 
  940         if ((bp->bio_flags & BIO_VLIST) != 0) {
  941                 piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
  942                 auio.uio_iov = piov;
  943                 vlist = (bus_dma_segment_t *)bp->bio_data;
  944                 while (len > 0) {
  945                         piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
  946                             ma_offs);
  947                         piov->iov_len = vlist->ds_len - ma_offs;
  948                         if (piov->iov_len > len)
  949                                 piov->iov_len = len;
  950                         len -= piov->iov_len;
  951                         ma_offs = 0;
  952                         vlist++;
  953                         piov++;
  954                 }
  955                 auio.uio_iovcnt = piov - auio.uio_iov;
  956                 piov = auio.uio_iov;
  957         } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
  958                 pb = uma_zalloc(md_pbuf_zone, M_WAITOK);
  959                 MPASS((pb->b_flags & B_MAXPHYS) != 0);
  960                 bp->bio_resid = len;
  961 unmapped_step:
  962                 npages = atop(min(maxphys, round_page(len + (ma_offs &
  963                     PAGE_MASK))));
  964                 iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
  965                 KASSERT(iolen > 0, ("zero iolen"));
  966                 pmap_qenter((vm_offset_t)pb->b_data,
  967                     &bp->bio_ma[atop(ma_offs)], npages);
  968                 aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
  969                     (ma_offs & PAGE_MASK));
  970                 aiov.iov_len = iolen;
  971                 auio.uio_iov = &aiov;
  972                 auio.uio_iovcnt = 1;
  973                 auio.uio_resid = iolen;
  974         } else {
  975                 aiov.iov_base = bp->bio_data;
  976                 aiov.iov_len = bp->bio_length;
  977                 auio.uio_iov = &aiov;
  978                 auio.uio_iovcnt = 1;
  979         }
  980         iostart = auio.uio_offset;
  981         if (auio.uio_rw == UIO_READ) {
  982                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  983                 error = VOP_READ(vp, &auio, 0, sc->cred);
  984                 VOP_UNLOCK(vp);
  985         } else {
  986                 (void) vn_start_write(vp, &mp, V_WAIT);
  987                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  988                 error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
  989                     sc->cred);
  990                 VOP_UNLOCK(vp);
  991                 vn_finished_write(mp);
  992                 if (error == 0)
  993                         sc->flags &= ~MD_VERIFY;
  994         }
  995 
  996         /* When MD_CACHE is set, try to avoid double-caching the data. */
  997         if (error == 0 && (sc->flags & MD_CACHE) == 0)
  998                 VOP_ADVISE(vp, iostart, auio.uio_offset - 1,
  999                     POSIX_FADV_DONTNEED);
 1000 
 1001         if (pb != NULL) {
 1002                 pmap_qremove((vm_offset_t)pb->b_data, npages);
 1003                 if (error == 0) {
 1004                         len -= iolen;
 1005                         bp->bio_resid -= iolen;
 1006                         ma_offs += iolen;
 1007                         if (len > 0)
 1008                                 goto unmapped_step;
 1009                 }
 1010                 uma_zfree(md_pbuf_zone, pb);
 1011         } else {
 1012                 bp->bio_resid = auio.uio_resid;
 1013         }
 1014 
 1015         free(piov, M_MD);
 1016         return (error);
 1017 }
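
/*
 * Editor's annotation (not in the original source): for unmapped bios
 * the loop above borrows the kernel virtual address range of a pbuf
 * from md_pbuf_zone, maps up to maxphys bytes of the request's pages
 * into it with pmap_qenter(), issues VOP_READ()/VOP_WRITE() on that
 * window, and repeats from unmapped_step until the whole request has
 * been transferred or an error occurs.
 */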
 1018 
 1019 static int
 1020 mdstart_swap(struct md_s *sc, struct bio *bp)
 1021 {
 1022         vm_page_t m;
 1023         u_char *p;
 1024         vm_pindex_t i, lastp;
 1025         bus_dma_segment_t *vlist;
 1026         int rv, ma_offs, offs, len, lastend;
 1027 
 1028         switch (bp->bio_cmd) {
 1029         case BIO_READ:
 1030         case BIO_WRITE:
 1031         case BIO_DELETE:
 1032                 break;
 1033         case BIO_FLUSH:
 1034                 return (0);
 1035         default:
 1036                 return (EOPNOTSUPP);
 1037         }
 1038 
 1039         p = bp->bio_data;
 1040         ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
 1041             bp->bio_ma_offset : 0;
 1042         vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
 1043             (bus_dma_segment_t *)bp->bio_data : NULL;
 1044 
 1045         /*
 1046          * offs is the offset at which to start operating on the
 1047          * next (ie, first) page.  lastp is the last page on
 1048          * which we're going to operate.  lastend is the ending
 1049          * position within that last page (ie, PAGE_SIZE if
 1050          * we're operating on complete aligned pages).
 1051          */
 1052         offs = bp->bio_offset % PAGE_SIZE;
 1053         lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
 1054         lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
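        /*
         * Editor's worked example (annotation, not in the original
         * source): with 4 KB pages, bio_offset = 6144 and bio_length =
         * 8192 give offs = 2048, lastp = 3 and lastend = 2048: the last
         * 2 KB of page 1, all of page 2, and the first 2 KB of page 3.
         */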
 1055 
 1056         rv = VM_PAGER_OK;
 1057         vm_object_pip_add(sc->object, 1);
 1058         for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
 1059                 len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
 1060                 m = vm_page_grab_unlocked(sc->object, i, VM_ALLOC_SYSTEM);
 1061                 if (bp->bio_cmd == BIO_READ) {
 1062                         if (vm_page_all_valid(m))
 1063                                 rv = VM_PAGER_OK;
 1064                         else
 1065                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1066                                     NULL, NULL);
 1067                         if (rv == VM_PAGER_ERROR) {
 1068                                 VM_OBJECT_WLOCK(sc->object);
 1069                                 vm_page_free(m);
 1070                                 VM_OBJECT_WUNLOCK(sc->object);
 1071                                 break;
 1072                         } else if (rv == VM_PAGER_FAIL) {
 1073                                 /*
 1074                                  * Pager does not have the page.  Zero
 1075                                  * the allocated page, and mark it as
 1076                                  * valid. Do not set dirty, the page
 1077                                  * can be recreated if thrown out.
 1078                                  */
 1079                                 pmap_zero_page(m);
 1080                                 vm_page_valid(m);
 1081                         }
 1082                         if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 1083                                 pmap_copy_pages(&m, offs, bp->bio_ma,
 1084                                     ma_offs, len);
 1085                         } else if ((bp->bio_flags & BIO_VLIST) != 0) {
 1086                                 physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
 1087                                     vlist, ma_offs, len);
 1088                                 cpu_flush_dcache(p, len);
 1089                         } else {
 1090                                 physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
 1091                                 cpu_flush_dcache(p, len);
 1092                         }
 1093                 } else if (bp->bio_cmd == BIO_WRITE) {
 1094                         if (len == PAGE_SIZE || vm_page_all_valid(m))
 1095                                 rv = VM_PAGER_OK;
 1096                         else
 1097                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1098                                     NULL, NULL);
 1099                         if (rv == VM_PAGER_ERROR) {
 1100                                 VM_OBJECT_WLOCK(sc->object);
 1101                                 vm_page_free(m);
 1102                                 VM_OBJECT_WUNLOCK(sc->object);
 1103                                 break;
 1104                         } else if (rv == VM_PAGER_FAIL)
 1105                                 pmap_zero_page(m);
 1106 
 1107                         if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 1108                                 pmap_copy_pages(bp->bio_ma, ma_offs, &m,
 1109                                     offs, len);
 1110                         } else if ((bp->bio_flags & BIO_VLIST) != 0) {
 1111                                 physcopyin_vlist(vlist, ma_offs,
 1112                                     VM_PAGE_TO_PHYS(m) + offs, len);
 1113                         } else {
 1114                                 physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
 1115                         }
 1116 
 1117                         vm_page_valid(m);
 1118                         vm_page_set_dirty(m);
 1119                 } else if (bp->bio_cmd == BIO_DELETE) {
 1120                         if (len == PAGE_SIZE || vm_page_all_valid(m))
 1121                                 rv = VM_PAGER_OK;
 1122                         else
 1123                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1124                                     NULL, NULL);
 1125                         VM_OBJECT_WLOCK(sc->object);
 1126                         if (rv == VM_PAGER_ERROR) {
 1127                                 vm_page_free(m);
 1128                                 VM_OBJECT_WUNLOCK(sc->object);
 1129                                 break;
 1130                         } else if (rv == VM_PAGER_FAIL) {
 1131                                 vm_page_free(m);
 1132                                 m = NULL;
 1133                         } else {
 1134                                 /* Page is valid. */
 1135                                 if (len != PAGE_SIZE) {
 1136                                         pmap_zero_page_area(m, offs, len);
 1137                                         vm_page_set_dirty(m);
 1138                                 } else {
 1139                                         vm_pager_page_unswapped(m);
 1140                                         vm_page_free(m);
 1141                                         m = NULL;
 1142                                 }
 1143                         }
 1144                         VM_OBJECT_WUNLOCK(sc->object);
 1145                 }
 1146                 if (m != NULL) {
 1147                         /*
 1148                          * The page may be deactivated prior to setting
 1149                          * PGA_REFERENCED, but in this case it will be
 1150                          * reactivated by the page daemon.
 1151                          */
 1152                         if (vm_page_active(m))
 1153                                 vm_page_reference(m);
 1154                         else
 1155                                 vm_page_activate(m);
 1156                         vm_page_xunbusy(m);
 1157                 }
 1158 
 1159                 /* Actions on further pages start at offset 0 */
 1160                 p += PAGE_SIZE - offs;
 1161                 offs = 0;
 1162                 ma_offs += len;
 1163         }
 1164         vm_object_pip_wakeup(sc->object);
 1165         return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
 1166 }
 1167 
 1168 static int
 1169 mdstart_null(struct md_s *sc, struct bio *bp)
 1170 {
 1171 
 1172         switch (bp->bio_cmd) {
 1173         case BIO_READ:
 1174                 bzero(bp->bio_data, bp->bio_length);
 1175                 cpu_flush_dcache(bp->bio_data, bp->bio_length);
 1176                 break;
 1177         case BIO_WRITE:
 1178                 break;
 1179         }
 1180         bp->bio_resid = 0;
 1181         return (0);
 1182 }
 1183 
 1184 static void
 1185 md_handleattr(struct md_s *sc, struct bio *bp)
 1186 {
 1187         if (sc->fwsectors && sc->fwheads &&
 1188             (g_handleattr_int(bp, "GEOM::fwsectors", sc->fwsectors) != 0 ||
 1189             g_handleattr_int(bp, "GEOM::fwheads", sc->fwheads) != 0))
 1190                 return;
 1191         if (g_handleattr_int(bp, "GEOM::candelete", sc->candelete) != 0)
 1192                 return;
 1193         if (sc->ident[0] != '\0' &&
 1194             g_handleattr_str(bp, "GEOM::ident", sc->ident) != 0)
 1195                 return;
 1196         if (g_handleattr_int(bp, "MNT::verified", (sc->flags & MD_VERIFY) != 0))
 1197                 return;
 1198         g_io_deliver(bp, EOPNOTSUPP);
 1199 }
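
/*
 * Editor's annotation (not in the original source): each
 * g_handleattr_*() call replies to the bio and returns non-zero when
 * the attribute name matches, so the chain above stops at the first
 * match and only falls through to EOPNOTSUPP for unknown attributes.
 */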
 1200 
 1201 static void
 1202 md_kthread(void *arg)
 1203 {
 1204         struct md_s *sc;
 1205         struct bio *bp;
 1206         int error;
 1207 
 1208         sc = arg;
 1209         thread_lock(curthread);
 1210         sched_prio(curthread, PRIBIO);
 1211         thread_unlock(curthread);
 1212         if (sc->type == MD_VNODE)
 1213                 curthread->td_pflags |= TDP_NORUNNINGBUF;
 1214 
 1215         for (;;) {
 1216                 mtx_lock(&sc->queue_mtx);
 1217                 if (sc->flags & MD_SHUTDOWN) {
 1218                         sc->flags |= MD_EXITING;
 1219                         mtx_unlock(&sc->queue_mtx);
 1220                         kproc_exit(0);
 1221                 }
 1222                 bp = bioq_takefirst(&sc->bio_queue);
 1223                 if (!bp) {
 1224                         msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
 1225                         continue;
 1226                 }
 1227                 mtx_unlock(&sc->queue_mtx);
 1228                 if (bp->bio_cmd == BIO_GETATTR) {
 1229                         md_handleattr(sc, bp);
 1230                 } else {
 1231                         error = sc->start(sc, bp);
 1232                         if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
 1233                                 /*
 1234                                  * Devstat uses (bio_bcount, bio_resid) for
 1235                                  * determining the length of the completed part
 1236                                  * of the i/o.  g_io_deliver() will translate
 1237                                  * from bio_completed to that, but it also
 1238                                  * destroys the bio so we must do our own
 1239                                  * translation.
 1240                                  */
 1241                                 bp->bio_bcount = bp->bio_length;
 1242                                 devstat_end_transaction_bio(sc->devstat, bp);
 1243                         }
 1244                         bp->bio_completed = bp->bio_length - bp->bio_resid;
 1245                         g_io_deliver(bp, error);
 1246                 }
 1247         }
 1248 }
 1249 
 1250 static struct md_s *
 1251 mdfind(int unit)
 1252 {
 1253         struct md_s *sc;
 1254 
 1255         LIST_FOREACH(sc, &md_softc_list, list) {
 1256                 if (sc->unit == unit)
 1257                         break;
 1258         }
 1259         return (sc);
 1260 }
 1261 
 1262 static struct md_s *
 1263 mdnew(int unit, int *errp, enum md_types type)
 1264 {
 1265         struct md_s *sc;
 1266         int error;
 1267 
 1268         *errp = 0;
 1269         if (unit == -1)
 1270                 unit = alloc_unr(md_uh);
 1271         else
 1272                 unit = alloc_unr_specific(md_uh, unit);
 1273 
 1274         if (unit == -1) {
 1275                 *errp = EBUSY;
 1276                 return (NULL);
 1277         }
 1278 
 1279         sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
 1280         sc->type = type;
 1281         bioq_init(&sc->bio_queue);
 1282         mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
 1283         sc->unit = unit;
 1284         sprintf(sc->name, "md%d", unit);
 1285         LIST_INSERT_HEAD(&md_softc_list, sc, list);
  1286         error = kproc_create(md_kthread, sc, &sc->procp, 0, 0, "%s", sc->name);
 1287         if (error == 0)
 1288                 return (sc);
 1289         LIST_REMOVE(sc, list);
 1290         mtx_destroy(&sc->queue_mtx);
 1291         free_unr(md_uh, sc->unit);
 1292         free(sc, M_MD);
 1293         *errp = error;
 1294         return (NULL);
 1295 }
 1296 
 1297 static void
 1298 mdinit(struct md_s *sc)
 1299 {
 1300         struct g_geom *gp;
 1301         struct g_provider *pp;
 1302 
 1303         g_topology_lock();
 1304         gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
 1305         gp->softc = sc;
 1306         pp = g_new_providerf(gp, "md%d", sc->unit);
 1307         devstat_remove_entry(pp->stat);
 1308         pp->stat = NULL;
 1309         pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 1310         pp->mediasize = sc->mediasize;
 1311         pp->sectorsize = sc->sectorsize;
 1312         switch (sc->type) {
 1313         case MD_MALLOC:
 1314         case MD_VNODE:
 1315         case MD_SWAP:
 1316                 pp->flags |= G_PF_ACCEPT_UNMAPPED;
 1317                 break;
 1318         case MD_PRELOAD:
 1319         case MD_NULL:
 1320                 break;
 1321         }
 1322         sc->gp = gp;
 1323         sc->pp = pp;
 1324         sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
 1325             DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 1326         sc->devstat->id = pp;
 1327         g_error_provider(pp, 0);
 1328         g_topology_unlock();
 1329 }
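
/*
 * Editor's annotation (not in the original source): the devstat entry
 * GEOM attaches to the provider automatically is removed above and
 * replaced with one owned by the driver, so the driver can bracket each
 * BIO_READ/BIO_WRITE itself (see g_md_start() and md_kthread()).
 */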
 1330 
 1331 static int
 1332 mdcreate_malloc(struct md_s *sc, struct md_req *mdr)
 1333 {
 1334         uintptr_t sp;
 1335         int error;
 1336         off_t u;
 1337 
 1338         error = 0;
 1339         if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
 1340                 return (EINVAL);
 1341         if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize))
 1342                 return (EINVAL);
 1343         /* Compression doesn't make sense if we have reserved space */
 1344         if (mdr->md_options & MD_RESERVE)
 1345                 mdr->md_options &= ~MD_COMPRESS;
 1346         if (mdr->md_fwsectors != 0)
 1347                 sc->fwsectors = mdr->md_fwsectors;
 1348         if (mdr->md_fwheads != 0)
 1349                 sc->fwheads = mdr->md_fwheads;
 1350         sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE);
 1351         sc->indir = dimension(sc->mediasize / sc->sectorsize);
 1352         sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
 1353             0x1ff, 0);
 1354         if (mdr->md_options & MD_RESERVE) {
 1355                 off_t nsectors;
 1356 
 1357                 nsectors = sc->mediasize / sc->sectorsize;
 1358                 for (u = 0; u < nsectors; u++) {
 1359                         sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
 1360                             M_WAITOK : M_NOWAIT) | M_ZERO);
 1361                         if (sp != 0)
 1362                                 error = s_write(sc->indir, u, sp);
 1363                         else
 1364                                 error = ENOMEM;
 1365                         if (error != 0)
 1366                                 break;
 1367                 }
 1368         }
 1369         return (error);
 1370 }
 1371 
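/*
 * Editorial note: sectors of a malloc disk live in a per-device UMA zone
 * (created above with a 0x1ff alignment mask, i.e. 512-byte alignment)
 * and are reached through the indirection tree built by dimension().
 * With MD_RESERVE, every sector is allocated up front so later writes
 * cannot fail with ENOMEM; as a worked example under that option, a
 * 1 GiB disk with 512-byte sectors preallocates 1073741824 / 512 =
 * 2097152 zero-filled sectors through s_write().
 */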
 1372 static int
 1373 mdsetcred(struct md_s *sc, struct ucred *cred)
 1374 {
 1375         char *tmpbuf;
 1376         int error = 0;
 1377 
 1378         /*
 1379          * Set the credentials in our softc.
 1380          */
 1381 
 1382         if (sc->cred)
 1383                 crfree(sc->cred);
 1384         sc->cred = crhold(cred);
 1385 
 1386         /*
 1387          * XXX: Horrible kludge to establish credentials for NFS.
 1388          */
 1389 
 1390         if (sc->vnode) {
 1391                 struct uio auio;
 1392                 struct iovec aiov;
 1393 
 1394                 tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
 1395                 bzero(&auio, sizeof(auio));
 1396 
 1397                 aiov.iov_base = tmpbuf;
 1398                 aiov.iov_len = sc->sectorsize;
 1399                 auio.uio_iov = &aiov;
 1400                 auio.uio_iovcnt = 1;
 1401                 auio.uio_offset = 0;
 1402                 auio.uio_rw = UIO_READ;
 1403                 auio.uio_segflg = UIO_SYSSPACE;
 1404                 auio.uio_resid = aiov.iov_len;
 1405                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 1406                 error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
 1407                 VOP_UNLOCK(sc->vnode);
 1408                 free(tmpbuf, M_TEMP);
 1409         }
 1410         return (error);
 1411 }
 1412 
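/*
 * Editorial note: the single-sector VOP_READ above exists purely for its
 * side effect.  For a vnode that lives on NFS, issuing one read with the
 * newly held credential establishes that credential with the server; the
 * data landing in tmpbuf is discarded.  In outline:
 *
 *	aiov.iov_base = tmpbuf;               // one-sector scratch buffer
 *	auio.uio_rw = UIO_READ;               // a read mutates nothing
 *	error = VOP_READ(sc->vnode, &auio, 0, sc->cred);  // primes cred
 */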
 1413 static int
 1414 mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1415 {
 1416         struct vattr vattr;
 1417         struct nameidata nd;
 1418         char *fname;
 1419         int error, flags;
 1420         long v;
 1421 
 1422         fname = mdr->md_file;
 1423         if (mdr->md_file_seg == UIO_USERSPACE) {
 1424                 error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
 1425                 if (error != 0)
 1426                         return (error);
 1427         } else if (mdr->md_file_seg == UIO_SYSSPACE)
 1428                 strlcpy(sc->file, fname, sizeof(sc->file));
 1429         else
 1430                 return (EDOOFUS);
 1431 
 1444         /*
 1445          * If the user specified that this is a read-only device, don't
 1446          * set the FWRITE mask before trying to open the backing store.
 1447          */
 1448         flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) |
 1449             ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0);
 1438         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file);
 1439         error = vn_open(&nd, &flags, 0, NULL);
 1440         if (error != 0)
 1441                 return (error);
 1442         NDFREE_PNBUF(&nd);
 1443         if (nd.ni_vp->v_type != VREG) {
 1444                 error = EINVAL;
 1445                 goto bad;
 1446         }
 1447         error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
 1448         if (error != 0)
 1449                 goto bad;
 1450         if ((mdr->md_options & MD_MUSTDEALLOC) != 0) {
 1451                 error = VOP_PATHCONF(nd.ni_vp, _PC_DEALLOC_PRESENT, &v);
 1452                 if (error != 0)
 1453                         goto bad;
 1454                 if (v == 0)
 1455                         sc->candelete = false;
 1456         }
 1457         if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
 1458                 vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
 1459                 if (VN_IS_DOOMED(nd.ni_vp)) {
 1460                         /* Forced unmount. */
 1461                         error = EBADF;
 1462                         goto bad;
 1463                 }
 1464         }
 1465         nd.ni_vp->v_vflag |= VV_MD;
 1466         VOP_UNLOCK(nd.ni_vp);
 1467 
 1468         if (mdr->md_fwsectors != 0)
 1469                 sc->fwsectors = mdr->md_fwsectors;
 1470         if (mdr->md_fwheads != 0)
 1471                 sc->fwheads = mdr->md_fwheads;
 1472         snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju",
 1473             (uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid);
 1474         sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE |
 1475             MD_VERIFY);
 1476         if (!(flags & FWRITE))
 1477                 sc->flags |= MD_READONLY;
 1478         sc->vnode = nd.ni_vp;
 1479 
 1480         error = mdsetcred(sc, td->td_ucred);
 1481         if (error != 0) {
 1482                 sc->vnode = NULL;
 1483                 vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
 1484                 nd.ni_vp->v_vflag &= ~VV_MD;
 1485                 goto bad;
 1486         }
 1487         return (0);
 1488 bad:
 1489         VOP_UNLOCK(nd.ni_vp);
 1490         (void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
 1491         return (error);
 1492 }
 1493 
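/*
 * Editorial note: every failure after vn_open() succeeds funnels through
 * the "bad" label, which closes the vnode with the same flags the open
 * computed, keeping open/close accounting balanced.  Note also the lock
 * upgrade: if the vnode came back only shared-locked it is upgraded to
 * exclusive before VV_MD is set, and VN_IS_DOOMED() re-checks for a
 * forced unmount that may have raced with the upgrade.
 */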
 1494 static void
 1495 g_md_providergone(struct g_provider *pp)
 1496 {
 1497         struct md_s *sc = pp->geom->softc;
 1498 
 1499         mtx_lock(&sc->queue_mtx);
 1500         sc->flags |= MD_PROVIDERGONE;
 1501         wakeup(&sc->flags);
 1502         mtx_unlock(&sc->queue_mtx);
 1503 }
 1504 
 1505 static int
 1506 mddestroy(struct md_s *sc, struct thread *td)
 1507 {
 1508 
 1509         if (sc->gp) {
 1510                 g_topology_lock();
 1511                 g_wither_geom(sc->gp, ENXIO);
 1512                 g_topology_unlock();
 1513 
 1514                 mtx_lock(&sc->queue_mtx);
 1515                 while (!(sc->flags & MD_PROVIDERGONE))
 1516                         msleep(&sc->flags, &sc->queue_mtx, PRIBIO, "mddestroy", 0);
 1517                 mtx_unlock(&sc->queue_mtx);
 1518         }
 1519         if (sc->devstat) {
 1520                 devstat_remove_entry(sc->devstat);
 1521                 sc->devstat = NULL;
 1522         }
 1523         mtx_lock(&sc->queue_mtx);
 1524         sc->flags |= MD_SHUTDOWN;
 1525         wakeup(sc);
 1526         while (!(sc->flags & MD_EXITING))
 1527                 msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
 1528         mtx_unlock(&sc->queue_mtx);
 1529         mtx_destroy(&sc->queue_mtx);
 1530         if (sc->vnode != NULL) {
 1531                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 1532                 sc->vnode->v_vflag &= ~VV_MD;
 1533                 VOP_UNLOCK(sc->vnode);
 1534                 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
 1535                     FREAD : (FREAD|FWRITE), sc->cred, td);
 1536         }
 1537         if (sc->cred != NULL)
 1538                 crfree(sc->cred);
 1539         if (sc->object != NULL)
 1540                 vm_object_deallocate(sc->object);
 1541         if (sc->indir)
 1542                 destroy_indir(sc, sc->indir);
 1543         if (sc->uma)
 1544                 uma_zdestroy(sc->uma);
 1545 
 1546         LIST_REMOVE(sc, list);
 1547         free_unr(md_uh, sc->unit);
 1548         free(sc, M_MD);
 1549         return (0);
 1550 }
 1551 
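/*
 * Editorial note: teardown is strictly ordered.  The GEOM side is
 * withered first and mddestroy() sleeps until g_md_providergone() has
 * signalled MD_PROVIDERGONE, guaranteeing no new bios can arrive; then
 * MD_SHUTDOWN asks the worker thread to exit and the loop waits for
 * MD_EXITING; only then is it safe to destroy the queue mutex and
 * release the backing store (vnode, VM object, or indirection tree and
 * UMA zone, depending on the device type).
 */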
 1552 static int
 1553 mdresize(struct md_s *sc, struct md_req *mdr)
 1554 {
 1555         int error, res;
 1556         vm_pindex_t oldpages, newpages;
 1557 
 1558         switch (sc->type) {
 1559         case MD_VNODE:
 1560         case MD_NULL:
 1561                 break;
 1562         case MD_SWAP:
 1563                 if (mdr->md_mediasize <= 0 ||
 1564                     (mdr->md_mediasize % PAGE_SIZE) != 0)
 1565                         return (EDOM);
 1566                 oldpages = OFF_TO_IDX(sc->mediasize);
 1567                 newpages = OFF_TO_IDX(mdr->md_mediasize);
 1568                 if (newpages < oldpages) {
 1569                         VM_OBJECT_WLOCK(sc->object);
 1570                         vm_object_page_remove(sc->object, newpages, 0, 0);
 1571                         swap_release_by_cred(IDX_TO_OFF(oldpages -
 1572                             newpages), sc->cred);
 1573                         sc->object->charge = IDX_TO_OFF(newpages);
 1574                         sc->object->size = newpages;
 1575                         VM_OBJECT_WUNLOCK(sc->object);
 1576                 } else if (newpages > oldpages) {
 1577                         res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
 1578                             oldpages), sc->cred);
 1579                         if (!res)
 1580                                 return (ENOMEM);
 1581                         if ((mdr->md_options & MD_RESERVE) ||
 1582                             (sc->flags & MD_RESERVE)) {
 1583                                 error = swap_pager_reserve(sc->object,
 1584                                     oldpages, newpages - oldpages);
 1585                                 if (error < 0) {
 1586                                         swap_release_by_cred(
 1587                                             IDX_TO_OFF(newpages - oldpages),
 1588                                             sc->cred);
 1589                                         return (EDOM);
 1590                                 }
 1591                         }
 1592                         VM_OBJECT_WLOCK(sc->object);
 1593                         sc->object->charge = IDX_TO_OFF(newpages);
 1594                         sc->object->size = newpages;
 1595                         VM_OBJECT_WUNLOCK(sc->object);
 1596                 }
 1597                 break;
 1598         default:
 1599                 return (EOPNOTSUPP);
 1600         }
 1601 
 1602         sc->mediasize = mdr->md_mediasize;
 1603 
 1604         g_topology_lock();
 1605         g_resize_provider(sc->pp, sc->mediasize);
 1606         g_topology_unlock();
 1607         return (0);
 1608 }
 1609 
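/*
 * Editorial note on the swap accounting above: growth is paid for before
 * it happens and shrinkage is refunded after.  A worked example,
 * assuming 4 KiB pages: growing a swap disk from 64 MiB to 128 MiB gives
 * oldpages = 16384 and newpages = 32768, so IDX_TO_OFF(16384) =
 * 67108864 bytes are reserved against the owning credential first, and
 * that reservation is released again if a requested MD_RESERVE
 * preallocation fails.
 */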
 1610 static int
 1611 mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1612 {
 1613         vm_ooffset_t npage;
 1614         int error;
 1615 
 1628         /*
 1629          * Range check.  Disallow negative sizes and sizes that are not
 1630          * a multiple of the page size.
 1631          */
 1620         if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 1621                 return (EDOM);
 1622 
 1635         /*
 1636          * Allocate an OBJT_SWAP object.
 1637          *
 1638          * Note the truncation of the media size to whole pages below.
 1639          */
 1628 
 1629         if ((mdr->md_options & MD_VERIFY) != 0)
 1630                 return (EINVAL);
 1631         npage = mdr->md_mediasize / PAGE_SIZE;
 1632         if (mdr->md_fwsectors != 0)
 1633                 sc->fwsectors = mdr->md_fwsectors;
 1634         if (mdr->md_fwheads != 0)
 1635                 sc->fwheads = mdr->md_fwheads;
 1636         sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
 1637             VM_PROT_DEFAULT, 0, td->td_ucred);
 1638         if (sc->object == NULL)
 1639                 return (ENOMEM);
 1640         sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE);
 1641         if (mdr->md_options & MD_RESERVE) {
 1642                 if (swap_pager_reserve(sc->object, 0, npage) < 0) {
 1643                         error = EDOM;
 1644                         goto finish;
 1645                 }
 1646         }
 1647         error = mdsetcred(sc, td->td_ucred);
 1648  finish:
 1649         if (error != 0) {
 1650                 vm_object_deallocate(sc->object);
 1651                 sc->object = NULL;
 1652         }
 1653         return (error);
 1654 }
 1655 
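/*
 * Editorial note: the truncation flagged above is npage =
 * md_mediasize / PAGE_SIZE rounding down to whole pages; the preceding
 * EDOM range check ensures nothing is actually lost.  For example, with
 * 4 KiB pages a 64 MiB device yields npage = 16384 and a swap object of
 * exactly PAGE_SIZE * npage = 67108864 bytes.
 */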
 1656 static int
 1657 mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1658 {
 1659 
 1672         /*
 1673          * Range check.  Disallow negative sizes and sizes that are not
 1674          * a multiple of the page size.
 1675          */
 1664         if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 1665                 return (EDOM);
 1666 
 1667         return (0);
 1668 }
 1669 
 1670 static int
 1671 kern_mdattach_locked(struct thread *td, struct md_req *mdr)
 1672 {
 1673         struct md_s *sc;
 1674         unsigned sectsize;
 1675         int error, i;
 1676 
 1677         sx_assert(&md_sx, SA_XLOCKED);
 1678 
 1679         switch (mdr->md_type) {
 1680         case MD_MALLOC:
 1681         case MD_PRELOAD:
 1682         case MD_VNODE:
 1683         case MD_SWAP:
 1684         case MD_NULL:
 1685                 break;
 1686         default:
 1687                 return (EINVAL);
 1688         }
 1689         if (mdr->md_sectorsize == 0)
 1690                 sectsize = DEV_BSIZE;
 1691         else
 1692                 sectsize = mdr->md_sectorsize;
 1693         if (sectsize > maxphys || mdr->md_mediasize < sectsize)
 1694                 return (EINVAL);
 1695         if (mdr->md_options & MD_AUTOUNIT)
 1696                 sc = mdnew(-1, &error, mdr->md_type);
 1697         else {
 1698                 if (mdr->md_unit > INT_MAX)
 1699                         return (EINVAL);
 1700                 sc = mdnew(mdr->md_unit, &error, mdr->md_type);
 1701         }
 1702         if (sc == NULL)
 1703                 return (error);
 1704         if (mdr->md_label != NULL)
 1705                 error = copyinstr(mdr->md_label, sc->label,
 1706                     sizeof(sc->label), NULL);
 1707         if (error != 0)
 1708                 goto err_after_new;
 1709         if (mdr->md_options & MD_AUTOUNIT)
 1710                 mdr->md_unit = sc->unit;
 1711         sc->mediasize = mdr->md_mediasize;
 1712         sc->sectorsize = sectsize;
 1713         sc->candelete = true;
 1714         error = EDOOFUS;
 1715         switch (sc->type) {
 1716         case MD_MALLOC:
 1717                 sc->start = mdstart_malloc;
 1718                 error = mdcreate_malloc(sc, mdr);
 1719                 break;
 1720         case MD_PRELOAD:
 1721                 /*
 1722                  * We disallow attaching preloaded memory disks via
 1723                  * ioctl. Preloaded memory disks are automatically
 1724                  * attached in g_md_init().
 1725                  */
 1726                 error = EOPNOTSUPP;
 1727                 break;
 1728         case MD_VNODE:
 1729                 sc->start = mdstart_vnode;
 1730                 error = mdcreate_vnode(sc, mdr, td);
 1731                 break;
 1732         case MD_SWAP:
 1733                 sc->start = mdstart_swap;
 1734                 error = mdcreate_swap(sc, mdr, td);
 1735                 break;
 1736         case MD_NULL:
 1737                 sc->start = mdstart_null;
 1738                 error = mdcreate_null(sc, mdr, td);
 1739                 break;
 1740         }
 1741 err_after_new:
 1742         if (error != 0) {
 1743                 mddestroy(sc, td);
 1744                 return (error);
 1745         }
 1746 
 1747         /* Prune off any residual fractional sector */
 1748         i = sc->mediasize % sc->sectorsize;
 1749         sc->mediasize -= i;
 1750 
 1751         mdinit(sc);
 1752         return (0);
 1753 }
 1754 
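/*
 * Editorial note: kern_mdattach_locked() validates the request,
 * allocates the softc, then dispatches to the per-type create function;
 * any failure funnels through mddestroy(), so a partially constructed
 * device never survives.  The fractional-sector pruning at the end
 * means, for example, that a requested mediasize of 1000000 bytes with
 * 512-byte sectors is trimmed to 999936 (the residual 64 bytes are
 * dropped).
 */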
 1755 static int
 1756 kern_mdattach(struct thread *td, struct md_req *mdr)
 1757 {
 1758         int error;
 1759 
 1760         sx_xlock(&md_sx);
 1761         error = kern_mdattach_locked(td, mdr);
 1762         sx_xunlock(&md_sx);
 1763         return (error);
 1764 }
 1765 
 1766 static int
 1767 kern_mddetach_locked(struct thread *td, struct md_req *mdr)
 1768 {
 1769         struct md_s *sc;
 1770 
 1771         sx_assert(&md_sx, SA_XLOCKED);
 1772 
 1773         if (mdr->md_mediasize != 0 ||
 1774             (mdr->md_options & ~MD_FORCE) != 0)
 1775                 return (EINVAL);
 1776 
 1777         sc = mdfind(mdr->md_unit);
 1778         if (sc == NULL)
 1779                 return (ENOENT);
 1780         if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
 1781             !(mdr->md_options & MD_FORCE))
 1782                 return (EBUSY);
 1783         return (mddestroy(sc, td));
 1784 }
 1785 
 1786 static int
 1787 kern_mddetach(struct thread *td, struct md_req *mdr)
 1788 {
 1789         int error;
 1790 
 1791         sx_xlock(&md_sx);
 1792         error = kern_mddetach_locked(td, mdr);
 1793         sx_xunlock(&md_sx);
 1794         return (error);
 1795 }
 1796 
 1797 static int
 1798 kern_mdresize_locked(struct md_req *mdr)
 1799 {
 1800         struct md_s *sc;
 1801 
 1802         sx_assert(&md_sx, SA_XLOCKED);
 1803 
 1804         if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
 1805                 return (EINVAL);
 1806 
 1807         sc = mdfind(mdr->md_unit);
 1808         if (sc == NULL)
 1809                 return (ENOENT);
 1810         if (mdr->md_mediasize < sc->sectorsize)
 1811                 return (EINVAL);
 1812         mdr->md_mediasize -= mdr->md_mediasize % sc->sectorsize;
 1813         if (mdr->md_mediasize < sc->mediasize &&
 1814             !(sc->flags & MD_FORCE) &&
 1815             !(mdr->md_options & MD_FORCE))
 1816                 return (EBUSY);
 1817         return (mdresize(sc, mdr));
 1818 }
 1819 
 1820 static int
 1821 kern_mdresize(struct md_req *mdr)
 1822 {
 1823         int error;
 1824 
 1825         sx_xlock(&md_sx);
 1826         error = kern_mdresize_locked(mdr);
 1827         sx_xunlock(&md_sx);
 1828         return (error);
 1829 }
 1830 
 1831 static int
 1832 kern_mdquery_locked(struct md_req *mdr)
 1833 {
 1834         struct md_s *sc;
 1835         int error;
 1836 
 1837         sx_assert(&md_sx, SA_XLOCKED);
 1838 
 1839         sc = mdfind(mdr->md_unit);
 1840         if (sc == NULL)
 1841                 return (ENOENT);
 1842         mdr->md_type = sc->type;
 1843         mdr->md_options = sc->flags;
 1844         mdr->md_mediasize = sc->mediasize;
 1845         mdr->md_sectorsize = sc->sectorsize;
 1846         error = 0;
 1847         if (mdr->md_label != NULL) {
 1848                 error = copyout(sc->label, mdr->md_label,
 1849                     strlen(sc->label) + 1);
 1850                 if (error != 0)
 1851                         return (error);
 1852         }
 1853         if (sc->type == MD_VNODE ||
 1854             (sc->type == MD_PRELOAD && mdr->md_file != NULL))
 1855                 error = copyout(sc->file, mdr->md_file,
 1856                     strlen(sc->file) + 1);
 1857         return (error);
 1858 }
 1859 
 1860 static int
 1861 kern_mdquery(struct md_req *mdr)
 1862 {
 1863         int error;
 1864 
 1865         sx_xlock(&md_sx);
 1866         error = kern_mdquery_locked(mdr);
 1867         sx_xunlock(&md_sx);
 1868         return (error);
 1869 }
 1870 
 1871 /* Copy members that are not userspace pointers. */
 1872 #define MD_IOCTL2REQ(mdio, mdr) do {                                    \
 1873         (mdr)->md_unit = (mdio)->md_unit;                               \
 1874         (mdr)->md_type = (mdio)->md_type;                               \
 1875         (mdr)->md_mediasize = (mdio)->md_mediasize;                     \
 1876         (mdr)->md_sectorsize = (mdio)->md_sectorsize;                   \
 1877         (mdr)->md_options = (mdio)->md_options;                         \
 1878         (mdr)->md_fwheads = (mdio)->md_fwheads;                         \
 1879         (mdr)->md_fwsectors = (mdio)->md_fwsectors;                     \
 1880         (mdr)->md_units = &(mdio)->md_pad[0];                           \
 1881         (mdr)->md_units_nitems = nitems((mdio)->md_pad);                \
 1882 } while(0)
 1883 
 1896 /* Copy members that might have been updated. */
 1885 #define MD_REQ2IOCTL(mdr, mdio) do {                                    \
 1886         (mdio)->md_unit = (mdr)->md_unit;                               \
 1887         (mdio)->md_type = (mdr)->md_type;                               \
 1888         (mdio)->md_mediasize = (mdr)->md_mediasize;                     \
 1889         (mdio)->md_sectorsize = (mdr)->md_sectorsize;                   \
 1890         (mdio)->md_options = (mdr)->md_options;                         \
 1891         (mdio)->md_fwheads = (mdr)->md_fwheads;                         \
 1892         (mdio)->md_fwsectors = (mdr)->md_fwsectors;                     \
 1893 } while(0)
 1894 
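/*
 * Editorial note: this macro pair marshals between the ABI-visible ioctl
 * structures (native and, under COMPAT_FREEBSD32, 32-bit) and the
 * internal struct md_req.  Userland pointers are deliberately excluded
 * from the copies, so each ioctl case must translate them explicitly;
 * that is where the 32-bit pointers are widened via uintptr_t below.
 */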
 1895 static int
 1896 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
 1897     struct thread *td)
 1898 {
 1899         struct md_req mdr;
 1900         int error;
 1901 
 1902         if (md_debug)
 1903                 printf("mdctlioctl(%s %lx %p %x %p)\n",
 1904                         devtoname(dev), cmd, addr, flags, td);
 1905 
 1906         bzero(&mdr, sizeof(mdr));
 1907         switch (cmd) {
 1908         case MDIOCATTACH:
 1909         case MDIOCDETACH:
 1910         case MDIOCRESIZE:
 1911         case MDIOCQUERY: {
 1912                 struct md_ioctl *mdio = (struct md_ioctl *)addr;
 1913                 if (mdio->md_version != MDIOVERSION)
 1914                         return (EINVAL);
 1915                 MD_IOCTL2REQ(mdio, &mdr);
 1916                 mdr.md_file = mdio->md_file;
 1917                 mdr.md_file_seg = UIO_USERSPACE;
 1930                 /* A file name adjacent to the md_ioctl is already in kernel space. */
 1919                 if ((void *)mdio->md_file == (void *)(mdio + 1))
 1920                         mdr.md_file_seg = UIO_SYSSPACE;
 1921                 mdr.md_label = mdio->md_label;
 1922                 break;
 1923         }
 1924 #ifdef COMPAT_FREEBSD32
 1925         case MDIOCATTACH_32:
 1926         case MDIOCDETACH_32:
 1927         case MDIOCRESIZE_32:
 1928         case MDIOCQUERY_32: {
 1929                 struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 1930                 if (mdio->md_version != MDIOVERSION)
 1931                         return (EINVAL);
 1932                 MD_IOCTL2REQ(mdio, &mdr);
 1933                 mdr.md_file = (void *)(uintptr_t)mdio->md_file;
 1934                 mdr.md_file_seg = UIO_USERSPACE;
 1935                 mdr.md_label = (void *)(uintptr_t)mdio->md_label;
 1936                 break;
 1937         }
 1938 #endif
 1939         default:
 1940                 /* Fall through to handler switch. */
 1941                 break;
 1942         }
 1943 
 1944         error = 0;
 1945         switch (cmd) {
 1946         case MDIOCATTACH:
 1947 #ifdef COMPAT_FREEBSD32
 1948         case MDIOCATTACH_32:
 1949 #endif
 1950                 error = kern_mdattach(td, &mdr);
 1951                 break;
 1952         case MDIOCDETACH:
 1953 #ifdef COMPAT_FREEBSD32
 1954         case MDIOCDETACH_32:
 1955 #endif
 1956                 error = kern_mddetach(td, &mdr);
 1957                 break;
 1958         case MDIOCRESIZE:
 1959 #ifdef COMPAT_FREEBSD32
 1960         case MDIOCRESIZE_32:
 1961 #endif
 1962                 error = kern_mdresize(&mdr);
 1963                 break;
 1964         case MDIOCQUERY:
 1965 #ifdef COMPAT_FREEBSD32
 1966         case MDIOCQUERY_32:
 1967 #endif
 1968                 error = kern_mdquery(&mdr);
 1969                 break;
 1970         default:
 1971                 error = ENOIOCTL;
 1972         }
 1973 
 1974         switch (cmd) {
 1975         case MDIOCATTACH:
 1976         case MDIOCQUERY: {
 1977                 struct md_ioctl *mdio = (struct md_ioctl *)addr;
 1978                 MD_REQ2IOCTL(&mdr, mdio);
 1979                 break;
 1980         }
 1981 #ifdef COMPAT_FREEBSD32
 1982         case MDIOCATTACH_32:
 1983         case MDIOCQUERY_32: {
 1984                 struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 1985                 MD_REQ2IOCTL(&mdr, mdio);
 1986                 break;
 1987         }
 1988 #endif
 1989         default:
 2002                 /* Other commands do not alter mdr. */
 1991                 break;
 1992         }
 1993 
 1994         return (error);
 1995 }
 1996 
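/*
 * Editorial note: a minimal, hypothetical sketch of driving MDIOCATTACH
 * from userland (mdconfig(8) is the supported front end; the field
 * values here are illustrative assumptions):
 *
 *	#include <sys/types.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/mdioctl.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	struct md_ioctl mdio = { 0 };
 *	mdio.md_version = MDIOVERSION;
 *	mdio.md_type = MD_SWAP;
 *	mdio.md_mediasize = 64 * 1024 * 1024;  // 64 MiB, page-aligned
 *	mdio.md_options = MD_AUTOUNIT;         // let the driver pick a unit
 *	int fd = open("/dev/" MDCTL_NAME, O_RDWR);
 *	if (fd != -1 && ioctl(fd, MDIOCATTACH, &mdio) == 0)
 *		printf("attached md%d\n", mdio.md_unit);
 */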
 1997 static void
 1998 md_preloaded(u_char *image, size_t length, const char *name)
 1999 {
 2000         struct md_s *sc;
 2001         int error;
 2002 
 2003         sc = mdnew(-1, &error, MD_PRELOAD);
 2004         if (sc == NULL)
 2005                 return;
 2006         sc->mediasize = length;
 2007         sc->sectorsize = DEV_BSIZE;
 2008         sc->pl_ptr = image;
 2009         sc->pl_len = length;
 2010         sc->start = mdstart_preload;
 2011         if (name != NULL)
 2012                 strlcpy(sc->file, name, sizeof(sc->file));
 2013 #ifdef MD_ROOT
 2014         if (sc->unit == 0) {
 2015 #ifndef ROOTDEVNAME
 2016                 rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
 2017 #endif
 2018 #ifdef MD_ROOT_READONLY
 2019                 sc->flags |= MD_READONLY;
 2020 #endif
 2021         }
 2022 #endif
 2023         mdinit(sc);
 2024         if (name != NULL) {
 2025                 printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
 2026                     MD_NAME, sc->unit, name, length, image);
 2027         } else {
 2028                 printf("%s%d: Embedded image %zd bytes at %p\n",
 2029                     MD_NAME, sc->unit, length, image);
 2030         }
 2031 }
 2032 
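/*
 * Editorial note: preloaded images become md0, md1, ... in discovery
 * order.  Under MD_ROOT, unit 0 doubles as the root device: unless the
 * kernel hardwires ROOTDEVNAME, rootdevnames[0] is pointed at
 * MD_ROOT_FSTYPE ":/dev/md0", and MD_ROOT_READONLY marks the image
 * read-only so a memory-resident root cannot be modified in place.
 */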
 2033 static void
 2034 g_md_init(struct g_class *mp __unused)
 2035 {
 2036         caddr_t mod;
 2037         u_char *ptr, *name, *type;
 2038         unsigned len;
 2039         int i;
 2040 
 2041         /* figure out log2(NINDIR) */
 2042         for (i = NINDIR, nshift = -1; i; nshift++)
 2043                 i >>= 1;
 2044 
 2045         mod = NULL;
 2046         sx_init(&md_sx, "MD config lock");
 2047         g_topology_unlock();
 2048         md_uh = new_unrhdr(0, INT_MAX, NULL);
 2049 #ifdef MD_ROOT
 2050         if (mfs_root_size != 0) {
 2051                 sx_xlock(&md_sx);
 2052 #ifdef MD_ROOT_MEM
 2053                 md_preloaded(mfs_root, mfs_root_size, NULL);
 2054 #else
 2055                 md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
 2056                     NULL);
 2057 #endif
 2058                 sx_xunlock(&md_sx);
 2059         }
 2060 #endif
 2073         /* XXX: are the preload_* functions static, or do they need Giant? */
 2062         while ((mod = preload_search_next_name(mod)) != NULL) {
 2063                 name = (char *)preload_search_info(mod, MODINFO_NAME);
 2064                 if (name == NULL)
 2065                         continue;
 2066                 type = (char *)preload_search_info(mod, MODINFO_TYPE);
 2067                 if (type == NULL)
 2068                         continue;
 2069                 if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
 2070                         continue;
 2071                 ptr = preload_fetch_addr(mod);
 2072                 len = preload_fetch_size(mod);
 2073                 if (ptr != NULL && len != 0) {
 2074                         sx_xlock(&md_sx);
 2075                         md_preloaded(ptr, len, name);
 2076                         sx_xunlock(&md_sx);
 2077                 }
 2078         }
 2079         md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10);
 2080         status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
 2081             0600, MDCTL_NAME);
 2082         g_topology_lock();
 2083 }
 2084 
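/*
 * Editorial note: the opening loop computes nshift = log2(NINDIR) by
 * counting right shifts; if NINDIR were 512, it would leave nshift = 9.
 * g_md_init() then attaches any images the loader staged as "md_image"
 * or "mfs_root" modules, creates the /dev/mdctl control node, and sizes
 * the private pbuf zone at nswbuf / 10.
 */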
 2085 static void
 2086 g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
 2087     struct g_consumer *cp __unused, struct g_provider *pp)
 2088 {
 2089         struct md_s *mp;
 2090         char *type;
 2091 
 2092         mp = gp->softc;
 2093         if (mp == NULL)
 2094                 return;
 2095 
 2096         switch (mp->type) {
 2097         case MD_MALLOC:
 2098                 type = "malloc";
 2099                 break;
 2100         case MD_PRELOAD:
 2101                 type = "preload";
 2102                 break;
 2103         case MD_VNODE:
 2104                 type = "vnode";
 2105                 break;
 2106         case MD_SWAP:
 2107                 type = "swap";
 2108                 break;
 2109         case MD_NULL:
 2110                 type = "null";
 2111                 break;
 2112         default:
 2113                 type = "unknown";
 2114                 break;
 2115         }
 2116 
 2117         if (pp != NULL) {
 2118                 if (indent == NULL) {
 2119                         sbuf_printf(sb, " u %d", mp->unit);
 2120                         sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
 2121                         sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
 2122                         sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
 2123                         sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
 2124                         sbuf_printf(sb, " t %s", type);
 2125                         if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 2126                             (mp->type == MD_PRELOAD && mp->file[0] != '\0'))
 2127                                 sbuf_printf(sb, " file %s", mp->file);
 2128                         sbuf_printf(sb, " label %s", mp->label);
 2129                 } else {
 2130                         sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
 2131                             mp->unit);
 2132                         sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
 2133                             indent, (uintmax_t) mp->sectorsize);
 2134                         sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
 2135                             indent, (uintmax_t) mp->fwheads);
 2136                         sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
 2137                             indent, (uintmax_t) mp->fwsectors);
 2138                         if (mp->ident[0] != '\0') {
 2139                                 sbuf_printf(sb, "%s<ident>", indent);
 2140                                 g_conf_printf_escaped(sb, "%s", mp->ident);
 2141                                 sbuf_printf(sb, "</ident>\n");
 2142                         }
 2143                         sbuf_printf(sb, "%s<length>%ju</length>\n",
 2144                             indent, (uintmax_t) mp->mediasize);
 2145                         sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
 2146                             (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
 2147                         sbuf_printf(sb, "%s<access>%s</access>\n", indent,
 2148                             (mp->flags & MD_READONLY) == 0 ? "read-write":
 2149                             "read-only");
 2150                         sbuf_printf(sb, "%s<type>%s</type>\n", indent,
 2151                             type);
 2152                         if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 2153                             (mp->type == MD_PRELOAD && mp->file[0] != '\0')) {
 2154                                 sbuf_printf(sb, "%s<file>", indent);
 2155                                 g_conf_printf_escaped(sb, "%s", mp->file);
 2156                                 sbuf_printf(sb, "</file>\n");
 2157                         }
 2158                         if (mp->type == MD_VNODE)
 2159                                 sbuf_printf(sb, "%s<cache>%s</cache>\n", indent,
 2160                                     (mp->flags & MD_CACHE) == 0 ? "off": "on");
 2161                         sbuf_printf(sb, "%s<label>", indent);
 2162                         g_conf_printf_escaped(sb, "%s", mp->label);
 2163                         sbuf_printf(sb, "</label>\n");
 2164                 }
 2165         }
 2166 }
 2167 
 2168 static void
 2169 g_md_fini(struct g_class *mp __unused)
 2170 {
 2171 
 2172         sx_destroy(&md_sx);
 2173         if (status_dev != NULL)
 2174                 destroy_dev(status_dev);
 2175         uma_zdestroy(md_pbuf_zone);
 2176         delete_unrhdr(md_uh);
 2177 }
