FreeBSD/Linux Kernel Cross Reference
sys/dev/md/md.c


    1 /*-
    2  * SPDX-License-Identifier: (Beerware AND BSD-3-Clause)
    3  *
    4  * ----------------------------------------------------------------------------
    5  * "THE BEER-WARE LICENSE" (Revision 42):
    6  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
    7  * can do whatever you want with this stuff. If we meet some day, and you think
    8  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
    9  * ----------------------------------------------------------------------------
   10  *
   11  * $FreeBSD$
   12  *
   13  */
   14 
   15 /*-
   16  * The following functions are based on the vn(4) driver: mdstart_swap(),
   17  * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
   18  * and as such under the following copyright:
   19  *
   20  * Copyright (c) 1988 University of Utah.
   21  * Copyright (c) 1990, 1993
   22  *      The Regents of the University of California.  All rights reserved.
   23  * Copyright (c) 2013 The FreeBSD Foundation
   24  * All rights reserved.
   25  *
   26  * This code is derived from software contributed to Berkeley by
   27  * the Systems Programming Group of the University of Utah Computer
   28  * Science Department.
   29  *
   30  * Portions of this software were developed by Konstantin Belousov
   31  * under sponsorship from the FreeBSD Foundation.
   32  *
   33  * Redistribution and use in source and binary forms, with or without
   34  * modification, are permitted provided that the following conditions
   35  * are met:
   36  * 1. Redistributions of source code must retain the above copyright
   37  *    notice, this list of conditions and the following disclaimer.
   38  * 2. Redistributions in binary form must reproduce the above copyright
   39  *    notice, this list of conditions and the following disclaimer in the
   40  *    documentation and/or other materials provided with the distribution.
   41  * 3. Neither the name of the University nor the names of its contributors
   42  *    may be used to endorse or promote products derived from this software
   43  *    without specific prior written permission.
   44  *
   45  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   55  * SUCH DAMAGE.
   56  *
   57  * from: Utah Hdr: vn.c 1.13 94/04/02
   58  *
   59  *      from: @(#)vn.c  8.6 (Berkeley) 4/1/94
   60  * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
   61  */
   62 
   63 #include "opt_rootdevname.h"
   64 #include "opt_geom.h"
   65 #include "opt_md.h"
   66 
   67 #include <sys/param.h>
   68 #include <sys/systm.h>
   69 #include <sys/bio.h>
   70 #include <sys/buf.h>
   71 #include <sys/conf.h>
   72 #include <sys/devicestat.h>
   73 #include <sys/fcntl.h>
   74 #include <sys/kernel.h>
   75 #include <sys/kthread.h>
   76 #include <sys/limits.h>
   77 #include <sys/linker.h>
   78 #include <sys/lock.h>
   79 #include <sys/malloc.h>
   80 #include <sys/mdioctl.h>
   81 #include <sys/mount.h>
   82 #include <sys/mutex.h>
   83 #include <sys/sx.h>
   84 #include <sys/namei.h>
   85 #include <sys/proc.h>
   86 #include <sys/queue.h>
   87 #include <sys/rwlock.h>
   88 #include <sys/sbuf.h>
   89 #include <sys/sched.h>
   90 #include <sys/sf_buf.h>
   91 #include <sys/sysctl.h>
   92 #include <sys/uio.h>
   93 #include <sys/vnode.h>
   94 #include <sys/disk.h>
   95 
   96 #include <geom/geom.h>
   97 #include <geom/geom_int.h>
   98 
   99 #include <vm/vm.h>
  100 #include <vm/vm_param.h>
  101 #include <vm/vm_object.h>
  102 #include <vm/vm_page.h>
  103 #include <vm/vm_pager.h>
  104 #include <vm/swap_pager.h>
  105 #include <vm/uma.h>
  106 
  107 #include <machine/bus.h>
  108 
  109 #define MD_MODVER 1
  110 
  111 #define MD_SHUTDOWN     0x10000         /* Tell worker thread to terminate. */
  112 #define MD_EXITING      0x20000         /* Worker thread is exiting. */
  113 #define MD_PROVIDERGONE 0x40000         /* Safe to free the softc */
  114 
  115 #ifndef MD_NSECT
  116 #define MD_NSECT (10000 * 2)
  117 #endif
  118 
  119 struct md_req {
  120         unsigned        md_unit;        /* unit number */
  121         enum md_types   md_type;        /* type of disk */
  122         off_t           md_mediasize;   /* size of disk in bytes */
  123         unsigned        md_sectorsize;  /* sectorsize */
  124         unsigned        md_options;     /* options */
  125         int             md_fwheads;     /* firmware heads */
  126         int             md_fwsectors;   /* firmware sectors */
  127         char            *md_file;       /* pathname of file to mount */
  128         enum uio_seg    md_file_seg;    /* location of md_file */
  129         char            *md_label;      /* label of the device (userspace) */
  130         int             *md_units;      /* pointer to units array (kernel) */
  131         size_t          md_units_nitems; /* items in md_units array */
  132 };
  133 
  134 #ifdef COMPAT_FREEBSD32
  135 struct md_ioctl32 {
  136         unsigned        md_version;
  137         unsigned        md_unit;
  138         enum md_types   md_type;
  139         uint32_t        md_file;
  140         off_t           md_mediasize;
  141         unsigned        md_sectorsize;
  142         unsigned        md_options;
  143         uint64_t        md_base;
  144         int             md_fwheads;
  145         int             md_fwsectors;
  146         uint32_t        md_label;
  147         int             md_pad[MDNPAD];
  148 } __attribute__((__packed__));
  149 CTASSERT((sizeof(struct md_ioctl32)) == 436);
  150 
  151 #define MDIOCATTACH_32  _IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32)
  152 #define MDIOCDETACH_32  _IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32)
  153 #define MDIOCQUERY_32   _IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32)
  154 #define MDIOCRESIZE_32  _IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32)
  155 #endif /* COMPAT_FREEBSD32 */
  156 
  157 static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
  158 static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");
  159 
  160 static int md_debug;
  161 SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
  162     "Enable md(4) debug messages");
  163 static int md_malloc_wait;
  164 SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
  165     "Allow malloc to wait for memory allocations");
  166 
  167 #if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
  168 #define MD_ROOT_FSTYPE  "ufs"
  169 #endif
  170 
  171 #if defined(MD_ROOT)
  172 /*
  173  * Preloaded image gets put here.
  174  */
  175 #if defined(MD_ROOT_SIZE)
  176 /*
  177  * We put the mfs_root symbol into the oldmfs section of the kernel object file.
  178  * Applications that patch the object with the image can determine
   179  * the size by looking at the oldmfs section size within the kernel.
  180  */
  181 u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
  182 const int mfs_root_size = sizeof(mfs_root);
  183 #elif defined(MD_ROOT_MEM)
  184 /* MD region already mapped in the memory */
  185 u_char *mfs_root;
  186 int mfs_root_size;
  187 #else
  188 extern volatile u_char __weak_symbol mfs_root;
  189 extern volatile u_char __weak_symbol mfs_root_end;
  190 #define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
  191 #endif
  192 #endif
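
/*
 * Editorial note (not upstream text): three layouts are handled above.
 * With MD_ROOT_SIZE the image lives in the fixed-size "oldmfs" section
 * and its size is simply sizeof(mfs_root); with MD_ROOT_MEM the early
 * boot code is expected to point mfs_root at an already-mapped region
 * and fill in mfs_root_size; otherwise the size falls out of the
 * distance between the weak mfs_root/mfs_root_end linker symbols,
 * which is zero when no image was embedded in the kernel.
 */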
  193 
  194 static g_init_t g_md_init;
  195 static g_fini_t g_md_fini;
  196 static g_start_t g_md_start;
  197 static g_access_t g_md_access;
  198 static void g_md_dumpconf(struct sbuf *sb, const char *indent,
  199     struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);
  200 static g_provgone_t g_md_providergone;
  201 
  202 static struct cdev *status_dev = NULL;
  203 static struct sx md_sx;
  204 static struct unrhdr *md_uh;
  205 
  206 static d_ioctl_t mdctlioctl;
  207 
  208 static struct cdevsw mdctl_cdevsw = {
  209         .d_version =    D_VERSION,
  210         .d_ioctl =      mdctlioctl,
  211         .d_name =       MD_NAME,
  212 };
  213 
  214 struct g_class g_md_class = {
  215         .name = "MD",
  216         .version = G_VERSION,
  217         .init = g_md_init,
  218         .fini = g_md_fini,
  219         .start = g_md_start,
  220         .access = g_md_access,
  221         .dumpconf = g_md_dumpconf,
  222         .providergone = g_md_providergone,
  223 };
  224 
  225 DECLARE_GEOM_CLASS(g_md_class, g_md);
  226 MODULE_VERSION(geom_md, 0);
  227 
  228 static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);
  229 
  230 #define NINDIR  (PAGE_SIZE / sizeof(uintptr_t))
  231 #define NMASK   (NINDIR-1)
  232 static int nshift;
  233 
  234 static uma_zone_t md_pbuf_zone;
  235 
  236 struct indir {
  237         uintptr_t       *array;
  238         u_int           total;
  239         u_int           used;
  240         u_int           shift;
  241 };
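
/*
 * Editorial sketch (assumes 64-bit pointers and 4 KB pages, so
 * NINDIR == 512 and the per-level shift nshift == 9): the indir nodes
 * form a radix tree over sector numbers.  Interior nodes (shift != 0)
 * hold pointers to child indir nodes; leaf nodes (shift == 0) hold,
 * per sector, either 0 (unallocated, reads back as zeros), a value
 * 1..255 encoding a sector filled with that single byte, or a pointer
 * to a real sector buffer from the per-device UMA zone:
 *
 *   root (shift == 18)
 *     array[(secno >> 18) & NMASK] -> indir (shift == 9)
 *       array[(secno >> 9) & NMASK] -> indir (shift == 0)
 *         array[secno & NMASK] -> 0 | fill byte | sector buffer
 */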
  242 
  243 struct md_s {
  244         int unit;
  245         LIST_ENTRY(md_s) list;
  246         struct bio_queue_head bio_queue;
  247         struct mtx queue_mtx;
  248         struct cdev *dev;
  249         enum md_types type;
  250         off_t mediasize;
  251         unsigned sectorsize;
  252         unsigned opencount;
  253         unsigned fwheads;
  254         unsigned fwsectors;
  255         char ident[32];
  256         unsigned flags;
  257         char name[20];
  258         struct proc *procp;
  259         struct g_geom *gp;
  260         struct g_provider *pp;
  261         int (*start)(struct md_s *sc, struct bio *bp);
  262         struct devstat *devstat;
  263 
  264         /* MD_MALLOC related fields */
  265         struct indir *indir;
  266         uma_zone_t uma;
  267 
  268         /* MD_PRELOAD related fields */
  269         u_char *pl_ptr;
  270         size_t pl_len;
  271 
  272         /* MD_VNODE related fields */
  273         struct vnode *vnode;
  274         char file[PATH_MAX];
  275         char label[PATH_MAX];
  276         struct ucred *cred;
  277 
  278         /* MD_SWAP related fields */
  279         vm_object_t object;
  280 };
  281 
  282 static struct indir *
  283 new_indir(u_int shift)
  284 {
  285         struct indir *ip;
  286 
  287         ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
  288             | M_ZERO);
  289         if (ip == NULL)
  290                 return (NULL);
  291         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  292             M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
  293         if (ip->array == NULL) {
  294                 free(ip, M_MD);
  295                 return (NULL);
  296         }
  297         ip->total = NINDIR;
  298         ip->shift = shift;
  299         return (ip);
  300 }
  301 
  302 static void
  303 del_indir(struct indir *ip)
  304 {
  305 
  306         free(ip->array, M_MDSECT);
  307         free(ip, M_MD);
  308 }
  309 
  310 static void
  311 destroy_indir(struct md_s *sc, struct indir *ip)
  312 {
  313         int i;
  314 
  315         for (i = 0; i < NINDIR; i++) {
  316                 if (!ip->array[i])
  317                         continue;
  318                 if (ip->shift)
  319                         destroy_indir(sc, (struct indir*)(ip->array[i]));
  320                 else if (ip->array[i] > 255)
  321                         uma_zfree(sc->uma, (void *)(ip->array[i]));
  322         }
  323         del_indir(ip);
  324 }
  325 
  326 /*
  327  * This function does the math and allocates the top level "indir" structure
  328  * for a device of "size" sectors.
  329  */
  330 
  331 static struct indir *
  332 dimension(off_t size)
  333 {
  334         off_t rcnt;
  335         struct indir *ip;
  336         int layer;
  337 
  338         rcnt = size;
  339         layer = 0;
  340         while (rcnt > NINDIR) {
  341                 rcnt /= NINDIR;
  342                 layer++;
  343         }
  344 
  345         /*
  346          * XXX: the top layer is probably not fully populated, so we allocate
  347          * too much space for ip->array in here.
  348          */
  349         ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
  350         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  351             M_MDSECT, M_WAITOK | M_ZERO);
  352         ip->total = NINDIR;
  353         ip->shift = layer * nshift;
  354         return (ip);
  355 }
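
/*
 * Editorial worked example for the sizing math above, under the same
 * assumptions (NINDIR == 512, nshift == 9): a 4 GB device with
 * 512-byte sectors has 2^23 sectors.  The loop divides 2^23 by 512
 * twice (to 16384, then to 32) before the count is <= NINDIR, so
 * layer == 2 and the top node gets shift == 2 * 9 == 18; each of its
 * entries then spans 2^18 sectors.
 */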
  356 
  357 /*
  358  * Read a given sector
  359  */
  360 
  361 static uintptr_t
  362 s_read(struct indir *ip, off_t offset)
  363 {
  364         struct indir *cip;
  365         int idx;
  366         uintptr_t up;
  367 
  368         if (md_debug > 1)
  369                 printf("s_read(%jd)\n", (intmax_t)offset);
  370         up = 0;
  371         for (cip = ip; cip != NULL;) {
  372                 if (cip->shift) {
  373                         idx = (offset >> cip->shift) & NMASK;
  374                         up = cip->array[idx];
  375                         cip = (struct indir *)up;
  376                         continue;
  377                 }
  378                 idx = offset & NMASK;
  379                 return (cip->array[idx]);
  380         }
  381         return (0);
  382 }
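
/*
 * Editorial example of the walk above, continuing the two-layer tree
 * from the dimension() example: s_read(ip, 0x12345) looks at
 * root->array[(0x12345 >> 18) & NMASK] (slot 0), then that child's
 * array[(0x12345 >> 9) & NMASK] (slot 0x91), and returns the leaf's
 * array[0x12345 & NMASK] (slot 0x145).  Hitting a NULL interior
 * pointer ends the loop early and returns 0, i.e. an unallocated,
 * all-zero sector.
 */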
  383 
  384 /*
  385  * Write a given sector, prune the tree if the value is 0
  386  */
  387 
  388 static int
  389 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
  390 {
  391         struct indir *cip, *lip[10];
  392         int idx, li;
  393         uintptr_t up;
  394 
  395         if (md_debug > 1)
  396                 printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
  397         up = 0;
  398         li = 0;
  399         cip = ip;
  400         for (;;) {
  401                 lip[li++] = cip;
  402                 if (cip->shift) {
  403                         idx = (offset >> cip->shift) & NMASK;
  404                         up = cip->array[idx];
  405                         if (up != 0) {
  406                                 cip = (struct indir *)up;
  407                                 continue;
  408                         }
  409                         /* Allocate branch */
  410                         cip->array[idx] =
  411                             (uintptr_t)new_indir(cip->shift - nshift);
  412                         if (cip->array[idx] == 0)
  413                                 return (ENOSPC);
  414                         cip->used++;
  415                         up = cip->array[idx];
  416                         cip = (struct indir *)up;
  417                         continue;
  418                 }
  419                 /* leafnode */
  420                 idx = offset & NMASK;
  421                 up = cip->array[idx];
  422                 if (up != 0)
  423                         cip->used--;
  424                 cip->array[idx] = ptr;
  425                 if (ptr != 0)
  426                         cip->used++;
  427                 break;
  428         }
  429         if (cip->used != 0 || li == 1)
  430                 return (0);
  431         li--;
  432         while (cip->used == 0 && cip != ip) {
  433                 li--;
  434                 idx = (offset >> lip[li]->shift) & NMASK;
  435                 up = lip[li]->array[idx];
  436                 KASSERT(up == (uintptr_t)cip, ("md screwed up"));
  437                 del_indir(cip);
  438                 lip[li]->array[idx] = 0;
  439                 lip[li]->used--;
  440                 cip = lip[li];
  441         }
  442         return (0);
  443 }
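
/*
 * Editorial note on the pruning above: storing ptr == 0 into a leaf
 * drops that node's "used" count, and once it reaches zero the
 * function walks back up the lip[] stack, frees each emptied node
 * with del_indir(), and clears its slot in the parent.  The root is
 * deliberately excluded (cip != ip), so the tree never shrinks past
 * its top node.
 */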
  444 
  445 static int
  446 g_md_access(struct g_provider *pp, int r, int w, int e)
  447 {
  448         struct md_s *sc;
  449 
  450         sc = pp->geom->softc;
  451         if (sc == NULL) {
  452                 if (r <= 0 && w <= 0 && e <= 0)
  453                         return (0);
  454                 return (ENXIO);
  455         }
  456         r += pp->acr;
  457         w += pp->acw;
  458         e += pp->ace;
  459         if ((sc->flags & MD_READONLY) != 0 && w > 0)
  460                 return (EROFS);
  461         if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
  462                 sc->opencount = 1;
  463         } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
  464                 sc->opencount = 0;
  465         }
  466         return (0);
  467 }
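
/*
 * Editorial note: r, w and e arrive as deltas against the current
 * access counts, so adding pp->acr/acw/ace above turns them into the
 * prospective totals.  The EROFS check therefore rejects any
 * transition that would leave a read-only md device open for writing,
 * and opencount just tracks whether any access remains.
 */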
  468 
  469 static void
  470 g_md_start(struct bio *bp)
  471 {
  472         struct md_s *sc;
  473 
  474         sc = bp->bio_to->geom->softc;
  475         if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
  476                 devstat_start_transaction_bio(sc->devstat, bp);
  477         }
  478         mtx_lock(&sc->queue_mtx);
  479         bioq_disksort(&sc->bio_queue, bp);
  480         wakeup(sc);
  481         mtx_unlock(&sc->queue_mtx);
  482 }
  483 
  484 #define MD_MALLOC_MOVE_ZERO     1
  485 #define MD_MALLOC_MOVE_FILL     2
  486 #define MD_MALLOC_MOVE_READ     3
  487 #define MD_MALLOC_MOVE_WRITE    4
  488 #define MD_MALLOC_MOVE_CMP      5
  489 
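/*
 * Editorial summary of the move operations implemented by
 * md_malloc_move_ma() and md_malloc_move_vlist() below, relative to
 * the caller's I/O buffer: ZERO clears it, FILL replicates one byte
 * into it, READ copies stored sector data out to it, WRITE copies it
 * into a stored sector, and CMP scans it to decide whether every byte
 * is identical (EDOOFUS when not), which is how MD_COMPRESS spots
 * sectors that can be stored as a single fill byte.
 */
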
  490 static int
  491 md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
  492     void *ptr, u_char fill, int op)
  493 {
  494         struct sf_buf *sf;
  495         vm_page_t m, *mp1;
  496         char *p, first;
  497         off_t *uc;
  498         unsigned n;
  499         int error, i, ma_offs1, sz, first_read;
  500 
  501         m = NULL;
  502         error = 0;
  503         sf = NULL;
  504         /* if (op == MD_MALLOC_MOVE_CMP) { gcc */
  505                 first = 0;
  506                 first_read = 0;
  507                 uc = ptr;
  508                 mp1 = *mp;
  509                 ma_offs1 = *ma_offs;
  510         /* } */
  511         sched_pin();
  512         for (n = sectorsize; n != 0; n -= sz) {
  513                 sz = imin(PAGE_SIZE - *ma_offs, n);
  514                 if (m != **mp) {
  515                         if (sf != NULL)
  516                                 sf_buf_free(sf);
  517                         m = **mp;
  518                         sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
  519                             (md_malloc_wait ? 0 : SFB_NOWAIT));
  520                         if (sf == NULL) {
  521                                 error = ENOMEM;
  522                                 break;
  523                         }
  524                 }
  525                 p = (char *)sf_buf_kva(sf) + *ma_offs;
  526                 switch (op) {
  527                 case MD_MALLOC_MOVE_ZERO:
  528                         bzero(p, sz);
  529                         break;
  530                 case MD_MALLOC_MOVE_FILL:
  531                         memset(p, fill, sz);
  532                         break;
  533                 case MD_MALLOC_MOVE_READ:
  534                         bcopy(ptr, p, sz);
  535                         cpu_flush_dcache(p, sz);
  536                         break;
  537                 case MD_MALLOC_MOVE_WRITE:
  538                         bcopy(p, ptr, sz);
  539                         break;
  540                 case MD_MALLOC_MOVE_CMP:
  541                         for (i = 0; i < sz; i++, p++) {
  542                                 if (!first_read) {
  543                                         *uc = (u_char)*p;
  544                                         first = *p;
  545                                         first_read = 1;
  546                                 } else if (*p != first) {
  547                                         error = EDOOFUS;
  548                                         break;
  549                                 }
  550                         }
  551                         break;
  552                 default:
  553                         KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
  554                         break;
  555                 }
  556                 if (error != 0)
  557                         break;
  558                 *ma_offs += sz;
  559                 *ma_offs %= PAGE_SIZE;
  560                 if (*ma_offs == 0)
  561                         (*mp)++;
  562                 ptr = (char *)ptr + sz;
  563         }
  564 
  565         if (sf != NULL)
  566                 sf_buf_free(sf);
  567         sched_unpin();
  568         if (op == MD_MALLOC_MOVE_CMP && error != 0) {
  569                 *mp = mp1;
  570                 *ma_offs = ma_offs1;
  571         }
  572         return (error);
  573 }
  574 
  575 static int
  576 md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
  577     unsigned len, void *ptr, u_char fill, int op)
  578 {
  579         bus_dma_segment_t *vlist;
  580         uint8_t *p, *end, first;
  581         off_t *uc;
  582         int ma_offs, seg_len;
  583 
  584         vlist = *pvlist;
  585         ma_offs = *pma_offs;
  586         uc = ptr;
  587 
  588         for (; len != 0; len -= seg_len) {
  589                 seg_len = imin(vlist->ds_len - ma_offs, len);
  590                 p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
  591                 switch (op) {
  592                 case MD_MALLOC_MOVE_ZERO:
  593                         bzero(p, seg_len);
  594                         break;
  595                 case MD_MALLOC_MOVE_FILL:
  596                         memset(p, fill, seg_len);
  597                         break;
  598                 case MD_MALLOC_MOVE_READ:
  599                         bcopy(ptr, p, seg_len);
  600                         cpu_flush_dcache(p, seg_len);
  601                         break;
  602                 case MD_MALLOC_MOVE_WRITE:
  603                         bcopy(p, ptr, seg_len);
  604                         break;
  605                 case MD_MALLOC_MOVE_CMP:
  606                         end = p + seg_len;
  607                         first = *uc = *p;
  608                         /* Confirm all following bytes match the first */
  609                         while (++p < end) {
  610                                 if (*p != first)
  611                                         return (EDOOFUS);
  612                         }
  613                         break;
  614                 default:
  615                         KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
  616                         break;
  617                 }
  618 
  619                 ma_offs += seg_len;
  620                 if (ma_offs == vlist->ds_len) {
  621                         ma_offs = 0;
  622                         vlist++;
  623                 }
  624                 ptr = (uint8_t *)ptr + seg_len;
  625         }
  626         *pvlist = vlist;
  627         *pma_offs = ma_offs;
  628 
  629         return (0);
  630 }
  631 
  632 static int
  633 mdstart_malloc(struct md_s *sc, struct bio *bp)
  634 {
  635         u_char *dst;
  636         vm_page_t *m;
  637         bus_dma_segment_t *vlist;
  638         int i, error, error1, ma_offs, notmapped;
  639         off_t secno, nsec, uc;
  640         uintptr_t sp, osp;
  641 
  642         switch (bp->bio_cmd) {
  643         case BIO_READ:
  644         case BIO_WRITE:
  645         case BIO_DELETE:
  646                 break;
  647         case BIO_FLUSH:
  648                 return (0);
  649         default:
  650                 return (EOPNOTSUPP);
  651         }
  652 
  653         notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
  654         vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
  655             (bus_dma_segment_t *)bp->bio_data : NULL;
  656         if (notmapped) {
  657                 m = bp->bio_ma;
  658                 ma_offs = bp->bio_ma_offset;
  659                 dst = NULL;
  660                 KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
  661         } else if (vlist != NULL) {
  662                 ma_offs = bp->bio_ma_offset;
  663                 dst = NULL;
  664         } else {
  665                 dst = bp->bio_data;
  666         }
  667 
  668         nsec = bp->bio_length / sc->sectorsize;
  669         secno = bp->bio_offset / sc->sectorsize;
  670         error = 0;
  671         while (nsec--) {
  672                 osp = s_read(sc->indir, secno);
  673                 if (bp->bio_cmd == BIO_DELETE) {
  674                         if (osp != 0)
  675                                 error = s_write(sc->indir, secno, 0);
  676                 } else if (bp->bio_cmd == BIO_READ) {
  677                         if (osp == 0) {
  678                                 if (notmapped) {
  679                                         error = md_malloc_move_ma(&m, &ma_offs,
  680                                             sc->sectorsize, NULL, 0,
  681                                             MD_MALLOC_MOVE_ZERO);
  682                                 } else if (vlist != NULL) {
  683                                         error = md_malloc_move_vlist(&vlist,
  684                                             &ma_offs, sc->sectorsize, NULL, 0,
  685                                             MD_MALLOC_MOVE_ZERO);
  686                                 } else
  687                                         bzero(dst, sc->sectorsize);
  688                         } else if (osp <= 255) {
  689                                 if (notmapped) {
  690                                         error = md_malloc_move_ma(&m, &ma_offs,
  691                                             sc->sectorsize, NULL, osp,
  692                                             MD_MALLOC_MOVE_FILL);
  693                                 } else if (vlist != NULL) {
  694                                         error = md_malloc_move_vlist(&vlist,
  695                                             &ma_offs, sc->sectorsize, NULL, osp,
  696                                             MD_MALLOC_MOVE_FILL);
  697                                 } else
  698                                         memset(dst, osp, sc->sectorsize);
  699                         } else {
  700                                 if (notmapped) {
  701                                         error = md_malloc_move_ma(&m, &ma_offs,
  702                                             sc->sectorsize, (void *)osp, 0,
  703                                             MD_MALLOC_MOVE_READ);
  704                                 } else if (vlist != NULL) {
  705                                         error = md_malloc_move_vlist(&vlist,
  706                                             &ma_offs, sc->sectorsize,
  707                                             (void *)osp, 0,
  708                                             MD_MALLOC_MOVE_READ);
  709                                 } else {
  710                                         bcopy((void *)osp, dst, sc->sectorsize);
  711                                         cpu_flush_dcache(dst, sc->sectorsize);
  712                                 }
  713                         }
  714                         osp = 0;
  715                 } else if (bp->bio_cmd == BIO_WRITE) {
  716                         if (sc->flags & MD_COMPRESS) {
  717                                 if (notmapped) {
  718                                         error1 = md_malloc_move_ma(&m, &ma_offs,
  719                                             sc->sectorsize, &uc, 0,
  720                                             MD_MALLOC_MOVE_CMP);
  721                                         i = error1 == 0 ? sc->sectorsize : 0;
  722                                 } else if (vlist != NULL) {
  723                                         error1 = md_malloc_move_vlist(&vlist,
  724                                             &ma_offs, sc->sectorsize, &uc, 0,
  725                                             MD_MALLOC_MOVE_CMP);
  726                                         i = error1 == 0 ? sc->sectorsize : 0;
  727                                 } else {
  728                                         uc = dst[0];
  729                                         for (i = 1; i < sc->sectorsize; i++) {
  730                                                 if (dst[i] != uc)
  731                                                         break;
  732                                         }
  733                                 }
  734                         } else {
  735                                 i = 0;
  736                                 uc = 0;
  737                         }
  738                         if (i == sc->sectorsize) {
  739                                 if (osp != uc)
  740                                         error = s_write(sc->indir, secno, uc);
  741                         } else {
  742                                 if (osp <= 255) {
  743                                         sp = (uintptr_t)uma_zalloc(sc->uma,
  744                                             md_malloc_wait ? M_WAITOK :
  745                                             M_NOWAIT);
  746                                         if (sp == 0) {
  747                                                 error = ENOSPC;
  748                                                 break;
  749                                         }
  750                                         if (notmapped) {
  751                                                 error = md_malloc_move_ma(&m,
  752                                                     &ma_offs, sc->sectorsize,
  753                                                     (void *)sp, 0,
  754                                                     MD_MALLOC_MOVE_WRITE);
  755                                         } else if (vlist != NULL) {
  756                                                 error = md_malloc_move_vlist(
  757                                                     &vlist, &ma_offs,
  758                                                     sc->sectorsize, (void *)sp,
  759                                                     0, MD_MALLOC_MOVE_WRITE);
  760                                         } else {
  761                                                 bcopy(dst, (void *)sp,
  762                                                     sc->sectorsize);
  763                                         }
  764                                         error = s_write(sc->indir, secno, sp);
  765                                 } else {
  766                                         if (notmapped) {
  767                                                 error = md_malloc_move_ma(&m,
  768                                                     &ma_offs, sc->sectorsize,
  769                                                     (void *)osp, 0,
  770                                                     MD_MALLOC_MOVE_WRITE);
  771                                         } else if (vlist != NULL) {
  772                                                 error = md_malloc_move_vlist(
  773                                                     &vlist, &ma_offs,
  774                                                     sc->sectorsize, (void *)osp,
  775                                                     0, MD_MALLOC_MOVE_WRITE);
  776                                         } else {
  777                                                 bcopy(dst, (void *)osp,
  778                                                     sc->sectorsize);
  779                                         }
  780                                         osp = 0;
  781                                 }
  782                         }
  783                 } else {
  784                         error = EOPNOTSUPP;
  785                 }
  786                 if (osp > 255)
  787                         uma_zfree(sc->uma, (void*)osp);
  788                 if (error != 0)
  789                         break;
  790                 secno++;
  791                 if (!notmapped && vlist == NULL)
  792                         dst += sc->sectorsize;
  793         }
  794         bp->bio_resid = 0;
  795         return (error);
  796 }
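
/*
 * Editorial note on the BIO_WRITE path above: with MD_COMPRESS set,
 * i only reaches sc->sectorsize when every byte of the incoming
 * sector equals uc, in which case the bare byte value is stored in
 * the indir tree instead of an allocated buffer; any previously
 * allocated buffer (osp > 255) is then released by the uma_zfree()
 * at the bottom of the loop.
 */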
  797 
  798 static void
  799 mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
  800 {
  801         off_t seg_len;
  802 
  803         while (offset >= vlist->ds_len) {
  804                 offset -= vlist->ds_len;
  805                 vlist++;
  806         }
  807 
  808         while (len != 0) {
  809                 seg_len = omin(len, vlist->ds_len - offset);
  810                 bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset),
  811                     seg_len);
  812                 offset = 0;
  813                 src = (uint8_t *)src + seg_len;
  814                 len -= seg_len;
  815                 vlist++;
  816         }
  817 }
  818 
  819 static void
  820 mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
  821 {
  822         off_t seg_len;
  823 
  824         while (offset >= vlist->ds_len) {
  825                 offset -= vlist->ds_len;
  826                 vlist++;
  827         }
  828 
  829         while (len != 0) {
  830                 seg_len = omin(len, vlist->ds_len - offset);
  831                 bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst,
  832                     seg_len);
  833                 offset = 0;
  834                 dst = (uint8_t *)dst + seg_len;
  835                 len -= seg_len;
  836                 vlist++;
  837         }
  838 }
  839 
  840 static int
  841 mdstart_preload(struct md_s *sc, struct bio *bp)
  842 {
  843         uint8_t *p;
  844 
  845         p = sc->pl_ptr + bp->bio_offset;
  846         switch (bp->bio_cmd) {
  847         case BIO_READ:
  848                 if ((bp->bio_flags & BIO_VLIST) != 0) {
  849                         mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data,
  850                             bp->bio_ma_offset, bp->bio_length);
  851                 } else {
  852                         bcopy(p, bp->bio_data, bp->bio_length);
  853                 }
  854                 cpu_flush_dcache(bp->bio_data, bp->bio_length);
  855                 break;
  856         case BIO_WRITE:
  857                 if ((bp->bio_flags & BIO_VLIST) != 0) {
  858                         mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
  859                             bp->bio_ma_offset, p, bp->bio_length);
  860                 } else {
  861                         bcopy(bp->bio_data, p, bp->bio_length);
  862                 }
  863                 break;
  864         }
  865         bp->bio_resid = 0;
  866         return (0);
  867 }
  868 
  869 static int
  870 mdstart_vnode(struct md_s *sc, struct bio *bp)
  871 {
  872         int error;
  873         struct uio auio;
  874         struct iovec aiov;
  875         struct iovec *piov;
  876         struct mount *mp;
  877         struct vnode *vp;
  878         struct buf *pb;
  879         bus_dma_segment_t *vlist;
  880         struct thread *td;
  881         off_t iolen, iostart, len, zerosize;
  882         int ma_offs, npages;
  883 
  884         switch (bp->bio_cmd) {
  885         case BIO_READ:
  886                 auio.uio_rw = UIO_READ;
  887                 break;
  888         case BIO_WRITE:
  889         case BIO_DELETE:
  890                 auio.uio_rw = UIO_WRITE;
  891                 break;
  892         case BIO_FLUSH:
  893                 break;
  894         default:
  895                 return (EOPNOTSUPP);
  896         }
  897 
  898         td = curthread;
  899         vp = sc->vnode;
  900         pb = NULL;
  901         piov = NULL;
  902         ma_offs = bp->bio_ma_offset;
  903         len = bp->bio_length;
  904 
  905         /*
  906          * VNODE I/O
  907          *
  908          * If an error occurs, we set BIO_ERROR but we do not set
  909          * B_INVAL because (for a write anyway), the buffer is
  910          * still valid.
  911          */
  912 
  913         if (bp->bio_cmd == BIO_FLUSH) {
  914                 do {
  915                         (void)vn_start_write(vp, &mp, V_WAIT);
  916                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  917                         error = VOP_FSYNC(vp, MNT_WAIT, td);
  918                         VOP_UNLOCK(vp);
  919                         vn_finished_write(mp);
  920                 } while (error == ERELOOKUP);
  921                 return (error);
  922         }
  923 
  924         auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
  925         auio.uio_resid = bp->bio_length;
  926         auio.uio_segflg = UIO_SYSSPACE;
  927         auio.uio_td = td;
  928 
  929         if (bp->bio_cmd == BIO_DELETE) {
  930                 /*
  931                  * Emulate BIO_DELETE by writing zeros.
  932                  */
  933                 zerosize = ZERO_REGION_SIZE -
  934                     (ZERO_REGION_SIZE % sc->sectorsize);
  935                 auio.uio_iovcnt = howmany(bp->bio_length, zerosize);
  936                 piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK);
  937                 auio.uio_iov = piov;
  938                 while (len > 0) {
  939                         piov->iov_base = __DECONST(void *, zero_region);
  940                         piov->iov_len = len;
  941                         if (len > zerosize)
  942                                 piov->iov_len = zerosize;
  943                         len -= piov->iov_len;
  944                         piov++;
  945                 }
  946                 piov = auio.uio_iov;
  947         } else if ((bp->bio_flags & BIO_VLIST) != 0) {
  948                 piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
  949                 auio.uio_iov = piov;
  950                 vlist = (bus_dma_segment_t *)bp->bio_data;
  951                 while (len > 0) {
  952                         piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
  953                             ma_offs);
  954                         piov->iov_len = vlist->ds_len - ma_offs;
  955                         if (piov->iov_len > len)
  956                                 piov->iov_len = len;
  957                         len -= piov->iov_len;
  958                         ma_offs = 0;
  959                         vlist++;
  960                         piov++;
  961                 }
  962                 auio.uio_iovcnt = piov - auio.uio_iov;
  963                 piov = auio.uio_iov;
  964         } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
  965                 pb = uma_zalloc(md_pbuf_zone, M_WAITOK);
  966                 MPASS((pb->b_flags & B_MAXPHYS) != 0);
  967                 bp->bio_resid = len;
  968 unmapped_step:
  969                 npages = atop(min(maxphys, round_page(len + (ma_offs &
  970                     PAGE_MASK))));
  971                 iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
  972                 KASSERT(iolen > 0, ("zero iolen"));
  973                 pmap_qenter((vm_offset_t)pb->b_data,
  974                     &bp->bio_ma[atop(ma_offs)], npages);
  975                 aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
  976                     (ma_offs & PAGE_MASK));
  977                 aiov.iov_len = iolen;
  978                 auio.uio_iov = &aiov;
  979                 auio.uio_iovcnt = 1;
  980                 auio.uio_resid = iolen;
  981         } else {
  982                 aiov.iov_base = bp->bio_data;
  983                 aiov.iov_len = bp->bio_length;
  984                 auio.uio_iov = &aiov;
  985                 auio.uio_iovcnt = 1;
  986         }
  987         iostart = auio.uio_offset;
  988         if (auio.uio_rw == UIO_READ) {
  989                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  990                 error = VOP_READ(vp, &auio, 0, sc->cred);
  991                 VOP_UNLOCK(vp);
  992         } else {
  993                 (void) vn_start_write(vp, &mp, V_WAIT);
  994                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  995                 error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
  996                     sc->cred);
  997                 VOP_UNLOCK(vp);
  998                 vn_finished_write(mp);
  999                 if (error == 0)
 1000                         sc->flags &= ~MD_VERIFY;
 1001         }
 1002 
 1003         /* When MD_CACHE is set, try to avoid double-caching the data. */
 1004         if (error == 0 && (sc->flags & MD_CACHE) == 0)
 1005                 VOP_ADVISE(vp, iostart, auio.uio_offset - 1,
 1006                     POSIX_FADV_DONTNEED);
 1007 
 1008         if (pb != NULL) {
 1009                 pmap_qremove((vm_offset_t)pb->b_data, npages);
 1010                 if (error == 0) {
 1011                         len -= iolen;
 1012                         bp->bio_resid -= iolen;
 1013                         ma_offs += iolen;
 1014                         if (len > 0)
 1015                                 goto unmapped_step;
 1016                 }
 1017                 uma_zfree(md_pbuf_zone, pb);
 1018         } else {
 1019                 bp->bio_resid = auio.uio_resid;
 1020         }
 1021 
 1022         free(piov, M_MD);
 1023         return (error);
 1024 }
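
/*
 * Editorial note on the BIO_UNMAPPED path above: a bio larger than
 * maxphys cannot be mapped into the borrowed pbuf at once, so the
 * unmapped_step loop maps up to maxphys bytes of bio_ma[] pages with
 * pmap_qenter(), runs VOP_READ()/VOP_WRITE() on that window, and
 * repeats until len is exhausted or an error stops it.
 */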
 1025 
 1026 static int
 1027 mdstart_swap(struct md_s *sc, struct bio *bp)
 1028 {
 1029         vm_page_t m;
 1030         u_char *p;
 1031         vm_pindex_t i, lastp;
 1032         bus_dma_segment_t *vlist;
 1033         int rv, ma_offs, offs, len, lastend;
 1034 
 1035         switch (bp->bio_cmd) {
 1036         case BIO_READ:
 1037         case BIO_WRITE:
 1038         case BIO_DELETE:
 1039                 break;
 1040         case BIO_FLUSH:
 1041                 return (0);
 1042         default:
 1043                 return (EOPNOTSUPP);
 1044         }
 1045 
 1046         p = bp->bio_data;
 1047         ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
 1048             bp->bio_ma_offset : 0;
 1049         vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
 1050             (bus_dma_segment_t *)bp->bio_data : NULL;
 1051 
 1052         /*
 1053          * offs is the offset at which to start operating on the
  1054          * next (i.e., first) page.  lastp is the last page on
 1055          * which we're going to operate.  lastend is the ending
  1056          * position within that last page (i.e., PAGE_SIZE if
 1057          * we're operating on complete aligned pages).
 1058          */
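        /*
         * Editorial example, assuming 4 KB pages: for bio_offset 1536
         * and bio_length 8192 the last byte touched is 9727, so
         * offs == 1536, lastp == 2 and lastend == 1536; the loop then
         * covers 2560 bytes of page 0, all of page 1, and the first
         * 1536 bytes of page 2.
         */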
 1059         offs = bp->bio_offset % PAGE_SIZE;
 1060         lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
 1061         lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
 1062 
 1063         rv = VM_PAGER_OK;
 1064         vm_object_pip_add(sc->object, 1);
 1065         for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
 1066                 len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
 1067                 m = vm_page_grab_unlocked(sc->object, i, VM_ALLOC_SYSTEM);
 1068                 if (bp->bio_cmd == BIO_READ) {
 1069                         if (vm_page_all_valid(m))
 1070                                 rv = VM_PAGER_OK;
 1071                         else
 1072                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1073                                     NULL, NULL);
 1074                         if (rv == VM_PAGER_ERROR) {
 1075                                 VM_OBJECT_WLOCK(sc->object);
 1076                                 vm_page_free(m);
 1077                                 VM_OBJECT_WUNLOCK(sc->object);
 1078                                 break;
 1079                         } else if (rv == VM_PAGER_FAIL) {
 1080                                 /*
 1081                                  * Pager does not have the page.  Zero
 1082                                  * the allocated page, and mark it as
 1083                                  * valid. Do not set dirty, the page
 1084                                  * can be recreated if thrown out.
 1085                                  */
 1086                                 pmap_zero_page(m);
 1087                                 vm_page_valid(m);
 1088                         }
 1089                         if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 1090                                 pmap_copy_pages(&m, offs, bp->bio_ma,
 1091                                     ma_offs, len);
 1092                         } else if ((bp->bio_flags & BIO_VLIST) != 0) {
 1093                                 physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
 1094                                     vlist, ma_offs, len);
 1095                                 cpu_flush_dcache(p, len);
 1096                         } else {
 1097                                 physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
 1098                                 cpu_flush_dcache(p, len);
 1099                         }
 1100                 } else if (bp->bio_cmd == BIO_WRITE) {
 1101                         if (len == PAGE_SIZE || vm_page_all_valid(m))
 1102                                 rv = VM_PAGER_OK;
 1103                         else
 1104                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1105                                     NULL, NULL);
 1106                         if (rv == VM_PAGER_ERROR) {
 1107                                 VM_OBJECT_WLOCK(sc->object);
 1108                                 vm_page_free(m);
 1109                                 VM_OBJECT_WUNLOCK(sc->object);
 1110                                 break;
 1111                         } else if (rv == VM_PAGER_FAIL)
 1112                                 pmap_zero_page(m);
 1113 
 1114                         if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 1115                                 pmap_copy_pages(bp->bio_ma, ma_offs, &m,
 1116                                     offs, len);
 1117                         } else if ((bp->bio_flags & BIO_VLIST) != 0) {
 1118                                 physcopyin_vlist(vlist, ma_offs,
 1119                                     VM_PAGE_TO_PHYS(m) + offs, len);
 1120                         } else {
 1121                                 physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
 1122                         }
 1123 
 1124                         vm_page_valid(m);
 1125                         vm_page_set_dirty(m);
 1126                 } else if (bp->bio_cmd == BIO_DELETE) {
 1127                         if (len == PAGE_SIZE || vm_page_all_valid(m))
 1128                                 rv = VM_PAGER_OK;
 1129                         else
 1130                                 rv = vm_pager_get_pages(sc->object, &m, 1,
 1131                                     NULL, NULL);
 1132                         VM_OBJECT_WLOCK(sc->object);
 1133                         if (rv == VM_PAGER_ERROR) {
 1134                                 vm_page_free(m);
 1135                                 VM_OBJECT_WUNLOCK(sc->object);
 1136                                 break;
 1137                         } else if (rv == VM_PAGER_FAIL) {
 1138                                 vm_page_free(m);
 1139                                 m = NULL;
 1140                         } else {
 1141                                 /* Page is valid. */
 1142                                 if (len != PAGE_SIZE) {
 1143                                         pmap_zero_page_area(m, offs, len);
 1144                                         vm_page_set_dirty(m);
 1145                                 } else {
 1146                                         vm_pager_page_unswapped(m);
 1147                                         vm_page_free(m);
 1148                                         m = NULL;
 1149                                 }
 1150                         }
 1151                         VM_OBJECT_WUNLOCK(sc->object);
 1152                 }
 1153                 if (m != NULL) {
 1154                         /*
 1155                          * The page may be deactivated prior to setting
 1156                          * PGA_REFERENCED, but in this case it will be
 1157                          * reactivated by the page daemon.
 1158                          */
 1159                         if (vm_page_active(m))
 1160                                 vm_page_reference(m);
 1161                         else
 1162                                 vm_page_activate(m);
 1163                         vm_page_xunbusy(m);
 1164                 }
 1165 
 1166                 /* Actions on further pages start at offset 0 */
 1167                 p += PAGE_SIZE - offs;
 1168                 offs = 0;
 1169                 ma_offs += len;
 1170         }
 1171         vm_object_pip_wakeup(sc->object);
 1172         return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
 1173 }
 1174 
 1175 static int
 1176 mdstart_null(struct md_s *sc, struct bio *bp)
 1177 {
 1178 
 1179         switch (bp->bio_cmd) {
 1180         case BIO_READ:
 1181                 bzero(bp->bio_data, bp->bio_length);
 1182                 cpu_flush_dcache(bp->bio_data, bp->bio_length);
 1183                 break;
 1184         case BIO_WRITE:
 1185                 break;
 1186         }
 1187         bp->bio_resid = 0;
 1188         return (0);
 1189 }
 1190 
 1191 static void
 1192 md_handleattr(struct md_s *sc, struct bio *bp)
 1193 {
 1194         if (sc->fwsectors && sc->fwheads &&
 1195             (g_handleattr_int(bp, "GEOM::fwsectors", sc->fwsectors) != 0 ||
 1196             g_handleattr_int(bp, "GEOM::fwheads", sc->fwheads) != 0))
 1197                 return;
 1198         if (g_handleattr_int(bp, "GEOM::candelete", 1) != 0)
 1199                 return;
 1200         if (sc->ident[0] != '\0' &&
 1201             g_handleattr_str(bp, "GEOM::ident", sc->ident) != 0)
 1202                 return;
 1203         if (g_handleattr_int(bp, "MNT::verified", (sc->flags & MD_VERIFY) != 0))
 1204                 return;
 1205         g_io_deliver(bp, EOPNOTSUPP);
 1206 }
 1207 
 1208 static void
 1209 md_kthread(void *arg)
 1210 {
 1211         struct md_s *sc;
 1212         struct bio *bp;
 1213         int error;
 1214 
 1215         sc = arg;
 1216         thread_lock(curthread);
 1217         sched_prio(curthread, PRIBIO);
 1218         thread_unlock(curthread);
 1219         if (sc->type == MD_VNODE)
 1220                 curthread->td_pflags |= TDP_NORUNNINGBUF;
 1221 
 1222         for (;;) {
 1223                 mtx_lock(&sc->queue_mtx);
 1224                 if (sc->flags & MD_SHUTDOWN) {
 1225                         sc->flags |= MD_EXITING;
 1226                         mtx_unlock(&sc->queue_mtx);
 1227                         kproc_exit(0);
 1228                 }
 1229                 bp = bioq_takefirst(&sc->bio_queue);
 1230                 if (!bp) {
 1231                         msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
 1232                         continue;
 1233                 }
 1234                 mtx_unlock(&sc->queue_mtx);
 1235                 if (bp->bio_cmd == BIO_GETATTR) {
 1236                         md_handleattr(sc, bp);
 1237                 } else {
 1238                         error = sc->start(sc, bp);
 1239                         if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
 1240                                 /*
 1241                                  * Devstat uses (bio_bcount, bio_resid) for
 1242                                  * determining the length of the completed part
 1243                                  * of the i/o.  g_io_deliver() will translate
 1244                                  * from bio_completed to that, but it also
 1245                                  * destroys the bio so we must do our own
 1246                                  * translation.
 1247                                  */
 1248                                 bp->bio_bcount = bp->bio_length;
 1249                                 devstat_end_transaction_bio(sc->devstat, bp);
 1250                         }
 1251                         bp->bio_completed = bp->bio_length - bp->bio_resid;
 1252                         g_io_deliver(bp, error);
 1253                 }
 1254         }
 1255 }
 1256 
 1257 static struct md_s *
 1258 mdfind(int unit)
 1259 {
 1260         struct md_s *sc;
 1261 
 1262         LIST_FOREACH(sc, &md_softc_list, list) {
 1263                 if (sc->unit == unit)
 1264                         break;
 1265         }
 1266         return (sc);
 1267 }
 1268 
 1269 static struct md_s *
 1270 mdnew(int unit, int *errp, enum md_types type)
 1271 {
 1272         struct md_s *sc;
 1273         int error;
 1274 
 1275         *errp = 0;
 1276         if (unit == -1)
 1277                 unit = alloc_unr(md_uh);
 1278         else
 1279                 unit = alloc_unr_specific(md_uh, unit);
 1280 
 1281         if (unit == -1) {
 1282                 *errp = EBUSY;
 1283                 return (NULL);
 1284         }
 1285 
 1286         sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
 1287         sc->type = type;
 1288         bioq_init(&sc->bio_queue);
 1289         mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
 1290         sc->unit = unit;
 1291         sprintf(sc->name, "md%d", unit);
 1292         LIST_INSERT_HEAD(&md_softc_list, sc, list);
 1293         error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
 1294         if (error == 0)
 1295                 return (sc);
 1296         LIST_REMOVE(sc, list);
 1297         mtx_destroy(&sc->queue_mtx);
 1298         free_unr(md_uh, sc->unit);
 1299         free(sc, M_MD);
 1300         *errp = error;
 1301         return (NULL);
 1302 }
 1303 
 1304 static void
 1305 mdinit(struct md_s *sc)
 1306 {
 1307         struct g_geom *gp;
 1308         struct g_provider *pp;
 1309 
 1310         g_topology_lock();
 1311         gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
 1312         gp->softc = sc;
 1313         pp = g_new_providerf(gp, "md%d", sc->unit);
 1314         devstat_remove_entry(pp->stat);
 1315         pp->stat = NULL;
 1316         pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 1317         pp->mediasize = sc->mediasize;
 1318         pp->sectorsize = sc->sectorsize;
 1319         switch (sc->type) {
 1320         case MD_MALLOC:
 1321         case MD_VNODE:
 1322         case MD_SWAP:
 1323                 pp->flags |= G_PF_ACCEPT_UNMAPPED;
 1324                 break;
 1325         case MD_PRELOAD:
 1326         case MD_NULL:
 1327                 break;
 1328         }
 1329         sc->gp = gp;
 1330         sc->pp = pp;
 1331         sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
 1332             DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
 1333         sc->devstat->id = pp;
 1334         g_error_provider(pp, 0);
 1335         g_topology_unlock();
 1336 }
 1337 
 1338 static int
 1339 mdcreate_malloc(struct md_s *sc, struct md_req *mdr)
 1340 {
 1341         uintptr_t sp;
 1342         int error;
 1343         off_t u;
 1344 
 1345         error = 0;
 1346         if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
 1347                 return (EINVAL);
 1348         if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize))
 1349                 return (EINVAL);
 1350         /* Compression doesn't make sense if we have reserved space */
 1351         if (mdr->md_options & MD_RESERVE)
 1352                 mdr->md_options &= ~MD_COMPRESS;
 1353         if (mdr->md_fwsectors != 0)
 1354                 sc->fwsectors = mdr->md_fwsectors;
 1355         if (mdr->md_fwheads != 0)
 1356                 sc->fwheads = mdr->md_fwheads;
 1357         sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE);
 1358         sc->indir = dimension(sc->mediasize / sc->sectorsize);
 1359         sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
 1360             0x1ff, 0);  /* align mask: 512-byte-aligned sector buffers */
 1361         if (mdr->md_options & MD_RESERVE) {
 1362                 off_t nsectors;
 1363 
 1364                 nsectors = sc->mediasize / sc->sectorsize;
 1365                 for (u = 0; u < nsectors; u++) {
 1366                         sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
 1367                             M_WAITOK : M_NOWAIT) | M_ZERO);
 1368                         if (sp != 0)
 1369                                 error = s_write(sc->indir, u, sp);
 1370                         else
 1371                                 error = ENOMEM;
 1372                         if (error != 0)
 1373                                 break;
 1374                 }
 1375         }
 1376         return (error);
 1377 }
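/*
 * Editor's note: a worked example of the MD_RESERVE path above.  A
 * 1 MiB malloc-backed disk with 512-byte sectors preallocates all
 * 1048576 / 512 == 2048 sectors from the per-disk UMA zone, zeroed,
 * so later writes cannot fail for lack of memory; without
 * MD_RESERVE, sectors are instead allocated lazily on first write.
 */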
 1378 
 1379 static int
 1380 mdsetcred(struct md_s *sc, struct ucred *cred)
 1381 {
 1382         char *tmpbuf;
 1383         int error = 0;
 1384 
 1385         /*
 1386          * Set credentials in our softc.
 1387          */
 1388 
 1389         if (sc->cred)
 1390                 crfree(sc->cred);
 1391         sc->cred = crhold(cred);
 1392 
 1393         /*
 1394          * Horrible kludge to establish credentials for NFS.  XXX
 1395          */
 1396 
 1397         if (sc->vnode) {
 1398                 struct uio auio;
 1399                 struct iovec aiov;
 1400 
 1401                 tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
 1402                 bzero(&auio, sizeof(auio));
 1403 
 1404                 aiov.iov_base = tmpbuf;
 1405                 aiov.iov_len = sc->sectorsize;
 1406                 auio.uio_iov = &aiov;
 1407                 auio.uio_iovcnt = 1;
 1408                 auio.uio_offset = 0;
 1409                 auio.uio_rw = UIO_READ;
 1410                 auio.uio_segflg = UIO_SYSSPACE;
 1411                 auio.uio_resid = aiov.iov_len;
 1412                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 1413                 error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
 1414                 VOP_UNLOCK(sc->vnode);
 1415                 free(tmpbuf, M_TEMP);
 1416         }
 1417         return (error);
 1418 }
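/*
 * Editor's note (an editorial gloss on the XXX above): the throwaway
 * one-sector VOP_READ() is issued under the newly installed
 * credential so that a network filesystem such as NFS associates
 * that credential with the backing file; the worker thread's later
 * I/O then proceeds under it.
 */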
 1419 
 1420 static int
 1421 mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1422 {
 1423         struct vattr vattr;
 1424         struct nameidata nd;
 1425         char *fname;
 1426         int error, flags;
 1427 
 1428         fname = mdr->md_file;
 1429         if (mdr->md_file_seg == UIO_USERSPACE) {
 1430                 error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
 1431                 if (error != 0)
 1432                         return (error);
 1433         } else if (mdr->md_file_seg == UIO_SYSSPACE)
 1434                 strlcpy(sc->file, fname, sizeof(sc->file));
 1435         else
 1436                 return (EDOOFUS);
 1437 
 1438         /*
 1439          * If the user specified that this is a read only device, don't
 1440          * set the FWRITE mask before trying to open the backing store.
 1441          */
 1442         flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) |
 1443             ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0);
 1444         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
 1445         error = vn_open(&nd, &flags, 0, NULL);
 1446         if (error != 0)
 1447                 return (error);
 1448         NDFREE(&nd, NDF_ONLY_PNBUF);
 1449         if (nd.ni_vp->v_type != VREG) {
 1450                 error = EINVAL;
 1451                 goto bad;
 1452         }
 1453         error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
 1454         if (error != 0)
 1455                 goto bad;
 1456         if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
 1457                 vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
 1458                 if (VN_IS_DOOMED(nd.ni_vp)) {
 1459                         /* Forced unmount. */
 1460                         error = EBADF;
 1461                         goto bad;
 1462                 }
 1463         }
 1464         nd.ni_vp->v_vflag |= VV_MD;
 1465         VOP_UNLOCK(nd.ni_vp);
 1466 
 1467         if (mdr->md_fwsectors != 0)
 1468                 sc->fwsectors = mdr->md_fwsectors;
 1469         if (mdr->md_fwheads != 0)
 1470                 sc->fwheads = mdr->md_fwheads;
 1471         snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju",
 1472             (uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid);
 1473         sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE |
 1474             MD_VERIFY);
 1475         if (!(flags & FWRITE))
 1476                 sc->flags |= MD_READONLY;
 1477         sc->vnode = nd.ni_vp;
 1478 
 1479         error = mdsetcred(sc, td->td_ucred);
 1480         if (error != 0) {
 1481                 sc->vnode = NULL;
 1482                 vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
 1483                 nd.ni_vp->v_vflag &= ~VV_MD;
 1484                 goto bad;
 1485         }
 1486         return (0);
 1487 bad:
 1488         VOP_UNLOCK(nd.ni_vp);
 1489         (void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
 1490         return (error);
 1491 }
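/*
 * Editor's note: the error path above must hand vn_close() the same
 * `flags' value that vn_open() consumed; vn_close(9) uses FWRITE to
 * balance the vnode's write count, so closing with different flags
 * would unbalance it.  The MD_READONLY test keys off the same
 * variable: FWRITE is only set when the user did not request a
 * read-only device.
 */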
 1492 
 1493 static void
 1494 g_md_providergone(struct g_provider *pp)
 1495 {
 1496         struct md_s *sc = pp->geom->softc;
 1497 
 1498         mtx_lock(&sc->queue_mtx);
 1499         sc->flags |= MD_PROVIDERGONE;
 1500         wakeup(&sc->flags);
 1501         mtx_unlock(&sc->queue_mtx);
 1502 }
 1503 
 1504 static int
 1505 mddestroy(struct md_s *sc, struct thread *td)
 1506 {
 1507 
 1508         if (sc->gp) {
 1509                 g_topology_lock();
 1510                 g_wither_geom(sc->gp, ENXIO);
 1511                 g_topology_unlock();
 1512 
 1513                 mtx_lock(&sc->queue_mtx);
 1514                 while (!(sc->flags & MD_PROVIDERGONE))
 1515                         msleep(&sc->flags, &sc->queue_mtx, PRIBIO, "mddestroy", 0);
 1516                 mtx_unlock(&sc->queue_mtx);
 1517         }
 1518         if (sc->devstat) {
 1519                 devstat_remove_entry(sc->devstat);
 1520                 sc->devstat = NULL;
 1521         }
 1522         mtx_lock(&sc->queue_mtx);
 1523         sc->flags |= MD_SHUTDOWN;
 1524         wakeup(sc);
 1525         while (!(sc->flags & MD_EXITING))
 1526                 msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
 1527         mtx_unlock(&sc->queue_mtx);
 1528         mtx_destroy(&sc->queue_mtx);
 1529         if (sc->vnode != NULL) {
 1530                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
 1531                 sc->vnode->v_vflag &= ~VV_MD;
 1532                 VOP_UNLOCK(sc->vnode);
 1533                 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
 1534                     FREAD : (FREAD|FWRITE), sc->cred, td);
 1535         }
 1536         if (sc->cred != NULL)
 1537                 crfree(sc->cred);
 1538         if (sc->object != NULL)
 1539                 vm_object_deallocate(sc->object);
 1540         if (sc->indir)
 1541                 destroy_indir(sc, sc->indir);
 1542         if (sc->uma)
 1543                 uma_zdestroy(sc->uma);
 1544 
 1545         LIST_REMOVE(sc, list);
 1546         free_unr(md_uh, sc->unit);
 1547         free(sc, M_MD);
 1548         return (0);
 1549 }
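/*
 * Editor's note: a minimal sketch of the two-phase stop handshake
 * mddestroy() uses on the worker kthread.  Names and flag values are
 * illustrative; in md the request flag is MD_SHUTDOWN, the wakeup
 * channel is the softc, and the acknowledgement is MD_EXITING.
 */
static void
example_stop(struct mtx *mtxp, int *flagsp, void *ack_chan)
{
        mtx_lock(mtxp);
        *flagsp |= 0x01;                /* request shutdown */
        wakeup(flagsp);                 /* worker sleeps on flagsp */
        while ((*flagsp & 0x02) == 0)   /* wait for the exit ack */
                msleep(ack_chan, mtxp, PRIBIO, "exstop", hz / 10);
        mtx_unlock(mtxp);
}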
 1550 
 1551 static int
 1552 mdresize(struct md_s *sc, struct md_req *mdr)
 1553 {
 1554         int error, res;
 1555         vm_pindex_t oldpages, newpages;
 1556 
 1557         switch (sc->type) {
 1558         case MD_VNODE:
 1559         case MD_NULL:
 1560                 break;
 1561         case MD_SWAP:
 1562                 if (mdr->md_mediasize <= 0 ||
 1563                     (mdr->md_mediasize % PAGE_SIZE) != 0)
 1564                         return (EDOM);
 1565                 oldpages = OFF_TO_IDX(sc->mediasize);
 1566                 newpages = OFF_TO_IDX(mdr->md_mediasize);
 1567                 if (newpages < oldpages) {
 1568                         VM_OBJECT_WLOCK(sc->object);
 1569                         vm_object_page_remove(sc->object, newpages, 0, 0);
 1570                         swap_release_by_cred(IDX_TO_OFF(oldpages -
 1571                             newpages), sc->cred);
 1572                         sc->object->charge = IDX_TO_OFF(newpages);
 1573                         sc->object->size = newpages;
 1574                         VM_OBJECT_WUNLOCK(sc->object);
 1575                 } else if (newpages > oldpages) {
 1576                         res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
 1577                             oldpages), sc->cred);
 1578                         if (!res)
 1579                                 return (ENOMEM);
 1580                         if ((mdr->md_options & MD_RESERVE) ||
 1581                             (sc->flags & MD_RESERVE)) {
 1582                                 error = swap_pager_reserve(sc->object,
 1583                                     oldpages, newpages - oldpages);
 1584                                 if (error < 0) {
 1585                                         swap_release_by_cred(
 1586                                             IDX_TO_OFF(newpages - oldpages),
 1587                                             sc->cred);
 1588                                         return (EDOM);
 1589                                 }
 1590                         }
 1591                         VM_OBJECT_WLOCK(sc->object);
 1592                         sc->object->charge = IDX_TO_OFF(newpages);
 1593                         sc->object->size = newpages;
 1594                         VM_OBJECT_WUNLOCK(sc->object);
 1595                 }
 1596                 break;
 1597         default:
 1598                 return (EOPNOTSUPP);
 1599         }
 1600 
 1601         sc->mediasize = mdr->md_mediasize;
 1602 
 1603         g_topology_lock();
 1604         g_resize_provider(sc->pp, sc->mediasize);
 1605         g_topology_unlock();
 1606         return (0);
 1607 }
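/*
 * Editor's note: a worked example of the swap accounting above,
 * assuming PAGE_SIZE == 4096.  Growing a swap-backed md from 4 MiB
 * to 8 MiB gives oldpages == 1024 and newpages == 2048, so
 * swap_reserve_by_cred(IDX_TO_OFF(1024), sc->cred) charges 4 MiB of
 * swap against the owning credential before the object grows;
 * shrinking instead removes the excess pages and releases the
 * difference.  This is why the creator's ucred is kept in the softc.
 */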
 1608 
 1609 static int
 1610 mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1611 {
 1612         vm_ooffset_t npage;
 1613         int error;
 1614 
 1615         /*
 1616          * Range check.  Disallow non-positive sizes and sizes that
 1617          * are not a multiple of the page size.
 1618          */
 1619         if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 1620                 return (EDOM);
 1621 
 1622         /*
 1623          * Allocate an OBJT_SWAP object.
 1624          *
 1625          * Note the truncation: npage rounds md_mediasize down to whole pages.
 1626          */
 1627 
 1628         if ((mdr->md_options & MD_VERIFY) != 0)
 1629                 return (EINVAL);
 1630         npage = mdr->md_mediasize / PAGE_SIZE;
 1631         if (mdr->md_fwsectors != 0)
 1632                 sc->fwsectors = mdr->md_fwsectors;
 1633         if (mdr->md_fwheads != 0)
 1634                 sc->fwheads = mdr->md_fwheads;
 1635         sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
 1636             VM_PROT_DEFAULT, 0, td->td_ucred);
 1637         if (sc->object == NULL)
 1638                 return (ENOMEM);
 1639         sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE);
 1640         if (mdr->md_options & MD_RESERVE) {
 1641                 if (swap_pager_reserve(sc->object, 0, npage) < 0) {
 1642                         error = EDOM;
 1643                         goto finish;
 1644                 }
 1645         }
 1646         error = mdsetcred(sc, td->td_ucred);
 1647  finish:
 1648         if (error != 0) {
 1649                 vm_object_deallocate(sc->object);
 1650                 sc->object = NULL;
 1651         }
 1652         return (error);
 1653 }
 1654 
 1655 static int
 1656 mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td)
 1657 {
 1658 
 1659         /*
 1660          * Range check.  Disallow non-positive sizes and sizes that
 1661          * are not a multiple of the page size.
 1662          */
 1663         if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
 1664                 return (EDOM);
 1665 
 1666         return (0);
 1667 }
 1668 
 1669 static int
 1670 kern_mdattach_locked(struct thread *td, struct md_req *mdr)
 1671 {
 1672         struct md_s *sc;
 1673         unsigned sectsize;
 1674         int error, i;
 1675 
 1676         sx_assert(&md_sx, SA_XLOCKED);
 1677 
 1678         switch (mdr->md_type) {
 1679         case MD_MALLOC:
 1680         case MD_PRELOAD:
 1681         case MD_VNODE:
 1682         case MD_SWAP:
 1683         case MD_NULL:
 1684                 break;
 1685         default:
 1686                 return (EINVAL);
 1687         }
 1688         if (mdr->md_sectorsize == 0)
 1689                 sectsize = DEV_BSIZE;
 1690         else
 1691                 sectsize = mdr->md_sectorsize;
 1692         if (sectsize > maxphys || mdr->md_mediasize < sectsize)
 1693                 return (EINVAL);
 1694         if (mdr->md_options & MD_AUTOUNIT)
 1695                 sc = mdnew(-1, &error, mdr->md_type);
 1696         else {
 1697                 if (mdr->md_unit > INT_MAX)
 1698                         return (EINVAL);
 1699                 sc = mdnew(mdr->md_unit, &error, mdr->md_type);
 1700         }
 1701         if (sc == NULL)
 1702                 return (error);
 1703         if (mdr->md_label != NULL)
 1704                 error = copyinstr(mdr->md_label, sc->label,
 1705                     sizeof(sc->label), NULL);
 1706         if (error != 0)
 1707                 goto err_after_new;
 1708         if (mdr->md_options & MD_AUTOUNIT)
 1709                 mdr->md_unit = sc->unit;
 1710         sc->mediasize = mdr->md_mediasize;
 1711         sc->sectorsize = sectsize;
 1712         error = EDOOFUS;
 1713         switch (sc->type) {
 1714         case MD_MALLOC:
 1715                 sc->start = mdstart_malloc;
 1716                 error = mdcreate_malloc(sc, mdr);
 1717                 break;
 1718         case MD_PRELOAD:
 1719                 /*
 1720                  * We disallow attaching preloaded memory disks via
 1721                  * ioctl. Preloaded memory disks are automatically
 1722                  * attached in g_md_init().
 1723                  */
 1724                 error = EOPNOTSUPP;
 1725                 break;
 1726         case MD_VNODE:
 1727                 sc->start = mdstart_vnode;
 1728                 error = mdcreate_vnode(sc, mdr, td);
 1729                 break;
 1730         case MD_SWAP:
 1731                 sc->start = mdstart_swap;
 1732                 error = mdcreate_swap(sc, mdr, td);
 1733                 break;
 1734         case MD_NULL:
 1735                 sc->start = mdstart_null;
 1736                 error = mdcreate_null(sc, mdr, td);
 1737                 break;
 1738         }
 1739 err_after_new:
 1740         if (error != 0) {
 1741                 mddestroy(sc, td);
 1742                 return (error);
 1743         }
 1744 
 1745         /* Prune off any residual fractional sector */
 1746         i = sc->mediasize % sc->sectorsize;
 1747         sc->mediasize -= i;
 1748 
 1749         mdinit(sc);
 1750         return (0);
 1751 }
 1752 
 1753 static int
 1754 kern_mdattach(struct thread *td, struct md_req *mdr)
 1755 {
 1756         int error;
 1757 
 1758         sx_xlock(&md_sx);
 1759         error = kern_mdattach_locked(td, mdr);
 1760         sx_xunlock(&md_sx);
 1761         return (error);
 1762 }
 1763 
 1764 static int
 1765 kern_mddetach_locked(struct thread *td, struct md_req *mdr)
 1766 {
 1767         struct md_s *sc;
 1768 
 1769         sx_assert(&md_sx, SA_XLOCKED);
 1770 
 1771         if (mdr->md_mediasize != 0 ||
 1772             (mdr->md_options & ~MD_FORCE) != 0)
 1773                 return (EINVAL);
 1774 
 1775         sc = mdfind(mdr->md_unit);
 1776         if (sc == NULL)
 1777                 return (ENOENT);
 1778         if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
 1779             !(mdr->md_options & MD_FORCE))
 1780                 return (EBUSY);
 1781         return (mddestroy(sc, td));
 1782 }
 1783 
 1784 static int
 1785 kern_mddetach(struct thread *td, struct md_req *mdr)
 1786 {
 1787         int error;
 1788 
 1789         sx_xlock(&md_sx);
 1790         error = kern_mddetach_locked(td, mdr);
 1791         sx_xunlock(&md_sx);
 1792         return (error);
 1793 }
 1794 
 1795 static int
 1796 kern_mdresize_locked(struct md_req *mdr)
 1797 {
 1798         struct md_s *sc;
 1799 
 1800         sx_assert(&md_sx, SA_XLOCKED);
 1801 
 1802         if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
 1803                 return (EINVAL);
 1804 
 1805         sc = mdfind(mdr->md_unit);
 1806         if (sc == NULL)
 1807                 return (ENOENT);
 1808         if (mdr->md_mediasize < sc->sectorsize)
 1809                 return (EINVAL);
 1810         mdr->md_mediasize -= mdr->md_mediasize % sc->sectorsize;
 1811         if (mdr->md_mediasize < sc->mediasize &&
 1812             !(sc->flags & MD_FORCE) &&
 1813             !(mdr->md_options & MD_FORCE))
 1814                 return (EBUSY);
 1815         return (mdresize(sc, mdr));
 1816 }
 1817 
 1818 static int
 1819 kern_mdresize(struct md_req *mdr)
 1820 {
 1821         int error;
 1822 
 1823         sx_xlock(&md_sx);
 1824         error = kern_mdresize_locked(mdr);
 1825         sx_xunlock(&md_sx);
 1826         return (error);
 1827 }
 1828 
 1829 static int
 1830 kern_mdquery_locked(struct md_req *mdr)
 1831 {
 1832         struct md_s *sc;
 1833         int error;
 1834 
 1835         sx_assert(&md_sx, SA_XLOCKED);
 1836 
 1837         sc = mdfind(mdr->md_unit);
 1838         if (sc == NULL)
 1839                 return (ENOENT);
 1840         mdr->md_type = sc->type;
 1841         mdr->md_options = sc->flags;
 1842         mdr->md_mediasize = sc->mediasize;
 1843         mdr->md_sectorsize = sc->sectorsize;
 1844         error = 0;
 1845         if (mdr->md_label != NULL) {
 1846                 error = copyout(sc->label, mdr->md_label,
 1847                     strlen(sc->label) + 1);
 1848                 if (error != 0)
 1849                         return (error);
 1850         }
 1851         if (sc->type == MD_VNODE ||
 1852             (sc->type == MD_PRELOAD && mdr->md_file != NULL))
 1853                 error = copyout(sc->file, mdr->md_file,
 1854                     strlen(sc->file) + 1);
 1855         return (error);
 1856 }
 1857 
 1858 static int
 1859 kern_mdquery(struct md_req *mdr)
 1860 {
 1861         int error;
 1862 
 1863         sx_xlock(&md_sx);
 1864         error = kern_mdquery_locked(mdr);
 1865         sx_xunlock(&md_sx);
 1866         return (error);
 1867 }
 1868 
 1869 /* Copy members that are not userspace pointers. */
 1870 #define MD_IOCTL2REQ(mdio, mdr) do {                                    \
 1871         (mdr)->md_unit = (mdio)->md_unit;                               \
 1872         (mdr)->md_type = (mdio)->md_type;                               \
 1873         (mdr)->md_mediasize = (mdio)->md_mediasize;                     \
 1874         (mdr)->md_sectorsize = (mdio)->md_sectorsize;                   \
 1875         (mdr)->md_options = (mdio)->md_options;                         \
 1876         (mdr)->md_fwheads = (mdio)->md_fwheads;                         \
 1877         (mdr)->md_fwsectors = (mdio)->md_fwsectors;                     \
 1878         (mdr)->md_units = &(mdio)->md_pad[0];                           \
 1879         (mdr)->md_units_nitems = nitems((mdio)->md_pad);                \
 1880 } while (0)
 1881 
 1882 /* Copy members that might have been updated. */
 1883 #define MD_REQ2IOCTL(mdr, mdio) do {                                    \
 1884         (mdio)->md_unit = (mdr)->md_unit;                               \
 1885         (mdio)->md_type = (mdr)->md_type;                               \
 1886         (mdio)->md_mediasize = (mdr)->md_mediasize;                     \
 1887         (mdio)->md_sectorsize = (mdr)->md_sectorsize;                   \
 1888         (mdio)->md_options = (mdr)->md_options;                         \
 1889         (mdio)->md_fwheads = (mdr)->md_fwheads;                         \
 1890         (mdio)->md_fwsectors = (mdr)->md_fwsectors;                     \
 1891 } while (0)
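/*
 * Editor's note: these helpers are macros rather than functions so
 * the same member-by-member copy can serve both struct md_ioctl and
 * the 32-bit compat struct md_ioctl32, whose members share names but
 * not sizes or layout; each expansion type-checks against whichever
 * structure pointer it is handed, as in the two call sites below.
 */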
 1892 
 1893 static int
 1894 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
 1895     struct thread *td)
 1896 {
 1897         struct md_req mdr;
 1898         int error;
 1899 
 1900         if (md_debug)
 1901                 printf("mdctlioctl(%s %lx %p %x %p)\n",
 1902                         devtoname(dev), cmd, addr, flags, td);
 1903 
 1904         bzero(&mdr, sizeof(mdr));
 1905         switch (cmd) {
 1906         case MDIOCATTACH:
 1907         case MDIOCDETACH:
 1908         case MDIOCRESIZE:
 1909         case MDIOCQUERY: {
 1910                 struct md_ioctl *mdio = (struct md_ioctl *)addr;
 1911                 if (mdio->md_version != MDIOVERSION)
 1912                         return (EINVAL);
 1913                 MD_IOCTL2REQ(mdio, &mdr);
 1914                 mdr.md_file = mdio->md_file;
 1915                 mdr.md_file_seg = UIO_USERSPACE;
 1916                 /* If the file is adjacent to the md_ioctl, it is in kernel space. */
 1917                 if ((void *)mdio->md_file == (void *)(mdio + 1))
 1918                         mdr.md_file_seg = UIO_SYSSPACE;
 1919                 mdr.md_label = mdio->md_label;
 1920                 break;
 1921         }
 1922 #ifdef COMPAT_FREEBSD32
 1923         case MDIOCATTACH_32:
 1924         case MDIOCDETACH_32:
 1925         case MDIOCRESIZE_32:
 1926         case MDIOCQUERY_32: {
 1927                 struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 1928                 if (mdio->md_version != MDIOVERSION)
 1929                         return (EINVAL);
 1930                 MD_IOCTL2REQ(mdio, &mdr);
 1931                 mdr.md_file = (void *)(uintptr_t)mdio->md_file;
 1932                 mdr.md_file_seg = UIO_USERSPACE;
 1933                 mdr.md_label = (void *)(uintptr_t)mdio->md_label;
 1934                 break;
 1935         }
 1936 #endif
 1937         default:
 1938                 /* Fall through to handler switch. */
 1939                 break;
 1940         }
 1941 
 1942         error = 0;
 1943         switch (cmd) {
 1944         case MDIOCATTACH:
 1945 #ifdef COMPAT_FREEBSD32
 1946         case MDIOCATTACH_32:
 1947 #endif
 1948                 error = kern_mdattach(td, &mdr);
 1949                 break;
 1950         case MDIOCDETACH:
 1951 #ifdef COMPAT_FREEBSD32
 1952         case MDIOCDETACH_32:
 1953 #endif
 1954                 error = kern_mddetach(td, &mdr);
 1955                 break;
 1956         case MDIOCRESIZE:
 1957 #ifdef COMPAT_FREEBSD32
 1958         case MDIOCRESIZE_32:
 1959 #endif
 1960                 error = kern_mdresize(&mdr);
 1961                 break;
 1962         case MDIOCQUERY:
 1963 #ifdef COMPAT_FREEBSD32
 1964         case MDIOCQUERY_32:
 1965 #endif
 1966                 error = kern_mdquery(&mdr);
 1967                 break;
 1968         default:
 1969                 error = ENOIOCTL;
 1970         }
 1971 
 1972         switch (cmd) {
 1973         case MDIOCATTACH:
 1974         case MDIOCQUERY: {
 1975                 struct md_ioctl *mdio = (struct md_ioctl *)addr;
 1976                 MD_REQ2IOCTL(&mdr, mdio);
 1977                 break;
 1978         }
 1979 #ifdef COMPAT_FREEBSD32
 1980         case MDIOCATTACH_32:
 1981         case MDIOCQUERY_32: {
 1982                 struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
 1983                 MD_REQ2IOCTL(&mdr, mdio);
 1984                 break;
 1985         }
 1986 #endif
 1987         default:
 1988                 /* Other commands do not alter mdr. */
 1989                 break;
 1990         }
 1991 
 1992         return (error);
 1993 }
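/*
 * Editor's note: a minimal userland sketch (not part of this file)
 * of how the MDIOCATTACH path above is typically driven, much as
 * mdconfig(8) does.  Structure and ioctl names come from
 * <sys/mdioctl.h>; error handling is abbreviated, and the function
 * name is illustrative.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mdioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
attach_swap_md(off_t size)
{
        struct md_ioctl mdio;
        int fd, unit;

        fd = open("/dev/" MDCTL_NAME, O_RDWR);  /* /dev/mdctl, root only */
        if (fd == -1)
                return (-1);
        memset(&mdio, 0, sizeof(mdio));
        mdio.md_version = MDIOVERSION;
        mdio.md_type = MD_SWAP;
        mdio.md_mediasize = size;       /* must be a multiple of PAGE_SIZE */
        mdio.md_options = MD_AUTOUNIT;  /* let the kernel pick the unit */
        if (ioctl(fd, MDIOCATTACH, &mdio) == -1) {
                close(fd);
                return (-1);
        }
        unit = mdio.md_unit;            /* filled in via MD_REQ2IOCTL */
        printf("attached md%d\n", unit);
        close(fd);
        return (unit);
}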
 1994 
 1995 static void
 1996 md_preloaded(u_char *image, size_t length, const char *name)
 1997 {
 1998         struct md_s *sc;
 1999         int error;
 2000 
 2001         sc = mdnew(-1, &error, MD_PRELOAD);
 2002         if (sc == NULL)
 2003                 return;
 2004         sc->mediasize = length;
 2005         sc->sectorsize = DEV_BSIZE;
 2006         sc->pl_ptr = image;
 2007         sc->pl_len = length;
 2008         sc->start = mdstart_preload;
 2009         if (name != NULL)
 2010                 strlcpy(sc->file, name, sizeof(sc->file));
 2011 #ifdef MD_ROOT
 2012         if (sc->unit == 0) {
 2013 #ifndef ROOTDEVNAME
 2014                 rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
 2015 #endif
 2016 #ifdef MD_ROOT_READONLY
 2017                 sc->flags |= MD_READONLY;
 2018 #endif
 2019         }
 2020 #endif
 2021         mdinit(sc);
 2022         if (name != NULL) {
 2023                 printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
 2024                     MD_NAME, sc->unit, name, length, image);
 2025         } else {
 2026                 printf("%s%d: Embedded image %zd bytes at %p\n",
 2027                     MD_NAME, sc->unit, length, image);
 2028         }
 2029 }
 2030 
 2031 static void
 2032 g_md_init(struct g_class *mp __unused)
 2033 {
 2034         caddr_t mod;
 2035         u_char *ptr, *name, *type;
 2036         unsigned len;
 2037         int i;
 2038 
 2039         /* figure out log2(NINDIR) */
 2040         for (i = NINDIR, nshift = -1; i; nshift++)
 2041                 i >>= 1;
 2042 
 2043         mod = NULL;
 2044         sx_init(&md_sx, "MD config lock");
 2045         g_topology_unlock();
 2046         md_uh = new_unrhdr(0, INT_MAX, NULL);
 2047 #ifdef MD_ROOT
 2048         if (mfs_root_size != 0) {
 2049                 sx_xlock(&md_sx);
 2050 #ifdef MD_ROOT_MEM
 2051                 md_preloaded(mfs_root, mfs_root_size, NULL);
 2052 #else
 2053                 md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
 2054                     NULL);
 2055 #endif
 2056                 sx_xunlock(&md_sx);
 2057         }
 2058 #endif
 2059         /* XXX: are preload_* static or do they need Giant? */
 2060         while ((mod = preload_search_next_name(mod)) != NULL) {
 2061                 name = (char *)preload_search_info(mod, MODINFO_NAME);
 2062                 if (name == NULL)
 2063                         continue;
 2064                 type = (char *)preload_search_info(mod, MODINFO_TYPE);
 2065                 if (type == NULL)
 2066                         continue;
 2067                 if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
 2068                         continue;
 2069                 ptr = preload_fetch_addr(mod);
 2070                 len = preload_fetch_size(mod);
 2071                 if (ptr != NULL && len != 0) {
 2072                         sx_xlock(&md_sx);
 2073                         md_preloaded(ptr, len, name);
 2074                         sx_xunlock(&md_sx);
 2075                 }
 2076         }
 2077         md_pbuf_zone = pbuf_zsecond_create("mdpbuf", nswbuf / 10);
 2078         status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
 2079             0600, MDCTL_NAME);
 2080         g_topology_lock();
 2081 }
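/*
 * Editor's note: preloaded images arrive from the boot loader.  A
 * typical loader.conf(5) stanza (paths illustrative) that satisfies
 * the MODINFO_TYPE test above is:
 *
 *      mfsroot_load="YES"
 *      mfsroot_type="mfs_root"
 *      mfsroot_name="/boot/mfsroot"
 *
 * g_md_init() walks the preload metadata, accepts modules typed
 * "md_image" or "mfs_root", and hands each image to md_preloaded().
 */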
 2082 
 2083 static void
 2084 g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
 2085     struct g_consumer *cp __unused, struct g_provider *pp)
 2086 {
 2087         struct md_s *mp;
 2088         char *type;
 2089 
 2090         mp = gp->softc;
 2091         if (mp == NULL)
 2092                 return;
 2093 
 2094         switch (mp->type) {
 2095         case MD_MALLOC:
 2096                 type = "malloc";
 2097                 break;
 2098         case MD_PRELOAD:
 2099                 type = "preload";
 2100                 break;
 2101         case MD_VNODE:
 2102                 type = "vnode";
 2103                 break;
 2104         case MD_SWAP:
 2105                 type = "swap";
 2106                 break;
 2107         case MD_NULL:
 2108                 type = "null";
 2109                 break;
 2110         default:
 2111                 type = "unknown";
 2112                 break;
 2113         }
 2114 
 2115         if (pp != NULL) {
 2116                 if (indent == NULL) {
 2117                         sbuf_printf(sb, " u %d", mp->unit);
 2118                         sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
 2119                         sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
 2120                         sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
 2121                         sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
 2122                         sbuf_printf(sb, " t %s", type);
 2123                         if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 2124                             (mp->type == MD_PRELOAD && mp->file[0] != '\0'))
 2125                                 sbuf_printf(sb, " file %s", mp->file);
 2126                         sbuf_printf(sb, " label %s", mp->label);
 2127                 } else {
 2128                         sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
 2129                             mp->unit);
 2130                         sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
 2131                             indent, (uintmax_t) mp->sectorsize);
 2132                         sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
 2133                             indent, (uintmax_t) mp->fwheads);
 2134                         sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
 2135                             indent, (uintmax_t) mp->fwsectors);
 2136                         if (mp->ident[0] != '\0') {
 2137                                 sbuf_printf(sb, "%s<ident>", indent);
 2138                                 g_conf_printf_escaped(sb, "%s", mp->ident);
 2139                                 sbuf_printf(sb, "</ident>\n");
 2140                         }
 2141                         sbuf_printf(sb, "%s<length>%ju</length>\n",
 2142                             indent, (uintmax_t) mp->mediasize);
 2143                         sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
 2144                             (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
 2145                         sbuf_printf(sb, "%s<access>%s</access>\n", indent,
 2146                             (mp->flags & MD_READONLY) == 0 ? "read-write":
 2147                             "read-only");
 2148                         sbuf_printf(sb, "%s<type>%s</type>\n", indent,
 2149                             type);
 2150                         if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
 2151                             (mp->type == MD_PRELOAD && mp->file[0] != '\0')) {
 2152                                 sbuf_printf(sb, "%s<file>", indent);
 2153                                 g_conf_printf_escaped(sb, "%s", mp->file);
 2154                                 sbuf_printf(sb, "</file>\n");
 2155                         }
 2156                         if (mp->type == MD_VNODE)
 2157                                 sbuf_printf(sb, "%s<cache>%s</cache>\n", indent,
 2158                                     (mp->flags & MD_CACHE) == 0 ? "off": "on");
 2159                         sbuf_printf(sb, "%s<label>", indent);
 2160                         g_conf_printf_escaped(sb, "%s", mp->label);
 2161                         sbuf_printf(sb, "</label>\n");
 2162                 }
 2163         }
 2164 }
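/*
 * Editor's note: the indent != NULL branch above emits per-provider
 * XML for kern.geom.confxml, shaped like the following (values
 * illustrative, for a 1 MiB swap-backed unit):
 *
 *      <unit>0</unit>
 *      <sectorsize>512</sectorsize>
 *      <fwheads>0</fwheads>
 *      <fwsectors>0</fwsectors>
 *      <length>1048576</length>
 *      <compression>off</compression>
 *      <access>read-write</access>
 *      <type>swap</type>
 *      <label></label>
 *
 * The indent == NULL branch appends the same data in the one-line
 * form used by kern.geom.conftxt.
 */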
 2165 
 2166 static void
 2167 g_md_fini(struct g_class *mp __unused)
 2168 {
 2169 
 2170         sx_destroy(&md_sx);
 2171         if (status_dev != NULL)
 2172                 destroy_dev(status_dev);
 2173         uma_zdestroy(md_pbuf_zone);
 2174         delete_unrhdr(md_uh);
 2175 }
