The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/dev/md/md.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * ----------------------------------------------------------------------------
    3  * "THE BEER-WARE LICENSE" (Revision 42):
    4  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
    5  * can do whatever you want with this stuff. If we meet some day, and you think
    6  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
    7  * ----------------------------------------------------------------------------
    8  *
    9  * $FreeBSD: releng/5.0/sys/dev/md/md.c 107423 2002-11-30 22:03:53Z iedowse $
   10  *
   11  */
   12 
   13 /*
   14  * The following functions are based on the vn(4) driver: mdstart_swap(),
   15  * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
   16  * and as such under the following copyright:
   17  *
   18  * Copyright (c) 1988 University of Utah.
   19  * Copyright (c) 1990, 1993
   20  *      The Regents of the University of California.  All rights reserved.
   21  *
   22  * This code is derived from software contributed to Berkeley by
   23  * the Systems Programming Group of the University of Utah Computer
   24  * Science Department.
   25  *
   26  * Redistribution and use in source and binary forms, with or without
   27  * modification, are permitted provided that the following conditions
   28  * are met:
   29  * 1. Redistributions of source code must retain the above copyright
   30  *    notice, this list of conditions and the following disclaimer.
   31  * 2. Redistributions in binary form must reproduce the above copyright
   32  *    notice, this list of conditions and the following disclaimer in the
   33  *    documentation and/or other materials provided with the distribution.
   34  * 3. All advertising materials mentioning features or use of this software
   35  *    must display the following acknowledgement:
   36  *      This product includes software developed by the University of
   37  *      California, Berkeley and its contributors.
   38  * 4. Neither the name of the University nor the names of its contributors
   39  *    may be used to endorse or promote products derived from this software
   40  *    without specific prior written permission.
   41  *
   42  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   52  * SUCH DAMAGE.
   53  *
   54  * from: Utah Hdr: vn.c 1.13 94/04/02
   55  *
   56  *      from: @(#)vn.c  8.6 (Berkeley) 4/1/94
   57  * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
   58  */
   59 
   60 #include "opt_md.h"
   61 
   62 #include <sys/param.h>
   63 #include <sys/systm.h>
   64 #include <sys/bio.h>
   65 #include <sys/conf.h>
   66 #include <sys/devicestat.h>
   67 #include <sys/disk.h>
   68 #include <sys/fcntl.h>
   69 #include <sys/kernel.h>
   70 #include <sys/kthread.h>
   71 #include <sys/linker.h>
   72 #include <sys/lock.h>
   73 #include <sys/malloc.h>
   74 #include <sys/mdioctl.h>
   75 #include <sys/mutex.h>
   76 #include <sys/namei.h>
   77 #include <sys/proc.h>
   78 #include <sys/queue.h>
   79 #include <sys/stdint.h>
   80 #include <sys/sysctl.h>
   81 #include <sys/vnode.h>
   82 
   83 #include <vm/vm.h>
   84 #include <vm/vm_object.h>
   85 #include <vm/vm_page.h>
   86 #include <vm/vm_pager.h>
   87 #include <vm/swap_pager.h>
   88 #include <vm/uma.h>
   89 
   /* Module version number. */
   #define MD_MODVER 1

   /*
    * Private flag kept in md_s.flags: set by mddestroy() and tested by
    * md_kthread() once its queue is empty.
    */
   #define MD_SHUTDOWN 0x10000     /* Tell worker thread to terminate. */

   /* Default sector count; may be overridden before this point. */
   #ifndef MD_NSECT
   #define MD_NSECT (10000 * 2)
   #endif

   /*
    * Malloc types: M_MD is used for softc and indir bookkeeping structures,
    * M_MDSECT for the pointer arrays hanging off each indir node.
    */
   static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk");
   static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors");

   /*
    * Debug verbosity, exported as debug.mddebug.  Non-zero enables the
    * open/ioctl traces; values above 1 also trace sector and strategy calls.
    */
   static int md_debug;
   SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");

   #if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
   /* Image gets put here: */
   static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here";
   static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here";
   #endif

   /* NOTE(review): these three are not referenced in the part of the file shown here. */
   static int      mdrootready;
   static int      mdunits;
   static dev_t    status_dev = 0;

   /* Character device major number used by both cdevsw tables below. */
   #define CDEV_MAJOR      95

   /* Forward declarations of the device entry points. */
   static d_strategy_t mdstrategy;
   static d_open_t mdopen;
   static d_close_t mdclose;
   static d_ioctl_t mdioctl, mdctlioctl;
  120 
  /*
   * Device switch for the memory disks themselves; handed to disk_create()
   * in mdinit().  Entries are positional, the trailing comments name each
   * slot.
   */
  static struct cdevsw md_cdevsw = {
          /* open */      mdopen,
          /* close */     mdclose,
          /* read */      physread,
          /* write */     physwrite,
          /* ioctl */     mdioctl,
          /* poll */      nopoll,
          /* mmap */      nommap,
          /* strategy */  mdstrategy,
          /* name */      MD_NAME,
          /* maj */       CDEV_MAJOR,
          /* dump */      nodump,
          /* psize */     nopsize,
          /* flags */     D_DISK | D_CANFREE | D_MEMDISK,
  };
  136 
  /*
   * Device switch for the control device: only the ioctl entry point
   * (mdctlioctl) does real work.  The initializer stops after the major
   * number, so the remaining members are zero-initialized.
   */
  static struct cdevsw mdctl_cdevsw = {
          /* open */      nullopen,
          /* close */     nullclose,
          /* read */      noread,
          /* write */     nowrite,
          /* ioctl */     mdctlioctl,
          /* poll */      nopoll,
          /* mmap */      nommap,
          /* strategy */  nostrategy,
          /* name */      MD_NAME,
          /* maj */       CDEV_MAJOR
  };
  149 
  /* Second cdevsw passed to disk_create() in mdinit(). */
  static struct cdevsw mddisk_cdevsw;

  /* All configured units; searched by mdfind() and extended by mdnew(). */
  static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);

  /*
   * Geometry of the sector lookup tree: every indir node holds NINDIR
   * pointer-sized slots, NMASK extracts a slot index, and nshift is
   * log2(NINDIR), computed at run time by dimension().
   */
  #define NINDIR  (PAGE_SIZE / sizeof(uintptr_t))
  #define NMASK   (NINDIR-1)
  static int nshift;
  157 
  /*
   * One node of the sector lookup tree used by MD_MALLOC disks.
   */
  struct indir {
          uintptr_t       *array;         /* NINDIR slots: child nodes or sector values */
          uint            total;          /* number of slots (set to NINDIR) */
          uint            used;           /* count of non-zero slots */
          uint            shift;          /* bit shift selecting this level's index; 0 for a leaf */
  };
  164 
  /*
   * Per-unit software state ("softc") of a memory disk.
   */
  struct md_s {
          int unit;                       /* unit number chosen in mdnew() */
          LIST_ENTRY(md_s) list;          /* linkage on md_softc_list */
          struct devstat stats;           /* registered in mdinit() */
          struct bio_queue_head bio_queue; /* filled by mdstrategy(), drained by md_kthread() */
          struct disk disk;               /* geometry is filled in by mdopen() */
          dev_t dev;                      /* device returned by disk_create() */
          enum md_types type;             /* backing store kind; selects the mdstart_*() handler */
          unsigned nsect;                 /* size of the disk in sectors */
          unsigned opencount;             /* incremented in mdopen(), decremented in mdclose() */
          unsigned secsize;               /* sector size in bytes */
          unsigned flags;                 /* MD_* option bits plus MD_SHUTDOWN */
          char name[20];                  /* "md<unit>", also the worker thread name */
          struct proc *procp;             /* worker created in mdnew(); NULL once it has exited */

          /* MD_MALLOC related fields */
          struct indir *indir;            /* root of the sector lookup tree */
          uma_zone_t uma;                 /* zone supplying sector-sized buffers */

          /* MD_PRELOAD related fields */
          u_char *pl_ptr;                 /* start of the preloaded image */
          unsigned pl_len;                /* image length in bytes */

          /* MD_VNODE related fields */
          struct vnode *vnode;            /* backing file */
          struct ucred *cred;             /* credentials used for I/O on the vnode */

          /* MD_SWAP related fields */
          vm_object_t object;             /* object handed to vm_pager_strategy() */
  };

  /* Defined further down; needed early by the mdcreate_*() error paths. */
  static int mddestroy(struct md_s *sc, struct thread *td);
  197 
  198 static struct indir *
  199 new_indir(uint shift)
  200 {
  201         struct indir *ip;
  202 
  203         ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO);
  204         if (ip == NULL)
  205                 return (NULL);
  206         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  207             M_MDSECT, M_NOWAIT | M_ZERO);
  208         if (ip->array == NULL) {
  209                 free(ip, M_MD);
  210                 return (NULL);
  211         }
  212         ip->total = NINDIR;
  213         ip->shift = shift;
  214         return (ip);
  215 }
  216 
  217 static void
  218 del_indir(struct indir *ip)
  219 {
  220 
  221         free(ip->array, M_MDSECT);
  222         free(ip, M_MD);
  223 }
  224 
  225 static void
  226 destroy_indir(struct md_s *sc, struct indir *ip)
  227 {
  228         int i;
  229 
  230         for (i = 0; i < NINDIR; i++) {
  231                 if (!ip->array[i])
  232                         continue;
  233                 if (ip->shift)
  234                         destroy_indir(sc, (struct indir*)(ip->array[i]));
  235                 else if (ip->array[i] > 255)
  236                         uma_zfree(sc->uma, (void *)(ip->array[i]));
  237         }
  238         del_indir(ip);
  239 }
  240 
  241 /*
  242  * This function does the math and allocates the top level "indir" structure
  243  * for a device of "size" sectors.
  244  */
  245 
  246 static struct indir *
  247 dimension(off_t size)
  248 {
  249         off_t rcnt;
  250         struct indir *ip;
  251         int i, layer;
  252 
  253         rcnt = size;
  254         layer = 0;
  255         while (rcnt > NINDIR) {
  256                 rcnt /= NINDIR;
  257                 layer++;
  258         }
  259         /* figure out log2(NINDIR) */
  260         for (i = NINDIR, nshift = -1; i; nshift++)
  261                 i >>= 1;
  262 
  263         /*
  264          * XXX: the top layer is probably not fully populated, so we allocate
  265          * too much space for ip->array in new_indir() here.
  266          */
  267         ip = new_indir(layer * nshift);
  268         return (ip);
  269 }
  270 
  271 /*
  272  * Read a given sector
  273  */
  274 
  275 static uintptr_t
  276 s_read(struct indir *ip, off_t offset)
  277 {
  278         struct indir *cip;
  279         int idx;
  280         uintptr_t up;
  281 
  282         if (md_debug > 1)
  283                 printf("s_read(%jd)\n", (intmax_t)offset);
  284         up = 0;
  285         for (cip = ip; cip != NULL;) {
  286                 if (cip->shift) {
  287                         idx = (offset >> cip->shift) & NMASK;
  288                         up = cip->array[idx];
  289                         cip = (struct indir *)up;
  290                         continue;
  291                 }
  292                 idx = offset & NMASK;
  293                 return (cip->array[idx]);
  294         }
  295         return (0);
  296 }
  297 
  298 /*
  299  * Write a given sector, prune the tree if the value is 0
  300  */
  301 
  302 static int
  303 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
  304 {
  305         struct indir *cip, *lip[10];
  306         int idx, li;
  307         uintptr_t up;
  308 
  309         if (md_debug > 1)
  310                 printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
  311         up = 0;
  312         li = 0;
  313         cip = ip;
  314         for (;;) {
  315                 lip[li++] = cip;
  316                 if (cip->shift) {
  317                         idx = (offset >> cip->shift) & NMASK;
  318                         up = cip->array[idx];
  319                         if (up != 0) {
  320                                 cip = (struct indir *)up;
  321                                 continue;
  322                         }
  323                         /* Allocate branch */
  324                         cip->array[idx] =
  325                             (uintptr_t)new_indir(cip->shift - nshift);
  326                         if (cip->array[idx] == 0)
  327                                 return (ENOSPC);
  328                         cip->used++;
  329                         up = cip->array[idx];
  330                         cip = (struct indir *)up;
  331                         continue;
  332                 }
  333                 /* leafnode */
  334                 idx = offset & NMASK;
  335                 up = cip->array[idx];
  336                 if (up != 0)
  337                         cip->used--;
  338                 cip->array[idx] = ptr;
  339                 if (ptr != 0)
  340                         cip->used++;
  341                 break;
  342         }
  343         if (cip->used != 0 || li == 1)
  344                 return (0);
  345         li--;
  346         while (cip->used == 0 && cip != ip) {
  347                 li--;
  348                 idx = (offset >> lip[li]->shift) & NMASK;
  349                 up = lip[li]->array[idx];
  350                 KASSERT(up == (uintptr_t)cip, ("md screwed up"));
  351                 del_indir(cip);
  352                 lip[li]->array[idx] = 0;
  353                 lip[li]->used--;
  354                 cip = lip[li];
  355         }
  356         return (0);
  357 }
  358 
  359 static int
  360 mdopen(dev_t dev, int flag, int fmt, struct thread *td)
  361 {
  362         struct md_s *sc;
  363 
  364         if (md_debug)
  365                 printf("mdopen(%s %x %x %p)\n",
  366                         devtoname(dev), flag, fmt, td);
  367 
  368         sc = dev->si_drv1;
  369 
  370         sc->disk.d_sectorsize = sc->secsize;
  371         sc->disk.d_mediasize = (off_t)sc->nsect * sc->secsize;
  372         sc->disk.d_fwsectors = sc->nsect > 63 ? 63 : sc->nsect;
  373         sc->disk.d_fwheads = 1;
  374         sc->opencount++;
  375         return (0);
  376 }
  377 
  378 static int
  379 mdclose(dev_t dev, int flags, int fmt, struct thread *td)
  380 {
  381         struct md_s *sc = dev->si_drv1;
  382 
  383         sc->opencount--;
  384         return (0);
  385 }
  386 
  387 static int
  388 mdioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
  389 {
  390 
  391         if (md_debug)
  392                 printf("mdioctl(%s %lx %p %x %p)\n",
  393                         devtoname(dev), cmd, addr, flags, td);
  394 
  395         return (ENOIOCTL);
  396 }
  397 
  398 static int
  399 mdstart_malloc(struct md_s *sc, struct bio *bp)
  400 {
  401         int i, error;
  402         u_char *dst;
  403         unsigned secno, nsec, uc;
  404         uintptr_t sp, osp;
  405 
  406         nsec = bp->bio_bcount / sc->secsize;
  407         secno = bp->bio_pblkno;
  408         dst = bp->bio_data;
  409         error = 0;
  410         while (nsec--) {
  411                 osp = s_read(sc->indir, secno);
  412                 if (bp->bio_cmd == BIO_DELETE) {
  413                         if (osp != 0)
  414                                 error = s_write(sc->indir, secno, 0);
  415                 } else if (bp->bio_cmd == BIO_READ) {
  416                         if (osp == 0)
  417                                 bzero(dst, sc->secsize);
  418                         else if (osp <= 255)
  419                                 for (i = 0; i < sc->secsize; i++)
  420                                         dst[i] = osp;
  421                         else
  422                                 bcopy((void *)osp, dst, sc->secsize);
  423                         osp = 0;
  424                 } else if (bp->bio_cmd == BIO_WRITE) {
  425                         if (sc->flags & MD_COMPRESS) {
  426                                 uc = dst[0];
  427                                 for (i = 1; i < sc->secsize; i++)
  428                                         if (dst[i] != uc)
  429                                                 break;
  430                         } else {
  431                                 i = 0;
  432                                 uc = 0;
  433                         }
  434                         if (i == sc->secsize) {
  435                                 if (osp != uc)
  436                                         error = s_write(sc->indir, secno, uc);
  437                         } else {
  438                                 if (osp <= 255) {
  439                                         sp = (uintptr_t) uma_zalloc(
  440                                             sc->uma, M_NOWAIT);
  441                                         if (sp == 0) {
  442                                                 error = ENOSPC;
  443                                                 break;
  444                                         }
  445                                         bcopy(dst, (void *)sp, sc->secsize);
  446                                         error = s_write(sc->indir, secno, sp);
  447                                 } else {
  448                                         bcopy(dst, (void *)osp, sc->secsize);
  449                                         osp = 0;
  450                                 }
  451                         }
  452                 } else {
  453                         error = EOPNOTSUPP;
  454                 }
  455                 if (osp > 255)
  456                         uma_zfree(sc->uma, (void*)osp);
  457                 if (error)
  458                         break;
  459                 secno++;
  460                 dst += sc->secsize;
  461         }
  462         bp->bio_resid = 0;
  463         return (error);
  464 }
  465 
  466 static int
  467 mdstart_preload(struct md_s *sc, struct bio *bp)
  468 {
  469 
  470         if (bp->bio_cmd == BIO_DELETE) {
  471         } else if (bp->bio_cmd == BIO_READ) {
  472                 bcopy(sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_data, bp->bio_bcount);
  473         } else {
  474                 bcopy(bp->bio_data, sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_bcount);
  475         }
  476         bp->bio_resid = 0;
  477         return (0);
  478 }
  479 
  480 static int
  481 mdstart_vnode(struct md_s *sc, struct bio *bp)
  482 {
  483         int error;
  484         struct uio auio;
  485         struct iovec aiov;
  486         struct mount *mp;
  487 
  488         /*
  489          * VNODE I/O
  490          *
  491          * If an error occurs, we set BIO_ERROR but we do not set
  492          * B_INVAL because (for a write anyway), the buffer is
  493          * still valid.
  494          */
  495 
  496         bzero(&auio, sizeof(auio));
  497 
  498         aiov.iov_base = bp->bio_data;
  499         aiov.iov_len = bp->bio_bcount;
  500         auio.uio_iov = &aiov;
  501         auio.uio_iovcnt = 1;
  502         auio.uio_offset = (vm_ooffset_t)bp->bio_pblkno * sc->secsize;
  503         auio.uio_segflg = UIO_SYSSPACE;
  504         if(bp->bio_cmd == BIO_READ)
  505                 auio.uio_rw = UIO_READ;
  506         else
  507                 auio.uio_rw = UIO_WRITE;
  508         auio.uio_resid = bp->bio_bcount;
  509         auio.uio_td = curthread;
  510         /*
  511          * When reading set IO_DIRECT to try to avoid double-caching
  512          * the data.  When writing IO_DIRECT is not optimal, but we
  513          * must set IO_NOWDRAIN to avoid a wdrain deadlock.
  514          */
  515         if (bp->bio_cmd == BIO_READ) {
  516                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
  517                 error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
  518         } else {
  519                 (void) vn_start_write(sc->vnode, &mp, V_WAIT);
  520                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
  521                 error = VOP_WRITE(sc->vnode, &auio, IO_NOWDRAIN, sc->cred);
  522                 vn_finished_write(mp);
  523         }
  524         VOP_UNLOCK(sc->vnode, 0, curthread);
  525         bp->bio_resid = auio.uio_resid;
  526         return (error);
  527 }
  528 
  529 static int
  530 mdstart_swap(struct md_s *sc, struct bio *bp)
  531 {
  532 
  533         if ((bp->bio_cmd == BIO_DELETE) && (sc->flags & MD_RESERVE))
  534                 biodone(bp);
  535         else
  536                 vm_pager_strategy(sc->object, bp);
  537         return (-1);
  538 }
  539 
  540 static void
  541 mdstrategy(struct bio *bp)
  542 {
  543         struct md_s *sc;
  544 
  545         if (md_debug > 1)
  546                 printf("mdstrategy(%p) %s %x, %jd, %jd %ld, %p)\n",
  547                     (void *)bp, devtoname(bp->bio_dev), bp->bio_flags,
  548                     (intmax_t)bp->bio_blkno,
  549                     (intmax_t)bp->bio_pblkno,
  550                     bp->bio_bcount / DEV_BSIZE,
  551                     (void *)bp->bio_data);
  552 
  553         sc = bp->bio_dev->si_drv1;
  554 
  555         /* XXX: LOCK(sc->lock) */
  556         bioqdisksort(&sc->bio_queue, bp);
  557         /* XXX: UNLOCK(sc->lock) */
  558 
  559         wakeup(sc);
  560 }
  561 
  562 static void
  563 md_kthread(void *arg)
  564 {
  565         struct md_s *sc;
  566         struct bio *bp;
  567         int error;
  568 
  569         sc = arg;
  570         curthread->td_base_pri = PRIBIO;
  571 
  572         mtx_lock(&Giant);
  573         for (;;) {
  574                 /* XXX: LOCK(unique unit numbers) */
  575                 bp = bioq_first(&sc->bio_queue);
  576                 if (bp)
  577                         bioq_remove(&sc->bio_queue, bp);
  578                 /* XXX: UNLOCK(unique unit numbers) */
  579                 if (!bp) {
  580                         if (sc->flags & MD_SHUTDOWN) {
  581                                 sc->procp = NULL;
  582                                 wakeup(&sc->procp);
  583                                 kthread_exit(0);
  584                         }
  585                         tsleep(sc, PRIBIO, "mdwait", 0);
  586                         continue;
  587                 }
  588 
  589                 switch (sc->type) {
  590                 case MD_MALLOC:
  591                         devstat_start_transaction(&sc->stats);
  592                         error = mdstart_malloc(sc, bp);
  593                         break;
  594                 case MD_PRELOAD:
  595                         devstat_start_transaction(&sc->stats);
  596                         error = mdstart_preload(sc, bp);
  597                         break;
  598                 case MD_VNODE:
  599                         devstat_start_transaction(&sc->stats);
  600                         error = mdstart_vnode(sc, bp);
  601                         break;
  602                 case MD_SWAP:
  603                         error = mdstart_swap(sc, bp);
  604                         break;
  605                 default:
  606                         panic("Impossible md(type)");
  607                         break;
  608                 }
  609 
  610                 if (error != -1)
  611                         biofinish(bp, &sc->stats, error);
  612         }
  613 }
  614 
  615 static struct md_s *
  616 mdfind(int unit)
  617 {
  618         struct md_s *sc;
  619 
  620         /* XXX: LOCK(unique unit numbers) */
  621         LIST_FOREACH(sc, &md_softc_list, list) {
  622                 if (sc->unit == unit)
  623                         break;
  624         }
  625         /* XXX: UNLOCK(unique unit numbers) */
  626         return (sc);
  627 }
  628 
  629 static struct md_s *
  630 mdnew(int unit)
  631 {
  632         struct md_s *sc;
  633         int error, max = -1;
  634 
  635         /* XXX: LOCK(unique unit numbers) */
  636         LIST_FOREACH(sc, &md_softc_list, list) {
  637                 if (sc->unit == unit) {
  638                         /* XXX: UNLOCK(unique unit numbers) */
  639                         return (NULL);
  640                 }
  641                 if (sc->unit > max)
  642                         max = sc->unit;
  643         }
  644         if (unit == -1)
  645                 unit = max + 1;
  646         if (unit > 255)
  647                 return (NULL);
  648         sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
  649         sc->unit = unit;
  650         sprintf(sc->name, "md%d", unit);
  651         error = kthread_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
  652         if (error) {
  653                 free(sc, M_MD);
  654                 return (NULL);
  655         }
  656         LIST_INSERT_HEAD(&md_softc_list, sc, list);
  657         /* XXX: UNLOCK(unique unit numbers) */
  658         return (sc);
  659 }
  660 
  661 static void
  662 mdinit(struct md_s *sc)
  663 {
  664 
  665         bioq_init(&sc->bio_queue);
  666         devstat_add_entry(&sc->stats, MD_NAME, sc->unit, sc->secsize,
  667                 DEVSTAT_NO_ORDERED_TAGS,
  668                 DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
  669                 DEVSTAT_PRIORITY_OTHER);
  670         sc->dev = disk_create(sc->unit, &sc->disk, 0, &md_cdevsw, &mddisk_cdevsw);
  671         sc->dev->si_drv1 = sc;
  672 }
  673 
  674 /*
  675  * XXX: we should check that the range they feed us is mapped.
  676  * XXX: we should implement read-only.
  677  */
  678 
  679 static int
  680 mdcreate_preload(struct md_ioctl *mdio)
  681 {
  682         struct md_s *sc;
  683 
  684         if (mdio->md_size == 0)
  685                 return (EINVAL);
  686         if (mdio->md_options & ~(MD_AUTOUNIT))
  687                 return (EINVAL);
  688         if (mdio->md_options & MD_AUTOUNIT) {
  689                 sc = mdnew(-1);
  690                 if (sc == NULL)
  691                         return (ENOMEM);
  692                 mdio->md_unit = sc->unit;
  693         } else {
  694                 sc = mdnew(mdio->md_unit);
  695                 if (sc == NULL)
  696                         return (EBUSY);
  697         }
  698         sc->type = MD_PRELOAD;
  699         sc->secsize = DEV_BSIZE;
  700         sc->nsect = mdio->md_size;
  701         sc->flags = mdio->md_options & MD_FORCE;
  702         /* Cast to pointer size, then to pointer to avoid warning */
  703         sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
  704         sc->pl_len = (mdio->md_size << DEV_BSHIFT);
  705         mdinit(sc);
  706         return (0);
  707 }
  708 
  709 
  710 static int
  711 mdcreate_malloc(struct md_ioctl *mdio)
  712 {
  713         struct md_s *sc;
  714         off_t u;
  715         uintptr_t sp;
  716         int error;
  717 
  718         error = 0;
  719         if (mdio->md_size == 0)
  720                 return (EINVAL);
  721         if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
  722                 return (EINVAL);
  723         /* Compression doesn't make sense if we have reserved space */
  724         if (mdio->md_options & MD_RESERVE)
  725                 mdio->md_options &= ~MD_COMPRESS;
  726         if (mdio->md_options & MD_AUTOUNIT) {
  727                 sc = mdnew(-1);
  728                 if (sc == NULL)
  729                         return (ENOMEM);
  730                 mdio->md_unit = sc->unit;
  731         } else {
  732                 sc = mdnew(mdio->md_unit);
  733                 if (sc == NULL)
  734                         return (EBUSY);
  735         }
  736         sc->type = MD_MALLOC;
  737         sc->secsize = DEV_BSIZE;
  738         sc->nsect = mdio->md_size;
  739         sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
  740         sc->indir = dimension(sc->nsect);
  741         sc->uma = uma_zcreate(sc->name, sc->secsize,
  742             NULL, NULL, NULL, NULL, 0x1ff, 0);
  743         if (mdio->md_options & MD_RESERVE) {
  744                 for (u = 0; u < sc->nsect; u++) {
  745                         sp = (uintptr_t) uma_zalloc(sc->uma, M_NOWAIT | M_ZERO);
  746                         if (sp != 0)
  747                                 error = s_write(sc->indir, u, sp);
  748                         else
  749                                 error = ENOMEM;
  750                         if (error)
  751                                 break;
  752                 }
  753         }
  754         if (!error)  {
  755                 printf("%s%d: Malloc disk\n", MD_NAME, sc->unit);
  756                 mdinit(sc);
  757         } else
  758                 mddestroy(sc, NULL);
  759         return (error);
  760 }
  761 
  762 
  763 static int
  764 mdsetcred(struct md_s *sc, struct ucred *cred)
  765 {
  766         char *tmpbuf;
  767         int error = 0;
  768 
  769         /*
  770          * Set credits in our softc
  771          */
  772 
  773         if (sc->cred)
  774                 crfree(sc->cred);
  775         sc->cred = crhold(cred);
  776 
  777         /*
  778          * Horrible kludge to establish credentials for NFS  XXX.
  779          */
  780 
  781         if (sc->vnode) {
  782                 struct uio auio;
  783                 struct iovec aiov;
  784 
  785                 tmpbuf = malloc(sc->secsize, M_TEMP, M_WAITOK);
  786                 bzero(&auio, sizeof(auio));
  787 
  788                 aiov.iov_base = tmpbuf;
  789                 aiov.iov_len = sc->secsize;
  790                 auio.uio_iov = &aiov;
  791                 auio.uio_iovcnt = 1;
  792                 auio.uio_offset = 0;
  793                 auio.uio_rw = UIO_READ;
  794                 auio.uio_segflg = UIO_SYSSPACE;
  795                 auio.uio_resid = aiov.iov_len;
  796                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
  797                 error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
  798                 VOP_UNLOCK(sc->vnode, 0, curthread);
  799                 free(tmpbuf, M_TEMP);
  800         }
  801         return (error);
  802 }
  803 
  804 static int
  805 mdcreate_vnode(struct md_ioctl *mdio, struct thread *td)
  806 {
  807         struct md_s *sc;
  808         struct vattr vattr;
  809         struct nameidata nd;
  810         int error, flags;
  811 
  812         flags = FREAD|FWRITE;
  813         NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
  814         error = vn_open(&nd, &flags, 0);
  815         if (error) {
  816                 if (error != EACCES && error != EPERM && error != EROFS)
  817                         return (error);
  818                 flags &= ~FWRITE;
  819                 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
  820                 error = vn_open(&nd, &flags, 0);
  821                 if (error)
  822                         return (error);
  823         }
  824         NDFREE(&nd, NDF_ONLY_PNBUF);
  825         if (nd.ni_vp->v_type != VREG ||
  826             (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) {
  827                 VOP_UNLOCK(nd.ni_vp, 0, td);
  828                 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
  829                 return (error ? error : EINVAL);
  830         }
  831         VOP_UNLOCK(nd.ni_vp, 0, td);
  832 
  833         if (mdio->md_options & MD_AUTOUNIT) {
  834                 sc = mdnew(-1);
  835                 mdio->md_unit = sc->unit;
  836         } else {
  837                 sc = mdnew(mdio->md_unit);
  838         }
  839         if (sc == NULL) {
  840                 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
  841                 return (EBUSY);
  842         }
  843 
  844         sc->type = MD_VNODE;
  845         sc->flags = mdio->md_options & MD_FORCE;
  846         if (!(flags & FWRITE))
  847                 sc->flags |= MD_READONLY;
  848         sc->secsize = DEV_BSIZE;
  849         sc->vnode = nd.ni_vp;
  850 
  851         /*
  852          * If the size is specified, override the file attributes.
  853          */
  854         if (mdio->md_size)
  855                 sc->nsect = mdio->md_size;
  856         else
  857                 sc->nsect = vattr.va_size / sc->secsize; /* XXX: round up ? */
  858         if (sc->nsect == 0) {
  859                 mddestroy(sc, td);
  860                 return (EINVAL);
  861         }
  862         error = mdsetcred(sc, td->td_ucred);
  863         if (error) {
  864                 mddestroy(sc, td);
  865                 return (error);
  866         }
  867         mdinit(sc);
  868         return (0);
  869 }
  870 
  871 static int
  872 mddestroy(struct md_s *sc, struct thread *td)
  873 {
  874 
  875         GIANT_REQUIRED;
  876 
  877         if (sc->dev != NULL) {
  878                 devstat_remove_entry(&sc->stats);
  879                 disk_destroy(sc->dev);
  880         }
  881         sc->flags |= MD_SHUTDOWN;
  882         wakeup(sc);
  883         while (sc->procp != NULL)
  884                 tsleep(&sc->procp, PRIBIO, "mddestroy", hz / 10);
  885         if (sc->vnode != NULL)
  886                 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
  887                     FREAD : (FREAD|FWRITE), sc->cred, td);
  888         if (sc->cred != NULL)
  889                 crfree(sc->cred);
  890         if (sc->object != NULL) {
  891                 vm_pager_deallocate(sc->object);
  892         }
  893         if (sc->indir)
  894                 destroy_indir(sc, sc->indir);
  895         if (sc->uma)
  896                 uma_zdestroy(sc->uma);
  897 
  898         /* XXX: LOCK(unique unit numbers) */
  899         LIST_REMOVE(sc, list);
  900         /* XXX: UNLOCK(unique unit numbers) */
  901         free(sc, M_MD);
  902         return (0);
  903 }
  904 
  905 static int
  906 mdcreate_swap(struct md_ioctl *mdio, struct thread *td)
  907 {
  908         int error;
  909         struct md_s *sc;
  910 
  911         GIANT_REQUIRED;
  912 
  913         if (mdio->md_options & MD_AUTOUNIT) {
  914                 sc = mdnew(-1);
  915                 mdio->md_unit = sc->unit;
  916         } else {
  917                 sc = mdnew(mdio->md_unit);
  918         }
  919         if (sc == NULL)
  920                 return (EBUSY);
  921 
  922         sc->type = MD_SWAP;
  923 
  924         /*
  925          * Range check.  Disallow negative sizes or any size less then the
  926          * size of a page.  Then round to a page.
  927          */
  928 
  929         if (mdio->md_size == 0) {
  930                 mddestroy(sc, td);
  931                 return (EDOM);
  932         }
  933 
  934         /*
  935          * Allocate an OBJT_SWAP object.
  936          *
  937          * sc_secsize is PAGE_SIZE'd
  938          *
  939          * mdio->size is in DEV_BSIZE'd chunks.
  940          * Note the truncation.
  941          */
  942 
  943         sc->secsize = PAGE_SIZE;
  944         sc->nsect = mdio->md_size / (PAGE_SIZE / DEV_BSIZE);
  945         sc->object = vm_pager_allocate(OBJT_SWAP, NULL, sc->secsize * (vm_offset_t)sc->nsect, VM_PROT_DEFAULT, 0);
  946         sc->flags = mdio->md_options & MD_FORCE;
  947         if (mdio->md_options & MD_RESERVE) {
  948                 if (swap_pager_reserve(sc->object, 0, sc->nsect) < 0) {
  949                         vm_pager_deallocate(sc->object);
  950                         sc->object = NULL;
  951                         mddestroy(sc, td);
  952                         return (EDOM);
  953                 }
  954         }
  955         error = mdsetcred(sc, td->td_ucred);
  956         if (error)
  957                 mddestroy(sc, td);
  958         else
  959                 mdinit(sc);
  960         return (error);
  961 }
  962 
  963 static int
  964 mddetach(int unit, struct thread *td)
  965 {
  966         struct md_s *sc;
  967 
  968         sc = mdfind(unit);
  969         if (sc == NULL)
  970                 return (ENOENT);
  971         if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
  972                 return (EBUSY);
  973         switch(sc->type) {
  974         case MD_VNODE:
  975         case MD_SWAP:
  976         case MD_MALLOC:
  977         case MD_PRELOAD:
  978                 return (mddestroy(sc, td));
  979         default:
  980                 return (EOPNOTSUPP);
  981         }
  982 }
  983 
  984 static int
  985 mdctlioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
  986 {
  987         struct md_ioctl *mdio;
  988         struct md_s *sc;
  989 
  990         if (md_debug)
  991                 printf("mdctlioctl(%s %lx %p %x %p)\n",
  992                         devtoname(dev), cmd, addr, flags, td);
  993 
  994         /*
  995          * We assert the version number in the individual ioctl
  996          * handlers instead of out here because (a) it is possible we
  997          * may add another ioctl in the future which doesn't read an
  998          * mdio, and (b) the correct return value for an unknown ioctl
  999          * is ENOIOCTL, not EINVAL.
 1000          */
 1001         mdio = (struct md_ioctl *)addr;
 1002         switch (cmd) {
 1003         case MDIOCATTACH:
 1004                 if (mdio->md_version != MDIOVERSION)
 1005                         return (EINVAL);
 1006                 switch (mdio->md_type) {
 1007                 case MD_MALLOC:
 1008                         return (mdcreate_malloc(mdio));
 1009                 case MD_PRELOAD:
 1010                         return (mdcreate_preload(mdio));
 1011                 case MD_VNODE:
 1012                         return (mdcreate_vnode(mdio, td));
 1013                 case MD_SWAP:
 1014                         return (mdcreate_swap(mdio, td));
 1015                 default:
 1016                         return (EINVAL);
 1017                 }
 1018         case MDIOCDETACH:
 1019                 if (mdio->md_version != MDIOVERSION)
 1020                         return (EINVAL);
 1021                 if (mdio->md_file != NULL || mdio->md_size != 0 ||
 1022                     mdio->md_options != 0)
 1023                         return (EINVAL);
 1024                 return (mddetach(mdio->md_unit, td));
 1025         case MDIOCQUERY:
 1026                 if (mdio->md_version != MDIOVERSION)
 1027                         return (EINVAL);
 1028                 sc = mdfind(mdio->md_unit);
 1029                 if (sc == NULL)
 1030                         return (ENOENT);
 1031                 mdio->md_type = sc->type;
 1032                 mdio->md_options = sc->flags;
 1033                 switch (sc->type) {
 1034                 case MD_MALLOC:
 1035                         mdio->md_size = sc->nsect;
 1036                         break;
 1037                 case MD_PRELOAD:
 1038                         mdio->md_size = sc->nsect;
 1039                         mdio->md_base = (uint64_t)(intptr_t)sc->pl_ptr;
 1040                         break;
 1041                 case MD_SWAP:
 1042                         mdio->md_size = sc->nsect * (PAGE_SIZE / DEV_BSIZE);
 1043                         break;
 1044                 case MD_VNODE:
 1045                         mdio->md_size = sc->nsect;
 1046                         /* XXX fill this in */
 1047                         mdio->md_file = NULL;
 1048                         break;
 1049                 }
 1050                 return (0);
 1051         default:
 1052                 return (ENOIOCTL);
 1053         };
 1054         return (ENOIOCTL);
 1055 }
 1056 
 1057 static void
 1058 md_preloaded(u_char *image, unsigned length)
 1059 {
 1060         struct md_s *sc;
 1061 
 1062         sc = mdnew(-1);
 1063         if (sc == NULL)
 1064                 return;
 1065         sc->type = MD_PRELOAD;
 1066         sc->secsize = DEV_BSIZE;
 1067         sc->nsect = length / DEV_BSIZE;
 1068         sc->pl_ptr = image;
 1069         sc->pl_len = length;
 1070         if (sc->unit == 0)
 1071                 mdrootready = 1;
 1072         mdinit(sc);
 1073 }
 1074 
 1075 static void
 1076 md_drvinit(void *unused)
 1077 {
 1078 
 1079         caddr_t mod;
 1080         caddr_t c;
 1081         u_char *ptr, *name, *type;
 1082         unsigned len;
 1083 
 1084 #ifdef MD_ROOT_SIZE
 1085         md_preloaded(mfs_root, MD_ROOT_SIZE*1024);
 1086 #endif
 1087         mod = NULL;
 1088         while ((mod = preload_search_next_name(mod)) != NULL) {
 1089                 name = (char *)preload_search_info(mod, MODINFO_NAME);
 1090                 type = (char *)preload_search_info(mod, MODINFO_TYPE);
 1091                 if (name == NULL)
 1092                         continue;
 1093                 if (type == NULL)
 1094                         continue;
 1095                 if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
 1096                         continue;
 1097                 c = preload_search_info(mod, MODINFO_ADDR);
 1098                 ptr = *(u_char **)c;
 1099                 c = preload_search_info(mod, MODINFO_SIZE);
 1100                 len = *(size_t *)c;
 1101                 printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
 1102                     MD_NAME, mdunits, name, len, ptr);
 1103                 md_preloaded(ptr, len);
 1104         }
 1105         status_dev = make_dev(&mdctl_cdevsw, 0xffff00ff, UID_ROOT, GID_WHEEL,
 1106             0600, MDCTL_NAME);
 1107 }
 1108 
 1109 static int
 1110 md_modevent(module_t mod, int type, void *data)
 1111 {
 1112         int error;
 1113         struct md_s *sc;
 1114 
 1115         switch (type) {
 1116         case MOD_LOAD:
 1117                 md_drvinit(NULL);
 1118                 break;
 1119         case MOD_UNLOAD:
 1120                 LIST_FOREACH(sc, &md_softc_list, list) {
 1121                         error = mddetach(sc->unit, curthread);
 1122                         if (error != 0)
 1123                                 return (error);
 1124                 }
 1125                 if (status_dev)
 1126                         destroy_dev(status_dev);
 1127                 status_dev = 0;
 1128                 break;
 1129         default:
 1130                 break;
 1131         }
 1132         return (0);
 1133 }
 1134 
 1135 static moduledata_t md_mod = {
 1136         MD_NAME,
 1137         md_modevent,
 1138         NULL
 1139 };
 1140 DECLARE_MODULE(md, md_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR);
 1141 MODULE_VERSION(md, MD_MODVER);
 1142 
 1143 
 1144 #ifdef MD_ROOT
 1145 static void
 1146 md_takeroot(void *junk)
 1147 {
 1148         if (mdrootready)
 1149                 rootdevnames[0] = "ufs:/dev/md0";
 1150 }
 1151 
 1152 SYSINIT(md_root, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, md_takeroot, NULL);
 1153 #endif

Cache object: d7abb91ca9f67c495d6daef121c9c0e2


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.