The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/dev/md/md.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * ----------------------------------------------------------------------------
    3  * "THE BEER-WARE LICENSE" (Revision 42):
    4  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
    5  * can do whatever you want with this stuff. If we meet some day, and you think
    6  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
    7  * ----------------------------------------------------------------------------
    8  *
    9  * $FreeBSD$
   10  *
   11  */
   12 
   13 /*-
   14  * The following functions are based on the vn(4) driver: mdstart_swap(),
   15  * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
   16  * and as such under the following copyright:
   17  *
   18  * Copyright (c) 1988 University of Utah.
   19  * Copyright (c) 1990, 1993
   20  *      The Regents of the University of California.  All rights reserved.
   21  *
   22  * This code is derived from software contributed to Berkeley by
   23  * the Systems Programming Group of the University of Utah Computer
   24  * Science Department.
   25  *
   26  * Redistribution and use in source and binary forms, with or without
   27  * modification, are permitted provided that the following conditions
   28  * are met:
   29  * 1. Redistributions of source code must retain the above copyright
   30  *    notice, this list of conditions and the following disclaimer.
   31  * 2. Redistributions in binary form must reproduce the above copyright
   32  *    notice, this list of conditions and the following disclaimer in the
   33  *    documentation and/or other materials provided with the distribution.
   34  * 4. Neither the name of the University nor the names of its contributors
   35  *    may be used to endorse or promote products derived from this software
   36  *    without specific prior written permission.
   37  *
   38  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   41  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   48  * SUCH DAMAGE.
   49  *
   50  * from: Utah Hdr: vn.c 1.13 94/04/02
   51  *
   52  *      from: @(#)vn.c  8.6 (Berkeley) 4/1/94
   53  * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
   54  */
   55 
   56 #include "opt_geom.h"
   57 #include "opt_md.h"
   58 
   59 #include <sys/param.h>
   60 #include <sys/systm.h>
   61 #include <sys/bio.h>
   62 #include <sys/conf.h>
   63 #include <sys/fcntl.h>
   64 #include <sys/kernel.h>
   65 #include <sys/kthread.h>
   66 #include <sys/linker.h>
   67 #include <sys/lock.h>
   68 #include <sys/malloc.h>
   69 #include <sys/mdioctl.h>
   70 #include <sys/mutex.h>
   71 #include <sys/namei.h>
   72 #include <sys/proc.h>
   73 #include <sys/queue.h>
   74 #include <sys/sf_buf.h>
   75 #include <sys/sysctl.h>
   76 #include <sys/vnode.h>
   77 
   78 #include <geom/geom.h>
   79 
   80 #include <vm/vm.h>
   81 #include <vm/vm_object.h>
   82 #include <vm/vm_page.h>
   83 #include <vm/vm_pager.h>
   84 #include <vm/swap_pager.h>
   85 #include <vm/uma.h>
   86 
   87 #define MD_MODVER 1
   88 
   89 #define MD_SHUTDOWN 0x10000     /* Tell worker thread to terminate. */
   90 
   91 #ifndef MD_NSECT
   92 #define MD_NSECT (10000 * 2)
   93 #endif
   94 
   95 static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk");
   96 static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors");
   97 
   98 static int md_debug;
   99 SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
  100 
  101 #if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
  102 /* Image gets put here: */
  103 static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here";
  104 static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here";
  105 #endif
  106 
  107 static g_init_t g_md_init;
  108 static g_fini_t g_md_fini;
  109 static g_start_t g_md_start;
  110 static g_access_t g_md_access;
  111 
  112 static int      mdunits;
  113 static struct cdev *status_dev = 0;
  114 
  115 static d_ioctl_t mdctlioctl;
  116 
  117 static struct cdevsw mdctl_cdevsw = {
  118         .d_version =    D_VERSION,
  119         .d_flags =      D_NEEDGIANT,
  120         .d_ioctl =      mdctlioctl,
  121         .d_name =       MD_NAME,
  122 };
  123 
  124 struct g_class g_md_class = {
  125         .name = "MD",
  126         .version = G_VERSION,
  127         .init = g_md_init,
  128         .fini = g_md_fini,
  129         .start = g_md_start,
  130         .access = g_md_access,
  131 };
  132 
  133 DECLARE_GEOM_CLASS(g_md_class, g_md);
  134 
  135 
  136 static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);
  137 
  138 #define NINDIR  (PAGE_SIZE / sizeof(uintptr_t))
  139 #define NMASK   (NINDIR-1)
  140 static int nshift;
  141 
  142 struct indir {
  143         uintptr_t       *array;
  144         u_int           total;
  145         u_int           used;
  146         u_int           shift;
  147 };
  148 
  149 struct md_s {
  150         int unit;
  151         LIST_ENTRY(md_s) list;
  152         struct bio_queue_head bio_queue;
  153         struct mtx queue_mtx;
  154         struct cdev *dev;
  155         enum md_types type;
  156         unsigned nsect;
  157         unsigned opencount;
  158         unsigned secsize;
  159         unsigned fwheads;
  160         unsigned fwsectors;
  161         unsigned flags;
  162         char name[20];
  163         struct proc *procp;
  164         struct g_geom *gp;
  165         struct g_provider *pp;
  166 
  167         /* MD_MALLOC related fields */
  168         struct indir *indir;
  169         uma_zone_t uma;
  170 
  171         /* MD_PRELOAD related fields */
  172         u_char *pl_ptr;
  173         unsigned pl_len;
  174 
  175         /* MD_VNODE related fields */
  176         struct vnode *vnode;
  177         struct ucred *cred;
  178 
  179         /* MD_SWAP related fields */
  180         vm_object_t object;
  181         unsigned npage;
  182 };
  183 
  184 static int mddestroy(struct md_s *sc, struct thread *td);
  185 
  186 static struct indir *
  187 new_indir(u_int shift)
  188 {
  189         struct indir *ip;
  190 
  191         ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO);
  192         if (ip == NULL)
  193                 return (NULL);
  194         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  195             M_MDSECT, M_NOWAIT | M_ZERO);
  196         if (ip->array == NULL) {
  197                 free(ip, M_MD);
  198                 return (NULL);
  199         }
  200         ip->total = NINDIR;
  201         ip->shift = shift;
  202         return (ip);
  203 }
  204 
  205 static void
  206 del_indir(struct indir *ip)
  207 {
  208 
  209         free(ip->array, M_MDSECT);
  210         free(ip, M_MD);
  211 }
  212 
  213 static void
  214 destroy_indir(struct md_s *sc, struct indir *ip)
  215 {
  216         int i;
  217 
  218         for (i = 0; i < NINDIR; i++) {
  219                 if (!ip->array[i])
  220                         continue;
  221                 if (ip->shift)
  222                         destroy_indir(sc, (struct indir*)(ip->array[i]));
  223                 else if (ip->array[i] > 255)
  224                         uma_zfree(sc->uma, (void *)(ip->array[i]));
  225         }
  226         del_indir(ip);
  227 }
  228 
  229 /*
  230  * This function does the math and alloctes the top level "indir" structure
  231  * for a device of "size" sectors.
  232  */
  233 
  234 static struct indir *
  235 dimension(off_t size)
  236 {
  237         off_t rcnt;
  238         struct indir *ip;
  239         int i, layer;
  240 
  241         rcnt = size;
  242         layer = 0;
  243         while (rcnt > NINDIR) {
  244                 rcnt /= NINDIR;
  245                 layer++;
  246         }
  247         /* figure out log2(NINDIR) */
  248         for (i = NINDIR, nshift = -1; i; nshift++)
  249                 i >>= 1;
  250 
  251         /*
  252          * XXX: the top layer is probably not fully populated, so we allocate
  253          * too much space for ip->array in here.
  254          */
  255         ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
  256         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  257             M_MDSECT, M_WAITOK | M_ZERO);
  258         ip->total = NINDIR;
  259         ip->shift = layer * nshift;
  260         return (ip);
  261 }
  262 
  263 /*
  264  * Read a given sector
  265  */
  266 
  267 static uintptr_t
  268 s_read(struct indir *ip, off_t offset)
  269 {
  270         struct indir *cip;
  271         int idx;
  272         uintptr_t up;
  273 
  274         if (md_debug > 1)
  275                 printf("s_read(%jd)\n", (intmax_t)offset);
  276         up = 0;
  277         for (cip = ip; cip != NULL;) {
  278                 if (cip->shift) {
  279                         idx = (offset >> cip->shift) & NMASK;
  280                         up = cip->array[idx];
  281                         cip = (struct indir *)up;
  282                         continue;
  283                 }
  284                 idx = offset & NMASK;
  285                 return (cip->array[idx]);
  286         }
  287         return (0);
  288 }
  289 
  290 /*
  291  * Write a given sector, prune the tree if the value is 0
  292  */
  293 
  294 static int
  295 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
  296 {
  297         struct indir *cip, *lip[10];
  298         int idx, li;
  299         uintptr_t up;
  300 
  301         if (md_debug > 1)
  302                 printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
  303         up = 0;
  304         li = 0;
  305         cip = ip;
  306         for (;;) {
  307                 lip[li++] = cip;
  308                 if (cip->shift) {
  309                         idx = (offset >> cip->shift) & NMASK;
  310                         up = cip->array[idx];
  311                         if (up != 0) {
  312                                 cip = (struct indir *)up;
  313                                 continue;
  314                         }
  315                         /* Allocate branch */
  316                         cip->array[idx] =
  317                             (uintptr_t)new_indir(cip->shift - nshift);
  318                         if (cip->array[idx] == 0)
  319                                 return (ENOSPC);
  320                         cip->used++;
  321                         up = cip->array[idx];
  322                         cip = (struct indir *)up;
  323                         continue;
  324                 }
  325                 /* leafnode */
  326                 idx = offset & NMASK;
  327                 up = cip->array[idx];
  328                 if (up != 0)
  329                         cip->used--;
  330                 cip->array[idx] = ptr;
  331                 if (ptr != 0)
  332                         cip->used++;
  333                 break;
  334         }
  335         if (cip->used != 0 || li == 1)
  336                 return (0);
  337         li--;
  338         while (cip->used == 0 && cip != ip) {
  339                 li--;
  340                 idx = (offset >> lip[li]->shift) & NMASK;
  341                 up = lip[li]->array[idx];
  342                 KASSERT(up == (uintptr_t)cip, ("md screwed up"));
  343                 del_indir(cip);
  344                 lip[li]->array[idx] = 0;
  345                 lip[li]->used--;
  346                 cip = lip[li];
  347         }
  348         return (0);
  349 }
  350 
  351 
  352 static int
  353 g_md_access(struct g_provider *pp, int r, int w, int e)
  354 {
  355         struct md_s *sc;
  356 
  357         sc = pp->geom->softc;
  358         if (sc == NULL)
  359                 return (ENXIO);
  360         r += pp->acr;
  361         w += pp->acw;
  362         e += pp->ace;
  363         if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
  364                 sc->opencount = 1;
  365         } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
  366                 sc->opencount = 0;
  367         }
  368         return (0);
  369 }
  370 
  371 static void
  372 g_md_start(struct bio *bp)
  373 {
  374         struct md_s *sc;
  375 
  376         sc = bp->bio_to->geom->softc;
  377 
  378         bp->bio_pblkno = bp->bio_offset / sc->secsize;
  379         bp->bio_bcount = bp->bio_length;
  380         mtx_lock(&sc->queue_mtx);
  381         bioq_disksort(&sc->bio_queue, bp);
  382         mtx_unlock(&sc->queue_mtx);
  383 
  384         wakeup(sc);
  385 }
  386 
  387 
  388 
  389 static int
  390 mdstart_malloc(struct md_s *sc, struct bio *bp)
  391 {
  392         int i, error;
  393         u_char *dst;
  394         unsigned secno, nsec, uc;
  395         uintptr_t sp, osp;
  396 
  397         nsec = bp->bio_bcount / sc->secsize;
  398         secno = bp->bio_pblkno;
  399         dst = bp->bio_data;
  400         error = 0;
  401         while (nsec--) {
  402                 osp = s_read(sc->indir, secno);
  403                 if (bp->bio_cmd == BIO_DELETE) {
  404                         if (osp != 0)
  405                                 error = s_write(sc->indir, secno, 0);
  406                 } else if (bp->bio_cmd == BIO_READ) {
  407                         if (osp == 0)
  408                                 bzero(dst, sc->secsize);
  409                         else if (osp <= 255)
  410                                 for (i = 0; i < sc->secsize; i++)
  411                                         dst[i] = osp;
  412                         else
  413                                 bcopy((void *)osp, dst, sc->secsize);
  414                         osp = 0;
  415                 } else if (bp->bio_cmd == BIO_WRITE) {
  416                         if (sc->flags & MD_COMPRESS) {
  417                                 uc = dst[0];
  418                                 for (i = 1; i < sc->secsize; i++)
  419                                         if (dst[i] != uc)
  420                                                 break;
  421                         } else {
  422                                 i = 0;
  423                                 uc = 0;
  424                         }
  425                         if (i == sc->secsize) {
  426                                 if (osp != uc)
  427                                         error = s_write(sc->indir, secno, uc);
  428                         } else {
  429                                 if (osp <= 255) {
  430                                         sp = (uintptr_t) uma_zalloc(
  431                                             sc->uma, M_NOWAIT);
  432                                         if (sp == 0) {
  433                                                 error = ENOSPC;
  434                                                 break;
  435                                         }
  436                                         bcopy(dst, (void *)sp, sc->secsize);
  437                                         error = s_write(sc->indir, secno, sp);
  438                                 } else {
  439                                         bcopy(dst, (void *)osp, sc->secsize);
  440                                         osp = 0;
  441                                 }
  442                         }
  443                 } else {
  444                         error = EOPNOTSUPP;
  445                 }
  446                 if (osp > 255)
  447                         uma_zfree(sc->uma, (void*)osp);
  448                 if (error)
  449                         break;
  450                 secno++;
  451                 dst += sc->secsize;
  452         }
  453         bp->bio_resid = 0;
  454         return (error);
  455 }
  456 
  457 static int
  458 mdstart_preload(struct md_s *sc, struct bio *bp)
  459 {
  460 
  461         if (bp->bio_cmd == BIO_DELETE) {
  462         } else if (bp->bio_cmd == BIO_READ) {
  463                 bcopy(sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_data, bp->bio_bcount);
  464         } else {
  465                 bcopy(bp->bio_data, sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_bcount);
  466         }
  467         bp->bio_resid = 0;
  468         return (0);
  469 }
  470 
  471 static int
  472 mdstart_vnode(struct md_s *sc, struct bio *bp)
  473 {
  474         int error;
  475         struct uio auio;
  476         struct iovec aiov;
  477         struct mount *mp;
  478 
  479         /*
  480          * VNODE I/O
  481          *
  482          * If an error occurs, we set BIO_ERROR but we do not set
  483          * B_INVAL because (for a write anyway), the buffer is
  484          * still valid.
  485          */
  486 
  487         bzero(&auio, sizeof(auio));
  488 
  489         aiov.iov_base = bp->bio_data;
  490         aiov.iov_len = bp->bio_bcount;
  491         auio.uio_iov = &aiov;
  492         auio.uio_iovcnt = 1;
  493         auio.uio_offset = (vm_ooffset_t)bp->bio_pblkno * sc->secsize;
  494         auio.uio_segflg = UIO_SYSSPACE;
  495         if(bp->bio_cmd == BIO_READ)
  496                 auio.uio_rw = UIO_READ;
  497         else if(bp->bio_cmd == BIO_WRITE)
  498                 auio.uio_rw = UIO_WRITE;
  499         else
  500                 panic("wrong BIO_OP in mdstart_vnode");
  501         auio.uio_resid = bp->bio_bcount;
  502         auio.uio_td = curthread;
  503         /*
  504          * When reading set IO_DIRECT to try to avoid double-caching
  505          * the data.  When writing IO_DIRECT is not optimal.
  506          */
  507         if (bp->bio_cmd == BIO_READ) {
  508                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
  509                 error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
  510                 VOP_UNLOCK(sc->vnode, 0, curthread);
  511         } else {
  512                 (void) vn_start_write(sc->vnode, &mp, V_WAIT);
  513                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
  514                 error = VOP_WRITE(sc->vnode, &auio,
  515                     sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred);
  516                 VOP_UNLOCK(sc->vnode, 0, curthread);
  517                 vn_finished_write(mp);
  518         }
  519         bp->bio_resid = auio.uio_resid;
  520         return (error);
  521 }
  522 
  523 static int
  524 mdstart_swap(struct md_s *sc, struct bio *bp)
  525 {
  526         struct sf_buf *sf;
  527         int i, rv;
  528         int offs, len, lastp, lastend;
  529         vm_page_t m;
  530         u_char *p;
  531 
  532         p = bp->bio_data;
  533 
  534         /*
  535          * offs is the ofset at whih to start operating on the
  536          * next (ie, first) page.  lastp is the last page on
  537          * which we're going to operate.  lastend is the ending
  538          * position within that last page (ie, PAGE_SIZE if
  539          * we're operating on complete aligned pages).
  540          */
  541         offs = bp->bio_offset % PAGE_SIZE;
  542         lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
  543         lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
  544 
  545         VM_OBJECT_LOCK(sc->object);
  546         vm_object_pip_add(sc->object, 1);
  547         for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
  548                 len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
  549 
  550                 m = vm_page_grab(sc->object, i,
  551                     VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
  552                 VM_OBJECT_UNLOCK(sc->object);
  553                 sf = sf_buf_alloc(m, 0);
  554                 VM_OBJECT_LOCK(sc->object);
  555                 if (bp->bio_cmd == BIO_READ) {
  556                         if (m->valid != VM_PAGE_BITS_ALL)
  557                                 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
  558                         bcopy((void *)(sf_buf_kva(sf) + offs), p, len);
  559                 } else if (bp->bio_cmd == BIO_WRITE) {
  560                         if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
  561                                 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
  562                         bcopy(p, (void *)(sf_buf_kva(sf) + offs), len);
  563                         m->valid = VM_PAGE_BITS_ALL;
  564 #if 0
  565                 } else if (bp->bio_cmd == BIO_DELETE) {
  566                         if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
  567                                 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
  568                         bzero((void *)(sf_buf_kva(sf) + offs), len);
  569                         vm_page_dirty(m);
  570                         m->valid = VM_PAGE_BITS_ALL;
  571 #endif
  572                 }
  573                 sf_buf_free(sf);
  574                 vm_page_lock_queues();
  575                 vm_page_wakeup(m);
  576                 vm_page_activate(m);
  577                 if (bp->bio_cmd == BIO_WRITE)
  578                         vm_page_dirty(m);
  579                 vm_page_unlock_queues();
  580 
  581                 /* Actions on further pages start at offset 0 */
  582                 p += PAGE_SIZE - offs;
  583                 offs = 0;
  584 #if 0
  585 if (bootverbose || bp->bio_offset / PAGE_SIZE < 17)
  586 printf("wire_count %d busy %d flags %x hold_count %d act_count %d queue %d valid %d dirty %d @ %d\n",
  587     m->wire_count, m->busy, 
  588     m->flags, m->hold_count, m->act_count, m->queue, m->valid, m->dirty, i);
  589 #endif
  590         }
  591         vm_object_pip_subtract(sc->object, 1);
  592         vm_object_set_writeable_dirty(sc->object);
  593         VM_OBJECT_UNLOCK(sc->object);
  594         return (0);
  595 }
  596 
  597 static void
  598 md_kthread(void *arg)
  599 {
  600         struct md_s *sc;
  601         struct bio *bp;
  602         int error, hasgiant;
  603 
  604         sc = arg;
  605         curthread->td_base_pri = PRIBIO;
  606 
  607         switch (sc->type) {
  608         case MD_VNODE:
  609                 mtx_lock(&Giant);
  610                 hasgiant = 1;
  611                 break;
  612         case MD_MALLOC:
  613         case MD_PRELOAD:
  614         case MD_SWAP:
  615         default:
  616                 hasgiant = 0;
  617                 break;
  618         }
  619         
  620         for (;;) {
  621                 mtx_lock(&sc->queue_mtx);
  622                 bp = bioq_first(&sc->bio_queue);
  623                 if (bp)
  624                         bioq_remove(&sc->bio_queue, bp);
  625                 if (!bp) {
  626                         if (sc->flags & MD_SHUTDOWN) {
  627                                 mtx_unlock(&sc->queue_mtx);
  628                                 sc->procp = NULL;
  629                                 wakeup(&sc->procp);
  630                                 if (hasgiant)
  631                                         mtx_unlock(&Giant);
  632                                 kthread_exit(0);
  633                         }
  634                         msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
  635                         continue;
  636                 }
  637                 mtx_unlock(&sc->queue_mtx);
  638                 if (bp->bio_cmd == BIO_GETATTR) {
  639                         if (sc->fwsectors && sc->fwheads &&
  640                             (g_handleattr_int(bp, "GEOM::fwsectors",
  641                             sc->fwsectors) ||
  642                             g_handleattr_int(bp, "GEOM::fwheads",
  643                             sc->fwheads)))
  644                                 error = -1;
  645                         else
  646                                 error = EOPNOTSUPP;
  647                 } else {
  648                         switch (sc->type) {
  649                         case MD_MALLOC:
  650                                 error = mdstart_malloc(sc, bp);
  651                                 break;
  652                         case MD_PRELOAD:
  653                                 error = mdstart_preload(sc, bp);
  654                                 break;
  655                         case MD_VNODE:
  656                                 error = mdstart_vnode(sc, bp);
  657                                 break;
  658                         case MD_SWAP:
  659                                 error = mdstart_swap(sc, bp);
  660                                 break;
  661                         default:
  662                                 panic("Impossible md(type)");
  663                                 break;
  664                         }
  665                 }
  666 
  667                 if (error != -1) {
  668                         bp->bio_completed = bp->bio_length;
  669                         g_io_deliver(bp, error);
  670                 }
  671         }
  672 }
  673 
  674 static struct md_s *
  675 mdfind(int unit)
  676 {
  677         struct md_s *sc;
  678 
  679         /* XXX: LOCK(unique unit numbers) */
  680         LIST_FOREACH(sc, &md_softc_list, list) {
  681                 if (sc->unit == unit)
  682                         break;
  683         }
  684         /* XXX: UNLOCK(unique unit numbers) */
  685         return (sc);
  686 }
  687 
  688 static struct md_s *
  689 mdnew(int unit)
  690 {
  691         struct md_s *sc;
  692         int error, max = -1;
  693 
  694         /* XXX: LOCK(unique unit numbers) */
  695         LIST_FOREACH(sc, &md_softc_list, list) {
  696                 if (sc->unit == unit) {
  697                         /* XXX: UNLOCK(unique unit numbers) */
  698                         return (NULL);
  699                 }
  700                 if (sc->unit > max)
  701                         max = sc->unit;
  702         }
  703         if (unit == -1)
  704                 unit = max + 1;
  705         sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
  706         sc->unit = unit;
  707         bioq_init(&sc->bio_queue);
  708         mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
  709         sprintf(sc->name, "md%d", unit);
  710         error = kthread_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
  711         if (error) {
  712                 free(sc, M_MD);
  713                 return (NULL);
  714         }
  715         LIST_INSERT_HEAD(&md_softc_list, sc, list);
  716         /* XXX: UNLOCK(unique unit numbers) */
  717         return (sc);
  718 }
  719 
  720 static void
  721 mdinit(struct md_s *sc)
  722 {
  723 
  724         struct g_geom *gp;
  725         struct g_provider *pp;
  726 
  727         DROP_GIANT();
  728         g_topology_lock();
  729         gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
  730         gp->softc = sc;
  731         pp = g_new_providerf(gp, "md%d", sc->unit);
  732         pp->mediasize = (off_t)sc->nsect * sc->secsize;
  733         pp->sectorsize = sc->secsize;
  734         sc->gp = gp;
  735         sc->pp = pp;
  736         g_error_provider(pp, 0);
  737         g_topology_unlock();
  738         PICKUP_GIANT();
  739 }
  740 
  741 /*
  742  * XXX: we should check that the range they feed us is mapped.
  743  * XXX: we should implement read-only.
  744  */
  745 
  746 static int
  747 mdcreate_preload(struct md_ioctl *mdio)
  748 {
  749         struct md_s *sc;
  750 
  751         if (mdio->md_size == 0)
  752                 return (EINVAL);
  753         if (mdio->md_options & ~(MD_AUTOUNIT))
  754                 return (EINVAL);
  755         if (mdio->md_options & MD_AUTOUNIT) {
  756                 sc = mdnew(-1);
  757                 if (sc == NULL)
  758                         return (ENOMEM);
  759                 mdio->md_unit = sc->unit;
  760         } else {
  761                 sc = mdnew(mdio->md_unit);
  762                 if (sc == NULL)
  763                         return (EBUSY);
  764         }
  765         sc->type = MD_PRELOAD;
  766         sc->secsize = DEV_BSIZE;
  767         sc->nsect = mdio->md_size;
  768         sc->flags = mdio->md_options & MD_FORCE;
  769         /* Cast to pointer size, then to pointer to avoid warning */
  770         sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
  771         sc->pl_len = (mdio->md_size << DEV_BSHIFT);
  772         mdinit(sc);
  773         return (0);
  774 }
  775 
  776 
  777 static int
  778 mdcreate_malloc(struct md_ioctl *mdio)
  779 {
  780         struct md_s *sc;
  781         off_t u;
  782         uintptr_t sp;
  783         int error;
  784 
  785         error = 0;
  786         if (mdio->md_size == 0)
  787                 return (EINVAL);
  788         if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
  789                 return (EINVAL);
  790         if (mdio->md_secsize != 0 && !powerof2(mdio->md_secsize))
  791                 return (EINVAL);
  792         /* Compression doesn't make sense if we have reserved space */
  793         if (mdio->md_options & MD_RESERVE)
  794                 mdio->md_options &= ~MD_COMPRESS;
  795         if (mdio->md_options & MD_AUTOUNIT) {
  796                 sc = mdnew(-1);
  797                 if (sc == NULL)
  798                         return (ENOMEM);
  799                 mdio->md_unit = sc->unit;
  800         } else {
  801                 sc = mdnew(mdio->md_unit);
  802                 if (sc == NULL)
  803                         return (EBUSY);
  804         }
  805         sc->type = MD_MALLOC;
  806         if (mdio->md_secsize != 0)
  807                 sc->secsize = mdio->md_secsize;
  808         else
  809                 sc->secsize = DEV_BSIZE;
  810         if (mdio->md_fwsectors != 0)
  811                 sc->fwsectors = mdio->md_fwsectors;
  812         if (mdio->md_fwheads != 0)
  813                 sc->fwheads = mdio->md_fwheads;
  814         sc->nsect = (mdio->md_size * DEV_BSIZE) / sc->secsize;
  815         sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
  816         sc->indir = dimension(sc->nsect);
  817         sc->uma = uma_zcreate(sc->name, sc->secsize,
  818             NULL, NULL, NULL, NULL, 0x1ff, 0);
  819         if (mdio->md_options & MD_RESERVE) {
  820                 for (u = 0; u < sc->nsect; u++) {
  821                         sp = (uintptr_t) uma_zalloc(sc->uma, M_NOWAIT | M_ZERO);
  822                         if (sp != 0)
  823                                 error = s_write(sc->indir, u, sp);
  824                         else
  825                                 error = ENOMEM;
  826                         if (error)
  827                                 break;
  828                 }
  829         }
  830         if (error)  {
  831                 mddestroy(sc, NULL);
  832                 return (error);
  833         }
  834         mdinit(sc);
  835         if (!(mdio->md_options & MD_RESERVE))
  836                 sc->pp->flags |= G_PF_CANDELETE;
  837         return (0);
  838 }
  839 
  840 
  841 static int
  842 mdsetcred(struct md_s *sc, struct ucred *cred)
  843 {
  844         char *tmpbuf;
  845         int error = 0;
  846 
  847         /*
  848          * Set credits in our softc
  849          */
  850 
  851         if (sc->cred)
  852                 crfree(sc->cred);
  853         sc->cred = crhold(cred);
  854 
  855         /*
  856          * Horrible kludge to establish credentials for NFS  XXX.
  857          */
  858 
  859         if (sc->vnode) {
  860                 struct uio auio;
  861                 struct iovec aiov;
  862 
  863                 tmpbuf = malloc(sc->secsize, M_TEMP, M_WAITOK);
  864                 bzero(&auio, sizeof(auio));
  865 
  866                 aiov.iov_base = tmpbuf;
  867                 aiov.iov_len = sc->secsize;
  868                 auio.uio_iov = &aiov;
  869                 auio.uio_iovcnt = 1;
  870                 auio.uio_offset = 0;
  871                 auio.uio_rw = UIO_READ;
  872                 auio.uio_segflg = UIO_SYSSPACE;
  873                 auio.uio_resid = aiov.iov_len;
  874                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
  875                 error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
  876                 VOP_UNLOCK(sc->vnode, 0, curthread);
  877                 free(tmpbuf, M_TEMP);
  878         }
  879         return (error);
  880 }
  881 
  882 static int
  883 mdcreate_vnode(struct md_ioctl *mdio, struct thread *td)
  884 {
  885         struct md_s *sc;
  886         struct vattr vattr;
  887         struct nameidata nd;
  888         int error, flags;
  889 
  890         flags = FREAD|FWRITE;
  891         NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
  892         error = vn_open(&nd, &flags, 0, -1);
  893         if (error) {
  894                 if (error != EACCES && error != EPERM && error != EROFS)
  895                         return (error);
  896                 flags &= ~FWRITE;
  897                 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
  898                 error = vn_open(&nd, &flags, 0, -1);
  899                 if (error)
  900                         return (error);
  901         }
  902         NDFREE(&nd, NDF_ONLY_PNBUF);
  903         if (nd.ni_vp->v_type != VREG ||
  904             (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) {
  905                 VOP_UNLOCK(nd.ni_vp, 0, td);
  906                 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
  907                 return (error ? error : EINVAL);
  908         }
  909         VOP_UNLOCK(nd.ni_vp, 0, td);
  910 
  911         if (mdio->md_options & MD_AUTOUNIT) {
  912                 sc = mdnew(-1);
  913                 mdio->md_unit = sc->unit;
  914         } else {
  915                 sc = mdnew(mdio->md_unit);
  916         }
  917         if (sc == NULL) {
  918                 (void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
  919                 return (EBUSY);
  920         }
  921 
  922         if (mdio->md_fwsectors != 0)
  923                 sc->fwsectors = mdio->md_fwsectors;
  924         if (mdio->md_fwheads != 0)
  925                 sc->fwheads = mdio->md_fwheads;
  926         sc->type = MD_VNODE;
  927         sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC);
  928         if (!(flags & FWRITE))
  929                 sc->flags |= MD_READONLY;
  930         sc->secsize = DEV_BSIZE;
  931         sc->vnode = nd.ni_vp;
  932 
  933         /*
  934          * If the size is specified, override the file attributes.
  935          */
  936         if (mdio->md_size)
  937                 sc->nsect = mdio->md_size;
  938         else
  939                 sc->nsect = vattr.va_size / sc->secsize; /* XXX: round up ? */
  940         if (sc->nsect == 0) {
  941                 mddestroy(sc, td);
  942                 return (EINVAL);
  943         }
  944         error = mdsetcred(sc, td->td_ucred);
  945         if (error) {
  946                 mddestroy(sc, td);
  947                 return (error);
  948         }
  949         mdinit(sc);
  950         return (0);
  951 }
  952 
  953 static void
  954 md_zapit(void *p, int cancel)
  955 {
  956         if (cancel)
  957                 return;
  958         g_wither_geom(p, ENXIO);
  959 }
  960 
  961 static int
  962 mddestroy(struct md_s *sc, struct thread *td)
  963 {
  964 
  965         GIANT_REQUIRED;
  966 
  967         mtx_destroy(&sc->queue_mtx);
  968         if (sc->gp) {
  969                 sc->gp->softc = NULL;
  970                 g_waitfor_event(md_zapit, sc->gp, M_WAITOK, sc->gp, NULL);
  971                 sc->gp = NULL;
  972                 sc->pp = NULL;
  973         }
  974         sc->flags |= MD_SHUTDOWN;
  975         wakeup(sc);
  976         while (sc->procp != NULL)
  977                 tsleep(&sc->procp, PRIBIO, "mddestroy", hz / 10);
  978         if (sc->vnode != NULL)
  979                 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
  980                     FREAD : (FREAD|FWRITE), sc->cred, td);
  981         if (sc->cred != NULL)
  982                 crfree(sc->cred);
  983         if (sc->object != NULL) {
  984                 vm_object_deallocate(sc->object);
  985         }
  986         if (sc->indir)
  987                 destroy_indir(sc, sc->indir);
  988         if (sc->uma)
  989                 uma_zdestroy(sc->uma);
  990 
  991         /* XXX: LOCK(unique unit numbers) */
  992         LIST_REMOVE(sc, list);
  993         /* XXX: UNLOCK(unique unit numbers) */
  994         free(sc, M_MD);
  995         return (0);
  996 }
  997 
  998 static int
  999 mdcreate_swap(struct md_ioctl *mdio, struct thread *td)
 1000 {
 1001         int error;
 1002         struct md_s *sc;
 1003 
 1004         GIANT_REQUIRED;
 1005 
 1006         if (mdio->md_options & MD_AUTOUNIT) {
 1007                 sc = mdnew(-1);
 1008                 mdio->md_unit = sc->unit;
 1009         } else {
 1010                 sc = mdnew(mdio->md_unit);
 1011         }
 1012         if (sc == NULL)
 1013                 return (EBUSY);
 1014 
 1015         sc->type = MD_SWAP;
 1016 
 1017         /*
 1018          * Range check.  Disallow negative sizes or any size less then the
 1019          * size of a page.  Then round to a page.
 1020          */
 1021 
 1022         if (mdio->md_size == 0) {
 1023                 mddestroy(sc, td);
 1024                 return (EDOM);
 1025         }
 1026 
 1027         /*
 1028          * Allocate an OBJT_SWAP object.
 1029          *
 1030          * sc_nsect is in units of DEV_BSIZE.
 1031          * sc_npage is in units of PAGE_SIZE.
 1032          *
 1033          * Note the truncation.
 1034          */
 1035 
 1036         sc->secsize = DEV_BSIZE;
 1037         sc->npage = mdio->md_size / (PAGE_SIZE / DEV_BSIZE);
 1038         sc->nsect = sc->npage * (PAGE_SIZE / DEV_BSIZE);
 1039         if (mdio->md_fwsectors != 0)
 1040                 sc->fwsectors = mdio->md_fwsectors;
 1041         if (mdio->md_fwheads != 0)
 1042                 sc->fwheads = mdio->md_fwheads;
 1043         sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * 
 1044             (vm_offset_t)sc->npage, VM_PROT_DEFAULT, 0);
 1045         sc->flags = mdio->md_options & MD_FORCE;
 1046         if (mdio->md_options & MD_RESERVE) {
 1047                 if (swap_pager_reserve(sc->object, 0, sc->npage) < 0) {
 1048                         vm_object_deallocate(sc->object);
 1049                         sc->object = NULL;
 1050                         mddestroy(sc, td);
 1051                         return (EDOM);
 1052                 }
 1053         }
 1054         error = mdsetcred(sc, td->td_ucred);
 1055         if (error) {
 1056                 mddestroy(sc, td);
 1057                 return (error);
 1058         }
 1059         mdinit(sc);
 1060         if (!(mdio->md_options & MD_RESERVE))
 1061                 sc->pp->flags |= G_PF_CANDELETE;
 1062         return (0);
 1063 }
 1064 
 1065 static int
 1066 mddetach(int unit, struct thread *td)
 1067 {
 1068         struct md_s *sc;
 1069 
 1070         sc = mdfind(unit);
 1071         if (sc == NULL)
 1072                 return (ENOENT);
 1073         if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
 1074                 return (EBUSY);
 1075         switch(sc->type) {
 1076         case MD_VNODE:
 1077         case MD_SWAP:
 1078         case MD_MALLOC:
 1079         case MD_PRELOAD:
 1080                 return (mddestroy(sc, td));
 1081         default:
 1082                 return (EOPNOTSUPP);
 1083         }
 1084 }
 1085 
 1086 static int
 1087 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
 1088 {
 1089         struct md_ioctl *mdio;
 1090         struct md_s *sc;
 1091         int i;
 1092 
 1093         if (md_debug)
 1094                 printf("mdctlioctl(%s %lx %p %x %p)\n",
 1095                         devtoname(dev), cmd, addr, flags, td);
 1096 
 1097         /*
 1098          * We assert the version number in the individual ioctl
 1099          * handlers instead of out here because (a) it is possible we
 1100          * may add another ioctl in the future which doesn't read an
 1101          * mdio, and (b) the correct return value for an unknown ioctl
 1102          * is ENOIOCTL, not EINVAL.
 1103          */
 1104         mdio = (struct md_ioctl *)addr;
 1105         switch (cmd) {
 1106         case MDIOCATTACH:
 1107                 if (mdio->md_version != MDIOVERSION)
 1108                         return (EINVAL);
 1109                 switch (mdio->md_type) {
 1110                 case MD_MALLOC:
 1111                         return (mdcreate_malloc(mdio));
 1112                 case MD_PRELOAD:
 1113                         return (mdcreate_preload(mdio));
 1114                 case MD_VNODE:
 1115                         return (mdcreate_vnode(mdio, td));
 1116                 case MD_SWAP:
 1117                         return (mdcreate_swap(mdio, td));
 1118                 default:
 1119                         return (EINVAL);
 1120                 }
 1121         case MDIOCDETACH:
 1122                 if (mdio->md_version != MDIOVERSION)
 1123                         return (EINVAL);
 1124                 if (mdio->md_file != NULL || mdio->md_size != 0 ||
 1125                     mdio->md_options != 0)
 1126                         return (EINVAL);
 1127                 return (mddetach(mdio->md_unit, td));
 1128         case MDIOCQUERY:
 1129                 if (mdio->md_version != MDIOVERSION)
 1130                         return (EINVAL);
 1131                 sc = mdfind(mdio->md_unit);
 1132                 if (sc == NULL)
 1133                         return (ENOENT);
 1134                 mdio->md_type = sc->type;
 1135                 mdio->md_options = sc->flags;
 1136                 switch (sc->type) {
 1137                 case MD_MALLOC:
 1138                         mdio->md_size = sc->nsect;
 1139                         break;
 1140                 case MD_PRELOAD:
 1141                         mdio->md_size = sc->nsect;
 1142                         mdio->md_base = (uint64_t)(intptr_t)sc->pl_ptr;
 1143                         break;
 1144                 case MD_SWAP:
 1145                         mdio->md_size = sc->nsect;
 1146                         break;
 1147                 case MD_VNODE:
 1148                         mdio->md_size = sc->nsect;
 1149                         /* XXX fill this in */
 1150                         mdio->md_file = NULL;
 1151                         break;
 1152                 }
 1153                 return (0);
 1154         case MDIOCLIST:
 1155                 i = 1;
 1156                 LIST_FOREACH(sc, &md_softc_list, list) {
 1157                         if (i == MDNPAD - 1)
 1158                                 mdio->md_pad[i] = -1;
 1159                         else
 1160                                 mdio->md_pad[i++] = sc->unit;
 1161                 }
 1162                 mdio->md_pad[0] = i - 1;
 1163                 return (0);
 1164         default:
 1165                 return (ENOIOCTL);
 1166         };
 1167         return (ENOIOCTL);
 1168 }
 1169 
 1170 static void
 1171 md_preloaded(u_char *image, unsigned length)
 1172 {
 1173         struct md_s *sc;
 1174 
 1175         sc = mdnew(-1);
 1176         if (sc == NULL)
 1177                 return;
 1178         sc->type = MD_PRELOAD;
 1179         sc->secsize = DEV_BSIZE;
 1180         sc->nsect = length / DEV_BSIZE;
 1181         sc->pl_ptr = image;
 1182         sc->pl_len = length;
 1183 #ifdef MD_ROOT
 1184         if (sc->unit == 0)
 1185                 rootdevnames[0] = "ufs:/dev/md0";
 1186 #endif
 1187         mdinit(sc);
 1188 }
 1189 
 1190 static void
 1191 g_md_init(struct g_class *mp __unused)
 1192 {
 1193 
 1194         caddr_t mod;
 1195         caddr_t c;
 1196         u_char *ptr, *name, *type;
 1197         unsigned len;
 1198 
 1199         mod = NULL;
 1200         g_topology_unlock();
 1201 #ifdef MD_ROOT_SIZE
 1202         md_preloaded(mfs_root, MD_ROOT_SIZE*1024);
 1203 #endif
 1204         while ((mod = preload_search_next_name(mod)) != NULL) {
 1205                 name = (char *)preload_search_info(mod, MODINFO_NAME);
 1206                 type = (char *)preload_search_info(mod, MODINFO_TYPE);
 1207                 if (name == NULL)
 1208                         continue;
 1209                 if (type == NULL)
 1210                         continue;
 1211                 if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
 1212                         continue;
 1213                 c = preload_search_info(mod, MODINFO_ADDR);
 1214                 ptr = *(u_char **)c;
 1215                 c = preload_search_info(mod, MODINFO_SIZE);
 1216                 len = *(size_t *)c;
 1217                 printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
 1218                     MD_NAME, mdunits, name, len, ptr);
 1219                 md_preloaded(ptr, len);
 1220         }
 1221         status_dev = make_dev(&mdctl_cdevsw, 0xffff00ff, UID_ROOT, GID_WHEEL,
 1222             0600, MDCTL_NAME);
 1223         g_topology_lock();
 1224 }
 1225 
 1226 static void
 1227 g_md_fini(struct g_class *mp __unused)
 1228 {
 1229 
 1230         if (status_dev != NULL)
 1231                 destroy_dev(status_dev);
 1232 }

Cache object: 06e7813e5d0ebeb9df446aad96837147


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.