FreeBSD/Linux Kernel Cross Reference
sys/dev/md/md.c

    1 /*-
    2  * ----------------------------------------------------------------------------
    3  * "THE BEER-WARE LICENSE" (Revision 42):
    4  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
    5  * can do whatever you want with this stuff. If we meet some day, and you think
    6  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
    7  * ----------------------------------------------------------------------------
    8  *
    9  * $FreeBSD: releng/6.0/sys/dev/md/md.c 151065 2005-10-07 15:08:41Z phk $
   10  *
   11  */
   12 
   13 /*-
    14  * The following functions are based on the vn(4) driver: mdstart_swap(),
   15  * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
   16  * and as such under the following copyright:
   17  *
   18  * Copyright (c) 1988 University of Utah.
   19  * Copyright (c) 1990, 1993
   20  *      The Regents of the University of California.  All rights reserved.
   21  *
   22  * This code is derived from software contributed to Berkeley by
   23  * the Systems Programming Group of the University of Utah Computer
   24  * Science Department.
   25  *
   26  * Redistribution and use in source and binary forms, with or without
   27  * modification, are permitted provided that the following conditions
   28  * are met:
   29  * 1. Redistributions of source code must retain the above copyright
   30  *    notice, this list of conditions and the following disclaimer.
   31  * 2. Redistributions in binary form must reproduce the above copyright
   32  *    notice, this list of conditions and the following disclaimer in the
   33  *    documentation and/or other materials provided with the distribution.
   34  * 4. Neither the name of the University nor the names of its contributors
   35  *    may be used to endorse or promote products derived from this software
   36  *    without specific prior written permission.
   37  *
   38  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   41  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   48  * SUCH DAMAGE.
   49  *
   50  * from: Utah Hdr: vn.c 1.13 94/04/02
   51  *
   52  *      from: @(#)vn.c  8.6 (Berkeley) 4/1/94
   53  * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
   54  */
   55 
   56 #include "opt_geom.h"
   57 #include "opt_md.h"
   58 
   59 #include <sys/param.h>
   60 #include <sys/systm.h>
   61 #include <sys/bio.h>
   62 #include <sys/conf.h>
   63 #include <sys/fcntl.h>
   64 #include <sys/kernel.h>
   65 #include <sys/kthread.h>
   66 #include <sys/linker.h>
   67 #include <sys/lock.h>
   68 #include <sys/malloc.h>
   69 #include <sys/mdioctl.h>
   70 #include <sys/mutex.h>
   71 #include <sys/sx.h>
   72 #include <sys/namei.h>
   73 #include <sys/proc.h>
   74 #include <sys/queue.h>
   75 #include <sys/sched.h>
   76 #include <sys/sf_buf.h>
   77 #include <sys/sysctl.h>
   78 #include <sys/vnode.h>
   79 
   80 #include <geom/geom.h>
   81 
   82 #include <vm/vm.h>
   83 #include <vm/vm_object.h>
   84 #include <vm/vm_page.h>
   85 #include <vm/vm_pager.h>
   86 #include <vm/swap_pager.h>
   87 #include <vm/uma.h>
   88 
   89 #define MD_MODVER 1
   90 
   91 #define MD_SHUTDOWN 0x10000     /* Tell worker thread to terminate. */
   92 
   93 #ifndef MD_NSECT
   94 #define MD_NSECT (10000 * 2)
   95 #endif
   96 
   97 static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk");
   98 static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors");
   99 
  100 static int md_debug;
  101 SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
  102 
  103 #if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
  104 /* Image gets put here: */
  105 static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here";
  106 static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here";
  107 #endif
  108 
  109 static g_init_t g_md_init;
  110 static g_fini_t g_md_fini;
  111 static g_start_t g_md_start;
  112 static g_access_t g_md_access;
  113 
  114 static int      mdunits;
  115 static struct cdev *status_dev = 0;
  116 static struct sx md_sx;
  117 
  118 static d_ioctl_t mdctlioctl;
  119 
  120 static struct cdevsw mdctl_cdevsw = {
  121         .d_version =    D_VERSION,
  122         .d_ioctl =      mdctlioctl,
  123         .d_name =       MD_NAME,
  124 };
  125 
  126 struct g_class g_md_class = {
  127         .name = "MD",
  128         .version = G_VERSION,
  129         .init = g_md_init,
  130         .fini = g_md_fini,
  131         .start = g_md_start,
  132         .access = g_md_access,
  133 };
  134 
  135 DECLARE_GEOM_CLASS(g_md_class, g_md);
  136 
  137 
  138 static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);
  139 
  140 #define NINDIR  (PAGE_SIZE / sizeof(uintptr_t))
  141 #define NMASK   (NINDIR-1)
  142 static int nshift;
  143 
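       /*
        * Concretely (assuming a 64-bit machine with 4 KB pages):
        * sizeof(uintptr_t) is 8, so NINDIR is 4096 / 8 = 512 and NMASK
        * is 0x1ff; each indir node then fans out 512 ways and nshift
        * (computed in dimension() below) comes out as log2(512) = 9.
        */
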
  144 struct indir {
  145         uintptr_t       *array;
  146         u_int           total;
  147         u_int           used;
  148         u_int           shift;
  149 };
  150 
  151 struct md_s {
  152         int unit;
  153         LIST_ENTRY(md_s) list;
  154         struct bio_queue_head bio_queue;
  155         struct mtx queue_mtx;
  156         struct cdev *dev;
  157         enum md_types type;
  158         off_t mediasize;
  159         unsigned sectorsize;
  160         unsigned opencount;
  161         unsigned fwheads;
  162         unsigned fwsectors;
  163         unsigned flags;
  164         char name[20];
  165         struct proc *procp;
  166         struct g_geom *gp;
  167         struct g_provider *pp;
  168         int (*start)(struct md_s *sc, struct bio *bp);
  169 
  170         /* MD_MALLOC related fields */
  171         struct indir *indir;
  172         uma_zone_t uma;
  173 
  174         /* MD_PRELOAD related fields */
  175         u_char *pl_ptr;
  176         size_t pl_len;
  177 
  178         /* MD_VNODE related fields */
  179         struct vnode *vnode;
  180         char file[PATH_MAX];
  181         struct ucred *cred;
  182 
  183         /* MD_SWAP related fields */
  184         vm_object_t object;
  185 };
  186 
  187 static struct indir *
  188 new_indir(u_int shift)
  189 {
  190         struct indir *ip;
  191 
  192         ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO);
  193         if (ip == NULL)
  194                 return (NULL);
  195         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  196             M_MDSECT, M_NOWAIT | M_ZERO);
  197         if (ip->array == NULL) {
  198                 free(ip, M_MD);
  199                 return (NULL);
  200         }
  201         ip->total = NINDIR;
  202         ip->shift = shift;
  203         return (ip);
  204 }
  205 
  206 static void
  207 del_indir(struct indir *ip)
  208 {
  209 
  210         free(ip->array, M_MDSECT);
  211         free(ip, M_MD);
  212 }
  213 
  214 static void
  215 destroy_indir(struct md_s *sc, struct indir *ip)
  216 {
  217         int i;
  218 
  219         for (i = 0; i < NINDIR; i++) {
  220                 if (!ip->array[i])
  221                         continue;
  222                 if (ip->shift)
  223                         destroy_indir(sc, (struct indir*)(ip->array[i]));
  224                 else if (ip->array[i] > 255)
  225                         uma_zfree(sc->uma, (void *)(ip->array[i]));
  226         }
  227         del_indir(ip);
  228 }
  229 
  230 /*
   231  * This function does the math and allocates the top level "indir" structure
  232  * for a device of "size" sectors.
  233  */
  234 
  235 static struct indir *
  236 dimension(off_t size)
  237 {
  238         off_t rcnt;
  239         struct indir *ip;
  240         int i, layer;
  241 
  242         rcnt = size;
  243         layer = 0;
  244         while (rcnt > NINDIR) {
  245                 rcnt /= NINDIR;
  246                 layer++;
  247         }
  248         /* figure out log2(NINDIR) */
  249         for (i = NINDIR, nshift = -1; i; nshift++)
  250                 i >>= 1;
  251 
  252         /*
  253          * XXX: the top layer is probably not fully populated, so we allocate
  254          * too much space for ip->array in here.
  255          */
  256         ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
  257         ip->array = malloc(sizeof(uintptr_t) * NINDIR,
  258             M_MDSECT, M_WAITOK | M_ZERO);
  259         ip->total = NINDIR;
  260         ip->shift = layer * nshift;
  261         return (ip);
  262 }
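
       /*
        * A worked example under the 512-way assumption above: a 1 GB
        * device with 512-byte sectors holds 2097152 sectors.  The loop
        * reduces 2097152 -> 4096 -> 8, leaving layer == 2, so the root
        * node gets shift == 2 * 9 == 18 and each root slot spans
        * 2^18 == 262144 sectors; only 8 of the root's 512 slots are
        * ever used, which is the over-allocation the XXX above refers
        * to.
        */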
  263 
  264 /*
  265  * Read a given sector
  266  */
  267 
  268 static uintptr_t
  269 s_read(struct indir *ip, off_t offset)
  270 {
  271         struct indir *cip;
  272         int idx;
  273         uintptr_t up;
  274 
  275         if (md_debug > 1)
  276                 printf("s_read(%jd)\n", (intmax_t)offset);
  277         up = 0;
  278         for (cip = ip; cip != NULL;) {
  279                 if (cip->shift) {
  280                         idx = (offset >> cip->shift) & NMASK;
  281                         up = cip->array[idx];
  282                         cip = (struct indir *)up;
  283                         continue;
  284                 }
  285                 idx = offset & NMASK;
  286                 return (cip->array[idx]);
  287         }
  288         return (0);
  289 }
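
       /*
        * Continuing that example, looking up sector 0x12345 under a
        * root with shift 18 indexes slot (0x12345 >> 18) & 0x1ff == 0
        * at the root, (0x12345 >> 9) & 0x1ff == 0x91 one level down,
        * and 0x12345 & 0x1ff == 0x145 in the leaf; a NULL pointer at
        * any level short-circuits to 0, i.e. an unallocated sector.
        */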
  290 
  291 /*
  292  * Write a given sector, prune the tree if the value is 0
  293  */
  294 
  295 static int
  296 s_write(struct indir *ip, off_t offset, uintptr_t ptr)
  297 {
  298         struct indir *cip, *lip[10];
  299         int idx, li;
  300         uintptr_t up;
  301 
  302         if (md_debug > 1)
  303                 printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
  304         up = 0;
  305         li = 0;
  306         cip = ip;
  307         for (;;) {
  308                 lip[li++] = cip;
  309                 if (cip->shift) {
  310                         idx = (offset >> cip->shift) & NMASK;
  311                         up = cip->array[idx];
  312                         if (up != 0) {
  313                                 cip = (struct indir *)up;
  314                                 continue;
  315                         }
  316                         /* Allocate branch */
  317                         cip->array[idx] =
  318                             (uintptr_t)new_indir(cip->shift - nshift);
  319                         if (cip->array[idx] == 0)
  320                                 return (ENOSPC);
  321                         cip->used++;
  322                         up = cip->array[idx];
  323                         cip = (struct indir *)up;
  324                         continue;
  325                 }
  326                 /* leafnode */
  327                 idx = offset & NMASK;
  328                 up = cip->array[idx];
  329                 if (up != 0)
  330                         cip->used--;
  331                 cip->array[idx] = ptr;
  332                 if (ptr != 0)
  333                         cip->used++;
  334                 break;
  335         }
  336         if (cip->used != 0 || li == 1)
  337                 return (0);
  338         li--;
  339         while (cip->used == 0 && cip != ip) {
  340                 li--;
  341                 idx = (offset >> lip[li]->shift) & NMASK;
  342                 up = lip[li]->array[idx];
  343                 KASSERT(up == (uintptr_t)cip, ("md screwed up"));
  344                 del_indir(cip);
  345                 lip[li]->array[idx] = 0;
  346                 lip[li]->used--;
  347                 cip = lip[li];
  348         }
  349         return (0);
  350 }
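
       /*
        * The loop after the write walks back up the lip[] stack:
        * whenever a node's used count reaches zero it is freed and its
        * slot in the parent cleared, repeating until a still-populated
        * ancestor (or the root itself) is reached.  Storing 0 over a
        * whole region therefore returns its tree nodes to the system.
        */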
  351 
  352 
  353 static int
  354 g_md_access(struct g_provider *pp, int r, int w, int e)
  355 {
  356         struct md_s *sc;
  357 
  358         sc = pp->geom->softc;
  359         if (sc == NULL)
  360                 return (ENXIO);
  361         r += pp->acr;
  362         w += pp->acw;
  363         e += pp->ace;
  364         if ((sc->flags & MD_READONLY) != 0 && w > 0)
  365                 return (EROFS);
  366         if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
  367                 sc->opencount = 1;
  368         } else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
  369                 sc->opencount = 0;
  370         }
  371         return (0);
  372 }
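
       /*
        * opencount only records the zero <-> nonzero transitions of
        * the combined access counts; MDIOCDETACH consults it below to
        * refuse destroying a device that is still open unless MD_FORCE
        * was set at creation time.
        */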
  373 
  374 static void
  375 g_md_start(struct bio *bp)
  376 {
  377         struct md_s *sc;
  378 
  379         sc = bp->bio_to->geom->softc;
  380         mtx_lock(&sc->queue_mtx);
  381         bioq_disksort(&sc->bio_queue, bp);
  382         mtx_unlock(&sc->queue_mtx);
  383         wakeup(sc);
  384 }
  385 
  386 static int
  387 mdstart_malloc(struct md_s *sc, struct bio *bp)
  388 {
  389         int i, error;
  390         u_char *dst;
  391         off_t secno, nsec, uc;
  392         uintptr_t sp, osp;
  393 
  394         nsec = bp->bio_length / sc->sectorsize;
  395         secno = bp->bio_offset / sc->sectorsize;
  396         dst = bp->bio_data;
  397         error = 0;
  398         while (nsec--) {
  399                 osp = s_read(sc->indir, secno);
  400                 if (bp->bio_cmd == BIO_DELETE) {
  401                         if (osp != 0)
  402                                 error = s_write(sc->indir, secno, 0);
  403                 } else if (bp->bio_cmd == BIO_READ) {
  404                         if (osp == 0)
  405                                 bzero(dst, sc->sectorsize);
  406                         else if (osp <= 255)
  407                                 for (i = 0; i < sc->sectorsize; i++)
  408                                         dst[i] = osp;
  409                         else
  410                                 bcopy((void *)osp, dst, sc->sectorsize);
  411                         osp = 0;
  412                 } else if (bp->bio_cmd == BIO_WRITE) {
  413                         if (sc->flags & MD_COMPRESS) {
  414                                 uc = dst[0];
  415                                 for (i = 1; i < sc->sectorsize; i++)
  416                                         if (dst[i] != uc)
  417                                                 break;
  418                         } else {
  419                                 i = 0;
  420                                 uc = 0;
  421                         }
  422                         if (i == sc->sectorsize) {
  423                                 if (osp != uc)
  424                                         error = s_write(sc->indir, secno, uc);
  425                         } else {
  426                                 if (osp <= 255) {
  427                                         sp = (uintptr_t)uma_zalloc(sc->uma,
  428                                             M_NOWAIT);
  429                                         if (sp == 0) {
  430                                                 error = ENOSPC;
  431                                                 break;
  432                                         }
  433                                         bcopy(dst, (void *)sp, sc->sectorsize);
  434                                         error = s_write(sc->indir, secno, sp);
  435                                 } else {
  436                                         bcopy(dst, (void *)osp, sc->sectorsize);
  437                                         osp = 0;
  438                                 }
  439                         }
  440                 } else {
  441                         error = EOPNOTSUPP;
  442                 }
  443                 if (osp > 255)
  444                         uma_zfree(sc->uma, (void*)osp);
  445                 if (error != 0)
  446                         break;
  447                 secno++;
  448                 dst += sc->sectorsize;
  449         }
  450         bp->bio_resid = 0;
  451         return (error);
  452 }
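
       /*
        * The values stored in the indir tree above are overloaded:
        * 0 means "never written" (reads back as zeros), 1..255 encodes
        * a sector filled entirely with that byte (the MD_COMPRESS
        * case), and anything larger is a pointer to a real
        * sc->sectorsize buffer from the UMA zone.
        */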
  453 
  454 static int
  455 mdstart_preload(struct md_s *sc, struct bio *bp)
  456 {
  457 
  458         switch (bp->bio_cmd) {
  459         case BIO_READ:
  460                 bcopy(sc->pl_ptr + bp->bio_offset, bp->bio_data,
  461                     bp->bio_length);
  462                 break;
  463         case BIO_WRITE:
  464                 bcopy(bp->bio_data, sc->pl_ptr + bp->bio_offset,
  465                     bp->bio_length);
  466                 break;
  467         }
  468         bp->bio_resid = 0;
  469         return (0);
  470 }
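
       /*
        * Commands other than BIO_READ and BIO_WRITE (BIO_DELETE, for
        * instance) fall through the switch and complete as successful
        * no-ops, which is harmless for a preloaded image.
        */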
  471 
  472 static int
  473 mdstart_vnode(struct md_s *sc, struct bio *bp)
  474 {
  475         int error;
  476         struct uio auio;
  477         struct iovec aiov;
  478         struct mount *mp;
  479 
  480         mtx_assert(&Giant, MA_OWNED);
  481         /*
  482          * VNODE I/O
  483          *
  484          * If an error occurs, we set BIO_ERROR but we do not set
  485          * B_INVAL because (for a write anyway), the buffer is
  486          * still valid.
  487          */
  488 
  489         bzero(&auio, sizeof(auio));
  490 
  491         aiov.iov_base = bp->bio_data;
  492         aiov.iov_len = bp->bio_length;
  493         auio.uio_iov = &aiov;
  494         auio.uio_iovcnt = 1;
  495         auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
  496         auio.uio_segflg = UIO_SYSSPACE;
  497         if(bp->bio_cmd == BIO_READ)
  498                 auio.uio_rw = UIO_READ;
  499         else if(bp->bio_cmd == BIO_WRITE)
  500                 auio.uio_rw = UIO_WRITE;
  501         else
  502                 panic("wrong BIO_OP in mdstart_vnode");
  503         auio.uio_resid = bp->bio_length;
  504         auio.uio_td = curthread;
  505         /*
  506          * When reading set IO_DIRECT to try to avoid double-caching
  507          * the data.  When writing IO_DIRECT is not optimal.
  508          */
  509         if (bp->bio_cmd == BIO_READ) {
  510                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
  511                 error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
  512                 VOP_UNLOCK(sc->vnode, 0, curthread);
  513         } else {
  514                 (void) vn_start_write(sc->vnode, &mp, V_WAIT);
  515                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
  516                 error = VOP_WRITE(sc->vnode, &auio,
  517                     sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred);
  518                 VOP_UNLOCK(sc->vnode, 0, curthread);
  519                 vn_finished_write(mp);
  520         }
  521         bp->bio_resid = auio.uio_resid;
  522         return (error);
  523 }
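
       /*
        * Each request is mapped through a single kernel-space iovec,
        * so uio_resid after VOP_READ()/VOP_WRITE() directly yields
        * bp->bio_resid, the amount the backing filesystem did not
        * transfer.
        */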
  524 
  525 static int
  526 mdstart_swap(struct md_s *sc, struct bio *bp)
  527 {
  528         struct sf_buf *sf;
  529         int rv, offs, len, lastend;
  530         vm_pindex_t i, lastp;
  531         vm_page_t m;
  532         u_char *p;
  533 
  534         p = bp->bio_data;
  535 
  536         /*
   537          * offs is the offset at which to start operating on the
  538          * next (ie, first) page.  lastp is the last page on
  539          * which we're going to operate.  lastend is the ending
  540          * position within that last page (ie, PAGE_SIZE if
  541          * we're operating on complete aligned pages).
  542          */
  543         offs = bp->bio_offset % PAGE_SIZE;
  544         lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
  545         lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
  546 
  547         rv = VM_PAGER_OK;
  548         VM_OBJECT_LOCK(sc->object);
  549         vm_object_pip_add(sc->object, 1);
  550         for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
  551                 len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
  552 
  553                 m = vm_page_grab(sc->object, i,
  554                     VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
  555                 VM_OBJECT_UNLOCK(sc->object);
  556                 sched_pin();
  557                 sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
  558                 VM_OBJECT_LOCK(sc->object);
  559                 if (bp->bio_cmd == BIO_READ) {
  560                         if (m->valid != VM_PAGE_BITS_ALL)
  561                                 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
  562                         if (rv == VM_PAGER_ERROR) {
  563                                 sf_buf_free(sf);
  564                                 sched_unpin();
  565                                 vm_page_lock_queues();
  566                                 vm_page_wakeup(m);
  567                                 vm_page_unlock_queues();
  568                                 break;
  569                         }
  570                         bcopy((void *)(sf_buf_kva(sf) + offs), p, len);
  571                 } else if (bp->bio_cmd == BIO_WRITE) {
  572                         if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
  573                                 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
  574                         if (rv == VM_PAGER_ERROR) {
  575                                 sf_buf_free(sf);
  576                                 sched_unpin();
  577                                 vm_page_lock_queues();
  578                                 vm_page_wakeup(m);
  579                                 vm_page_unlock_queues();
  580                                 break;
  581                         }
  582                         bcopy(p, (void *)(sf_buf_kva(sf) + offs), len);
  583                         m->valid = VM_PAGE_BITS_ALL;
  584 #if 0
  585                 } else if (bp->bio_cmd == BIO_DELETE) {
  586                         if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
  587                                 rv = vm_pager_get_pages(sc->object, &m, 1, 0);
  588                         if (rv == VM_PAGER_ERROR) {
  589                                 sf_buf_free(sf);
  590                                 sched_unpin();
  591                                 vm_page_lock_queues();
  592                                 vm_page_wakeup(m);
  593                                 vm_page_unlock_queues();
  594                                 break;
  595                         }
  596                         bzero((void *)(sf_buf_kva(sf) + offs), len);
  597                         vm_page_dirty(m);
  598                         m->valid = VM_PAGE_BITS_ALL;
  599 #endif
  600                 }
  601                 sf_buf_free(sf);
  602                 sched_unpin();
  603                 vm_page_lock_queues();
  604                 vm_page_wakeup(m);
  605                 vm_page_activate(m);
  606                 if (bp->bio_cmd == BIO_WRITE)
  607                         vm_page_dirty(m);
  608                 vm_page_unlock_queues();
  609 
  610                 /* Actions on further pages start at offset 0 */
  611                 p += PAGE_SIZE - offs;
  612                 offs = 0;
  613 #if 0
  614 if (bootverbose || bp->bio_offset / PAGE_SIZE < 17)
  615 printf("wire_count %d busy %d flags %x hold_count %d act_count %d queue %d valid %d dirty %d @ %d\n",
  616     m->wire_count, m->busy, 
  617     m->flags, m->hold_count, m->act_count, m->queue, m->valid, m->dirty, i);
  618 #endif
  619         }
  620         vm_object_pip_subtract(sc->object, 1);
  621         vm_object_set_writeable_dirty(sc->object);
  622         VM_OBJECT_UNLOCK(sc->object);
  623         return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
  624 }
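
       /*
        * In outline: each page the request touches is grabbed busy
        * from the swap-backed object, temporarily mapped through an
        * sf_buf, paged in first when a read (or a partial-page write)
        * needs the existing contents, and then copied to or from the
        * bio's buffer.  Dirtied pages are left for the swap pager to
        * push out lazily.
        */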
  625 
  626 static void
  627 md_kthread(void *arg)
  628 {
  629         struct md_s *sc;
  630         struct bio *bp;
  631         int error, hasgiant;
  632 
  633         sc = arg;
  634         mtx_lock_spin(&sched_lock);
  635         sched_prio(curthread, PRIBIO);
  636         mtx_unlock_spin(&sched_lock);
  637 
  638         switch (sc->type) {
  639         case MD_VNODE:
  640                 mtx_lock(&Giant);
  641                 hasgiant = 1;
  642                 break;
  643         case MD_MALLOC:
  644         case MD_PRELOAD:
  645         case MD_SWAP:
  646         default:
  647                 hasgiant = 0;
  648                 break;
  649         }
  650         
  651         for (;;) {
  652                 if (sc->flags & MD_SHUTDOWN) {
  653                         sc->procp = NULL;
  654                         wakeup(&sc->procp);
  655                         if (hasgiant)
  656                                 mtx_unlock(&Giant);
  657                         kthread_exit(0);
  658                 }
  659                 mtx_lock(&sc->queue_mtx);
  660                 bp = bioq_takefirst(&sc->bio_queue);
  661                 if (!bp) {
  662                         msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
  663                         continue;
  664                 }
  665                 mtx_unlock(&sc->queue_mtx);
  666                 if (bp->bio_cmd == BIO_GETATTR) {
  667                         if (sc->fwsectors && sc->fwheads &&
  668                             (g_handleattr_int(bp, "GEOM::fwsectors",
  669                             sc->fwsectors) ||
  670                             g_handleattr_int(bp, "GEOM::fwheads",
  671                             sc->fwheads)))
  672                                 error = -1;
  673                         else
  674                                 error = EOPNOTSUPP;
  675                 } else {
  676                         error = sc->start(sc, bp);
  677                 }
  678 
  679                 if (error != -1) {
  680                         bp->bio_completed = bp->bio_length;
  681                         g_io_deliver(bp, error);
  682                 }
  683         }
  684 }
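
       /*
        * The -1 "error" above is a private convention: it marks bios
        * that g_handleattr_int() has already completed, so the worker
        * must not deliver them a second time.
        */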
  685 
  686 static struct md_s *
  687 mdfind(int unit)
  688 {
  689         struct md_s *sc;
  690 
  691         LIST_FOREACH(sc, &md_softc_list, list) {
  692                 if (sc->unit == unit)
  693                         break;
  694         }
  695         return (sc);
  696 }
  697 
  698 static struct md_s *
  699 mdnew(int unit, int *errp, enum md_types type)
  700 {
  701         struct md_s *sc, *sc2;
  702         int error, max = -1;
  703 
  704         *errp = 0;
  705         LIST_FOREACH(sc2, &md_softc_list, list) {
  706                 if (unit == sc2->unit) {
  707                         *errp = EBUSY;
  708                         return (NULL);
  709                 }
  710                 if (unit == -1 && sc2->unit > max) 
  711                         max = sc2->unit;
  712         }
  713         if (unit == -1)
  714                 unit = max + 1;
  715         sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
  716         sc->type = type;
  717         bioq_init(&sc->bio_queue);
  718         mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
  719         sc->unit = unit;
  720         sprintf(sc->name, "md%d", unit);
  721         LIST_INSERT_HEAD(&md_softc_list, sc, list);
  722         error = kthread_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
  723         if (error == 0)
  724                 return (sc);
  725         LIST_REMOVE(sc, list);
  726         mtx_destroy(&sc->queue_mtx);
  727         free(sc, M_MD);
  728         *errp = error;
  729         return (NULL);
  730 }
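
       /*
        * For MD_AUTOUNIT attaches, unit arrives as -1 and the new
        * device gets one more than the highest unit currently in use;
        * the softc list is unsorted, hence the single scan doing both
        * the collision check and the maximum.
        */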
  731 
  732 static void
  733 mdinit(struct md_s *sc)
  734 {
  735 
  736         struct g_geom *gp;
  737         struct g_provider *pp;
  738 
  739         g_topology_lock();
  740         gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
  741         gp->softc = sc;
  742         pp = g_new_providerf(gp, "md%d", sc->unit);
  743         pp->mediasize = sc->mediasize;
  744         pp->sectorsize = sc->sectorsize;
  745         sc->gp = gp;
  746         sc->pp = pp;
  747         g_error_provider(pp, 0);
  748         g_topology_unlock();
  749 }
  750 
  751 /*
  752  * XXX: we should check that the range they feed us is mapped.
  753  * XXX: we should implement read-only.
  754  */
  755 
  756 static int
  757 mdcreate_preload(struct md_s *sc, struct md_ioctl *mdio)
  758 {
  759 
  760         if (mdio->md_options & ~(MD_AUTOUNIT | MD_FORCE))
  761                 return (EINVAL);
  762         sc->flags = mdio->md_options & MD_FORCE;
  763         /* Cast to pointer size, then to pointer to avoid warning */
  764         sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
  765         sc->pl_len = (size_t)sc->mediasize;
  766         return (0);
  767 }
  768 
  769 
  770 static int
  771 mdcreate_malloc(struct md_s *sc, struct md_ioctl *mdio)
  772 {
  773         uintptr_t sp;
  774         int error;
  775         off_t u;
  776 
  777         error = 0;
  778         if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
  779                 return (EINVAL);
  780         if (mdio->md_sectorsize != 0 && !powerof2(mdio->md_sectorsize))
  781                 return (EINVAL);
  782         /* Compression doesn't make sense if we have reserved space */
  783         if (mdio->md_options & MD_RESERVE)
  784                 mdio->md_options &= ~MD_COMPRESS;
  785         if (mdio->md_fwsectors != 0)
  786                 sc->fwsectors = mdio->md_fwsectors;
  787         if (mdio->md_fwheads != 0)
  788                 sc->fwheads = mdio->md_fwheads;
  789         sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
  790         sc->indir = dimension(sc->mediasize / sc->sectorsize);
  791         sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
  792             0x1ff, 0);
  793         if (mdio->md_options & MD_RESERVE) {
  794                 off_t nsectors;
  795 
  796                 nsectors = sc->mediasize / sc->sectorsize;
  797                 for (u = 0; u < nsectors; u++) {
  798                         sp = (uintptr_t)uma_zalloc(sc->uma, M_NOWAIT | M_ZERO);
  799                         if (sp != 0)
  800                                 error = s_write(sc->indir, u, sp);
  801                         else
  802                                 error = ENOMEM;
  803                         if (error != 0)
  804                                 break;
  805                 }
  806         }
  807         return (error);
  808 }
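
       /*
        * The 0x1ff alignment mask handed to uma_zcreate() forces
        * 512-byte alignment on sector buffers, which also guarantees
        * that a nonzero zone pointer can never fall into the 1..255
        * range reserved for the fill-byte encoding in mdstart_malloc().
        */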
  809 
  810 
  811 static int
  812 mdsetcred(struct md_s *sc, struct ucred *cred)
  813 {
  814         char *tmpbuf;
  815         int error = 0;
  816 
  817         /*
  818          * Set credits in our softc
  819          */
  820 
  821         if (sc->cred)
  822                 crfree(sc->cred);
  823         sc->cred = crhold(cred);
  824 
  825         /*
  826          * Horrible kludge to establish credentials for NFS  XXX.
  827          */
  828 
  829         if (sc->vnode) {
  830                 struct uio auio;
  831                 struct iovec aiov;
  832 
  833                 tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
  834                 bzero(&auio, sizeof(auio));
  835 
  836                 aiov.iov_base = tmpbuf;
  837                 aiov.iov_len = sc->sectorsize;
  838                 auio.uio_iov = &aiov;
  839                 auio.uio_iovcnt = 1;
  840                 auio.uio_offset = 0;
  841                 auio.uio_rw = UIO_READ;
  842                 auio.uio_segflg = UIO_SYSSPACE;
  843                 auio.uio_resid = aiov.iov_len;
  844                 vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
  845                 error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
  846                 VOP_UNLOCK(sc->vnode, 0, curthread);
  847                 free(tmpbuf, M_TEMP);
  848         }
  849         return (error);
  850 }
  851 
  852 static int
  853 mdcreate_vnode(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
  854 {
  855         struct vattr vattr;
  856         struct nameidata nd;
  857         int error, flags;
  858 
  859         error = copyinstr(mdio->md_file, sc->file, sizeof(sc->file), NULL);
  860         if (error != 0)
  861                 return (error);
  862         flags = FREAD|FWRITE;
  863         /*
  864          * If the user specified that this is a read only device, unset the
  865          * FWRITE mask before trying to open the backing store.
  866          */
  867         if ((mdio->md_options & MD_READONLY) != 0)
  868                 flags &= ~FWRITE;
  869         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
  870         error = vn_open(&nd, &flags, 0, -1);
  871         NDFREE(&nd, NDF_ONLY_PNBUF);
  872         if (error != 0)
  873                 return (error);
  874         if (nd.ni_vp->v_type != VREG ||
  875             (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) {
  876                 VOP_UNLOCK(nd.ni_vp, 0, td);
  877                 (void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
  878                 return (error ? error : EINVAL);
  879         }
  880         VOP_UNLOCK(nd.ni_vp, 0, td);
  881 
  882         if (mdio->md_fwsectors != 0)
  883                 sc->fwsectors = mdio->md_fwsectors;
  884         if (mdio->md_fwheads != 0)
  885                 sc->fwheads = mdio->md_fwheads;
  886         sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC);
  887         if (!(flags & FWRITE))
  888                 sc->flags |= MD_READONLY;
  889         sc->vnode = nd.ni_vp;
  890 
  891         error = mdsetcred(sc, td->td_ucred);
  892         if (error != 0) {
  893                 (void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
  894                 return (error);
  895         }
  896         return (0);
  897 }
  898 
  899 static int
  900 mddestroy(struct md_s *sc, struct thread *td)
  901 {
  902 
  903 
  904         if (sc->gp) {
  905                 sc->gp->softc = NULL;
  906                 g_topology_lock();
  907                 g_wither_geom(sc->gp, ENXIO);
  908                 g_topology_unlock();
  909                 sc->gp = NULL;
  910                 sc->pp = NULL;
  911         }
  912         sc->flags |= MD_SHUTDOWN;
  913         wakeup(sc);
  914         while (sc->procp != NULL)
  915                 tsleep(&sc->procp, PRIBIO, "mddestroy", hz / 10);
  916         mtx_destroy(&sc->queue_mtx);
  917         if (sc->vnode != NULL) {
  918                 mtx_lock(&Giant);
  919                 (void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
  920                     FREAD : (FREAD|FWRITE), sc->cred, td);
  921                 mtx_unlock(&Giant);
  922         }
  923         if (sc->cred != NULL)
  924                 crfree(sc->cred);
  925         if (sc->object != NULL)
  926                 vm_object_deallocate(sc->object);
  927         if (sc->indir)
  928                 destroy_indir(sc, sc->indir);
  929         if (sc->uma)
  930                 uma_zdestroy(sc->uma);
  931 
  932         LIST_REMOVE(sc, list);
  933         free(sc, M_MD);
  934         return (0);
  935 }
  936 
  937 static int
  938 mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
  939 {
  940         vm_ooffset_t npage;
  941         int error;
  942 
  943         /*
   944          * Range check.  Disallow negative sizes or any size less than the
  945          * size of a page.  Then round to a page.
  946          */
  947         if (sc->mediasize == 0 || (sc->mediasize % PAGE_SIZE) != 0)
  948                 return (EDOM);
  949 
  950         /*
  951          * Allocate an OBJT_SWAP object.
  952          *
  953          * Note the truncation.
  954          */
  955 
  956         npage = mdio->md_mediasize / PAGE_SIZE;
  957         if (mdio->md_fwsectors != 0)
  958                 sc->fwsectors = mdio->md_fwsectors;
  959         if (mdio->md_fwheads != 0)
  960                 sc->fwheads = mdio->md_fwheads;
  961         sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
  962             VM_PROT_DEFAULT, 0);
  963         if (sc->object == NULL)
  964                 return (ENOMEM);
  965         sc->flags = mdio->md_options & MD_FORCE;
  966         if (mdio->md_options & MD_RESERVE) {
  967                 if (swap_pager_reserve(sc->object, 0, npage) < 0) {
  968                         vm_object_deallocate(sc->object);
  969                         sc->object = NULL;
  970                         return (EDOM);
  971                 }
  972         }
  973         error = mdsetcred(sc, td->td_ucred);
  974         if (error != 0) {
  975                 vm_object_deallocate(sc->object);
  976                 sc->object = NULL;
  977         }
  978         return (error);
  979 }
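
       /*
        * With MD_RESERVE the swap space is set aside up front and the
        * attach fails immediately (EDOM) if it cannot be; without it,
        * backing pages are allocated lazily at write time, so a full
        * swap area surfaces later as ENOSPC on the bio instead.
        */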
  980 
  981 
  982 static int
  983 xmdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
  984 {
  985         struct md_ioctl *mdio;
  986         struct md_s *sc;
  987         int error, i;
  988 
  989         if (md_debug)
  990                 printf("mdctlioctl(%s %lx %p %x %p)\n",
  991                         devtoname(dev), cmd, addr, flags, td);
  992 
  993         mdio = (struct md_ioctl *)addr;
  994         if (mdio->md_version != MDIOVERSION)
  995                 return (EINVAL);
  996 
  997         /*
  998          * We assert the version number in the individual ioctl
  999          * handlers instead of out here because (a) it is possible we
 1000          * may add another ioctl in the future which doesn't read an
 1001          * mdio, and (b) the correct return value for an unknown ioctl
 1002          * is ENOIOCTL, not EINVAL.
 1003          */
 1004         error = 0;
 1005         switch (cmd) {
 1006         case MDIOCATTACH:
 1007                 switch (mdio->md_type) {
 1008                 case MD_MALLOC:
 1009                 case MD_PRELOAD:
 1010                 case MD_VNODE:
 1011                 case MD_SWAP:
 1012                         break;
 1013                 default:
 1014                         return (EINVAL);
 1015                 }
 1016                 if (mdio->md_options & MD_AUTOUNIT)
 1017                         sc = mdnew(-1, &error, mdio->md_type);
 1018                 else
 1019                         sc = mdnew(mdio->md_unit, &error, mdio->md_type);
 1020                 if (sc == NULL)
 1021                         return (error);
 1022                 if (mdio->md_options & MD_AUTOUNIT)
 1023                         mdio->md_unit = sc->unit;
 1024                 sc->mediasize = mdio->md_mediasize;
 1025                 if (mdio->md_sectorsize == 0)
 1026                         sc->sectorsize = DEV_BSIZE;
 1027                 else
 1028                         sc->sectorsize = mdio->md_sectorsize;
 1029                 error = EDOOFUS;
 1030                 switch (sc->type) {
 1031                 case MD_MALLOC:
 1032                         sc->start = mdstart_malloc;
 1033                         error = mdcreate_malloc(sc, mdio);
 1034                         break;
 1035                 case MD_PRELOAD:
 1036                         sc->start = mdstart_preload;
 1037                         error = mdcreate_preload(sc, mdio);
 1038                         break;
 1039                 case MD_VNODE:
 1040                         sc->start = mdstart_vnode;
 1041                         error = mdcreate_vnode(sc, mdio, td);
 1042                         break;
 1043                 case MD_SWAP:
 1044                         sc->start = mdstart_swap;
 1045                         error = mdcreate_swap(sc, mdio, td);
 1046                         break;
 1047                 }
 1048                 if (error != 0) {
 1049                         mddestroy(sc, td);
 1050                         return (error);
 1051                 }
 1052 
 1053                 /* Prune off any residual fractional sector */
 1054                 i = sc->mediasize % sc->sectorsize;
 1055                 sc->mediasize -= i;
 1056 
 1057                 mdinit(sc);
 1058                 return (0);
 1059         case MDIOCDETACH:
 1060                 if (mdio->md_mediasize != 0 || mdio->md_options != 0)
 1061                         return (EINVAL);
 1062 
 1063                 sc = mdfind(mdio->md_unit);
 1064                 if (sc == NULL)
 1065                         return (ENOENT);
 1066                 if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
 1067                         return (EBUSY);
 1068                 return (mddestroy(sc, td));
 1069         case MDIOCQUERY:
 1070                 sc = mdfind(mdio->md_unit);
 1071                 if (sc == NULL)
 1072                         return (ENOENT);
 1073                 mdio->md_type = sc->type;
 1074                 mdio->md_options = sc->flags;
 1075                 mdio->md_mediasize = sc->mediasize;
 1076                 mdio->md_sectorsize = sc->sectorsize;
 1077                 if (sc->type == MD_VNODE)
 1078                         error = copyout(sc->file, mdio->md_file,
 1079                             strlen(sc->file) + 1);
 1080                 return (error);
 1081         case MDIOCLIST:
 1082                 i = 1;
 1083                 LIST_FOREACH(sc, &md_softc_list, list) {
 1084                         if (i == MDNPAD - 1)
 1085                                 mdio->md_pad[i] = -1;
 1086                         else
 1087                                 mdio->md_pad[i++] = sc->unit;
 1088                 }
 1089                 mdio->md_pad[0] = i - 1;
 1090                 return (0);
 1091         default:
 1092                 return (ENOIOCTL);
  1093         }
 1094 }
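
       /*
        * These ioctls are normally driven by mdconfig(8); for example
        * (illustrative invocations) "mdconfig -a -t swap -s 64m" or
        * "mdconfig -a -t vnode -f /some/image" fill in a struct
        * md_ioctl and issue MDIOCATTACH on /dev/mdctl.
        */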
 1095 
 1096 static int
 1097 mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
 1098 {
 1099         int error; 
 1100 
 1101         sx_xlock(&md_sx);
 1102         error = xmdctlioctl(dev, cmd, addr, flags, td);
 1103         sx_xunlock(&md_sx);
 1104         return (error);
 1105 }
 1106 
 1107 static void
 1108 md_preloaded(u_char *image, size_t length)
 1109 {
 1110         struct md_s *sc;
 1111         int error;
 1112 
 1113         sc = mdnew(-1, &error, MD_PRELOAD);
 1114         if (sc == NULL)
 1115                 return;
 1116         sc->mediasize = length;
 1117         sc->sectorsize = DEV_BSIZE;
 1118         sc->pl_ptr = image;
 1119         sc->pl_len = length;
 1120         sc->start = mdstart_preload;
 1121 #ifdef MD_ROOT
 1122         if (sc->unit == 0)
 1123                 rootdevnames[0] = "ufs:/dev/md0";
 1124 #endif
 1125         mdinit(sc);
 1126 }
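
       /*
        * With MD_ROOT configured, unit 0 created here becomes the
        * root file system candidate: rootdevnames[0] is pointed at
        * "ufs:/dev/md0" before the device is published by mdinit().
        */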
 1127 
 1128 static void
 1129 g_md_init(struct g_class *mp __unused)
 1130 {
 1131 
 1132         caddr_t mod;
 1133         caddr_t c;
 1134         u_char *ptr, *name, *type;
 1135         unsigned len;
 1136 
 1137         mod = NULL;
 1138         sx_init(&md_sx, "MD config lock");
 1139         g_topology_unlock();
 1140 #ifdef MD_ROOT_SIZE
 1141         sx_xlock(&md_sx);
 1142         md_preloaded(mfs_root, MD_ROOT_SIZE * 1024);
 1143         sx_xunlock(&md_sx);
 1144 #endif
 1145         /* XXX: are preload_* static or do they need Giant ? */
 1146         while ((mod = preload_search_next_name(mod)) != NULL) {
 1147                 name = (char *)preload_search_info(mod, MODINFO_NAME);
 1148                 if (name == NULL)
 1149                         continue;
 1150                 type = (char *)preload_search_info(mod, MODINFO_TYPE);
 1151                 if (type == NULL)
 1152                         continue;
 1153                 if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
 1154                         continue;
 1155                 c = preload_search_info(mod, MODINFO_ADDR);
 1156                 ptr = *(u_char **)c;
 1157                 c = preload_search_info(mod, MODINFO_SIZE);
 1158                 len = *(size_t *)c;
 1159                 printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
 1160                     MD_NAME, mdunits, name, len, ptr);
 1161                 sx_xlock(&md_sx);
 1162                 md_preloaded(ptr, len);
 1163                 sx_xunlock(&md_sx);
 1164         }
 1165         status_dev = make_dev(&mdctl_cdevsw, MAXMINOR, UID_ROOT, GID_WHEEL,
 1166             0600, MDCTL_NAME);
 1167         g_topology_lock();
 1168 }
 1169 
 1170 static void
 1171 g_md_fini(struct g_class *mp __unused)
 1172 {
 1173 
 1174         sx_destroy(&md_sx);
 1175         if (status_dev != NULL)
 1176                 destroy_dev(status_dev);
 1177 }
