FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c


    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   23  *
   24  * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
   25  * All rights reserved.
   26  *
   27  * Portions Copyright 2010 Robert Milkowski
   28  *
   29  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
   30  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
   31  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
   32  * Copyright (c) 2014 Integros [integros.com]
   33  */
   34 
   35 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
   36 
   37 /*
   38  * ZFS volume emulation driver.
   39  *
   40  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
   41  * Volumes are accessed through the symbolic links named:
   42  *
   43  * /dev/zvol/<pool_name>/<dataset_name>
   44  *
   45  * Volumes are persistent through reboot.  No user command needs to be
   46  * run before opening and using a device.
   47  *
    48  * On FreeBSD, ZVOLs are either GEOM providers like any other storage
    49  * device in the system, or plain character devices when volmode=dev.
   50  */
   51 
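For orientation, a minimal userland sketch of consuming a zvol through the
device node described above ("tank/vol0" is a hypothetical dataset, created
e.g. with "zfs create -V 1g tank/vol0"); reads land in
zvol_geom_bio_strategy() or zvol_cdev_read() below:

        #include <sys/types.h>
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(void)
        {
                char buf[512];
                /* Hypothetical pool/dataset name. */
                int fd = open("/dev/zvol/tank/vol0", O_RDONLY);

                if (fd < 0)
                        return (1);
                /* One sector read through the zvol minor. */
                if (pread(fd, buf, sizeof (buf), 0) != (ssize_t)sizeof (buf))
                        perror("pread");
                close(fd);
                return (0);
        }
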
   52 #include <sys/types.h>
   53 #include <sys/param.h>
   54 #include <sys/kernel.h>
   55 #include <sys/errno.h>
   56 #include <sys/uio.h>
   57 #include <sys/bio.h>
   58 #include <sys/buf.h>
   59 #include <sys/kmem.h>
   60 #include <sys/conf.h>
   61 #include <sys/cmn_err.h>
   62 #include <sys/stat.h>
   63 #include <sys/proc.h>
   64 #include <sys/zap.h>
   65 #include <sys/spa.h>
   66 #include <sys/spa_impl.h>
   67 #include <sys/zio.h>
   68 #include <sys/disk.h>
   69 #include <sys/dmu_traverse.h>
   70 #include <sys/dnode.h>
   71 #include <sys/dsl_dataset.h>
   72 #include <sys/dsl_prop.h>
   73 #include <sys/dsl_dir.h>
   74 #include <sys/byteorder.h>
   75 #include <sys/sunddi.h>
   76 #include <sys/dirent.h>
   77 #include <sys/policy.h>
   78 #include <sys/queue.h>
   79 #include <sys/fs/zfs.h>
   80 #include <sys/zfs_ioctl.h>
   81 #include <sys/zil.h>
   82 #include <sys/zfs_znode.h>
   83 #include <sys/zfs_rlock.h>
   84 #include <sys/vdev_impl.h>
   85 #include <sys/vdev_raidz.h>
   86 #include <sys/zvol.h>
   87 #include <sys/zil_impl.h>
   88 #include <sys/dataset_kstats.h>
   89 #include <sys/dbuf.h>
   90 #include <sys/dmu_tx.h>
   91 #include <sys/zfeature.h>
   92 #include <sys/zio_checksum.h>
   93 #include <sys/zil_impl.h>
   94 #include <sys/filio.h>
   95 #include <sys/freebsd_event.h>
   96 
   97 #include <geom/geom.h>
   98 #include <sys/zvol.h>
   99 #include <sys/zvol_impl.h>
  100 
  101 #include "zfs_namecheck.h"
  102 
  103 #define ZVOL_DUMPSIZE           "dumpsize"
  104 
  105 #ifdef ZVOL_LOCK_DEBUG
  106 #define ZVOL_RW_READER          RW_WRITER
  107 #define ZVOL_RW_READ_HELD       RW_WRITE_HELD
  108 #else
  109 #define ZVOL_RW_READER          RW_READER
  110 #define ZVOL_RW_READ_HELD       RW_READ_HELD
  111 #endif
  112 
  113 enum zvol_geom_state {
  114         ZVOL_GEOM_UNINIT,
  115         ZVOL_GEOM_STOPPED,
  116         ZVOL_GEOM_RUNNING,
  117 };
  118 
  119 struct zvol_state_os {
  120 #define zso_dev         _zso_state._zso_dev
  121 #define zso_geom        _zso_state._zso_geom
  122         union {
  123                 /* volmode=dev */
  124                 struct zvol_state_dev {
  125                         struct cdev *zsd_cdev;
  126                         uint64_t zsd_sync_cnt;
  127                         struct selinfo zsd_selinfo;
  128                 } _zso_dev;
  129 
  130                 /* volmode=geom */
  131                 struct zvol_state_geom {
  132                         struct g_provider *zsg_provider;
  133                         struct bio_queue_head zsg_queue;
  134                         struct mtx zsg_queue_mtx;
  135                         enum zvol_geom_state zsg_state;
  136                 } _zso_geom;
  137         } _zso_state;
  138         int zso_dying;
  139 };
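/*
 * Note: _zso_state is a union discriminated by zv->zv_volmode
 * (ZFS_VOLMODE_GEOM vs. ZFS_VOLMODE_DEV); only the arm matching the
 * volmode is ever valid, so the zso_dev and zso_geom accessors must
 * not be mixed on the same zvol.
 */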
  140 
  141 static uint32_t zvol_minors;
  142 
  143 SYSCTL_DECL(_vfs_zfs);
  144 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
  145 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
  146         "Expose as GEOM providers (1), device files (2) or neither");
  147 static boolean_t zpool_on_zvol = B_FALSE;
  148 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
  149         "Allow zpools to use zvols as vdevs (DANGEROUS)");
  150 
  151 /*
  152  * Toggle unmap functionality.
  153  */
  154 boolean_t zvol_unmap_enabled = B_TRUE;
  155 
  156 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
  157         &zvol_unmap_enabled, 0, "Enable UNMAP functionality");
  158 
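These knobs are CTLFLAG_RWTUN, so they can be set as loader tunables or
changed at runtime. A minimal sketch of flipping one from userland with
sysctlbyname(3), assuming sufficient privilege:

        #include <sys/types.h>
        #include <sys/sysctl.h>

        /* Disable BIO_DELETE/UNMAP handling (vfs.zfs.vol.unmap_enabled=0). */
        static int
        disable_zvol_unmap(void)
        {
                int off = 0;

                return (sysctlbyname("vfs.zfs.vol.unmap_enabled", NULL, NULL,
                    &off, sizeof (off)));
        }
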
  159 /*
  160  * zvol maximum transfer in one DMU tx.
  161  */
  162 int zvol_maxphys = DMU_MAX_ACCESS / 2;
  163 
  164 static void zvol_ensure_zilog(zvol_state_t *zv);
  165 
  166 static d_open_t         zvol_cdev_open;
  167 static d_close_t        zvol_cdev_close;
  168 static d_ioctl_t        zvol_cdev_ioctl;
  169 static d_read_t         zvol_cdev_read;
  170 static d_write_t        zvol_cdev_write;
  171 static d_strategy_t     zvol_geom_bio_strategy;
  172 static d_kqfilter_t     zvol_cdev_kqfilter;
  173 
  174 static struct cdevsw zvol_cdevsw = {
  175         .d_name =       "zvol",
  176         .d_version =    D_VERSION,
  177         .d_flags =      D_DISK | D_TRACKCLOSE,
  178         .d_open =       zvol_cdev_open,
  179         .d_close =      zvol_cdev_close,
  180         .d_ioctl =      zvol_cdev_ioctl,
  181         .d_read =       zvol_cdev_read,
  182         .d_write =      zvol_cdev_write,
  183         .d_strategy =   zvol_geom_bio_strategy,
  184         .d_kqfilter =   zvol_cdev_kqfilter,
  185 };
  186 
  187 static void             zvol_filter_detach(struct knote *kn);
  188 static int              zvol_filter_vnode(struct knote *kn, long hint);
  189 
  190 static struct filterops zvol_filterops_vnode = {
  191         .f_isfd = 1,
  192         .f_detach = zvol_filter_detach,
  193         .f_event = zvol_filter_vnode,
  194 };
  195 
  196 extern uint_t zfs_geom_probe_vdev_key;
  197 
  198 struct g_class zfs_zvol_class = {
  199         .name = "ZFS::ZVOL",
  200         .version = G_VERSION,
  201 };
  202 
  203 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
  204 
  205 static int zvol_geom_open(struct g_provider *pp, int flag, int count);
  206 static int zvol_geom_close(struct g_provider *pp, int flag, int count);
  207 static void zvol_geom_run(zvol_state_t *zv);
  208 static void zvol_geom_destroy(zvol_state_t *zv);
  209 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
  210 static void zvol_geom_worker(void *arg);
  211 static void zvol_geom_bio_start(struct bio *bp);
  212 static int zvol_geom_bio_getattr(struct bio *bp);
   213 /* static d_strategy_t zvol_geom_bio_strategy; (declared above with the cdevsw methods) */
  214 
  215 /*
  216  * GEOM mode implementation
  217  */
  218 
  219 static int
  220 zvol_geom_open(struct g_provider *pp, int flag, int count)
  221 {
  222         zvol_state_t *zv;
  223         int err = 0;
  224         boolean_t drop_suspend = B_FALSE;
  225 
  226         if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
  227                 /*
  228                  * If zfs_geom_probe_vdev_key is set, that means that zfs is
  229                  * attempting to probe geom providers while looking for a
  230                  * replacement for a missing VDEV.  In this case, the
  231                  * spa_namespace_lock will not be held, but it is still illegal
  232                  * to use a zvol as a vdev.  Deadlocks can result if another
  233                  * thread has spa_namespace_lock.
  234                  */
  235                 return (SET_ERROR(EOPNOTSUPP));
  236         }
  237 
  238 retry:
  239         rw_enter(&zvol_state_lock, ZVOL_RW_READER);
  240         /*
  241          * Obtain a copy of private under zvol_state_lock to make sure either
  242          * the result of zvol free code setting private to NULL is observed,
  243          * or the zv is protected from being freed because of the positive
  244          * zv_open_count.
  245          */
  246         zv = pp->private;
  247         if (zv == NULL) {
  248                 rw_exit(&zvol_state_lock);
  249                 err = SET_ERROR(ENXIO);
  250                 goto out_locked;
  251         }
  252 
  253         mutex_enter(&zv->zv_state_lock);
  254         if (zv->zv_zso->zso_dying) {
  255                 rw_exit(&zvol_state_lock);
  256                 err = SET_ERROR(ENXIO);
  257                 goto out_zv_locked;
  258         }
  259         ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
  260 
  261         /*
  262          * Make sure zvol is not suspended during first open
  263          * (hold zv_suspend_lock) and respect proper lock acquisition
  264          * ordering - zv_suspend_lock before zv_state_lock.
  265          */
  266         if (zv->zv_open_count == 0) {
  267                 drop_suspend = B_TRUE;
  268                 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
  269                         mutex_exit(&zv->zv_state_lock);
  270                         rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
  271                         mutex_enter(&zv->zv_state_lock);
  272                         /* Check to see if zv_suspend_lock is needed. */
  273                         if (zv->zv_open_count != 0) {
  274                                 rw_exit(&zv->zv_suspend_lock);
  275                                 drop_suspend = B_FALSE;
  276                         }
  277                 }
  278         }
  279         rw_exit(&zvol_state_lock);
  280 
  281         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
  282 
  283         if (zv->zv_open_count == 0) {
  284                 boolean_t drop_namespace = B_FALSE;
  285 
  286                 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
  287 
  288                 /*
  289                  * Take spa_namespace_lock to prevent lock inversion when
  290                  * zvols from one pool are opened as vdevs in another.
  291                  */
  292                 if (!mutex_owned(&spa_namespace_lock)) {
  293                         if (!mutex_tryenter(&spa_namespace_lock)) {
  294                                 mutex_exit(&zv->zv_state_lock);
  295                                 rw_exit(&zv->zv_suspend_lock);
  296                                 kern_yield(PRI_USER);
  297                                 goto retry;
  298                         } else {
  299                                 drop_namespace = B_TRUE;
  300                         }
  301                 }
  302                 err = zvol_first_open(zv, !(flag & FWRITE));
  303                 if (drop_namespace)
  304                         mutex_exit(&spa_namespace_lock);
  305                 if (err)
  306                         goto out_zv_locked;
  307                 pp->mediasize = zv->zv_volsize;
  308                 pp->stripeoffset = 0;
  309                 pp->stripesize = zv->zv_volblocksize;
  310         }
  311 
  312         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
  313 
  314         /*
  315          * Check for a bad on-disk format version now since we
  316          * lied about owning the dataset readonly before.
  317          */
  318         if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
  319             dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
  320                 err = SET_ERROR(EROFS);
  321                 goto out_opened;
  322         }
  323         if (zv->zv_flags & ZVOL_EXCL) {
  324                 err = SET_ERROR(EBUSY);
  325                 goto out_opened;
  326         }
  327         if (flag & O_EXCL) {
  328                 if (zv->zv_open_count != 0) {
  329                         err = SET_ERROR(EBUSY);
  330                         goto out_opened;
  331                 }
  332                 zv->zv_flags |= ZVOL_EXCL;
  333         }
  334 
  335         zv->zv_open_count += count;
  336 out_opened:
  337         if (zv->zv_open_count == 0) {
  338                 zvol_last_close(zv);
  339                 wakeup(zv);
  340         }
  341 out_zv_locked:
  342         mutex_exit(&zv->zv_state_lock);
  343 out_locked:
  344         if (drop_suspend)
  345                 rw_exit(&zv->zv_suspend_lock);
  346         return (err);
  347 }
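
The retry dance above exists because the open path learns that it needs
zv_suspend_lock only while already holding zv_state_lock, yet the documented
order is the reverse. A generic sketch of the same try-lock/back-off pattern
using POSIX threads (all names hypothetical, not part of this driver):

        #include <pthread.h>
        #include <stdbool.h>

        static pthread_rwlock_t suspend_lock = PTHREAD_RWLOCK_INITIALIZER;
        static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
        static int open_count;

        /* Called with state_lock held; lock order is suspend before state. */
        static bool
        hold_suspend_for_first_open(void)
        {
                if (open_count != 0)
                        return (false);         /* Not the first open. */
                if (pthread_rwlock_tryrdlock(&suspend_lock) != 0) {
                        /* Taking it now would invert the order: back off. */
                        pthread_mutex_unlock(&state_lock);
                        pthread_rwlock_rdlock(&suspend_lock);
                        pthread_mutex_lock(&state_lock);
                        /* State may have changed while state_lock was dropped. */
                        if (open_count != 0) {
                                pthread_rwlock_unlock(&suspend_lock);
                                return (false);
                        }
                }
                return (true);                  /* Caller drops suspend_lock. */
        }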
  348 
  349 static int
  350 zvol_geom_close(struct g_provider *pp, int flag, int count)
  351 {
  352         (void) flag;
  353         zvol_state_t *zv;
  354         boolean_t drop_suspend = B_TRUE;
  355         int new_open_count;
  356 
  357         rw_enter(&zvol_state_lock, ZVOL_RW_READER);
  358         zv = pp->private;
  359         if (zv == NULL) {
  360                 rw_exit(&zvol_state_lock);
  361                 return (SET_ERROR(ENXIO));
  362         }
  363 
  364         mutex_enter(&zv->zv_state_lock);
  365         if (zv->zv_flags & ZVOL_EXCL) {
  366                 ASSERT3U(zv->zv_open_count, ==, 1);
  367                 zv->zv_flags &= ~ZVOL_EXCL;
  368         }
  369 
  370         ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
  371 
  372         /*
  373          * If the open count is zero, this is a spurious close.
  374          * That indicates a bug in the kernel / DDI framework.
  375          */
  376         ASSERT3U(zv->zv_open_count, >, 0);
  377 
  378         /*
  379          * Make sure zvol is not suspended during last close
  380          * (hold zv_suspend_lock) and respect proper lock acquisition
  381          * ordering - zv_suspend_lock before zv_state_lock.
  382          */
  383         new_open_count = zv->zv_open_count - count;
  384         if (new_open_count == 0) {
  385                 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
  386                         mutex_exit(&zv->zv_state_lock);
  387                         rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
  388                         mutex_enter(&zv->zv_state_lock);
  389                         /* Check to see if zv_suspend_lock is needed. */
  390                         new_open_count = zv->zv_open_count - count;
  391                         if (new_open_count != 0) {
  392                                 rw_exit(&zv->zv_suspend_lock);
  393                                 drop_suspend = B_FALSE;
  394                         }
  395                 }
  396         } else {
  397                 drop_suspend = B_FALSE;
  398         }
  399         rw_exit(&zvol_state_lock);
  400 
  401         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
  402 
  403         /*
  404          * You may get multiple opens, but only one close.
  405          */
  406         zv->zv_open_count = new_open_count;
  407         if (zv->zv_open_count == 0) {
  408                 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
  409                 zvol_last_close(zv);
  410                 wakeup(zv);
  411         }
  412 
  413         mutex_exit(&zv->zv_state_lock);
  414 
  415         if (drop_suspend)
  416                 rw_exit(&zv->zv_suspend_lock);
  417         return (0);
  418 }
  419 
  420 static void
  421 zvol_geom_run(zvol_state_t *zv)
  422 {
  423         struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
  424         struct g_provider *pp = zsg->zsg_provider;
  425 
  426         ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
  427 
  428         g_error_provider(pp, 0);
  429 
  430         kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
  431             "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
  432 }
  433 
  434 static void
  435 zvol_geom_destroy(zvol_state_t *zv)
  436 {
  437         struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
  438         struct g_provider *pp = zsg->zsg_provider;
  439 
  440         ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
  441 
  442         g_topology_assert();
  443 
  444         mutex_enter(&zv->zv_state_lock);
  445         VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
  446         mutex_exit(&zv->zv_state_lock);
  447         zsg->zsg_provider = NULL;
  448         g_wither_geom(pp->geom, ENXIO);
  449 }
  450 
  451 void
  452 zvol_wait_close(zvol_state_t *zv)
  453 {
  454 
  455         if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
  456                 return;
  457         mutex_enter(&zv->zv_state_lock);
  458         zv->zv_zso->zso_dying = B_TRUE;
  459 
  460         if (zv->zv_open_count)
  461                 msleep(zv, &zv->zv_state_lock,
  462                     PRIBIO, "zvol:dying", 10*hz);
  463         mutex_exit(&zv->zv_state_lock);
  464 }
  465 
  466 
  467 static int
  468 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
  469 {
  470         int count, error, flags;
  471 
  472         g_topology_assert();
  473 
  474         /*
  475          * To make it easier we expect either open or close, but not both
  476          * at the same time.
  477          */
  478         KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
  479             (acr <= 0 && acw <= 0 && ace <= 0),
  480             ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
  481             pp->name, acr, acw, ace));
  482 
  483         if (pp->private == NULL) {
  484                 if (acr <= 0 && acw <= 0 && ace <= 0)
  485                         return (0);
  486                 return (pp->error);
  487         }
  488 
   489         /*
   490          * We don't pass the FEXCL flag to zvol_geom_open()/zvol_geom_close()
   491          * when ace != 0, because GEOM already handles exclusivity, and does
   492          * so a bit differently: GEOM allows multiple read/exclusive
   493          * consumers, while ZFS allows only one exclusive consumer, whether
   494          * reader or writer. GEOM's model is preferable here, so the
   495          * decision is left to GEOM.
   496          */
  497 
  498         count = acr + acw + ace;
  499         if (count == 0)
  500                 return (0);
  501 
  502         flags = 0;
  503         if (acr != 0 || ace != 0)
  504                 flags |= FREAD;
  505         if (acw != 0)
  506                 flags |= FWRITE;
  507 
  508         g_topology_unlock();
  509         if (count > 0)
  510                 error = zvol_geom_open(pp, flags, count);
  511         else
  512                 error = zvol_geom_close(pp, flags, -count);
  513         g_topology_lock();
  514         return (error);
  515 }
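
A worked example of the delta arithmetic: a consumer stepping from r0w0e0 to
r1w1e1 calls this with acr = acw = ace = 1, so count = 3 and
flags = FREAD | FWRITE, resulting in zvol_geom_open(pp, FREAD | FWRITE, 3);
the reverse step passes acr = acw = ace = -1, giving count = -3 and
zvol_geom_close(pp, FREAD | FWRITE, 3).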
  516 
  517 static void
  518 zvol_geom_worker(void *arg)
  519 {
  520         zvol_state_t *zv = arg;
  521         struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
  522         struct bio *bp;
  523 
  524         ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
  525 
  526         thread_lock(curthread);
  527         sched_prio(curthread, PRIBIO);
  528         thread_unlock(curthread);
  529 
  530         for (;;) {
  531                 mtx_lock(&zsg->zsg_queue_mtx);
  532                 bp = bioq_takefirst(&zsg->zsg_queue);
  533                 if (bp == NULL) {
  534                         if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
  535                                 zsg->zsg_state = ZVOL_GEOM_RUNNING;
  536                                 wakeup(&zsg->zsg_state);
  537                                 mtx_unlock(&zsg->zsg_queue_mtx);
  538                                 kthread_exit();
  539                         }
  540                         msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
  541                             PRIBIO | PDROP, "zvol:io", 0);
  542                         continue;
  543                 }
  544                 mtx_unlock(&zsg->zsg_queue_mtx);
  545                 zvol_geom_bio_strategy(bp);
  546         }
  547 }
  548 
  549 static void
  550 zvol_geom_bio_start(struct bio *bp)
  551 {
  552         zvol_state_t *zv = bp->bio_to->private;
  553         struct zvol_state_geom *zsg;
  554         boolean_t first;
  555 
  556         if (zv == NULL) {
  557                 g_io_deliver(bp, ENXIO);
  558                 return;
  559         }
  560         if (bp->bio_cmd == BIO_GETATTR) {
  561                 if (zvol_geom_bio_getattr(bp))
  562                         g_io_deliver(bp, EOPNOTSUPP);
  563                 return;
  564         }
  565 
  566         if (!THREAD_CAN_SLEEP()) {
  567                 zsg = &zv->zv_zso->zso_geom;
  568                 mtx_lock(&zsg->zsg_queue_mtx);
  569                 first = (bioq_first(&zsg->zsg_queue) == NULL);
  570                 bioq_insert_tail(&zsg->zsg_queue, bp);
  571                 mtx_unlock(&zsg->zsg_queue_mtx);
  572                 if (first)
  573                         wakeup_one(&zsg->zsg_queue);
  574                 return;
  575         }
  576 
  577         zvol_geom_bio_strategy(bp);
  578 }
  579 
  580 static int
  581 zvol_geom_bio_getattr(struct bio *bp)
  582 {
  583         zvol_state_t *zv;
  584 
  585         zv = bp->bio_to->private;
  586         ASSERT3P(zv, !=, NULL);
  587 
  588         spa_t *spa = dmu_objset_spa(zv->zv_objset);
  589         uint64_t refd, avail, usedobjs, availobjs;
  590 
  591         if (g_handleattr_int(bp, "GEOM::candelete", 1))
  592                 return (0);
  593         if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
  594                 dmu_objset_space(zv->zv_objset, &refd, &avail,
  595                     &usedobjs, &availobjs);
  596                 if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
  597                         return (0);
  598         } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
  599                 dmu_objset_space(zv->zv_objset, &refd, &avail,
  600                     &usedobjs, &availobjs);
  601                 if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
  602                         return (0);
  603         } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
  604                 avail = metaslab_class_get_space(spa_normal_class(spa));
  605                 avail -= metaslab_class_get_alloc(spa_normal_class(spa));
  606                 if (g_handleattr_off_t(bp, "poolblocksavail",
  607                     avail / DEV_BSIZE))
  608                         return (0);
  609         } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
  610                 refd = metaslab_class_get_alloc(spa_normal_class(spa));
  611                 if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
  612                         return (0);
  613         }
  614         return (1);
  615 }
  616 
  617 static void
  618 zvol_filter_detach(struct knote *kn)
  619 {
  620         zvol_state_t *zv;
  621         struct zvol_state_dev *zsd;
  622 
  623         zv = kn->kn_hook;
  624         zsd = &zv->zv_zso->zso_dev;
  625 
  626         knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
  627 }
  628 
  629 static int
  630 zvol_filter_vnode(struct knote *kn, long hint)
  631 {
  632         kn->kn_fflags |= kn->kn_sfflags & hint;
  633 
  634         return (kn->kn_fflags != 0);
  635 }
  636 
  637 static int
  638 zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
  639 {
  640         zvol_state_t *zv;
  641         struct zvol_state_dev *zsd;
  642 
  643         zv = dev->si_drv2;
  644         zsd = &zv->zv_zso->zso_dev;
  645 
  646         if (kn->kn_filter != EVFILT_VNODE)
  647                 return (EINVAL);
  648 
  649         /* XXX: extend support for other NOTE_* events */
  650         if (kn->kn_sfflags != NOTE_ATTRIB)
  651                 return (EINVAL);
  652 
  653         kn->kn_fop = &zvol_filterops_vnode;
  654         kn->kn_hook = zv;
  655         knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);
  656 
  657         return (0);
  658 }
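
A minimal userland sketch of arming this filter ("tank/vol0" hypothetical).
The driver accepts only EVFILT_VNODE with exactly NOTE_ATTRIB, and the event
is posted from elsewhere in this file when volume attributes (e.g. volsize)
change:

        #include <sys/types.h>
        #include <sys/event.h>
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(void)
        {
                struct kevent ev, got;
                int fd = open("/dev/zvol/tank/vol0", O_RDONLY);
                int kq = kqueue();

                if (fd < 0 || kq < 0)
                        return (1);
                EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_ATTRIB,
                    0, NULL);
                if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
                        return (1);
                /* Blocks until the driver posts NOTE_ATTRIB. */
                if (kevent(kq, NULL, 0, &got, 1, NULL) == 1)
                        printf("zvol attributes changed\n");
                close(kq);
                close(fd);
                return (0);
        }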
  659 
  660 static void
  661 zvol_geom_bio_strategy(struct bio *bp)
  662 {
  663         zvol_state_t *zv;
  664         uint64_t off, volsize;
  665         size_t resid;
  666         char *addr;
  667         objset_t *os;
  668         zfs_locked_range_t *lr;
  669         int error = 0;
  670         boolean_t doread = B_FALSE;
  671         boolean_t is_dumpified;
  672         boolean_t sync;
  673 
  674         if (bp->bio_to)
  675                 zv = bp->bio_to->private;
  676         else
  677                 zv = bp->bio_dev->si_drv2;
  678 
  679         if (zv == NULL) {
  680                 error = SET_ERROR(ENXIO);
  681                 goto out;
  682         }
  683 
  684         rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
  685 
  686         switch (bp->bio_cmd) {
  687         case BIO_READ:
  688                 doread = B_TRUE;
  689                 break;
  690         case BIO_WRITE:
  691         case BIO_FLUSH:
  692         case BIO_DELETE:
  693                 if (zv->zv_flags & ZVOL_RDONLY) {
  694                         error = SET_ERROR(EROFS);
  695                         goto resume;
  696                 }
  697                 zvol_ensure_zilog(zv);
  698                 if (bp->bio_cmd == BIO_FLUSH)
  699                         goto sync;
  700                 break;
  701         default:
  702                 error = SET_ERROR(EOPNOTSUPP);
  703                 goto resume;
  704         }
  705 
  706         off = bp->bio_offset;
  707         volsize = zv->zv_volsize;
  708 
  709         os = zv->zv_objset;
  710         ASSERT3P(os, !=, NULL);
  711 
  712         addr = bp->bio_data;
  713         resid = bp->bio_length;
  714 
  715         if (resid > 0 && off >= volsize) {
  716                 error = SET_ERROR(EIO);
  717                 goto resume;
  718         }
  719 
  720         is_dumpified = B_FALSE;
  721         sync = !doread && !is_dumpified &&
  722             zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
  723 
  724         /*
  725          * There must be no buffer changes when doing a dmu_sync() because
  726          * we can't change the data whilst calculating the checksum.
  727          */
  728         lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
  729             doread ? RL_READER : RL_WRITER);
  730 
  731         if (bp->bio_cmd == BIO_DELETE) {
  732                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
  733                 error = dmu_tx_assign(tx, TXG_WAIT);
  734                 if (error != 0) {
  735                         dmu_tx_abort(tx);
  736                 } else {
  737                         zvol_log_truncate(zv, tx, off, resid, sync);
  738                         dmu_tx_commit(tx);
  739                         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
  740                             off, resid);
  741                         resid = 0;
  742                 }
  743                 goto unlock;
  744         }
  745         while (resid != 0 && off < volsize) {
  746                 size_t size = MIN(resid, zvol_maxphys);
  747                 if (doread) {
  748                         error = dmu_read(os, ZVOL_OBJ, off, size, addr,
  749                             DMU_READ_PREFETCH);
  750                 } else {
  751                         dmu_tx_t *tx = dmu_tx_create(os);
  752                         dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
  753                         error = dmu_tx_assign(tx, TXG_WAIT);
  754                         if (error) {
  755                                 dmu_tx_abort(tx);
  756                         } else {
  757                                 dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
  758                                 zvol_log_write(zv, tx, off, size, sync);
  759                                 dmu_tx_commit(tx);
  760                         }
  761                 }
  762                 if (error) {
  763                         /* Convert checksum errors into IO errors. */
  764                         if (error == ECKSUM)
  765                                 error = SET_ERROR(EIO);
  766                         break;
  767                 }
  768                 off += size;
  769                 addr += size;
  770                 resid -= size;
  771         }
  772 unlock:
  773         zfs_rangelock_exit(lr);
  774 
  775         bp->bio_completed = bp->bio_length - resid;
  776         if (bp->bio_completed < bp->bio_length && off > volsize)
  777                 error = SET_ERROR(EINVAL);
  778 
  779         switch (bp->bio_cmd) {
  780         case BIO_FLUSH:
  781                 break;
  782         case BIO_READ:
  783                 dataset_kstats_update_read_kstats(&zv->zv_kstat,
  784                     bp->bio_completed);
  785                 break;
  786         case BIO_WRITE:
  787                 dataset_kstats_update_write_kstats(&zv->zv_kstat,
  788                     bp->bio_completed);
  789                 break;
  790         case BIO_DELETE:
  791                 break;
  792         default:
  793                 break;
  794         }
  795 
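        /*
         * Note: BIO_FLUSH arrives at the "sync:" label below via
         * "goto sync" and always commits the ZIL; synchronous writes
         * reach the same zil_commit() through the sync flag.
         */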
  796         if (sync) {
  797 sync:
  798                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
  799         }
  800 resume:
  801         rw_exit(&zv->zv_suspend_lock);
  802 out:
  803         if (bp->bio_to)
  804                 g_io_deliver(bp, error);
  805         else
  806                 biofinish(bp, NULL, error);
  807 }
  808 
  809 /*
  810  * Character device mode implementation
  811  */
  812 
  813 static int
  814 zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
  815 {
  816         zvol_state_t *zv;
  817         uint64_t volsize;
  818         zfs_locked_range_t *lr;
  819         int error = 0;
  820         zfs_uio_t uio;
  821 
  822         zfs_uio_init(&uio, uio_s);
  823 
  824         zv = dev->si_drv2;
  825 
  826         volsize = zv->zv_volsize;
  827         /*
  828          * uio_loffset == volsize isn't an error as
  829          * it's required for EOF processing.
  830          */
  831         if (zfs_uio_resid(&uio) > 0 &&
  832             (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
  833                 return (SET_ERROR(EIO));
  834 
  835         ssize_t start_resid = zfs_uio_resid(&uio);
  836         lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
  837             zfs_uio_resid(&uio), RL_READER);
  838         while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
  839                 uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
  840 
  841                 /* Don't read past the end. */
  842                 if (bytes > volsize - zfs_uio_offset(&uio))
  843                         bytes = volsize - zfs_uio_offset(&uio);
  844 
   845                 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
  846                 if (error) {
  847                         /* Convert checksum errors into IO errors. */
  848                         if (error == ECKSUM)
  849                                 error = SET_ERROR(EIO);
  850                         break;
  851                 }
  852         }
  853         zfs_rangelock_exit(lr);
  854         int64_t nread = start_resid - zfs_uio_resid(&uio);
  855         dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
  856 
  857         return (error);
  858 }
  859 
  860 static int
  861 zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
  862 {
  863         zvol_state_t *zv;
  864         uint64_t volsize;
  865         zfs_locked_range_t *lr;
  866         int error = 0;
  867         boolean_t sync;
  868         zfs_uio_t uio;
  869 
  870         zv = dev->si_drv2;
  871 
  872         volsize = zv->zv_volsize;
  873 
  874         zfs_uio_init(&uio, uio_s);
  875 
  876         if (zfs_uio_resid(&uio) > 0 &&
  877             (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
  878                 return (SET_ERROR(EIO));
  879 
  880         ssize_t start_resid = zfs_uio_resid(&uio);
  881         sync = (ioflag & IO_SYNC) ||
  882             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
  883 
  884         rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
  885         zvol_ensure_zilog(zv);
  886 
  887         lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
  888             zfs_uio_resid(&uio), RL_WRITER);
  889         while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
  890                 uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
  891                 uint64_t off = zfs_uio_offset(&uio);
  892                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
  893 
  894                 if (bytes > volsize - off)      /* Don't write past the end. */
  895                         bytes = volsize - off;
  896 
  897                 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
  898                 error = dmu_tx_assign(tx, TXG_WAIT);
  899                 if (error) {
  900                         dmu_tx_abort(tx);
  901                         break;
  902                 }
  903                 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
  904                 if (error == 0)
  905                         zvol_log_write(zv, tx, off, bytes, sync);
  906                 dmu_tx_commit(tx);
  907 
  908                 if (error)
  909                         break;
  910         }
  911         zfs_rangelock_exit(lr);
  912         int64_t nwritten = start_resid - zfs_uio_resid(&uio);
  913         dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
  914         if (sync)
  915                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
  916         rw_exit(&zv->zv_suspend_lock);
  917         return (error);
  918 }
  919 
  920 static int
  921 zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
  922 {
  923         zvol_state_t *zv;
  924         struct zvol_state_dev *zsd;
  925         int err = 0;
  926         boolean_t drop_suspend = B_FALSE;
  927 
  928 retry:
  929         rw_enter(&zvol_state_lock, ZVOL_RW_READER);
  930         /*
  931          * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
  932          * the result of zvol free code setting si_drv2 to NULL is observed,
  933          * or the zv is protected from being freed because of the positive
  934          * zv_open_count.
  935          */
  936         zv = dev->si_drv2;
  937         if (zv == NULL) {
  938                 rw_exit(&zvol_state_lock);
  939                 err = SET_ERROR(ENXIO);
  940                 goto out_locked;
  941         }
  942 
  943         mutex_enter(&zv->zv_state_lock);
  944         if (zv->zv_zso->zso_dying) {
  945                 rw_exit(&zvol_state_lock);
  946                 err = SET_ERROR(ENXIO);
  947                 goto out_zv_locked;
  948         }
  949         ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
  950 
  951         /*
  952          * Make sure zvol is not suspended during first open
  953          * (hold zv_suspend_lock) and respect proper lock acquisition
  954          * ordering - zv_suspend_lock before zv_state_lock.
  955          */
  956         if (zv->zv_open_count == 0) {
  957                 drop_suspend = B_TRUE;
  958                 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
  959                         mutex_exit(&zv->zv_state_lock);
  960                         rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
  961                         mutex_enter(&zv->zv_state_lock);
  962                         /* Check to see if zv_suspend_lock is needed. */
  963                         if (zv->zv_open_count != 0) {
  964                                 rw_exit(&zv->zv_suspend_lock);
  965                                 drop_suspend = B_FALSE;
  966                         }
  967                 }
  968         }
  969         rw_exit(&zvol_state_lock);
  970 
  971         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
  972 
  973         if (zv->zv_open_count == 0) {
  974                 boolean_t drop_namespace = B_FALSE;
  975 
  976                 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
  977 
  978                 /*
  979                  * Take spa_namespace_lock to prevent lock inversion when
  980                  * zvols from one pool are opened as vdevs in another.
  981                  */
  982                 if (!mutex_owned(&spa_namespace_lock)) {
  983                         if (!mutex_tryenter(&spa_namespace_lock)) {
  984                                 mutex_exit(&zv->zv_state_lock);
  985                                 rw_exit(&zv->zv_suspend_lock);
  986                                 kern_yield(PRI_USER);
  987                                 goto retry;
  988                         } else {
  989                                 drop_namespace = B_TRUE;
  990                         }
  991                 }
  992                 err = zvol_first_open(zv, !(flags & FWRITE));
  993                 if (drop_namespace)
  994                         mutex_exit(&spa_namespace_lock);
  995                 if (err)
  996                         goto out_zv_locked;
  997         }
  998 
  999         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 1000 
 1001         if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
 1002                 err = SET_ERROR(EROFS);
 1003                 goto out_opened;
 1004         }
 1005         if (zv->zv_flags & ZVOL_EXCL) {
 1006                 err = SET_ERROR(EBUSY);
 1007                 goto out_opened;
 1008         }
 1009         if (flags & O_EXCL) {
 1010                 if (zv->zv_open_count != 0) {
 1011                         err = SET_ERROR(EBUSY);
 1012                         goto out_opened;
 1013                 }
 1014                 zv->zv_flags |= ZVOL_EXCL;
 1015         }
 1016 
 1017         zv->zv_open_count++;
 1018         if (flags & O_SYNC) {
 1019                 zsd = &zv->zv_zso->zso_dev;
 1020                 zsd->zsd_sync_cnt++;
 1021                 if (zsd->zsd_sync_cnt == 1 &&
 1022                     (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
 1023                         zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
 1024         }
 1025 out_opened:
 1026         if (zv->zv_open_count == 0) {
 1027                 zvol_last_close(zv);
 1028                 wakeup(zv);
 1029         }
 1030 out_zv_locked:
 1031         mutex_exit(&zv->zv_state_lock);
 1032 out_locked:
 1033         if (drop_suspend)
 1034                 rw_exit(&zv->zv_suspend_lock);
 1035         return (err);
 1036 }
 1037 
 1038 static int
 1039 zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 1040 {
 1041         zvol_state_t *zv;
 1042         struct zvol_state_dev *zsd;
 1043         boolean_t drop_suspend = B_TRUE;
 1044 
 1045         rw_enter(&zvol_state_lock, ZVOL_RW_READER);
 1046         zv = dev->si_drv2;
 1047         if (zv == NULL) {
 1048                 rw_exit(&zvol_state_lock);
 1049                 return (SET_ERROR(ENXIO));
 1050         }
 1051 
 1052         mutex_enter(&zv->zv_state_lock);
 1053         if (zv->zv_flags & ZVOL_EXCL) {
 1054                 ASSERT3U(zv->zv_open_count, ==, 1);
 1055                 zv->zv_flags &= ~ZVOL_EXCL;
 1056         }
 1057 
 1058         ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
 1059 
 1060         /*
 1061          * If the open count is zero, this is a spurious close.
 1062          * That indicates a bug in the kernel / DDI framework.
 1063          */
 1064         ASSERT3U(zv->zv_open_count, >, 0);
 1065         /*
 1066          * Make sure zvol is not suspended during last close
 1067          * (hold zv_suspend_lock) and respect proper lock acquisition
 1068          * ordering - zv_suspend_lock before zv_state_lock.
 1069          */
 1070         if (zv->zv_open_count == 1) {
 1071                 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
 1072                         mutex_exit(&zv->zv_state_lock);
 1073                         rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 1074                         mutex_enter(&zv->zv_state_lock);
 1075                         /* Check to see if zv_suspend_lock is needed. */
 1076                         if (zv->zv_open_count != 1) {
 1077                                 rw_exit(&zv->zv_suspend_lock);
 1078                                 drop_suspend = B_FALSE;
 1079                         }
 1080                 }
 1081         } else {
 1082                 drop_suspend = B_FALSE;
 1083         }
 1084         rw_exit(&zvol_state_lock);
 1085 
 1086         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 1087 
 1088         /*
 1089          * You may get multiple opens, but only one close.
 1090          */
 1091         zv->zv_open_count--;
 1092         if (flags & O_SYNC) {
 1093                 zsd = &zv->zv_zso->zso_dev;
 1094                 zsd->zsd_sync_cnt--;
 1095         }
 1096 
 1097         if (zv->zv_open_count == 0) {
 1098                 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
 1099                 zvol_last_close(zv);
 1100                 wakeup(zv);
 1101         }
 1102 
 1103         mutex_exit(&zv->zv_state_lock);
 1104 
 1105         if (drop_suspend)
 1106                 rw_exit(&zv->zv_suspend_lock);
 1107         return (0);
 1108 }
 1109 
 1110 static int
 1111 zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
 1112     int fflag, struct thread *td)
 1113 {
 1114         zvol_state_t *zv;
 1115         zfs_locked_range_t *lr;
 1116         off_t offset, length;
 1117         int error;
 1118         boolean_t sync;
 1119 
 1120         zv = dev->si_drv2;
 1121 
 1122         error = 0;
 1123         KASSERT(zv->zv_open_count > 0,
 1124             ("Device with zero access count in %s", __func__));
 1125 
 1126         switch (cmd) {
 1127         case DIOCGSECTORSIZE:
 1128                 *(uint32_t *)data = DEV_BSIZE;
 1129                 break;
 1130         case DIOCGMEDIASIZE:
 1131                 *(off_t *)data = zv->zv_volsize;
 1132                 break;
 1133         case DIOCGFLUSH:
 1134                 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 1135                 if (zv->zv_zilog != NULL)
 1136                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
 1137                 rw_exit(&zv->zv_suspend_lock);
 1138                 break;
 1139         case DIOCGDELETE:
 1140                 if (!zvol_unmap_enabled)
 1141                         break;
 1142 
 1143                 offset = ((off_t *)data)[0];
 1144                 length = ((off_t *)data)[1];
 1145                 if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
 1146                     offset < 0 || offset >= zv->zv_volsize ||
 1147                     length <= 0) {
 1148                         printf("%s: offset=%jd length=%jd\n", __func__, offset,
 1149                             length);
 1150                         error = SET_ERROR(EINVAL);
 1151                         break;
 1152                 }
 1153                 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 1154                 zvol_ensure_zilog(zv);
 1155                 lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
 1156                     RL_WRITER);
 1157                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 1158                 error = dmu_tx_assign(tx, TXG_WAIT);
 1159                 if (error != 0) {
  1160                         sync = B_FALSE;
 1161                         dmu_tx_abort(tx);
 1162                 } else {
 1163                         sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
 1164                         zvol_log_truncate(zv, tx, offset, length, sync);
 1165                         dmu_tx_commit(tx);
 1166                         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
 1167                             offset, length);
 1168                 }
 1169                 zfs_rangelock_exit(lr);
 1170                 if (sync)
 1171                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
 1172                 rw_exit(&zv->zv_suspend_lock);
 1173                 break;
 1174         case DIOCGSTRIPESIZE:
 1175                 *(off_t *)data = zv->zv_volblocksize;
 1176                 break;
 1177         case DIOCGSTRIPEOFFSET:
 1178                 *(off_t *)data = 0;
 1179                 break;
 1180         case DIOCGATTR: {
 1181                 spa_t *spa = dmu_objset_spa(zv->zv_objset);
 1182                 struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
 1183                 uint64_t refd, avail, usedobjs, availobjs;
 1184 
 1185                 if (strcmp(arg->name, "GEOM::candelete") == 0)
 1186                         arg->value.i = 1;
 1187                 else if (strcmp(arg->name, "blocksavail") == 0) {
 1188                         dmu_objset_space(zv->zv_objset, &refd, &avail,
 1189                             &usedobjs, &availobjs);
 1190                         arg->value.off = avail / DEV_BSIZE;
 1191                 } else if (strcmp(arg->name, "blocksused") == 0) {
 1192                         dmu_objset_space(zv->zv_objset, &refd, &avail,
 1193                             &usedobjs, &availobjs);
 1194                         arg->value.off = refd / DEV_BSIZE;
 1195                 } else if (strcmp(arg->name, "poolblocksavail") == 0) {
 1196                         avail = metaslab_class_get_space(spa_normal_class(spa));
 1197                         avail -= metaslab_class_get_alloc(
 1198                             spa_normal_class(spa));
 1199                         arg->value.off = avail / DEV_BSIZE;
 1200                 } else if (strcmp(arg->name, "poolblocksused") == 0) {
 1201                         refd = metaslab_class_get_alloc(spa_normal_class(spa));
 1202                         arg->value.off = refd / DEV_BSIZE;
 1203                 } else
 1204                         error = SET_ERROR(ENOIOCTL);
 1205                 break;
 1206         }
 1207         case FIOSEEKHOLE:
 1208         case FIOSEEKDATA: {
 1209                 off_t *off = (off_t *)data;
 1210                 uint64_t noff;
 1211                 boolean_t hole;
 1212 
 1213                 hole = (cmd == FIOSEEKHOLE);
 1214                 noff = *off;
 1215                 error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
 1216                 *off = noff;
 1217                 break;
 1218         }
 1219         default:
 1220                 error = SET_ERROR(ENOIOCTL);
 1221         }
 1222 
 1223         return (error);
 1224 }
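
A minimal userland sketch of driving these ioctls ("tank/vol0" hypothetical;
DIOCGMEDIASIZE, DIOCGATTR, and DIOCGDELETE are the standard FreeBSD disk
ioctls declared in <sys/disk.h>):

        #include <sys/types.h>
        #include <sys/disk.h>
        #include <sys/ioctl.h>
        #include <fcntl.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(void)
        {
                struct diocgattr_arg attr = { .name = "blocksavail" };
                off_t media, del[2] = { 0, 65536 };
                int fd = open("/dev/zvol/tank/vol0", O_RDWR);

                if (fd < 0)
                        return (1);
                if (ioctl(fd, DIOCGMEDIASIZE, &media) == 0)
                        printf("volsize: %jd\n", (intmax_t)media);
                attr.len = sizeof (attr.value.off);
                if (ioctl(fd, DIOCGATTR, &attr) == 0)
                        printf("blocksavail: %jd\n", (intmax_t)attr.value.off);
                /* TRIM the first 64 KiB; a no-op if unmap is disabled. */
                if (ioctl(fd, DIOCGDELETE, del) != 0)
                        perror("DIOCGDELETE");
                close(fd);
                return (0);
        }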
 1225 
 1226 /*
 1227  * Misc. helpers
 1228  */
 1229 
 1230 static void
 1231 zvol_ensure_zilog(zvol_state_t *zv)
 1232 {
 1233         ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
 1234 
 1235         /*
 1236          * Open a ZIL if this is the first time we have written to this
 1237          * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
 1238          * than zv_state_lock so that we don't need to acquire an
 1239          * additional lock in this path.
 1240          */
 1241         if (zv->zv_zilog == NULL) {
 1242                 if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
 1243                         rw_exit(&zv->zv_suspend_lock);
 1244                         rw_enter(&zv->zv_suspend_lock, RW_WRITER);
 1245                 }
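                /*
                 * Re-check under the writer lock: another thread may have
                 * opened the ZIL while zv_suspend_lock was dropped above.
                 */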
 1246                 if (zv->zv_zilog == NULL) {
 1247                         zv->zv_zilog = zil_open(zv->zv_objset,
 1248                             zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 1249                         zv->zv_flags |= ZVOL_WRITTEN_TO;
 1250                         /* replay / destroy done in zvol_os_create_minor() */
 1251                         VERIFY0(zv->zv_zilog->zl_header->zh_flags &
 1252                             ZIL_REPLAY_NEEDED);
 1253                 }
 1254                 rw_downgrade(&zv->zv_suspend_lock);
 1255         }
 1256 }
 1257 
 1258 boolean_t
 1259 zvol_os_is_zvol(const char *device)
 1260 {
 1261         return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
 1262 }
 1263 
 1264 void
 1265 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 1266 {
 1267         ASSERT(RW_LOCK_HELD(&zvol_state_lock));
 1268         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 1269 
 1270         /* Move to a new hashtable entry.  */
 1271         zv->zv_hash = zvol_name_hash(zv->zv_name);
 1272         hlist_del(&zv->zv_hlink);
 1273         hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
 1274 
 1275         if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 1276                 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
 1277                 struct g_provider *pp = zsg->zsg_provider;
 1278                 struct g_geom *gp;
 1279 
 1280                 g_topology_lock();
 1281                 gp = pp->geom;
 1282                 ASSERT3P(gp, !=, NULL);
 1283 
 1284                 zsg->zsg_provider = NULL;
 1285                 g_wither_provider(pp, ENXIO);
 1286 
 1287                 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
 1288                 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
 1289                 pp->sectorsize = DEV_BSIZE;
 1290                 pp->mediasize = zv->zv_volsize;
 1291                 pp->private = zv;
 1292                 zsg->zsg_provider = pp;
 1293                 g_error_provider(pp, 0);
 1294                 g_topology_unlock();
 1295         } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
 1296                 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
 1297                 struct cdev *dev;
 1298                 struct make_dev_args args;
 1299 
 1300                 dev = zsd->zsd_cdev;
 1301                 if (dev != NULL) {
 1302                         destroy_dev(dev);
 1303                         dev = zsd->zsd_cdev = NULL;
 1304                         if (zv->zv_open_count > 0) {
 1305                                 zv->zv_flags &= ~ZVOL_EXCL;
 1306                                 zv->zv_open_count = 0;
 1307                                 /* XXX  need suspend lock but lock order */
 1308                                 zvol_last_close(zv);
 1309                         }
 1310                 }
 1311 
 1312                 make_dev_args_init(&args);
 1313                 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
 1314                 args.mda_devsw = &zvol_cdevsw;
 1315                 args.mda_cr = NULL;
 1316                 args.mda_uid = UID_ROOT;
 1317                 args.mda_gid = GID_OPERATOR;
 1318                 args.mda_mode = 0640;
 1319                 args.mda_si_drv2 = zv;
 1320                 if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
 1321                     == 0) {
 1322 #if __FreeBSD_version > 1300130
 1323                         dev->si_iosize_max = maxphys;
 1324 #else
 1325                         dev->si_iosize_max = MAXPHYS;
 1326 #endif
 1327                         zsd->zsd_cdev = dev;
 1328                 }
 1329         }
 1330         strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
 1331 }
 1332 
 1333 /*
 1334  * Remove minor node for the specified volume.
 1335  */
 1336 void
 1337 zvol_os_free(zvol_state_t *zv)
 1338 {
 1339         ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
 1340         ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
 1341         ASSERT0(zv->zv_open_count);
 1342 
 1343         ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
 1344 
 1345         rw_destroy(&zv->zv_suspend_lock);
 1346         zfs_rangelock_fini(&zv->zv_rangelock);
 1347 
 1348         if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 1349                 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
 1350                 struct g_provider *pp __maybe_unused = zsg->zsg_provider;
 1351 
 1352                 ASSERT3P(pp->private, ==, NULL);
 1353 
 1354                 g_topology_lock();
 1355                 zvol_geom_destroy(zv);
 1356                 g_topology_unlock();
 1357                 mtx_destroy(&zsg->zsg_queue_mtx);
 1358         } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
 1359                 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
 1360                 struct cdev *dev = zsd->zsd_cdev;
 1361 
 1362                 if (dev != NULL) {
 1363                         ASSERT3P(dev->si_drv2, ==, NULL);
 1364                         destroy_dev(dev);
 1365                         knlist_clear(&zsd->zsd_selinfo.si_note, 0);
 1366                         knlist_destroy(&zsd->zsd_selinfo.si_note);
 1367                 }
 1368         }
 1369 
 1370         mutex_destroy(&zv->zv_state_lock);
 1371         dataset_kstats_destroy(&zv->zv_kstat);
 1372         kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
 1373         kmem_free(zv, sizeof (zvol_state_t));
 1374         zvol_minors--;
 1375 }
 1376 
 1377 /*
 1378  * Create a minor node (plus a whole lot more) for the specified volume.
 1379  */
 1380 int
 1381 zvol_os_create_minor(const char *name)
 1382 {
 1383         zvol_state_t *zv;
 1384         objset_t *os;
 1385         dmu_object_info_t *doi;
 1386         uint64_t volsize;
 1387         uint64_t volmode, hash;
 1388         int error;
 1389         bool replayed_zil = B_FALSE;
 1390 
 1391         ZFS_LOG(1, "Creating ZVOL %s...", name);
 1392         hash = zvol_name_hash(name);
 1393         if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
 1394                 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 1395                 mutex_exit(&zv->zv_state_lock);
 1396                 return (SET_ERROR(EEXIST));
 1397         }
 1398 
 1399         DROP_GIANT();
 1400 
 1401         doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
 1402 
 1403         /* Lie and say we're read-only. */
 1404         error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
 1405         if (error)
 1406                 goto out_doi;
 1407 
 1408         error = dmu_object_info(os, ZVOL_OBJ, doi);
 1409         if (error)
 1410                 goto out_dmu_objset_disown;
 1411 
 1412         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 1413         if (error)
 1414                 goto out_dmu_objset_disown;
 1415 
 1416         error = dsl_prop_get_integer(name,
 1417             zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
 1418         if (error || volmode == ZFS_VOLMODE_DEFAULT)
 1419                 volmode = zvol_volmode;
 1420         error = 0;
 1421 
 1422         /*
 1423          * zvol_alloc equivalent ...
 1424          */
 1425         zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
 1426         zv->zv_hash = hash;
 1427         mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
 1428         zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
 1429         zv->zv_volmode = volmode;
 1430         if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 1431                 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
 1432                 struct g_provider *pp;
 1433                 struct g_geom *gp;
 1434 
 1435                 zsg->zsg_state = ZVOL_GEOM_UNINIT;
 1436                 mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
 1437 
 1438                 g_topology_lock();
 1439                 gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
 1440                 gp->start = zvol_geom_bio_start;
 1441                 gp->access = zvol_geom_access;
 1442                 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
 1443                 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
 1444                 pp->sectorsize = DEV_BSIZE;
 1445                 pp->mediasize = 0;
 1446                 pp->private = zv;
 1447 
 1448                 zsg->zsg_provider = pp;
 1449                 bioq_init(&zsg->zsg_queue);
 1450         } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
 1451                 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
 1452                 struct cdev *dev;
 1453                 struct make_dev_args args;
 1454 
 1455                 make_dev_args_init(&args);
 1456                 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
 1457                 args.mda_devsw = &zvol_cdevsw;
 1458                 args.mda_cr = NULL;
 1459                 args.mda_uid = UID_ROOT;
 1460                 args.mda_gid = GID_OPERATOR;
 1461                 args.mda_mode = 0640;
 1462                 args.mda_si_drv2 = zv;
 1463                 if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
 1464                     == 0) {
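                              /*
                               * FreeBSD 13 replaced the MAXPHYS compile-time
                               * constant with the maxphys runtime variable;
                               * use whichever this kernel provides.
                               */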
 1465 #if __FreeBSD_version > 1300130
 1466                         dev->si_iosize_max = maxphys;
 1467 #else
 1468                         dev->si_iosize_max = MAXPHYS;
 1469 #endif
 1470                         zsd->zsd_cdev = dev;
 1471                         knlist_init_sx(&zsd->zsd_selinfo.si_note,
 1472                             &zv->zv_state_lock);
 1473                 }
 1474         }
 1475         (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 1476         rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
 1477         zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
 1478 
 1479         if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 1480                 zv->zv_flags |= ZVOL_RDONLY;
 1481 
 1482         zv->zv_volblocksize = doi->doi_data_block_size;
 1483         zv->zv_volsize = volsize;
 1484         zv->zv_objset = os;
 1485 
 1486         ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
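              /*
               * N.B.: if dataset_kstats_create() fails, the goto below only
               * disowns the objset; in the GEOM case the topology lock taken
               * earlier is still held at that point and zv itself is not
               * freed.
               */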
 1487         error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
 1488         if (error)
 1489                 goto out_dmu_objset_disown;
 1490         ASSERT3P(zv->zv_zilog, ==, NULL);
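              /*
               * Open the ZIL just long enough to replay (or, with replay
               * disabled, destroy) any outstanding log records while the
               * objset is owned; the reference is dropped again before the
               * dataset is disowned below and re-owned on first open.
               */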
 1491         zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 1492         if (spa_writeable(dmu_objset_spa(os))) {
 1493                 if (zil_replay_disable)
 1494                         replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
 1495                 else
 1496                         replayed_zil = zil_replay(os, zv, zvol_replay_vector);
 1497         }
 1498         if (replayed_zil)
 1499                 zil_close(zv->zv_zilog);
 1500         zv->zv_zilog = NULL;
 1501 
 1502         /* TODO: prefetch for geom tasting */
 1503 
 1504         zv->zv_objset = NULL;
 1505 out_dmu_objset_disown:
 1506         dmu_objset_disown(os, B_TRUE, FTAG);
 1507 
 1508         if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
 1509                 zvol_geom_run(zv);
 1510                 g_topology_unlock();
 1511         }
 1512 out_doi:
 1513         kmem_free(doi, sizeof (dmu_object_info_t));
 1514         if (error == 0) {
 1515                 rw_enter(&zvol_state_lock, RW_WRITER);
 1516                 zvol_insert(zv);
 1517                 zvol_minors++;
 1518                 rw_exit(&zvol_state_lock);
 1519                 ZFS_LOG(1, "ZVOL %s created.", name);
 1520         }
 1521         PICKUP_GIANT();
 1522         return (error);
 1523 }
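
Minor creation is normally driven from the platform-independent code
(zvol_create_minor() in module/zfs/zvol.c).  A hypothetical direct caller
only needs the dataset name and should treat EEXIST as benign (a sketch,
not taken from the common code):

	int error = zvol_os_create_minor("tank/vol");

	if (error != 0 && error != EEXIST)
		printf("zvol: cannot create minor: %d\n", error);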
 1524 
 1525 void
 1526 zvol_os_clear_private(zvol_state_t *zv)
 1527 {
 1528         ASSERT(RW_LOCK_HELD(&zvol_state_lock));
 1529         if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 1530                 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
 1531                 struct g_provider *pp = zsg->zsg_provider;
 1532 
 1533                 if (pp->private == NULL) /* already cleared */
 1534                         return;
 1535 
 1536                 mtx_lock(&zsg->zsg_queue_mtx);
 1537                 zsg->zsg_state = ZVOL_GEOM_STOPPED;
 1538                 pp->private = NULL;
 1539                 wakeup_one(&zsg->zsg_queue);
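                      /*
                       * The worker started by zvol_geom_run() acknowledges
                       * the stop request by setting zsg_state back to
                       * ZVOL_GEOM_RUNNING just before it exits, so RUNNING
                       * here means the worker has gone away, not that it is
                       * still active.
                       */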
 1540                 while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
 1541                         msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
 1542                             0, "zvol:w", 0);
 1543                 mtx_unlock(&zsg->zsg_queue_mtx);
 1544                 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
 1545         } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
 1546                 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
 1547                 struct cdev *dev = zsd->zsd_cdev;
 1548 
 1549                 if (dev != NULL)
 1550                         dev->si_drv2 = NULL;
 1551         }
 1552 }
 1553 
 1554 int
 1555 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
 1556 {
 1557         zv->zv_volsize = volsize;
 1558         if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
 1559                 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
 1560                 struct g_provider *pp = zsg->zsg_provider;
 1561 
 1562                 g_topology_lock();
 1563 
 1564                 if (pp->private == NULL) {
 1565                         g_topology_unlock();
 1566                         return (SET_ERROR(ENXIO));
 1567                 }
 1568 
 1569                 /*
 1570                  * Do not post a resize event when the initial size was
 1571                  * zero: the ZVOL sets its size on first open, so this
 1572                  * is initialization rather than a real resize.
 1573                  */
 1574                 if (pp->mediasize == 0)
 1575                         pp->mediasize = zv->zv_volsize;
 1576                 else
 1577                         g_resize_provider(pp, zv->zv_volsize);
 1578 
 1579                 g_topology_unlock();
 1580         } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
 1581                 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
 1582 
 1583                 KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
 1584         }
 1585         return (0);
 1586 }
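
For volmode=dev, the KNOTE_UNLOCKED() above implies that a userland
consumer can watch a zvol character device for resizes with kqueue(2).
A minimal sketch, assuming a dev-mode volume at the hypothetical path
/dev/zvol/tank/vol and that the cdev's kqfilter accepts EVFILT_VNODE:

	#include <sys/types.h>
	#include <sys/event.h>
	#include <err.h>
	#include <fcntl.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct kevent ev;
		int fd, kq;

		if ((fd = open("/dev/zvol/tank/vol", O_RDONLY)) == -1)
			err(1, "open");
		if ((kq = kqueue()) == -1)
			err(1, "kqueue");
		EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
		    NOTE_ATTRIB, 0, NULL);
		if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
			err(1, "kevent register");
		/* Blocks until the volume's attributes change. */
		if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
			printf("volume attributes changed\n");
		return (0);
	}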
 1587 
 1588 void
 1589 zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
 1590 {
 1591         // No-op on FreeBSD; the Linux port calls set_disk_ro(zv->zv_zso->zvo_disk, flags) here.
 1592 }
 1593 
 1594 void
 1595 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
 1596 {
 1597         // No-op on FreeBSD; the Linux port calls set_capacity(zv->zv_zso->zvo_disk, capacity) here.
 1598 }
 1599 
 1600 /*
 1601  * Public interfaces
 1602  */
 1603 
 1604 int
 1605 zvol_busy(void)
 1606 {
 1607         return (zvol_minors != 0);
 1608 }
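
zvol_busy() is the hook the rest of the module can use to refuse to tear
down while any minor exists.  A hypothetical unload-time check (a sketch;
the real module-event handling lives elsewhere):

	static int
	zfs_modevent_sketch(int what)
	{
		if (what == MOD_UNLOAD && zvol_busy())
			return (EBUSY);	/* minors still exist */
		return (0);
	}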
 1609 
 1610 int
 1611 zvol_init(void)
 1612 {
 1613         zvol_init_impl();
 1614         return (0);
 1615 }
 1616 
 1617 void
 1618 zvol_fini(void)
 1619 {
 1620         zvol_fini_impl();
 1621 }
