The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/zvol.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
   23  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   24  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
   25  * LLNL-CODE-403049.
   26  *
   27  * ZFS volume emulation driver.
   28  *
   29  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
   30  * Volumes are accessed through the symbolic links named:
   31  *
   32  * /dev/<pool_name>/<dataset_name>
   33  *
   34  * Volumes are persistent through reboot and module load.  No user command
   35  * needs to be run before opening and using a device.
   36  *
   37  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
   38  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
   39  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
   40  */
   41 
   42 /*
   43  * Note on locking of zvol state structures.
   44  *
   45  * These structures are used to maintain internal state used to emulate block
   46  * devices on top of zvols. In particular, management of device minor number
   47  * operations - create, remove, rename, and set_snapdev - involves access to
   48  * these structures. The zvol_state_lock is primarily used to protect the
   49  * zvol_state_list. The zv->zv_state_lock is used to protect the contents
   50  * of the zvol_state_t structures, as well as to make sure that when the
   51  * time comes to remove the structure from the list, it is not in use, and
   52  * therefore, it can be taken off zvol_state_list and freed.
   53  *
   54  * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
   55  * e.g. for the duration of receive and rollback operations. This lock can be
   56  * held for significant periods of time. Given that it is undesirable to hold
   57  * mutexes for long periods of time, the following lock ordering applies:
   58  * - take zvol_state_lock if necessary, to protect zvol_state_list
   59  * - take zv_suspend_lock if necessary, by the code path in question
   60  * - take zv_state_lock to protect zvol_state_t
   61  *
   62  * The minor operations are issued to spa->spa_zvol_taskq queues, that are
   63  * single-threaded (to preserve order of minor operations), and are executed
   64  * through the zvol_task_cb that dispatches the specific operations. Therefore,
   65  * these operations are serialized per pool. Consequently, we can be certain
   66  * that for a given zvol, there is only one operation at a time in progress.
   67  * That is why one can be sure that first, zvol_state_t for a given zvol is
   68  * allocated and placed on zvol_state_list, and then other minor operations
   69  * for this zvol are going to proceed in the order of issue.
   70  *
   71  */
   72 
   73 #include <sys/dataset_kstats.h>
   74 #include <sys/dbuf.h>
   75 #include <sys/dmu_traverse.h>
   76 #include <sys/dsl_dataset.h>
   77 #include <sys/dsl_prop.h>
   78 #include <sys/dsl_dir.h>
   79 #include <sys/zap.h>
   80 #include <sys/zfeature.h>
   81 #include <sys/zil_impl.h>
   82 #include <sys/dmu_tx.h>
   83 #include <sys/zio.h>
   84 #include <sys/zfs_rlock.h>
   85 #include <sys/spa_impl.h>
   86 #include <sys/zvol.h>
   87 #include <sys/zvol_impl.h>
   88 
/*
 * NOTE(review): neither of the next two variables is referenced in the
 * visible part of this file; presumably they are tunables consulted when
 * minors are created -- confirm against the callers.
 */
unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;

/*
 * Hash table of zvol_state_t entries, chained through zv_hlink.
 * NOTE(review): presumably the table that ZVOL_HT_HEAD() indexes -- confirm.
 */
struct hlist_head *zvol_htable;
/* List of every zvol_state_t; maintained by zvol_insert()/zvol_remove(). */
static list_t zvol_state_list;
/* Protects zvol_state_list (see the locking note near the top of the file). */
krwlock_t zvol_state_lock;
   95 
/*
 * Identifies the kind of asynchronous minor operation being requested.
 * NOTE(review): the code that dispatches on these values is outside the
 * visible part of the file -- confirm how each one is handled.
 */
typedef enum {
        ZVOL_ASYNC_REMOVE_MINORS,
        ZVOL_ASYNC_RENAME_MINORS,
        ZVOL_ASYNC_SET_SNAPDEV,
        ZVOL_ASYNC_SET_VOLMODE,
        /* NOTE(review): looks like a count sentinel, not an operation. */
        ZVOL_ASYNC_MAX
} zvol_async_op_t;
  103 
/*
 * Argument bundle describing one asynchronous operation.
 * NOTE(review): the fields are not used in the visible part of the file;
 * the per-field notes below are assumptions to be confirmed.
 */
typedef struct {
        zvol_async_op_t op;             /* operation to perform */
        char name1[MAXNAMELEN];         /* presumably the dataset name */
        char name2[MAXNAMELEN];         /* presumably the new name (rename) */
        uint64_t value;                 /* presumably the property value */
} zvol_task_t;
  110 
  111 uint64_t
  112 zvol_name_hash(const char *name)
  113 {
  114         int i;
  115         uint64_t crc = -1ULL;
  116         const uint8_t *p = (const uint8_t *)name;
  117         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
  118         for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) {
  119                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF];
  120         }
  121         return (crc);
  122 }
  123 
  124 /*
  125  * Find a zvol_state_t given the name and hash generated by zvol_name_hash.
  126  * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
  127  * return (NULL) without the taking locks. The zv_suspend_lock is always taken
  128  * before zv_state_lock. The mode argument indicates the mode (including none)
  129  * for zv_suspend_lock to be taken.
  130  */
  131 zvol_state_t *
  132 zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
  133 {
  134         zvol_state_t *zv;
  135         struct hlist_node *p = NULL;
  136 
  137         rw_enter(&zvol_state_lock, RW_READER);
  138         hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
  139                 zv = hlist_entry(p, zvol_state_t, zv_hlink);
  140                 mutex_enter(&zv->zv_state_lock);
  141                 if (zv->zv_hash == hash &&
  142                     strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
  143                         /*
  144                          * this is the right zvol, take the locks in the
  145                          * right order
  146                          */
  147                         if (mode != RW_NONE &&
  148                             !rw_tryenter(&zv->zv_suspend_lock, mode)) {
  149                                 mutex_exit(&zv->zv_state_lock);
  150                                 rw_enter(&zv->zv_suspend_lock, mode);
  151                                 mutex_enter(&zv->zv_state_lock);
  152                                 /*
  153                                  * zvol cannot be renamed as we continue
  154                                  * to hold zvol_state_lock
  155                                  */
  156                                 ASSERT(zv->zv_hash == hash &&
  157                                     strncmp(zv->zv_name, name, MAXNAMELEN)
  158                                     == 0);
  159                         }
  160                         rw_exit(&zvol_state_lock);
  161                         return (zv);
  162                 }
  163                 mutex_exit(&zv->zv_state_lock);
  164         }
  165         rw_exit(&zvol_state_lock);
  166 
  167         return (NULL);
  168 }
  169 
  170 /*
  171  * Find a zvol_state_t given the name.
  172  * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
  173  * return (NULL) without the taking locks. The zv_suspend_lock is always taken
  174  * before zv_state_lock. The mode argument indicates the mode (including none)
  175  * for zv_suspend_lock to be taken.
  176  */
  177 static zvol_state_t *
  178 zvol_find_by_name(const char *name, int mode)
  179 {
  180         return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
  181 }
  182 
  183 /*
  184  * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
  185  */
  186 void
  187 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
  188 {
  189         zfs_creat_t *zct = arg;
  190         nvlist_t *nvprops = zct->zct_props;
  191         int error;
  192         uint64_t volblocksize, volsize;
  193 
  194         VERIFY(nvlist_lookup_uint64(nvprops,
  195             zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
  196         if (nvlist_lookup_uint64(nvprops,
  197             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
  198                 volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
  199 
  200         /*
  201          * These properties must be removed from the list so the generic
  202          * property setting step won't apply to them.
  203          */
  204         VERIFY(nvlist_remove_all(nvprops,
  205             zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
  206         (void) nvlist_remove_all(nvprops,
  207             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
  208 
  209         error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
  210             DMU_OT_NONE, 0, tx);
  211         ASSERT(error == 0);
  212 
  213         error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
  214             DMU_OT_NONE, 0, tx);
  215         ASSERT(error == 0);
  216 
  217         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
  218         ASSERT(error == 0);
  219 }
  220 
  221 /*
  222  * ZFS_IOC_OBJSET_STATS entry point.
  223  */
  224 int
  225 zvol_get_stats(objset_t *os, nvlist_t *nv)
  226 {
  227         int error;
  228         dmu_object_info_t *doi;
  229         uint64_t val;
  230 
  231         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
  232         if (error)
  233                 return (SET_ERROR(error));
  234 
  235         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
  236         doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
  237         error = dmu_object_info(os, ZVOL_OBJ, doi);
  238 
  239         if (error == 0) {
  240                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
  241                     doi->doi_data_block_size);
  242         }
  243 
  244         kmem_free(doi, sizeof (dmu_object_info_t));
  245 
  246         return (SET_ERROR(error));
  247 }
  248 
  249 /*
  250  * Sanity check volume size.
  251  */
  252 int
  253 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
  254 {
  255         if (volsize == 0)
  256                 return (SET_ERROR(EINVAL));
  257 
  258         if (volsize % blocksize != 0)
  259                 return (SET_ERROR(EINVAL));
  260 
  261 #ifdef _ILP32
  262         if (volsize - 1 > SPEC_MAXOFFSET_T)
  263                 return (SET_ERROR(EOVERFLOW));
  264 #endif
  265         return (0);
  266 }
  267 
  268 /*
  269  * Ensure the zap is flushed then inform the VFS of the capacity change.
  270  */
  271 static int
  272 zvol_update_volsize(uint64_t volsize, objset_t *os)
  273 {
  274         dmu_tx_t *tx;
  275         int error;
  276         uint64_t txg;
  277 
  278         tx = dmu_tx_create(os);
  279         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
  280         dmu_tx_mark_netfree(tx);
  281         error = dmu_tx_assign(tx, TXG_WAIT);
  282         if (error) {
  283                 dmu_tx_abort(tx);
  284                 return (SET_ERROR(error));
  285         }
  286         txg = dmu_tx_get_txg(tx);
  287 
  288         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
  289             &volsize, tx);
  290         dmu_tx_commit(tx);
  291 
  292         txg_wait_synced(dmu_objset_pool(os), txg);
  293 
  294         if (error == 0)
  295                 error = dmu_free_long_range(os,
  296                     ZVOL_OBJ, volsize, DMU_OBJECT_END);
  297 
  298         return (error);
  299 }
  300 
/*
 * ZFS_PROP_VOLSIZE set entry point.  Note that modifying the volume
 * size will result in a udev "change" event being generated.
 *
 * name:    dataset name of the volume
 * volsize: requested size; checked with zvol_check_volsize() against the
 *          data block size reported for ZVOL_OBJ
 *
 * Returns 0 on success. Returns EROFS when the readonly property is set,
 * otherwise the error from whichever step failed (property lookup, objset
 * ownership, object info query, size check, or zvol_update_volsize()).
 */
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
        objset_t *os = NULL;
        uint64_t readonly;
        int error;
        boolean_t owned = B_FALSE;

        error = dsl_prop_get_integer(name,
            zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
        if (error != 0)
                return (SET_ERROR(error));
        if (readonly)
                return (SET_ERROR(EROFS));

        /*
         * On a hit zv is returned with zv_suspend_lock (as reader) and
         * zv_state_lock both held.
         */
        zvol_state_t *zv = zvol_find_by_name(name, RW_READER);

        ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
            RW_READ_HELD(&zv->zv_suspend_lock)));

        if (zv == NULL || zv->zv_objset == NULL) {
                /*
                 * Either there is no zvol_state_t or it has no objset
                 * attached: take temporary ownership of the objset here.
                 * The suspend lock is dropped first; zv_state_lock (if
                 * zv exists) stays held.
                 */
                if (zv != NULL)
                        rw_exit(&zv->zv_suspend_lock);
                if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
                    FTAG, &os)) != 0) {
                        if (zv != NULL)
                                mutex_exit(&zv->zv_state_lock);
                        return (SET_ERROR(error));
                }
                owned = B_TRUE;
                if (zv != NULL)
                        zv->zv_objset = os;
        } else {
                /* Reuse the objset already attached to the zvol. */
                os = zv->zv_objset;
        }

        dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);

        if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
            (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
                goto out;

        error = zvol_update_volsize(volsize, os);
        if (error == 0 && zv != NULL) {
                zv->zv_volsize = volsize;
                zv->zv_changed = 1;
        }
out:
        kmem_free(doi, sizeof (dmu_object_info_t));

        if (owned) {
                /*
                 * Undo the temporary ownership taken above. The suspend
                 * lock was already released on this path.
                 */
                dmu_objset_disown(os, B_TRUE, FTAG);
                if (zv != NULL)
                        zv->zv_objset = NULL;
        } else {
                /*
                 * owned is only B_FALSE when the else branch above ran,
                 * so zv is non-NULL here and still holds the suspend lock.
                 */
                rw_exit(&zv->zv_suspend_lock);
        }

        if (zv != NULL)
                mutex_exit(&zv->zv_state_lock);

        /* Called only after zv_state_lock has been dropped. */
        if (error == 0 && zv != NULL)
                zvol_os_update_volsize(zv, volsize);

        return (SET_ERROR(error));
}
  371 
  372 /*
  373  * Sanity check volume block size.
  374  */
  375 int
  376 zvol_check_volblocksize(const char *name, uint64_t volblocksize)
  377 {
  378         /* Record sizes above 128k need the feature to be enabled */
  379         if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
  380                 spa_t *spa;
  381                 int error;
  382 
  383                 if ((error = spa_open(name, &spa, FTAG)) != 0)
  384                         return (error);
  385 
  386                 if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
  387                         spa_close(spa, FTAG);
  388                         return (SET_ERROR(ENOTSUP));
  389                 }
  390 
  391                 /*
  392                  * We don't allow setting the property above 1MB,
  393                  * unless the tunable has been changed.
  394                  */
  395                 if (volblocksize > zfs_max_recordsize)
  396                         return (SET_ERROR(EDOM));
  397 
  398                 spa_close(spa, FTAG);
  399         }
  400 
  401         if (volblocksize < SPA_MINBLOCKSIZE ||
  402             volblocksize > SPA_MAXBLOCKSIZE ||
  403             !ISP2(volblocksize))
  404                 return (SET_ERROR(EDOM));
  405 
  406         return (0);
  407 }
  408 
  409 /*
  410  * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
  411  * implement DKIOCFREE/free-long-range.
  412  */
  413 static int
  414 zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
  415 {
  416         zvol_state_t *zv = arg1;
  417         lr_truncate_t *lr = arg2;
  418         uint64_t offset, length;
  419 
  420         if (byteswap)
  421                 byteswap_uint64_array(lr, sizeof (*lr));
  422 
  423         offset = lr->lr_offset;
  424         length = lr->lr_length;
  425 
  426         dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
  427         dmu_tx_mark_netfree(tx);
  428         int error = dmu_tx_assign(tx, TXG_WAIT);
  429         if (error != 0) {
  430                 dmu_tx_abort(tx);
  431         } else {
  432                 (void) zil_replaying(zv->zv_zilog, tx);
  433                 dmu_tx_commit(tx);
  434                 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset,
  435                     length);
  436         }
  437 
  438         return (error);
  439 }
  440 
  441 /*
  442  * Replay a TX_WRITE ZIL transaction that didn't get committed
  443  * after a system failure
  444  */
  445 static int
  446 zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
  447 {
  448         zvol_state_t *zv = arg1;
  449         lr_write_t *lr = arg2;
  450         objset_t *os = zv->zv_objset;
  451         char *data = (char *)(lr + 1);  /* data follows lr_write_t */
  452         uint64_t offset, length;
  453         dmu_tx_t *tx;
  454         int error;
  455 
  456         if (byteswap)
  457                 byteswap_uint64_array(lr, sizeof (*lr));
  458 
  459         offset = lr->lr_offset;
  460         length = lr->lr_length;
  461 
  462         /* If it's a dmu_sync() block, write the whole block */
  463         if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
  464                 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
  465                 if (length < blocksize) {
  466                         offset -= offset % blocksize;
  467                         length = blocksize;
  468                 }
  469         }
  470 
  471         tx = dmu_tx_create(os);
  472         dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
  473         error = dmu_tx_assign(tx, TXG_WAIT);
  474         if (error) {
  475                 dmu_tx_abort(tx);
  476         } else {
  477                 dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
  478                 (void) zil_replaying(zv->zv_zilog, tx);
  479                 dmu_tx_commit(tx);
  480         }
  481 
  482         return (error);
  483 }
  484 
  485 static int
  486 zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
  487 {
  488         (void) arg1, (void) arg2, (void) byteswap;
  489         return (SET_ERROR(ENOTSUP));
  490 }
  491 
/*
 * Callback vectors for replaying records.
 * Only TX_WRITE and TX_TRUNCATE are needed for zvol; every other slot
 * points at zvol_replay_err().
 *
 * The table is positional: each entry's trailing comment names the
 * transaction type that slot is meant for.
 * NOTE(review): the numeric TX_* values are defined outside this file --
 * confirm the order still matches when transaction types are added.
 */
zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
        zvol_replay_err,        /* no such transaction type */
        zvol_replay_err,        /* TX_CREATE */
        zvol_replay_err,        /* TX_MKDIR */
        zvol_replay_err,        /* TX_MKXATTR */
        zvol_replay_err,        /* TX_SYMLINK */
        zvol_replay_err,        /* TX_REMOVE */
        zvol_replay_err,        /* TX_RMDIR */
        zvol_replay_err,        /* TX_LINK */
        zvol_replay_err,        /* TX_RENAME */
        zvol_replay_write,      /* TX_WRITE */
        zvol_replay_truncate,   /* TX_TRUNCATE */
        zvol_replay_err,        /* TX_SETATTR */
        zvol_replay_err,        /* TX_ACL */
        zvol_replay_err,        /* TX_CREATE_ATTR */
        zvol_replay_err,        /* TX_CREATE_ACL_ATTR */
        zvol_replay_err,        /* TX_MKDIR_ACL */
        zvol_replay_err,        /* TX_MKDIR_ATTR */
        zvol_replay_err,        /* TX_MKDIR_ACL_ATTR */
        zvol_replay_err,        /* TX_WRITE2 */
        zvol_replay_err,        /* TX_SETSAXATTR */
        zvol_replay_err,        /* TX_RENAME_EXCHANGE */
        zvol_replay_err,        /* TX_RENAME_WHITEOUT */
};
  520 
/*
 * zvol_log_write() logs a write to the ZIL using TX_WRITE transactions.
 *
 * We store data in the log buffers if it's small enough.
 * Otherwise we will later flush the data out via dmu_sync().
 *
 * zvol_immediate_write_sz is the block-size threshold used below when
 * deciding whether a write is logged as WR_INDIRECT.
 */
static const ssize_t zvol_immediate_write_sz = 32768;

/*
 * zv:     volume being written
 * tx:     transaction the write belongs to
 * offset: starting byte offset of the write
 * size:   length of the write in bytes
 * sync:   stored in each itx as itx_sync; also selects WR_COPIED over
 *         WR_NEED_COPY
 *
 * Does nothing when zil_replaying() reports true for this zilog and tx.
 * Otherwise the range is split into one or more itxs, each assigned to
 * the zilog.
 */
void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
    uint64_t size, int sync)
{
        uint32_t blocksize = zv->zv_volblocksize;
        zilog_t *zilog = zv->zv_zilog;
        itx_wr_state_t write_state;
        uint64_t sz = size;     /* original size; size is consumed below */

        if (zil_replaying(zilog, tx))
                return;

        /*
         * Pick the default write state for the whole request:
         *  - logbias=throughput                      -> WR_INDIRECT
         *  - no slogs, at least one block long, and
         *    block size above the immediate limit    -> WR_INDIRECT
         *  - sync                                    -> WR_COPIED
         *  - anything else                           -> WR_NEED_COPY
         */
        if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
                write_state = WR_INDIRECT;
        else if (!spa_has_slogs(zilog->zl_spa) &&
            size >= blocksize && blocksize > zvol_immediate_write_sz)
                write_state = WR_INDIRECT;
        else if (sync)
                write_state = WR_COPIED;
        else
                write_state = WR_NEED_COPY;

        while (size) {
                itx_t *itx;
                lr_write_t *lr;
                itx_wr_state_t wr_state = write_state;
                ssize_t len = size;

                /*
                 * A copied write larger than zil_max_copied_data() is
                 * demoted to WR_NEED_COPY; an indirect write is cut at
                 * the end of the block containing offset.
                 */
                if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog))
                        wr_state = WR_NEED_COPY;
                else if (wr_state == WR_INDIRECT)
                        len = MIN(blocksize - P2PHASE(offset, blocksize), size);

                /* Only WR_COPIED reserves room for the data after the lr. */
                itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
                    (wr_state == WR_COPIED ? len : 0));
                lr = (lr_write_t *)&itx->itx_lr;
                if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
                    offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
                        /*
                         * Reading the data into the itx failed: discard
                         * it and fall back to a data-less WR_NEED_COPY itx.
                         */
                        zil_itx_destroy(itx);
                        itx = zil_itx_create(TX_WRITE, sizeof (*lr));
                        lr = (lr_write_t *)&itx->itx_lr;
                        wr_state = WR_NEED_COPY;
                }

                itx->itx_wr_state = wr_state;
                lr->lr_foid = ZVOL_OBJ;
                lr->lr_offset = offset;
                lr->lr_length = len;
                lr->lr_blkoff = 0;
                BP_ZERO(&lr->lr_blkptr);

                itx->itx_private = zv;
                itx->itx_sync = sync;

                (void) zil_itx_assign(zilog, itx, tx);

                offset += len;
                size -= len;
        }

        /* Accounted against the request-wide state, not the per-itx one. */
        if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
                dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg);
        }
}
  593 
  594 /*
  595  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
  596  */
  597 void
  598 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
  599     boolean_t sync)
  600 {
  601         itx_t *itx;
  602         lr_truncate_t *lr;
  603         zilog_t *zilog = zv->zv_zilog;
  604 
  605         if (zil_replaying(zilog, tx))
  606                 return;
  607 
  608         itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
  609         lr = (lr_truncate_t *)&itx->itx_lr;
  610         lr->lr_foid = ZVOL_OBJ;
  611         lr->lr_offset = off;
  612         lr->lr_length = len;
  613 
  614         itx->itx_sync = sync;
  615         zil_itx_assign(zilog, itx, tx);
  616 }
  617 
  618 
  619 static void
  620 zvol_get_done(zgd_t *zgd, int error)
  621 {
  622         (void) error;
  623         if (zgd->zgd_db)
  624                 dmu_buf_rele(zgd->zgd_db, zgd);
  625 
  626         zfs_rangelock_exit(zgd->zgd_lr);
  627 
  628         kmem_free(zgd, sizeof (zgd_t));
  629 }
  630 
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 * arg:  the zvol_state_t for the volume
 * arg2: not used by this function
 * lr:   the write record; supplies offset and length, and its lr_blkptr
 *       is handed to dmu_sync() through the zgd for indirect writes
 * buf:  destination for an immediate write, or NULL for an indirect one
 * lwb, zio: must be non-NULL (asserted)
 *
 * Returns 0 as soon as dmu_sync() succeeds on the indirect path; the zgd
 * is then left for the zvol_get_done callback passed to dmu_sync().
 * In every other case zvol_get_done() is called here and the error (0 for
 * a successful immediate read) is returned.
 */
int
zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
        zvol_state_t *zv = arg;
        uint64_t offset = lr->lr_offset;
        uint64_t size = lr->lr_length;
        dmu_buf_t *db;
        zgd_t *zgd;
        int error;

        ASSERT3P(lwb, !=, NULL);
        ASSERT3P(zio, !=, NULL);
        ASSERT3U(size, !=, 0);

        /* Zeroed, so zgd_db stays NULL unless a dbuf is held below. */
        zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
        zgd->zgd_lwb = lwb;

        /*
         * Write records come in two flavors: immediate and indirect.
         * For small writes it's cheaper to store the data with the
         * log record (immediate); for large writes it's cheaper to
         * sync the data and get a pointer to it (indirect) so that
         * we don't have to write the data twice.
         */
        if (buf != NULL) { /* immediate write */
                zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
                    size, RL_READER);
                error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
                    DMU_READ_NO_PREFETCH);
        } else { /* indirect write */
                /*
                 * Have to lock the whole block to ensure when it's written out
                 * and its checksum is being calculated that no one can change
                 * the data. Contrarily to zfs_get_data we need not re-check
                 * blocksize after we get the lock because it cannot be changed.
                 */
                size = zv->zv_volblocksize;
                offset = P2ALIGN_TYPED(offset, size, uint64_t);
                zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
                    size, RL_READER);
                error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
                    DMU_READ_NO_PREFETCH);
                if (error == 0) {
                        blkptr_t *bp = &lr->lr_blkptr;

                        zgd->zgd_db = db;
                        zgd->zgd_bp = bp;

                        ASSERT(db != NULL);
                        ASSERT(db->db_offset == offset);
                        ASSERT(db->db_size == size);

                        error = dmu_sync(zio, lr->lr_common.lrc_txg,
                            zvol_get_done, zgd);

                        /* Success: cleanup is left to the callback. */
                        if (error == 0)
                                return (0);
                }
        }

        /* Immediate write finished, or the indirect path failed. */
        zvol_get_done(zgd, error);

        return (SET_ERROR(error));
}
  699 
  700 /*
  701  * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
  702  */
  703 
  704 void
  705 zvol_insert(zvol_state_t *zv)
  706 {
  707         ASSERT(RW_WRITE_HELD(&zvol_state_lock));
  708         list_insert_head(&zvol_state_list, zv);
  709         hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
  710 }
  711 
  712 /*
  713  * Simply remove the zvol from to list of zvols.
  714  */
  715 static void
  716 zvol_remove(zvol_state_t *zv)
  717 {
  718         ASSERT(RW_WRITE_HELD(&zvol_state_lock));
  719         list_remove(&zvol_state_list, zv);
  720         hlist_del(&zv->zv_hlink);
  721 }
  722 
  723 /*
  724  * Setup zv after we just own the zv->objset
  725  */
  726 static int
  727 zvol_setup_zv(zvol_state_t *zv)
  728 {
  729         uint64_t volsize;
  730         int error;
  731         uint64_t ro;
  732         objset_t *os = zv->zv_objset;
  733 
  734         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
  735         ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
  736 
  737         zv->zv_zilog = NULL;
  738         zv->zv_flags &= ~ZVOL_WRITTEN_TO;
  739 
  740         error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
  741         if (error)
  742                 return (SET_ERROR(error));
  743 
  744         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
  745         if (error)
  746                 return (SET_ERROR(error));
  747 
  748         error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
  749         if (error)
  750                 return (SET_ERROR(error));
  751 
  752         zvol_os_set_capacity(zv, volsize >> 9);
  753         zv->zv_volsize = volsize;
  754 
  755         if (ro || dmu_objset_is_snapshot(os) ||
  756             !spa_writeable(dmu_objset_spa(os))) {
  757                 zvol_os_set_disk_ro(zv, 1);
  758                 zv->zv_flags |= ZVOL_RDONLY;
  759         } else {
  760                 zvol_os_set_disk_ro(zv, 0);
  761                 zv->zv_flags &= ~ZVOL_RDONLY;
  762         }
  763         return (0);
  764 }
  765 
  766 /*
  767  * Shutdown every zv_objset related stuff except zv_objset itself.
  768  * The is the reverse of zvol_setup_zv.
  769  */
  770 static void
  771 zvol_shutdown_zv(zvol_state_t *zv)
  772 {
  773         ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
  774             RW_LOCK_HELD(&zv->zv_suspend_lock));
  775 
  776         if (zv->zv_flags & ZVOL_WRITTEN_TO) {
  777                 ASSERT(zv->zv_zilog != NULL);
  778                 zil_close(zv->zv_zilog);
  779         }
  780 
  781         zv->zv_zilog = NULL;
  782 
  783         dnode_rele(zv->zv_dn, zv);
  784         zv->zv_dn = NULL;
  785 
  786         /*
  787          * Evict cached data. We must write out any dirty data before
  788          * disowning the dataset.
  789          */
  790         if (zv->zv_flags & ZVOL_WRITTEN_TO)
  791                 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
  792         (void) dmu_objset_evict_dbufs(zv->zv_objset);
  793 }
  794 
  795 /*
  796  * return the proper tag for rollback and recv
  797  */
  798 void *
  799 zvol_tag(zvol_state_t *zv)
  800 {
  801         ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
  802         return (zv->zv_open_count > 0 ? zv : NULL);
  803 }
  804 
  805 /*
  806  * Suspend the zvol for recv and rollback.
  807  */
  808 zvol_state_t *
  809 zvol_suspend(const char *name)
  810 {
  811         zvol_state_t *zv;
  812 
  813         zv = zvol_find_by_name(name, RW_WRITER);
  814 
  815         if (zv == NULL)
  816                 return (NULL);
  817 
  818         /* block all I/O, release in zvol_resume. */
  819         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
  820         ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
  821 
  822         atomic_inc(&zv->zv_suspend_ref);
  823 
  824         if (zv->zv_open_count > 0)
  825                 zvol_shutdown_zv(zv);
  826 
  827         /*
  828          * do not hold zv_state_lock across suspend/resume to
  829          * avoid locking up zvol lookups
  830          */
  831         mutex_exit(&zv->zv_state_lock);
  832 
  833         /* zv_suspend_lock is released in zvol_resume() */
  834         return (zv);
  835 }
  836 
  837 int
  838 zvol_resume(zvol_state_t *zv)
  839 {
  840         int error = 0;
  841 
  842         ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
  843 
  844         mutex_enter(&zv->zv_state_lock);
  845 
  846         if (zv->zv_open_count > 0) {
  847                 VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
  848                 VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
  849                 VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
  850                 dmu_objset_rele(zv->zv_objset, zv);
  851 
  852                 error = zvol_setup_zv(zv);
  853         }
  854 
  855         mutex_exit(&zv->zv_state_lock);
  856 
  857         rw_exit(&zv->zv_suspend_lock);
  858         /*
  859          * We need this because we don't hold zvol_state_lock while releasing
  860          * zv_suspend_lock. zvol_remove_minors_impl thus cannot check
  861          * zv_suspend_lock to determine it is safe to free because rwlock is
  862          * not inherent atomic.
  863          */
  864         atomic_dec(&zv->zv_suspend_ref);
  865 
  866         return (SET_ERROR(error));
  867 }
  868 
  869 int
  870 zvol_first_open(zvol_state_t *zv, boolean_t readonly)
  871 {
  872         objset_t *os;
  873         int error;
  874 
  875         ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
  876         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
  877         ASSERT(mutex_owned(&spa_namespace_lock));
  878 
  879         boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
  880         error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
  881         if (error)
  882                 return (SET_ERROR(error));
  883 
  884         zv->zv_objset = os;
  885 
  886         error = zvol_setup_zv(zv);
  887         if (error) {
  888                 dmu_objset_disown(os, 1, zv);
  889                 zv->zv_objset = NULL;
  890         }
  891 
  892         return (error);
  893 }
  894 
  895 void
  896 zvol_last_close(zvol_state_t *zv)
  897 {
  898         ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
  899         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
  900 
  901         zvol_shutdown_zv(zv);
  902 
  903         dmu_objset_disown(zv->zv_objset, 1, zv);
  904         zv->zv_objset = NULL;
  905 }
  906 
  907 typedef struct minors_job {
  908         list_t *list;
  909         list_node_t link;
  910         /* input */
  911         char *name;
  912         /* output */
  913         int error;
  914 } minors_job_t;
  915 
  916 /*
  917  * Prefetch zvol dnodes for the minors_job
  918  */
  919 static void
  920 zvol_prefetch_minors_impl(void *arg)
  921 {
  922         minors_job_t *job = arg;
  923         char *dsname = job->name;
  924         objset_t *os = NULL;
  925 
  926         job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
  927             FTAG, &os);
  928         if (job->error == 0) {
  929                 dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
  930                 dmu_objset_disown(os, B_TRUE, FTAG);
  931         }
  932 }
  933 
  934 /*
  935  * Mask errors to continue dmu_objset_find() traversal
  936  */
  937 static int
  938 zvol_create_snap_minor_cb(const char *dsname, void *arg)
  939 {
  940         minors_job_t *j = arg;
  941         list_t *minors_list = j->list;
  942         const char *name = j->name;
  943 
  944         ASSERT0(MUTEX_HELD(&spa_namespace_lock));
  945 
  946         /* skip the designated dataset */
  947         if (name && strcmp(dsname, name) == 0)
  948                 return (0);
  949 
  950         /* at this point, the dsname should name a snapshot */
  951         if (strchr(dsname, '@') == 0) {
  952                 dprintf("zvol_create_snap_minor_cb(): "
  953                     "%s is not a snapshot name\n", dsname);
  954         } else {
  955                 minors_job_t *job;
  956                 char *n = kmem_strdup(dsname);
  957                 if (n == NULL)
  958                         return (0);
  959 
  960                 job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
  961                 job->name = n;
  962                 job->list = minors_list;
  963                 job->error = 0;
  964                 list_insert_tail(minors_list, job);
  965                 /* don't care if dispatch fails, because job->error is 0 */
  966                 taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
  967                     TQ_SLEEP);
  968         }
  969 
  970         return (0);
  971 }
  972 
  973 /*
  974  * If spa_keystore_load_wkey() is called for an encrypted zvol,
  975  * we need to look for any clones also using the key. This function
  976  * is "best effort" - so we just skip over it if there are failures.
  977  */
  978 static void
  979 zvol_add_clones(const char *dsname, list_t *minors_list)
  980 {
  981         /* Also check if it has clones */
  982         dsl_dir_t *dd = NULL;
  983         dsl_pool_t *dp = NULL;
  984 
  985         if (dsl_pool_hold(dsname, FTAG, &dp) != 0)
  986                 return;
  987 
  988         if (!spa_feature_is_enabled(dp->dp_spa,
  989             SPA_FEATURE_ENCRYPTION))
  990                 goto out;
  991 
  992         if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0)
  993                 goto out;
  994 
  995         if (dsl_dir_phys(dd)->dd_clones == 0)
  996                 goto out;
  997 
  998         zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
  999         zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 1000         objset_t *mos = dd->dd_pool->dp_meta_objset;
 1001 
 1002         for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones);
 1003             zap_cursor_retrieve(zc, za) == 0;
 1004             zap_cursor_advance(zc)) {
 1005                 dsl_dataset_t *clone;
 1006                 minors_job_t *job;
 1007 
 1008                 if (dsl_dataset_hold_obj(dd->dd_pool,
 1009                     za->za_first_integer, FTAG, &clone) == 0) {
 1010 
 1011                         char name[ZFS_MAX_DATASET_NAME_LEN];
 1012                         dsl_dataset_name(clone, name);
 1013 
 1014                         char *n = kmem_strdup(name);
 1015                         job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
 1016                         job->name = n;
 1017                         job->list = minors_list;
 1018                         job->error = 0;
 1019                         list_insert_tail(minors_list, job);
 1020 
 1021                         dsl_dataset_rele(clone, FTAG);
 1022                 }
 1023         }
 1024         zap_cursor_fini(zc);
 1025         kmem_free(za, sizeof (zap_attribute_t));
 1026         kmem_free(zc, sizeof (zap_cursor_t));
 1027 
 1028 out:
 1029         if (dd != NULL)
 1030                 dsl_dir_rele(dd, FTAG);
 1031         dsl_pool_rele(dp, FTAG);
 1032 }
 1033 
 1034 /*
 1035  * Mask errors to continue dmu_objset_find() traversal
 1036  */
 1037 static int
 1038 zvol_create_minors_cb(const char *dsname, void *arg)
 1039 {
 1040         uint64_t snapdev;
 1041         int error;
 1042         list_t *minors_list = arg;
 1043 
 1044         ASSERT0(MUTEX_HELD(&spa_namespace_lock));
 1045 
 1046         error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
 1047         if (error)
 1048                 return (0);
 1049 
 1050         /*
 1051          * Given the name and the 'snapdev' property, create device minor nodes
 1052          * with the linkages to zvols/snapshots as needed.
 1053          * If the name represents a zvol, create a minor node for the zvol, then
 1054          * check if its snapshots are 'visible', and if so, iterate over the
 1055          * snapshots and create device minor nodes for those.
 1056          */
 1057         if (strchr(dsname, '@') == 0) {
 1058                 minors_job_t *job;
 1059                 char *n = kmem_strdup(dsname);
 1060                 if (n == NULL)
 1061                         return (0);
 1062 
 1063                 job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
 1064                 job->name = n;
 1065                 job->list = minors_list;
 1066                 job->error = 0;
 1067                 list_insert_tail(minors_list, job);
 1068                 /* don't care if dispatch fails, because job->error is 0 */
 1069                 taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
 1070                     TQ_SLEEP);
 1071 
 1072                 zvol_add_clones(dsname, minors_list);
 1073 
 1074                 if (snapdev == ZFS_SNAPDEV_VISIBLE) {
 1075                         /*
 1076                          * traverse snapshots only, do not traverse children,
 1077                          * and skip the 'dsname'
 1078                          */
 1079                         (void) dmu_objset_find(dsname,
 1080                             zvol_create_snap_minor_cb, (void *)job,
 1081                             DS_FIND_SNAPSHOTS);
 1082                 }
 1083         } else {
 1084                 dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
 1085                     dsname);
 1086         }
 1087 
 1088         return (0);
 1089 }
 1090 
 1091 /*
 1092  * Create minors for the specified dataset, including children and snapshots.
 1093  * Pay attention to the 'snapdev' property and iterate over the snapshots
 1094  * only if they are 'visible'. This approach allows one to assure that the
 1095  * snapshot metadata is read from disk only if it is needed.
 1096  *
 1097  * The name can represent a dataset to be recursively scanned for zvols and
 1098  * their snapshots, or a single zvol snapshot. If the name represents a
 1099  * dataset, the scan is performed in two nested stages:
 1100  * - scan the dataset for zvols, and
 1101  * - for each zvol, create a minor node, then check if the zvol's snapshots
 1102  *   are 'visible', and only then iterate over the snapshots if needed
 1103  *
 1104  * If the name represents a snapshot, a check is performed if the snapshot is
 1105  * 'visible' (which also verifies that the parent is a zvol), and if so,
 1106  * a minor node for that snapshot is created.
 1107  */
 1108 void
 1109 zvol_create_minors_recursive(const char *name)
 1110 {
 1111         list_t minors_list;
 1112         minors_job_t *job;
 1113 
 1114         if (zvol_inhibit_dev)
 1115                 return;
 1116 
 1117         /*
 1118          * This is the list for prefetch jobs. Whenever we found a match
 1119          * during dmu_objset_find, we insert a minors_job to the list and do
 1120          * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need
 1121          * any lock because all list operation is done on the current thread.
 1122          *
 1123          * We will use this list to do zvol_os_create_minor after prefetch
 1124          * so we don't have to traverse using dmu_objset_find again.
 1125          */
 1126         list_create(&minors_list, sizeof (minors_job_t),
 1127             offsetof(minors_job_t, link));
 1128 
 1129 
 1130         if (strchr(name, '@') != NULL) {
 1131                 uint64_t snapdev;
 1132 
 1133                 int error = dsl_prop_get_integer(name, "snapdev",
 1134                     &snapdev, NULL);
 1135 
 1136                 if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
 1137                         (void) zvol_os_create_minor(name);
 1138         } else {
 1139                 fstrans_cookie_t cookie = spl_fstrans_mark();
 1140                 (void) dmu_objset_find(name, zvol_create_minors_cb,
 1141                     &minors_list, DS_FIND_CHILDREN);
 1142                 spl_fstrans_unmark(cookie);
 1143         }
 1144 
 1145         taskq_wait_outstanding(system_taskq, 0);
 1146 
 1147         /*
 1148          * Prefetch is completed, we can do zvol_os_create_minor
 1149          * sequentially.
 1150          */
 1151         while ((job = list_head(&minors_list)) != NULL) {
 1152                 list_remove(&minors_list, job);
 1153                 if (!job->error)
 1154                         (void) zvol_os_create_minor(job->name);
 1155                 kmem_strfree(job->name);
 1156                 kmem_free(job, sizeof (minors_job_t));
 1157         }
 1158 
 1159         list_destroy(&minors_list);
 1160 }
 1161 
 1162 void
 1163 zvol_create_minor(const char *name)
 1164 {
 1165         /*
 1166          * Note: the dsl_pool_config_lock must not be held.
 1167          * Minor node creation needs to obtain the zvol_state_lock.
 1168          * zvol_open() obtains the zvol_state_lock and then the dsl pool
 1169          * config lock.  Therefore, we can't have the config lock now if
 1170          * we are going to wait for the zvol_state_lock, because it
 1171          * would be a lock order inversion which could lead to deadlock.
 1172          */
 1173 
 1174         if (zvol_inhibit_dev)
 1175                 return;
 1176 
 1177         if (strchr(name, '@') != NULL) {
 1178                 uint64_t snapdev;
 1179 
 1180                 int error = dsl_prop_get_integer(name,
 1181                     "snapdev", &snapdev, NULL);
 1182 
 1183                 if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
 1184                         (void) zvol_os_create_minor(name);
 1185         } else {
 1186                 (void) zvol_os_create_minor(name);
 1187         }
 1188 }
 1189 
 1190 /*
 1191  * Remove minors for specified dataset including children and snapshots.
 1192  */
 1193 
 1194 static void
 1195 zvol_free_task(void *arg)
 1196 {
 1197         zvol_os_free(arg);
 1198 }
 1199 
 1200 void
 1201 zvol_remove_minors_impl(const char *name)
 1202 {
 1203         zvol_state_t *zv, *zv_next;
 1204         int namelen = ((name) ? strlen(name) : 0);
 1205         taskqid_t t;
 1206         list_t free_list;
 1207 
 1208         if (zvol_inhibit_dev)
 1209                 return;
 1210 
 1211         list_create(&free_list, sizeof (zvol_state_t),
 1212             offsetof(zvol_state_t, zv_next));
 1213 
 1214         rw_enter(&zvol_state_lock, RW_WRITER);
 1215 
 1216         for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
 1217                 zv_next = list_next(&zvol_state_list, zv);
 1218 
 1219                 mutex_enter(&zv->zv_state_lock);
 1220                 if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
 1221                     (strncmp(zv->zv_name, name, namelen) == 0 &&
 1222                     (zv->zv_name[namelen] == '/' ||
 1223                     zv->zv_name[namelen] == '@'))) {
 1224                         /*
 1225                          * By holding zv_state_lock here, we guarantee that no
 1226                          * one is currently using this zv
 1227                          */
 1228 
 1229                         /* If in use, leave alone */
 1230                         if (zv->zv_open_count > 0 ||
 1231                             atomic_read(&zv->zv_suspend_ref)) {
 1232                                 mutex_exit(&zv->zv_state_lock);
 1233                                 continue;
 1234                         }
 1235 
 1236                         zvol_remove(zv);
 1237 
 1238                         /*
 1239                          * Cleared while holding zvol_state_lock as a writer
 1240                          * which will prevent zvol_open() from opening it.
 1241                          */
 1242                         zvol_os_clear_private(zv);
 1243 
 1244                         /* Drop zv_state_lock before zvol_free() */
 1245                         mutex_exit(&zv->zv_state_lock);
 1246 
 1247                         /* Try parallel zv_free, if failed do it in place */
 1248                         t = taskq_dispatch(system_taskq, zvol_free_task, zv,
 1249                             TQ_SLEEP);
 1250                         if (t == TASKQID_INVALID)
 1251                                 list_insert_head(&free_list, zv);
 1252                 } else {
 1253                         mutex_exit(&zv->zv_state_lock);
 1254                 }
 1255         }
 1256         rw_exit(&zvol_state_lock);
 1257 
 1258         /* Drop zvol_state_lock before calling zvol_free() */
 1259         while ((zv = list_head(&free_list)) != NULL) {
 1260                 list_remove(&free_list, zv);
 1261                 zvol_os_free(zv);
 1262         }
 1263 }
 1264 
 1265 /* Remove minor for this specific volume only */
 1266 static void
 1267 zvol_remove_minor_impl(const char *name)
 1268 {
 1269         zvol_state_t *zv = NULL, *zv_next;
 1270 
 1271         if (zvol_inhibit_dev)
 1272                 return;
 1273 
 1274         rw_enter(&zvol_state_lock, RW_WRITER);
 1275 
 1276         for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
 1277                 zv_next = list_next(&zvol_state_list, zv);
 1278 
 1279                 mutex_enter(&zv->zv_state_lock);
 1280                 if (strcmp(zv->zv_name, name) == 0) {
 1281                         /*
 1282                          * By holding zv_state_lock here, we guarantee that no
 1283                          * one is currently using this zv
 1284                          */
 1285 
 1286                         /* If in use, leave alone */
 1287                         if (zv->zv_open_count > 0 ||
 1288                             atomic_read(&zv->zv_suspend_ref)) {
 1289                                 mutex_exit(&zv->zv_state_lock);
 1290                                 continue;
 1291                         }
 1292                         zvol_remove(zv);
 1293 
 1294                         zvol_os_clear_private(zv);
 1295                         mutex_exit(&zv->zv_state_lock);
 1296                         break;
 1297                 } else {
 1298                         mutex_exit(&zv->zv_state_lock);
 1299                 }
 1300         }
 1301 
 1302         /* Drop zvol_state_lock before calling zvol_free() */
 1303         rw_exit(&zvol_state_lock);
 1304 
 1305         if (zv != NULL)
 1306                 zvol_os_free(zv);
 1307 }
 1308 
 1309 /*
 1310  * Rename minors for specified dataset including children and snapshots.
 1311  */
 1312 static void
 1313 zvol_rename_minors_impl(const char *oldname, const char *newname)
 1314 {
 1315         zvol_state_t *zv, *zv_next;
 1316         int oldnamelen;
 1317 
 1318         if (zvol_inhibit_dev)
 1319                 return;
 1320 
 1321         oldnamelen = strlen(oldname);
 1322 
 1323         rw_enter(&zvol_state_lock, RW_READER);
 1324 
 1325         for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
 1326                 zv_next = list_next(&zvol_state_list, zv);
 1327 
 1328                 mutex_enter(&zv->zv_state_lock);
 1329 
 1330                 if (strcmp(zv->zv_name, oldname) == 0) {
 1331                         zvol_os_rename_minor(zv, newname);
 1332                 } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
 1333                     (zv->zv_name[oldnamelen] == '/' ||
 1334                     zv->zv_name[oldnamelen] == '@')) {
 1335                         char *name = kmem_asprintf("%s%c%s", newname,
 1336                             zv->zv_name[oldnamelen],
 1337                             zv->zv_name + oldnamelen + 1);
 1338                         zvol_os_rename_minor(zv, name);
 1339                         kmem_strfree(name);
 1340                 }
 1341 
 1342                 mutex_exit(&zv->zv_state_lock);
 1343         }
 1344 
 1345         rw_exit(&zvol_state_lock);
 1346 }
 1347 
 1348 typedef struct zvol_snapdev_cb_arg {
 1349         uint64_t snapdev;
 1350 } zvol_snapdev_cb_arg_t;
 1351 
 1352 static int
 1353 zvol_set_snapdev_cb(const char *dsname, void *param)
 1354 {
 1355         zvol_snapdev_cb_arg_t *arg = param;
 1356 
 1357         if (strchr(dsname, '@') == NULL)
 1358                 return (0);
 1359 
 1360         switch (arg->snapdev) {
 1361                 case ZFS_SNAPDEV_VISIBLE:
 1362                         (void) zvol_os_create_minor(dsname);
 1363                         break;
 1364                 case ZFS_SNAPDEV_HIDDEN:
 1365                         (void) zvol_remove_minor_impl(dsname);
 1366                         break;
 1367         }
 1368 
 1369         return (0);
 1370 }
 1371 
 1372 static void
 1373 zvol_set_snapdev_impl(char *name, uint64_t snapdev)
 1374 {
 1375         zvol_snapdev_cb_arg_t arg = {snapdev};
 1376         fstrans_cookie_t cookie = spl_fstrans_mark();
 1377         /*
 1378          * The zvol_set_snapdev_sync() sets snapdev appropriately
 1379          * in the dataset hierarchy. Here, we only scan snapshots.
 1380          */
 1381         dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
 1382         spl_fstrans_unmark(cookie);
 1383 }
 1384 
 1385 static void
 1386 zvol_set_volmode_impl(char *name, uint64_t volmode)
 1387 {
 1388         fstrans_cookie_t cookie;
 1389         uint64_t old_volmode;
 1390         zvol_state_t *zv;
 1391 
 1392         if (strchr(name, '@') != NULL)
 1393                 return;
 1394 
 1395         /*
 1396          * It's unfortunate we need to remove minors before we create new ones:
 1397          * this is necessary because our backing gendisk (zvol_state->zv_disk)
 1398          * could be different when we set, for instance, volmode from "geom"
 1399          * to "dev" (or vice versa).
 1400          */
 1401         zv = zvol_find_by_name(name, RW_NONE);
 1402         if (zv == NULL && volmode == ZFS_VOLMODE_NONE)
 1403                         return;
 1404         if (zv != NULL) {
 1405                 old_volmode = zv->zv_volmode;
 1406                 mutex_exit(&zv->zv_state_lock);
 1407                 if (old_volmode == volmode)
 1408                         return;
 1409                 zvol_wait_close(zv);
 1410         }
 1411         cookie = spl_fstrans_mark();
 1412         switch (volmode) {
 1413                 case ZFS_VOLMODE_NONE:
 1414                         (void) zvol_remove_minor_impl(name);
 1415                         break;
 1416                 case ZFS_VOLMODE_GEOM:
 1417                 case ZFS_VOLMODE_DEV:
 1418                         (void) zvol_remove_minor_impl(name);
 1419                         (void) zvol_os_create_minor(name);
 1420                         break;
 1421                 case ZFS_VOLMODE_DEFAULT:
 1422                         (void) zvol_remove_minor_impl(name);
 1423                         if (zvol_volmode == ZFS_VOLMODE_NONE)
 1424                                 break;
 1425                         else /* if zvol_volmode is invalid defaults to "geom" */
 1426                                 (void) zvol_os_create_minor(name);
 1427                         break;
 1428         }
 1429         spl_fstrans_unmark(cookie);
 1430 }
 1431 
 1432 static zvol_task_t *
 1433 zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
 1434     uint64_t value)
 1435 {
 1436         zvol_task_t *task;
 1437 
 1438         /* Never allow tasks on hidden names. */
 1439         if (name1[0] == '$')
 1440                 return (NULL);
 1441 
 1442         task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
 1443         task->op = op;
 1444         task->value = value;
 1445 
 1446         strlcpy(task->name1, name1, MAXNAMELEN);
 1447         if (name2 != NULL)
 1448                 strlcpy(task->name2, name2, MAXNAMELEN);
 1449 
 1450         return (task);
 1451 }
 1452 
 1453 static void
 1454 zvol_task_free(zvol_task_t *task)
 1455 {
 1456         kmem_free(task, sizeof (zvol_task_t));
 1457 }
 1458 
 1459 /*
 1460  * The worker thread function performed asynchronously.
 1461  */
 1462 static void
 1463 zvol_task_cb(void *arg)
 1464 {
 1465         zvol_task_t *task = arg;
 1466 
 1467         switch (task->op) {
 1468         case ZVOL_ASYNC_REMOVE_MINORS:
 1469                 zvol_remove_minors_impl(task->name1);
 1470                 break;
 1471         case ZVOL_ASYNC_RENAME_MINORS:
 1472                 zvol_rename_minors_impl(task->name1, task->name2);
 1473                 break;
 1474         case ZVOL_ASYNC_SET_SNAPDEV:
 1475                 zvol_set_snapdev_impl(task->name1, task->value);
 1476                 break;
 1477         case ZVOL_ASYNC_SET_VOLMODE:
 1478                 zvol_set_volmode_impl(task->name1, task->value);
 1479                 break;
 1480         default:
 1481                 VERIFY(0);
 1482                 break;
 1483         }
 1484 
 1485         zvol_task_free(task);
 1486 }
 1487 
 1488 typedef struct zvol_set_prop_int_arg {
 1489         const char *zsda_name;
 1490         uint64_t zsda_value;
 1491         zprop_source_t zsda_source;
 1492         dmu_tx_t *zsda_tx;
 1493 } zvol_set_prop_int_arg_t;
 1494 
 1495 /*
 1496  * Sanity check the dataset for safe use by the sync task.  No additional
 1497  * conditions are imposed.
 1498  */
 1499 static int
 1500 zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
 1501 {
 1502         zvol_set_prop_int_arg_t *zsda = arg;
 1503         dsl_pool_t *dp = dmu_tx_pool(tx);
 1504         dsl_dir_t *dd;
 1505         int error;
 1506 
 1507         error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
 1508         if (error != 0)
 1509                 return (error);
 1510 
 1511         dsl_dir_rele(dd, FTAG);
 1512 
 1513         return (error);
 1514 }
 1515 
 1516 static int
 1517 zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 1518 {
 1519         (void) arg;
 1520         char dsname[MAXNAMELEN];
 1521         zvol_task_t *task;
 1522         uint64_t snapdev;
 1523 
 1524         dsl_dataset_name(ds, dsname);
 1525         if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0)
 1526                 return (0);
 1527         task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev);
 1528         if (task == NULL)
 1529                 return (0);
 1530 
 1531         (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
 1532             task, TQ_SLEEP);
 1533         return (0);
 1534 }
 1535 
 1536 /*
 1537  * Traverse all child datasets and apply snapdev appropriately.
 1538  * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
 1539  * dataset and read the effective "snapdev" on every child in the callback
 1540  * function: this is because the value is not guaranteed to be the same in the
 1541  * whole dataset hierarchy.
 1542  */
 1543 static void
 1544 zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
 1545 {
 1546         zvol_set_prop_int_arg_t *zsda = arg;
 1547         dsl_pool_t *dp = dmu_tx_pool(tx);
 1548         dsl_dir_t *dd;
 1549         dsl_dataset_t *ds;
 1550         int error;
 1551 
 1552         VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
 1553         zsda->zsda_tx = tx;
 1554 
 1555         error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
 1556         if (error == 0) {
 1557                 dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
 1558                     zsda->zsda_source, sizeof (zsda->zsda_value), 1,
 1559                     &zsda->zsda_value, zsda->zsda_tx);
 1560                 dsl_dataset_rele(ds, FTAG);
 1561         }
 1562         dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
 1563             zsda, DS_FIND_CHILDREN);
 1564 
 1565         dsl_dir_rele(dd, FTAG);
 1566 }
 1567 
 1568 int
 1569 zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
 1570 {
 1571         zvol_set_prop_int_arg_t zsda;
 1572 
 1573         zsda.zsda_name = ddname;
 1574         zsda.zsda_source = source;
 1575         zsda.zsda_value = snapdev;
 1576 
 1577         return (dsl_sync_task(ddname, zvol_set_snapdev_check,
 1578             zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
 1579 }
 1580 
 1581 /*
 1582  * Sanity check the dataset for safe use by the sync task.  No additional
 1583  * conditions are imposed.
 1584  */
 1585 static int
 1586 zvol_set_volmode_check(void *arg, dmu_tx_t *tx)
 1587 {
 1588         zvol_set_prop_int_arg_t *zsda = arg;
 1589         dsl_pool_t *dp = dmu_tx_pool(tx);
 1590         dsl_dir_t *dd;
 1591         int error;
 1592 
 1593         error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
 1594         if (error != 0)
 1595                 return (error);
 1596 
 1597         dsl_dir_rele(dd, FTAG);
 1598 
 1599         return (error);
 1600 }
 1601 
 1602 static int
 1603 zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 1604 {
 1605         (void) arg;
 1606         char dsname[MAXNAMELEN];
 1607         zvol_task_t *task;
 1608         uint64_t volmode;
 1609 
 1610         dsl_dataset_name(ds, dsname);
 1611         if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0)
 1612                 return (0);
 1613         task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode);
 1614         if (task == NULL)
 1615                 return (0);
 1616 
 1617         (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
 1618             task, TQ_SLEEP);
 1619         return (0);
 1620 }
 1621 
 1622 /*
 1623  * Traverse all child datasets and apply volmode appropriately.
 1624  * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
 1625  * dataset and read the effective "volmode" on every child in the callback
 1626  * function: this is because the value is not guaranteed to be the same in the
 1627  * whole dataset hierarchy.
 1628  */
 1629 static void
 1630 zvol_set_volmode_sync(void *arg, dmu_tx_t *tx)
 1631 {
 1632         zvol_set_prop_int_arg_t *zsda = arg;
 1633         dsl_pool_t *dp = dmu_tx_pool(tx);
 1634         dsl_dir_t *dd;
 1635         dsl_dataset_t *ds;
 1636         int error;
 1637 
 1638         VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
 1639         zsda->zsda_tx = tx;
 1640 
 1641         error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
 1642         if (error == 0) {
 1643                 dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE),
 1644                     zsda->zsda_source, sizeof (zsda->zsda_value), 1,
 1645                     &zsda->zsda_value, zsda->zsda_tx);
 1646                 dsl_dataset_rele(ds, FTAG);
 1647         }
 1648 
 1649         dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb,
 1650             zsda, DS_FIND_CHILDREN);
 1651 
 1652         dsl_dir_rele(dd, FTAG);
 1653 }
 1654 
 1655 int
 1656 zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode)
 1657 {
 1658         zvol_set_prop_int_arg_t zsda;
 1659 
 1660         zsda.zsda_name = ddname;
 1661         zsda.zsda_source = source;
 1662         zsda.zsda_value = volmode;
 1663 
 1664         return (dsl_sync_task(ddname, zvol_set_volmode_check,
 1665             zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
 1666 }
 1667 
 1668 void
 1669 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
 1670 {
 1671         zvol_task_t *task;
 1672         taskqid_t id;
 1673 
 1674         task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
 1675         if (task == NULL)
 1676                 return;
 1677 
 1678         id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
 1679         if ((async == B_FALSE) && (id != TASKQID_INVALID))
 1680                 taskq_wait_id(spa->spa_zvol_taskq, id);
 1681 }
 1682 
 1683 void
 1684 zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
 1685     boolean_t async)
 1686 {
 1687         zvol_task_t *task;
 1688         taskqid_t id;
 1689 
 1690         task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
 1691         if (task == NULL)
 1692                 return;
 1693 
 1694         id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
 1695         if ((async == B_FALSE) && (id != TASKQID_INVALID))
 1696                 taskq_wait_id(spa->spa_zvol_taskq, id);
 1697 }
 1698 
 1699 boolean_t
 1700 zvol_is_zvol(const char *name)
 1701 {
 1702 
 1703         return (zvol_os_is_zvol(name));
 1704 }
 1705 
 1706 int
 1707 zvol_init_impl(void)
 1708 {
 1709         int i;
 1710 
 1711         list_create(&zvol_state_list, sizeof (zvol_state_t),
 1712             offsetof(zvol_state_t, zv_next));
 1713         rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
 1714 
 1715         zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
 1716             KM_SLEEP);
 1717         for (i = 0; i < ZVOL_HT_SIZE; i++)
 1718                 INIT_HLIST_HEAD(&zvol_htable[i]);
 1719 
 1720         return (0);
 1721 }
 1722 
 1723 void
 1724 zvol_fini_impl(void)
 1725 {
 1726         zvol_remove_minors_impl(NULL);
 1727 
 1728         /*
 1729          * The call to "zvol_remove_minors_impl" may dispatch entries to
 1730          * the system_taskq, but it doesn't wait for those entries to
 1731          * complete before it returns. Thus, we must wait for all of the
 1732          * removals to finish, before we can continue.
 1733          */
 1734         taskq_wait_outstanding(system_taskq, 0);
 1735 
 1736         kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
 1737         list_destroy(&zvol_state_list);
 1738         rw_destroy(&zvol_state_lock);
 1739 }

Cache object: d13aced0589d3a8bbd7d185a13514672


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.