sys/contrib/openzfs/module/zfs/dmu_tx.c


    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
   24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
   25  */
   26 
   27 #include <sys/dmu.h>
   28 #include <sys/dmu_impl.h>
   29 #include <sys/dbuf.h>
   30 #include <sys/dmu_tx.h>
   31 #include <sys/dmu_objset.h>
   32 #include <sys/dsl_dataset.h>
   33 #include <sys/dsl_dir.h>
   34 #include <sys/dsl_pool.h>
   35 #include <sys/zap_impl.h>
   36 #include <sys/spa.h>
   37 #include <sys/sa.h>
   38 #include <sys/sa_impl.h>
   39 #include <sys/zfs_context.h>
   40 #include <sys/trace_zfs.h>
   41 
   42 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
   43     uint64_t arg1, uint64_t arg2);
   44 
   45 dmu_tx_stats_t dmu_tx_stats = {
   46         { "dmu_tx_assigned",            KSTAT_DATA_UINT64 },
   47         { "dmu_tx_delay",               KSTAT_DATA_UINT64 },
   48         { "dmu_tx_error",               KSTAT_DATA_UINT64 },
   49         { "dmu_tx_suspended",           KSTAT_DATA_UINT64 },
   50         { "dmu_tx_group",               KSTAT_DATA_UINT64 },
   51         { "dmu_tx_memory_reserve",      KSTAT_DATA_UINT64 },
   52         { "dmu_tx_memory_reclaim",      KSTAT_DATA_UINT64 },
   53         { "dmu_tx_dirty_throttle",      KSTAT_DATA_UINT64 },
   54         { "dmu_tx_dirty_delay",         KSTAT_DATA_UINT64 },
   55         { "dmu_tx_dirty_over_max",      KSTAT_DATA_UINT64 },
   56         { "dmu_tx_dirty_frees_delay",   KSTAT_DATA_UINT64 },
   57         { "dmu_tx_wrlog_delay",         KSTAT_DATA_UINT64 },
   58         { "dmu_tx_quota",               KSTAT_DATA_UINT64 },
   59 };
   60 
   61 static kstat_t *dmu_tx_ksp;
   62 
   63 dmu_tx_t *
   64 dmu_tx_create_dd(dsl_dir_t *dd)
   65 {
   66         dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
   67         tx->tx_dir = dd;
   68         if (dd != NULL)
   69                 tx->tx_pool = dd->dd_pool;
   70         list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
   71             offsetof(dmu_tx_hold_t, txh_node));
   72         list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
   73             offsetof(dmu_tx_callback_t, dcb_node));
   74         tx->tx_start = gethrtime();
   75         return (tx);
   76 }
   77 
   78 dmu_tx_t *
   79 dmu_tx_create(objset_t *os)
   80 {
   81         dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
   82         tx->tx_objset = os;
   83         return (tx);
   84 }
   85 
   86 dmu_tx_t *
   87 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
   88 {
   89         dmu_tx_t *tx = dmu_tx_create_dd(NULL);
   90 
   91         TXG_VERIFY(dp->dp_spa, txg);
   92         tx->tx_pool = dp;
   93         tx->tx_txg = txg;
   94         tx->tx_anyobj = TRUE;
   95 
   96         return (tx);
   97 }
   98 
   99 int
  100 dmu_tx_is_syncing(dmu_tx_t *tx)
  101 {
  102         return (tx->tx_anyobj);
  103 }
  104 
  105 int
  106 dmu_tx_private_ok(dmu_tx_t *tx)
  107 {
  108         return (tx->tx_anyobj);
  109 }
  110 
  111 static dmu_tx_hold_t *
  112 dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
  113     uint64_t arg1, uint64_t arg2)
  114 {
  115         dmu_tx_hold_t *txh;
  116 
  117         if (dn != NULL) {
  118                 (void) zfs_refcount_add(&dn->dn_holds, tx);
  119                 if (tx->tx_txg != 0) {
  120                         mutex_enter(&dn->dn_mtx);
  121                         /*
  122                          * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
  123                          * problem, but there's no way for it to happen (for
  124                          * now, at least).
  125                          */
  126                         ASSERT(dn->dn_assigned_txg == 0);
  127                         dn->dn_assigned_txg = tx->tx_txg;
  128                         (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
  129                         mutex_exit(&dn->dn_mtx);
  130                 }
  131         }
  132 
  133         txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
  134         txh->txh_tx = tx;
  135         txh->txh_dnode = dn;
  136         zfs_refcount_create(&txh->txh_space_towrite);
  137         zfs_refcount_create(&txh->txh_memory_tohold);
  138         txh->txh_type = type;
  139         txh->txh_arg1 = arg1;
  140         txh->txh_arg2 = arg2;
  141         list_insert_tail(&tx->tx_holds, txh);
  142 
  143         return (txh);
  144 }
  145 
  146 static dmu_tx_hold_t *
  147 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
  148     enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
  149 {
  150         dnode_t *dn = NULL;
  151         dmu_tx_hold_t *txh;
  152         int err;
  153 
  154         if (object != DMU_NEW_OBJECT) {
  155                 err = dnode_hold(os, object, FTAG, &dn);
  156                 if (err != 0) {
  157                         tx->tx_err = err;
  158                         return (NULL);
  159                 }
  160         }
  161         txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
  162         if (dn != NULL)
  163                 dnode_rele(dn, FTAG);
  164         return (txh);
  165 }
  166 
  167 void
  168 dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
  169 {
  170         /*
  171          * If we're syncing, they can manipulate any object anyhow, and
  172          * the hold on the dnode_t can cause problems.
  173          */
  174         if (!dmu_tx_is_syncing(tx))
  175                 (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
  176 }
  177 
  178 /*
  179  * This function reads specified data from disk.  The specified data will
   180  * be needed to perform the transaction -- i.e., it will be read after
  181  * we do dmu_tx_assign().  There are two reasons that we read the data now
  182  * (before dmu_tx_assign()):
  183  *
  184  * 1. Reading it now has potentially better performance.  The transaction
  185  * has not yet been assigned, so the TXG is not held open, and also the
   186  * caller typically has fewer locks held when calling dmu_tx_hold_*() than
  187  * after the transaction has been assigned.  This reduces the lock (and txg)
  188  * hold times, thus reducing lock contention.
  189  *
  190  * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
  191  * that are detected before they start making changes to the DMU state
  192  * (i.e. now).  Once the transaction has been assigned, and some DMU
  193  * state has been changed, it can be difficult to recover from an i/o
  194  * error (e.g. to undo the changes already made in memory at the DMU
  195  * layer).  Typically code to do so does not exist in the caller -- it
  196  * assumes that the data has already been cached and thus i/o errors are
  197  * not possible.
  198  *
  199  * It has been observed that the i/o initiated here can be a performance
  200  * problem, and it appears to be optional, because we don't look at the
  201  * data which is read.  However, removing this read would only serve to
  202  * move the work elsewhere (after the dmu_tx_assign()), where it may
  203  * have a greater impact on performance (in addition to the impact on
  204  * fault tolerance noted above).
  205  */
  206 static int
  207 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
  208 {
  209         int err;
  210         dmu_buf_impl_t *db;
  211 
  212         rw_enter(&dn->dn_struct_rwlock, RW_READER);
  213         db = dbuf_hold_level(dn, level, blkid, FTAG);
  214         rw_exit(&dn->dn_struct_rwlock);
  215         if (db == NULL)
  216                 return (SET_ERROR(EIO));
  217         /*
  218          * PARTIAL_FIRST allows caching for uncacheable blocks.  It will
   219          * be cleared after dmu_buf_will_dirty() calls dbuf_read() again.
  220          */
  221         err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH |
  222             (level == 0 ? DB_RF_PARTIAL_FIRST : 0));
  223         dbuf_rele(db, FTAG);
  224         return (err);
  225 }
  226 
  227 static void
  228 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
  229 {
  230         dnode_t *dn = txh->txh_dnode;
  231         int err = 0;
  232 
  233         if (len == 0)
  234                 return;
  235 
  236         (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
  237 
  238         if (dn == NULL)
  239                 return;
  240 
  241         /*
  242          * For i/o error checking, read the blocks that will be needed
  243          * to perform the write: the first and last level-0 blocks (if
  244          * they are not aligned, i.e. if they are partial-block writes),
  245          * and all the level-1 blocks.
  246          */
  247         if (dn->dn_maxblkid == 0) {
  248                 if (off < dn->dn_datablksz &&
  249                     (off > 0 || len < dn->dn_datablksz)) {
  250                         err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
  251                         if (err != 0) {
  252                                 txh->txh_tx->tx_err = err;
  253                         }
  254                 }
  255         } else {
  256                 zio_t *zio = zio_root(dn->dn_objset->os_spa,
  257                     NULL, NULL, ZIO_FLAG_CANFAIL);
  258 
  259                 /* first level-0 block */
  260                 uint64_t start = off >> dn->dn_datablkshift;
  261                 if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
  262                         err = dmu_tx_check_ioerr(zio, dn, 0, start);
  263                         if (err != 0) {
  264                                 txh->txh_tx->tx_err = err;
  265                         }
  266                 }
  267 
  268                 /* last level-0 block */
  269                 uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
  270                 if (end != start && end <= dn->dn_maxblkid &&
  271                     P2PHASE(off + len, dn->dn_datablksz)) {
  272                         err = dmu_tx_check_ioerr(zio, dn, 0, end);
  273                         if (err != 0) {
  274                                 txh->txh_tx->tx_err = err;
  275                         }
  276                 }
  277 
  278                 /* level-1 blocks */
  279                 if (dn->dn_nlevels > 1) {
  280                         int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
  281                         for (uint64_t i = (start >> shft) + 1;
  282                             i < end >> shft; i++) {
  283                                 err = dmu_tx_check_ioerr(zio, dn, 1, i);
  284                                 if (err != 0) {
  285                                         txh->txh_tx->tx_err = err;
  286                                 }
  287                         }
  288                 }
  289 
  290                 err = zio_wait(zio);
  291                 if (err != 0) {
  292                         txh->txh_tx->tx_err = err;
  293                 }
  294         }
  295 }
  296 
  297 static void
  298 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
  299 {
  300         (void) zfs_refcount_add_many(&txh->txh_space_towrite,
  301             DNODE_MIN_SIZE, FTAG);
  302 }
  303 
  304 void
  305 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
  306 {
  307         dmu_tx_hold_t *txh;
  308 
  309         ASSERT0(tx->tx_txg);
  310         ASSERT3U(len, <=, DMU_MAX_ACCESS);
  311         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
  312 
  313         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
  314             object, THT_WRITE, off, len);
  315         if (txh != NULL) {
  316                 dmu_tx_count_write(txh, off, len);
  317                 dmu_tx_count_dnode(txh);
  318         }
  319 }
  320 
  321 void
  322 dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
  323 {
  324         dmu_tx_hold_t *txh;
  325 
  326         ASSERT0(tx->tx_txg);
  327         ASSERT3U(len, <=, DMU_MAX_ACCESS);
  328         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
  329 
  330         txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
  331         if (txh != NULL) {
  332                 dmu_tx_count_write(txh, off, len);
  333                 dmu_tx_count_dnode(txh);
  334         }
  335 }
  336 
  337 /*
  338  * This function marks the transaction as being a "net free".  The end
  339  * result is that refquotas will be disabled for this transaction, and
  340  * this transaction will be able to use half of the pool space overhead
  341  * (see dsl_pool_adjustedsize()).  Therefore this function should only
  342  * be called for transactions that we expect will not cause a net increase
  343  * in the amount of space used (but it's OK if that is occasionally not true).
  344  */
  345 void
  346 dmu_tx_mark_netfree(dmu_tx_t *tx)
  347 {
  348         tx->tx_netfree = B_TRUE;
  349 }
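
       /*
        * Illustrative sketch only (not part of the original file): a typical
        * net-free transaction.  The names "os", "object", "off", "len" and
        * "error" are placeholders supplied by the caller.
        *
        *      dmu_tx_t *tx = dmu_tx_create(os);
        *      dmu_tx_hold_free(tx, object, off, len);
        *      dmu_tx_mark_netfree(tx);
        *      error = dmu_tx_assign(tx, TXG_WAIT);
        *      if (error != 0) {
        *              dmu_tx_abort(tx);
        *              return (error);
        *      }
        *      error = dmu_free_range(os, object, off, len, tx);
        *      dmu_tx_commit(tx);
        *
        * Because the transaction is marked net-free, refquotas are not
        * enforced against it and it may use part of the pool space overhead
        * (see dsl_pool_adjustedsize()).
        */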
  350 
  351 static void
  352 dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
  353 {
  354         dmu_tx_t *tx = txh->txh_tx;
  355         dnode_t *dn = txh->txh_dnode;
  356         int err;
  357 
  358         ASSERT(tx->tx_txg == 0);
  359 
  360         dmu_tx_count_dnode(txh);
  361 
  362         if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
  363                 return;
  364         if (len == DMU_OBJECT_END)
  365                 len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
  366 
  367         dmu_tx_count_dnode(txh);
  368 
  369         /*
  370          * For i/o error checking, we read the first and last level-0
  371          * blocks if they are not aligned, and all the level-1 blocks.
  372          *
  373          * Note:  dbuf_free_range() assumes that we have not instantiated
  374          * any level-0 dbufs that will be completely freed.  Therefore we must
  375          * exercise care to not read or count the first and last blocks
  376          * if they are blocksize-aligned.
  377          */
  378         if (dn->dn_datablkshift == 0) {
  379                 if (off != 0 || len < dn->dn_datablksz)
  380                         dmu_tx_count_write(txh, 0, dn->dn_datablksz);
  381         } else {
  382                 /* first block will be modified if it is not aligned */
  383                 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
  384                         dmu_tx_count_write(txh, off, 1);
  385                 /* last block will be modified if it is not aligned */
  386                 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
  387                         dmu_tx_count_write(txh, off + len, 1);
  388         }
  389 
  390         /*
  391          * Check level-1 blocks.
  392          */
  393         if (dn->dn_nlevels > 1) {
  394                 int shift = dn->dn_datablkshift + dn->dn_indblkshift -
  395                     SPA_BLKPTRSHIFT;
  396                 uint64_t start = off >> shift;
  397                 uint64_t end = (off + len) >> shift;
  398 
  399                 ASSERT(dn->dn_indblkshift != 0);
  400 
  401                 /*
  402                  * dnode_reallocate() can result in an object with indirect
  403                  * blocks having an odd data block size.  In this case,
  404                  * just check the single block.
  405                  */
  406                 if (dn->dn_datablkshift == 0)
  407                         start = end = 0;
  408 
  409                 zio_t *zio = zio_root(tx->tx_pool->dp_spa,
  410                     NULL, NULL, ZIO_FLAG_CANFAIL);
  411                 for (uint64_t i = start; i <= end; i++) {
  412                         uint64_t ibyte = i << shift;
  413                         err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
  414                         i = ibyte >> shift;
  415                         if (err == ESRCH || i > end)
  416                                 break;
  417                         if (err != 0) {
  418                                 tx->tx_err = err;
  419                                 (void) zio_wait(zio);
  420                                 return;
  421                         }
  422 
  423                         (void) zfs_refcount_add_many(&txh->txh_memory_tohold,
  424                             1 << dn->dn_indblkshift, FTAG);
  425 
  426                         err = dmu_tx_check_ioerr(zio, dn, 1, i);
  427                         if (err != 0) {
  428                                 tx->tx_err = err;
  429                                 (void) zio_wait(zio);
  430                                 return;
  431                         }
  432                 }
  433                 err = zio_wait(zio);
  434                 if (err != 0) {
  435                         tx->tx_err = err;
  436                         return;
  437                 }
  438         }
  439 }
  440 
  441 void
  442 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
  443 {
  444         dmu_tx_hold_t *txh;
  445 
  446         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
  447             object, THT_FREE, off, len);
  448         if (txh != NULL)
  449                 (void) dmu_tx_hold_free_impl(txh, off, len);
  450 }
  451 
  452 void
  453 dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
  454 {
  455         dmu_tx_hold_t *txh;
  456 
  457         txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
  458         if (txh != NULL)
  459                 (void) dmu_tx_hold_free_impl(txh, off, len);
  460 }
  461 
  462 static void
  463 dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
  464 {
  465         dmu_tx_t *tx = txh->txh_tx;
  466         dnode_t *dn = txh->txh_dnode;
  467         int err;
  468         extern int zap_micro_max_size;
  469 
  470         ASSERT(tx->tx_txg == 0);
  471 
  472         dmu_tx_count_dnode(txh);
  473 
  474         /*
   475          * Modifying an almost-full microzap is around the worst case (128KB).
  476          *
  477          * If it is a fat zap, the worst case would be 7*16KB=112KB:
  478          * - 3 blocks overwritten: target leaf, ptrtbl block, header block
  479          * - 4 new blocks written if adding:
  480          *    - 2 blocks for possibly split leaves,
  481          *    - 2 grown ptrtbl blocks
  482          */
  483         (void) zfs_refcount_add_many(&txh->txh_space_towrite,
  484             zap_micro_max_size, FTAG);
  485 
  486         if (dn == NULL)
  487                 return;
  488 
  489         ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
  490 
  491         if (dn->dn_maxblkid == 0 || name == NULL) {
  492                 /*
  493                  * This is a microzap (only one block), or we don't know
  494                  * the name.  Check the first block for i/o errors.
  495                  */
  496                 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
  497                 if (err != 0) {
  498                         tx->tx_err = err;
  499                 }
  500         } else {
  501                 /*
  502                  * Access the name so that we'll check for i/o errors to
  503                  * the leaf blocks, etc.  We ignore ENOENT, as this name
  504                  * may not yet exist.
  505                  */
  506                 err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
  507                 if (err == EIO || err == ECKSUM || err == ENXIO) {
  508                         tx->tx_err = err;
  509                 }
  510         }
  511 }
  512 
  513 void
  514 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
  515 {
  516         dmu_tx_hold_t *txh;
  517 
  518         ASSERT0(tx->tx_txg);
  519 
  520         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
  521             object, THT_ZAP, add, (uintptr_t)name);
  522         if (txh != NULL)
  523                 dmu_tx_hold_zap_impl(txh, name);
  524 }
  525 
  526 void
  527 dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
  528 {
  529         dmu_tx_hold_t *txh;
  530 
  531         ASSERT0(tx->tx_txg);
  532         ASSERT(dn != NULL);
  533 
  534         txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
  535         if (txh != NULL)
  536                 dmu_tx_hold_zap_impl(txh, name);
  537 }
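
       /*
        * Illustrative sketch only (not part of the original file): adding an
        * entry to a ZAP object, e.g. a directory.  "os", "zap_object", "name",
        * "value" and "error" are placeholders.  The "add" argument is B_TRUE
        * because a new entry may be created, which is what the worst-case
        * space estimate above accounts for.
        *
        *      dmu_tx_t *tx = dmu_tx_create(os);
        *      dmu_tx_hold_zap(tx, zap_object, B_TRUE, name);
        *      error = dmu_tx_assign(tx, TXG_WAIT);
        *      if (error != 0) {
        *              dmu_tx_abort(tx);
        *              return (error);
        *      }
        *      error = zap_add(os, zap_object, name, sizeof (uint64_t), 1,
        *          &value, tx);
        *      dmu_tx_commit(tx);
        */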
  538 
  539 void
  540 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
  541 {
  542         dmu_tx_hold_t *txh;
  543 
  544         ASSERT(tx->tx_txg == 0);
  545 
  546         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
  547             object, THT_BONUS, 0, 0);
  548         if (txh)
  549                 dmu_tx_count_dnode(txh);
  550 }
  551 
  552 void
  553 dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
  554 {
  555         dmu_tx_hold_t *txh;
  556 
  557         ASSERT0(tx->tx_txg);
  558 
  559         txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
  560         if (txh)
  561                 dmu_tx_count_dnode(txh);
  562 }
  563 
  564 void
  565 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
  566 {
  567         dmu_tx_hold_t *txh;
  568 
  569         ASSERT(tx->tx_txg == 0);
  570 
  571         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
  572             DMU_NEW_OBJECT, THT_SPACE, space, 0);
  573         if (txh) {
  574                 (void) zfs_refcount_add_many(
  575                     &txh->txh_space_towrite, space, FTAG);
  576         }
  577 }
  578 
  579 #ifdef ZFS_DEBUG
  580 void
  581 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
  582 {
  583         boolean_t match_object = B_FALSE;
  584         boolean_t match_offset = B_FALSE;
  585 
  586         DB_DNODE_ENTER(db);
  587         dnode_t *dn = DB_DNODE(db);
  588         ASSERT(tx->tx_txg != 0);
  589         ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
  590         ASSERT3U(dn->dn_object, ==, db->db.db_object);
  591 
  592         if (tx->tx_anyobj) {
  593                 DB_DNODE_EXIT(db);
  594                 return;
  595         }
  596 
  597         /* XXX No checking on the meta dnode for now */
  598         if (db->db.db_object == DMU_META_DNODE_OBJECT) {
  599                 DB_DNODE_EXIT(db);
  600                 return;
  601         }
  602 
  603         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
  604             txh = list_next(&tx->tx_holds, txh)) {
  605                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
  606                 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
  607                         match_object = TRUE;
  608                 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
  609                         int datablkshift = dn->dn_datablkshift ?
  610                             dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
  611                         int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
  612                         int shift = datablkshift + epbs * db->db_level;
  613                         uint64_t beginblk = shift >= 64 ? 0 :
  614                             (txh->txh_arg1 >> shift);
  615                         uint64_t endblk = shift >= 64 ? 0 :
  616                             ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
  617                         uint64_t blkid = db->db_blkid;
  618 
  619                         /* XXX txh_arg2 better not be zero... */
  620 
  621                         dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
  622                             txh->txh_type, (u_longlong_t)beginblk,
  623                             (u_longlong_t)endblk);
  624 
  625                         switch (txh->txh_type) {
  626                         case THT_WRITE:
  627                                 if (blkid >= beginblk && blkid <= endblk)
  628                                         match_offset = TRUE;
  629                                 /*
  630                                  * We will let this hold work for the bonus
  631                                  * or spill buffer so that we don't need to
  632                                  * hold it when creating a new object.
  633                                  */
  634                                 if (blkid == DMU_BONUS_BLKID ||
  635                                     blkid == DMU_SPILL_BLKID)
  636                                         match_offset = TRUE;
  637                                 /*
  638                                  * They might have to increase nlevels,
   639                                  * thus dirtying the new TLIBs.  Or they
   640                                  * might have to change the block size,
   641                                  * thus dirtying the new lvl=0 blk=0.
  642                                  */
  643                                 if (blkid == 0)
  644                                         match_offset = TRUE;
  645                                 break;
  646                         case THT_FREE:
  647                                 /*
  648                                  * We will dirty all the level 1 blocks in
  649                                  * the free range and perhaps the first and
  650                                  * last level 0 block.
  651                                  */
  652                                 if (blkid >= beginblk && (blkid <= endblk ||
  653                                     txh->txh_arg2 == DMU_OBJECT_END))
  654                                         match_offset = TRUE;
  655                                 break;
  656                         case THT_SPILL:
  657                                 if (blkid == DMU_SPILL_BLKID)
  658                                         match_offset = TRUE;
  659                                 break;
  660                         case THT_BONUS:
  661                                 if (blkid == DMU_BONUS_BLKID)
  662                                         match_offset = TRUE;
  663                                 break;
  664                         case THT_ZAP:
  665                                 match_offset = TRUE;
  666                                 break;
  667                         case THT_NEWOBJECT:
  668                                 match_object = TRUE;
  669                                 break;
  670                         default:
  671                                 cmn_err(CE_PANIC, "bad txh_type %d",
  672                                     txh->txh_type);
  673                         }
  674                 }
  675                 if (match_object && match_offset) {
  676                         DB_DNODE_EXIT(db);
  677                         return;
  678                 }
  679         }
  680         DB_DNODE_EXIT(db);
  681         panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
  682             (u_longlong_t)db->db.db_object, db->db_level,
  683             (u_longlong_t)db->db_blkid);
  684 }
  685 #endif
  686 
  687 /*
  688  * If we can't do 10 iops, something is wrong.  Let us go ahead
  689  * and hit zfs_dirty_data_max.
  690  */
  691 static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
  692 
  693 /*
  694  * We delay transactions when we've determined that the backend storage
  695  * isn't able to accommodate the rate of incoming writes.
  696  *
  697  * If there is already a transaction waiting, we delay relative to when
  698  * that transaction finishes waiting.  This way the calculated min_time
  699  * is independent of the number of threads concurrently executing
  700  * transactions.
  701  *
  702  * If we are the only waiter, wait relative to when the transaction
  703  * started, rather than the current time.  This credits the transaction for
  704  * "time already served", e.g. reading indirect blocks.
  705  *
  706  * The minimum time for a transaction to take is calculated as:
  707  *     min_time = scale * (dirty - min) / (max - dirty)
  708  *     min_time is then capped at zfs_delay_max_ns.
  709  *
  710  * The delay has two degrees of freedom that can be adjusted via tunables.
  711  * The percentage of dirty data at which we start to delay is defined by
  712  * zfs_delay_min_dirty_percent. This should typically be at or above
  713  * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
  714  * delay after writing at full speed has failed to keep up with the incoming
  715  * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
  716  * speaking, this variable determines the amount of delay at the midpoint of
  717  * the curve.
  718  *
  719  * delay
  720  *  10ms +-------------------------------------------------------------*+
  721  *       |                                                             *|
  722  *   9ms +                                                             *+
  723  *       |                                                             *|
  724  *   8ms +                                                             *+
  725  *       |                                                            * |
  726  *   7ms +                                                            * +
  727  *       |                                                            * |
  728  *   6ms +                                                            * +
  729  *       |                                                            * |
  730  *   5ms +                                                           *  +
  731  *       |                                                           *  |
  732  *   4ms +                                                           *  +
  733  *       |                                                           *  |
  734  *   3ms +                                                          *   +
  735  *       |                                                          *   |
  736  *   2ms +                                              (midpoint) *    +
  737  *       |                                                  |    **     |
  738  *   1ms +                                                  v ***       +
  739  *       |             zfs_delay_scale ---------->     ********         |
  740  *     0 +-------------------------------------*********----------------+
  741  *       0%                    <- zfs_dirty_data_max ->               100%
  742  *
  743  * Note that since the delay is added to the outstanding time remaining on the
  744  * most recent transaction, the delay is effectively the inverse of IOPS.
  745  * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
  746  * was chosen such that small changes in the amount of accumulated dirty data
  747  * in the first 3/4 of the curve yield relatively small differences in the
  748  * amount of delay.
  749  *
  750  * The effects can be easier to understand when the amount of delay is
  751  * represented on a log scale:
  752  *
  753  * delay
  754  * 100ms +-------------------------------------------------------------++
  755  *       +                                                              +
  756  *       |                                                              |
  757  *       +                                                             *+
  758  *  10ms +                                                             *+
  759  *       +                                                           ** +
  760  *       |                                              (midpoint)  **  |
  761  *       +                                                  |     **    +
  762  *   1ms +                                                  v ****      +
  763  *       +             zfs_delay_scale ---------->        *****         +
  764  *       |                                             ****             |
  765  *       +                                          ****                +
  766  * 100us +                                        **                    +
  767  *       +                                       *                      +
  768  *       |                                      *                       |
  769  *       +                                     *                        +
  770  *  10us +                                     *                        +
  771  *       +                                                              +
  772  *       |                                                              |
  773  *       +                                                              +
  774  *       +--------------------------------------------------------------+
  775  *       0%                    <- zfs_dirty_data_max ->               100%
  776  *
  777  * Note here that only as the amount of dirty data approaches its limit does
  778  * the delay start to increase rapidly. The goal of a properly tuned system
  779  * should be to keep the amount of dirty data out of that range by first
  780  * ensuring that the appropriate limits are set for the I/O scheduler to reach
  781  * optimal throughput on the backend storage, and then by changing the value
  782  * of zfs_delay_scale to increase the steepness of the curve.
  783  */
  784 static void
  785 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
  786 {
  787         dsl_pool_t *dp = tx->tx_pool;
  788         uint64_t delay_min_bytes, wrlog;
  789         hrtime_t wakeup, tx_time = 0, now;
  790 
  791         /* Calculate minimum transaction time for the dirty data amount. */
  792         delay_min_bytes =
  793             zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
  794         if (dirty > delay_min_bytes) {
  795                 /*
  796                  * The caller has already waited until we are under the max.
  797                  * We make them pass us the amount of dirty data so we don't
  798                  * have to handle the case of it being >= the max, which
  799                  * could cause a divide-by-zero if it's == the max.
  800                  */
  801                 ASSERT3U(dirty, <, zfs_dirty_data_max);
  802 
  803                 tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
  804                     (zfs_dirty_data_max - dirty);
  805         }
  806 
  807         /* Calculate minimum transaction time for the TX_WRITE log size. */
  808         wrlog = aggsum_upper_bound(&dp->dp_wrlog_total);
  809         delay_min_bytes =
  810             zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
  811         if (wrlog >= zfs_wrlog_data_max) {
  812                 tx_time = zfs_delay_max_ns;
  813         } else if (wrlog > delay_min_bytes) {
  814                 tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) /
  815                     (zfs_wrlog_data_max - wrlog), tx_time);
  816         }
  817 
  818         if (tx_time == 0)
  819                 return;
  820 
  821         tx_time = MIN(tx_time, zfs_delay_max_ns);
  822         now = gethrtime();
  823         if (now > tx->tx_start + tx_time)
  824                 return;
  825 
  826         DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
  827             uint64_t, tx_time);
  828 
  829         mutex_enter(&dp->dp_lock);
  830         wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time);
  831         dp->dp_last_wakeup = wakeup;
  832         mutex_exit(&dp->dp_lock);
  833 
  834         zfs_sleep_until(wakeup);
  835 }
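
       /*
        * A worked example of the min_time curve above, using illustrative
        * values rather than any particular system's tunables: with
        * zfs_dirty_data_max = 4 GiB, zfs_delay_min_dirty_percent = 60 and
        * zfs_delay_scale = 500000 (ns), delay_min_bytes is 2.4 GiB.  With
        * 3 GiB of dirty data:
        *
        *      tx_time = 500000 * (3 GiB - 2.4 GiB) / (4 GiB - 3 GiB)
        *              = 500000 * 0.6 = 300000 ns = 0.3 ms
        *
        * i.e. roughly 3300 delayed transactions per second, well below the
        * zfs_delay_max_ns cap of 100 ms per transaction.
        */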
  836 
  837 /*
  838  * This routine attempts to assign the transaction to a transaction group.
  839  * To do so, we must determine if there is sufficient free space on disk.
  840  *
  841  * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
  842  * on it), then it is assumed that there is sufficient free space,
  843  * unless there's insufficient slop space in the pool (see the comment
  844  * above spa_slop_shift in spa_misc.c).
  845  *
  846  * If it is not a "netfree" transaction, then if the data already on disk
  847  * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
  848  * ENOSPC.  Otherwise, if the current rough estimate of pending changes,
  849  * plus the rough estimate of this transaction's changes, may exceed the
  850  * allowed usage, then this will fail with ERESTART, which will cause the
  851  * caller to wait for the pending changes to be written to disk (by waiting
  852  * for the next TXG to open), and then check the space usage again.
  853  *
   854  * The rough estimate of pending changes is the sum of:
  855  *
  856  *  - this transaction's holds' txh_space_towrite
  857  *
  858  *  - dd_tempreserved[], which is the sum of in-flight transactions'
  859  *    holds' txh_space_towrite (i.e. those transactions that have called
  860  *    dmu_tx_assign() but not yet called dmu_tx_commit()).
  861  *
  862  *  - dd_space_towrite[], which is the amount of dirtied dbufs.
  863  *
  864  * Note that all of these values are inflated by spa_get_worst_case_asize(),
  865  * which means that we may get ERESTART well before we are actually in danger
  866  * of running out of space, but this also mitigates any small inaccuracies
  867  * in the rough estimate (e.g. txh_space_towrite doesn't take into account
  868  * indirect blocks, and dd_space_towrite[] doesn't take into account changes
  869  * to the MOS).
  870  *
  871  * Note that due to this algorithm, it is possible to exceed the allowed
  872  * usage by one transaction.  Also, as we approach the allowed usage,
  873  * we will allow a very limited amount of changes into each TXG, thus
  874  * decreasing performance.
  875  */
  876 static int
  877 dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
  878 {
  879         spa_t *spa = tx->tx_pool->dp_spa;
  880 
  881         ASSERT0(tx->tx_txg);
  882 
  883         if (tx->tx_err) {
  884                 DMU_TX_STAT_BUMP(dmu_tx_error);
  885                 return (tx->tx_err);
  886         }
  887 
  888         if (spa_suspended(spa)) {
  889                 DMU_TX_STAT_BUMP(dmu_tx_suspended);
  890 
  891                 /*
  892                  * If the user has indicated a blocking failure mode
  893                  * then return ERESTART which will block in dmu_tx_wait().
  894                  * Otherwise, return EIO so that an error can get
  895                  * propagated back to the VOP calls.
  896                  *
  897                  * Note that we always honor the txg_how flag regardless
  898                  * of the failuremode setting.
  899                  */
  900                 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
  901                     !(txg_how & TXG_WAIT))
  902                         return (SET_ERROR(EIO));
  903 
  904                 return (SET_ERROR(ERESTART));
  905         }
  906 
  907         if (!tx->tx_dirty_delayed &&
  908             dsl_pool_need_wrlog_delay(tx->tx_pool)) {
  909                 tx->tx_wait_dirty = B_TRUE;
  910                 DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay);
  911                 return (SET_ERROR(ERESTART));
  912         }
  913 
  914         if (!tx->tx_dirty_delayed &&
  915             dsl_pool_need_dirty_delay(tx->tx_pool)) {
  916                 tx->tx_wait_dirty = B_TRUE;
  917                 DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
  918                 return (SET_ERROR(ERESTART));
  919         }
  920 
  921         tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
  922         tx->tx_needassign_txh = NULL;
  923 
  924         /*
  925          * NB: No error returns are allowed after txg_hold_open, but
  926          * before processing the dnode holds, due to the
  927          * dmu_tx_unassign() logic.
  928          */
  929 
  930         uint64_t towrite = 0;
  931         uint64_t tohold = 0;
  932         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
  933             txh = list_next(&tx->tx_holds, txh)) {
  934                 dnode_t *dn = txh->txh_dnode;
  935                 if (dn != NULL) {
  936                         /*
  937                          * This thread can't hold the dn_struct_rwlock
  938                          * while assigning the tx, because this can lead to
  939                          * deadlock. Specifically, if this dnode is already
  940                          * assigned to an earlier txg, this thread may need
  941                          * to wait for that txg to sync (the ERESTART case
  942                          * below).  The other thread that has assigned this
  943                          * dnode to an earlier txg prevents this txg from
  944                          * syncing until its tx can complete (calling
  945                          * dmu_tx_commit()), but it may need to acquire the
  946                          * dn_struct_rwlock to do so (e.g. via
  947                          * dmu_buf_hold*()).
  948                          *
  949                          * Note that this thread can't hold the lock for
  950                          * read either, but the rwlock doesn't record
  951                          * enough information to make that assertion.
  952                          */
  953                         ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock));
  954 
  955                         mutex_enter(&dn->dn_mtx);
  956                         if (dn->dn_assigned_txg == tx->tx_txg - 1) {
  957                                 mutex_exit(&dn->dn_mtx);
  958                                 tx->tx_needassign_txh = txh;
  959                                 DMU_TX_STAT_BUMP(dmu_tx_group);
  960                                 return (SET_ERROR(ERESTART));
  961                         }
  962                         if (dn->dn_assigned_txg == 0)
  963                                 dn->dn_assigned_txg = tx->tx_txg;
  964                         ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
  965                         (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
  966                         mutex_exit(&dn->dn_mtx);
  967                 }
  968                 towrite += zfs_refcount_count(&txh->txh_space_towrite);
  969                 tohold += zfs_refcount_count(&txh->txh_memory_tohold);
  970         }
  971 
  972         /* needed allocation: worst-case estimate of write space */
  973         uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
  974         /* calculate memory footprint estimate */
  975         uint64_t memory = towrite + tohold;
  976 
  977         if (tx->tx_dir != NULL && asize != 0) {
  978                 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
  979                     asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
  980                 if (err != 0)
  981                         return (err);
  982         }
  983 
  984         DMU_TX_STAT_BUMP(dmu_tx_assigned);
  985 
  986         return (0);
  987 }
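
       /*
        * A worked example of the estimate above, with illustrative numbers:
        * if this transaction's holds sum to towrite = 128 KiB and a
        * spa_asize_inflation factor of 24 (its common default) is in effect,
        * the worst-case asize reserved via dsl_dir_tempreserve_space() is
        * 24 * 128 KiB = 3 MiB, even though far less will normally be written.
        * This is the deliberate inflation referred to in the comment above
        * dmu_tx_try_assign().
        */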
  988 
  989 static void
  990 dmu_tx_unassign(dmu_tx_t *tx)
  991 {
  992         if (tx->tx_txg == 0)
  993                 return;
  994 
  995         txg_rele_to_quiesce(&tx->tx_txgh);
  996 
  997         /*
  998          * Walk the transaction's hold list, removing the hold on the
  999          * associated dnode, and notifying waiters if the refcount drops to 0.
 1000          */
 1001         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
 1002             txh && txh != tx->tx_needassign_txh;
 1003             txh = list_next(&tx->tx_holds, txh)) {
 1004                 dnode_t *dn = txh->txh_dnode;
 1005 
 1006                 if (dn == NULL)
 1007                         continue;
 1008                 mutex_enter(&dn->dn_mtx);
 1009                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 1010 
 1011                 if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
 1012                         dn->dn_assigned_txg = 0;
 1013                         cv_broadcast(&dn->dn_notxholds);
 1014                 }
 1015                 mutex_exit(&dn->dn_mtx);
 1016         }
 1017 
 1018         txg_rele_to_sync(&tx->tx_txgh);
 1019 
 1020         tx->tx_lasttried_txg = tx->tx_txg;
 1021         tx->tx_txg = 0;
 1022 }
 1023 
 1024 /*
 1025  * Assign tx to a transaction group; txg_how is a bitmask:
 1026  *
 1027  * If TXG_WAIT is set and the currently open txg is full, this function
 1028  * will wait until there's a new txg. This should be used when no locks
 1029  * are being held. With this bit set, this function will only fail if
 1030  * we're truly out of space (or over quota).
 1031  *
 1032  * If TXG_WAIT is *not* set and we can't assign into the currently open
 1033  * txg without blocking, this function will return immediately with
 1034  * ERESTART. This should be used whenever locks are being held.  On an
 1035  * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
 1036  * and try again.
 1037  *
 1038  * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
  1039  * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
 1040  * details on the throttle). This is used by the VFS operations, after
 1041  * they have already called dmu_tx_wait() (though most likely on a
 1042  * different tx).
 1043  *
 1044  * It is guaranteed that subsequent successful calls to dmu_tx_assign()
 1045  * will assign the tx to monotonically increasing txgs. Of course this is
 1046  * not strong monotonicity, because the same txg can be returned multiple
 1047  * times in a row. This guarantee holds both for subsequent calls from
 1048  * one thread and for multiple threads. For example, it is impossible to
 1049  * observe the following sequence of events:
 1050  *
 1051  *          Thread 1                            Thread 2
 1052  *
 1053  *     dmu_tx_assign(T1, ...)
 1054  *     1 <- dmu_tx_get_txg(T1)
 1055  *                                       dmu_tx_assign(T2, ...)
 1056  *                                       2 <- dmu_tx_get_txg(T2)
 1057  *     dmu_tx_assign(T3, ...)
 1058  *     1 <- dmu_tx_get_txg(T3)
 1059  */
 1060 int
 1061 dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
 1062 {
 1063         int err;
 1064 
 1065         ASSERT(tx->tx_txg == 0);
 1066         ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
 1067         ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 1068 
 1069         /* If we might wait, we must not hold the config lock. */
 1070         IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
 1071 
 1072         if ((txg_how & TXG_NOTHROTTLE))
 1073                 tx->tx_dirty_delayed = B_TRUE;
 1074 
 1075         while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 1076                 dmu_tx_unassign(tx);
 1077 
 1078                 if (err != ERESTART || !(txg_how & TXG_WAIT))
 1079                         return (err);
 1080 
 1081                 dmu_tx_wait(tx);
 1082         }
 1083 
 1084         txg_rele_to_quiesce(&tx->tx_txgh);
 1085 
 1086         return (0);
 1087 }
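
       /*
        * Illustrative sketch only (not part of the original file) of the
        * calling pattern described above.  "os", "object", "off", "len",
        * "buf" and "error" are placeholders.  A caller holding no locks
        * typically uses TXG_WAIT:
        *
        *      dmu_tx_t *tx = dmu_tx_create(os);
        *      dmu_tx_hold_write(tx, object, off, len);
        *      error = dmu_tx_assign(tx, TXG_WAIT);
        *      if (error != 0) {
        *              dmu_tx_abort(tx);
        *              return (error);
        *      }
        *      dmu_write(os, object, off, len, buf, tx);
        *      dmu_tx_commit(tx);
        *
        * A caller that holds locks instead passes 0 for txg_how and, on
        * ERESTART, drops its locks, calls dmu_tx_wait(tx), destroys the tx
        * with dmu_tx_abort(), and retries with a freshly created tx, adding
        * TXG_NOTHROTTLE on the retry since it has already waited.
        */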
 1088 
 1089 void
 1090 dmu_tx_wait(dmu_tx_t *tx)
 1091 {
 1092         spa_t *spa = tx->tx_pool->dp_spa;
 1093         dsl_pool_t *dp = tx->tx_pool;
 1094         hrtime_t before;
 1095 
 1096         ASSERT(tx->tx_txg == 0);
 1097         ASSERT(!dsl_pool_config_held(tx->tx_pool));
 1098 
 1099         before = gethrtime();
 1100 
 1101         if (tx->tx_wait_dirty) {
 1102                 uint64_t dirty;
 1103 
 1104                 /*
 1105                  * dmu_tx_try_assign() has determined that we need to wait
 1106                  * because we've consumed much or all of the dirty buffer
 1107                  * space.
 1108                  */
 1109                 mutex_enter(&dp->dp_lock);
 1110                 if (dp->dp_dirty_total >= zfs_dirty_data_max)
 1111                         DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
 1112                 while (dp->dp_dirty_total >= zfs_dirty_data_max)
 1113                         cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
 1114                 dirty = dp->dp_dirty_total;
 1115                 mutex_exit(&dp->dp_lock);
 1116 
 1117                 dmu_tx_delay(tx, dirty);
 1118 
 1119                 tx->tx_wait_dirty = B_FALSE;
 1120 
 1121                 /*
 1122                  * Note: setting tx_dirty_delayed only has effect if the
  1123                  * caller used TXG_WAIT.  Otherwise they are going to
  1124                  * destroy this tx and try again.  The common case,
  1125                  * zfs_write(), uses TXG_WAIT.
 1126                  */
 1127                 tx->tx_dirty_delayed = B_TRUE;
 1128         } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
 1129                 /*
 1130                  * If the pool is suspended we need to wait until it
 1131                  * is resumed.  Note that it's possible that the pool
 1132                  * has become active after this thread has tried to
 1133                  * obtain a tx.  If that's the case then tx_lasttried_txg
 1134                  * would not have been set.
 1135                  */
 1136                 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
 1137         } else if (tx->tx_needassign_txh) {
 1138                 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
 1139 
 1140                 mutex_enter(&dn->dn_mtx);
 1141                 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
 1142                         cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
 1143                 mutex_exit(&dn->dn_mtx);
 1144                 tx->tx_needassign_txh = NULL;
 1145         } else {
 1146                 /*
 1147                  * If we have a lot of dirty data just wait until we sync
 1148                  * out a TXG at which point we'll hopefully have synced
 1149                  * a portion of the changes.
 1150                  */
 1151                 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
 1152         }
 1153 
 1154         spa_tx_assign_add_nsecs(spa, gethrtime() - before);
 1155 }
 1156 
 1157 static void
 1158 dmu_tx_destroy(dmu_tx_t *tx)
 1159 {
 1160         dmu_tx_hold_t *txh;
 1161 
 1162         while ((txh = list_head(&tx->tx_holds)) != NULL) {
 1163                 dnode_t *dn = txh->txh_dnode;
 1164 
 1165                 list_remove(&tx->tx_holds, txh);
 1166                 zfs_refcount_destroy_many(&txh->txh_space_towrite,
 1167                     zfs_refcount_count(&txh->txh_space_towrite));
 1168                 zfs_refcount_destroy_many(&txh->txh_memory_tohold,
 1169                     zfs_refcount_count(&txh->txh_memory_tohold));
 1170                 kmem_free(txh, sizeof (dmu_tx_hold_t));
 1171                 if (dn != NULL)
 1172                         dnode_rele(dn, tx);
 1173         }
 1174 
 1175         list_destroy(&tx->tx_callbacks);
 1176         list_destroy(&tx->tx_holds);
 1177         kmem_free(tx, sizeof (dmu_tx_t));
 1178 }
 1179 
 1180 void
 1181 dmu_tx_commit(dmu_tx_t *tx)
 1182 {
 1183         ASSERT(tx->tx_txg != 0);
 1184 
 1185         /*
 1186          * Go through the transaction's hold list and remove holds on
 1187          * associated dnodes, notifying waiters if no holds remain.
 1188          */
 1189         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
 1190             txh = list_next(&tx->tx_holds, txh)) {
 1191                 dnode_t *dn = txh->txh_dnode;
 1192 
 1193                 if (dn == NULL)
 1194                         continue;
 1195 
 1196                 mutex_enter(&dn->dn_mtx);
 1197                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 1198 
 1199                 if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
 1200                         dn->dn_assigned_txg = 0;
 1201                         cv_broadcast(&dn->dn_notxholds);
 1202                 }
 1203                 mutex_exit(&dn->dn_mtx);
 1204         }
 1205 
 1206         if (tx->tx_tempreserve_cookie)
 1207                 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
 1208 
 1209         if (!list_is_empty(&tx->tx_callbacks))
 1210                 txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
 1211 
 1212         if (tx->tx_anyobj == FALSE)
 1213                 txg_rele_to_sync(&tx->tx_txgh);
 1214 
 1215         dmu_tx_destroy(tx);
 1216 }
 1217 
 1218 void
 1219 dmu_tx_abort(dmu_tx_t *tx)
 1220 {
 1221         ASSERT(tx->tx_txg == 0);
 1222 
 1223         /*
 1224          * Call any registered callbacks with an error code.
 1225          */
 1226         if (!list_is_empty(&tx->tx_callbacks))
 1227                 dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED));
 1228 
 1229         dmu_tx_destroy(tx);
 1230 }
 1231 
 1232 uint64_t
 1233 dmu_tx_get_txg(dmu_tx_t *tx)
 1234 {
 1235         ASSERT(tx->tx_txg != 0);
 1236         return (tx->tx_txg);
 1237 }
 1238 
 1239 dsl_pool_t *
 1240 dmu_tx_pool(dmu_tx_t *tx)
 1241 {
 1242         ASSERT(tx->tx_pool != NULL);
 1243         return (tx->tx_pool);
 1244 }
 1245 
 1246 void
 1247 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
 1248 {
 1249         dmu_tx_callback_t *dcb;
 1250 
 1251         dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
 1252 
 1253         dcb->dcb_func = func;
 1254         dcb->dcb_data = data;
 1255 
 1256         list_insert_tail(&tx->tx_callbacks, dcb);
 1257 }
 1258 
 1259 /*
 1260  * Call all the commit callbacks on a list, with a given error code.
 1261  */
 1262 void
 1263 dmu_tx_do_callbacks(list_t *cb_list, int error)
 1264 {
 1265         dmu_tx_callback_t *dcb;
 1266 
 1267         while ((dcb = list_tail(cb_list)) != NULL) {
 1268                 list_remove(cb_list, dcb);
 1269                 dcb->dcb_func(dcb->dcb_data, error);
 1270                 kmem_free(dcb, sizeof (dmu_tx_callback_t));
 1271         }
 1272 }
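
       /*
        * Illustrative sketch only (not part of the original file): a commit
        * callback.  "my_commit_cb" and "my_state_t" are placeholder names.
        * The callback fires once the assigned txg has been synced to stable
        * storage, or with a nonzero error (e.g. ECANCELED) if the callbacks
        * are invoked for an aborted transaction.
        *
        *      static void
        *      my_commit_cb(void *arg, int error)
        *      {
        *              my_state_t *state = arg;
        *
        *              if (error == 0)
        *                      state->committed = B_TRUE;
        *      }
        *
        *      dmu_tx_callback_register(tx, my_commit_cb, state);
        */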
 1273 
 1274 /*
  1275  * Interface to hold a group of attributes,
  1276  * used when creating new files.
  1277  * attrsize is the total size of all attributes
  1278  * to be added during object creation.
  1279  *
  1280  * For updating/adding a single attribute, dmu_tx_hold_sa() should be used.
 1281  */
 1282 
 1283 /*
  1284  * Hold the necessary attribute name for attribute registration.
  1285  * It should be a very rare case where this is needed.  If it does
  1286  * happen it would only happen on the first write to the file system.
 1287  */
 1288 static void
 1289 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
 1290 {
 1291         if (!sa->sa_need_attr_registration)
 1292                 return;
 1293 
 1294         for (int i = 0; i != sa->sa_num_attrs; i++) {
 1295                 if (!sa->sa_attr_table[i].sa_registered) {
 1296                         if (sa->sa_reg_attr_obj)
 1297                                 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
 1298                                     B_TRUE, sa->sa_attr_table[i].sa_name);
 1299                         else
 1300                                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
 1301                                     B_TRUE, sa->sa_attr_table[i].sa_name);
 1302                 }
 1303         }
 1304 }
 1305 
 1306 void
 1307 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
 1308 {
 1309         dmu_tx_hold_t *txh;
 1310 
 1311         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
 1312             THT_SPILL, 0, 0);
 1313         if (txh != NULL)
 1314                 (void) zfs_refcount_add_many(&txh->txh_space_towrite,
 1315                     SPA_OLD_MAXBLOCKSIZE, FTAG);
 1316 }
 1317 
 1318 void
 1319 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
 1320 {
 1321         sa_os_t *sa = tx->tx_objset->os_sa;
 1322 
 1323         dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 1324 
 1325         if (tx->tx_objset->os_sa->sa_master_obj == 0)
 1326                 return;
 1327 
 1328         if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
 1329                 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
 1330         } else {
 1331                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
 1332                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
 1333                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 1334                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 1335         }
 1336 
 1337         dmu_tx_sa_registration_hold(sa, tx);
 1338 
 1339         if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
 1340                 return;
 1341 
 1342         (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
 1343             THT_SPILL, 0, 0);
 1344 }
 1345 
 1346 /*
  1347  * Hold an SA attribute.
  1348  *
  1349  * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
  1350  *
  1351  * may_grow indicates that the SA attributes on this object may grow or
  1352  * be added to, in which case the SA layout ZAP and the object's spill
  1353  * block are also held.
 1354  */
 1355 void
 1356 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 1357 {
 1358         uint64_t object;
 1359         sa_os_t *sa = tx->tx_objset->os_sa;
 1360 
 1361         ASSERT(hdl != NULL);
 1362 
 1363         object = sa_handle_object(hdl);
 1364 
 1365         dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
 1366         DB_DNODE_ENTER(db);
 1367         dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db));
 1368         DB_DNODE_EXIT(db);
 1369 
 1370         if (tx->tx_objset->os_sa->sa_master_obj == 0)
 1371                 return;
 1372 
 1373         if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
 1374             tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
 1375                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
 1376                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
 1377                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 1378                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
 1379         }
 1380 
 1381         dmu_tx_sa_registration_hold(sa, tx);
 1382 
 1383         if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
 1384                 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
 1385 
 1386         if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
 1387                 ASSERT(tx->tx_txg == 0);
 1388                 dmu_tx_hold_spill(tx, object);
 1389         } else {
 1390                 dnode_t *dn;
 1391 
 1392                 DB_DNODE_ENTER(db);
 1393                 dn = DB_DNODE(db);
 1394                 if (dn->dn_have_spill) {
 1395                         ASSERT(tx->tx_txg == 0);
 1396                         dmu_tx_hold_spill(tx, object);
 1397                 }
 1398                 DB_DNODE_EXIT(db);
 1399         }
 1400 }
 1401 
 1402 void
 1403 dmu_tx_init(void)
 1404 {
 1405         dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
 1406             KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
 1407             KSTAT_FLAG_VIRTUAL);
 1408 
 1409         if (dmu_tx_ksp != NULL) {
 1410                 dmu_tx_ksp->ks_data = &dmu_tx_stats;
 1411                 kstat_install(dmu_tx_ksp);
 1412         }
 1413 }
 1414 
 1415 void
 1416 dmu_tx_fini(void)
 1417 {
 1418         if (dmu_tx_ksp != NULL) {
 1419                 kstat_delete(dmu_tx_ksp);
 1420                 dmu_tx_ksp = NULL;
 1421         }
 1422 }
 1423 
 1424 #if defined(_KERNEL)
 1425 EXPORT_SYMBOL(dmu_tx_create);
 1426 EXPORT_SYMBOL(dmu_tx_hold_write);
 1427 EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
 1428 EXPORT_SYMBOL(dmu_tx_hold_free);
 1429 EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
 1430 EXPORT_SYMBOL(dmu_tx_hold_zap);
 1431 EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
 1432 EXPORT_SYMBOL(dmu_tx_hold_bonus);
 1433 EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
 1434 EXPORT_SYMBOL(dmu_tx_abort);
 1435 EXPORT_SYMBOL(dmu_tx_assign);
 1436 EXPORT_SYMBOL(dmu_tx_wait);
 1437 EXPORT_SYMBOL(dmu_tx_commit);
 1438 EXPORT_SYMBOL(dmu_tx_mark_netfree);
 1439 EXPORT_SYMBOL(dmu_tx_get_txg);
 1440 EXPORT_SYMBOL(dmu_tx_callback_register);
 1441 EXPORT_SYMBOL(dmu_tx_do_callbacks);
 1442 EXPORT_SYMBOL(dmu_tx_hold_spill);
 1443 EXPORT_SYMBOL(dmu_tx_hold_sa_create);
 1444 EXPORT_SYMBOL(dmu_tx_hold_sa);
 1445 #endif
