The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/include/sys/dbuf.h

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   23  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
   24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
   25  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
   26  */
   27 
   28 #ifndef _SYS_DBUF_H
   29 #define _SYS_DBUF_H
   30 
   31 #include <sys/dmu.h>
   32 #include <sys/spa.h>
   33 #include <sys/txg.h>
   34 #include <sys/zio.h>
   35 #include <sys/arc.h>
   36 #include <sys/zfs_context.h>
   37 #include <sys/zfs_refcount.h>
   38 #include <sys/zrlock.h>
   39 #include <sys/multilist.h>
   40 
   41 #ifdef  __cplusplus
   42 extern "C" {
   43 #endif
   44 
   45 #define IN_DMU_SYNC 2   /* NOTE(review): unused within this header */
   46 
   47 /*
   48  * Flag bits for the "flags" argument of dbuf_read(); one bit per flag.
   49  */
   50 
   51 #define DB_RF_MUST_SUCCEED      (1 << 0)
   52 #define DB_RF_CANFAIL           (1 << 1)
   53 #define DB_RF_HAVESTRUCT        (1 << 2)
   54 #define DB_RF_NOPREFETCH        (1 << 3)
   55 #define DB_RF_NEVERWAIT         (1 << 4)
   56 #define DB_RF_CACHED            (1 << 5)
   57 #define DB_RF_NO_DECRYPT        (1 << 6)
   58 #define DB_RF_PARTIAL_FIRST     (1 << 7)
   59 #define DB_RF_PARTIAL_MORE      (1 << 8)
   60 
   61 /*
   62  * The simplified state transition diagram for dbufs looks like:
   63  *
   64  *              +----> READ ----+
   65  *              |               |
   66  *              |               V
   67  *  (alloc)-->UNCACHED       CACHED-->EVICTING-->(free)
   68  *              |               ^        ^
   69  *              |               |        |
   70  *              +----> FILL ----+        |
   71  *              |                        |
   72  *              |                        |
   73  *              +--------> NOFILL -------+
   74  *
   75  * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
   76  * to find all dbufs in a range of a dnode and must be less than any other
   77  * dbuf_states_t (see comment on dn_dbufs in dnode.h).
   78  */
   79 typedef enum dbuf_states {
   80         DB_SEARCH = -1,         /* search key only; never a real state */
   81         DB_UNCACHED,            /* state of a newly allocated dbuf */
   82         DB_FILL,
   83         DB_NOFILL,
   84         DB_READ,
   85         DB_CACHED,
   86         DB_EVICTING             /* last state before the dbuf is freed */
   87 } dbuf_states_t;
   88 
   89 typedef enum dbuf_cached_state {        /* type of db_caching_status */
   90         DB_NO_CACHE = -1,       /* presumably: in neither dbuf cache */
   91         DB_DBUF_CACHE,
   92         DB_DBUF_METADATA_CACHE,
   93         DB_CACHE_MAX            /* presumably a count, not a cache; confirm */
   94 } dbuf_cached_state_t;
   95 
   96 struct dnode;
   97 struct dmu_tx;
   98 
   99 /*
  100  * level = 0 means the user data
  101  * level = 1 means the single indirect block
  102  * etc.
  103  */
  104 
  105 struct dmu_buf_impl;
  106 
  107 typedef enum override_states {  /* type of dirty_leaf.dr_override_state */
  108         DR_NOT_OVERRIDDEN,
  109         DR_IN_DMU_SYNC,
  110         DR_OVERRIDDEN
  111 } override_states_t;
  112 
  113 typedef enum db_lock_type {     /* result of dmu_buf_lock_parent() */
  114         DLT_NONE,
  115         DLT_PARENT,
  116         DLT_OBJSET
  117 } db_lock_type_t;               /* also a dmu_buf_unlock_parent() argument */
  118 /* Describes dirty data of a dnode block that will sync in txg dr_txg. */
  119 typedef struct dbuf_dirty_record {
  120         /* link on our parent's dirty list */
  121         list_node_t dr_dirty_node;
  122 
  123         /* transaction group this data will sync in */
  124         uint64_t dr_txg;
  125 
  126         /* zio of outstanding write IO */
  127         zio_t *dr_zio;
  128 
  129         /* pointer back to our dbuf */
  130         struct dmu_buf_impl *dr_dbuf;
  131 
  132         /* list link for dbuf dirty records (presumably db_dirty_records) */
  133         list_node_t dr_dbuf_node;
  134 
  135         /*
  136          * The dnode we are part of.  Note that the dnode can not be moved or
  137          * evicted due to the hold that's added by dnode_setdirty() or
  138          * dmu_objset_sync_dnodes(), and released by dnode_rele_task() or
  139          * userquota_updates_task().  This hold is necessary for
  140          * dirty_lightweight_leaf-type dirty records, which don't have a hold
  141          * on a dbuf.
  142          */
  143         dnode_t *dr_dnode;
  144 
  145         /* pointer to parent dirty record */
  146         struct dbuf_dirty_record *dr_parent;
  147 
  148         /* How much space was charged to dsl_pool_dirty_space() for this? */
  149         unsigned int dr_accounted;
  150 
  151         /* A copy of the bp that points to us */
  152         blkptr_t dr_bp_copy;
  153 
  154         union dirty_types {     /* state specific to the kind of record */
  155                 struct dirty_indirect {
  156 
  157                         /* protects access to dr_children */
  158                         kmutex_t dr_mtx;
  159 
  160                         /* Our list of dirty children */
  161                         list_t dr_children;
  162                 } di;
  163                 struct dirty_leaf {
  164 
  165                         /*
  166                          * dr_data is set when we dirty the buffer
  167                          * so that we can retain the pointer even if it
  168                          * gets COW'd in a subsequent transaction group.
  169                          */
  170                         arc_buf_t *dr_data;
  171                         blkptr_t dr_overridden_by;
  172                         override_states_t dr_override_state;
  173                         uint8_t dr_copies;
  174                         boolean_t dr_nopwrite;
  175                         boolean_t dr_has_raw_params;
  176 
  177                         /*
  178                          * If dr_has_raw_params is set, the following crypt
  179                          * params will be set on the BP that's written.
  180                          */
  181                         boolean_t dr_byteorder;
  182                         uint8_t dr_salt[ZIO_DATA_SALT_LEN];
  183                         uint8_t dr_iv[ZIO_DATA_IV_LEN];
  184                         uint8_t dr_mac[ZIO_DATA_MAC_LEN];
  185                 } dl;
  186                 struct dirty_lightweight_leaf {
  187                         /*
  188                          * This dirty record refers to a leaf (level=0)
  189                          * block, whose dbuf has not been instantiated for
  190                          * performance reasons.
  191                          */
  192                         uint64_t dr_blkid;
  193                         abd_t *dr_abd;
  194                         zio_prop_t dr_props;
  195                         zio_flag_t dr_flags;
  196                 } dll;
  197         } dt;
  198 } dbuf_dirty_record_t;
  199 /* Implementation view of a dbuf; embeds the public dmu_buf_t as db. */
  200 typedef struct dmu_buf_impl {
  201         /*
  202          * The following members are immutable, with the exception of
  203          * db.db_data, which is protected by db_mtx.
  204          */
  205 
  206         /* the publicly visible structure */
  207         dmu_buf_t db;
  208 
  209         /* the objset we belong to */
  210         struct objset *db_objset;
  211 
  212         /*
  213          * handle to safely access the dnode we belong to (NULL when evicted)
  214          */
  215         struct dnode_handle *db_dnode_handle;
  216 
  217         /*
  218          * our parent buffer; if the dnode points to us directly,
  219          * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
  220          * only accessed by sync thread ???
  221          * (NULL when evicted)
  222          * May change from NULL to non-NULL under the protection of db_mtx
  223          * (see dbuf_check_blkptr())
  224          */
  225         struct dmu_buf_impl *db_parent;
  226 
  227         /*
  228          * link for hash table of all dmu_buf_impl_t's
  229          */
  230         struct dmu_buf_impl *db_hash_next;
  231 
  232         /*
  233          * Our link on the owner dnode's dn_dbufs list.
  234          * Protected by its dn_dbufs_mtx.  Should be on the same cache line
  235          * as db_level and db_blkid for the best avl_add() performance.
  236          */
  237         avl_node_t db_link;
  238 
  239         /* our block number */
  240         uint64_t db_blkid;
  241 
  242         /*
  243          * Pointer to the blkptr_t which points to us. May be NULL if we
  244          * don't have one yet. (NULL when evicted)
  245          */
  246         blkptr_t *db_blkptr;
  247 
  248         /*
  249          * Our indirection level.  Data buffers have db_level==0.
  250          * Indirect buffers which point to data buffers have
  251          * db_level==1. etc.  Buffers which contain dnodes have
  252          * db_level==0, since the dnodes are stored in a file.
  253          */
  254         uint8_t db_level;
  255 
  256         /*
  257          * Protects db_buf's contents if they contain an indirect block or data
  258          * block of the meta-dnode. We use this lock to protect the structure of
  259          * the block tree. This means that when modifying this dbuf's data, we
  260          * grab its rwlock. When modifying its parent's data (including the
  261          * blkptr to this dbuf), we grab the parent's rwlock. The lock ordering
  262          * for this lock is:
  263          * 1) dn_struct_rwlock
  264          * 2) db_rwlock
  265          * We don't currently grab multiple dbufs' db_rwlocks at once.
  266          */
  267         krwlock_t db_rwlock;
  268 
  269         /* buffer holding our data */
  270         arc_buf_t *db_buf;
  271 
  272         /* db_mtx protects the members below */
  273         kmutex_t db_mtx;
  274 
  275         /*
  276          * Current state of the buffer
  277          */
  278         dbuf_states_t db_state;
  279 
  280         /*
  281          * Refcount accessed by dmu_buf_{hold,rele}.
  282          * If nonzero, the buffer can't be destroyed.
  283          * Protected by db_mtx.
  284          */
  285         zfs_refcount_t db_holds;
  286 
  287         kcondvar_t db_changed;  /* presumably signals state changes; confirm */
  288         dbuf_dirty_record_t *db_data_pending;   /* NOTE(review): undocumented */
  289 
  290         /* List of dirty records for the buffer sorted newest to oldest. */
  291         list_t db_dirty_records;
  292 
  293         /* Link in dbuf_cache or dbuf_metadata_cache */
  294         multilist_node_t db_cache_link;
  295 
  296         /* Tells us which dbuf cache this dbuf is in, if any */
  297         dbuf_cached_state_t db_caching_status;
  298 
  299         uint64_t db_hash;       /* presumably the dbuf hash table hash; confirm */
  300 
  301         /* Data which is unique to data (leaf) blocks: */
  302 
  303         /* User callback information. */
  304         dmu_buf_user_t *db_user;
  305 
  306         /*
  307          * Evict user data as soon as the dirty and reference
  308          * counts are equal.
  309          */
  310         uint8_t db_user_immediate_evict;
  311 
  312         /*
  313          * This block was freed while a read or write was
  314          * active.
  315          */
  316         uint8_t db_freed_in_flight;
  317 
  318         /*
  319          * dnode_evict_dbufs() or dnode_evict_bonus() tried to
  320          * evict this dbuf, but couldn't due to outstanding
  321          * references.  Evict once the refcount drops to 0.
  322          */
  323         uint8_t db_pending_evict;
  324 
  325         uint8_t db_dirtycnt;    /* presumably counts db_dirty_records; confirm */
  326 
  327         /* The buffer was partially read.  More reads may follow. */
  328         uint8_t db_partial_read;
  329 } dmu_buf_impl_t;
  330 
  331 #define DBUF_HASH_MUTEX(h, idx) \
  332         (&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])
  333 /* Hash table of all dmu_buf_impl_t's, linked through db_hash_next. */
  334 typedef struct dbuf_hash_table {
  335         uint64_t hash_table_mask;       /* presumably masks a hash to a bucket */
  336         uint64_t hash_mutex_mask;       /* ANDed with idx by DBUF_HASH_MUTEX */
  337         dmu_buf_impl_t **hash_table;
  338         kmutex_t *hash_mutexes;         /* indexed by DBUF_HASH_MUTEX */
  339 } dbuf_hash_table_t;
  340 
  341 typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t);
  342 /* dbuf_prefetch_fn is the type of dbuf_prefetch_impl()'s cb argument. */
  343 uint64_t dbuf_whichblock(const struct dnode *dn, const int64_t level,
  344     const uint64_t offset); /* presumably offset -> blkid at level; confirm */
  345 
  346 void dbuf_create_bonus(struct dnode *dn);
  347 int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
  348 
  349 void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
  350 /* NOTE(review): dbuf_hold_impl() presumably returns the dbuf via *dbp. */
  351 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, const void *tag);
  352 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
  353     const void *tag);
  354 int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
  355     boolean_t fail_sparse, boolean_t fail_uncached,
  356     const void *tag, dmu_buf_impl_t **dbp);
  357 /* dbuf_prefetch_impl() additionally takes a dbuf_prefetch_fn cb and its arg. */
  358 int dbuf_prefetch_impl(struct dnode *dn, int64_t level, uint64_t blkid,
  359     zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
  360     void *arg);
  361 int dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
  362     zio_priority_t prio, arc_flags_t aflags);
  363 /* NOTE(review): these presumably operate on db_holds -- confirm. */
  364 void dbuf_add_ref(dmu_buf_impl_t *db, const void *tag);
  365 boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
  366     uint64_t blkid, const void *tag);
  367 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
  368 
  369 void dbuf_rele(dmu_buf_impl_t *db, const void *tag);
  370 void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag,
  371     boolean_t evicting);
  372 
  373 dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
  374     uint64_t blkid, uint64_t *hash_out);
  375 
  376 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); /* DB_RF_* */
  377 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
  378 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
  379 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
  380 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
  381 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
  382 dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
  383     dmu_tx_t *tx);
  384 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
  385 void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
  386     bp_embedded_type_t etype, enum zio_compress comp,
  387     int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
  388 /* Takes a dnode and an abd_t rather than a dbuf; cf. dirty_lightweight_leaf. */
  389 int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
  390     const struct zio_prop *zp, zio_flag_t flags, dmu_tx_t *tx);
  391 
  392 void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx);
  393 void dbuf_destroy(dmu_buf_impl_t *db);
  394 
  395 void dbuf_unoverride(dbuf_dirty_record_t *dr);
  396 void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
  397 void dbuf_release_bp(dmu_buf_impl_t *db);
  398 db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw,
  399     const void *tag);
  400 void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type,
  401     const void *tag);
  402 /* Acts on the dbufs of dn in the block range start..end (see DB_SEARCH). */
  403 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
  404     struct dmu_tx *tx);
  405 
  406 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
  407 
  408 void dbuf_stats_init(dbuf_hash_table_t *hash);
  409 void dbuf_stats_destroy(void);
  410 
  411 int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
  412     blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift);
  413 /* Dnode access via db_dnode_handle; ENTER/EXIT wrap zrl_add/zrl_remove. */
  414 #define DB_DNODE(_db)           ((_db)->db_dnode_handle->dnh_dnode)
  415 #define DB_DNODE_LOCK(_db)      ((_db)->db_dnode_handle->dnh_zrlock)
  416 #define DB_DNODE_ENTER(_db)     (zrl_add(&DB_DNODE_LOCK(_db)))
  417 #define DB_DNODE_EXIT(_db)      (zrl_remove(&DB_DNODE_LOCK(_db)))
  418 #define DB_DNODE_HELD(_db)      (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
  419 
  420 void dbuf_init(void);
  421 void dbuf_fini(void);
  422 
  423 boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
  424 
  425 static inline dbuf_dirty_record_t *
  426 dbuf_find_dirty_lte(dmu_buf_impl_t *db, uint64_t txg)
  427 {
  428         /* Return the first dirty record with dr_txg <= txg, else NULL. */
  429         list_t *records = &db->db_dirty_records;
  430         dbuf_dirty_record_t *cur = list_head(records);
  431 
  432         while (cur != NULL && cur->dr_txg > txg)
  433                 cur = list_next(records, cur);
  434         return (cur);
  435 }
  436 
  437 static inline dbuf_dirty_record_t *
  438 dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
  439 {
  440         /* Return the dirty record whose dr_txg equals txg, else NULL. */
  441         dbuf_dirty_record_t *match = dbuf_find_dirty_lte(db, txg);
  442 
  443         if (match == NULL || match->dr_txg != txg)
  444                 return (NULL);
  445         return (match);
  446 }
  447 
  448 #define DBUF_GET_BUFC_TYPE(_db) \
  449         (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
  450 /* DBUF_IS_CACHEABLE evaluates _db more than once; avoid side effects. */
  451 #define DBUF_IS_CACHEABLE(_db)                                          \
  452         ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
  453         (dbuf_is_metadata(_db) &&                                       \
  454         ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
  455 
  456 boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db);
  457 
  458 #ifdef ZFS_DEBUG
  459 /* Both macros need a literal fmt and at least one variadic argument. */
  460 /*
  461  * There should be a ## between the string literal and fmt, to make it
  462  * clear that we're joining two strings together, but gcc does not
  463  * support that preprocessor token.
  464  */
  465 #define dprintf_dbuf(dbuf, fmt, ...) do { \
  466         if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
  467         char __db_buf[32]; \
  468         uint64_t __db_obj = (dbuf)->db.db_object; \
  469         if (__db_obj == DMU_META_DNODE_OBJECT) \
  470                 (void) strlcpy(__db_buf, "mdn", sizeof (__db_buf));     \
  471         else \
  472                 (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
  473                     (u_longlong_t)__db_obj); \
  474         dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
  475             "obj=%s lvl=%u blkid=%lld " fmt, \
  476             __db_buf, (dbuf)->db_level, \
  477             (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
  478         } \
  479 } while (0)
  480 /* As dprintf_dbuf(), with bp formatted by snprintf_blkptr() appended. */
  481 #define dprintf_dbuf_bp(db, bp, fmt, ...) do {                  \
  482         if (zfs_flags & ZFS_DEBUG_DPRINTF) {                    \
  483         char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);  \
  484         snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);          \
  485         dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);   \
  486         kmem_free(__blkbuf, BP_SPRINTF_LEN);                    \
  487         }                                                       \
  488 } while (0)
  489 
  490 #define DBUF_VERIFY(db) dbuf_verify(db) /* dbuf_verify() is not declared here */
  491 
  492 #else
  493 /* Without ZFS_DEBUG all three expand to nothing; args are not evaluated. */
  494 #define dprintf_dbuf(db, fmt, ...)
  495 #define dprintf_dbuf_bp(db, bp, fmt, ...)
  496 #define DBUF_VERIFY(db)
  497 
  498 #endif
  499 
  500 
  501 #ifdef  __cplusplus
  502 }
  503 #endif
  504 
  505 #endif /* _SYS_DBUF_H */

Cache object: a9f56f4a7614e8fed370a2af50e2cc1b


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.