The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/dsl_bookmark.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * This file and its contents are supplied under the terms of the
    5  * Common Development and Distribution License ("CDDL"), version 1.0.
    6  * You may only use this file in accordance with the terms of version
    7  * 1.0 of the CDDL.
    8  *
    9  * A full copy of the text of the CDDL should have accompanied this
   10  * source.  A copy of the CDDL is also available via the Internet at
   11  * http://www.illumos.org/license/CDDL.
   12  *
   13  * CDDL HEADER END
   14  */
   15 
   16 /*
   17  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
   18  * Copyright 2017 Nexenta Systems, Inc.
   19  * Copyright 2019, 2020 by Christian Schwarz. All rights reserved.
   20  */
   21 
   22 #include <sys/zfs_context.h>
   23 #include <sys/dsl_dataset.h>
   24 #include <sys/dsl_dir.h>
   25 #include <sys/dsl_prop.h>
   26 #include <sys/dsl_synctask.h>
   27 #include <sys/dsl_destroy.h>
   28 #include <sys/dmu_impl.h>
   29 #include <sys/dmu_tx.h>
   30 #include <sys/arc.h>
   31 #include <sys/zap.h>
   32 #include <sys/zfeature.h>
   33 #include <sys/spa.h>
   34 #include <sys/dsl_bookmark.h>
   35 #include <zfs_namecheck.h>
   36 #include <sys/dmu_send.h>
   37 
   38 static int
   39 dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
   40     dsl_dataset_t **dsp, const void *tag, char **shortnamep)
   41 {
   42         char buf[ZFS_MAX_DATASET_NAME_LEN];
   43         char *hashp;
   44 
   45         if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN)
   46                 return (SET_ERROR(ENAMETOOLONG));
   47         hashp = strchr(fullname, '#');
   48         if (hashp == NULL)
   49                 return (SET_ERROR(EINVAL));
   50 
   51         *shortnamep = hashp + 1;
   52         if (zfs_component_namecheck(*shortnamep, NULL, NULL))
   53                 return (SET_ERROR(EINVAL));
   54         (void) strlcpy(buf, fullname, hashp - fullname + 1);
   55         return (dsl_dataset_hold(dp, buf, tag, dsp));
   56 }
   57 
   58 /*
   59  * When reading BOOKMARK_V1 bookmarks, the BOOKMARK_V2 fields are guaranteed
   60  * to be zeroed.
   61  *
   62  * Returns ESRCH if bookmark is not found.
   63  * Note, we need to use the ZAP rather than the AVL to look up bookmarks
   64  * by name, because only the ZAP honors the casesensitivity setting.
   65  */
   66 int
   67 dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname,
   68     zfs_bookmark_phys_t *bmark_phys)
   69 {
   70         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
   71         uint64_t bmark_zapobj = ds->ds_bookmarks_obj;
   72         matchtype_t mt = 0;
   73         int err;
   74 
   75         if (bmark_zapobj == 0)
   76                 return (SET_ERROR(ESRCH));
   77 
   78         if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
   79                 mt = MT_NORMALIZE;
   80 
   81         /*
   82          * Zero out the bookmark in case the one stored on disk
   83          * is in an older, shorter format.
   84          */
   85         memset(bmark_phys, 0, sizeof (*bmark_phys));
   86 
   87         err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t),
   88             sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0,
   89             NULL);
   90 
   91         return (err == ENOENT ? SET_ERROR(ESRCH) : err);
   92 }
   93 
   94 /*
   95  * If later_ds is non-NULL, this will return EXDEV if the specified bookmark
   96  * does not represents an earlier point in later_ds's timeline.  However,
   97  * bmp will still be filled in if we return EXDEV.
   98  *
   99  * Returns ENOENT if the dataset containing the bookmark does not exist.
  100  * Returns ESRCH if the dataset exists but the bookmark was not found in it.
  101  */
  102 int
  103 dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname,
  104     dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp)
  105 {
  106         char *shortname;
  107         dsl_dataset_t *ds;
  108         int error;
  109 
  110         error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname);
  111         if (error != 0)
  112                 return (error);
  113 
  114         error = dsl_bookmark_lookup_impl(ds, shortname, bmp);
  115         if (error == 0 && later_ds != NULL) {
  116                 if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg))
  117                         error = SET_ERROR(EXDEV);
  118         }
  119         dsl_dataset_rele(ds, FTAG);
  120         return (error);
  121 }
  122 
  123 /*
  124  * Validates that
  125  * - bmark is a full dataset path of a bookmark (bookmark_namecheck)
  126  * - source is a full path of a snapshot or bookmark
  127  *   ({bookmark,snapshot}_namecheck)
  128  *
  129  * Returns 0 if valid, -1 otherwise.
  130  */
  131 static int
  132 dsl_bookmark_create_nvl_validate_pair(const char *bmark, const char *source)
  133 {
  134         if (bookmark_namecheck(bmark, NULL, NULL) != 0)
  135                 return (-1);
  136 
  137         int is_bmark, is_snap;
  138         is_bmark = bookmark_namecheck(source, NULL, NULL) == 0;
  139         is_snap = snapshot_namecheck(source, NULL, NULL) == 0;
  140         if (!is_bmark && !is_snap)
  141                 return (-1);
  142 
  143         return (0);
  144 }
  145 
  146 /*
  147  * Check that the given nvlist corresponds to the following schema:
  148  *  { newbookmark -> source, ... }
  149  * where
  150  * - each pair passes dsl_bookmark_create_nvl_validate_pair
  151  * - all newbookmarks are in the same pool
  152  * - all newbookmarks have unique names
  153  *
  154  * Note that this function is only validates above schema. Callers must ensure
  155  * that the bookmarks can be created, e.g. that sources exist.
  156  *
  157  * Returns 0 if the nvlist adheres to above schema.
  158  * Returns -1 if it doesn't.
  159  */
  160 int
  161 dsl_bookmark_create_nvl_validate(nvlist_t *bmarks)
  162 {
  163         char *first = NULL;
  164         size_t first_len = 0;
  165 
  166         for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
  167             pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) {
  168 
  169                 char *bmark = nvpair_name(pair);
  170                 char *source;
  171 
  172                 /* list structure: values must be snapshots XOR bookmarks */
  173                 if (nvpair_value_string(pair, &source) != 0)
  174                         return (-1);
  175                 if (dsl_bookmark_create_nvl_validate_pair(bmark, source) != 0)
  176                         return (-1);
  177 
  178                 /* same pool check */
  179                 if (first == NULL) {
  180                         char *cp = strpbrk(bmark, "/#");
  181                         if (cp == NULL)
  182                                 return (-1);
  183                         first = bmark;
  184                         first_len = cp - bmark;
  185                 }
  186                 if (strncmp(first, bmark, first_len) != 0)
  187                         return (-1);
  188                 switch (*(bmark + first_len)) {
  189                         case '/': /* fallthrough */
  190                         case '#':
  191                                 break;
  192                         default:
  193                                 return (-1);
  194                 }
  195 
  196                 /* unique newbookmark names; todo: O(n^2) */
  197                 for (nvpair_t *pair2 = nvlist_next_nvpair(bmarks, pair);
  198                     pair2 != NULL; pair2 = nvlist_next_nvpair(bmarks, pair2)) {
  199                         if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
  200                                 return (-1);
  201                 }
  202 
  203         }
  204         return (0);
  205 }
  206 
  207 /*
  208  * expects that newbm and source have been validated using
  209  * dsl_bookmark_create_nvl_validate_pair
  210  */
  211 static int
  212 dsl_bookmark_create_check_impl(dsl_pool_t *dp,
  213     const char *newbm, const char *source)
  214 {
  215         ASSERT0(dsl_bookmark_create_nvl_validate_pair(newbm, source));
  216         /* defer source namecheck until we know it's a snapshot or bookmark */
  217 
  218         int error;
  219         dsl_dataset_t *newbm_ds;
  220         char *newbm_short;
  221         zfs_bookmark_phys_t bmark_phys;
  222 
  223         error = dsl_bookmark_hold_ds(dp, newbm, &newbm_ds, FTAG, &newbm_short);
  224         if (error != 0)
  225                 return (error);
  226 
  227         /* Verify that the new bookmark does not already exist */
  228         error = dsl_bookmark_lookup_impl(newbm_ds, newbm_short, &bmark_phys);
  229         switch (error) {
  230         case ESRCH:
  231                 /* happy path: new bmark doesn't exist, proceed after switch */
  232                 break;
  233         case 0:
  234                 error = SET_ERROR(EEXIST);
  235                 goto eholdnewbmds;
  236         default:
  237                 /* dsl_bookmark_lookup_impl already did SET_ERROR */
  238                 goto eholdnewbmds;
  239         }
  240 
  241         /* error is retval of the following if-cascade */
  242         if (strchr(source, '@') != NULL) {
  243                 dsl_dataset_t *source_snap_ds;
  244                 ASSERT3S(snapshot_namecheck(source, NULL, NULL), ==, 0);
  245                 error = dsl_dataset_hold(dp, source, FTAG, &source_snap_ds);
  246                 if (error == 0) {
  247                         VERIFY(source_snap_ds->ds_is_snapshot);
  248                         /*
  249                          * Verify that source snapshot is an earlier point in
  250                          * newbm_ds's timeline (source may be newbm_ds's origin)
  251                          */
  252                         if (!dsl_dataset_is_before(newbm_ds, source_snap_ds, 0))
  253                                 error = SET_ERROR(
  254                                     ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR);
  255                         dsl_dataset_rele(source_snap_ds, FTAG);
  256                 }
  257         } else if (strchr(source, '#') != NULL) {
  258                 zfs_bookmark_phys_t source_phys;
  259                 ASSERT3S(bookmark_namecheck(source, NULL, NULL), ==, 0);
  260                 /*
  261                  * Source must exists and be an earlier point in newbm_ds's
  262                  * timeline (newbm_ds's origin may be a snap of source's ds)
  263                  */
  264                 error = dsl_bookmark_lookup(dp, source, newbm_ds, &source_phys);
  265                 switch (error) {
  266                 case 0:
  267                         break; /* happy path */
  268                 case EXDEV:
  269                         error = SET_ERROR(ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR);
  270                         break;
  271                 default:
  272                         /* dsl_bookmark_lookup already did SET_ERROR */
  273                         break;
  274                 }
  275         } else {
  276                 /*
  277                  * dsl_bookmark_create_nvl_validate validates that source is
  278                  * either snapshot or bookmark
  279                  */
  280                 panic("unreachable code: %s", source);
  281         }
  282 
  283 eholdnewbmds:
  284         dsl_dataset_rele(newbm_ds, FTAG);
  285         return (error);
  286 }
  287 
  288 int
  289 dsl_bookmark_create_check(void *arg, dmu_tx_t *tx)
  290 {
  291         dsl_bookmark_create_arg_t *dbca = arg;
  292         int rv = 0;
  293         int schema_err = 0;
  294         ASSERT3P(dbca, !=, NULL);
  295         ASSERT3P(dbca->dbca_bmarks, !=, NULL);
  296         /* dbca->dbca_errors is allowed to be NULL */
  297 
  298         dsl_pool_t *dp = dmu_tx_pool(tx);
  299 
  300         if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
  301                 return (SET_ERROR(ENOTSUP));
  302 
  303         if (dsl_bookmark_create_nvl_validate(dbca->dbca_bmarks) != 0)
  304                 rv = schema_err = SET_ERROR(EINVAL);
  305 
  306         for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
  307             pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
  308                 char *new = nvpair_name(pair);
  309 
  310                 int error = schema_err;
  311                 if (error == 0) {
  312                         char *source = fnvpair_value_string(pair);
  313                         error = dsl_bookmark_create_check_impl(dp, new, source);
  314                         if (error != 0)
  315                                 error = SET_ERROR(error);
  316                 }
  317 
  318                 if (error != 0) {
  319                         rv = error;
  320                         if (dbca->dbca_errors != NULL)
  321                                 fnvlist_add_int32(dbca->dbca_errors,
  322                                     new, error);
  323                 }
  324         }
  325 
  326         return (rv);
  327 }
  328 
  329 static dsl_bookmark_node_t *
  330 dsl_bookmark_node_alloc(char *shortname)
  331 {
  332         dsl_bookmark_node_t *dbn = kmem_alloc(sizeof (*dbn), KM_SLEEP);
  333         dbn->dbn_name = spa_strdup(shortname);
  334         dbn->dbn_dirty = B_FALSE;
  335         mutex_init(&dbn->dbn_lock, NULL, MUTEX_DEFAULT, NULL);
  336         return (dbn);
  337 }
  338 
  339 /*
  340  * Set the fields in the zfs_bookmark_phys_t based on the specified snapshot.
  341  */
  342 static void
  343 dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap)
  344 {
  345         spa_t *spa = dsl_dataset_get_spa(snap);
  346         objset_t *mos = spa_get_dsl(spa)->dp_meta_objset;
  347         dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap);
  348 
  349         memset(zbm, 0, sizeof (zfs_bookmark_phys_t));
  350         zbm->zbm_guid = dsp->ds_guid;
  351         zbm->zbm_creation_txg = dsp->ds_creation_txg;
  352         zbm->zbm_creation_time = dsp->ds_creation_time;
  353         zbm->zbm_redaction_obj = 0;
  354 
  355         /*
  356          * If the dataset is encrypted create a larger bookmark to
  357          * accommodate the IVset guid. The IVset guid was added
  358          * after the encryption feature to prevent a problem with
  359          * raw sends. If we encounter an encrypted dataset without
  360          * an IVset guid we fall back to a normal bookmark.
  361          */
  362         if (snap->ds_dir->dd_crypto_obj != 0 &&
  363             spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
  364                 (void) zap_lookup(mos, snap->ds_object,
  365                     DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
  366                     &zbm->zbm_ivset_guid);
  367         }
  368 
  369         if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_WRITTEN)) {
  370                 zbm->zbm_flags = ZBM_FLAG_SNAPSHOT_EXISTS | ZBM_FLAG_HAS_FBN;
  371                 zbm->zbm_referenced_bytes_refd = dsp->ds_referenced_bytes;
  372                 zbm->zbm_compressed_bytes_refd = dsp->ds_compressed_bytes;
  373                 zbm->zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes;
  374 
  375                 dsl_dataset_t *nextds;
  376                 VERIFY0(dsl_dataset_hold_obj(snap->ds_dir->dd_pool,
  377                     dsp->ds_next_snap_obj, FTAG, &nextds));
  378                 dsl_deadlist_space(&nextds->ds_deadlist,
  379                     &zbm->zbm_referenced_freed_before_next_snap,
  380                     &zbm->zbm_compressed_freed_before_next_snap,
  381                     &zbm->zbm_uncompressed_freed_before_next_snap);
  382                 dsl_dataset_rele(nextds, FTAG);
  383         }
  384 }
  385 
  386 /*
  387  * Add dsl_bookmark_node_t `dbn` to the given dataset and increment appropriate
  388  * SPA feature counters.
  389  */
  390 void
  391 dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn,
  392     dmu_tx_t *tx)
  393 {
  394         dsl_pool_t *dp = dmu_tx_pool(tx);
  395         objset_t *mos = dp->dp_meta_objset;
  396 
  397         if (hds->ds_bookmarks_obj == 0) {
  398                 hds->ds_bookmarks_obj = zap_create_norm(mos,
  399                     U8_TEXTPREP_TOUPPER, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0,
  400                     tx);
  401                 spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
  402 
  403                 dsl_dataset_zapify(hds, tx);
  404                 VERIFY0(zap_add(mos, hds->ds_object,
  405                     DS_FIELD_BOOKMARK_NAMES,
  406                     sizeof (hds->ds_bookmarks_obj), 1,
  407                     &hds->ds_bookmarks_obj, tx));
  408         }
  409 
  410         avl_add(&hds->ds_bookmarks, dbn);
  411 
  412         /*
  413          * To maintain backwards compatibility with software that doesn't
  414          * understand SPA_FEATURE_BOOKMARK_V2, we need to use the smallest
  415          * possible bookmark size.
  416          */
  417         uint64_t bookmark_phys_size = BOOKMARK_PHYS_SIZE_V1;
  418         if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2) &&
  419             (dbn->dbn_phys.zbm_ivset_guid != 0 || dbn->dbn_phys.zbm_flags &
  420             ZBM_FLAG_HAS_FBN || dbn->dbn_phys.zbm_redaction_obj != 0)) {
  421                 bookmark_phys_size = BOOKMARK_PHYS_SIZE_V2;
  422                 spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx);
  423         }
  424 
  425         zfs_bookmark_phys_t zero_phys = { 0 };
  426         ASSERT0(memcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size,
  427             &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size));
  428 
  429         VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name,
  430             sizeof (uint64_t), bookmark_phys_size / sizeof (uint64_t),
  431             &dbn->dbn_phys, tx));
  432 }
  433 
  434 /*
  435  * If redaction_list is non-null, we create a redacted bookmark and redaction
  436  * list, and store the object number of the redaction list in redact_obj.
  437  */
  438 static void
  439 dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
  440     dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps,
  441     const void *tag, redaction_list_t **redaction_list)
  442 {
  443         dsl_pool_t *dp = dmu_tx_pool(tx);
  444         objset_t *mos = dp->dp_meta_objset;
  445         dsl_dataset_t *snapds, *bmark_fs;
  446         char *shortname;
  447         boolean_t bookmark_redacted;
  448         uint64_t *dsredactsnaps;
  449         uint64_t dsnumsnaps;
  450 
  451         VERIFY0(dsl_dataset_hold(dp, snapshot, FTAG, &snapds));
  452         VERIFY0(dsl_bookmark_hold_ds(dp, bookmark, &bmark_fs, FTAG,
  453             &shortname));
  454 
  455         dsl_bookmark_node_t *dbn = dsl_bookmark_node_alloc(shortname);
  456         dsl_bookmark_set_phys(&dbn->dbn_phys, snapds);
  457 
  458         bookmark_redacted = dsl_dataset_get_uint64_array_feature(snapds,
  459             SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps);
  460         if (redaction_list != NULL || bookmark_redacted) {
  461                 redaction_list_t *local_rl;
  462                 if (bookmark_redacted) {
  463                         redact_snaps = dsredactsnaps;
  464                         num_redact_snaps = dsnumsnaps;
  465                 }
  466                 dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos,
  467                     DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
  468                     DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) +
  469                     num_redact_snaps * sizeof (uint64_t), tx);
  470                 spa_feature_incr(dp->dp_spa,
  471                     SPA_FEATURE_REDACTION_BOOKMARKS, tx);
  472 
  473                 VERIFY0(dsl_redaction_list_hold_obj(dp,
  474                     dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl));
  475                 dsl_redaction_list_long_hold(dp, local_rl, tag);
  476 
  477                 ASSERT3U((local_rl)->rl_dbuf->db_size, >=,
  478                     sizeof (redaction_list_phys_t) + num_redact_snaps *
  479                     sizeof (uint64_t));
  480                 dmu_buf_will_dirty(local_rl->rl_dbuf, tx);
  481                 memcpy(local_rl->rl_phys->rlp_snaps, redact_snaps,
  482                     sizeof (uint64_t) * num_redact_snaps);
  483                 local_rl->rl_phys->rlp_num_snaps = num_redact_snaps;
  484                 if (bookmark_redacted) {
  485                         ASSERT3P(redaction_list, ==, NULL);
  486                         local_rl->rl_phys->rlp_last_blkid = UINT64_MAX;
  487                         local_rl->rl_phys->rlp_last_object = UINT64_MAX;
  488                         dsl_redaction_list_long_rele(local_rl, tag);
  489                         dsl_redaction_list_rele(local_rl, tag);
  490                 } else {
  491                         *redaction_list = local_rl;
  492                 }
  493         }
  494 
  495         if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
  496                 spa_feature_incr(dp->dp_spa,
  497                     SPA_FEATURE_BOOKMARK_WRITTEN, tx);
  498         }
  499 
  500         dsl_bookmark_node_add(bmark_fs, dbn, tx);
  501 
  502         spa_history_log_internal_ds(bmark_fs, "bookmark", tx,
  503             "name=%s creation_txg=%llu target_snap=%llu redact_obj=%llu",
  504             shortname, (longlong_t)dbn->dbn_phys.zbm_creation_txg,
  505             (longlong_t)snapds->ds_object,
  506             (longlong_t)dbn->dbn_phys.zbm_redaction_obj);
  507 
  508         dsl_dataset_rele(bmark_fs, FTAG);
  509         dsl_dataset_rele(snapds, FTAG);
  510 }
  511 
  512 
  513 static void
  514 dsl_bookmark_create_sync_impl_book(
  515     const char *new_name, const char *source_name, dmu_tx_t *tx)
  516 {
  517         dsl_pool_t *dp = dmu_tx_pool(tx);
  518         dsl_dataset_t *bmark_fs_source, *bmark_fs_new;
  519         char *source_shortname, *new_shortname;
  520         zfs_bookmark_phys_t source_phys;
  521 
  522         VERIFY0(dsl_bookmark_hold_ds(dp, source_name, &bmark_fs_source, FTAG,
  523             &source_shortname));
  524         VERIFY0(dsl_bookmark_hold_ds(dp, new_name, &bmark_fs_new, FTAG,
  525             &new_shortname));
  526 
  527         /*
  528          * create a copy of the source bookmark by copying most of its members
  529          *
  530          * Caveat: bookmarking a redaction bookmark yields a normal bookmark
  531          * -----------------------------------------------------------------
  532          * Reasoning:
  533          * - The zbm_redaction_obj would be referred to by both source and new
  534          *   bookmark, but would be destroyed once either source or new is
  535          *   destroyed, resulting in use-after-free of the referred object.
  536          * - User expectation when issuing the `zfs bookmark` command is that
  537          *   a normal bookmark of the source is created
  538          *
  539          * Design Alternatives For Full Redaction Bookmark Copying:
  540          * - reference-count the redaction object => would require on-disk
  541          *   format change for existing redaction objects
  542          * - Copy the redaction object => cannot be done in syncing context
  543          *   because the redaction object might be too large
  544          */
  545 
  546         VERIFY0(dsl_bookmark_lookup_impl(bmark_fs_source, source_shortname,
  547             &source_phys));
  548         dsl_bookmark_node_t *new_dbn = dsl_bookmark_node_alloc(new_shortname);
  549 
  550         memcpy(&new_dbn->dbn_phys, &source_phys, sizeof (source_phys));
  551         new_dbn->dbn_phys.zbm_redaction_obj = 0;
  552 
  553         /* update feature counters */
  554         if (new_dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
  555                 spa_feature_incr(dp->dp_spa,
  556                     SPA_FEATURE_BOOKMARK_WRITTEN, tx);
  557         }
  558         /* no need for redaction bookmark counter; nulled zbm_redaction_obj */
  559         /* dsl_bookmark_node_add bumps bookmarks and v2-bookmarks counter */
  560 
  561         /*
  562          * write new bookmark
  563          *
  564          * Note that dsl_bookmark_lookup_impl guarantees that, if source is a
  565          * v1 bookmark, the v2-only fields are zeroed.
  566          * And dsl_bookmark_node_add writes back a v1-sized bookmark if
  567          * v2 bookmarks are disabled and/or v2-only fields are zeroed.
  568          * => bookmark copying works on pre-bookmark-v2 pools
  569          */
  570         dsl_bookmark_node_add(bmark_fs_new, new_dbn, tx);
  571 
  572         spa_history_log_internal_ds(bmark_fs_source, "bookmark", tx,
  573             "name=%s creation_txg=%llu source_guid=%llu",
  574             new_shortname, (longlong_t)new_dbn->dbn_phys.zbm_creation_txg,
  575             (longlong_t)source_phys.zbm_guid);
  576 
  577         dsl_dataset_rele(bmark_fs_source, FTAG);
  578         dsl_dataset_rele(bmark_fs_new, FTAG);
  579 }
  580 
  581 void
  582 dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx)
  583 {
  584         dsl_bookmark_create_arg_t *dbca = arg;
  585 
  586         ASSERT(spa_feature_is_enabled(dmu_tx_pool(tx)->dp_spa,
  587             SPA_FEATURE_BOOKMARKS));
  588 
  589         for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
  590             pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
  591 
  592                 char *new = nvpair_name(pair);
  593                 char *source = fnvpair_value_string(pair);
  594 
  595                 if (strchr(source, '@') != NULL) {
  596                         dsl_bookmark_create_sync_impl_snap(new, source, tx,
  597                             0, NULL, NULL, NULL);
  598                 } else if (strchr(source, '#') != NULL) {
  599                         dsl_bookmark_create_sync_impl_book(new, source, tx);
  600                 } else {
  601                         panic("unreachable code");
  602                 }
  603 
  604         }
  605 }
  606 
  607 /*
  608  * The bookmarks must all be in the same pool.
  609  */
  610 int
  611 dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors)
  612 {
  613         nvpair_t *pair;
  614         dsl_bookmark_create_arg_t dbca;
  615 
  616         pair = nvlist_next_nvpair(bmarks, NULL);
  617         if (pair == NULL)
  618                 return (0);
  619 
  620         dbca.dbca_bmarks = bmarks;
  621         dbca.dbca_errors = errors;
  622 
  623         return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check,
  624             dsl_bookmark_create_sync, &dbca,
  625             fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL));
  626 }
  627 
  628 static int
  629 dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx)
  630 {
  631         dsl_bookmark_create_redacted_arg_t *dbcra = arg;
  632         dsl_pool_t *dp = dmu_tx_pool(tx);
  633         int rv = 0;
  634 
  635         if (!spa_feature_is_enabled(dp->dp_spa,
  636             SPA_FEATURE_REDACTION_BOOKMARKS))
  637                 return (SET_ERROR(ENOTSUP));
  638         /*
  639          * If the list of redact snaps will not fit in the bonus buffer with
  640          * the furthest reached object and offset, fail.
  641          */
  642         if (dbcra->dbcra_numsnaps > (dmu_bonus_max() -
  643             sizeof (redaction_list_phys_t)) / sizeof (uint64_t))
  644                 return (SET_ERROR(E2BIG));
  645 
  646         if (dsl_bookmark_create_nvl_validate_pair(
  647             dbcra->dbcra_bmark, dbcra->dbcra_snap) != 0)
  648                 return (SET_ERROR(EINVAL));
  649 
  650         rv = dsl_bookmark_create_check_impl(dp,
  651             dbcra->dbcra_bmark, dbcra->dbcra_snap);
  652         return (rv);
  653 }
  654 
  655 static void
  656 dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx)
  657 {
  658         dsl_bookmark_create_redacted_arg_t *dbcra = arg;
  659         dsl_bookmark_create_sync_impl_snap(dbcra->dbcra_bmark,
  660             dbcra->dbcra_snap, tx, dbcra->dbcra_numsnaps, dbcra->dbcra_snaps,
  661             dbcra->dbcra_tag, dbcra->dbcra_rl);
  662 }
  663 
  664 int
  665 dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot,
  666     uint64_t numsnaps, uint64_t *snapguids, const void *tag,
  667     redaction_list_t **rl)
  668 {
  669         dsl_bookmark_create_redacted_arg_t dbcra;
  670 
  671         dbcra.dbcra_bmark = bookmark;
  672         dbcra.dbcra_snap = snapshot;
  673         dbcra.dbcra_rl = rl;
  674         dbcra.dbcra_numsnaps = numsnaps;
  675         dbcra.dbcra_snaps = snapguids;
  676         dbcra.dbcra_tag = tag;
  677 
  678         return (dsl_sync_task(bookmark, dsl_bookmark_create_redacted_check,
  679             dsl_bookmark_create_redacted_sync, &dbcra, 5,
  680             ZFS_SPACE_CHECK_NORMAL));
  681 }
  682 
  683 /*
  684  * Retrieve the list of properties given in the 'props' nvlist for a bookmark.
  685  * If 'props' is NULL, retrieves all properties.
  686  */
  687 static void
  688 dsl_bookmark_fetch_props(dsl_pool_t *dp, zfs_bookmark_phys_t *bmark_phys,
  689     nvlist_t *props, nvlist_t *out_props)
  690 {
  691         ASSERT3P(dp, !=, NULL);
  692         ASSERT3P(bmark_phys, !=, NULL);
  693         ASSERT3P(out_props, !=, NULL);
  694         ASSERT(RRW_LOCK_HELD(&dp->dp_config_rwlock));
  695 
  696         if (props == NULL || nvlist_exists(props,
  697             zfs_prop_to_name(ZFS_PROP_GUID))) {
  698                 dsl_prop_nvlist_add_uint64(out_props,
  699                     ZFS_PROP_GUID, bmark_phys->zbm_guid);
  700         }
  701         if (props == NULL || nvlist_exists(props,
  702             zfs_prop_to_name(ZFS_PROP_CREATETXG))) {
  703                 dsl_prop_nvlist_add_uint64(out_props,
  704                     ZFS_PROP_CREATETXG, bmark_phys->zbm_creation_txg);
  705         }
  706         if (props == NULL || nvlist_exists(props,
  707             zfs_prop_to_name(ZFS_PROP_CREATION))) {
  708                 dsl_prop_nvlist_add_uint64(out_props,
  709                     ZFS_PROP_CREATION, bmark_phys->zbm_creation_time);
  710         }
  711         if (props == NULL || nvlist_exists(props,
  712             zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) {
  713                 dsl_prop_nvlist_add_uint64(out_props,
  714                     ZFS_PROP_IVSET_GUID, bmark_phys->zbm_ivset_guid);
  715         }
  716         if (bmark_phys->zbm_flags & ZBM_FLAG_HAS_FBN) {
  717                 if (props == NULL || nvlist_exists(props,
  718                     zfs_prop_to_name(ZFS_PROP_REFERENCED))) {
  719                         dsl_prop_nvlist_add_uint64(out_props,
  720                             ZFS_PROP_REFERENCED,
  721                             bmark_phys->zbm_referenced_bytes_refd);
  722                 }
  723                 if (props == NULL || nvlist_exists(props,
  724                     zfs_prop_to_name(ZFS_PROP_LOGICALREFERENCED))) {
  725                         dsl_prop_nvlist_add_uint64(out_props,
  726                             ZFS_PROP_LOGICALREFERENCED,
  727                             bmark_phys->zbm_uncompressed_bytes_refd);
  728                 }
  729                 if (props == NULL || nvlist_exists(props,
  730                     zfs_prop_to_name(ZFS_PROP_REFRATIO))) {
  731                         uint64_t ratio =
  732                             bmark_phys->zbm_compressed_bytes_refd == 0 ? 100 :
  733                             bmark_phys->zbm_uncompressed_bytes_refd * 100 /
  734                             bmark_phys->zbm_compressed_bytes_refd;
  735                         dsl_prop_nvlist_add_uint64(out_props,
  736                             ZFS_PROP_REFRATIO, ratio);
  737                 }
  738         }
  739 
  740         if ((props == NULL || nvlist_exists(props, "redact_snaps") ||
  741             nvlist_exists(props, "redact_complete")) &&
  742             bmark_phys->zbm_redaction_obj != 0) {
  743                 redaction_list_t *rl;
  744                 int err = dsl_redaction_list_hold_obj(dp,
  745                     bmark_phys->zbm_redaction_obj, FTAG, &rl);
  746                 if (err == 0) {
  747                         if (nvlist_exists(props, "redact_snaps")) {
  748                                 nvlist_t *nvl;
  749                                 nvl = fnvlist_alloc();
  750                                 fnvlist_add_uint64_array(nvl, ZPROP_VALUE,
  751                                     rl->rl_phys->rlp_snaps,
  752                                     rl->rl_phys->rlp_num_snaps);
  753                                 fnvlist_add_nvlist(out_props, "redact_snaps",
  754                                     nvl);
  755                                 nvlist_free(nvl);
  756                         }
  757                         if (nvlist_exists(props, "redact_complete")) {
  758                                 nvlist_t *nvl;
  759                                 nvl = fnvlist_alloc();
  760                                 fnvlist_add_boolean_value(nvl, ZPROP_VALUE,
  761                                     rl->rl_phys->rlp_last_blkid == UINT64_MAX &&
  762                                     rl->rl_phys->rlp_last_object == UINT64_MAX);
  763                                 fnvlist_add_nvlist(out_props, "redact_complete",
  764                                     nvl);
  765                                 nvlist_free(nvl);
  766                         }
  767                         dsl_redaction_list_rele(rl, FTAG);
  768                 }
  769         }
  770 }
  771 
  772 int
  773 dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl)
  774 {
  775         dsl_pool_t *dp = ds->ds_dir->dd_pool;
  776 
  777         ASSERT(dsl_pool_config_held(dp));
  778 
  779         if (dsl_dataset_is_snapshot(ds))
  780                 return (SET_ERROR(EINVAL));
  781 
  782         for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks);
  783             dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) {
  784                 nvlist_t *out_props = fnvlist_alloc();
  785 
  786                 dsl_bookmark_fetch_props(dp, &dbn->dbn_phys, props, out_props);
  787 
  788                 fnvlist_add_nvlist(outnvl, dbn->dbn_name, out_props);
  789                 fnvlist_free(out_props);
  790         }
  791         return (0);
  792 }
  793 
  794 /*
  795  * Comparison func for ds_bookmarks AVL tree.  We sort the bookmarks by
  796  * their TXG, then by their FBN-ness.  The "FBN-ness" component ensures
  797  * that all bookmarks at the same TXG that HAS_FBN are adjacent, which
  798  * dsl_bookmark_destroy_sync_impl() depends on.  Note that there may be
  799  * multiple bookmarks at the same TXG (with the same FBN-ness).  In this
  800  * case we differentiate them by an arbitrary metric (in this case,
  801  * their names).
  802  */
  803 static int
  804 dsl_bookmark_compare(const void *l, const void *r)
  805 {
  806         const dsl_bookmark_node_t *ldbn = l;
  807         const dsl_bookmark_node_t *rdbn = r;
  808 
  809         int64_t cmp = TREE_CMP(ldbn->dbn_phys.zbm_creation_txg,
  810             rdbn->dbn_phys.zbm_creation_txg);
  811         if (likely(cmp))
  812                 return (cmp);
  813         cmp = TREE_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN),
  814             (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN));
  815         if (likely(cmp))
  816                 return (cmp);
  817         cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name);
  818         return (TREE_ISIGN(cmp));
  819 }
  820 
  821 /*
  822  * Cache this (head) dataset's bookmarks in the ds_bookmarks AVL tree.
  823  */
  824 int
  825 dsl_bookmark_init_ds(dsl_dataset_t *ds)
  826 {
  827         dsl_pool_t *dp = ds->ds_dir->dd_pool;
  828         objset_t *mos = dp->dp_meta_objset;
  829 
  830         ASSERT(!ds->ds_is_snapshot);
  831 
  832         avl_create(&ds->ds_bookmarks, dsl_bookmark_compare,
  833             sizeof (dsl_bookmark_node_t),
  834             offsetof(dsl_bookmark_node_t, dbn_node));
  835 
  836         if (!dsl_dataset_is_zapified(ds))
  837                 return (0);
  838 
  839         int zaperr = zap_lookup(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES,
  840             sizeof (ds->ds_bookmarks_obj), 1, &ds->ds_bookmarks_obj);
  841         if (zaperr == ENOENT)
  842                 return (0);
  843         if (zaperr != 0)
  844                 return (zaperr);
  845 
  846         if (ds->ds_bookmarks_obj == 0)
  847                 return (0);
  848 
  849         int err = 0;
  850         zap_cursor_t zc;
  851         zap_attribute_t attr;
  852 
  853         for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
  854             (err = zap_cursor_retrieve(&zc, &attr)) == 0;
  855             zap_cursor_advance(&zc)) {
  856                 dsl_bookmark_node_t *dbn =
  857                     dsl_bookmark_node_alloc(attr.za_name);
  858 
  859                 err = dsl_bookmark_lookup_impl(ds,
  860                     dbn->dbn_name, &dbn->dbn_phys);
  861                 ASSERT3U(err, !=, ENOENT);
  862                 if (err != 0) {
  863                         kmem_free(dbn, sizeof (*dbn));
  864                         break;
  865                 }
  866                 avl_add(&ds->ds_bookmarks, dbn);
  867         }
  868         zap_cursor_fini(&zc);
  869         if (err == ENOENT)
  870                 err = 0;
  871         return (err);
  872 }
  873 
  874 void
  875 dsl_bookmark_fini_ds(dsl_dataset_t *ds)
  876 {
  877         void *cookie = NULL;
  878         dsl_bookmark_node_t *dbn;
  879 
  880         if (ds->ds_is_snapshot)
  881                 return;
  882 
  883         while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) {
  884                 spa_strfree(dbn->dbn_name);
  885                 mutex_destroy(&dbn->dbn_lock);
  886                 kmem_free(dbn, sizeof (*dbn));
  887         }
  888         avl_destroy(&ds->ds_bookmarks);
  889 }
  890 
  891 /*
  892  * Retrieve the bookmarks that exist in the specified dataset, and the
  893  * requested properties of each bookmark.
  894  *
  895  * The "props" nvlist specifies which properties are requested.
  896  * See lzc_get_bookmarks() for the list of valid properties.
  897  */
  898 int
  899 dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl)
  900 {
  901         dsl_pool_t *dp;
  902         dsl_dataset_t *ds;
  903         int err;
  904 
  905         err = dsl_pool_hold(dsname, FTAG, &dp);
  906         if (err != 0)
  907                 return (err);
  908         err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
  909         if (err != 0) {
  910                 dsl_pool_rele(dp, FTAG);
  911                 return (err);
  912         }
  913 
  914         err = dsl_get_bookmarks_impl(ds, props, outnvl);
  915 
  916         dsl_dataset_rele(ds, FTAG);
  917         dsl_pool_rele(dp, FTAG);
  918         return (err);
  919 }
  920 
  921 /*
  922  * Retrieve all properties for a single bookmark in the given dataset.
  923  */
  924 int
  925 dsl_get_bookmark_props(const char *dsname, const char *bmname, nvlist_t *props)
  926 {
  927         dsl_pool_t *dp;
  928         dsl_dataset_t *ds;
  929         zfs_bookmark_phys_t bmark_phys = { 0 };
  930         int err;
  931 
  932         err = dsl_pool_hold(dsname, FTAG, &dp);
  933         if (err != 0)
  934                 return (err);
  935         err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
  936         if (err != 0) {
  937                 dsl_pool_rele(dp, FTAG);
  938                 return (err);
  939         }
  940 
  941         err = dsl_bookmark_lookup_impl(ds, bmname, &bmark_phys);
  942         if (err != 0)
  943                 goto out;
  944 
  945         dsl_bookmark_fetch_props(dp, &bmark_phys, NULL, props);
  946 out:
  947         dsl_dataset_rele(ds, FTAG);
  948         dsl_pool_rele(dp, FTAG);
  949         return (err);
  950 }
  951 
  952 typedef struct dsl_bookmark_destroy_arg { /* state shared by the bookmark-destroy check and sync callbacks */
  953         nvlist_t *dbda_bmarks; /* in: pairs whose names are the full names of the bookmarks to destroy */
  954         nvlist_t *dbda_success; /* full names the check callback accepted in syncing context; the sync callback destroys exactly these */
  955         nvlist_t *dbda_errors; /* out: full bookmark name -> int32 error, one entry per bookmark the check rejected */
  956 } dsl_bookmark_destroy_arg_t;
  957 
  958 static void
  959 dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name,
  960     dmu_tx_t *tx)
  961 {
  962         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
  963         uint64_t bmark_zapobj = ds->ds_bookmarks_obj;
  964         matchtype_t mt = 0;
  965         uint64_t int_size, num_ints;
  966         /*
  967          * 'search' must be zeroed so that dbn_flags (which is used in
  968          * dsl_bookmark_compare()) will be zeroed even if the on-disk
  969          * (in ZAP) bookmark is shorter than offsetof(dbn_flags).
  970          */
  971         dsl_bookmark_node_t search = { 0 };
  972         char realname[ZFS_MAX_DATASET_NAME_LEN];
  973 
  974         /*
  975          * Find the real name of this bookmark, which may be different
  976          * from the given name if the dataset is case-insensitive.  Then
  977          * use the real name to find the node in the ds_bookmarks AVL tree.
  978          */
  979 
  980         if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
  981                 mt = MT_NORMALIZE;
  982 
  983         VERIFY0(zap_length(mos, bmark_zapobj, name, &int_size, &num_ints));
  984 
  985         ASSERT3U(int_size, ==, sizeof (uint64_t));
  986 
  987         if (num_ints * int_size > BOOKMARK_PHYS_SIZE_V1) {
  988                 spa_feature_decr(dmu_objset_spa(mos),
  989                     SPA_FEATURE_BOOKMARK_V2, tx);
  990         }
  991         VERIFY0(zap_lookup_norm(mos, bmark_zapobj, name, sizeof (uint64_t),
  992             num_ints, &search.dbn_phys, mt, realname, sizeof (realname), NULL));
  993 
  994         search.dbn_name = realname;
  995         dsl_bookmark_node_t *dbn = avl_find(&ds->ds_bookmarks, &search, NULL);
  996         ASSERT(dbn != NULL);
  997 
  998         if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
  999                 /*
 1000                  * If this bookmark HAS_FBN, and it is before the most
 1001                  * recent snapshot, then its TXG is a key in the head's
 1002                  * deadlist (and all clones' heads' deadlists).  If this is
 1003                  * the last thing keeping the key (i.e. there are no more
 1004                  * bookmarks with HAS_FBN at this TXG, and there is no
 1005                  * snapshot at this TXG), then remove the key.
 1006                  *
 1007                  * Note that this algorithm depends on ds_bookmarks being
 1008                  * sorted such that all bookmarks at the same TXG with
 1009                  * HAS_FBN are adjacent (with no non-HAS_FBN bookmarks
 1010                  * at the same TXG in between them).  If this were not
 1011                  * the case, we would need to examine *all* bookmarks
 1012                  * at this TXG, rather than just the adjacent ones.
 1013                  */
 1014 
 1015                 dsl_bookmark_node_t *dbn_prev =
 1016                     AVL_PREV(&ds->ds_bookmarks, dbn);
 1017                 dsl_bookmark_node_t *dbn_next =
 1018                     AVL_NEXT(&ds->ds_bookmarks, dbn);
 1019 
 1020                 boolean_t more_bookmarks_at_this_txg =
 1021                     (dbn_prev != NULL && dbn_prev->dbn_phys.zbm_creation_txg ==
 1022                     dbn->dbn_phys.zbm_creation_txg &&
 1023                     (dbn_prev->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) ||
 1024                     (dbn_next != NULL && dbn_next->dbn_phys.zbm_creation_txg ==
 1025                     dbn->dbn_phys.zbm_creation_txg &&
 1026                     (dbn_next->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN));
 1027 
 1028                 if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS) &&
 1029                     !more_bookmarks_at_this_txg &&
 1030                     dbn->dbn_phys.zbm_creation_txg <
 1031                     dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 1032                         dsl_dir_remove_clones_key(ds->ds_dir,
 1033                             dbn->dbn_phys.zbm_creation_txg, tx);
 1034                         dsl_deadlist_remove_key(&ds->ds_deadlist,
 1035                             dbn->dbn_phys.zbm_creation_txg, tx);
 1036                 }
 1037 
 1038                 spa_feature_decr(dmu_objset_spa(mos),
 1039                     SPA_FEATURE_BOOKMARK_WRITTEN, tx);
 1040         }
 1041 
 1042         if (dbn->dbn_phys.zbm_redaction_obj != 0) {
 1043                 VERIFY0(dmu_object_free(mos,
 1044                     dbn->dbn_phys.zbm_redaction_obj, tx));
 1045                 spa_feature_decr(dmu_objset_spa(mos),
 1046                     SPA_FEATURE_REDACTION_BOOKMARKS, tx);
 1047         }
 1048 
 1049         avl_remove(&ds->ds_bookmarks, dbn);
 1050         spa_strfree(dbn->dbn_name);
 1051         mutex_destroy(&dbn->dbn_lock);
 1052         kmem_free(dbn, sizeof (*dbn));
 1053 
 1054         VERIFY0(zap_remove_norm(mos, bmark_zapobj, name, mt, tx));
 1055 }
 1056 
 1057 static int
 1058 dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx)
 1059 {
 1060         dsl_bookmark_destroy_arg_t *dbda = arg;
 1061         dsl_pool_t *dp = dmu_tx_pool(tx);
 1062         int rv = 0;
 1063 
 1064         ASSERT(nvlist_empty(dbda->dbda_success));
 1065         ASSERT(nvlist_empty(dbda->dbda_errors));
 1066 
 1067         if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
 1068                 return (0);
 1069 
 1070         for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL);
 1071             pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) {
 1072                 const char *fullname = nvpair_name(pair);
 1073                 dsl_dataset_t *ds;
 1074                 zfs_bookmark_phys_t bm;
 1075                 int error;
 1076                 char *shortname;
 1077 
 1078                 error = dsl_bookmark_hold_ds(dp, fullname, &ds,
 1079                     FTAG, &shortname);
 1080                 if (error == ENOENT) {
 1081                         /* ignore it; the bookmark is "already destroyed" */
 1082                         continue;
 1083                 }
 1084                 if (error == 0) {
 1085                         error = dsl_bookmark_lookup_impl(ds, shortname, &bm);
 1086                         dsl_dataset_rele(ds, FTAG);
 1087                         if (error == ESRCH) {
 1088                                 /*
 1089                                  * ignore it; the bookmark is
 1090                                  * "already destroyed"
 1091                                  */
 1092                                 continue;
 1093                         }
 1094                         if (error == 0 && bm.zbm_redaction_obj != 0) {
 1095                                 redaction_list_t *rl = NULL;
 1096                                 error = dsl_redaction_list_hold_obj(tx->tx_pool,
 1097                                     bm.zbm_redaction_obj, FTAG, &rl);
 1098                                 if (error == ENOENT) {
 1099                                         error = 0;
 1100                                 } else if (error == 0 &&
 1101                                     dsl_redaction_list_long_held(rl)) {
 1102                                         error = SET_ERROR(EBUSY);
 1103                                 }
 1104                                 if (rl != NULL) {
 1105                                         dsl_redaction_list_rele(rl, FTAG);
 1106                                 }
 1107                         }
 1108                 }
 1109                 if (error == 0) {
 1110                         if (dmu_tx_is_syncing(tx)) {
 1111                                 fnvlist_add_boolean(dbda->dbda_success,
 1112                                     fullname);
 1113                         }
 1114                 } else {
 1115                         fnvlist_add_int32(dbda->dbda_errors, fullname, error);
 1116                         rv = error;
 1117                 }
 1118         }
 1119         return (rv);
 1120 }
 1121 
 1122 static void
 1123 dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx)
 1124 {
 1125         dsl_bookmark_destroy_arg_t *dbda = arg;
 1126         dsl_pool_t *dp = dmu_tx_pool(tx);
 1127         objset_t *mos = dp->dp_meta_objset;
 1128 
 1129         for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL);
 1130             pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) {
 1131                 dsl_dataset_t *ds;
 1132                 char *shortname;
 1133                 uint64_t zap_cnt;
 1134 
 1135                 VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
 1136                     &ds, FTAG, &shortname));
 1137                 dsl_bookmark_destroy_sync_impl(ds, shortname, tx);
 1138 
 1139                 /*
 1140                  * If all of this dataset's bookmarks have been destroyed,
 1141                  * free the zap object and decrement the feature's use count.
 1142                  */
 1143                 VERIFY0(zap_count(mos, ds->ds_bookmarks_obj, &zap_cnt));
 1144                 if (zap_cnt == 0) {
 1145                         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 1146                         VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx));
 1147                         ds->ds_bookmarks_obj = 0;
 1148                         spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
 1149                         VERIFY0(zap_remove(mos, ds->ds_object,
 1150                             DS_FIELD_BOOKMARK_NAMES, tx));
 1151                 }
 1152 
 1153                 spa_history_log_internal_ds(ds, "remove bookmark", tx,
 1154                     "name=%s", shortname);
 1155 
 1156                 dsl_dataset_rele(ds, FTAG);
 1157         }
 1158 }
 1159 
 1160 /*
 1161  * The bookmarks must all be in the same pool.
 1162  */
 1163 int
 1164 dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors)
 1165 {
 1166         int rv;
 1167         dsl_bookmark_destroy_arg_t dbda;
 1168         nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
 1169         if (pair == NULL)
 1170                 return (0);
 1171 
 1172         dbda.dbda_bmarks = bmarks;
 1173         dbda.dbda_errors = errors;
 1174         dbda.dbda_success = fnvlist_alloc();
 1175 
 1176         rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check,
 1177             dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks),
 1178             ZFS_SPACE_CHECK_RESERVED);
 1179         fnvlist_free(dbda.dbda_success);
 1180         return (rv);
 1181 }
 1182 
 1183 /* Return B_TRUE if there are any long holds on this dataset. */
 1184 boolean_t
 1185 dsl_redaction_list_long_held(redaction_list_t *rl)
 1186 {
 1187         return (!zfs_refcount_is_zero(&rl->rl_longholds));
 1188 }
 1189 
 1190 void
 1191 dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl,
 1192     const void *tag)
 1193 {
 1194         ASSERT(dsl_pool_config_held(dp));
 1195         (void) zfs_refcount_add(&rl->rl_longholds, tag);
 1196 }
 1197 
 1198 void
 1199 dsl_redaction_list_long_rele(redaction_list_t *rl, const void *tag)
 1200 {
 1201         (void) zfs_refcount_remove(&rl->rl_longholds, tag);
 1202 }
 1203 
 1204 static void
 1205 redaction_list_evict_sync(void *rlu)
 1206 {
 1207         redaction_list_t *rl = rlu;
 1208         zfs_refcount_destroy(&rl->rl_longholds);
 1209 
 1210         kmem_free(rl, sizeof (redaction_list_t));
 1211 }
 1212 
 1213 void
 1214 dsl_redaction_list_rele(redaction_list_t *rl, const void *tag)
 1215 {
 1216         dmu_buf_rele(rl->rl_dbuf, tag);
 1217 }
 1218 
 1219 int
 1220 dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag,
 1221     redaction_list_t **rlp)
 1222 {
 1223         objset_t *mos = dp->dp_meta_objset;
 1224         dmu_buf_t *dbuf;
 1225         redaction_list_t *rl;
 1226         int err;
 1227 
 1228         ASSERT(dsl_pool_config_held(dp));
 1229 
 1230         err = dmu_bonus_hold(mos, rlobj, tag, &dbuf);
 1231         if (err != 0)
 1232                 return (err);
 1233 
 1234         rl = dmu_buf_get_user(dbuf);
 1235         if (rl == NULL) {
 1236                 redaction_list_t *winner = NULL;
 1237 
 1238                 rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP);
 1239                 rl->rl_dbuf = dbuf;
 1240                 rl->rl_object = rlobj;
 1241                 rl->rl_phys = dbuf->db_data;
 1242                 rl->rl_mos = dp->dp_meta_objset;
 1243                 zfs_refcount_create(&rl->rl_longholds);
 1244                 dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL,
 1245                     &rl->rl_dbuf);
 1246                 if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) {
 1247                         kmem_free(rl, sizeof (*rl));
 1248                         rl = winner;
 1249                 }
 1250         }
 1251         *rlp = rl;
 1252         return (0);
 1253 }
 1254 
 1255 /*
 1256  * Snapshot ds is being destroyed.
 1257  *
 1258  * Adjust the "freed_before_next" of any bookmarks between this snap
 1259  * and the previous snapshot, because their "next snapshot" is changing.
 1260  *
 1261  * If there are any bookmarks with HAS_FBN at this snapshot, remove
 1262  * their HAS_SNAP flag (note: there can be at most one snapshot of
 1263  * each filesystem at a given txg), and return B_TRUE.  In this case
 1264  * the caller can not remove the key in the deadlist at this TXG, because
 1265  * the HAS_FBN bookmarks require the key be there.
 1266  *
 1267  * Returns B_FALSE if there are no bookmarks with HAS_FBN at this
 1268  * snapshot's TXG.  In this case the caller can remove the key in the
 1269  * deadlist at this TXG.
 1270  */
 1271 boolean_t
 1272 dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
 1273 {
 1274         dsl_pool_t *dp = ds->ds_dir->dd_pool;
 1275 
 1276         dsl_dataset_t *head, *next;
 1277         VERIFY0(dsl_dataset_hold_obj(dp,
 1278             dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &head));
 1279         VERIFY0(dsl_dataset_hold_obj(dp,
 1280             dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &next));
 1281 
 1282         /*
 1283          * Find the first bookmark that HAS_FBN at or after the
 1284          * previous snapshot.
 1285          */
 1286         dsl_bookmark_node_t search = { 0 };
 1287         avl_index_t idx;
 1288         search.dbn_phys.zbm_creation_txg =
 1289             dsl_dataset_phys(ds)->ds_prev_snap_txg;
 1290         search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN;
 1291         /*
 1292          * The empty-string name can't be in the AVL, and it compares
 1293          * before any entries with this TXG.
 1294          */
 1295         search.dbn_name = (char *)"";
 1296         VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL);
 1297         dsl_bookmark_node_t *dbn =
 1298             avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER);
 1299 
 1300         /*
 1301          * Iterate over all bookmarks that are at or after the previous
 1302          * snapshot, and before this (being deleted) snapshot.  Adjust
 1303          * their FBN based on their new next snapshot.
 1304          */
 1305         for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg <
 1306             dsl_dataset_phys(ds)->ds_creation_txg;
 1307             dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
 1308                 if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN))
 1309                         continue;
 1310                 /*
 1311                  * Increase our FBN by the amount of space that was live
 1312                  * (referenced) at the time of this bookmark (i.e.
 1313                  * birth <= zbm_creation_txg), and killed between this
 1314                  * (being deleted) snapshot and the next snapshot (i.e.
 1315                  * on the next snapshot's deadlist).  (Space killed before
 1316                  * this are already on our FBN.)
 1317                  */
 1318                 uint64_t referenced, compressed, uncompressed;
 1319                 dsl_deadlist_space_range(&next->ds_deadlist,
 1320                     0, dbn->dbn_phys.zbm_creation_txg,
 1321                     &referenced, &compressed, &uncompressed);
 1322                 dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=
 1323                     referenced;
 1324                 dbn->dbn_phys.zbm_compressed_freed_before_next_snap +=
 1325                     compressed;
 1326                 dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap +=
 1327                     uncompressed;
 1328                 VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
 1329                     dbn->dbn_name, sizeof (uint64_t),
 1330                     sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
 1331                     &dbn->dbn_phys, tx));
 1332         }
 1333         dsl_dataset_rele(next, FTAG);
 1334 
 1335         /*
 1336          * There may be several bookmarks at this txg (the TXG of the
 1337          * snapshot being deleted).  We need to clear the SNAPSHOT_EXISTS
 1338          * flag on all of them, and return TRUE if there is at least 1
 1339          * bookmark here with HAS_FBN (thus preventing the deadlist
 1340          * key from being removed).
 1341          */
 1342         boolean_t rv = B_FALSE;
 1343         for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg ==
 1344             dsl_dataset_phys(ds)->ds_creation_txg;
 1345             dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
 1346                 if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
 1347                         ASSERT(!(dbn->dbn_phys.zbm_flags &
 1348                             ZBM_FLAG_SNAPSHOT_EXISTS));
 1349                         continue;
 1350                 }
 1351                 ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS);
 1352                 dbn->dbn_phys.zbm_flags &= ~ZBM_FLAG_SNAPSHOT_EXISTS;
 1353                 VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
 1354                     dbn->dbn_name, sizeof (uint64_t),
 1355                     sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
 1356                     &dbn->dbn_phys, tx));
 1357                 rv = B_TRUE;
 1358         }
 1359         dsl_dataset_rele(head, FTAG);
 1360         return (rv);
 1361 }
 1362 
 1363 /*
 1364  * A snapshot is being created of this (head) dataset.
 1365  *
 1366  * We don't keep keys in the deadlist for the most recent snapshot, or any
 1367  * bookmarks at or after it, because there can't be any blocks on the
 1368  * deadlist in this range.  Now that the most recent snapshot is after
 1369  * all bookmarks, we need to add these keys.  Note that the caller always
 1370  * adds a key at the previous snapshot, so we only add keys for bookmarks
 1371  * after that.
 1372  */
 1373 void
 1374 dsl_bookmark_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 1375 {
 1376         uint64_t last_key_added = UINT64_MAX;
 1377         for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
 1378             dbn != NULL && dbn->dbn_phys.zbm_creation_txg >
 1379             dsl_dataset_phys(ds)->ds_prev_snap_txg;
 1380             dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
 1381                 uint64_t creation_txg = dbn->dbn_phys.zbm_creation_txg;
 1382                 ASSERT3U(creation_txg, <=, last_key_added);
 1383                 /*
 1384                  * Note, there may be multiple bookmarks at this TXG,
 1385                  * and we only want to add the key for this TXG once.
 1386                  * The ds_bookmarks AVL is sorted by TXG, so we will visit
 1387                  * these bookmarks in sequence.
 1388                  */
 1389                 if ((dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) &&
 1390                     creation_txg != last_key_added) {
 1391                         dsl_deadlist_add_key(&ds->ds_deadlist,
 1392                             creation_txg, tx);
 1393                         last_key_added = creation_txg;
 1394                 }
 1395         }
 1396 }
 1397 
 1398 /*
 1399  * The next snapshot of the origin dataset has changed, due to
 1400  * promote or clone swap.  If there are any bookmarks at this dataset,
 1401  * we need to update their zbm_*_freed_before_next_snap to reflect this.
 1402  * The head dataset has the relevant bookmarks in ds_bookmarks.
 1403  */
 1404 void
 1405 dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin,
 1406     dmu_tx_t *tx)
 1407 {
 1408         dsl_pool_t *dp = dmu_tx_pool(tx);
 1409 
 1410         /*
 1411          * Find the first bookmark that HAS_FBN at the origin snapshot.
 1412          */
 1413         dsl_bookmark_node_t search = { 0 };
 1414         avl_index_t idx;
 1415         search.dbn_phys.zbm_creation_txg =
 1416             dsl_dataset_phys(origin)->ds_creation_txg;
 1417         search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN;
 1418         /*
 1419          * The empty-string name can't be in the AVL, and it compares
 1420          * before any entries with this TXG.
 1421          */
 1422         search.dbn_name = (char *)"";
 1423         VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL);
 1424         dsl_bookmark_node_t *dbn =
 1425             avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER);
 1426 
 1427         /*
 1428          * Iterate over all bookmarks that are at the origin txg.
 1429          * Adjust their FBN based on their new next snapshot.
 1430          */
 1431         for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg ==
 1432             dsl_dataset_phys(origin)->ds_creation_txg &&
 1433             (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN);
 1434             dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
 1435 
 1436                 /*
 1437                  * Bookmark is at the origin, therefore its
 1438                  * "next dataset" is changing, so we need
 1439                  * to reset its FBN by recomputing it in
 1440                  * dsl_bookmark_set_phys().
 1441                  */
 1442                 ASSERT3U(dbn->dbn_phys.zbm_guid, ==,
 1443                     dsl_dataset_phys(origin)->ds_guid);
 1444                 ASSERT3U(dbn->dbn_phys.zbm_referenced_bytes_refd, ==,
 1445                     dsl_dataset_phys(origin)->ds_referenced_bytes);
 1446                 ASSERT(dbn->dbn_phys.zbm_flags &
 1447                     ZBM_FLAG_SNAPSHOT_EXISTS);
 1448                 /*
 1449                  * Save and restore the zbm_redaction_obj, which
 1450                  * is zeroed by dsl_bookmark_set_phys().
 1451                  */
 1452                 uint64_t redaction_obj =
 1453                     dbn->dbn_phys.zbm_redaction_obj;
 1454                 dsl_bookmark_set_phys(&dbn->dbn_phys, origin);
 1455                 dbn->dbn_phys.zbm_redaction_obj = redaction_obj;
 1456 
 1457                 VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
 1458                     dbn->dbn_name, sizeof (uint64_t),
 1459                     sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
 1460                     &dbn->dbn_phys, tx));
 1461         }
 1462 }
 1463 
 1464 /*
 1465  * This block is no longer referenced by this (head) dataset.
 1466  *
 1467  * Adjust the FBN of any bookmarks that reference this block, whose "next"
 1468  * is the head dataset.
 1469  */
 1470 void
 1471 dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 1472 {
 1473         (void) tx;
 1474 
 1475         /*
 1476          * Iterate over bookmarks whose "next" is the head dataset.
 1477          */
 1478         for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
 1479             dbn != NULL && dbn->dbn_phys.zbm_creation_txg >=
 1480             dsl_dataset_phys(ds)->ds_prev_snap_txg;
 1481             dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
 1482                 /*
 1483                  * If the block was live (referenced) at the time of this
 1484                  * bookmark, add its space to the bookmark's FBN.
 1485                  */
 1486                 if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg &&
 1487                     (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
 1488                         mutex_enter(&dbn->dbn_lock);
 1489                         dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=
 1490                             bp_get_dsize_sync(dsl_dataset_get_spa(ds), bp);
 1491                         dbn->dbn_phys.zbm_compressed_freed_before_next_snap +=
 1492                             BP_GET_PSIZE(bp);
 1493                         dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap +=
 1494                             BP_GET_UCSIZE(bp);
 1495                         /*
 1496                          * Changing the ZAP object here would be too
 1497                          * expensive.  Also, we may be called from the zio
 1498                          * interrupt thread, which can't block on i/o.
 1499                          * Therefore, we mark this bookmark as dirty and
 1500                          * modify the ZAP once per txg, in
 1501                          * dsl_bookmark_sync_done().
 1502                          */
 1503                         dbn->dbn_dirty = B_TRUE;
 1504                         mutex_exit(&dbn->dbn_lock);
 1505                 }
 1506         }
 1507 }
 1508 
 1509 void
 1510 dsl_bookmark_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
 1511 {
 1512         dsl_pool_t *dp = dmu_tx_pool(tx);
 1513 
 1514         if (dsl_dataset_is_snapshot(ds))
 1515                 return;
 1516 
 1517         /*
 1518          * We only dirty bookmarks that are at or after the most recent
 1519          * snapshot.  We can't create snapshots between
 1520          * dsl_bookmark_block_killed() and dsl_bookmark_sync_done(), so we
 1521          * don't need to look at any bookmarks before ds_prev_snap_txg.
 1522          */
 1523         for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
 1524             dbn != NULL && dbn->dbn_phys.zbm_creation_txg >=
 1525             dsl_dataset_phys(ds)->ds_prev_snap_txg;
 1526             dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
 1527                 if (dbn->dbn_dirty) {
 1528                         /*
 1529                          * We only dirty nodes with HAS_FBN, therefore
 1530                          * we can always use the current bookmark struct size.
 1531                          */
 1532                         ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN);
 1533                         VERIFY0(zap_update(dp->dp_meta_objset,
 1534                             ds->ds_bookmarks_obj,
 1535                             dbn->dbn_name, sizeof (uint64_t),
 1536                             sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
 1537                             &dbn->dbn_phys, tx));
 1538                         dbn->dbn_dirty = B_FALSE;
 1539                 }
 1540         }
 1541 #ifdef ZFS_DEBUG
 1542         for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks);
 1543             dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) {
 1544                 ASSERT(!dbn->dbn_dirty);
 1545         }
 1546 #endif
 1547 }
 1548 
 1549 /*
 1550  * Return the TXG of the most recent bookmark (or 0 if there are no bookmarks).
 1551  */
 1552 uint64_t
 1553 dsl_bookmark_latest_txg(dsl_dataset_t *ds)
 1554 {
 1555         ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 1556         dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
 1557         if (dbn == NULL)
 1558                 return (0);
 1559         return (dbn->dbn_phys.zbm_creation_txg);
 1560 }
 1561 
 1562 /*
 1563  * Compare the redact_block_phys_t to the bookmark. If the last block in the
 1564  * redact_block_phys_t is before the bookmark, return -1.  If the first block in
 1565  * the redact_block_phys_t is after the bookmark, return 1.  Otherwise, the
 1566  * bookmark is inside the range of the redact_block_phys_t, and we return 0.
 1567  */
 1568 static int
 1569 redact_block_zb_compare(redact_block_phys_t *first,
 1570     zbookmark_phys_t *second)
 1571 {
 1572         /*
 1573          * If the block_phys is for a previous object, or the last block in the
 1574          * block_phys is strictly before the block in the bookmark, the
 1575          * block_phys is earlier.
 1576          */
 1577         if (first->rbp_object < second->zb_object ||
 1578             (first->rbp_object == second->zb_object &&
 1579             first->rbp_blkid + (redact_block_get_count(first) - 1) <
 1580             second->zb_blkid)) {
 1581                 return (-1);
 1582         }
 1583 
 1584         /*
 1585          * If the bookmark is for a previous object, or the block in the
 1586          * bookmark is strictly before the first block in the block_phys, the
 1587          * bookmark is earlier.
 1588          */
 1589         if (first->rbp_object > second->zb_object ||
 1590             (first->rbp_object == second->zb_object &&
 1591             first->rbp_blkid > second->zb_blkid)) {
 1592                 return (1);
 1593         }
 1594 
 1595         return (0);
 1596 }
 1597 
 1598 /*
 1599  * Traverse the redaction list in the provided object, and call the callback for
 1600  * each entry we find. Don't call the callback for any records before resume.
 1601  */
 1602 int
 1603 dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
 1604     rl_traverse_callback_t cb, void *arg)
 1605 {
 1606         objset_t *mos = rl->rl_mos;
 1607         int err = 0;
 1608 
 1609         if (rl->rl_phys->rlp_last_object != UINT64_MAX ||
 1610             rl->rl_phys->rlp_last_blkid != UINT64_MAX) {
 1611                 /*
 1612                  * When we finish a send, we update the last object and offset
 1613                  * to UINT64_MAX.  If a send fails partway through, the last
 1614                  * object and offset will have some other value, indicating how
 1615                  * far the send got. The redaction list must be complete before
 1616                  * it can be traversed, so return EINVAL if the last object and
 1617                  * blkid are not set to UINT64_MAX.
 1618                  */
 1619                 return (SET_ERROR(EINVAL));
 1620         }
 1621 
 1622         /*
 1623          * This allows us to skip the binary search and resume checking logic
 1624          * below, if we're not resuming a redacted send.
 1625          */
 1626         if (ZB_IS_ZERO(resume))
 1627                 resume = NULL;
 1628 
 1629         /*
 1630          * Binary search for the point to resume from.
 1631          */
 1632         uint64_t maxidx = rl->rl_phys->rlp_num_entries - 1;
 1633         uint64_t minidx = 0;
 1634         while (resume != NULL && maxidx > minidx) {
 1635                 redact_block_phys_t rbp = { 0 };
 1636                 ASSERT3U(maxidx, >, minidx);
 1637                 uint64_t mididx = minidx + ((maxidx - minidx) / 2);
 1638                 err = dmu_read(mos, rl->rl_object, mididx * sizeof (rbp),
 1639                     sizeof (rbp), &rbp, DMU_READ_NO_PREFETCH);
 1640                 if (err != 0)
 1641                         break;
 1642 
 1643                 int cmp = redact_block_zb_compare(&rbp, resume);
 1644 
 1645                 if (cmp == 0) {
 1646                         minidx = mididx;
 1647                         break;
 1648                 } else if (cmp > 0) {
 1649                         maxidx =
 1650                             (mididx == minidx ? minidx : mididx - 1);
 1651                 } else {
 1652                         minidx = mididx + 1;
 1653                 }
 1654         }
 1655 
 1656         unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE;
 1657         redact_block_phys_t *buf = zio_data_buf_alloc(bufsize);
 1658 
 1659         unsigned int entries_per_buf = bufsize / sizeof (redact_block_phys_t);
 1660         uint64_t start_block = minidx / entries_per_buf;
 1661         err = dmu_read(mos, rl->rl_object, start_block * bufsize, bufsize, buf,
 1662             DMU_READ_PREFETCH);
 1663 
 1664         for (uint64_t curidx = minidx;
 1665             err == 0 && curidx < rl->rl_phys->rlp_num_entries;
 1666             curidx++) {
 1667                 /*
 1668                  * We read in the redaction list one block at a time.  Once we
 1669                  * finish with all the entries in a given block, we read in a
 1670                  * new one.  The predictive prefetcher will take care of any
 1671                  * prefetching, and this code shouldn't be the bottleneck, so we
 1672                  * don't need to do manual prefetching.
 1673                  */
 1674                 if (curidx % entries_per_buf == 0) {
 1675                         err = dmu_read(mos, rl->rl_object, curidx *
 1676                             sizeof (*buf), bufsize, buf,
 1677                             DMU_READ_PREFETCH);
 1678                         if (err != 0)
 1679                                 break;
 1680                 }
 1681                 redact_block_phys_t *rb = &buf[curidx % entries_per_buf];
 1682                 /*
 1683                  * If resume is non-null, we should either not send the data, or
 1684                  * null out resume so we don't have to keep doing these
 1685                  * comparisons.
 1686                  */
 1687                 if (resume != NULL) {
 1688                         /*
 1689                          * It is possible that after the binary search we got
 1690                          * a record before the resume point. There's two cases
 1691                          * where this can occur. If the record is the last
 1692                          * redaction record, and the resume point is after the
 1693                          * end of the redacted data, curidx will be the last
 1694                          * redaction record. In that case, the loop will end
 1695                          * after this iteration. The second case is if the
 1696                          * resume point is between two redaction records, the
 1697                          * binary search can return either the record before
 1698                          * or after the resume point. In that case, the next
 1699                          * iteration will be greater than the resume point.
 1700                          */
 1701                         if (redact_block_zb_compare(rb, resume) < 0) {
 1702                                 ASSERT3U(curidx, ==, minidx);
 1703                                 continue;
 1704                         } else {
 1705                                 /*
 1706                                  * If the place to resume is in the middle of
 1707                                  * the range described by this
 1708                                  * redact_block_phys, then modify the
 1709                                  * redact_block_phys in memory so we generate
 1710                                  * the right records.
 1711                                  */
 1712                                 if (resume->zb_object == rb->rbp_object &&
 1713                                     resume->zb_blkid > rb->rbp_blkid) {
 1714                                         uint64_t diff = resume->zb_blkid -
 1715                                             rb->rbp_blkid;
 1716                                         rb->rbp_blkid = resume->zb_blkid;
 1717                                         redact_block_set_count(rb,
 1718                                             redact_block_get_count(rb) - diff);
 1719                                 }
 1720                                 resume = NULL;
 1721                         }
 1722                 }
 1723 
 1724                 if (cb(rb, arg) != 0) {
 1725                         err = EINTR;
 1726                         break;
 1727                 }
 1728         }
 1729 
 1730         zio_data_buf_free(buf, bufsize);
 1731         return (err);
 1732 }

Cache object: e2ca347a763828d4c846928e9e2c3698


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.