| 
     1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
   23  * Copyright (c) 2013, 2014, Delphix. All rights reserved.
   24  * Copyright (c) 2019 Datto Inc.
   25  * Copyright (c) 2021, 2022, George Amanakis. All rights reserved.
   26  */
   27 
   28 /*
   29  * Routines to manage the on-disk persistent error log.
   30  *
   31  * Each pool stores a log of all logical data errors seen during normal
   32  * operation.  This is actually the union of two distinct logs: the last log,
   33  * and the current log.  All errors seen are logged to the current log.  When a
   34  * scrub completes, the current log becomes the last log, the last log is thrown
   35  * out, and the current log is reinitialized.  This way, if an error is somehow
   36  * corrected, a new scrub will show that it no longer exists, and will be
   37  * deleted from the log when the scrub completes.
   38  *
   39  * The log is stored using a ZAP object whose key is a string form of the
   40  * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
   41  * optional 'objset:object' human-readable string describing the data.  When an
   42  * error is first logged, this string will be empty, indicating that no name is
   43  * known.  This prevents us from having to issue a potentially large amount of
   44  * I/O to discover the object name during an error path.  Instead, we do the
   45  * calculation when the data is requested, storing the result so future queries
   46  * will be faster.
   47  *
   48  * If the head_errlog feature is enabled, a different on-disk format is used.
   49  * The error log of each head dataset is stored separately in the zap object
   50  * and keyed by the head id. This enables listing every dataset affected in
   51  * userland. In order to be able to track whether an error block has been
   52  * modified or added to snapshots since it was marked as an error, a new tuple
   53  * is introduced: zbookmark_err_phys_t. It allows the storage of the birth
   54  * transaction group of an error block on-disk. The birth transaction group is
   55  * used by check_filesystem() to assess whether this block was freed,
   56  * re-written or added to a snapshot since its marking as an error.
   57  *
   58  * This log is then shipped into an nvlist where the key is the dataset name and
   59  * the value is the object name.  Userland is then responsible for uniquifying
   60  * this list and displaying it to the user.
   61  */
   62 
   63 #include <sys/dmu_tx.h>
   64 #include <sys/spa.h>
   65 #include <sys/spa_impl.h>
   66 #include <sys/zap.h>
   67 #include <sys/zio.h>
   68 #include <sys/dsl_dir.h>
   69 #include <sys/dmu_objset.h>
   70 #include <sys/dbuf.h>
   71 #include <sys/zfs_znode.h>
   72 
/*
 * Size in bytes of the stack buffers this file uses to hold the string form
 * of a bookmark (see bookmark_to_name()), which serves as a ZAP key.
 * NOTE(review): four full-width 64-bit hex fields plus three ':' separators
 * and the terminating NUL would need 68 bytes; presumably at least one field
 * (e.g. the level) is always small enough for 64 to suffice -- confirm.
 */
#define NAME_MAX_LEN 64

/*
 * spa_upgrade_errlog_limit : A zfs module parameter that controls the number
 *              of on-disk error log entries that will be converted to the new
 *              format when enabling head_errlog. Defaults to 0 which converts
 *              all log entries.
 */
static uint_t spa_upgrade_errlog_limit = 0;
   82 
   83 /*
   84  * Convert a bookmark to a string.
   85  */
   86 static void
   87 bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
   88 {
   89         (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
   90             (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
   91             (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
   92 }
   93 
   94 /*
   95  * Convert an err_phys to a string.
   96  */
   97 static void
   98 errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len)
   99 {
  100         (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
  101             (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level,
  102             (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth);
  103 }
  104 
  105 /*
  106  * Convert a string to a err_phys.
  107  */
  108 static void
  109 name_to_errphys(char *buf, zbookmark_err_phys_t *zep)
  110 {
  111         zep->zb_object = zfs_strtonum(buf, &buf);
  112         ASSERT(*buf == ':');
  113         zep->zb_level = (int)zfs_strtonum(buf + 1, &buf);
  114         ASSERT(*buf == ':');
  115         zep->zb_blkid = zfs_strtonum(buf + 1, &buf);
  116         ASSERT(*buf == ':');
  117         zep->zb_birth = zfs_strtonum(buf + 1, &buf);
  118         ASSERT(*buf == '\0');
  119 }
  120 
  121 /*
  122  * Convert a string to a bookmark.
  123  */
  124 static void
  125 name_to_bookmark(char *buf, zbookmark_phys_t *zb)
  126 {
  127         zb->zb_objset = zfs_strtonum(buf, &buf);
  128         ASSERT(*buf == ':');
  129         zb->zb_object = zfs_strtonum(buf + 1, &buf);
  130         ASSERT(*buf == ':');
  131         zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
  132         ASSERT(*buf == ':');
  133         zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
  134         ASSERT(*buf == '\0');
  135 }
  136 
  137 #ifdef _KERNEL
  138 static void
  139 zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
  140 {
  141         zb->zb_objset = dataset;
  142         zb->zb_object = zep->zb_object;
  143         zb->zb_level = zep->zb_level;
  144         zb->zb_blkid = zep->zb_blkid;
  145 }
  146 #endif
  147 
  148 static void
  149 name_to_object(char *buf, uint64_t *obj)
  150 {
  151         *obj = zfs_strtonum(buf, &buf);
  152         ASSERT(*buf == '\0');
  153 }
  154 
  155 static int
  156 get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
  157     uint64_t *head_dataset_id)
  158 {
  159         dsl_pool_t *dp = spa->spa_dsl_pool;
  160         dsl_dataset_t *ds;
  161         objset_t *os;
  162 
  163         int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds);
  164         if (error != 0) {
  165                 return (error);
  166         }
  167         ASSERT(head_dataset_id);
  168         *head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
  169 
  170         error = dmu_objset_from_ds(ds, &os);
  171         if (error != 0) {
  172                 dsl_dataset_rele(ds, FTAG);
  173                 return (error);
  174         }
  175 
  176         /*
  177          * If the key is not loaded dbuf_dnode_findbp() will error out with
  178          * EACCES. However in that case dnode_hold() will eventually call
  179          * dbuf_read()->zio_wait() which may call spa_log_error(). This will
  180          * lead to a deadlock due to us holding the mutex spa_errlist_lock.
  181          * Avoid this by checking here if the keys are loaded, if not return.
  182          * If the keys are not loaded the head_errlog feature is meaningless
  183          * as we cannot figure out the birth txg of the block pointer.
  184          */
  185         if (dsl_dataset_get_keystatus(ds->ds_dir) ==
  186             ZFS_KEYSTATUS_UNAVAILABLE) {
  187                 zep->zb_birth = 0;
  188                 dsl_dataset_rele(ds, FTAG);
  189                 return (0);
  190         }
  191 
  192         dnode_t *dn;
  193         blkptr_t bp;
  194 
  195         error = dnode_hold(os, zep->zb_object, FTAG, &dn);
  196         if (error != 0) {
  197                 dsl_dataset_rele(ds, FTAG);
  198                 return (error);
  199         }
  200 
  201         rw_enter(&dn->dn_struct_rwlock, RW_READER);
  202         error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
  203             NULL);
  204         if (error == 0 && BP_IS_HOLE(&bp))
  205                 error = SET_ERROR(ENOENT);
  206 
  207         /*
  208          * If the key is loaded but the encrypted filesystem is unmounted when
  209          * a scrub is run, then dbuf_dnode_findbp() will still error out with
  210          * EACCES (possibly due to the key mapping being removed upon
  211          * unmounting). In that case the head_errlog feature is also
  212          * meaningless as we cannot figure out the birth txg of the block
  213          * pointer.
  214          */
  215         if (error == EACCES)
  216                 error = 0;
  217         else if (!error)
  218                 zep->zb_birth = bp.blk_birth;
  219 
  220         rw_exit(&dn->dn_struct_rwlock);
  221         dnode_rele(dn, FTAG);
  222         dsl_dataset_rele(ds, FTAG);
  223         return (error);
  224 }
  225 
  226 /*
  227  * Log an uncorrectable error to the persistent error log.  We add it to the
  228  * spa's list of pending errors.  The changes are actually synced out to disk
  229  * during spa_errlog_sync().
  230  */
  231 void
  232 spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
  233 {
  234         spa_error_entry_t search;
  235         spa_error_entry_t *new;
  236         avl_tree_t *tree;
  237         avl_index_t where;
  238 
  239         /*
  240          * If we are trying to import a pool, ignore any errors, as we won't be
  241          * writing to the pool any time soon.
  242          */
  243         if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
  244                 return;
  245 
  246         mutex_enter(&spa->spa_errlist_lock);
  247 
  248         /*
  249          * If we have had a request to rotate the log, log it to the next list
  250          * instead of the current one.
  251          */
  252         if (spa->spa_scrub_active || spa->spa_scrub_finished)
  253                 tree = &spa->spa_errlist_scrub;
  254         else
  255                 tree = &spa->spa_errlist_last;
  256 
  257         search.se_bookmark = *zb;
  258         if (avl_find(tree, &search, &where) != NULL) {
  259                 mutex_exit(&spa->spa_errlist_lock);
  260                 return;
  261         }
  262 
  263         new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
  264         new->se_bookmark = *zb;
  265         avl_insert(tree, new, where);
  266 
  267         mutex_exit(&spa->spa_errlist_lock);
  268 }
  269 
  270 #ifdef _KERNEL
  271 static int
  272 find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
  273     uint64_t *birth_txg)
  274 {
  275         objset_t *os;
  276         int error = dmu_objset_from_ds(ds, &os);
  277         if (error != 0)
  278                 return (error);
  279 
  280         dnode_t *dn;
  281         blkptr_t bp;
  282 
  283         error = dnode_hold(os, zep->zb_object, FTAG, &dn);
  284         if (error != 0)
  285                 return (error);
  286 
  287         rw_enter(&dn->dn_struct_rwlock, RW_READER);
  288         error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
  289             NULL);
  290         if (error == 0 && BP_IS_HOLE(&bp))
  291                 error = SET_ERROR(ENOENT);
  292 
  293         *birth_txg = bp.blk_birth;
  294         rw_exit(&dn->dn_struct_rwlock);
  295         dnode_rele(dn, FTAG);
  296         return (error);
  297 }
  298 
  299 /*
  300  * Copy the bookmark to the end of the user-space buffer which starts at
  301  * uaddr and has *count unused entries, and decrement *count by 1.
  302  */
  303 static int
  304 copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count)
  305 {
  306         if (*count == 0)
  307                 return (SET_ERROR(ENOMEM));
  308 
  309         *count -= 1;
  310         if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t),
  311             sizeof (zbookmark_phys_t)) != 0)
  312                 return (SET_ERROR(EFAULT));
  313         return (0);
  314 }
  315 
  316 /*
  317  * Each time the error block is referenced by a snapshot or clone, add a
  318  * zbookmark_phys_t entry to the userspace array at uaddr. The array is
  319  * filled from the back and the in-out parameter *count is modified to be the
  320  * number of unused entries at the beginning of the array.
  321  */
  322 static int
  323 check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
  324     void *uaddr, uint64_t *count)
  325 {
  326         dsl_dataset_t *ds;
  327         dsl_pool_t *dp = spa->spa_dsl_pool;
  328 
  329         int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds);
  330         if (error != 0)
  331                 return (error);
  332 
  333         uint64_t latest_txg;
  334         uint64_t txg_to_consider = spa->spa_syncing_txg;
  335         boolean_t check_snapshot = B_TRUE;
  336         error = find_birth_txg(ds, zep, &latest_txg);
  337 
  338         /*
  339          * If we cannot figure out the current birth txg of the block pointer
  340          * error out. If the filesystem is encrypted and the key is not loaded
  341          * or the encrypted filesystem is not mounted the error will be EACCES.
  342          * In that case do not return an error.
  343          */
  344         if (error == EACCES) {
  345                 dsl_dataset_rele(ds, FTAG);
  346                 return (0);
  347         }
  348         if (error) {
  349                 dsl_dataset_rele(ds, FTAG);
  350                 return (error);
  351         }
  352         if (zep->zb_birth == latest_txg) {
  353                 /* Block neither free nor rewritten. */
  354                 zbookmark_phys_t zb;
  355                 zep_to_zb(head_ds, zep, &zb);
  356                 error = copyout_entry(&zb, uaddr, count);
  357                 if (error != 0) {
  358                         dsl_dataset_rele(ds, FTAG);
  359                         return (error);
  360                 }
  361                 check_snapshot = B_FALSE;
  362         } else {
  363                 ASSERT3U(zep->zb_birth, <, latest_txg);
  364                 txg_to_consider = latest_txg;
  365         }
  366 
  367         /* How many snapshots reference this block. */
  368         uint64_t snap_count;
  369         error = zap_count(spa->spa_meta_objset,
  370             dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
  371         if (error != 0) {
  372                 dsl_dataset_rele(ds, FTAG);
  373                 return (error);
  374         }
  375 
  376         if (snap_count == 0) {
  377                 /* File system has no snapshot. */
  378                 dsl_dataset_rele(ds, FTAG);
  379                 return (0);
  380         }
  381 
  382         uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t),
  383             KM_SLEEP);
  384 
  385         int aff_snap_count = 0;
  386         uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
  387         uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
  388 
  389         /* Check only snapshots created from this file system. */
  390         while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
  391             snap_obj_txg <= txg_to_consider) {
  392 
  393                 dsl_dataset_rele(ds, FTAG);
  394                 error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
  395                 if (error != 0)
  396                         goto out;
  397 
  398                 if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds)
  399                         break;
  400 
  401                 boolean_t affected = B_TRUE;
  402                 if (check_snapshot) {
  403                         uint64_t blk_txg;
  404                         error = find_birth_txg(ds, zep, &blk_txg);
  405                         affected = (error == 0 && zep->zb_birth == blk_txg);
  406                 }
  407 
  408                 if (affected) {
  409                         snap_obj_array[aff_snap_count] = snap_obj;
  410                         aff_snap_count++;
  411 
  412                         zbookmark_phys_t zb;
  413                         zep_to_zb(snap_obj, zep, &zb);
  414                         error = copyout_entry(&zb, uaddr, count);
  415                         if (error != 0) {
  416                                 dsl_dataset_rele(ds, FTAG);
  417                                 goto out;
  418                         }
  419 
  420                         /*
  421                          * Only clones whose origins were affected could also
  422                          * have affected snapshots.
  423                          */
  424                         zap_cursor_t zc;
  425                         zap_attribute_t za;
  426                         for (zap_cursor_init(&zc, spa->spa_meta_objset,
  427                             dsl_dataset_phys(ds)->ds_next_clones_obj);
  428                             zap_cursor_retrieve(&zc, &za) == 0;
  429                             zap_cursor_advance(&zc)) {
  430                                 error = check_filesystem(spa,
  431                                     za.za_first_integer, zep, uaddr, count);
  432 
  433                                 if (error != 0) {
  434                                         zap_cursor_fini(&zc);
  435                                         goto out;
  436                                 }
  437                         }
  438                         zap_cursor_fini(&zc);
  439                 }
  440                 snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
  441                 snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
  442         }
  443         dsl_dataset_rele(ds, FTAG);
  444 
  445 out:
  446         kmem_free(snap_obj_array, sizeof (*snap_obj_array));
  447         return (error);
  448 }
  449 
  450 static int
  451 find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
  452     uint64_t *top_affected_fs)
  453 {
  454         uint64_t oldest_dsobj;
  455         int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
  456             &oldest_dsobj);
  457         if (error != 0)
  458                 return (error);
  459 
  460         dsl_dataset_t *ds;
  461         error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj,
  462             FTAG, &ds);
  463         if (error != 0)
  464                 return (error);
  465 
  466         *top_affected_fs =
  467             dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
  468         dsl_dataset_rele(ds, FTAG);
  469         return (0);
  470 }
  471 
  472 static int
  473 process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
  474     void *uaddr, uint64_t *count)
  475 {
  476         /*
  477          * If the zb_birth is 0 it means we failed to retrieve the birth txg
  478          * of the block pointer. This happens when an encrypted filesystem is
  479          * not mounted or when the key is not loaded. Do not proceed to
  480          * check_filesystem(), instead do the accounting here.
  481          */
  482         if (zep->zb_birth == 0) {
  483                 zbookmark_phys_t zb;
  484                 zep_to_zb(head_ds, zep, &zb);
  485                 int error = copyout_entry(&zb, uaddr, count);
  486                 if (error != 0) {
  487                         return (error);
  488                 }
  489                 return (0);
  490         }
  491 
  492         uint64_t top_affected_fs;
  493         int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
  494         if (error == 0) {
  495                 error = check_filesystem(spa, top_affected_fs, zep,
  496                     uaddr, count);
  497         }
  498 
  499         return (error);
  500 }
  501 #endif
  502 
  503 /*
  504  * If a healed bookmark matches an entry in the error log we stash it in a tree
  505  * so that we can later remove the related log entries in sync context.
  506  */
  507 static void
  508 spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb)
  509 {
  510         char name[NAME_MAX_LEN];
  511 
  512         if (obj == 0)
  513                 return;
  514 
  515         bookmark_to_name(healed_zb, name, sizeof (name));
  516         mutex_enter(&spa->spa_errlog_lock);
  517         if (zap_contains(spa->spa_meta_objset, obj, name) == 0) {
  518                 /*
  519                  * Found an error matching healed zb, add zb to our
  520                  * tree of healed errors
  521                  */
  522                 avl_tree_t *tree = &spa->spa_errlist_healed;
  523                 spa_error_entry_t search;
  524                 spa_error_entry_t *new;
  525                 avl_index_t where;
  526                 search.se_bookmark = *healed_zb;
  527                 mutex_enter(&spa->spa_errlist_lock);
  528                 if (avl_find(tree, &search, &where) != NULL) {
  529                         mutex_exit(&spa->spa_errlist_lock);
  530                         mutex_exit(&spa->spa_errlog_lock);
  531                         return;
  532                 }
  533                 new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
  534                 new->se_bookmark = *healed_zb;
  535                 avl_insert(tree, new, where);
  536                 mutex_exit(&spa->spa_errlist_lock);
  537         }
  538         mutex_exit(&spa->spa_errlog_lock);
  539 }
  540 
  541 /*
  542  * If this error exists in the given tree remove it.
  543  */
  544 static void
  545 remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb)
  546 {
  547         spa_error_entry_t search, *found;
  548         avl_index_t where;
  549 
  550         mutex_enter(&spa->spa_errlist_lock);
  551         search.se_bookmark = *zb;
  552         if ((found = avl_find(t, &search, &where)) != NULL) {
  553                 avl_remove(t, found);
  554                 kmem_free(found, sizeof (spa_error_entry_t));
  555         }
  556         mutex_exit(&spa->spa_errlist_lock);
  557 }
  558 
  559 
  560 /*
  561  * Removes all of the recv healed errors from both on-disk error logs
  562  */
  563 static void
  564 spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx)
  565 {
  566         char name[NAME_MAX_LEN];
  567         spa_error_entry_t *se;
  568         void *cookie = NULL;
  569 
  570         ASSERT(MUTEX_HELD(&spa->spa_errlog_lock));
  571 
  572         while ((se = avl_destroy_nodes(&spa->spa_errlist_healed,
  573             &cookie)) != NULL) {
  574                 remove_error_from_list(spa, s, &se->se_bookmark);
  575                 remove_error_from_list(spa, l, &se->se_bookmark);
  576                 bookmark_to_name(&se->se_bookmark, name, sizeof (name));
  577                 kmem_free(se, sizeof (spa_error_entry_t));
  578                 (void) zap_remove(spa->spa_meta_objset,
  579                     spa->spa_errlog_last, name, tx);
  580                 (void) zap_remove(spa->spa_meta_objset,
  581                     spa->spa_errlog_scrub, name, tx);
  582         }
  583 }
  584 
  585 /*
  586  * Stash away healed bookmarks to remove them from the on-disk error logs
  587  * later in spa_remove_healed_errors().
  588  */
  589 void
  590 spa_remove_error(spa_t *spa, zbookmark_phys_t *zb)
  591 {
  592         char name[NAME_MAX_LEN];
  593 
  594         bookmark_to_name(zb, name, sizeof (name));
  595 
  596         spa_add_healed_error(spa, spa->spa_errlog_last, zb);
  597         spa_add_healed_error(spa, spa->spa_errlog_scrub, zb);
  598 }
  599 
  600 static uint64_t
  601 approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj)
  602 {
  603         if (spa_err_obj == 0)
  604                 return (0);
  605         uint64_t total = 0;
  606 
  607         zap_cursor_t zc;
  608         zap_attribute_t za;
  609         for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
  610             zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
  611                 uint64_t count;
  612                 if (zap_count(spa->spa_meta_objset, za.za_first_integer,
  613                     &count) == 0)
  614                         total += count;
  615         }
  616         zap_cursor_fini(&zc);
  617         return (total);
  618 }
  619 
  620 /*
  621  * Return the approximate number of errors currently in the error log.  This
  622  * will be nonzero if there are some errors, but otherwise it may be more
  623  * or less than the number of entries returned by spa_get_errlog().
  624  */
  625 uint64_t
  626 spa_approx_errlog_size(spa_t *spa)
  627 {
  628         uint64_t total = 0;
  629 
  630         if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
  631                 mutex_enter(&spa->spa_errlog_lock);
  632                 uint64_t count;
  633                 if (spa->spa_errlog_scrub != 0 &&
  634                     zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
  635                     &count) == 0)
  636                         total += count;
  637 
  638                 if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
  639                     zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
  640                     &count) == 0)
  641                         total += count;
  642                 mutex_exit(&spa->spa_errlog_lock);
  643 
  644         } else {
  645                 mutex_enter(&spa->spa_errlog_lock);
  646                 total += approx_errlog_size_impl(spa, spa->spa_errlog_last);
  647                 total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub);
  648                 mutex_exit(&spa->spa_errlog_lock);
  649         }
  650         mutex_enter(&spa->spa_errlist_lock);
  651         total += avl_numnodes(&spa->spa_errlist_last);
  652         total += avl_numnodes(&spa->spa_errlist_scrub);
  653         mutex_exit(&spa->spa_errlist_lock);
  654         return (total);
  655 }
  656 
  657 /*
  658  * This function sweeps through an on-disk error log and stores all bookmarks
  659  * as error bookmarks in a new ZAP object. At the end we discard the old one,
  660  * and spa_update_errlog() will set the spa's on-disk error log to new ZAP
  661  * object.
  662  */
  663 static void
  664 sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
  665     dmu_tx_t *tx)
  666 {
  667         zap_cursor_t zc;
  668         zap_attribute_t za;
  669         zbookmark_phys_t zb;
  670         uint64_t count;
  671 
  672         *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
  673             DMU_OT_NONE, 0, tx);
  674 
  675         /*
  676          * If we cannnot perform the upgrade we should clear the old on-disk
  677          * error logs.
  678          */
  679         if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) {
  680                 VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
  681                 return;
  682         }
  683 
  684         for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
  685             zap_cursor_retrieve(&zc, &za) == 0;
  686             zap_cursor_advance(&zc)) {
  687                 if (spa_upgrade_errlog_limit != 0 &&
  688                     zc.zc_cd == spa_upgrade_errlog_limit)
  689                         break;
  690 
  691                 name_to_bookmark(za.za_name, &zb);
  692 
  693                 zbookmark_err_phys_t zep;
  694                 zep.zb_object = zb.zb_object;
  695                 zep.zb_level = zb.zb_level;
  696                 zep.zb_blkid = zb.zb_blkid;
  697                 zep.zb_birth = 0;
  698 
  699                 /*
  700                  * We cannot use get_head_and_birth_txg() because it will
  701                  * acquire the pool config lock, which we already have. In case
  702                  * of an error we simply continue.
  703                  */
  704                 uint64_t head_dataset_obj;
  705                 dsl_pool_t *dp = spa->spa_dsl_pool;
  706                 dsl_dataset_t *ds;
  707                 objset_t *os;
  708 
  709                 int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds);
  710                 if (error != 0)
  711                         continue;
  712 
  713                 head_dataset_obj =
  714                     dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
  715 
  716                 /*
  717                  * The objset and the dnode are required for getting the block
  718                  * pointer, which is used to determine if BP_IS_HOLE(). If
  719                  * getting the objset or the dnode fails, do not create a
  720                  * zap entry (presuming we know the dataset) as this may create
  721                  * spurious errors that we cannot ever resolve. If an error is
  722                  * truly persistent, it should re-appear after a scan.
  723                  */
  724                 if (dmu_objset_from_ds(ds, &os) != 0) {
  725                         dsl_dataset_rele(ds, FTAG);
  726                         continue;
  727                 }
  728 
  729                 dnode_t *dn;
  730                 blkptr_t bp;
  731 
  732                 if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) {
  733                         dsl_dataset_rele(ds, FTAG);
  734                         continue;
  735                 }
  736 
  737                 rw_enter(&dn->dn_struct_rwlock, RW_READER);
  738                 error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp,
  739                     NULL, NULL);
  740                 if (error == EACCES)
  741                         error = 0;
  742                 else if (!error)
  743                         zep.zb_birth = bp.blk_birth;
  744 
  745                 rw_exit(&dn->dn_struct_rwlock);
  746                 dnode_rele(dn, FTAG);
  747                 dsl_dataset_rele(ds, FTAG);
  748 
  749                 if (error != 0 || BP_IS_HOLE(&bp))
  750                         continue;
  751 
  752                 uint64_t err_obj;
  753                 error = zap_lookup_int_key(spa->spa_meta_objset, *newobj,
  754                     head_dataset_obj, &err_obj);
  755 
  756                 if (error == ENOENT) {
  757                         err_obj = zap_create(spa->spa_meta_objset,
  758                             DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
  759 
  760                         (void) zap_update_int_key(spa->spa_meta_objset,
  761                             *newobj, head_dataset_obj, err_obj, tx);
  762                 }
  763 
  764                 char buf[64];
  765                 errphys_to_name(&zep, buf, sizeof (buf));
  766 
  767                 const char *name = "";
  768                 (void) zap_update(spa->spa_meta_objset, err_obj,
  769                     buf, 1, strlen(name) + 1, name, tx);
  770         }
  771         zap_cursor_fini(&zc);
  772 
  773         VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
  774 }
  775 
  776 void
  777 spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx)
  778 {
  779         uint64_t newobj = 0;
  780 
  781         mutex_enter(&spa->spa_errlog_lock);
  782         if (spa->spa_errlog_last != 0) {
  783                 sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx);
  784                 spa->spa_errlog_last = newobj;
  785         }
  786 
  787         if (spa->spa_errlog_scrub != 0) {
  788                 sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx);
  789                 spa->spa_errlog_scrub = newobj;
  790         }
  791         mutex_exit(&spa->spa_errlog_lock);
  792 }
  793 
  794 #ifdef _KERNEL
  795 /*
  796  * If an error block is shared by two datasets it will be counted twice.
  797  */
  798 static int
  799 process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
  800 {
  801         zap_cursor_t zc;
  802         zap_attribute_t za;
  803 
  804         if (obj == 0)
  805                 return (0);
  806 
  807         if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
  808                 for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
  809                     zap_cursor_retrieve(&zc, &za) == 0;
  810                     zap_cursor_advance(&zc)) {
  811                         if (*count == 0) {
  812                                 zap_cursor_fini(&zc);
  813                                 return (SET_ERROR(ENOMEM));
  814                         }
  815 
  816                         zbookmark_phys_t zb;
  817                         name_to_bookmark(za.za_name, &zb);
  818 
  819                         int error = copyout_entry(&zb, uaddr, count);
  820                         if (error != 0) {
  821                                 zap_cursor_fini(&zc);
  822                                 return (error);
  823                         }
  824                 }
  825                 zap_cursor_fini(&zc);
  826                 return (0);
  827         }
  828 
  829         for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
  830             zap_cursor_retrieve(&zc, &za) == 0;
  831             zap_cursor_advance(&zc)) {
  832 
  833                 zap_cursor_t head_ds_cursor;
  834                 zap_attribute_t head_ds_attr;
  835 
  836                 uint64_t head_ds_err_obj = za.za_first_integer;
  837                 uint64_t head_ds;
  838                 name_to_object(za.za_name, &head_ds);
  839                 for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset,
  840                     head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor,
  841                     &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) {
  842 
  843                         zbookmark_err_phys_t head_ds_block;
  844                         name_to_errphys(head_ds_attr.za_name, &head_ds_block);
  845                         int error = process_error_block(spa, head_ds,
  846                             &head_ds_block, uaddr, count);
  847 
  848                         if (error != 0) {
  849                                 zap_cursor_fini(&head_ds_cursor);
  850                                 zap_cursor_fini(&zc);
  851                                 return (error);
  852                         }
  853                 }
  854                 zap_cursor_fini(&head_ds_cursor);
  855         }
  856         zap_cursor_fini(&zc);
  857         return (0);
  858 }
  859 
  860 static int
  861 process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
  862 {
  863         spa_error_entry_t *se;
  864 
  865         if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
  866                 for (se = avl_first(list); se != NULL;
  867                     se = AVL_NEXT(list, se)) {
  868                         int error =
  869                             copyout_entry(&se->se_bookmark, uaddr, count);
  870                         if (error != 0) {
  871                                 return (error);
  872                         }
  873                 }
  874                 return (0);
  875         }
  876 
  877         for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
  878                 zbookmark_err_phys_t zep;
  879                 zep.zb_object = se->se_bookmark.zb_object;
  880                 zep.zb_level = se->se_bookmark.zb_level;
  881                 zep.zb_blkid = se->se_bookmark.zb_blkid;
  882                 zep.zb_birth = 0;
  883 
  884                 uint64_t head_ds_obj;
  885                 int error = get_head_and_birth_txg(spa, &zep,
  886                     se->se_bookmark.zb_objset, &head_ds_obj);
  887 
  888                 if (!error)
  889                         error = process_error_block(spa, head_ds_obj, &zep,
  890                             uaddr, count);
  891                 if (error)
  892                         return (error);
  893         }
  894         return (0);
  895 }
  896 #endif
  897 
  898 /*
  899  * Copy all known errors to userland as an array of bookmarks.  This is
  900  * actually a union of the on-disk last log and current log, as well as any
  901  * pending error requests.
  902  *
  903  * Because the act of reading the on-disk log could cause errors to be
  904  * generated, we have two separate locks: one for the error log and one for the
  905  * in-core error lists.  We only need the error list lock to log and error, so
  906  * we grab the error log lock while we read the on-disk logs, and only pick up
  907  * the error list lock when we are finished.
  908  */
  909 int
  910 spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
  911 {
  912         int ret = 0;
  913 
  914 #ifdef _KERNEL
  915         /*
  916          * The pool config lock is needed to hold a dataset_t via (among other
  917          * places) process_error_list() -> get_head_and_birth_txg(), and lock
  918          * ordering requires that we get it before the spa_errlog_lock.
  919          */
  920         dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
  921         mutex_enter(&spa->spa_errlog_lock);
  922 
  923         ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
  924 
  925         if (!ret && !spa->spa_scrub_finished)
  926                 ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
  927                     count);
  928 
  929         mutex_enter(&spa->spa_errlist_lock);
  930         if (!ret)
  931                 ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr,
  932                     count);
  933         if (!ret)
  934                 ret = process_error_list(spa, &spa->spa_errlist_last, uaddr,
  935                     count);
  936         mutex_exit(&spa->spa_errlist_lock);
  937 
  938         mutex_exit(&spa->spa_errlog_lock);
  939         dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
  940 #else
  941         (void) spa, (void) uaddr, (void) count;
  942 #endif
  943 
  944         return (ret);
  945 }
  946 
/*
 * Called when a scrub completes.  This simply sets the spa_scrub_finished
 * flag, under the spa_errlist_lock.  spa_errlog_sync() reads and clears the
 * flag and is responsible for actually rotating the logs and syncing the
 * changes to the underlying objects.
 */
void
spa_errlog_rotate(spa_t *spa)
{
        mutex_enter(&spa->spa_errlist_lock);
        spa->spa_scrub_finished = B_TRUE;
        mutex_exit(&spa->spa_errlist_lock);
}
  959 
  960 /*
  961  * Discard any pending errors from the spa_t.  Called when unloading a faulted
  962  * pool, as the errors encountered during the open cannot be synced to disk.
  963  */
  964 void
  965 spa_errlog_drain(spa_t *spa)
  966 {
  967         spa_error_entry_t *se;
  968         void *cookie;
  969 
  970         mutex_enter(&spa->spa_errlist_lock);
  971 
  972         cookie = NULL;
  973         while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
  974             &cookie)) != NULL)
  975                 kmem_free(se, sizeof (spa_error_entry_t));
  976         cookie = NULL;
  977         while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
  978             &cookie)) != NULL)
  979                 kmem_free(se, sizeof (spa_error_entry_t));
  980 
  981         mutex_exit(&spa->spa_errlist_lock);
  982 }
  983 
  984 /*
  985  * Process a list of errors into the current on-disk log.
  986  */
  987 void
  988 sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
  989 {
  990         spa_error_entry_t *se;
  991         char buf[NAME_MAX_LEN];
  992         void *cookie;
  993 
  994         if (avl_numnodes(t) == 0)
  995                 return;
  996 
  997         /* create log if necessary */
  998         if (*obj == 0)
  999                 *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
 1000                     DMU_OT_NONE, 0, tx);
 1001 
 1002         /* add errors to the current log */
 1003         if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 1004                 for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
 1005                         bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
 1006 
 1007                         const char *name = se->se_name ? se->se_name : "";
 1008                         (void) zap_update(spa->spa_meta_objset, *obj, buf, 1,
 1009                             strlen(name) + 1, name, tx);
 1010                 }
 1011         } else {
 1012                 for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
 1013                         zbookmark_err_phys_t zep;
 1014                         zep.zb_object = se->se_bookmark.zb_object;
 1015                         zep.zb_level = se->se_bookmark.zb_level;
 1016                         zep.zb_blkid = se->se_bookmark.zb_blkid;
 1017                         zep.zb_birth = 0;
 1018 
 1019                         /*
 1020                          * If we cannot find out the head dataset and birth txg
 1021                          * of the present error block, we simply continue.
 1022                          * Reinserting that error block to the error lists,
 1023                          * even if we are not syncing the final txg, results
 1024                          * in duplicate posting of errors.
 1025                          */
 1026                         uint64_t head_dataset_obj;
 1027                         int error = get_head_and_birth_txg(spa, &zep,
 1028                             se->se_bookmark.zb_objset, &head_dataset_obj);
 1029                         if (error)
 1030                                 continue;
 1031 
 1032                         uint64_t err_obj;
 1033                         error = zap_lookup_int_key(spa->spa_meta_objset,
 1034                             *obj, head_dataset_obj, &err_obj);
 1035 
 1036                         if (error == ENOENT) {
 1037                                 err_obj = zap_create(spa->spa_meta_objset,
 1038                                     DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
 1039 
 1040                                 (void) zap_update_int_key(spa->spa_meta_objset,
 1041                                     *obj, head_dataset_obj, err_obj, tx);
 1042                         }
 1043                         errphys_to_name(&zep, buf, sizeof (buf));
 1044 
 1045                         const char *name = se->se_name ? se->se_name : "";
 1046                         (void) zap_update(spa->spa_meta_objset,
 1047                             err_obj, buf, 1, strlen(name) + 1, name, tx);
 1048                 }
 1049         }
 1050         /* purge the error list */
 1051         cookie = NULL;
 1052         while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
 1053                 kmem_free(se, sizeof (spa_error_entry_t));
 1054 }
 1055 
 1056 static void
 1057 delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx)
 1058 {
 1059         if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 1060                 zap_cursor_t zc;
 1061                 zap_attribute_t za;
 1062                 for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
 1063                     zap_cursor_retrieve(&zc, &za) == 0;
 1064                     zap_cursor_advance(&zc)) {
 1065                         VERIFY0(dmu_object_free(spa->spa_meta_objset,
 1066                             za.za_first_integer, tx));
 1067                 }
 1068                 zap_cursor_fini(&zc);
 1069         }
 1070         VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
 1071 }
 1072 
 1073 /*
 1074  * Sync the error log out to disk.  This is a little tricky because the act of
 1075  * writing the error log requires the spa_errlist_lock.  So, we need to lock the
 1076  * error lists, take a copy of the lists, and then reinitialize them.  Then, we
 1077  * drop the error list lock and take the error log lock, at which point we
 1078  * do the errlog processing.  Then, if we encounter an I/O error during this
 1079  * process, we can successfully add the error to the list.  Note that this will
 1080  * result in the perpetual recycling of errors, but it is an unlikely situation
 1081  * and not a performance critical operation.
 1082  */
 1083 void
 1084 spa_errlog_sync(spa_t *spa, uint64_t txg)
 1085 {
 1086         dmu_tx_t *tx;
 1087         avl_tree_t scrub, last;
 1088         int scrub_finished;
 1089 
 1090         mutex_enter(&spa->spa_errlist_lock);
 1091 
 1092         /*
 1093          * Bail out early under normal circumstances.
 1094          */
 1095         if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
 1096             avl_numnodes(&spa->spa_errlist_last) == 0 &&
 1097             avl_numnodes(&spa->spa_errlist_healed) == 0 &&
 1098             !spa->spa_scrub_finished) {
 1099                 mutex_exit(&spa->spa_errlist_lock);
 1100                 return;
 1101         }
 1102 
 1103         spa_get_errlists(spa, &last, &scrub);
 1104         scrub_finished = spa->spa_scrub_finished;
 1105         spa->spa_scrub_finished = B_FALSE;
 1106 
 1107         mutex_exit(&spa->spa_errlist_lock);
 1108 
 1109         /*
 1110          * The pool config lock is needed to hold a dataset_t via
 1111          * sync_error_list() -> get_head_and_birth_txg(), and lock ordering
 1112          * requires that we get it before the spa_errlog_lock.
 1113          */
 1114         dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 1115         mutex_enter(&spa->spa_errlog_lock);
 1116 
 1117         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 1118 
 1119         /*
 1120          * Remove healed errors from errors.
 1121          */
 1122         spa_remove_healed_errors(spa, &last, &scrub, tx);
 1123 
 1124         /*
 1125          * Sync out the current list of errors.
 1126          */
 1127         sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
 1128 
 1129         /*
 1130          * Rotate the log if necessary.
 1131          */
 1132         if (scrub_finished) {
 1133                 if (spa->spa_errlog_last != 0)
 1134                         delete_errlog(spa, spa->spa_errlog_last, tx);
 1135                 spa->spa_errlog_last = spa->spa_errlog_scrub;
 1136                 spa->spa_errlog_scrub = 0;
 1137 
 1138                 sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
 1139         }
 1140 
 1141         /*
 1142          * Sync out any pending scrub errors.
 1143          */
 1144         sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
 1145 
 1146         /*
 1147          * Update the MOS to reflect the new values.
 1148          */
 1149         (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 1150             DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
 1151             &spa->spa_errlog_last, tx);
 1152         (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 1153             DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
 1154             &spa->spa_errlog_scrub, tx);
 1155 
 1156         dmu_tx_commit(tx);
 1157 
 1158         mutex_exit(&spa->spa_errlog_lock);
 1159         dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 1160 }
 1161 
 1162 static void
 1163 delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds,
 1164     dmu_tx_t *tx)
 1165 {
 1166         if (spa_err_obj == 0)
 1167                 return;
 1168 
 1169         zap_cursor_t zc;
 1170         zap_attribute_t za;
 1171         for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
 1172             zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
 1173                 uint64_t head_ds;
 1174                 name_to_object(za.za_name, &head_ds);
 1175                 if (head_ds == ds) {
 1176                         (void) zap_remove(spa->spa_meta_objset, spa_err_obj,
 1177                             za.za_name, tx);
 1178                         VERIFY0(dmu_object_free(spa->spa_meta_objset,
 1179                             za.za_first_integer, tx));
 1180                         break;
 1181                 }
 1182         }
 1183         zap_cursor_fini(&zc);
 1184 }
 1185 
 1186 void
 1187 spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx)
 1188 {
 1189         mutex_enter(&spa->spa_errlog_lock);
 1190         delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx);
 1191         delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx);
 1192         mutex_exit(&spa->spa_errlog_lock);
 1193 }
 1194 
 1195 static int
 1196 find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head,
 1197     uint64_t *txg)
 1198 {
 1199         dsl_dataset_t *ds;
 1200         dsl_pool_t *dp = spa->spa_dsl_pool;
 1201 
 1202         int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds);
 1203         if (error != 0)
 1204                 return (error);
 1205 
 1206         uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 1207         uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 1208 
 1209         while (prev_obj != 0) {
 1210                 dsl_dataset_rele(ds, FTAG);
 1211                 if ((error = dsl_dataset_hold_obj(dp, prev_obj,
 1212                     FTAG, &ds)) == 0 &&
 1213                     dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head)
 1214                         break;
 1215 
 1216                 if (error != 0)
 1217                         return (error);
 1218 
 1219                 prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 1220                 prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 1221         }
 1222         dsl_dataset_rele(ds, FTAG);
 1223         ASSERT(prev_obj != 0);
 1224         *txg = prev_obj_txg;
 1225         return (0);
 1226 }
 1227 
 1228 static void
 1229 swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t
 1230     old_head, dmu_tx_t *tx)
 1231 {
 1232         if (spa_err_obj == 0)
 1233                 return;
 1234 
 1235         uint64_t old_head_errlog;
 1236         int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj,
 1237             old_head, &old_head_errlog);
 1238 
 1239         /* If no error log, then there is nothing to do. */
 1240         if (error != 0)
 1241                 return;
 1242 
 1243         uint64_t txg;
 1244         error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg);
 1245         if (error != 0)
 1246                 return;
 1247 
 1248         /*
 1249          * Create an error log if the file system being promoted does not
 1250          * already have one.
 1251          */
 1252         uint64_t new_head_errlog;
 1253         error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head,
 1254             &new_head_errlog);
 1255 
 1256         if (error != 0) {
 1257                 new_head_errlog = zap_create(spa->spa_meta_objset,
 1258                     DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
 1259 
 1260                 (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj,
 1261                     new_head, new_head_errlog, tx);
 1262         }
 1263 
 1264         zap_cursor_t zc;
 1265         zap_attribute_t za;
 1266         zbookmark_err_phys_t err_block;
 1267         for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog);
 1268             zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
 1269 
 1270                 const char *name = "";
 1271                 name_to_errphys(za.za_name, &err_block);
 1272                 if (err_block.zb_birth < txg) {
 1273                         (void) zap_update(spa->spa_meta_objset, new_head_errlog,
 1274                             za.za_name, 1, strlen(name) + 1, name, tx);
 1275 
 1276                         (void) zap_remove(spa->spa_meta_objset, old_head_errlog,
 1277                             za.za_name, tx);
 1278                 }
 1279         }
 1280         zap_cursor_fini(&zc);
 1281 }
 1282 
 1283 void
 1284 spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
 1285     dmu_tx_t *tx)
 1286 {
 1287         mutex_enter(&spa->spa_errlog_lock);
 1288         swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx);
 1289         swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx);
 1290         mutex_exit(&spa->spa_errlog_lock);
 1291 }
 1292 
#if defined(_KERNEL)
/*
 * Error handling entry points passed to EXPORT_SYMBOL().  The list is only
 * compiled for kernel builds.
 */
EXPORT_SYMBOL(spa_log_error);
EXPORT_SYMBOL(spa_approx_errlog_size);
EXPORT_SYMBOL(spa_get_errlog);
EXPORT_SYMBOL(spa_errlog_rotate);
EXPORT_SYMBOL(spa_errlog_drain);
EXPORT_SYMBOL(spa_errlog_sync);
EXPORT_SYMBOL(spa_get_errlists);
EXPORT_SYMBOL(spa_delete_dataset_errlog);
EXPORT_SYMBOL(spa_swap_errlog);
EXPORT_SYMBOL(sync_error_list);
EXPORT_SYMBOL(spa_upgrade_errlog);
#endif
 1307 
/*
 * Register the upgrade_errlog_limit tunable (scope zfs_spa, prefix spa_) as
 * a read-write (ZMOD_RW) UINT module parameter.  The tunable itself is
 * defined elsewhere in this file.
 */
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, UINT, ZMOD_RW,
        "Limit the number of errors which will be upgraded to the new "
        "on-disk error log when enabling head_errlog");
/* END CSTYLED */
Cache object: 90517519b061f1aa741c46514724a894 
 
 |