FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/zfs_fm.c

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   23  * Use is subject to license terms.
   24  */
   25 
   26 /*
   27  * Copyright (c) 2012,2021 by Delphix. All rights reserved.
   28  */
   29 
   30 #include <sys/spa.h>
   31 #include <sys/spa_impl.h>
   32 #include <sys/vdev.h>
   33 #include <sys/vdev_impl.h>
   34 #include <sys/zio.h>
   35 #include <sys/zio_checksum.h>
   36 
   37 #include <sys/fm/fs/zfs.h>
   38 #include <sys/fm/protocol.h>
   39 #include <sys/fm/util.h>
   40 #include <sys/sysevent.h>
   41 
   42 /*
   43  * This general routine is responsible for generating all the different ZFS
   44  * ereports.  The payload depends on the class and on which arguments are
   45  * supplied to the function:
   46  *
   47  *      EREPORT                 POOL    VDEV    IO
   48  *      block                   X       X       X
   49  *      data                    X               X
   50  *      device                  X       X
   51  *      pool                    X
   52  *
   53  * If we are in a loading state, all errors are chained together by the same
   54  * SPA-wide ENA (Error Numeric Association).
   55  *
   56  * For isolated I/O requests, we get the ENA from the zio_t. The propagation
   57  * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
   58  * to chain together all ereports associated with a logical piece of data.  For
   59  * read I/Os, there are basically three 'types' of I/O, which form a roughly
   60  * layered diagram:
   61  *
   62  *      +---------------+
   63  *      | Aggregate I/O |       No associated logical data or device
   64  *      +---------------+
   65  *              |
   66  *              V
   67  *      +---------------+       Reads associated with a piece of logical data.
   68  *      |   Read I/O    |       This includes reads on behalf of RAID-Z,
   69  *      +---------------+       mirrors, gang blocks, retries, etc.
   70  *              |
   71  *              V
   72  *      +---------------+       Reads associated with a particular device, but
   73  *      | Physical I/O  |       no logical data.  Issued as part of vdev caching
   74  *      +---------------+       and I/O aggregation.
   75  *
   76  * Note that 'physical I/O' here does not mean the same thing as in the rest
   77  * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
   78  * blockpointer.  But I/O with no associated block pointer can still be related
   79  * to a logical piece of data (i.e. RAID-Z requests).
   80  *
   81  * Purely physical I/Os always have unique ENAs.  They are not related to a
   82  * particular piece of logical data, and therefore cannot be chained together.
   83  * We still generate an ereport, but the DE doesn't correlate it with any
   84  * logical piece of data.  When such an I/O fails, the delegated I/O requests
   85  * will issue a retry, which will trigger the 'real' ereport with the correct
   86  * ENA.
   87  *
   88  * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
   89  * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
   90  * then inherit this pointer, so that once it is first set, subsequent failures
   91  * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
   92  * this pointer is set to NULL, and no ereport will be generated (since it
   93  * doesn't actually correspond to any particular device or piece of data,
   94  * and the caller will always retry without caching or queueing anyway).
   95  *
   96  * For checksum errors, we want to include more information about the actual
   97  * error which occurs.  Accordingly, we build an ereport when the error is
   98  * noticed, but instead of sending it in immediately, we hang it off of the
   99  * io_cksum_report field of the logical IO.  When the logical IO completes
  100  * (successfully or not), zfs_ereport_finish_checksum() is called with the
  101  * good and bad versions of the buffer (if available), and we annotate the
  102  * ereport with information about the differences.
  103  */
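/*
 * Editor's aside: a minimal, self-contained userspace sketch of the
 * ENA-chaining rule described above.  Everything below (mini_zio,
 * fake_ena_generate, etc.) is a hypothetical stand-in for the real
 * zio_t/fm_ena_generate() machinery; only the inheritance logic mirrors
 * the scheme this file relies on.
 */
#include <stdint.h>
#include <stdio.h>

struct mini_zio {
        struct mini_zio *io_logical;    /* NULL for aggregate/physical I/O */
        uint64_t io_ena;                /* 0 until the first failure */
};

static uint64_t
fake_ena_generate(void)
{
        static uint64_t next_ena = 1;
        return (next_ena++);            /* placeholder for fm_ena_generate() */
}

/* A new logical I/O points io_logical at itself. */
static void
mini_zio_make_logical(struct mini_zio *z)
{
        z->io_logical = z;
        z->io_ena = 0;
}

/* Children inherit the pointer, so related failures share one ENA. */
static void
mini_zio_make_child(struct mini_zio *child, struct mini_zio *parent)
{
        child->io_logical = parent->io_logical;
        child->io_ena = 0;
}

/* Mirrors the zio branch of the ENA selection in zfs_ereport_start(). */
static uint64_t
mini_zio_ena(struct mini_zio *z)
{
        if (z->io_logical == NULL)
                return (fake_ena_generate());   /* purely physical: unique */
        if (z->io_logical->io_ena == 0)
                z->io_logical->io_ena = fake_ena_generate();
        return (z->io_logical->io_ena);
}

int
main(void)
{
        struct mini_zio root, child1, child2;

        mini_zio_make_logical(&root);
        mini_zio_make_child(&child1, &root);
        mini_zio_make_child(&child2, &root);

        /* Both children report the ENA lazily minted on the root. */
        printf("%llu %llu\n",
            (unsigned long long)mini_zio_ena(&child1),
            (unsigned long long)mini_zio_ena(&child2));
        return (0);
}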
  104 
  105 #ifdef _KERNEL
  106 /*
  107  * Duplicate ereport detection
  108  *
  109  * Some ereports are retained momentarily for detecting duplicates.  These
  110  * are kept in a recent_events_node_t in both a time-ordered list and an AVL
  111  * tree of recent unique ereports.
  112  *
  113  * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
  114  * task is used to purge stale entries.
  115  */
  116 static list_t recent_events_list;
  117 static avl_tree_t recent_events_tree;
  118 static kmutex_t recent_events_lock;
  119 static taskqid_t recent_events_cleaner_tqid;
  120 
  121 /*
  122  * Each node is about 128 bytes, so 2,000 of them would consume 1/4 MiB.
  123  *
  124  * This setting can be changed dynamically and setting it to zero
  125  * disables duplicate detection.
  126  */
  127 static unsigned int zfs_zevent_retain_max = 2000;
  128 
  129 /*
  130  * The lifespan for a recent ereport entry. The default of 15 minutes is
  131  * intended to outlive the zfs diagnosis engine's threshold of 10 errors
  132  * over a period of 10 minutes.
  133  */
  134 static unsigned int zfs_zevent_retain_expire_secs = 900;
  135 
  136 typedef enum zfs_subclass {
  137         ZSC_IO,
  138         ZSC_DATA,
  139         ZSC_CHECKSUM
  140 } zfs_subclass_t;
  141 
  142 typedef struct {
  143         /* common criteria */
  144         uint64_t        re_pool_guid;
  145         uint64_t        re_vdev_guid;
  146         int             re_io_error;
  147         uint64_t        re_io_size;
  148         uint64_t        re_io_offset;
  149         zfs_subclass_t  re_subclass;
  150         zio_priority_t  re_io_priority;
  151 
  152         /* logical zio criteria (optional) */
  153         zbookmark_phys_t re_io_bookmark;
  154 
  155         /* internal state */
  156         avl_node_t      re_tree_link;
  157         list_node_t     re_list_link;
  158         uint64_t        re_timestamp;
  159 } recent_events_node_t;
  160 
  161 static int
  162 recent_events_compare(const void *a, const void *b)
  163 {
  164         const recent_events_node_t *node1 = a;
  165         const recent_events_node_t *node2 = b;
  166         int cmp;
  167 
  168         /*
  169          * The comparison order here is somewhat arbitrary.
  170          * What's important is that if every criteria matches, then it
  171          * What's important is that if every criterion matches, then it
  172          */
  173         if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0)
  174                 return (cmp);
  175         if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0)
  176                 return (cmp);
  177         if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0)
  178                 return (cmp);
  179         if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0)
  180                 return (cmp);
  181         if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0)
  182                 return (cmp);
  183         if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0)
  184                 return (cmp);
  185         if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0)
  186                 return (cmp);
  187 
  188         const zbookmark_phys_t *zb1 = &node1->re_io_bookmark;
  189         const zbookmark_phys_t *zb2 = &node2->re_io_bookmark;
  190 
  191         if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0)
  192                 return (cmp);
  193         if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0)
  194                 return (cmp);
  195         if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0)
  196                 return (cmp);
  197         if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0)
  198                 return (cmp);
  199 
  200         return (0);
  201 }
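/*
 * Editor's aside: TREE_CMP() evaluates to -1, 0, or +1, so the chain
 * above is a lexicographic ordering in which the first differing
 * criterion decides and 0 falls out only when every field matches.
 * A self-contained illustration of the same pattern (the names below
 * are hypothetical, not the kernel's):
 */
#include <stdint.h>
#include <stdio.h>

#define CMP(a, b)       (((a) > (b)) - ((a) < (b)))    /* like TREE_CMP */

struct key {
        uint64_t pool;
        uint64_t vdev;
        int error;
};

static int
key_compare(const struct key *k1, const struct key *k2)
{
        int cmp;

        if ((cmp = CMP(k1->pool, k2->pool)) != 0)
                return (cmp);
        if ((cmp = CMP(k1->vdev, k2->vdev)) != 0)
                return (cmp);
        return (CMP(k1->error, k2->error));
}

int
main(void)
{
        struct key a = { 1, 7, 5 }, b = { 1, 7, 5 }, c = { 1, 8, 5 };

        /* prints "0 -1": a duplicates b, and sorts before c */
        printf("%d %d\n", key_compare(&a, &b), key_compare(&a, &c));
        return (0);
}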
  202 
  203 /*
  204  * workaround: vdev properties don't have inheritance
  205  */
  206 static uint64_t
  207 vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
  208 {
  209         uint64_t propdef, propval;
  210 
  211         propdef = vdev_prop_default_numeric(prop);
  212         switch (prop) {
  213                 case VDEV_PROP_CHECKSUM_N:
  214                         propval = vd->vdev_checksum_n;
  215                         break;
  216                 case VDEV_PROP_CHECKSUM_T:
  217                         propval = vd->vdev_checksum_t;
  218                         break;
  219                 case VDEV_PROP_IO_N:
  220                         propval = vd->vdev_io_n;
  221                         break;
  222                 case VDEV_PROP_IO_T:
  223                         propval = vd->vdev_io_t;
  224                         break;
  225                 default:
  226                         propval = propdef;
  227                         break;
  228         }
  229 
  230         if (propval != propdef)
  231                 return (propval);
  232 
  233         if (vd->vdev_parent == NULL)
  234                 return (propdef);
  235 
  236         return (vdev_prop_get_inherited(vd->vdev_parent, prop));
  237 }
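/*
 * Editor's aside: with hypothetical values, if the root vdev sets
 * checksum_n=10 and a leaf disk leaves it at the default, a lookup on
 * the leaf walks leaf -> mirror -> root and returns 10; the default
 * itself comes back only when every ancestor is also at the default.
 */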
  238 
  239 static void zfs_ereport_schedule_cleaner(void);
  240 
  241 /*
  242  * background task to clean stale recent event nodes.
  243  */
  244 static void
  245 zfs_ereport_cleaner(void *arg)
  246 {
  247         recent_events_node_t *entry;
  248         uint64_t now = gethrtime();
  249 
  250         /*
  251          * purge expired entries
  252          */
  253         mutex_enter(&recent_events_lock);
  254         while ((entry = list_tail(&recent_events_list)) != NULL) {
  255                 uint64_t age = NSEC2SEC(now - entry->re_timestamp);
  256                 if (age <= zfs_zevent_retain_expire_secs)
  257                         break;
  258 
  259                 /* remove expired node */
  260                 avl_remove(&recent_events_tree, entry);
  261                 list_remove(&recent_events_list, entry);
  262                 kmem_free(entry, sizeof (*entry));
  263         }
  264 
  265         /* Restart the cleaner if more entries remain */
  266         recent_events_cleaner_tqid = 0;
  267         if (!list_is_empty(&recent_events_list))
  268                 zfs_ereport_schedule_cleaner();
  269 
  270         mutex_exit(&recent_events_lock);
  271 }
  272 
  273 static void
  274 zfs_ereport_schedule_cleaner(void)
  275 {
  276         ASSERT(MUTEX_HELD(&recent_events_lock));
  277 
  278         uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
  279 
  280         recent_events_cleaner_tqid = taskq_dispatch_delay(
  281             system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
  282             ddi_get_lbolt() + NSEC_TO_TICK(timeout));
  283 }
  284 
  285 /*
  286  * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL
  287  */
  288 void
  289 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
  290 {
  291         uint64_t vdev_guid, pool_guid;
  292 
  293         ASSERT(vd != NULL || spa != NULL);
  294         if (vd == NULL) {
  295                 vdev_guid = 0;
  296                 pool_guid = spa_guid(spa);
  297         } else {
  298                 vdev_guid = vd->vdev_guid;
  299                 pool_guid = 0;
  300         }
  301 
  302         mutex_enter(&recent_events_lock);
  303 
  304         recent_events_node_t *next = list_head(&recent_events_list);
  305         while (next != NULL) {
  306                 recent_events_node_t *entry = next;
  307 
  308                 next = list_next(&recent_events_list, next);
  309 
  310                 if (entry->re_vdev_guid == vdev_guid ||
  311                     entry->re_pool_guid == pool_guid) {
  312                         avl_remove(&recent_events_tree, entry);
  313                         list_remove(&recent_events_list, entry);
  314                         kmem_free(entry, sizeof (*entry));
  315                 }
  316         }
  317 
  318         mutex_exit(&recent_events_lock);
  319 }
  320 
  321 /*
  322  * Check if an ereport would be a duplicate of one recently posted.
  323  *
  324  * An ereport is considered a duplicate if the set of criteria in
  325  * recent_events_node_t all match.
  326  *
  327  * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
  328  * are candidates for duplicate checking.
  329  */
  330 static boolean_t
  331 zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
  332     const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
  333 {
  334         recent_events_node_t search = {0}, *entry;
  335 
  336         if (vd == NULL || zio == NULL)
  337                 return (B_FALSE);
  338 
  339         if (zfs_zevent_retain_max == 0)
  340                 return (B_FALSE);
  341 
  342         if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
  343                 search.re_subclass = ZSC_IO;
  344         else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
  345                 search.re_subclass = ZSC_DATA;
  346         else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
  347                 search.re_subclass = ZSC_CHECKSUM;
  348         else
  349                 return (B_FALSE);
  350 
  351         search.re_pool_guid = spa_guid(spa);
  352         search.re_vdev_guid = vd->vdev_guid;
  353         search.re_io_error = zio->io_error;
  354         search.re_io_priority = zio->io_priority;
  355         /* if size is supplied, use it over what's in the zio */
  356         if (size) {
  357                 search.re_io_size = size;
  358                 search.re_io_offset = offset;
  359         } else {
  360                 search.re_io_size = zio->io_size;
  361                 search.re_io_offset = zio->io_offset;
  362         }
  363 
  364         /* grab optional logical zio criteria */
  365         if (zb != NULL) {
  366                 search.re_io_bookmark.zb_objset = zb->zb_objset;
  367                 search.re_io_bookmark.zb_object = zb->zb_object;
  368                 search.re_io_bookmark.zb_level = zb->zb_level;
  369                 search.re_io_bookmark.zb_blkid = zb->zb_blkid;
  370         }
  371 
  372         uint64_t now = gethrtime();
  373 
  374         mutex_enter(&recent_events_lock);
  375 
  376         /* check if we have seen this one recently */
  377         entry = avl_find(&recent_events_tree, &search, NULL);
  378         if (entry != NULL) {
  379                 uint64_t age = NSEC2SEC(now - entry->re_timestamp);
  380 
  381                 /*
  382                  * There is still an active cleaner (since we're here).
  383                  * Reset the last seen time for this duplicate entry
  384                  * so that its lifespan gets extended.
  385                  */
  386                 list_remove(&recent_events_list, entry);
  387                 list_insert_head(&recent_events_list, entry);
  388                 entry->re_timestamp = now;
  389 
  390                 zfs_zevent_track_duplicate();
  391                 mutex_exit(&recent_events_lock);
  392 
  393                 return (age <= zfs_zevent_retain_expire_secs);
  394         }
  395 
  396         if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
  397                 /* recycle oldest node */
  398                 entry = list_tail(&recent_events_list);
  399                 ASSERT(entry != NULL);
  400                 list_remove(&recent_events_list, entry);
  401                 avl_remove(&recent_events_tree, entry);
  402         } else {
  403                 entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
  404         }
  405 
  406         /* record this as a recent ereport */
  407         *entry = search;
  408         avl_add(&recent_events_tree, entry);
  409         list_insert_head(&recent_events_list, entry);
  410         entry->re_timestamp = now;
  411 
  412         /* Start a cleaner if not already scheduled */
  413         if (recent_events_cleaner_tqid == 0)
  414                 zfs_ereport_schedule_cleaner();
  415 
  416         mutex_exit(&recent_events_lock);
  417         return (B_FALSE);
  418 }
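/*
 * Editor's aside: note the return on a hit above.  An ereport counts as
 * a duplicate only while the matching entry is younger than
 * zfs_zevent_retain_expire_secs; a stale entry that the cleaner has not
 * yet purged still has its timestamp refreshed and is moved to the head
 * of the list, but the ereport itself is treated as new.
 */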
  419 
  420 void
  421 zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
  422 {
  423         if (nvl)
  424                 fm_nvlist_destroy(nvl, FM_NVA_FREE);
  425 
  426         if (detector)
  427                 fm_nvlist_destroy(detector, FM_NVA_FREE);
  428 }
  429 
  430 /*
  431  * We want to rate limit ZIO delay, deadman, and checksum events so as not to
  432  * flood zevent consumers when a disk is acting up.
  433  *
  434  * Returns 1 if we're ratelimiting, 0 if not.
  435  */
  436 static int
  437 zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
  438 {
  439         int rc = 0;
  440         /*
  441          * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
  442          * are.  Invert it to get our return value.
  443          */
  444         if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
  445                 rc = !zfs_ratelimit(&vd->vdev_delay_rl);
  446         } else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) {
  447                 rc = !zfs_ratelimit(&vd->vdev_deadman_rl);
  448         } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
  449                 rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
  450         }
  451 
  452         if (rc) {
  453                 /* We're rate limiting */
  454                 fm_erpt_dropped_increment();
  455         }
  456 
  457         return (rc);
  458 }
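/*
 * Editor's aside: zfs_ratelimit() itself is defined elsewhere; it
 * implements a simple "burst of events per interval" limiter.  A
 * minimal userspace sketch of that idea (hypothetical names, a hedged
 * simplification) to make the inverted 1-means-proceed convention
 * above concrete:
 */
#include <time.h>

struct mini_ratelimit {
        unsigned int burst;     /* events allowed per interval */
        unsigned int count;     /* events seen so far this interval */
        unsigned int interval;  /* interval length, in seconds */
        time_t start;           /* start of the current interval */
};

/* Returns 1 if the event may proceed, 0 if it should be dropped. */
static int
mini_ratelimit(struct mini_ratelimit *rl)
{
        time_t now = time(NULL);

        if (now - rl->start >= rl->interval) {
                rl->start = now;        /* new interval: reset the budget */
                rl->count = 0;
        }
        if (rl->count >= rl->burst)
                return (0);             /* over budget: rate limited */
        rl->count++;
        return (1);
}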
  459 
  460 /*
  461  * Return B_TRUE if the ereport was successfully constructed, B_FALSE if not.
  462  */
  463 static boolean_t
  464 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
  465     const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
  466     zio_t *zio, uint64_t stateoroffset, uint64_t size)
  467 {
  468         nvlist_t *ereport, *detector;
  469 
  470         uint64_t ena;
  471         char class[64];
  472 
  473         if ((ereport = fm_nvlist_create(NULL)) == NULL)
  474                 return (B_FALSE);
  475 
  476         if ((detector = fm_nvlist_create(NULL)) == NULL) {
  477                 fm_nvlist_destroy(ereport, FM_NVA_FREE);
  478                 return (B_FALSE);
  479         }
  480 
  481         /*
  482          * Serialize ereport generation
  483          */
  484         mutex_enter(&spa->spa_errlist_lock);
  485 
  486         /*
  487          * Determine the ENA to use for this event.  If we are in a loading
  488          * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
  489          * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
  490          */
  491         if (spa_load_state(spa) != SPA_LOAD_NONE) {
  492                 if (spa->spa_ena == 0)
  493                         spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
  494                 ena = spa->spa_ena;
  495         } else if (zio != NULL && zio->io_logical != NULL) {
  496                 if (zio->io_logical->io_ena == 0)
  497                         zio->io_logical->io_ena =
  498                             fm_ena_generate(0, FM_ENA_FMT1);
  499                 ena = zio->io_logical->io_ena;
  500         } else {
  501                 ena = fm_ena_generate(0, FM_ENA_FMT1);
  502         }
  503 
  504         /*
  505          * Construct the full class, detector, and other standard FMA fields.
  506          */
  507         (void) snprintf(class, sizeof (class), "%s.%s",
  508             ZFS_ERROR_CLASS, subclass);
  509 
  510         fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
  511             vd != NULL ? vd->vdev_guid : 0);
  512 
  513         fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
  514 
  515         /*
  516          * Construct the per-ereport payload, depending on which parameters are
  517          * passed in.
  518          */
  519 
  520         /*
  521          * Generic payload members common to all ereports.
  522          */
  523         fm_payload_set(ereport,
  524             FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
  525             FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
  526             FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
  527             (uint64_t)spa_state(spa),
  528             FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
  529             (int32_t)spa_load_state(spa), NULL);
  530 
  531         fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
  532             DATA_TYPE_STRING,
  533             spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
  534             FM_EREPORT_FAILMODE_WAIT :
  535             spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
  536             FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
  537             NULL);
  538 
  539         if (vd != NULL) {
  540                 vdev_t *pvd = vd->vdev_parent;
  541                 vdev_queue_t *vq = &vd->vdev_queue;
  542                 vdev_stat_t *vs = &vd->vdev_stat;
  543                 vdev_t *spare_vd;
  544                 uint64_t *spare_guids;
  545                 char **spare_paths;
  546                 int i, spare_count;
  547 
  548                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
  549                     DATA_TYPE_UINT64, vd->vdev_guid,
  550                     FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
  551                     DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
  552                 if (vd->vdev_path != NULL)
  553                         fm_payload_set(ereport,
  554                             FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
  555                             DATA_TYPE_STRING, vd->vdev_path, NULL);
  556                 if (vd->vdev_devid != NULL)
  557                         fm_payload_set(ereport,
  558                             FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
  559                             DATA_TYPE_STRING, vd->vdev_devid, NULL);
  560                 if (vd->vdev_fru != NULL)
  561                         fm_payload_set(ereport,
  562                             FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
  563                             DATA_TYPE_STRING, vd->vdev_fru, NULL);
  564                 if (vd->vdev_enc_sysfs_path != NULL)
  565                         fm_payload_set(ereport,
  566                             FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
  567                             DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
  568                 if (vd->vdev_ashift)
  569                         fm_payload_set(ereport,
  570                             FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
  571                             DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
  572 
  573                 if (vq != NULL) {
  574                         fm_payload_set(ereport,
  575                             FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
  576                             DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
  577                         fm_payload_set(ereport,
  578                             FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
  579                             DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
  580                 }
  581 
  582                 if (vs != NULL) {
  583                         fm_payload_set(ereport,
  584                             FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
  585                             DATA_TYPE_UINT64, vs->vs_read_errors,
  586                             FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
  587                             DATA_TYPE_UINT64, vs->vs_write_errors,
  588                             FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
  589                             DATA_TYPE_UINT64, vs->vs_checksum_errors,
  590                             FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
  591                             DATA_TYPE_UINT64, vs->vs_slow_ios,
  592                             NULL);
  593                 }
  594 
  595                 if (pvd != NULL) {
  596                         fm_payload_set(ereport,
  597                             FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
  598                             DATA_TYPE_UINT64, pvd->vdev_guid,
  599                             FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
  600                             DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
  601                             NULL);
  602                         if (pvd->vdev_path)
  603                                 fm_payload_set(ereport,
  604                                     FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
  605                                     DATA_TYPE_STRING, pvd->vdev_path, NULL);
  606                         if (pvd->vdev_devid)
  607                                 fm_payload_set(ereport,
  608                                     FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
  609                                     DATA_TYPE_STRING, pvd->vdev_devid, NULL);
  610                 }
  611 
  612                 spare_count = spa->spa_spares.sav_count;
  613                 spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
  614                     KM_SLEEP);
  615                 spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
  616                     KM_SLEEP);
  617 
  618                 for (i = 0; i < spare_count; i++) {
  619                         spare_vd = spa->spa_spares.sav_vdevs[i];
  620                         if (spare_vd) {
  621                                 spare_paths[i] = spare_vd->vdev_path;
  622                                 spare_guids[i] = spare_vd->vdev_guid;
  623                         }
  624                 }
  625 
  626                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
  627                     DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
  628                     FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
  629                     DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);
  630 
  631                 kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
  632                 kmem_free(spare_paths, sizeof (char *) * spare_count);
  633         }
  634 
  635         if (zio != NULL) {
  636                 /*
  637                  * Payload common to all I/Os.
  638                  */
  639                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
  640                     DATA_TYPE_INT32, zio->io_error, NULL);
  641                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
  642                     DATA_TYPE_INT32, zio->io_flags, NULL);
  643                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
  644                     DATA_TYPE_UINT32, zio->io_stage, NULL);
  645                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
  646                     DATA_TYPE_UINT32, zio->io_pipeline, NULL);
  647                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
  648                     DATA_TYPE_UINT64, zio->io_delay, NULL);
  649                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
  650                     DATA_TYPE_UINT64, zio->io_timestamp, NULL);
  651                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
  652                     DATA_TYPE_UINT64, zio->io_delta, NULL);
  653                 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
  654                     DATA_TYPE_UINT32, zio->io_priority, NULL);
  655 
  656                 /*
  657                  * If the 'size' parameter is non-zero, it indicates this is a
  658                  * RAID-Z or other I/O where the physical offset and length are
  659                  * provided for us, instead of within the zio_t.
  660                  */
  661                 if (vd != NULL) {
  662                         if (size)
  663                                 fm_payload_set(ereport,
  664                                     FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
  665                                     DATA_TYPE_UINT64, stateoroffset,
  666                                     FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
  667                                     DATA_TYPE_UINT64, size, NULL);
  668                         else
  669                                 fm_payload_set(ereport,
  670                                     FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
  671                                     DATA_TYPE_UINT64, zio->io_offset,
  672                                     FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
  673                                     DATA_TYPE_UINT64, zio->io_size, NULL);
  674                 }
  675         } else if (vd != NULL) {
  676                 /*
  677                  * If we have a vdev but no zio, this is a device fault, and the
  678                  * 'stateoroffset' parameter indicates the previous state of the
  679                  * vdev.
  680                  */
  681                 fm_payload_set(ereport,
  682                     FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
  683                     DATA_TYPE_UINT64, stateoroffset, NULL);
  684         }
  685 
  686         /*
  687          * Payload for I/Os with corresponding logical information.
  688          */
  689         if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
  690                 fm_payload_set(ereport,
  691                     FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
  692                     DATA_TYPE_UINT64, zb->zb_objset,
  693                     FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
  694                     DATA_TYPE_UINT64, zb->zb_object,
  695                     FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
  696                     DATA_TYPE_INT64, zb->zb_level,
  697                     FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
  698                     DATA_TYPE_UINT64, zb->zb_blkid, NULL);
  699         }
  700 
  701         /*
  702          * Payload for tuning the zed
  703          */
  704         if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
  705                 uint64_t cksum_n, cksum_t;
  706 
  707                 cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N);
  708                 if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N))
  709                         fm_payload_set(ereport,
  710                             FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
  711                             DATA_TYPE_UINT64,
  712                             cksum_n,
  713                             NULL);
  714 
  715                 cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T);
  716                 if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T))
  717                         fm_payload_set(ereport,
  718                             FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
  719                             DATA_TYPE_UINT64,
  720                             cksum_t,
  721                             NULL);
  722         }
  723 
  724         if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) {
  725                 uint64_t io_n, io_t;
  726 
  727                 io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N);
  728                 if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N))
  729                         fm_payload_set(ereport,
  730                             FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
  731                             DATA_TYPE_UINT64,
  732                             io_n,
  733                             NULL);
  734 
  735                 io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T);
  736                 if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T))
  737                         fm_payload_set(ereport,
  738                             FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
  739                             DATA_TYPE_UINT64,
  740                             io_t,
  741                             NULL);
  742         }
  743 
  744         mutex_exit(&spa->spa_errlist_lock);
  745 
  746         *ereport_out = ereport;
  747         *detector_out = detector;
  748         return (B_TRUE);
  749 }
  750 
  751 /* if it's <= 128 bytes, save the corruption directly */
  752 #define ZFM_MAX_INLINE          (128 / sizeof (uint64_t))
  753 
  754 #define MAX_RANGES              16
  755 
  756 typedef struct zfs_ecksum_info {
  757         /* histograms of set and cleared bits by bit number in a 64-bit word */
  758         uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
  759         uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
  760 
  761         /* inline arrays of bits set and cleared. */
  762         uint64_t zei_bits_set[ZFM_MAX_INLINE];
  763         uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
  764 
  765         /*
  766          * for each range, the number of bits set and cleared.  The Hamming
  767          * distance between the good and bad buffers is the sum of them all.
  768          */
  769         uint32_t zei_range_sets[MAX_RANGES];
  770         uint32_t zei_range_clears[MAX_RANGES];
  771 
  772         struct zei_ranges {
  773                 uint32_t        zr_start;
  774                 uint32_t        zr_end;
  775         } zei_ranges[MAX_RANGES];
  776 
  777         size_t  zei_range_count;
  778         uint32_t zei_mingap;
  779         uint32_t zei_allowed_mingap;
  780 
  781 } zfs_ecksum_info_t;
  782 
  783 static void
  784 update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
  785 {
  786         size_t i;
  787         size_t bits = 0;
  788         uint64_t value = BE_64(value_arg);
  789 
  790         /* We store the bits in big-endian (largest-first) order */
  791         for (i = 0; i < 64; i++) {
  792                 if (value & (1ull << i)) {
  793                         hist[63 - i]++;
  794                         ++bits;
  795                 }
  796         }
  797         /* update the count of bits changed */
  798         *count += bits;
  799 }
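/*
 * Editor's aside: a worked example of the bookkeeping in
 * update_histogram() above and the set/cleared computation in
 * annotate_ecksum() below.  For a good word 0xF0 and a bad word 0x0F,
 * set = ~good & bad = 0x0F (four bits gained) and cleared =
 * good & ~bad = 0xF0 (four bits lost), so this word contributes 8 to
 * the Hamming distance and bumps four buckets in each histogram.  A
 * standalone check (uses the GCC/Clang popcount builtin):
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t good = 0xF0, bad = 0x0F;
        uint64_t set = ~good & bad;     /* bits set in bad, not in good */
        uint64_t cleared = good & ~bad; /* bits set in good, not in bad */

        /* prints "set=4 cleared=4 hamming=8" */
        printf("set=%d cleared=%d hamming=%d\n",
            __builtin_popcountll(set), __builtin_popcountll(cleared),
            __builtin_popcountll(set) + __builtin_popcountll(cleared));
        return (0);
}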
  800 
  801 /*
  802  * We've now filled up the range array, and need to increase "mingap" and
  803  * shrink the range list accordingly.  zei_mingap is always the smallest
  804  * distance between array entries, so we set the new_allowed_gap to be
  805  * one greater than that.  We then go through the list, joining together
  806  * any ranges which are closer than the new_allowed_gap.
  807  *
   808  * By construction, there will be at least one such pair.  We also update
   809  * zei_mingap to the new smallest gap, to prepare for our next invocation.
  810  */
  811 static void
  812 zei_shrink_ranges(zfs_ecksum_info_t *eip)
  813 {
  814         uint32_t mingap = UINT32_MAX;
  815         uint32_t new_allowed_gap = eip->zei_mingap + 1;
  816 
  817         size_t idx, output;
  818         size_t max = eip->zei_range_count;
  819 
  820         struct zei_ranges *r = eip->zei_ranges;
  821 
  822         ASSERT3U(eip->zei_range_count, >, 0);
  823         ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
  824 
  825         output = idx = 0;
  826         while (idx < max - 1) {
  827                 uint32_t start = r[idx].zr_start;
  828                 uint32_t end = r[idx].zr_end;
  829 
  830                 while (idx < max - 1) {
  831                         idx++;
  832 
  833                         uint32_t nstart = r[idx].zr_start;
  834                         uint32_t nend = r[idx].zr_end;
  835 
  836                         uint32_t gap = nstart - end;
  837                         if (gap < new_allowed_gap) {
  838                                 end = nend;
  839                                 continue;
  840                         }
  841                         if (gap < mingap)
  842                                 mingap = gap;
  843                         break;
  844                 }
  845                 r[output].zr_start = start;
  846                 r[output].zr_end = end;
  847                 output++;
  848         }
  849         ASSERT3U(output, <, eip->zei_range_count);
  850         eip->zei_range_count = output;
  851         eip->zei_mingap = mingap;
  852         eip->zei_allowed_mingap = new_allowed_gap;
  853 }
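/*
 * Editor's aside: a scaled-down trace (real arrays shrink only once all
 * MAX_RANGES == 16 slots are full; see zei_add_range() below).  Suppose
 * the array held the word ranges [0,1] [3,4] [6,7] with zei_mingap == 2.
 * Then new_allowed_gap becomes 3, both inter-range gaps (3-1 and 6-4)
 * fall below it, and the three ranges collapse into the single range
 * [0,7]; zei_mingap resets to UINT32_MAX since no gap survived the
 * merge.  (Hypothetical numbers, chosen only to illustrate the walk.)
 */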
  854 
  855 static void
  856 zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
  857 {
  858         struct zei_ranges *r = eip->zei_ranges;
  859         size_t count = eip->zei_range_count;
  860 
  861         if (count >= MAX_RANGES) {
  862                 zei_shrink_ranges(eip);
  863                 count = eip->zei_range_count;
  864         }
  865         if (count == 0) {
  866                 eip->zei_mingap = UINT32_MAX;
  867                 eip->zei_allowed_mingap = 1;
  868         } else {
  869                 int gap = start - r[count - 1].zr_end;
  870 
  871                 if (gap < eip->zei_allowed_mingap) {
  872                         r[count - 1].zr_end = end;
  873                         return;
  874                 }
  875                 if (gap < eip->zei_mingap)
  876                         eip->zei_mingap = gap;
  877         }
  878         r[count].zr_start = start;
  879         r[count].zr_end = end;
  880         eip->zei_range_count++;
  881 }
  882 
  883 static size_t
  884 zei_range_total_size(zfs_ecksum_info_t *eip)
  885 {
  886         struct zei_ranges *r = eip->zei_ranges;
  887         size_t count = eip->zei_range_count;
  888         size_t result = 0;
  889         size_t idx;
  890 
  891         for (idx = 0; idx < count; idx++)
  892                 result += (r[idx].zr_end - r[idx].zr_start);
  893 
  894         return (result);
  895 }
  896 
  897 static zfs_ecksum_info_t *
  898 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
  899     const abd_t *goodabd, const abd_t *badabd, size_t size,
  900     boolean_t drop_if_identical)
  901 {
  902         const uint64_t *good;
  903         const uint64_t *bad;
  904 
  905         size_t nui64s = size / sizeof (uint64_t);
  906 
  907         size_t inline_size;
  908         int no_inline = 0;
  909         size_t idx;
  910         size_t range;
  911 
  912         size_t offset = 0;
  913         ssize_t start = -1;
  914 
  915         zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
  916 
  917         /* don't do any annotation for injected checksum errors */
  918         if (info != NULL && info->zbc_injected)
  919                 return (eip);
  920 
  921         if (info != NULL && info->zbc_has_cksum) {
  922                 fm_payload_set(ereport,
  923                     FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
  924                     DATA_TYPE_UINT64_ARRAY,
  925                     sizeof (info->zbc_expected) / sizeof (uint64_t),
  926                     (uint64_t *)&info->zbc_expected,
  927                     FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
  928                     DATA_TYPE_UINT64_ARRAY,
  929                     sizeof (info->zbc_actual) / sizeof (uint64_t),
  930                     (uint64_t *)&info->zbc_actual,
  931                     FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
  932                     DATA_TYPE_STRING,
  933                     info->zbc_checksum_name,
  934                     NULL);
  935 
  936                 if (info->zbc_byteswapped) {
  937                         fm_payload_set(ereport,
  938                             FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
  939                             DATA_TYPE_BOOLEAN, 1,
  940                             NULL);
  941                 }
  942         }
  943 
  944         if (badabd == NULL || goodabd == NULL)
  945                 return (eip);
  946 
  947         ASSERT3U(nui64s, <=, UINT32_MAX);
  948         ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
  949         ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
  950         ASSERT3U(size, <=, UINT32_MAX);
  951 
  952         good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
  953         bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);
  954 
  955         /* build up the range list by comparing the two buffers. */
  956         for (idx = 0; idx < nui64s; idx++) {
  957                 if (good[idx] == bad[idx]) {
  958                         if (start == -1)
  959                                 continue;
  960 
  961                         zei_add_range(eip, start, idx);
  962                         start = -1;
  963                 } else {
  964                         if (start != -1)
  965                                 continue;
  966 
  967                         start = idx;
  968                 }
  969         }
  970         if (start != -1)
  971                 zei_add_range(eip, start, idx);
  972 
  973         /* See if it will fit in our inline buffers */
  974         inline_size = zei_range_total_size(eip);
  975         if (inline_size > ZFM_MAX_INLINE)
  976                 no_inline = 1;
  977 
  978         /*
  979          * If there is no change and we want to drop if the buffers are
  980          * identical, do so.
  981          */
  982         if (inline_size == 0 && drop_if_identical) {
  983                 kmem_free(eip, sizeof (*eip));
  984                 abd_return_buf((abd_t *)goodabd, (void *)good, size);
  985                 abd_return_buf((abd_t *)badabd, (void *)bad, size);
  986                 return (NULL);
  987         }
  988 
  989         /*
  990          * Now walk through the ranges, filling in the details of the
  991          * differences.  Also convert our uint64_t-array offsets to byte
  992          * offsets.
  993          */
  994         for (range = 0; range < eip->zei_range_count; range++) {
  995                 size_t start = eip->zei_ranges[range].zr_start;
  996                 size_t end = eip->zei_ranges[range].zr_end;
  997 
  998                 for (idx = start; idx < end; idx++) {
  999                         uint64_t set, cleared;
 1000 
 1001                         /* bits set in bad, but not in good */
 1002                         set = ((~good[idx]) & bad[idx]);
 1003                         /* bits set in good, but not in bad */
 1004                         cleared = (good[idx] & (~bad[idx]));
 1005 
 1006                         if (!no_inline) {
 1007                                 ASSERT3U(offset, <, inline_size);
 1008                                 eip->zei_bits_set[offset] = set;
 1009                                 eip->zei_bits_cleared[offset] = cleared;
 1010                                 offset++;
 1011                         }
 1012 
 1013                         update_histogram(set, eip->zei_histogram_set,
 1014                             &eip->zei_range_sets[range]);
 1015                         update_histogram(cleared, eip->zei_histogram_cleared,
 1016                             &eip->zei_range_clears[range]);
 1017                 }
 1018 
 1019                 /* convert to byte offsets */
 1020                 eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
 1021                 eip->zei_ranges[range].zr_end   *= sizeof (uint64_t);
 1022         }
 1023 
 1024         abd_return_buf((abd_t *)goodabd, (void *)good, size);
 1025         abd_return_buf((abd_t *)badabd, (void *)bad, size);
 1026 
 1027         eip->zei_allowed_mingap *= sizeof (uint64_t);
 1028         inline_size             *= sizeof (uint64_t);
 1029 
 1030         /* fill in ereport */
 1031         fm_payload_set(ereport,
 1032             FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
 1033             DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
 1034             (uint32_t *)eip->zei_ranges,
 1035             FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
 1036             DATA_TYPE_UINT32, eip->zei_allowed_mingap,
 1037             FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
 1038             DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
 1039             FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
 1040             DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
 1041             NULL);
 1042 
 1043         if (!no_inline) {
 1044                 fm_payload_set(ereport,
 1045                     FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
 1046                     DATA_TYPE_UINT8_ARRAY,
 1047                     inline_size, (uint8_t *)eip->zei_bits_set,
 1048                     FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
 1049                     DATA_TYPE_UINT8_ARRAY,
 1050                     inline_size, (uint8_t *)eip->zei_bits_cleared,
 1051                     NULL);
 1052         } else {
 1053                 fm_payload_set(ereport,
 1054                     FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
 1055                     DATA_TYPE_UINT32_ARRAY,
 1056                     NBBY * sizeof (uint64_t), eip->zei_histogram_set,
 1057                     FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
 1058                     DATA_TYPE_UINT32_ARRAY,
 1059                     NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
 1060                     NULL);
 1061         }
 1062         return (eip);
 1063 }
 1064 #else
 1065 void
 1066 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
 1067 {
 1068         (void) spa, (void) vd;
 1069 }
 1070 #endif
 1071 
 1072 /*
 1073  * Make sure our event is still valid for the given zio/vdev/pool.  For example,
 1074  * we don't want to keep logging events for a faulted or missing vdev.
 1075  */
 1076 boolean_t
 1077 zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
 1078 {
 1079 #ifdef _KERNEL
 1080         /*
 1081          * If we are doing a spa_tryimport() or in recovery mode,
 1082          * ignore errors.
 1083          */
 1084         if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
 1085             spa_load_state(spa) == SPA_LOAD_RECOVER)
 1086                 return (B_FALSE);
 1087 
 1088         /*
 1089          * If we are in the middle of opening a pool, and the previous attempt
 1090          * failed, don't bother logging any new ereports - we're just going to
 1091          * get the same diagnosis anyway.
 1092          */
 1093         if (spa_load_state(spa) != SPA_LOAD_NONE &&
 1094             spa->spa_last_open_failed)
 1095                 return (B_FALSE);
 1096 
 1097         if (zio != NULL) {
 1098                 /*
 1099                  * If this is not a read or write zio, ignore the error.  This
 1100                  * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
 1101                  */
 1102                 if (zio->io_type != ZIO_TYPE_READ &&
 1103                     zio->io_type != ZIO_TYPE_WRITE)
 1104                         return (B_FALSE);
 1105 
 1106                 if (vd != NULL) {
 1107                         /*
 1108                          * If the vdev has already been marked as failing due
 1109                          * to a failed probe, then ignore any subsequent I/O
 1110                          * errors, as the DE will automatically fault the vdev
 1111                          * on the first such failure.  This also catches cases
 1112                          * where vdev_remove_wanted is set and the device has
 1113                          * not yet been asynchronously placed into the REMOVED
 1114                          * state.
 1115                          */
 1116                         if (zio->io_vd == vd && !vdev_accessible(vd, zio))
 1117                                 return (B_FALSE);
 1118 
 1119                         /*
 1120                          * Ignore checksum errors for reads from DTL regions of
 1121                          * leaf vdevs.
 1122                          */
 1123                         if (zio->io_type == ZIO_TYPE_READ &&
 1124                             zio->io_error == ECKSUM &&
 1125                             vd->vdev_ops->vdev_op_leaf &&
 1126                             vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
 1127                                 return (B_FALSE);
 1128                 }
 1129         }
 1130 
 1131         /*
 1132          * For probe failure, we want to avoid posting ereports if we've
 1133          * already removed the device in the meantime.
 1134          */
 1135         if (vd != NULL &&
 1136             strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
 1137             (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
 1138                 return (B_FALSE);
 1139 
 1140         /* Ignore bogus delay events (like from ioctls or unqueued IOs) */
 1141         if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
 1142             (zio != NULL) && (!zio->io_timestamp)) {
 1143                 return (B_FALSE);
 1144         }
 1145 #else
 1146         (void) subclass, (void) spa, (void) vd, (void) zio;
 1147 #endif
 1148         return (B_TRUE);
 1149 }
 1150 
 1151 /*
 1152  * Post an ereport for the given subclass
 1153  *
 1154  * Returns
 1155  * - 0 if an event was posted
 1156  * - EINVAL if there was a problem posting the event
 1157  * - EBUSY if the event was rate limited
 1158  * - EALREADY if the event was already posted (duplicate)
 1159  */
 1160 int
 1161 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
 1162     const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
 1163 {
 1164         int rc = 0;
 1165 #ifdef _KERNEL
 1166         nvlist_t *ereport = NULL;
 1167         nvlist_t *detector = NULL;
 1168 
 1169         if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
 1170                 return (EINVAL);
 1171 
 1172         if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
 1173                 return (SET_ERROR(EALREADY));
 1174 
 1175         if (zfs_is_ratelimiting_event(subclass, vd))
 1176                 return (SET_ERROR(EBUSY));
 1177 
 1178         if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
 1179             zb, zio, state, 0))
 1180                 return (SET_ERROR(EINVAL));     /* couldn't post event */
 1181 
 1182         if (ereport == NULL)
 1183                 return (SET_ERROR(EINVAL));
 1184 
 1185         /* Cleanup is handled by the callback function */
 1186         rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 1187 #else
 1188         (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio,
 1189             (void) state;
 1190 #endif
 1191         return (rc);
 1192 }
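/*
 * Editor's aside: the call sites live outside this file (e.g. in the
 * zio pipeline and vdev code); a typical invocation looks roughly like
 *
 *      (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
 *          zio->io_vd, &zio->io_bookmark, zio, 0);
 *
 * where a zero return means the event reached the zevent queue, and
 * EBUSY/EALREADY report the rate limiting and duplicate suppression
 * described above.
 */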
 1193 
 1194 /*
 1195  * Prepare a checksum ereport
 1196  *
 1197  * Returns
 1198  * - 0 if the report was prepared
 1199  * - EINVAL if there was a problem posting the event
 1200  * - EBUSY if the event was rate limited
 1201  * - EALREADY if the event was already posted (duplicate)
 1202  */
 1203 int
 1204 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
 1205     struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info)
 1206 {
 1207         zio_cksum_report_t *report;
 1208 
 1209 #ifdef _KERNEL
 1210         if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
 1211                 return (SET_ERROR(EINVAL));
 1212 
 1213         if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
 1214             offset, length))
 1215                 return (SET_ERROR(EALREADY));
 1216 
 1217         if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
 1218                 return (SET_ERROR(EBUSY));
 1219 #else
 1220         (void) zb, (void) offset;
 1221 #endif
 1222 
 1223         report = kmem_zalloc(sizeof (*report), KM_SLEEP);
 1224 
 1225         zio_vsd_default_cksum_report(zio, report);
 1226 
 1227         /* copy the checksum failure information if it was provided */
 1228         if (info != NULL) {
 1229                 report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
 1230                 memcpy(report->zcr_ckinfo, info, sizeof (*info));
 1231         }
 1232 
 1233         report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
 1234         report->zcr_align =
 1235             vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
 1236         report->zcr_length = length;
 1237 
 1238 #ifdef _KERNEL
 1239         (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
 1240             FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
 1241 
 1242         if (report->zcr_ereport == NULL) {
 1243                 zfs_ereport_free_checksum(report);
 1244                 return (0);
 1245         }
 1246 #endif
 1247 
 1248         mutex_enter(&spa->spa_errlist_lock);
 1249         report->zcr_next = zio->io_logical->io_cksum_report;
 1250         zio->io_logical->io_cksum_report = report;
 1251         mutex_exit(&spa->spa_errlist_lock);
 1252         return (0);
 1253 }
 1254 
 1255 void
 1256 zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
 1257     const abd_t *bad_data, boolean_t drop_if_identical)
 1258 {
 1259 #ifdef _KERNEL
 1260         zfs_ecksum_info_t *info;
 1261 
 1262         info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
 1263             good_data, bad_data, report->zcr_length, drop_if_identical);
 1264         if (info != NULL)
 1265                 zfs_zevent_post(report->zcr_ereport,
 1266                     report->zcr_detector, zfs_zevent_post_cb);
 1267         else
 1268                 zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);
 1269 
 1270         report->zcr_ereport = report->zcr_detector = NULL;
 1271         if (info != NULL)
 1272                 kmem_free(info, sizeof (*info));
 1273 #else
 1274         (void) report, (void) good_data, (void) bad_data,
 1275             (void) drop_if_identical;
 1276 #endif
 1277 }
 1278 
 1279 void
 1280 zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
 1281 {
 1282 #ifdef _KERNEL
 1283         if (rpt->zcr_ereport != NULL) {
 1284                 fm_nvlist_destroy(rpt->zcr_ereport,
 1285                     FM_NVA_FREE);
 1286                 fm_nvlist_destroy(rpt->zcr_detector,
 1287                     FM_NVA_FREE);
 1288         }
 1289 #endif
 1290         rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
 1291 
 1292         if (rpt->zcr_ckinfo != NULL)
 1293                 kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
 1294 
 1295         kmem_free(rpt, sizeof (*rpt));
 1296 }
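/*
 * Editor's aside: the three functions above implement the deferred
 * checksum-ereport lifecycle sketched in the header comment.  In rough,
 * hedged outline (the real call sites are in the zio pipeline; 'report'
 * stands for an entry later retrieved from io_logical->io_cksum_report):
 *
 *      zio_bad_cksum_t zbc;
 *      ...checksum verification fails and fills in zbc...
 *      (void) zfs_ereport_start_checksum(spa, vd, &zio->io_bookmark,
 *          zio, offset, length, &zbc);
 *      ...when the logical I/O completes with buffers in hand...
 *      zfs_ereport_finish_checksum(report, good_abd, bad_abd, B_TRUE);
 *      ...or, if no good/bad buffers ever materialize...
 *      zfs_ereport_free_checksum(report);
 */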
 1297 
 1298 /*
 1299  * Post a checksum ereport
 1300  *
 1301  * Returns
 1302  * - 0 if an event was posted
 1303  * - EINVAL if there was a problem posting the event
 1304  * - EBUSY if the event was rate limited
 1305  * - EALREADY if the event was already posted (duplicate)
 1306  */
 1307 int
 1308 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
 1309     struct zio *zio, uint64_t offset, uint64_t length,
 1310     const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
 1311 {
 1312         int rc = 0;
 1313 #ifdef _KERNEL
 1314         nvlist_t *ereport = NULL;
 1315         nvlist_t *detector = NULL;
 1316         zfs_ecksum_info_t *info;
 1317 
 1318         if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
 1319                 return (SET_ERROR(EINVAL));
 1320 
 1321         if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
 1322             offset, length))
 1323                 return (SET_ERROR(EALREADY));
 1324 
 1325         if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
 1326                 return (SET_ERROR(EBUSY));
 1327 
 1328         if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
 1329             spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
 1330                 return (SET_ERROR(EINVAL));
 1331         }
 1332 
 1333         info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
 1334             B_FALSE);
 1335 
 1336         if (info != NULL) {
 1337                 rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 1338                 kmem_free(info, sizeof (*info));
 1339         }
 1340 #else
 1341         (void) spa, (void) vd, (void) zb, (void) zio, (void) offset,
 1342             (void) length, (void) good_data, (void) bad_data, (void) zbc;
 1343 #endif
 1344         return (rc);
 1345 }
 1346 
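/*
 * Minimal caller-side sketch (hypothetical helper): distinguishing the
 * return values documented above.  Only 0 means an event reached the
 * zevent queue; the three error cases are expected outcomes rather
 * than failures the caller must act on.
 */
static void
example_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
    zio_t *zio, const abd_t *good, const abd_t *bad, zio_bad_cksum_t *zbc)
{
        switch (zfs_ereport_post_checksum(spa, vd, zb, zio,
            zio->io_offset, zio->io_size, good, bad, zbc)) {
        case 0:         /* event posted */
                break;
        case EALREADY:  /* duplicate of a recently posted event */
        case EBUSY:     /* rate limited */
        case EINVAL:    /* invalid in this context, or post failed */
                break;
        }
}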
 1347 /*
 1348  * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of
 1349  * a change in the pool.  All sysevents are listed in sys/sysevent/eventdefs.h
 1350  * and are designed to be consumed by the ZFS Event Daemon (ZED).  For
 1351  * additional details refer to the zed(8) man page.
 1352  */
 1353 nvlist_t *
 1354 zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
 1355     nvlist_t *aux)
 1356 {
 1357         nvlist_t *resource = NULL;
 1358 #ifdef _KERNEL
 1359         char class[64];
 1360 
 1361         if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
 1362                 return (NULL);
 1363 
 1364         if ((resource = fm_nvlist_create(NULL)) == NULL)
 1365                 return (NULL);
 1366 
 1367         (void) snprintf(class, sizeof (class), "%s.%s.%s", type,
 1368             ZFS_ERROR_CLASS, name);
 1369         VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
 1370         VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
 1371         VERIFY0(nvlist_add_string(resource,
 1372             FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
 1373         VERIFY0(nvlist_add_uint64(resource,
 1374             FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
 1375         VERIFY0(nvlist_add_uint64(resource,
 1376             FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
 1377         VERIFY0(nvlist_add_int32(resource,
 1378             FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
 1379 
 1380         if (vd) {
 1381                 VERIFY0(nvlist_add_uint64(resource,
 1382                     FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
 1383                 VERIFY0(nvlist_add_uint64(resource,
 1384                     FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
 1385                 if (vd->vdev_path != NULL)
 1386                         VERIFY0(nvlist_add_string(resource,
 1387                             FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
 1388                 if (vd->vdev_devid != NULL)
 1389                         VERIFY0(nvlist_add_string(resource,
 1390                             FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
 1391                 if (vd->vdev_fru != NULL)
 1392                         VERIFY0(nvlist_add_string(resource,
 1393                             FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
 1394                 if (vd->vdev_enc_sysfs_path != NULL)
 1395                         VERIFY0(nvlist_add_string(resource,
 1396                             FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 1397                             vd->vdev_enc_sysfs_path));
 1398         }
 1399 
 1400         /* also copy any optional payload data */
 1401         if (aux) {
 1402                 nvpair_t *elem = NULL;
 1403 
 1404                 while ((elem = nvlist_next_nvpair(aux, elem)) != NULL)
 1405                         (void) nvlist_add_nvpair(resource, elem);
 1406         }
 1407 #else
 1408         (void) spa, (void) vd, (void) type, (void) name, (void) aux;
 1409 #endif
 1410         return (resource);
 1411 }
 1412 
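/*
 * Annotation (assuming the conventional macro values, where
 * FM_RSRC_CLASS is "resource" and ZFS_ERROR_CLASS is "fs.zfs"): the
 * snprintf() above composes the full event class, so a call such as
 *
 *      zfs_event_create(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL);
 *
 * produces the class "resource.fs.zfs.removed" that ZED's zedlets and
 * `zpool events -v` report.
 */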
 1413 static void
 1414 zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
 1415     nvlist_t *aux)
 1416 {
 1417 #ifdef _KERNEL
 1418         nvlist_t *resource;
 1419 
 1420         resource = zfs_event_create(spa, vd, type, name, aux);
 1421         if (resource)
 1422                 zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);
 1423 #else
 1424         (void) spa, (void) vd, (void) type, (void) name, (void) aux;
 1425 #endif
 1426 }
 1427 
 1428 /*
 1429  * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
 1430  * has been removed from the system.  This will cause the DE (the FMA
 1431  * diagnosis engine) to ignore any recent I/O errors, inferring that they are
 1432  * due to the asynchronous device removal.
 1433  */
 1434 void
 1435 zfs_post_remove(spa_t *spa, vdev_t *vd)
 1436 {
 1437         zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL);
 1438 }
 1439 
 1440 /*
 1441  * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
 1442  * has the 'autoreplace' property set, and therefore any broken vdevs will be
 1443  * handled by higher level logic, and no vdev fault should be generated.
 1444  */
 1445 void
 1446 zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
 1447 {
 1448         zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL);
 1449 }
 1450 
 1451 /*
 1452  * The 'resource.fs.zfs.statechange' event is an internal signal that the
 1453  * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
 1454  * cause the retire agent to repair any outstanding fault management cases
 1455  * open because the device was not found (fault.fs.zfs.device).
 1456  */
 1457 void
 1458 zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
 1459 {
 1460 #ifdef _KERNEL
 1461         nvlist_t *aux;
 1462 
 1463         /*
 1464          * Add optional supplemental keys to payload
 1465          */
 1466         aux = fm_nvlist_create(NULL);
 1467         if (vd && aux) {
 1468                 if (vd->vdev_physpath) {
 1469                         fnvlist_add_string(aux,
 1470                             FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
 1471                             vd->vdev_physpath);
 1472                 }
 1473                 if (vd->vdev_enc_sysfs_path) {
 1474                         fnvlist_add_string(aux,
 1475                             FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 1476                             vd->vdev_enc_sysfs_path);
 1477                 }
 1478 
 1479                 fnvlist_add_uint64(aux,
 1480                     FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
 1481         }
 1482 
 1483         zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
 1484             aux);
 1485 
 1486         if (aux)
 1487                 fm_nvlist_destroy(aux, FM_NVA_FREE);
 1488 #else
 1489         (void) spa, (void) vd, (void) laststate;
 1490 #endif
 1491 }
 1492 
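/*
 * Usage sketch (hypothetical call site): when a vdev recovers from a
 * DEGRADED state, the previous state is passed explicitly since
 * vd->vdev_state already holds the new one:
 *
 *      zfs_post_state_change(spa, vd, VDEV_STATE_DEGRADED);
 */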
 1493 #ifdef _KERNEL
 1494 void
 1495 zfs_ereport_init(void)
 1496 {
 1497         mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
 1498         list_create(&recent_events_list, sizeof (recent_events_node_t),
 1499             offsetof(recent_events_node_t, re_list_link));
 1500         avl_create(&recent_events_tree, recent_events_compare,
 1501             sizeof (recent_events_node_t), offsetof(recent_events_node_t,
 1502             re_tree_link));
 1503 }
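/*
 * Annotation: each recent_events_node_t is linked into both structures
 * at once.  The AVL tree gives logarithmic duplicate lookups for the
 * zfs_ereport_is_duplicate() checks used earlier in this file, while
 * the list preserves insertion order so expired entries can be reaped
 * from the head without walking the tree.
 */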
 1504 
 1505 /*
 1506  * This 'early' fini needs to run before zfs_fini() which on Linux waits
 1507  * for the system_delay_taskq to drain.
 1508  */
 1509 void
 1510 zfs_ereport_taskq_fini(void)
 1511 {
 1512         mutex_enter(&recent_events_lock);
 1513         if (recent_events_cleaner_tqid != 0) {
 1514                 taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
 1515                 recent_events_cleaner_tqid = 0;
 1516         }
 1517         mutex_exit(&recent_events_lock);
 1518 }
 1519 
 1520 void
 1521 zfs_ereport_fini(void)
 1522 {
 1523         recent_events_node_t *entry;
 1524 
 1525         while ((entry = list_head(&recent_events_list)) != NULL) {
 1526                 avl_remove(&recent_events_tree, entry);
 1527                 list_remove(&recent_events_list, entry);
 1528                 kmem_free(entry, sizeof (*entry));
 1529         }
 1530         avl_destroy(&recent_events_tree);
 1531         list_destroy(&recent_events_list);
 1532         mutex_destroy(&recent_events_lock);
 1533 }
 1534 
 1535 void
 1536 zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name)
 1537 {
 1538         nvlist_t *aux;
 1539 
 1540         aux = fm_nvlist_create(NULL);
 1541         fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name);
 1542 
 1543         zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
 1544         fm_nvlist_destroy(aux, FM_NVA_FREE);
 1545 }
 1546 
 1547 /*
 1548  * Post an event when a zvol is created or removed
 1549  *
 1550  * This is currently only used by macOS, since it uses the event to create
 1551  * symlinks between the volume name (mypool/myvol) and the actual /dev
 1552  * device (/dev/disk3).  For example:
 1553  *
 1554  * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3
 1555  *
 1556  * name: The full name of the zvol ("mypool/myvol")
 1557  * dev_name: The full /dev name for the zvol ("/dev/disk3")
 1558  * raw_name: The raw /dev name for the zvol ("/dev/rdisk3")
 1559  */
 1560 void
 1561 zfs_ereport_zvol_post(const char *subclass, const char *name,
 1562     const char *dev_name, const char *raw_name)
 1563 {
 1564         nvlist_t *aux;
 1565         char *r;
 1566 
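        /*
         * Resolve the pool name under spa_namespace_lock, taking the
         * lock only if the caller does not already hold it.
         */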
 1567         boolean_t locked = mutex_owned(&spa_namespace_lock);
 1568         if (!locked) mutex_enter(&spa_namespace_lock);
 1569         spa_t *spa = spa_lookup(name);
 1570         if (!locked) mutex_exit(&spa_namespace_lock);
 1571 
 1572         if (spa == NULL)
 1573                 return;
 1574 
 1575         aux = fm_nvlist_create(NULL);
 1576         fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name);
 1577         fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME,
 1578             raw_name);
 1579         r = strchr(name, '/');
 1580         if (r && r[1])
 1581                 fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]);
 1582 
 1583         zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
 1584         fm_nvlist_destroy(aux, FM_NVA_FREE);
 1585 }
 1586 
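/*
 * Annotation: the strchr() above strips the pool component, so for a
 * name of "mypool/myvol" the payload gains
 * FM_EREPORT_PAYLOAD_ZFS_VOLUME = "myvol"; a bare pool name, or one
 * with a trailing '/', adds no volume entry.
 */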
 1587 EXPORT_SYMBOL(zfs_ereport_post);
 1588 EXPORT_SYMBOL(zfs_ereport_is_valid);
 1589 EXPORT_SYMBOL(zfs_ereport_post_checksum);
 1590 EXPORT_SYMBOL(zfs_post_remove);
 1591 EXPORT_SYMBOL(zfs_post_autoreplace);
 1592 EXPORT_SYMBOL(zfs_post_state_change);
 1593 
 1594 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
 1595         "Maximum recent zevents records to retain for duplicate checking");
 1596 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
 1597         "Expiration time for recent zevents records");
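/*
 * Usage note (Linux-specific assumption about the module-parameter
 * plumbing): these tunables typically surface as
 *
 *      /sys/module/zfs/parameters/zfs_zevent_retain_max
 *      /sys/module/zfs/parameters/zfs_zevent_retain_expire_secs
 *
 * and bound how much duplicate-suppression state the code above keeps
 * and for how long.
 */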
 1598 #endif /* _KERNEL */
