FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c
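zfs_diagnosis.c implements the ZED "ZFS Diagnosis Engine" fmd module: it receives ZFS ereports and resource events, tracks one case per pool or vdev, feeds I/O and checksum errors into SERD engines, and solves faults such as fault.fs.zfs.pool, fault.fs.zfs.device, fault.fs.zfs.vdev.io, and fault.fs.zfs.vdev.checksum.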

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 
   22 /*
   23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
   24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
   25  * Copyright (c) 2016, Intel Corporation.
   26  */
   27 
   28 #include <stddef.h>
   29 #include <string.h>
   30 #include <libuutil.h>
   31 #include <libzfs.h>
   32 #include <sys/types.h>
   33 #include <sys/time.h>
   34 #include <sys/fs/zfs.h>
   35 #include <sys/fm/protocol.h>
   36 #include <sys/fm/fs/zfs.h>
   37 #include <sys/zio.h>
   38 
   39 #include "zfs_agents.h"
   40 #include "fmd_api.h"
   41 
   42 /*
   43  * Default values for the serd engine when processing checksum or io errors. The
   44  * semantics are N <events> in T <seconds>.
   45  */
   46 #define DEFAULT_CHECKSUM_N      10      /* events */
   47 #define DEFAULT_CHECKSUM_T      600     /* seconds */
   48 #define DEFAULT_IO_N            10      /* events */
   49 #define DEFAULT_IO_T            600     /* seconds */
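
       /*
        * With the defaults above, for example, the "io" SERD engine fires
        * once ten I/O error ereports arrive within a 600-second window.
        * Both N and T can be overridden per vdev by the
        * FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N/T and _CKSUM_N/T ereport payload
        * members handled in zfs_fm_recv() below.
        */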
   50 
   51 /*
   52  * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This
   53  * #define reserves enough space for two 64-bit hex values plus the length of
   54  * the longest string.
   55  */
   56 #define MAX_SERDLEN     (16 * 2 + sizeof ("zfs___checksum"))
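
       /*
        * For example, the longest name is "zfs_<16 hex>_<16 hex>_checksum":
        * 32 hex digits plus sizeof ("zfs___checksum") (15 bytes, including
        * the terminating NUL) gives the 47 bytes reserved above.
        */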
   57 
   58 /*
   59  * On-disk case structure.  This must maintain backwards compatibility with
   60  * previous versions of the DE.  By default, any members appended to the end
   61  * will be filled with zeros if they don't exist in a previous version.
   62  */
   63 typedef struct zfs_case_data {
   64         uint64_t        zc_version;
   65         uint64_t        zc_ena;
   66         uint64_t        zc_pool_guid;
   67         uint64_t        zc_vdev_guid;
   68         int             zc_pool_state;
   69         char            zc_serd_checksum[MAX_SERDLEN];
   70         char            zc_serd_io[MAX_SERDLEN];
   71         int             zc_has_remove_timer;
   72 } zfs_case_data_t;
   73 
   74 /*
   75  * Time-of-day
   76  */
   77 typedef struct er_timeval {
   78         uint64_t        ertv_sec;
   79         uint64_t        ertv_nsec;
   80 } er_timeval_t;
   81 
   82 /*
   83  * In-core case structure.
   84  */
   85 typedef struct zfs_case {
   86         boolean_t       zc_present;
   87         uint32_t        zc_version;
   88         zfs_case_data_t zc_data;
   89         fmd_case_t      *zc_case;
   90         uu_list_node_t  zc_node;
   91         id_t            zc_remove_timer;
   92         char            *zc_fru;
   93         er_timeval_t    zc_when;
   94 } zfs_case_t;
   95 
   96 #define CASE_DATA                       "data"
   97 #define CASE_FRU                        "fru"
   98 #define CASE_DATA_VERSION_INITIAL       1
   99 #define CASE_DATA_VERSION_SERD          2
  100 
  101 typedef struct zfs_de_stats {
  102         fmd_stat_t      old_drops;
  103         fmd_stat_t      dev_drops;
  104         fmd_stat_t      vdev_drops;
  105         fmd_stat_t      import_drops;
  106         fmd_stat_t      resource_drops;
  107 } zfs_de_stats_t;
  108 
  109 zfs_de_stats_t zfs_stats = {
  110         { "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
  111         { "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"},
  112         { "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"},
  113         { "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
  114         { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
  115 };
  116 
  117 static hrtime_t zfs_remove_timeout;
  118 
  119 uu_list_pool_t *zfs_case_pool;
  120 uu_list_t *zfs_cases;
  121 
  122 #define ZFS_MAKE_RSRC(type)     \
  123     FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
  124 #define ZFS_MAKE_EREPORT(type)  \
  125     FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
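
       /*
        * For example, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO) expands to
        * "ereport.fs.zfs.io" and ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED) expands
        * to "resource.fs.zfs.removed".
        */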
  126 
  127 /*
  128  * Write out the persistent representation of an active case.
  129  */
  130 static void
  131 zfs_case_serialize(zfs_case_t *zcp)
  132 {
  133         zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
  134 }
  135 
  136 /*
  137  * Read back the persistent representation of an active case.
  138  */
  139 static zfs_case_t *
  140 zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
  141 {
  142         zfs_case_t *zcp;
  143 
  144         zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
  145         zcp->zc_case = cp;
  146 
  147         fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
  148             sizeof (zcp->zc_data));
  149 
  150         if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
  151                 fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
  152                 return (NULL);
  153         }
  154 
  155         /*
  156          * fmd_buf_read() will have already zeroed out the remainder of the
  157          * buffer, so we don't have to do anything special if the version
  158          * doesn't include the SERD engine name.
  159          */
  160 
  161         if (zcp->zc_data.zc_has_remove_timer)
  162                 zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
  163                     NULL, zfs_remove_timeout);
  164 
  165         uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
  166         (void) uu_list_insert_before(zfs_cases, NULL, zcp);
  167 
  168         fmd_case_setspecific(hdl, cp, zcp);
  169 
  170         return (zcp);
  171 }
  172 
  173 /*
   174  * Mark any active cases associated with this (pool, vdev) pair, recursing
   175  * into child, cache, and spare vdevs; unmarked cases are later purged.
  176  */
  177 static void
  178 zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
  179 {
  180         uint64_t vdev_guid = 0;
  181         uint_t c, children;
  182         nvlist_t **child;
  183         zfs_case_t *zcp;
  184 
  185         (void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);
  186 
  187         /*
  188          * Mark any cases associated with this (pool, vdev) pair.
  189          */
  190         for (zcp = uu_list_first(zfs_cases); zcp != NULL;
  191             zcp = uu_list_next(zfs_cases, zcp)) {
  192                 if (zcp->zc_data.zc_pool_guid == pool_guid &&
  193                     zcp->zc_data.zc_vdev_guid == vdev_guid) {
  194                         zcp->zc_present = B_TRUE;
  195                         zcp->zc_when = *loaded;
  196                 }
  197         }
  198 
  199         /*
  200          * Iterate over all children.
  201          */
  202         if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
  203             &children) == 0) {
  204                 for (c = 0; c < children; c++)
  205                         zfs_mark_vdev(pool_guid, child[c], loaded);
  206         }
  207 
  208         if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
  209             &children) == 0) {
  210                 for (c = 0; c < children; c++)
  211                         zfs_mark_vdev(pool_guid, child[c], loaded);
  212         }
  213 
  214         if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
  215             &children) == 0) {
  216                 for (c = 0; c < children; c++)
  217                         zfs_mark_vdev(pool_guid, child[c], loaded);
  218         }
  219 }
  220 
  221 static int
  222 zfs_mark_pool(zpool_handle_t *zhp, void *unused)
  223 {
  224         (void) unused;
  225         zfs_case_t *zcp;
  226         uint64_t pool_guid;
  227         uint64_t *tod;
  228         er_timeval_t loaded = { 0 };
  229         nvlist_t *config, *vd;
  230         uint_t nelem = 0;
  231         int ret;
  232 
  233         pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
  234         /*
  235          * Mark any cases associated with just this pool.
  236          */
  237         for (zcp = uu_list_first(zfs_cases); zcp != NULL;
  238             zcp = uu_list_next(zfs_cases, zcp)) {
  239                 if (zcp->zc_data.zc_pool_guid == pool_guid &&
  240                     zcp->zc_data.zc_vdev_guid == 0)
  241                         zcp->zc_present = B_TRUE;
  242         }
  243 
  244         if ((config = zpool_get_config(zhp, NULL)) == NULL) {
  245                 zpool_close(zhp);
  246                 return (-1);
  247         }
  248 
  249         (void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
  250             &tod, &nelem);
  251         if (nelem == 2) {
  252                 loaded.ertv_sec = tod[0];
  253                 loaded.ertv_nsec = tod[1];
  254                 for (zcp = uu_list_first(zfs_cases); zcp != NULL;
  255                     zcp = uu_list_next(zfs_cases, zcp)) {
  256                         if (zcp->zc_data.zc_pool_guid == pool_guid &&
  257                             zcp->zc_data.zc_vdev_guid == 0) {
  258                                 zcp->zc_when = loaded;
  259                         }
  260                 }
  261         }
  262 
  263         ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
  264         if (ret) {
  265                 zpool_close(zhp);
  266                 return (-1);
  267         }
  268 
  269         zfs_mark_vdev(pool_guid, vd, &loaded);
  270 
  271         zpool_close(zhp);
  272 
  273         return (0);
  274 }
  275 
  276 struct load_time_arg {
  277         uint64_t lt_guid;
  278         er_timeval_t *lt_time;
  279         boolean_t lt_found;
  280 };
  281 
  282 static int
  283 zpool_find_load_time(zpool_handle_t *zhp, void *arg)
  284 {
  285         struct load_time_arg *lta = arg;
  286         uint64_t pool_guid;
  287         uint64_t *tod;
  288         nvlist_t *config;
  289         uint_t nelem;
  290 
  291         if (lta->lt_found) {
  292                 zpool_close(zhp);
  293                 return (0);
  294         }
  295 
  296         pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
  297         if (pool_guid != lta->lt_guid) {
  298                 zpool_close(zhp);
  299                 return (0);
  300         }
  301 
  302         if ((config = zpool_get_config(zhp, NULL)) == NULL) {
  303                 zpool_close(zhp);
  304                 return (-1);
  305         }
  306 
  307         if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
  308             &tod, &nelem) == 0 && nelem == 2) {
  309                 lta->lt_found = B_TRUE;
  310                 lta->lt_time->ertv_sec = tod[0];
  311                 lta->lt_time->ertv_nsec = tod[1];
  312         }
  313 
  314         zpool_close(zhp);
  315 
  316         return (0);
  317 }
  318 
  319 static void
  320 zfs_purge_cases(fmd_hdl_t *hdl)
  321 {
  322         zfs_case_t *zcp;
  323         uu_list_walk_t *walk;
  324         libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
  325 
  326         /*
  327          * There is no way to open a pool by GUID, or lookup a vdev by GUID.  No
  328          * matter what we do, we're going to have to stomach an O(vdevs * cases)
  329          * algorithm.  In reality, both quantities are likely so small that
  330          * neither will matter. Given that iterating over pools is more
  331          * expensive than iterating over the in-memory case list, we opt for a
  332          * 'present' flag in each case that starts off cleared.  We then iterate
  333          * over all pools, marking those that are still present, and removing
  334          * those that aren't found.
  335          *
  336          * Note that we could also construct an FMRI and rely on
  337          * fmd_nvl_fmri_present(), but this would end up doing the same search.
  338          */
  339 
  340         /*
  341          * Mark the cases as not present.
  342          */
  343         for (zcp = uu_list_first(zfs_cases); zcp != NULL;
  344             zcp = uu_list_next(zfs_cases, zcp))
  345                 zcp->zc_present = B_FALSE;
  346 
  347         /*
   348          * Iterate over all pools and mark the pools and vdevs found.  If this
   349          * fails (most probably because we're out of memory), don't close any
   350          * cases, since we can't be sure the 'present' flags are accurate.
  351          */
  352         if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
  353                 return;
  354 
  355         /*
  356          * Remove those cases which were not found.
  357          */
  358         walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
  359         while ((zcp = uu_list_walk_next(walk)) != NULL) {
  360                 if (!zcp->zc_present)
  361                         fmd_case_close(hdl, zcp->zc_case);
  362         }
  363         uu_list_walk_end(walk);
  364 }
  365 
  366 /*
  367  * Construct the name of a serd engine given the pool/vdev GUID and type (io or
  368  * checksum).
  369  */
  370 static void
  371 zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
  372     const char *type)
  373 {
  374         (void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
  375             (long long unsigned int)pool_guid,
  376             (long long unsigned int)vdev_guid, type);
  377 }
  378 
  379 /*
   380  * Solve a given ZFS case.  This constructs the detector and fault from the
   381  * case data, marks the case solved, and cancels any pending removal timer
   382  * associated with the case.
  383  */
  384 static void
  385 zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname)
  386 {
  387         nvlist_t *detector, *fault;
  388         boolean_t serialize;
  389         nvlist_t *fru = NULL;
  390         fmd_hdl_debug(hdl, "solving fault '%s'", faultname);
  391 
  392         /*
  393          * Construct the detector from the case data.  The detector is in the
  394          * ZFS scheme, and is either the pool or the vdev, depending on whether
  395          * this is a vdev or pool fault.
  396          */
  397         detector = fmd_nvl_alloc(hdl, FMD_SLEEP);
  398 
  399         (void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
  400         (void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
  401         (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
  402             zcp->zc_data.zc_pool_guid);
  403         if (zcp->zc_data.zc_vdev_guid != 0) {
  404                 (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
  405                     zcp->zc_data.zc_vdev_guid);
  406         }
  407 
  408         fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
  409             fru, detector);
  410         fmd_case_add_suspect(hdl, zcp->zc_case, fault);
  411 
  412         nvlist_free(fru);
  413 
  414         fmd_case_solve(hdl, zcp->zc_case);
  415 
  416         serialize = B_FALSE;
  417         if (zcp->zc_data.zc_has_remove_timer) {
  418                 fmd_timer_remove(hdl, zcp->zc_remove_timer);
  419                 zcp->zc_data.zc_has_remove_timer = 0;
  420                 serialize = B_TRUE;
  421         }
  422         if (serialize)
  423                 zfs_case_serialize(zcp);
  424 
  425         nvlist_free(detector);
  426 }
  427 
  428 static boolean_t
  429 timeval_earlier(er_timeval_t *a, er_timeval_t *b)
  430 {
  431         return (a->ertv_sec < b->ertv_sec ||
  432             (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
  433 }
  434 
  435 static void
  436 zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
  437 {
  438         (void) hdl;
  439         int64_t *tod;
  440         uint_t  nelem;
  441 
  442         if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod,
  443             &nelem) == 0 && nelem == 2) {
  444                 when->ertv_sec = tod[0];
  445                 when->ertv_nsec = tod[1];
  446         } else {
  447                 when->ertv_sec = when->ertv_nsec = UINT64_MAX;
  448         }
  449 }
  450 
  451 /*
  452  * Main fmd entry point.
  453  */
  454 static void
  455 zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
  456 {
  457         zfs_case_t *zcp, *dcp;
  458         int32_t pool_state;
  459         uint64_t ena, pool_guid, vdev_guid;
  460         uint64_t checksum_n, checksum_t;
  461         uint64_t io_n, io_t;
  462         er_timeval_t pool_load;
  463         er_timeval_t er_when;
  464         nvlist_t *detector;
  465         boolean_t pool_found = B_FALSE;
  466         boolean_t isresource;
  467         char *type;
  468 
  469         /*
   470          * We subscribe to notifications for vdev or pool removal.  When one
   471          * arrives, some open cases may no longer apply, so purge any cases
   472          * that have become orphaned.
  473          */
  474         if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
  475                 fmd_hdl_debug(hdl, "purging orphaned cases from %s",
  476                     strrchr(class, '.') + 1);
  477                 zfs_purge_cases(hdl);
  478                 zfs_stats.resource_drops.fmds_value.ui64++;
  479                 return;
  480         }
  481 
  482         isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");
  483 
  484         if (isresource) {
  485                 /*
  486                  * For resources, we don't have a normal payload.
  487                  */
  488                 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
  489                     &vdev_guid) != 0)
  490                         pool_state = SPA_LOAD_OPEN;
  491                 else
  492                         pool_state = SPA_LOAD_NONE;
  493                 detector = NULL;
  494         } else {
  495                 (void) nvlist_lookup_nvlist(nvl,
  496                     FM_EREPORT_DETECTOR, &detector);
  497                 (void) nvlist_lookup_int32(nvl,
  498                     FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
  499         }
  500 
  501         /*
  502          * We also ignore all ereports generated during an import of a pool,
  503          * since the only possible fault (.pool) would result in import failure,
  504          * and hence no persistent fault.  Some day we may want to do something
  505          * with these ereports, so we continue generating them internally.
  506          */
  507         if (pool_state == SPA_LOAD_IMPORT) {
  508                 zfs_stats.import_drops.fmds_value.ui64++;
  509                 fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
  510                 return;
  511         }
  512 
  513         /*
  514          * Device I/O errors are ignored during pool open.
  515          */
  516         if (pool_state == SPA_LOAD_OPEN &&
  517             (fmd_nvl_class_match(hdl, nvl,
  518             ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
  519             fmd_nvl_class_match(hdl, nvl,
  520             ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
  521             fmd_nvl_class_match(hdl, nvl,
  522             ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
  523                 fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
  524                 zfs_stats.dev_drops.fmds_value.ui64++;
  525                 return;
  526         }
  527 
  528         /*
  529          * We ignore ereports for anything except disks and files.
  530          */
  531         if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
  532             &type) == 0) {
  533                 if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
  534                     strcmp(type, VDEV_TYPE_FILE) != 0) {
  535                         zfs_stats.vdev_drops.fmds_value.ui64++;
  536                         return;
  537                 }
  538         }
  539 
  540         /*
  541          * Determine if this ereport corresponds to an open case.
  542          * Each vdev or pool can have a single case.
  543          */
  544         (void) nvlist_lookup_uint64(nvl,
  545             FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
  546         if (nvlist_lookup_uint64(nvl,
  547             FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
  548                 vdev_guid = 0;
  549         if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
  550                 ena = 0;
  551 
  552         zfs_ereport_when(hdl, nvl, &er_when);
  553 
  554         for (zcp = uu_list_first(zfs_cases); zcp != NULL;
  555             zcp = uu_list_next(zfs_cases, zcp)) {
  556                 if (zcp->zc_data.zc_pool_guid == pool_guid) {
  557                         pool_found = B_TRUE;
  558                         pool_load = zcp->zc_when;
  559                 }
  560                 if (zcp->zc_data.zc_vdev_guid == vdev_guid)
  561                         break;
  562         }
  563 
  564         /*
  565          * Avoid falsely accusing a pool of being faulty.  Do so by
  566          * not replaying ereports that were generated prior to the
  567          * current import.  If the failure that generated them was
  568          * transient because the device was actually removed but we
  569          * didn't receive the normal asynchronous notification, we
  570          * don't want to mark it as faulted and potentially panic. If
  571          * there is still a problem we'd expect not to be able to
  572          * import the pool, or that new ereports will be generated
  573          * once the pool is used.
  574          */
  575         if (pool_found && timeval_earlier(&er_when, &pool_load)) {
  576                 fmd_hdl_debug(hdl, "ignoring pool %llx, "
  577                     "ereport time %lld.%lld, pool load time = %lld.%lld",
  578                     pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
  579                     pool_load.ertv_sec, pool_load.ertv_nsec);
  580                 zfs_stats.old_drops.fmds_value.ui64++;
  581                 return;
  582         }
  583 
  584         if (!pool_found) {
  585                 /*
  586                  * Haven't yet seen this pool, but same situation
  587                  * may apply.
  588                  */
  589                 libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
  590                 struct load_time_arg la;
  591 
  592                 la.lt_guid = pool_guid;
  593                 la.lt_time = &pool_load;
  594                 la.lt_found = B_FALSE;
  595 
  596                 if (zhdl != NULL &&
  597                     zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
  598                     la.lt_found == B_TRUE) {
  599                         pool_found = B_TRUE;
  600 
  601                         if (timeval_earlier(&er_when, &pool_load)) {
  602                                 fmd_hdl_debug(hdl, "ignoring pool %llx, "
  603                                     "ereport time %lld.%lld, "
  604                                     "pool load time = %lld.%lld",
  605                                     pool_guid, er_when.ertv_sec,
  606                                     er_when.ertv_nsec, pool_load.ertv_sec,
  607                                     pool_load.ertv_nsec);
  608                                 zfs_stats.old_drops.fmds_value.ui64++;
  609                                 return;
  610                         }
  611                 }
  612         }
  613 
  614         if (zcp == NULL) {
  615                 fmd_case_t *cs;
  616                 zfs_case_data_t data = { 0 };
  617 
  618                 /*
  619                  * If this is one of our 'fake' resource ereports, and there is
  620                  * no case open, simply discard it.
  621                  */
  622                 if (isresource) {
  623                         zfs_stats.resource_drops.fmds_value.ui64++;
   624                         fmd_hdl_debug(hdl, "discarding '%s' for vdev %llu",
  625                             class, vdev_guid);
  626                         return;
  627                 }
  628 
  629                 /*
  630                  * Skip tracking some ereports
  631                  */
  632                 if (strcmp(class,
  633                     ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
  634                     strcmp(class,
  635                     ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
  636                     strcmp(class,
  637                     ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
  638                         zfs_stats.resource_drops.fmds_value.ui64++;
  639                         return;
  640                 }
  641 
  642                 /*
  643                  * Open a new case.
  644                  */
  645                 cs = fmd_case_open(hdl, NULL);
  646 
  647                 fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
  648                     vdev_guid, class);
  649 
  650                 /*
  651                  * Initialize the case buffer.  To commonize code, we actually
  652                  * create the buffer with existing data, and then call
  653                  * zfs_case_unserialize() to instantiate the in-core structure.
  654                  */
  655                 fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));
  656 
  657                 data.zc_version = CASE_DATA_VERSION_SERD;
  658                 data.zc_ena = ena;
  659                 data.zc_pool_guid = pool_guid;
  660                 data.zc_vdev_guid = vdev_guid;
  661                 data.zc_pool_state = (int)pool_state;
  662 
  663                 fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));
  664 
  665                 zcp = zfs_case_unserialize(hdl, cs);
  666                 assert(zcp != NULL);
  667                 if (pool_found)
  668                         zcp->zc_when = pool_load;
  669         }
  670 
  671         if (isresource) {
  672                 fmd_hdl_debug(hdl, "resource event '%s'", class);
  673 
  674                 if (fmd_nvl_class_match(hdl, nvl,
  675                     ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
  676                         /*
  677                          * The 'resource.fs.zfs.autoreplace' event indicates
  678                          * that the pool was loaded with the 'autoreplace'
  679                          * property set.  In this case, any pending device
  680                          * failures should be ignored, as the asynchronous
  681                          * autoreplace handling will take care of them.
  682                          */
  683                         fmd_case_close(hdl, zcp->zc_case);
  684                 } else if (fmd_nvl_class_match(hdl, nvl,
  685                     ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
  686                         /*
  687                          * The 'resource.fs.zfs.removed' event indicates that
  688                          * device removal was detected, and the device was
  689                          * closed asynchronously.  If this is the case, we
  690                          * assume that any recent I/O errors were due to the
  691                          * device removal, not any fault of the device itself.
  692                          * We reset the SERD engine, and cancel any pending
  693                          * timers.
  694                          */
  695                         if (zcp->zc_data.zc_has_remove_timer) {
  696                                 fmd_timer_remove(hdl, zcp->zc_remove_timer);
  697                                 zcp->zc_data.zc_has_remove_timer = 0;
  698                                 zfs_case_serialize(zcp);
  699                         }
  700                         if (zcp->zc_data.zc_serd_io[0] != '\0')
  701                                 fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
  702                         if (zcp->zc_data.zc_serd_checksum[0] != '\0')
  703                                 fmd_serd_reset(hdl,
  704                                     zcp->zc_data.zc_serd_checksum);
  705                 } else if (fmd_nvl_class_match(hdl, nvl,
  706                     ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
  707                         uint64_t state = 0;
  708 
  709                         if (zcp != NULL &&
  710                             nvlist_lookup_uint64(nvl,
  711                             FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
  712                             state == VDEV_STATE_HEALTHY) {
  713                                 fmd_hdl_debug(hdl, "closing case after a "
  714                                     "device statechange to healthy");
  715                                 fmd_case_close(hdl, zcp->zc_case);
  716                         }
  717                 }
  718                 zfs_stats.resource_drops.fmds_value.ui64++;
  719                 return;
  720         }
  721 
  722         /*
  723          * Associate the ereport with this case.
  724          */
  725         fmd_case_add_ereport(hdl, zcp->zc_case, ep);
  726 
  727         /*
  728          * Don't do anything else if this case is already solved.
  729          */
  730         if (fmd_case_solved(hdl, zcp->zc_case))
  731                 return;
  732 
  733         fmd_hdl_debug(hdl, "error event '%s'", class);
  734 
  735         /*
  736          * Determine if we should solve the case and generate a fault.  We solve
  737          * a case if:
  738          *
  739          *      a. A pool failed to open (ereport.fs.zfs.pool)
   740          *      b. A device failed to open (ereport.fs.zfs.vdev.*) while a pool
  741          *         was up and running.
  742          *
  743          * We may see a series of ereports associated with a pool open, all
  744          * chained together by the same ENA.  If the pool open succeeds, then
  745          * we'll see no further ereports.  To detect when a pool open has
  746          * succeeded, we associate a timer with the event.  When it expires, we
  747          * close the case.
  748          */
  749         if (fmd_nvl_class_match(hdl, nvl,
  750             ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
  751                 /*
  752                  * Pool level fault.  Before solving the case, go through and
  753                  * close any open device cases that may be pending.
  754                  */
  755                 for (dcp = uu_list_first(zfs_cases); dcp != NULL;
  756                     dcp = uu_list_next(zfs_cases, dcp)) {
  757                         if (dcp->zc_data.zc_pool_guid ==
  758                             zcp->zc_data.zc_pool_guid &&
  759                             dcp->zc_data.zc_vdev_guid != 0)
  760                                 fmd_case_close(hdl, dcp->zc_case);
  761                 }
  762 
  763                 zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool");
  764         } else if (fmd_nvl_class_match(hdl, nvl,
  765             ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
  766                 /*
  767                  * Pool level fault for reading the intent logs.
  768                  */
  769                 zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay");
  770         } else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
  771                 /*
  772                  * Device fault.
  773                  */
  774                 zfs_case_solve(hdl, zcp, "fault.fs.zfs.device");
  775         } else if (fmd_nvl_class_match(hdl, nvl,
  776             ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
  777             fmd_nvl_class_match(hdl, nvl,
  778             ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
  779             fmd_nvl_class_match(hdl, nvl,
  780             ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
  781             fmd_nvl_class_match(hdl, nvl,
  782             ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
  783                 char *failmode = NULL;
  784                 boolean_t checkremove = B_FALSE;
  785                 uint32_t pri = 0;
  786                 int32_t flags = 0;
  787 
  788                 /*
  789                  * If this is a checksum or I/O error, then toss it into the
  790                  * appropriate SERD engine and check to see if it has fired.
   791                  * Ideally, we want to do something more sophisticated
   792                  * (persistent errors for a single data block, etc.).  For now,
  793                  * a single SERD engine is sufficient.
  794                  */
  795                 if (fmd_nvl_class_match(hdl, nvl,
  796                     ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
  797                         if (zcp->zc_data.zc_serd_io[0] == '\0') {
  798                                 if (nvlist_lookup_uint64(nvl,
  799                                     FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
  800                                     &io_n) != 0) {
  801                                         io_n = DEFAULT_IO_N;
  802                                 }
  803                                 if (nvlist_lookup_uint64(nvl,
  804                                     FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
  805                                     &io_t) != 0) {
  806                                         io_t = DEFAULT_IO_T;
  807                                 }
  808                                 zfs_serd_name(zcp->zc_data.zc_serd_io,
  809                                     pool_guid, vdev_guid, "io");
  810                                 fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
  811                                     io_n,
  812                                     SEC2NSEC(io_t));
  813                                 zfs_case_serialize(zcp);
  814                         }
  815                         if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
  816                                 checkremove = B_TRUE;
  817                 } else if (fmd_nvl_class_match(hdl, nvl,
  818                     ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
  819                         /*
  820                          * We ignore ereports for checksum errors generated by
  821                          * scrub/resilver I/O to avoid potentially further
  822                          * degrading the pool while it's being repaired.
  823                          */
  824                         if (((nvlist_lookup_uint32(nvl,
  825                             FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
  826                             (pri == ZIO_PRIORITY_SCRUB ||
  827                             pri == ZIO_PRIORITY_REBUILD)) ||
  828                             ((nvlist_lookup_int32(nvl,
  829                             FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
  830                             (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
  831                                 fmd_hdl_debug(hdl, "ignoring '%s' for "
  832                                     "scrub/resilver I/O", class);
  833                                 return;
  834                         }
  835 
  836                         if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
  837                                 if (nvlist_lookup_uint64(nvl,
  838                                     FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
  839                                     &checksum_n) != 0) {
  840                                         checksum_n = DEFAULT_CHECKSUM_N;
  841                                 }
  842                                 if (nvlist_lookup_uint64(nvl,
  843                                     FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
  844                                     &checksum_t) != 0) {
  845                                         checksum_t = DEFAULT_CHECKSUM_T;
  846                                 }
  847 
  848                                 zfs_serd_name(zcp->zc_data.zc_serd_checksum,
  849                                     pool_guid, vdev_guid, "checksum");
  850                                 fmd_serd_create(hdl,
  851                                     zcp->zc_data.zc_serd_checksum,
  852                                     checksum_n,
  853                                     SEC2NSEC(checksum_t));
  854                                 zfs_case_serialize(zcp);
  855                         }
  856                         if (fmd_serd_record(hdl,
  857                             zcp->zc_data.zc_serd_checksum, ep)) {
  858                                 zfs_case_solve(hdl, zcp,
  859                                     "fault.fs.zfs.vdev.checksum");
  860                         }
  861                 } else if (fmd_nvl_class_match(hdl, nvl,
  862                     ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
  863                     (nvlist_lookup_string(nvl,
  864                     FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
  865                     failmode != NULL) {
  866                         if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
  867                             strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
  868                                 zfs_case_solve(hdl, zcp,
  869                                     "fault.fs.zfs.io_failure_continue");
  870                         } else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
  871                             strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
  872                                 zfs_case_solve(hdl, zcp,
  873                                     "fault.fs.zfs.io_failure_wait");
  874                         }
  875                 } else if (fmd_nvl_class_match(hdl, nvl,
  876                     ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
  877 #ifndef __linux__
  878                         /* This causes an unexpected fault diagnosis on linux */
  879                         checkremove = B_TRUE;
  880 #endif
  881                 }
  882 
  883                 /*
  884                  * Because I/O errors may be due to device removal, we postpone
  885                  * any diagnosis until we're sure that we aren't about to
  886                  * receive a 'resource.fs.zfs.removed' event.
  887                  */
  888                 if (checkremove) {
  889                         if (zcp->zc_data.zc_has_remove_timer)
  890                                 fmd_timer_remove(hdl, zcp->zc_remove_timer);
  891                         zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
  892                             zfs_remove_timeout);
  893                         if (!zcp->zc_data.zc_has_remove_timer) {
  894                                 zcp->zc_data.zc_has_remove_timer = 1;
  895                                 zfs_case_serialize(zcp);
  896                         }
  897                 }
  898         }
  899 }
  900 
  901 /*
   902  * This timer fires when we have diagnosed an I/O error and it was not due to
   903  * device removal (which would have caused the timer to be cancelled).
  904  */
  905 static void
  906 zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
  907 {
  908         zfs_case_t *zcp = data;
  909 
  910         if (id == zcp->zc_remove_timer)
  911                 zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io");
  912 }
  913 
  914 /*
  915  * The specified case has been closed and any case-specific
  916  * data structures should be deallocated.
  917  */
  918 static void
  919 zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
  920 {
  921         zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);
  922 
  923         if (zcp->zc_data.zc_serd_checksum[0] != '\0')
  924                 fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
  925         if (zcp->zc_data.zc_serd_io[0] != '\0')
  926                 fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
  927         if (zcp->zc_data.zc_has_remove_timer)
  928                 fmd_timer_remove(hdl, zcp->zc_remove_timer);
  929 
  930         uu_list_remove(zfs_cases, zcp);
  931         uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
  932         fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
  933 }
  934 
  935 /*
  936  * We use the fmd gc entry point to look for old cases that no longer apply.
  937  * This allows us to keep our set of case data small in a long running system.
  938  */
  939 static void
  940 zfs_fm_gc(fmd_hdl_t *hdl)
  941 {
  942         zfs_purge_cases(hdl);
  943 }
  944 
  945 static const fmd_hdl_ops_t fmd_ops = {
  946         zfs_fm_recv,    /* fmdo_recv */
  947         zfs_fm_timeout, /* fmdo_timeout */
  948         zfs_fm_close,   /* fmdo_close */
  949         NULL,           /* fmdo_stats */
  950         zfs_fm_gc,      /* fmdo_gc */
  951 };
  952 
  953 static const fmd_prop_t fmd_props[] = {
  954         { "checksum_N", FMD_TYPE_UINT32, "10" },
  955         { "checksum_T", FMD_TYPE_TIME, "10min" },
  956         { "io_N", FMD_TYPE_UINT32, "10" },
  957         { "io_T", FMD_TYPE_TIME, "10min" },
  958         { "remove_timeout", FMD_TYPE_TIME, "15sec" },
  959         { NULL, 0, NULL }
  960 };
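
       /*
        * Of these module properties, only "remove_timeout" is read back (via
        * fmd_prop_get_int64() in _zfs_diagnosis_init() below); the N/T values
        * used when creating SERD engines come from the ereport payload or the
        * DEFAULT_* constants defined at the top of this file.
        */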
  961 
  962 static const fmd_hdl_info_t fmd_info = {
  963         "ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
  964 };
  965 
  966 void
  967 _zfs_diagnosis_init(fmd_hdl_t *hdl)
  968 {
  969         libzfs_handle_t *zhdl;
  970 
  971         if ((zhdl = libzfs_init()) == NULL)
  972                 return;
  973 
  974         if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
  975             sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
  976             NULL, UU_LIST_POOL_DEBUG)) == NULL) {
  977                 libzfs_fini(zhdl);
  978                 return;
  979         }
  980 
  981         if ((zfs_cases = uu_list_create(zfs_case_pool, NULL,
  982             UU_LIST_DEBUG)) == NULL) {
  983                 uu_list_pool_destroy(zfs_case_pool);
  984                 libzfs_fini(zhdl);
  985                 return;
  986         }
  987 
  988         if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
  989                 uu_list_destroy(zfs_cases);
  990                 uu_list_pool_destroy(zfs_case_pool);
  991                 libzfs_fini(zhdl);
  992                 return;
  993         }
  994 
  995         fmd_hdl_setspecific(hdl, zhdl);
  996 
  997         (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
  998             sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
  999 
 1000         zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
 1001 }
 1002 
 1003 void
 1004 _zfs_diagnosis_fini(fmd_hdl_t *hdl)
 1005 {
 1006         zfs_case_t *zcp;
 1007         uu_list_walk_t *walk;
 1008         libzfs_handle_t *zhdl;
 1009 
 1010         /*
 1011          * Remove all active cases.
 1012          */
 1013         walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
 1014         while ((zcp = uu_list_walk_next(walk)) != NULL) {
 1015                 fmd_hdl_debug(hdl, "removing case ena %llu",
 1016                     (long long unsigned)zcp->zc_data.zc_ena);
 1017                 uu_list_remove(zfs_cases, zcp);
 1018                 uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
 1019                 fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
 1020         }
 1021         uu_list_walk_end(walk);
 1022 
 1023         uu_list_destroy(zfs_cases);
 1024         uu_list_pool_destroy(zfs_case_pool);
 1025 
 1026         zhdl = fmd_hdl_getspecific(hdl);
 1027         libzfs_fini(zhdl);
 1028 }

This page is part of the FreeBSD/Linux Kernel Cross-Reference and was automatically generated using a modified version of the LXR engine.