FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c


    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
   23  * Copyright (c) 2012 by Delphix. All rights reserved.
   24  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
   25  * Copyright (c) 2016, 2017, Intel Corporation.
   26  * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
   27  */
   28 
   29 /*
   30  * ZFS syseventd module.
   31  *
   32  * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
   33  *
   34  * The purpose of this module is to identify when devices are added to the
   35  * system, and appropriately online or replace the affected vdevs.
   36  *
   37  * When a device is added to the system:
   38  *
   39  *      1. Search for any vdevs whose devid matches that of the newly added
   40  *         device.
   41  *
   42  *      2. If no vdevs are found, then search for any vdevs whose udev path
   43  *         matches that of the new device.
   44  *
   45  *      3. If no vdevs match by either method, then ignore the event.
   46  *
   47  *      4. Attempt to online the device with a flag to indicate that it should
   48  *         be unspared when resilvering completes.  If this succeeds, then the
   49  *         same device was inserted and we should continue normally.
   50  *
   51  *      5. If the pool does not have the 'autoreplace' property set, attempt to
   52  *         online the device again without the unspare flag, which will
   53  *         generate a FMA fault.
   54  *
   55  *      6. If the pool has the 'autoreplace' property set, and the matching vdev
   56  *         is a whole disk, then label the new disk and attempt a 'zpool
   57  *         replace'.
   58  *
   59  * The module responds to EC_DEV_ADD events.  The special ESC_ZFS_VDEV_CHECK
   60  * event indicates that a device failed to open during pool load, but the
   61  * autoreplace property was set.  In this case, we deferred the associated
   62  * FMA fault until our module had a chance to process the autoreplace logic.
   63  * If the device could not be replaced, then the second online attempt will
   64  * trigger the FMA fault that we skipped earlier.
   65  *
   66  * On Linux, udev provides a disk insert event for both the disk and the partition.
   67  */
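/*
 * Illustrative sketch of the event payload (the devid and phys_path values
 * are copied from the examples further below; everything else is
 * hypothetical): an EC_DEV_ADD.ESC_DISK event reaches this module as an
 * nvlist whose members zfs_deliver_add() consumes, roughly:
 *
 *      DEV_IDENTIFIER: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1'
 *      DEV_PHYS_PATH:  'pci-0000:04:00.0-sas-0x4433221106000000-lun-0'
 *      DEV_IS_PART:    present only when the event is for a partition
 *      ZFS_EV_POOL_GUID / ZFS_EV_VDEV_GUID: optional pool and vdev GUIDs
 */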
   68 
   69 #include <ctype.h>
   70 #include <fcntl.h>
   71 #include <libnvpair.h>
   72 #include <libzfs.h>
   73 #include <libzutil.h>
   74 #include <limits.h>
   75 #include <stddef.h>
   76 #include <stdlib.h>
   77 #include <string.h>
   78 #include <syslog.h>
   79 #include <sys/list.h>
   80 #include <sys/sunddi.h>
   81 #include <sys/sysevent/eventdefs.h>
   82 #include <sys/sysevent/dev.h>
   83 #include <thread_pool.h>
   84 #include <pthread.h>
   85 #include <unistd.h>
   86 #include <errno.h>
   87 #include "zfs_agents.h"
   88 #include "../zed_log.h"
   89 
   90 #define DEV_BYID_PATH   "/dev/disk/by-id/"
   91 #define DEV_BYPATH_PATH "/dev/disk/by-path/"
   92 #define DEV_BYVDEV_PATH "/dev/disk/by-vdev/"
   93 
   94 typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
   95 
   96 libzfs_handle_t *g_zfshdl;
   97 list_t g_pool_list;     /* list of unavailable pools at initialization */
   98 list_t g_device_list;   /* list of disks with asynchronous label request */
   99 tpool_t *g_tpool;
  100 boolean_t g_enumeration_done;
  101 pthread_t g_zfs_tid;    /* zfs_enum_pools() thread */
  102 
  103 typedef struct unavailpool {
  104         zpool_handle_t  *uap_zhp;
  105         list_node_t     uap_node;
  106 } unavailpool_t;
  107 
  108 typedef struct pendingdev {
  109         char            pd_physpath[128];
  110         list_node_t     pd_node;
  111 } pendingdev_t;
  112 
  113 static int
  114 zfs_toplevel_state(zpool_handle_t *zhp)
  115 {
  116         nvlist_t *nvroot;
  117         vdev_stat_t *vs;
  118         unsigned int c;
  119 
  120         verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
  121             ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
  122         verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
  123             (uint64_t **)&vs, &c) == 0);
  124         return (vs->vs_state);
  125 }
  126 
  127 static int
  128 zfs_unavail_pool(zpool_handle_t *zhp, void *data)
  129 {
  130         zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)",
  131             zpool_get_name(zhp), (int)zfs_toplevel_state(zhp));
  132 
  133         if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
  134                 unavailpool_t *uap;
  135                 uap = malloc(sizeof (unavailpool_t));
  136                 if (uap == NULL) {
  137                         perror("malloc");
  138                         exit(EXIT_FAILURE);
  139                 }
  140 
  141                 uap->uap_zhp = zhp;
  142                 list_insert_tail((list_t *)data, uap);
  143         } else {
  144                 zpool_close(zhp);
  145         }
  146         return (0);
  147 }
  148 
  149 /*
  150  * Two-stage replace on Linux:
  151  * because udev also sends a notification for the new partition,
  152  * we can wait for the partitioned disk slice to show up.
  153  *
  154  * The first stage tags the disk, initiates async partitioning, and returns.
  155  * The second stage finds the tag and proceeds to ZFS labeling/replace.
  156  *
  157  * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach
  158  *
  159  * 1. physical match with no fs, no partition
  160  *      tag it top, partition disk
  161  *
  162  * 2. physical match again, see partition and tag
  163  *
  164  */
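/*
 * Hypothetical walk-through of the two stages above (device names are made
 * up): an EC_DEV_ADD event for a blank disk /dev/sdb matches a faulted vdev
 * by physical path, so zfs_process_add() labels the disk with
 * zpool_label_disk(), records the request on g_device_list, and returns.
 * When the partition appears, a second EC_DEV_ADD arrives for /dev/sdb1; the
 * pending entry is found and removed, and zpool_vdev_attach() performs the
 * actual replace.
 */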
  165 
  166 /*
  167  * The device associated with the given vdev (either by devid or physical path)
  168  * has been added to the system.  If 'isdisk' is set, then we only attempt a
  169  * replacement if it's a whole disk.  This also implies that we should label the
  170  * disk first.
  171  *
  172  * First, we attempt to online the device (making sure to undo any spare
  173  * operation when finished).  If this succeeds, then we're done.  If it fails,
  174  * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
  175  * but that the label was not what we expected.  If the 'autoreplace' property
  176  * is enabled, then we relabel the disk (if specified), and attempt a 'zpool
  177  * replace'.  If the online is successful, but the new state is something else
  178  * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
  179  * race, and we should avoid attempting to relabel the disk.
  180  *
  181  * We can also arrive here from an ESC_ZFS_VDEV_CHECK event.
  182  */
  183 static void
  184 zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
  185 {
  186         char *path;
  187         vdev_state_t newstate;
  188         nvlist_t *nvroot, *newvd;
  189         pendingdev_t *device;
  190         uint64_t wholedisk = 0ULL;
  191         uint64_t offline = 0ULL, faulted = 0ULL;
  192         uint64_t guid = 0ULL;
  193         uint64_t is_spare = 0;
  194         char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
  195         char rawpath[PATH_MAX], fullpath[PATH_MAX];
  196         char devpath[PATH_MAX];
  197         int ret;
  198         int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
  199         boolean_t is_sd = B_FALSE;
  200         boolean_t is_mpath_wholedisk = B_FALSE;
  201         uint_t c;
  202         vdev_stat_t *vs;
  203 
  204         if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
  205                 return;
  206 
  207         /* Skip healthy disks */
  208         verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
  209             (uint64_t **)&vs, &c) == 0);
  210         if (vs->vs_state == VDEV_STATE_HEALTHY) {
  211                 zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
  212                     __func__, path);
  213                 return;
  214         }
  215 
  216         (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
  217         (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
  218             &enc_sysfs_path);
  219         (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
  220         (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
  221         (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted);
  222 
  223         (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);
  224         (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_IS_SPARE, &is_spare);
  225 
  226         /*
  227          * Special case:
  228          *
  229          * We've seen cases where a disk won't have a ZPOOL_CONFIG_PHYS_PATH
  230          * entry in its config.  For example, on this force-faulted disk:
  231          *
  232          *      children[0]:
  233          *         type: 'disk'
  234          *         id: 0
  235          *         guid: 14309659774640089719
  236          *        path: '/dev/disk/by-vdev/L28'
  237          *        whole_disk: 0
  238          *        DTL: 654
  239          *        create_txg: 4
  240          *        com.delphix:vdev_zap_leaf: 1161
  241          *        faulted: 1
  242          *        aux_state: 'external'
  243          *      children[1]:
  244          *        type: 'disk'
  245          *        id: 1
  246          *        guid: 16002508084177980912
  247          *        path: '/dev/disk/by-vdev/L29'
  248          *        devid: 'dm-uuid-mpath-35000c500a61d68a3'
  249          *        phys_path: 'L29'
  250          *        vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
  251          *        whole_disk: 0
  252          *        DTL: 1028
  253          *        create_txg: 4
  254          *        com.delphix:vdev_zap_leaf: 131
  255          *
  256          * If the disk's path is a /dev/disk/by-vdev/ path, then we can infer
  257          * the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name.
  258          */
  259         if (physpath == NULL && path != NULL) {
  260                 /* If path begins with "/dev/disk/by-vdev/" ... */
  261                 if (strncmp(path, DEV_BYVDEV_PATH,
  262                     strlen(DEV_BYVDEV_PATH)) == 0) {
  263                         /* Set physpath to the char after "/dev/disk/by-vdev" */
  264                         physpath = &path[strlen(DEV_BYVDEV_PATH)];
  265                 }
  266         }
  267 
  268         /*
  269          * We don't want to autoreplace offlined disks.  However, we do want to
  270          * replace force-faulted disks (`zpool offline -f`).  Force-faulted
  271          * disks have both offline=1 and faulted=1 in the nvlist.
  272          */
  273         if (offline && !faulted) {
  274                 zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace",
  275                     __func__, path);
  276                 return;
  277         }
  278 
  279         is_mpath_wholedisk = is_mpath_whole_disk(path);
  280         zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
  281             " %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', "
  282             "(guid %llu)",
  283             zpool_get_name(zhp), path,
  284             physpath ? physpath : "NULL",
  285             wholedisk ? "is" : "not",
  286             is_mpath_wholedisk ? "is" : "not",
  287             labeled ? "is" : "not",
  288             enc_sysfs_path,
  289             (long long unsigned int)guid);
  290 
  291         /*
  292          * The VDEV guid is preferred for identification (gets passed in path)
  293          */
  294         if (guid != 0) {
  295                 (void) snprintf(fullpath, sizeof (fullpath), "%llu",
  296                     (long long unsigned int)guid);
  297         } else {
  298                 /*
  299                  * otherwise use path sans partition suffix for whole disks
  300                  */
  301                 (void) strlcpy(fullpath, path, sizeof (fullpath));
  302                 if (wholedisk) {
  303                         char *spath = zfs_strip_partition(fullpath);
  304                         if (!spath) {
  305                                 zed_log_msg(LOG_INFO, "%s: Can't alloc",
  306                                     __func__);
  307                                 return;
  308                         }
  309 
  310                         (void) strlcpy(fullpath, spath, sizeof (fullpath));
  311                         free(spath);
  312                 }
  313         }
  314 
  315         if (is_spare)
  316                 online_flag |= ZFS_ONLINE_SPARE;
  317 
  318         /*
  319          * Attempt to online the device.
  320          */
  321         if (zpool_vdev_online(zhp, fullpath, online_flag, &newstate) == 0 &&
  322             (newstate == VDEV_STATE_HEALTHY ||
  323             newstate == VDEV_STATE_DEGRADED)) {
  324                 zed_log_msg(LOG_INFO,
  325                     "  zpool_vdev_online: vdev '%s' ('%s') is "
  326                     "%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ?
  327                     "HEALTHY" : "DEGRADED");
  328                 return;
  329         }
  330 
  331         /*
  332          * vdev_id alias rule for using scsi_debug devices (FMA automated
  333          * testing)
  334          */
  335         if (physpath != NULL && strcmp("scsidebug", physpath) == 0)
  336                 is_sd = B_TRUE;
  337 
  338         /*
  339          * If the pool doesn't have the autoreplace property set, then use
  340          * vdev online to trigger a FMA fault by posting an ereport.
  341          */
  342         if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
  343             !(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) {
  344                 (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
  345                     &newstate);
  346                 zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
  347                     "not a blank disk for '%s' ('%s')", fullpath,
  348                     physpath);
  349                 return;
  350         }
  351 
  352         /*
  353          * Convert physical path into its current device node.  Rawpath
  354          * needs to be /dev/disk/by-vdev for a scsi_debug device since
  355          * /dev/disk/by-path will not be present.
  356          */
  357         (void) snprintf(rawpath, sizeof (rawpath), "%s%s",
  358             is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
  359 
  360         if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
  361                 zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
  362                     rawpath, strerror(errno));
  363 
  364                 (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
  365                     &newstate);
  366 
  367                 zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
  368                     fullpath, libzfs_error_description(g_zfshdl));
  369                 return;
  370         }
  371 
  372         /* Only autoreplace bad disks */
  373         if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
  374             (vs->vs_state != VDEV_STATE_FAULTED) &&
  375             (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
  376                 zed_log_msg(LOG_INFO, "  not autoreplacing since disk isn't in "
  377                     "a bad state (currently %llu)", vs->vs_state);
  378                 return;
  379         }
  380 
  381         nvlist_lookup_string(vdev, "new_devid", &new_devid);
  382 
  383         if (is_mpath_wholedisk) {
  384                 /* Don't label device mapper or multipath disks. */
  385         } else if (!labeled) {
  386                 /*
  387                  * we're auto-replacing a raw disk, so label it first
  388                  */
  389                 char *leafname;
  390 
  391                 /*
  392                  * If this is a request to label a whole disk, then attempt to
  393                  * write out the label.  Before we can label the disk, we need
  394                  * to map the physical string that was matched on to the
  395                  * underlying device node.
  396                  *
  397                  * If any part of this process fails, then do a force online
  398                  * to trigger a ZFS fault for the device (and any hot spare
  399                  * replacement).
  400                  */
  401                 leafname = strrchr(devpath, '/') + 1;
  402 
  403                 /*
  404                  * If this is a request to label a whole disk, then attempt to
  405                  * write out the label.
  406                  */
  407                 if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
  408                         zed_log_msg(LOG_INFO, "  zpool_label_disk: could not "
  409                             "label '%s' (%s)", leafname,
  410                             libzfs_error_description(g_zfshdl));
  411 
  412                         (void) zpool_vdev_online(zhp, fullpath,
  413                             ZFS_ONLINE_FORCEFAULT, &newstate);
  414                         return;
  415                 }
  416 
  417                 /*
  418                  * The disk labeling is asynchronous on Linux. Just record
  419                  * this label request and return as there will be another
  420                  * disk add event for the partition after the labeling is
  421                  * completed.
  422                  */
  423                 device = malloc(sizeof (pendingdev_t));
  424                 if (device == NULL) {
  425                         perror("malloc");
  426                         exit(EXIT_FAILURE);
  427                 }
  428 
  429                 (void) strlcpy(device->pd_physpath, physpath,
  430                     sizeof (device->pd_physpath));
  431                 list_insert_tail(&g_device_list, device);
  432 
  433                 zed_log_msg(LOG_INFO, "  zpool_label_disk: async '%s' (%llu)",
  434                     leafname, (u_longlong_t)guid);
  435 
  436                 return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */
  437 
  438         } else /* labeled */ {
  439                 boolean_t found = B_FALSE;
  440                 /*
  441                  * match up with request above to label the disk
  442                  */
  443                 for (device = list_head(&g_device_list); device != NULL;
  444                     device = list_next(&g_device_list, device)) {
  445                         if (strcmp(physpath, device->pd_physpath) == 0) {
  446                                 list_remove(&g_device_list, device);
  447                                 free(device);
  448                                 found = B_TRUE;
  449                                 break;
  450                         }
  451                         zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s",
  452                             physpath, device->pd_physpath);
  453                 }
  454                 if (!found) {
  455                         /* unexpected partition slice encountered */
  456                         zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
  457                             fullpath);
  458                         (void) zpool_vdev_online(zhp, fullpath,
  459                             ZFS_ONLINE_FORCEFAULT, &newstate);
  460                         return;
  461                 }
  462 
  463                 zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
  464                     physpath, (u_longlong_t)guid);
  465 
  466                 (void) snprintf(devpath, sizeof (devpath), "%s%s",
  467                     DEV_BYID_PATH, new_devid);
  468         }
  469 
  470         /*
  471          * Construct the root vdev to pass to zpool_vdev_attach().  While adding
  472          * the entire vdev structure is harmless, we construct a reduced set of
  473          * path/physpath/wholedisk to keep it simple.
  474          */
  475         if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
  476                 zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
  477                 return;
  478         }
  479         if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
  480                 zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
  481                 nvlist_free(nvroot);
  482                 return;
  483         }
  484 
  485         if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
  486             nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
  487             nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
  488             (physpath != NULL && nvlist_add_string(newvd,
  489             ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
  490             (enc_sysfs_path != NULL && nvlist_add_string(newvd,
  491             ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) ||
  492             nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
  493             nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
  494             nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
  495             (const nvlist_t **)&newvd, 1) != 0) {
  496                 zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
  497                 nvlist_free(newvd);
  498                 nvlist_free(nvroot);
  499                 return;
  500         }
  501 
  502         nvlist_free(newvd);
  503 
  504         /*
  505          * Wait for udev to verify the links exist, then auto-replace
  506          * the leaf disk at the same physical location.
  507          */
  508         if (zpool_label_disk_wait(path, 3000) != 0) {
  509                 zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
  510                     "disk %s is missing", path);
  511                 nvlist_free(nvroot);
  512                 return;
  513         }
  514 
  515         /*
  516          * Prefer sequential resilvering when supported (mirrors and dRAID),
  517          * otherwise fall back to a traditional healing resilver.
  518          */
  519         ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE);
  520         if (ret != 0) {
  521                 ret = zpool_vdev_attach(zhp, fullpath, path, nvroot,
  522                     B_TRUE, B_FALSE);
  523         }
  524 
  525         zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
  526             fullpath, path, (ret == 0) ? "no errors" :
  527             libzfs_error_description(g_zfshdl));
  528 
  529         nvlist_free(nvroot);
  530 }
  531 
  532 /*
  533  * Utility functions to find a vdev matching given criteria.
  534  */
  535 typedef struct dev_data {
  536         const char              *dd_compare;
  537         const char              *dd_prop;
  538         zfs_process_func_t      dd_func;
  539         boolean_t               dd_found;
  540         boolean_t               dd_islabeled;
  541         uint64_t                dd_pool_guid;
  542         uint64_t                dd_vdev_guid;
  543         uint64_t                dd_new_vdev_guid;
  544         const char              *dd_new_devid;
  545         uint64_t                dd_num_spares;
  546 } dev_data_t;
  547 
  548 static void
  549 zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
  550 {
  551         dev_data_t *dp = data;
  552         char *path = NULL;
  553         uint_t c, children;
  554         nvlist_t **child;
  555         uint64_t guid = 0;
  556         uint64_t isspare = 0;
  557 
  558         /*
  559          * First iterate over any children.
  560          */
  561         if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
  562             &child, &children) == 0) {
  563                 for (c = 0; c < children; c++)
  564                         zfs_iter_vdev(zhp, child[c], data);
  565         }
  566 
  567         /*
  568          * Iterate over any spares and cache devices
  569          */
  570         if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
  571             &child, &children) == 0) {
  572                 for (c = 0; c < children; c++)
  573                         zfs_iter_vdev(zhp, child[c], data);
  574         }
  575         if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
  576             &child, &children) == 0) {
  577                 for (c = 0; c < children; c++)
  578                         zfs_iter_vdev(zhp, child[c], data);
  579         }
  580 
  581         /* once a vdev was matched and processed there is nothing left to do */
  582         if (dp->dd_found && dp->dd_num_spares == 0)
  583                 return;
  584         (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &guid);
  585 
  586         /*
  587          * Match by GUID if available; otherwise fall back to devid or physical path
  588          */
  589         if (dp->dd_vdev_guid != 0) {
  590                 if (guid != dp->dd_vdev_guid)
  591                         return;
  592                 zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched on %llu", guid);
  593                 dp->dd_found = B_TRUE;
  594 
  595         } else if (dp->dd_compare != NULL) {
  596                 /*
  597                  * NOTE: On Linux there is an event for partition, so unlike
  598                  * illumos, substring matching is not required to accommodate
  599                  * the partition suffix. An exact match will be present in
  600                  * the dp->dd_compare value.
  601                  * If the attached disk already contains a vdev GUID, it
  602                  * means the disk is not clean.  In that case a physical
  603                  * path match would leave the disk faulted when we try to
  604                  * online it, so we only want to proceed if either the
  605                  * GUID matches the last attached disk or the disk is in
  606                  * a clean state.
  607                  */
  608                 if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
  609                     strcmp(dp->dd_compare, path) != 0) {
  610                         zed_log_msg(LOG_INFO, "  %s: no match (%s != vdev %s)",
  611                             __func__, dp->dd_compare, path);
  612                         return;
  613                 }
  614                 if (dp->dd_new_vdev_guid != 0 && dp->dd_new_vdev_guid != guid) {
  615                         zed_log_msg(LOG_INFO, "  %s: no match (GUID:%llu"
  616                             " != vdev GUID:%llu)", __func__,
  617                             dp->dd_new_vdev_guid, guid);
  618                         return;
  619                 }
  620 
  621                 zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched %s on %s",
  622                     dp->dd_prop, path);
  623                 dp->dd_found = B_TRUE;
  624 
  625                 /* pass the new devid for use by replacing code */
  626                 if (dp->dd_new_devid != NULL) {
  627                         (void) nvlist_add_string(nvl, "new_devid",
  628                             dp->dd_new_devid);
  629                 }
  630         }
  631 
  632         if (dp->dd_found == B_TRUE && nvlist_lookup_uint64(nvl,
  633             ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare)
  634                 dp->dd_num_spares++;
  635 
  636         (dp->dd_func)(zhp, nvl, dp->dd_islabeled);
  637 }
  638 
  639 static void
  640 zfs_enable_ds(void *arg)
  641 {
  642         unavailpool_t *pool = (unavailpool_t *)arg;
  643 
  644         (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
  645         zpool_close(pool->uap_zhp);
  646         free(pool);
  647 }
  648 
  649 static int
  650 zfs_iter_pool(zpool_handle_t *zhp, void *data)
  651 {
  652         nvlist_t *config, *nvl;
  653         dev_data_t *dp = data;
  654         uint64_t pool_guid;
  655         unavailpool_t *pool;
  656 
  657         zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
  658             zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);
  659 
  660         /*
  661          * For each vdev in this pool, look for a match to apply dd_func
  662          */
  663         if ((config = zpool_get_config(zhp, NULL)) != NULL) {
  664                 if (dp->dd_pool_guid == 0 ||
  665                     (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
  666                     &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
  667                         (void) nvlist_lookup_nvlist(config,
  668                             ZPOOL_CONFIG_VDEV_TREE, &nvl);
  669                         zfs_iter_vdev(zhp, nvl, data);
  670                 }
  671         } else {
  672                 zed_log_msg(LOG_INFO, "%s: no config\n", __func__);
  673         }
  674 
  675         /*
  676          * if this pool was originally unavailable,
  677          * then enable its datasets asynchronously
  678          */
  679         if (g_enumeration_done)  {
  680                 for (pool = list_head(&g_pool_list); pool != NULL;
  681                     pool = list_next(&g_pool_list, pool)) {
  682 
  683                         if (strcmp(zpool_get_name(zhp),
  684                             zpool_get_name(pool->uap_zhp)))
  685                                 continue;
  686                         if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
  687                                 list_remove(&g_pool_list, pool);
  688                                 (void) tpool_dispatch(g_tpool, zfs_enable_ds,
  689                                     pool);
  690                                 break;
  691                         }
  692                 }
  693         }
  694 
  695         zpool_close(zhp);
  696 
  697         /* cease iteration after a match */
  698         return (dp->dd_found && dp->dd_num_spares == 0);
  699 }
  700 
  701 /*
  702  * Given a physical device location, iterate over all
  703  * (pool, vdev) pairs which correspond to that location.
  704  */
  705 static boolean_t
  706 devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
  707     boolean_t is_slice, uint64_t new_vdev_guid)
  708 {
  709         dev_data_t data = { 0 };
  710 
  711         data.dd_compare = physical;
  712         data.dd_func = func;
  713         data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
  714         data.dd_found = B_FALSE;
  715         data.dd_islabeled = is_slice;
  716         data.dd_new_devid = devid;      /* used by auto replace code */
  717         data.dd_new_vdev_guid = new_vdev_guid;
  718 
  719         (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
  720 
  721         return (data.dd_found);
  722 }
  723 
  724 /*
  725  * Given a device identifier, find any vdevs with a matching by-vdev
  726  * path.  Normally we shouldn't need this as the comparison would be
  727  * made earlier in the devphys_iter().  For example, if we were replacing
  728  * /dev/disk/by-vdev/L28, normally devphys_iter() would match the
  729  * ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28"
  730  * of the new disk config.  However, we've seen cases where
  731  * ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk.  Here's
  732  * an example of a real 2-disk mirror pool where one disk was force
  733  * faulted:
  734  *
  735  *       com.delphix:vdev_zap_top: 129
  736  *           children[0]:
  737  *               type: 'disk'
  738  *               id: 0
  739  *               guid: 14309659774640089719
  740  *               path: '/dev/disk/by-vdev/L28'
  741  *               whole_disk: 0
  742  *               DTL: 654
  743  *               create_txg: 4
  744  *               com.delphix:vdev_zap_leaf: 1161
  745  *               faulted: 1
  746  *               aux_state: 'external'
  747  *           children[1]:
  748  *               type: 'disk'
  749  *               id: 1
  750  *               guid: 16002508084177980912
  751  *               path: '/dev/disk/by-vdev/L29'
  752  *               devid: 'dm-uuid-mpath-35000c500a61d68a3'
  753  *               phys_path: 'L29'
  754  *               vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
  755  *               whole_disk: 0
  756  *               DTL: 1028
  757  *               create_txg: 4
  758  *               com.delphix:vdev_zap_leaf: 131
  759  *
  760  * So in the case above, the only thing we could compare is the path.
  761  *
  762  * We can do this because we assume by-vdev paths are authoritative as physical
  763  * paths.  We could not assume this for normal paths like /dev/sda since the
  764  * physical location /dev/sda points to could change over time.
  765  */
  766 static boolean_t
  767 by_vdev_path_iter(const char *by_vdev_path, const char *devid,
  768     zfs_process_func_t func, boolean_t is_slice)
  769 {
  770         dev_data_t data = { 0 };
  771 
  772         data.dd_compare = by_vdev_path;
  773         data.dd_func = func;
  774         data.dd_prop = ZPOOL_CONFIG_PATH;
  775         data.dd_found = B_FALSE;
  776         data.dd_islabeled = is_slice;
  777         data.dd_new_devid = devid;
  778 
  779         if (strncmp(by_vdev_path, DEV_BYVDEV_PATH,
  780             strlen(DEV_BYVDEV_PATH)) != 0) {
  781                 /* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */
  782                 return (B_FALSE);
  783         }
  784 
  785         (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
  786 
  787         return (data.dd_found);
  788 }
  789 
  790 /*
  791  * Given a device identifier, find any vdevs with a matching devid.
  792  * On Linux we can match devid directly which is always a whole disk.
  793  */
  794 static boolean_t
  795 devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice)
  796 {
  797         dev_data_t data = { 0 };
  798 
  799         data.dd_compare = devid;
  800         data.dd_func = func;
  801         data.dd_prop = ZPOOL_CONFIG_DEVID;
  802         data.dd_found = B_FALSE;
  803         data.dd_islabeled = is_slice;
  804         data.dd_new_devid = devid;
  805 
  806         (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
  807 
  808         return (data.dd_found);
  809 }
  810 
  811 /*
  812  * Given a device guid, find any vdevs with a matching guid.
  813  */
  814 static boolean_t
  815 guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid,
  816     zfs_process_func_t func, boolean_t is_slice)
  817 {
  818         dev_data_t data = { 0 };
  819 
  820         data.dd_func = func;
  821         data.dd_found = B_FALSE;
  822         data.dd_pool_guid = pool_guid;
  823         data.dd_vdev_guid = vdev_guid;
  824         data.dd_islabeled = is_slice;
  825         data.dd_new_devid = devid;
  826 
  827         (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
  828 
  829         return (data.dd_found);
  830 }
  831 
  832 /*
  833  * Handle a EC_DEV_ADD.ESC_DISK event.
  834  *
  835  * illumos
  836  *      Expects: DEV_PHYS_PATH string in schema
  837  *      Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
  838  *
  839  *      path: '/dev/dsk/c0t1d0s0' (persistent)
  840  *     devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a'
  841  * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a'
  842  *
  843  * linux
  844  *      provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema
  845  *      Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
  846  *
  847  *      path: '/dev/sdc1' (not persistent)
  848  *     devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1'
  849  * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0'
  850  */
  851 static int
  852 zfs_deliver_add(nvlist_t *nvl)
  853 {
  854         char *devpath = NULL, *devid = NULL;
  855         uint64_t pool_guid = 0, vdev_guid = 0;
  856         boolean_t is_slice;
  857 
  858         /*
  859          * Expecting a devid string and an optional physical location and guid
  860          */
  861         if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) {
  862                 zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__);
  863                 return (-1);
  864         }
  865 
  866         (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
  867         (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
  868         (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);
  869 
  870         is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);
  871 
  872         zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
  873             devid, devpath ? devpath : "NULL", is_slice);
  874 
  875         /*
  876          * Iterate over all vdevs looking for a match in the following order:
  877          * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
  878          * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
  879          * 3. ZPOOL_CONFIG_GUID (identifies unique vdev).
  880          * 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since
  881          *    by-vdev paths represent physical paths).
  882          */
  883         if (devid_iter(devid, zfs_process_add, is_slice))
  884                 return (0);
  885         if (devpath != NULL && devphys_iter(devpath, devid, zfs_process_add,
  886             is_slice, vdev_guid))
  887                 return (0);
  888         if (vdev_guid != 0)
  889                 (void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add,
  890                     is_slice);
  891 
  892         if (devpath != NULL) {
  893                 /* Can we match a /dev/disk/by-vdev/ path? */
  894                 char by_vdev_path[MAXPATHLEN];
  895                 snprintf(by_vdev_path, sizeof (by_vdev_path),
  896                     "/dev/disk/by-vdev/%s", devpath);
  897                 if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add,
  898                     is_slice))
  899                         return (0);
  900         }
  901 
  902         return (0);
  903 }
  904 
  905 /*
  906  * Called when we receive a VDEV_CHECK event, which indicates a device could not
  907  * be opened during initial pool open, but the autoreplace property was set on
  908  * the pool.  In this case, we treat it as if it were an add event.
  909  */
  910 static int
  911 zfs_deliver_check(nvlist_t *nvl)
  912 {
  913         dev_data_t data = { 0 };
  914 
  915         if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
  916             &data.dd_pool_guid) != 0 ||
  917             nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
  918             &data.dd_vdev_guid) != 0 ||
  919             data.dd_vdev_guid == 0)
  920                 return (0);
  921 
  922         zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu",
  923             data.dd_pool_guid, data.dd_vdev_guid);
  924 
  925         data.dd_func = zfs_process_add;
  926 
  927         (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
  928 
  929         return (0);
  930 }
  931 
  932 /*
  933  * Given a path to a vdev, look up the vdev's physical size from its
  934  * config nvlist.
  935  *
  936  * Returns the vdev's physical size in bytes on success, 0 on error.
  937  */
  938 static uint64_t
  939 vdev_size_from_config(zpool_handle_t *zhp, const char *vdev_path)
  940 {
  941         nvlist_t *nvl = NULL;
  942         boolean_t avail_spare, l2cache, log;
  943         vdev_stat_t *vs = NULL;
  944         uint_t c;
  945 
  946         nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log);
  947         if (!nvl)
  948                 return (0);
  949 
  950         verify(nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS,
  951             (uint64_t **)&vs, &c) == 0);
  952         if (!vs) {
  953                 zed_log_msg(LOG_INFO, "%s: no nvlist for '%s'", __func__,
  954                     vdev_path);
  955                 return (0);
  956         }
  957 
  958         return (vs->vs_pspace);
  959 }
  960 
  961 /*
  962  * Given a path to a vdev, look up whether the vdev is a "whole disk" in
  963  * the config nvlist.  "whole disk" means that ZFS was passed a whole disk
  964  * at pool creation time, which it partitioned up and has full control over.
  965  * Thus a partition with wholedisk=1 tells us that ZFS created the partition
  966  * at pool creation time.  A partition without wholedisk set would have been
  967  * created externally (for example, with fdisk) and passed to ZFS.
  968  *
  969  * Returns the whole disk value (either 0 or 1).
  970  */
  971 static uint64_t
  972 vdev_whole_disk_from_config(zpool_handle_t *zhp, const char *vdev_path)
  973 {
  974         nvlist_t *nvl = NULL;
  975         boolean_t avail_spare, l2cache, log;
  976         uint64_t wholedisk = 0;
  977 
  978         nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log);
  979         if (!nvl)
  980                 return (0);
  981 
  982         (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
  983 
  984         return (wholedisk);
  985 }
  986 
  987 /*
  988  * If the device grew by roughly 1% or more of its new size, return true.
  989  */
  990 #define DEVICE_GREW(oldsize, newsize) \
  991                     ((newsize > oldsize) && \
  992                     ((newsize / (newsize - oldsize)) <= 100))
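/*
 * Worked example (sizes are made up): with oldsize = 990 GiB and
 * newsize = 1000 GiB, newsize - oldsize = 10 GiB and
 * newsize / (newsize - oldsize) = 100, so DEVICE_GREW() is true; with
 * oldsize = 995 GiB the quotient is 200, so that ~0.5% growth is ignored.
 */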
  993 
  994 static int
  995 zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
  996 {
  997         boolean_t avail_spare, l2cache;
  998         nvlist_t *udev_nvl = data;
  999         nvlist_t *tgt;
 1000         int error;
 1001 
 1002         char *tmp_devname, devname[MAXPATHLEN] = "";
 1003         uint64_t guid;
 1004 
 1005         if (nvlist_lookup_uint64(udev_nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
 1006                 sprintf(devname, "%llu", (u_longlong_t)guid);
 1007         } else if (nvlist_lookup_string(udev_nvl, DEV_PHYS_PATH,
 1008             &tmp_devname) == 0) {
 1009                 strlcpy(devname, tmp_devname, MAXPATHLEN);
 1010                 zfs_append_partition(devname, MAXPATHLEN);
 1011         } else {
 1012                 zed_log_msg(LOG_INFO, "%s: no guid or physpath", __func__);
 1013         }
 1014 
 1015         zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
 1016             devname, zpool_get_name(zhp));
 1017 
 1018         if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
 1019             &avail_spare, &l2cache, NULL)) != NULL) {
 1020                 char *path, fullpath[MAXPATHLEN];
 1021                 uint64_t wholedisk = 0;
 1022 
 1023                 error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path);
 1024                 if (error) {
 1025                         zpool_close(zhp);
 1026                         return (0);
 1027                 }
 1028 
 1029                 (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
 1030                     &wholedisk);
 1031 
 1032                 if (wholedisk) {
 1033                         path = strrchr(path, '/');
 1034                         if (path != NULL) {
 1035                                 path = zfs_strip_partition(path + 1);
 1036                                 if (path == NULL) {
 1037                                         zpool_close(zhp);
 1038                                         return (0);
 1039                                 }
 1040                         } else {
 1041                                 zpool_close(zhp);
 1042                                 return (0);
 1043                         }
 1044 
 1045                         (void) strlcpy(fullpath, path, sizeof (fullpath));
 1046                         free(path);
 1047 
 1048                         /*
 1049                          * We need to reopen the pool associated with this
 1050                          * device so that the kernel can update the size of
 1051                          * the expanded device.  When expanding there is no
 1052                          * need to restart the scrub from the beginning.
 1053                          */
 1054                         boolean_t scrub_restart = B_FALSE;
 1055                         (void) zpool_reopen_one(zhp, &scrub_restart);
 1056                 } else {
 1057                         (void) strlcpy(fullpath, path, sizeof (fullpath));
 1058                 }
 1059 
 1060                 if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
 1061                         vdev_state_t newstate;
 1062 
 1063                         if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
 1064                                 /*
 1065                                  * If this disk size has not changed, then
 1066                                  * there's no need to do an autoexpand.  To
 1067                                  * check we look at the disk's size in its
 1068                                  * config, and compare it to the disk size
 1069                                  * that udev is reporting.
 1070                                  */
 1071                                 uint64_t udev_size = 0, conf_size = 0,
 1072                                     wholedisk = 0, udev_parent_size = 0;
 1073 
 1074                                 /*
 1075                                  * Get the size of our disk that udev is
 1076                                  * reporting.
 1077                                  */
 1078                                 if (nvlist_lookup_uint64(udev_nvl, DEV_SIZE,
 1079                                     &udev_size) != 0) {
 1080                                         udev_size = 0;
 1081                                 }
 1082 
 1083                                 /*
 1084                                  * Get the size of our disk's parent device
 1085                                  * from udev (where sda1's parent is sda).
 1086                                  */
 1087                                 if (nvlist_lookup_uint64(udev_nvl,
 1088                                     DEV_PARENT_SIZE, &udev_parent_size) != 0) {
 1089                                         udev_parent_size = 0;
 1090                                 }
 1091 
 1092                                 conf_size = vdev_size_from_config(zhp,
 1093                                     fullpath);
 1094 
 1095                                 wholedisk = vdev_whole_disk_from_config(zhp,
 1096                                     fullpath);
 1097 
 1098                                 /*
 1099                                  * Only attempt an autoexpand if the vdev size
 1100                                  * changed.  There are two different cases
 1101                                  * to consider.
 1102                                  *
 1103                                  * 1. wholedisk=1
 1104                                  * If you do a 'zpool create' on a whole disk
 1105                                  * (like /dev/sda), then zfs will create
 1106                                  * partitions on the disk (like /dev/sda1).  In
 1107                                  * that case, wholedisk=1 will be set in the
 1108                                  * partition's nvlist config.  So zed will need
 1109                                  * to see if your parent device (/dev/sda)
 1110                                  * expanded in size, and if so, then attempt
 1111                                  * the autoexpand.
 1112                                  *
 1113                                  * 2. wholedisk=0
 1114                                  * If you do a 'zpool create' on an existing
 1115                                  * partition, or a device that doesn't allow
 1116                                  * partitions, then wholedisk=0, and you will
 1117                                  * simply need to check if the device itself
 1118                                  * expanded in size.
 1119                                  */
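                                /*
                                 * Hypothetical example of case 1 (sizes made
                                 * up): a pool created on whole disk sda
                                 * records wholedisk=1 on its sda1 vdev.  If
                                 * sda grows from 1 TiB to 2 TiB, udev reports
                                 * the larger DEV_PARENT_SIZE while conf_size
                                 * still reflects the old partition, so the
                                 * DEVICE_GREW(conf_size, udev_parent_size)
                                 * test below fires and the vdev is onlined to
                                 * pick up the new space.
                                 */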
 1120                                 if (DEVICE_GREW(conf_size, udev_size) ||
 1121                                     (wholedisk && DEVICE_GREW(conf_size,
 1122                                     udev_parent_size))) {
 1123                                         error = zpool_vdev_online(zhp, fullpath,
 1124                                             0, &newstate);
 1125 
 1126                                         zed_log_msg(LOG_INFO,
 1127                                             "%s: autoexpanding '%s' from %llu"
 1128                                             " to %llu bytes in pool '%s': %d",
 1129                                             __func__, fullpath, conf_size,
 1130                                             MAX(udev_size, udev_parent_size),
 1131                                             zpool_get_name(zhp), error);
 1132                                 }
 1133                         }
 1134                 }
 1135                 zpool_close(zhp);
 1136                 return (1);
 1137         }
 1138         zpool_close(zhp);
 1139         return (0);
 1140 }
 1141 
 1142 /*
 1143  * This function handles the ESC_DEV_DLE device change event.  Use the
 1144  * provided vdev guid when looking up a disk or partition.  When the guid
 1145  * is not present, assume the entire disk is owned by ZFS, append the
 1146  * expected -part1 partition suffix, and then look up by physical path.
 1147  */
 1148 static int
 1149 zfs_deliver_dle(nvlist_t *nvl)
 1150 {
 1151         char *devname, name[MAXPATHLEN];
 1152         uint64_t guid;
 1153 
 1154         if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
 1155                 sprintf(name, "%llu", (u_longlong_t)guid);
 1156         } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) {
 1157                 strlcpy(name, devname, MAXPATHLEN);
 1158                 zfs_append_partition(name, MAXPATHLEN);
 1159         } else {
 1160                 sprintf(name, "unknown");
 1161                 zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath");
 1162         }
 1163 
 1164         if (zpool_iter(g_zfshdl, zfsdle_vdev_online, nvl) != 1) {
 1165                 zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
 1166                     "found", name);
 1167                 return (1);
 1168         }
 1169 
 1170         return (0);
 1171 }
 1172 
 1173 /*
 1174  * syseventd daemon module event handler
 1175  *
 1176  * Handles syseventd daemon zfs device related events:
 1177  *
 1178  *      EC_DEV_ADD.ESC_DISK
 1179  *      EC_DEV_STATUS.ESC_DEV_DLE
 1180  *      EC_ZFS.ESC_ZFS_VDEV_CHECK
 1181  *
 1182  * Note: assumes only one thread active at a time (not thread safe)
 1183  */
 1184 static int
 1185 zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl)
 1186 {
 1187         int ret;
 1188         boolean_t is_check = B_FALSE, is_dle = B_FALSE;
 1189 
 1190         if (strcmp(class, EC_DEV_ADD) == 0) {
 1191                 /*
 1192                  * We're mainly interested in disk additions, but we also listen
 1193                  * for new loop devices, to allow for simplified testing.
 1194                  */
 1195                 if (strcmp(subclass, ESC_DISK) != 0 &&
 1196                     strcmp(subclass, ESC_LOFI) != 0)
 1197                         return (0);
 1198 
 1199                 is_check = B_FALSE;
 1200         } else if (strcmp(class, EC_ZFS) == 0 &&
 1201             strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
 1202                 /*
 1203                  * This event signifies that a device failed to open
 1204                  * during pool load, but the 'autoreplace' property was
 1205                  * set, so we should pretend it's just been added.
 1206                  */
 1207                 is_check = B_TRUE;
 1208         } else if (strcmp(class, EC_DEV_STATUS) == 0 &&
 1209             strcmp(subclass, ESC_DEV_DLE) == 0) {
 1210                 is_dle = B_TRUE;
 1211         } else {
 1212                 return (0);
 1213         }
 1214 
 1215         if (is_dle)
 1216                 ret = zfs_deliver_dle(nvl);
 1217         else if (is_check)
 1218                 ret = zfs_deliver_check(nvl);
 1219         else
 1220                 ret = zfs_deliver_add(nvl);
 1221 
 1222         return (ret);
 1223 }
 1224 
 1225 static void *
 1226 zfs_enum_pools(void *arg)
 1227 {
 1228         (void) arg;
 1229 
 1230         (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
 1231         /*
 1232          * Linux - instead of using a thread pool, each list entry
 1233          * will spawn a thread when an unavailable pool transitions
 1234          * to available. zfs_slm_fini will wait for these threads.
 1235          */
 1236         g_enumeration_done = B_TRUE;
 1237         return (NULL);
 1238 }
 1239 
 1240 /*
 1241  * called from zed daemon at startup
 1242  *
 1243  * handles messages sent from zevents or the udev monitor
 1244  *
 1245  * For now, each agent has its own libzfs instance
 1246  */
 1247 int
 1248 zfs_slm_init(void)
 1249 {
 1250         if ((g_zfshdl = libzfs_init()) == NULL)
 1251                 return (-1);
 1252 
 1253         /*
 1254          * collect a list of unavailable pools (asynchronously,
 1255          * since this can take a while)
 1256          */
 1257         list_create(&g_pool_list, sizeof (struct unavailpool),
 1258             offsetof(struct unavailpool, uap_node));
 1259 
 1260         if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) {
 1261                 list_destroy(&g_pool_list);
 1262                 libzfs_fini(g_zfshdl);
 1263                 return (-1);
 1264         }
 1265 
 1266         pthread_setname_np(g_zfs_tid, "enum-pools");
 1267         list_create(&g_device_list, sizeof (struct pendingdev),
 1268             offsetof(struct pendingdev, pd_node));
 1269 
 1270         return (0);
 1271 }
 1272 
 1273 void
 1274 zfs_slm_fini(void)
 1275 {
 1276         unavailpool_t *pool;
 1277         pendingdev_t *device;
 1278 
 1279         /* wait for zfs_enum_pools thread to complete */
 1280         (void) pthread_join(g_zfs_tid, NULL);
 1281         /* destroy the thread pool */
 1282         if (g_tpool != NULL) {
 1283                 tpool_wait(g_tpool);
 1284                 tpool_destroy(g_tpool);
 1285         }
 1286 
 1287         while ((pool = (list_head(&g_pool_list))) != NULL) {
 1288                 list_remove(&g_pool_list, pool);
 1289                 zpool_close(pool->uap_zhp);
 1290                 free(pool);
 1291         }
 1292         list_destroy(&g_pool_list);
 1293 
 1294         while ((device = (list_head(&g_device_list))) != NULL) {
 1295                 list_remove(&g_device_list, device);
 1296                 free(device);
 1297         }
 1298         list_destroy(&g_device_list);
 1299 
 1300         libzfs_fini(g_zfshdl);
 1301 }
 1302 
 1303 void
 1304 zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl)
 1305 {
 1306         zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass);
 1307         (void) zfs_slm_deliver_event(class, subclass, nvl);
 1308 }
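
/*
 * Usage sketch (assumed call sequence, inferred from the comments above
 * rather than from the zed dispatcher source): the zed daemon is expected to
 * call zfs_slm_init() once at startup, deliver each matching event via
 * zfs_slm_event(class, subclass, nvl), and call zfs_slm_fini() at shutdown
 * to join the enumeration thread and release the pool and device lists.
 */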
