The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/cmd/zed/zed_disk_event.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License Version 1.0 (CDDL-1.0).
    6  * You can obtain a copy of the license from the top-level file
    7  * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
    8  * You may not use this file except in compliance with the license.
    9  *
   10  * CDDL HEADER END
   11  */
   12 
   13 /*
   14  * Copyright (c) 2016, 2017, Intel Corporation.
   15  */
   16 
   17 #ifdef HAVE_LIBUDEV
   18 
   19 #include <errno.h>
   20 #include <fcntl.h>
   21 #include <libnvpair.h>
   22 #include <libudev.h>
   23 #include <libzfs.h>
   24 #include <libzutil.h>
   25 #include <pthread.h>
   26 #include <stdlib.h>
   27 #include <string.h>
   28 
   29 #include <sys/sysevent/eventdefs.h>
   30 #include <sys/sysevent/dev.h>
   31 
   32 #include "zed_log.h"
   33 #include "zed_disk_event.h"
   34 #include "agents/zfs_agents.h"
   35 
   36 /*
   37  * Portions of ZED need to see disk events for disks belonging to ZFS pools.
   38  * A libudev monitor is established to monitor block device actions and pass
   39  * them on to internal ZED logic modules.  Initially, zfs_mod.c is the only
   40  * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
   41  * module responsible for handling disk events for ZFS.
   42  */
   43 
/* Monitor thread id; filled in by pthread_create() in zed_disk_event_init() */
pthread_t g_mon_tid;
/* libudev context returned by udev_new() in zed_disk_event_init() */
struct udev *g_udev;
/* Netlink monitor for block device uevents; read by the monitor thread */
struct udev_monitor *g_mon;


/* NOTE(review): not referenced in the visible part of this file */
#define DEV_BYID_PATH   "/dev/disk/by-id/"

/*
 * 64MB is minimum usable disk for ZFS
 * (131072 sectors; presumably 512-byte sectors -- confirm)
 */
#define MINIMUM_SECTORS         131072ULL
   53 
   54 
   55 /*
   56  * Post disk event to SLM module
   57  *
   58  * occurs in the context of monitor thread
   59  */
   60 static void
   61 zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
   62 {
   63         char *strval;
   64         uint64_t numval;
   65 
   66         zed_log_msg(LOG_INFO, "zed_disk_event:");
   67         zed_log_msg(LOG_INFO, "\tclass: %s", class);
   68         zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
   69         if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)
   70                 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);
   71         if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)
   72                 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
   73         if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
   74                 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
   75         if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE)
   76                 zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART);
   77         if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
   78                 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
   79         if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
   80                 zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);
   81         if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0)
   82                 zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval);
   83         if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)
   84                 zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);
   85         if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)
   86                 zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);
   87 
   88         (void) zfs_agent_post_event(class, subclass, nvl);
   89 }
   90 
   91 /*
   92  * dev_event_nvlist: place event schema into an nv pair list
   93  *
   94  * NAME                 VALUE (example)
   95  * --------------       --------------------------------------------------------
   96  * DEV_NAME             /dev/sdl
   97  * DEV_PATH             /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
   98  * DEV_IDENTIFIER       ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
   99  * DEV_PHYS_PATH        pci-0000:04:00.0-sas-0x4433221101000000-lun-0
  100  * DEV_IS_PART          ---
  101  * DEV_SIZE             500107862016
  102  * ZFS_EV_POOL_GUID     17523635698032189180
  103  * ZFS_EV_VDEV_GUID     14663607734290803088
  104  */
  105 static nvlist_t *
  106 dev_event_nvlist(struct udev_device *dev)
  107 {
  108         nvlist_t *nvl;
  109         char strval[128];
  110         const char *value, *path;
  111         uint64_t guid;
  112 
  113         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
  114                 return (NULL);
  115 
  116         if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)
  117                 (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);
  118         if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)
  119                 (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);
  120         if ((path = udev_device_get_devnode(dev)) != NULL)
  121                 (void) nvlist_add_string(nvl, DEV_NAME, path);
  122         if ((value = udev_device_get_devpath(dev)) != NULL)
  123                 (void) nvlist_add_string(nvl, DEV_PATH, value);
  124         value = udev_device_get_devtype(dev);
  125         if ((value != NULL && strcmp("partition", value) == 0) ||
  126             (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")
  127             != NULL)) {
  128                 (void) nvlist_add_boolean(nvl, DEV_IS_PART);
  129         }
  130         if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {
  131                 uint64_t numval = DEV_BSIZE;
  132 
  133                 numval *= strtoull(value, NULL, 10);
  134                 (void) nvlist_add_uint64(nvl, DEV_SIZE, numval);
  135 
  136                 /*
  137                  * If the device has a parent, then get the parent block
  138                  * device's size as well.  For example, /dev/sda1's parent
  139                  * is /dev/sda.
  140                  */
  141                 struct udev_device *parent_dev = udev_device_get_parent(dev);
  142                 if ((value = udev_device_get_sysattr_value(parent_dev, "size"))
  143                     != NULL) {
  144                         uint64_t numval = DEV_BSIZE;
  145 
  146                         numval *= strtoull(value, NULL, 10);
  147                         (void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval);
  148                 }
  149         }
  150 
  151         /*
  152          * Grab the pool and vdev guids from blkid cache
  153          */
  154         value = udev_device_get_property_value(dev, "ID_FS_UUID");
  155         if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
  156                 (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);
  157 
  158         value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
  159         if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
  160                 (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);
  161 
  162         /*
  163          * Either a vdev guid or a devid must be present for matching
  164          */
  165         if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&
  166             !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {
  167                 nvlist_free(nvl);
  168                 return (NULL);
  169         }
  170 
  171         return (nvl);
  172 }
  173 
  174 /*
  175  *  Listen for block device uevents
  176  */
  177 static void *
  178 zed_udev_monitor(void *arg)
  179 {
  180         struct udev_monitor *mon = arg;
  181         char *tmp, *tmp2;
  182 
  183         zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");
  184 
  185         while (1) {
  186                 struct udev_device *dev;
  187                 const char *action, *type, *part, *sectors;
  188                 const char *bus, *uuid, *devpath;
  189                 const char *class, *subclass;
  190                 nvlist_t *nvl;
  191                 boolean_t is_zfs = B_FALSE;
  192 
  193                 /* allow a cancellation while blocked (recvmsg) */
  194                 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
  195 
  196                 /* blocks at recvmsg until an event occurs */
  197                 if ((dev = udev_monitor_receive_device(mon)) == NULL) {
  198                         zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
  199                             "device error %d", errno);
  200                         continue;
  201                 }
  202 
  203                 /* allow all steps to complete before a cancellation */
  204                 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
  205 
  206                 /*
  207                  * Strongly typed device is the preferred filter
  208                  */
  209                 type = udev_device_get_property_value(dev, "ID_FS_TYPE");
  210                 if (type != NULL && type[0] != '\0') {
  211                         if (strcmp(type, "zfs_member") == 0) {
  212                                 is_zfs = B_TRUE;
  213                         } else {
  214                                 /* not ours, so skip */
  215                                 zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
  216                                     "%s (in use by %s)",
  217                                     udev_device_get_devnode(dev), type);
  218                                 udev_device_unref(dev);
  219                                 continue;
  220                         }
  221                 }
  222 
  223                 /*
  224                  * if this is a disk and it is partitioned, then the
  225                  * zfs label will reside in a DEVTYPE=partition and
  226                  * we can skip passing this event
  227                  *
  228                  * Special case: Blank disks are sometimes reported with
  229                  * an erroneous 'atari' partition, and should not be
  230                  * excluded from being used as an autoreplace disk:
  231                  *
  232                  * https://github.com/openzfs/zfs/issues/13497
  233                  */
  234                 type = udev_device_get_property_value(dev, "DEVTYPE");
  235                 part = udev_device_get_property_value(dev,
  236                     "ID_PART_TABLE_TYPE");
  237                 if (type != NULL && type[0] != '\0' &&
  238                     strcmp(type, "disk") == 0 &&
  239                     part != NULL && part[0] != '\0') {
  240                         const char *devname =
  241                             udev_device_get_property_value(dev, "DEVNAME");
  242 
  243                         if (strcmp(part, "atari") == 0) {
  244                                 zed_log_msg(LOG_INFO,
  245                                     "%s: %s is reporting an atari partition, "
  246                                     "but we're going to assume it's a false "
  247                                     "positive and still use it (issue #13497)",
  248                                     __func__, devname);
  249                         } else {
  250                                 zed_log_msg(LOG_INFO,
  251                                     "%s: skip %s since it has a %s partition "
  252                                     "already", __func__, devname, part);
  253                                 /* skip and wait for partition event */
  254                                 udev_device_unref(dev);
  255                                 continue;
  256                         }
  257                 }
  258 
  259                 /*
  260                  * ignore small partitions
  261                  */
  262                 sectors = udev_device_get_property_value(dev,
  263                     "ID_PART_ENTRY_SIZE");
  264                 if (sectors == NULL)
  265                         sectors = udev_device_get_sysattr_value(dev, "size");
  266                 if (sectors != NULL &&
  267                     strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
  268                         zed_log_msg(LOG_INFO,
  269                             "%s: %s sectors %s < %llu (minimum)",
  270                             __func__,
  271                             udev_device_get_property_value(dev, "DEVNAME"),
  272                             sectors, MINIMUM_SECTORS);
  273                         udev_device_unref(dev);
  274                         continue;
  275                 }
  276 
  277                 /*
  278                  * If the blkid probe didn't find ZFS, then a persistent
  279                  * device id string is required in the message schema
  280                  * for matching with vdevs. Preflight here for expected
  281                  * udev information.
  282                  *
  283                  * Special case:
  284                  * NVMe devices don't have ID_BUS set (at least on RHEL 7-8),
  285                  * but they are valid for autoreplace.  Add a special case for
  286                  * them by searching for "/nvme/" in the udev DEVPATH:
  287                  *
  288                  * DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1
  289                  */
  290                 bus = udev_device_get_property_value(dev, "ID_BUS");
  291                 uuid = udev_device_get_property_value(dev, "DM_UUID");
  292                 devpath = udev_device_get_devpath(dev);
  293                 if (!is_zfs && (bus == NULL && uuid == NULL &&
  294                     strstr(devpath, "/nvme/") == NULL)) {
  295                         zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
  296                             "source", udev_device_get_devnode(dev));
  297                         udev_device_unref(dev);
  298                         continue;
  299                 }
  300 
  301                 action = udev_device_get_action(dev);
  302                 if (strcmp(action, "add") == 0) {
  303                         class = EC_DEV_ADD;
  304                         subclass = ESC_DISK;
  305                 } else if (strcmp(action, "remove") == 0) {
  306                         class = EC_DEV_REMOVE;
  307                         subclass = ESC_DISK;
  308                 } else if (strcmp(action, "change") == 0) {
  309                         class = EC_DEV_STATUS;
  310                         subclass = ESC_DEV_DLE;
  311                 } else {
  312                         zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
  313                             action);
  314                         udev_device_unref(dev);
  315                         continue;
  316                 }
  317 
  318                 /*
  319                  * Special case an EC_DEV_ADD for multipath devices
  320                  *
  321                  * When a multipath device is created, udev reports the
  322                  * following:
  323                  *
  324                  * 1.   "add" event of the dm device for the multipath device
  325                  *      (like /dev/dm-3).
  326                  * 2.   "change" event to create the actual multipath device
  327                  *      symlink (like /dev/mapper/mpatha).  The event also
  328                  *      passes back the relevant DM vars we care about, like
  329                  *      DM_UUID.
  330                  * 3.   Another "change" event identical to #2 (that we ignore).
  331                  *
  332                  * To get the behavior we want, we treat the "change" event
  333                  * in #2 as a "add" event; as if "/dev/mapper/mpatha" was
  334                  * a new disk being added.
  335                  */
  336                 if (strcmp(class, EC_DEV_STATUS) == 0 &&
  337                     udev_device_get_property_value(dev, "DM_UUID") &&
  338                     udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
  339                         tmp = (char *)udev_device_get_devnode(dev);
  340                         tmp2 = zfs_get_underlying_path(tmp);
  341                         if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
  342                                 /*
  343                                  * We have a real underlying device, which
  344                                  * means that this multipath "change" event is
  345                                  * an "add" event.
  346                                  *
  347                                  * If the multipath device and the underlying
  348                                  * dev are the same name (i.e. /dev/dm-5), then
  349                                  * there is no real underlying disk for this
  350                                  * multipath device, and so this "change" event
  351                                  * really is a multipath removal.
  352                                  */
  353                                 class = EC_DEV_ADD;
  354                                 subclass = ESC_DISK;
  355                         } else {
  356                                 tmp = (char *)
  357                                     udev_device_get_property_value(dev,
  358                                     "DM_NR_VALID_PATHS");
  359                                 /* treat as a multipath remove */
  360                                 if (tmp != NULL && strcmp(tmp, "") == 0) {
  361                                         class = EC_DEV_REMOVE;
  362                                         subclass = ESC_DISK;
  363                                 }
  364                         }
  365                         free(tmp2);
  366                 }
  367 
  368                 /*
  369                  * Special case an EC_DEV_ADD for scsi_debug devices
  370                  *
  371                  * These devices require a udevadm trigger command after
  372                  * creation in order to register the vdev_id scsidebug alias
  373                  * rule (adds a persistent path (phys_path) used for fault
  374                  * management automated tests in the ZFS test suite.
  375                  *
  376                  * After udevadm trigger command, event registers as a "change"
  377                  * event but needs to instead be handled as another "add" event
  378                  * to allow for disk labeling and partitioning to occur.
  379                  */
  380                 if (strcmp(class, EC_DEV_STATUS) == 0 &&
  381                     udev_device_get_property_value(dev, "ID_VDEV") &&
  382                     udev_device_get_property_value(dev, "ID_MODEL")) {
  383                         const char *id_model, *id_model_sd = "scsi_debug";
  384 
  385                         id_model = udev_device_get_property_value(dev,
  386                             "ID_MODEL");
  387                         if (strcmp(id_model, id_model_sd) == 0) {
  388                                 class = EC_DEV_ADD;
  389                                 subclass = ESC_DISK;
  390                         }
  391                 }
  392 
  393                 if ((nvl = dev_event_nvlist(dev)) != NULL) {
  394                         zed_udev_event(class, subclass, nvl);
  395                         nvlist_free(nvl);
  396                 }
  397 
  398                 udev_device_unref(dev);
  399         }
  400 
  401         return (NULL);
  402 }
  403 
  404 int
  405 zed_disk_event_init(void)
  406 {
  407         int fd, fflags;
  408 
  409         if ((g_udev = udev_new()) == NULL) {
  410                 zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
  411                 return (-1);
  412         }
  413 
  414         /* Set up a udev monitor for block devices */
  415         g_mon = udev_monitor_new_from_netlink(g_udev, "udev");
  416         udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
  417         udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
  418             "partition");
  419         udev_monitor_enable_receiving(g_mon);
  420 
  421         /* Make sure monitoring socket is blocking */
  422         fd = udev_monitor_get_fd(g_mon);
  423         if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)
  424                 (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);
  425 
  426         /* spawn a thread to monitor events */
  427         if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
  428                 udev_monitor_unref(g_mon);
  429                 udev_unref(g_udev);
  430                 zed_log_msg(LOG_WARNING, "pthread_create failed");
  431                 return (-1);
  432         }
  433 
  434         pthread_setname_np(g_mon_tid, "udev monitor");
  435         zed_log_msg(LOG_INFO, "zed_disk_event_init");
  436 
  437         return (0);
  438 }
  439 
  440 void
  441 zed_disk_event_fini(void)
  442 {
  443         /* cancel monitor thread at recvmsg() */
  444         (void) pthread_cancel(g_mon_tid);
  445         (void) pthread_join(g_mon_tid, NULL);
  446 
  447         /* cleanup udev resources */
  448         udev_monitor_unref(g_mon);
  449         udev_unref(g_udev);
  450 
  451         zed_log_msg(LOG_INFO, "zed_disk_event_fini");
  452 }
  453 
  454 #else
  455 
  456 #include "zed_disk_event.h"
  457 
/*
 * Built without HAVE_LIBUDEV: there is no disk event monitor to start,
 * so this always reports success.
 */
int
zed_disk_event_init(void)
{
        return (0);
}

/*
 * Built without HAVE_LIBUDEV: nothing was started, so there is nothing
 * to tear down.
 */
void
zed_disk_event_fini(void)
{
}
  468 
  469 #endif /* HAVE_LIBUDEV */

Cache object: b295ce4650b5463b5f9fd3f8da450c9c


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.