1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
6 * You can obtain a copy of the license from the top-level file
7 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
8 * You may not use this file except in compliance with the license.
9 *
10 * CDDL HEADER END
11 */
12
13 /*
14 * Copyright (c) 2016, 2017, Intel Corporation.
15 */
16
17 #ifdef HAVE_LIBUDEV
18
19 #include <errno.h>
20 #include <fcntl.h>
21 #include <libnvpair.h>
22 #include <libudev.h>
23 #include <libzfs.h>
24 #include <libzutil.h>
25 #include <pthread.h>
26 #include <stdlib.h>
27 #include <string.h>
28
29 #include <sys/sysevent/eventdefs.h>
30 #include <sys/sysevent/dev.h>
31
32 #include "zed_log.h"
33 #include "zed_disk_event.h"
34 #include "agents/zfs_agents.h"
35
36 /*
37 * Portions of ZED need to see disk events for disks belonging to ZFS pools.
38 * A libudev monitor is established to monitor block device actions and pass
39 * them on to internal ZED logic modules. Initially, zfs_mod.c is the only
40 * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
41 * module responsible for handling disk events for ZFS.
42 */
43
/* Thread running zed_udev_monitor(); created by zed_disk_event_init() */
pthread_t g_mon_tid;

/* libudev context obtained from udev_new() in zed_disk_event_init() */
struct udev *g_udev;

/* udev netlink monitor that delivers block device uevents */
struct udev_monitor *g_mon;

#define	DEV_BYID_PATH	"/dev/disk/by-id/"

/* 64MB is minimum usable disk for ZFS (131072 sectors of 512 bytes) */
#define	MINIMUM_SECTORS	131072ULL
53
54
55 /*
56 * Post disk event to SLM module
57 *
58 * occurs in the context of monitor thread
59 */
60 static void
61 zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
62 {
63 char *strval;
64 uint64_t numval;
65
66 zed_log_msg(LOG_INFO, "zed_disk_event:");
67 zed_log_msg(LOG_INFO, "\tclass: %s", class);
68 zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
69 if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)
70 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);
71 if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)
72 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
73 if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
74 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
75 if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE)
76 zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART);
77 if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
78 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
79 if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
80 zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);
81 if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0)
82 zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval);
83 if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)
84 zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);
85 if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)
86 zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);
87
88 (void) zfs_agent_post_event(class, subclass, nvl);
89 }
90
91 /*
92 * dev_event_nvlist: place event schema into an nv pair list
93 *
94 * NAME VALUE (example)
95 * -------------- --------------------------------------------------------
96 * DEV_NAME /dev/sdl
97 * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
98 * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
99 * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0
100 * DEV_IS_PART ---
101 * DEV_SIZE 500107862016
102 * ZFS_EV_POOL_GUID 17523635698032189180
103 * ZFS_EV_VDEV_GUID 14663607734290803088
104 */
105 static nvlist_t *
106 dev_event_nvlist(struct udev_device *dev)
107 {
108 nvlist_t *nvl;
109 char strval[128];
110 const char *value, *path;
111 uint64_t guid;
112
113 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
114 return (NULL);
115
116 if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)
117 (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);
118 if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)
119 (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);
120 if ((path = udev_device_get_devnode(dev)) != NULL)
121 (void) nvlist_add_string(nvl, DEV_NAME, path);
122 if ((value = udev_device_get_devpath(dev)) != NULL)
123 (void) nvlist_add_string(nvl, DEV_PATH, value);
124 value = udev_device_get_devtype(dev);
125 if ((value != NULL && strcmp("partition", value) == 0) ||
126 (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")
127 != NULL)) {
128 (void) nvlist_add_boolean(nvl, DEV_IS_PART);
129 }
130 if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {
131 uint64_t numval = DEV_BSIZE;
132
133 numval *= strtoull(value, NULL, 10);
134 (void) nvlist_add_uint64(nvl, DEV_SIZE, numval);
135
136 /*
137 * If the device has a parent, then get the parent block
138 * device's size as well. For example, /dev/sda1's parent
139 * is /dev/sda.
140 */
141 struct udev_device *parent_dev = udev_device_get_parent(dev);
142 if ((value = udev_device_get_sysattr_value(parent_dev, "size"))
143 != NULL) {
144 uint64_t numval = DEV_BSIZE;
145
146 numval *= strtoull(value, NULL, 10);
147 (void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval);
148 }
149 }
150
151 /*
152 * Grab the pool and vdev guids from blkid cache
153 */
154 value = udev_device_get_property_value(dev, "ID_FS_UUID");
155 if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
156 (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);
157
158 value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
159 if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
160 (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);
161
162 /*
163 * Either a vdev guid or a devid must be present for matching
164 */
165 if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&
166 !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {
167 nvlist_free(nvl);
168 return (NULL);
169 }
170
171 return (nvl);
172 }
173
174 /*
175 * Listen for block device uevents
176 */
177 static void *
178 zed_udev_monitor(void *arg)
179 {
180 struct udev_monitor *mon = arg;
181 char *tmp, *tmp2;
182
183 zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");
184
185 while (1) {
186 struct udev_device *dev;
187 const char *action, *type, *part, *sectors;
188 const char *bus, *uuid, *devpath;
189 const char *class, *subclass;
190 nvlist_t *nvl;
191 boolean_t is_zfs = B_FALSE;
192
193 /* allow a cancellation while blocked (recvmsg) */
194 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
195
196 /* blocks at recvmsg until an event occurs */
197 if ((dev = udev_monitor_receive_device(mon)) == NULL) {
198 zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
199 "device error %d", errno);
200 continue;
201 }
202
203 /* allow all steps to complete before a cancellation */
204 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
205
206 /*
207 * Strongly typed device is the preferred filter
208 */
209 type = udev_device_get_property_value(dev, "ID_FS_TYPE");
210 if (type != NULL && type[0] != '\0') {
211 if (strcmp(type, "zfs_member") == 0) {
212 is_zfs = B_TRUE;
213 } else {
214 /* not ours, so skip */
215 zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
216 "%s (in use by %s)",
217 udev_device_get_devnode(dev), type);
218 udev_device_unref(dev);
219 continue;
220 }
221 }
222
223 /*
224 * if this is a disk and it is partitioned, then the
225 * zfs label will reside in a DEVTYPE=partition and
226 * we can skip passing this event
227 *
228 * Special case: Blank disks are sometimes reported with
229 * an erroneous 'atari' partition, and should not be
230 * excluded from being used as an autoreplace disk:
231 *
232 * https://github.com/openzfs/zfs/issues/13497
233 */
234 type = udev_device_get_property_value(dev, "DEVTYPE");
235 part = udev_device_get_property_value(dev,
236 "ID_PART_TABLE_TYPE");
237 if (type != NULL && type[0] != '\0' &&
238 strcmp(type, "disk") == 0 &&
239 part != NULL && part[0] != '\0') {
240 const char *devname =
241 udev_device_get_property_value(dev, "DEVNAME");
242
243 if (strcmp(part, "atari") == 0) {
244 zed_log_msg(LOG_INFO,
245 "%s: %s is reporting an atari partition, "
246 "but we're going to assume it's a false "
247 "positive and still use it (issue #13497)",
248 __func__, devname);
249 } else {
250 zed_log_msg(LOG_INFO,
251 "%s: skip %s since it has a %s partition "
252 "already", __func__, devname, part);
253 /* skip and wait for partition event */
254 udev_device_unref(dev);
255 continue;
256 }
257 }
258
259 /*
260 * ignore small partitions
261 */
262 sectors = udev_device_get_property_value(dev,
263 "ID_PART_ENTRY_SIZE");
264 if (sectors == NULL)
265 sectors = udev_device_get_sysattr_value(dev, "size");
266 if (sectors != NULL &&
267 strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
268 zed_log_msg(LOG_INFO,
269 "%s: %s sectors %s < %llu (minimum)",
270 __func__,
271 udev_device_get_property_value(dev, "DEVNAME"),
272 sectors, MINIMUM_SECTORS);
273 udev_device_unref(dev);
274 continue;
275 }
276
277 /*
278 * If the blkid probe didn't find ZFS, then a persistent
279 * device id string is required in the message schema
280 * for matching with vdevs. Preflight here for expected
281 * udev information.
282 *
283 * Special case:
284 * NVMe devices don't have ID_BUS set (at least on RHEL 7-8),
285 * but they are valid for autoreplace. Add a special case for
286 * them by searching for "/nvme/" in the udev DEVPATH:
287 *
288 * DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1
289 */
290 bus = udev_device_get_property_value(dev, "ID_BUS");
291 uuid = udev_device_get_property_value(dev, "DM_UUID");
292 devpath = udev_device_get_devpath(dev);
293 if (!is_zfs && (bus == NULL && uuid == NULL &&
294 strstr(devpath, "/nvme/") == NULL)) {
295 zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
296 "source", udev_device_get_devnode(dev));
297 udev_device_unref(dev);
298 continue;
299 }
300
301 action = udev_device_get_action(dev);
302 if (strcmp(action, "add") == 0) {
303 class = EC_DEV_ADD;
304 subclass = ESC_DISK;
305 } else if (strcmp(action, "remove") == 0) {
306 class = EC_DEV_REMOVE;
307 subclass = ESC_DISK;
308 } else if (strcmp(action, "change") == 0) {
309 class = EC_DEV_STATUS;
310 subclass = ESC_DEV_DLE;
311 } else {
312 zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
313 action);
314 udev_device_unref(dev);
315 continue;
316 }
317
318 /*
319 * Special case an EC_DEV_ADD for multipath devices
320 *
321 * When a multipath device is created, udev reports the
322 * following:
323 *
324 * 1. "add" event of the dm device for the multipath device
325 * (like /dev/dm-3).
326 * 2. "change" event to create the actual multipath device
327 * symlink (like /dev/mapper/mpatha). The event also
328 * passes back the relevant DM vars we care about, like
329 * DM_UUID.
330 * 3. Another "change" event identical to #2 (that we ignore).
331 *
332 * To get the behavior we want, we treat the "change" event
333 * in #2 as a "add" event; as if "/dev/mapper/mpatha" was
334 * a new disk being added.
335 */
336 if (strcmp(class, EC_DEV_STATUS) == 0 &&
337 udev_device_get_property_value(dev, "DM_UUID") &&
338 udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
339 tmp = (char *)udev_device_get_devnode(dev);
340 tmp2 = zfs_get_underlying_path(tmp);
341 if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
342 /*
343 * We have a real underlying device, which
344 * means that this multipath "change" event is
345 * an "add" event.
346 *
347 * If the multipath device and the underlying
348 * dev are the same name (i.e. /dev/dm-5), then
349 * there is no real underlying disk for this
350 * multipath device, and so this "change" event
351 * really is a multipath removal.
352 */
353 class = EC_DEV_ADD;
354 subclass = ESC_DISK;
355 } else {
356 tmp = (char *)
357 udev_device_get_property_value(dev,
358 "DM_NR_VALID_PATHS");
359 /* treat as a multipath remove */
360 if (tmp != NULL && strcmp(tmp, "") == 0) {
361 class = EC_DEV_REMOVE;
362 subclass = ESC_DISK;
363 }
364 }
365 free(tmp2);
366 }
367
368 /*
369 * Special case an EC_DEV_ADD for scsi_debug devices
370 *
371 * These devices require a udevadm trigger command after
372 * creation in order to register the vdev_id scsidebug alias
373 * rule (adds a persistent path (phys_path) used for fault
374 * management automated tests in the ZFS test suite.
375 *
376 * After udevadm trigger command, event registers as a "change"
377 * event but needs to instead be handled as another "add" event
378 * to allow for disk labeling and partitioning to occur.
379 */
380 if (strcmp(class, EC_DEV_STATUS) == 0 &&
381 udev_device_get_property_value(dev, "ID_VDEV") &&
382 udev_device_get_property_value(dev, "ID_MODEL")) {
383 const char *id_model, *id_model_sd = "scsi_debug";
384
385 id_model = udev_device_get_property_value(dev,
386 "ID_MODEL");
387 if (strcmp(id_model, id_model_sd) == 0) {
388 class = EC_DEV_ADD;
389 subclass = ESC_DISK;
390 }
391 }
392
393 if ((nvl = dev_event_nvlist(dev)) != NULL) {
394 zed_udev_event(class, subclass, nvl);
395 nvlist_free(nvl);
396 }
397
398 udev_device_unref(dev);
399 }
400
401 return (NULL);
402 }
403
404 int
405 zed_disk_event_init(void)
406 {
407 int fd, fflags;
408
409 if ((g_udev = udev_new()) == NULL) {
410 zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
411 return (-1);
412 }
413
414 /* Set up a udev monitor for block devices */
415 g_mon = udev_monitor_new_from_netlink(g_udev, "udev");
416 udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
417 udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
418 "partition");
419 udev_monitor_enable_receiving(g_mon);
420
421 /* Make sure monitoring socket is blocking */
422 fd = udev_monitor_get_fd(g_mon);
423 if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)
424 (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);
425
426 /* spawn a thread to monitor events */
427 if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
428 udev_monitor_unref(g_mon);
429 udev_unref(g_udev);
430 zed_log_msg(LOG_WARNING, "pthread_create failed");
431 return (-1);
432 }
433
434 pthread_setname_np(g_mon_tid, "udev monitor");
435 zed_log_msg(LOG_INFO, "zed_disk_event_init");
436
437 return (0);
438 }
439
440 void
441 zed_disk_event_fini(void)
442 {
443 /* cancel monitor thread at recvmsg() */
444 (void) pthread_cancel(g_mon_tid);
445 (void) pthread_join(g_mon_tid, NULL);
446
447 /* cleanup udev resources */
448 udev_monitor_unref(g_mon);
449 udev_unref(g_udev);
450
451 zed_log_msg(LOG_INFO, "zed_disk_event_fini");
452 }
453
454 #else
455
456 #include "zed_disk_event.h"
457
/*
 * Stub used when HAVE_LIBUDEV is not defined: there is no udev monitor to
 * set up, so initialization trivially succeeds.
 *
 * Always returns 0.
 */
int
zed_disk_event_init(void)
{
	return (0);
}
463
/*
 * Stub used when HAVE_LIBUDEV is not defined: nothing was set up by
 * zed_disk_event_init(), so there is nothing to tear down.
 */
void
zed_disk_event_fini(void)
{
	/* intentionally empty */
}
468
469 #endif /* HAVE_LIBUDEV */
Cache object: b295ce4650b5463b5f9fd3f8da450c9c
|