Skip to content

Commit 90abfdf

Browse files
committed
zed: Misc multipath autoreplace fixes
We recently had a case where our operators replaced a bad multipathed disk, only to see it fail to autoreplace. The zed logs showed that the multipath replacement disk did not pass the 'is_dm' test in zfs_process_add() even though it should have. is_dm is set if there exists a sysfs entry for the underlying /dev/sd* paths for the multipath disk. It's possible this path didn't exist due to a race condition where the sysfs paths weren't created at the time the udev event came in to zed, but this was never verified. This patch updates the check to look for udev properties that indicate if the new autoreplace disk is an empty multipath disk, rather than looking for the underlying sysfs entries. It also adds in additional logging, and fixes a bug where zed allowed you to use an already zfs-formatted disk from another pool as a multipath auto-replacement disk. Furthermore, while testing this patch, I also ran across a case where a force-faulted disk did not have a ZPOOL_CONFIG_PHYS_PATH entry in its config. This prevented it from being autoreplaced. I added additional logic to derive the PHYS_PATH from the PATH if the PATH was a /dev/disk/by-vdev/ path. For example, if PATH was /dev/disk/by-vdev/L28, then PHYS_PATH would be L28. This is safe since by-vdev paths represent physical locations and do not change between boots. Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Tony Hutter <[email protected]> Closes #13023
1 parent 847d030 commit 90abfdf

File tree

3 files changed

+192
-26
lines changed

3 files changed

+192
-26
lines changed

cmd/zed/agents/zfs_mod.c

Lines changed: 163 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -183,14 +183,14 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
183183
nvlist_t *nvroot, *newvd;
184184
pendingdev_t *device;
185185
uint64_t wholedisk = 0ULL;
186-
uint64_t offline = 0ULL;
186+
uint64_t offline = 0ULL, faulted = 0ULL;
187187
uint64_t guid = 0ULL;
188188
char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
189189
char rawpath[PATH_MAX], fullpath[PATH_MAX];
190190
char devpath[PATH_MAX];
191191
int ret;
192-
boolean_t is_dm = B_FALSE;
193192
boolean_t is_sd = B_FALSE;
193+
boolean_t is_mpath_wholedisk = B_FALSE;
194194
uint_t c;
195195
vdev_stat_t *vs;
196196

@@ -211,15 +211,73 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
211211
&enc_sysfs_path);
212212
(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
213213
(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
214+
(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted);
215+
214216
(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);
215217

216-
if (offline)
217-
return; /* don't intervene if it was taken offline */
218+
/*
219+
* Special case:
220+
*
221+
* We've seen times where a disk won't have a ZPOOL_CONFIG_PHYS_PATH
222+
* entry in their config. For example, on this force-faulted disk:
223+
*
224+
* children[0]:
225+
* type: 'disk'
226+
* id: 0
227+
* guid: 14309659774640089719
228+
* path: '/dev/disk/by-vdev/L28'
229+
* whole_disk: 0
230+
* DTL: 654
231+
* create_txg: 4
232+
* com.delphix:vdev_zap_leaf: 1161
233+
* faulted: 1
234+
* aux_state: 'external'
235+
* children[1]:
236+
* type: 'disk'
237+
* id: 1
238+
* guid: 16002508084177980912
239+
* path: '/dev/disk/by-vdev/L29'
240+
* devid: 'dm-uuid-mpath-35000c500a61d68a3'
241+
* phys_path: 'L29'
242+
* vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
243+
* whole_disk: 0
244+
* DTL: 1028
245+
* create_txg: 4
246+
* com.delphix:vdev_zap_leaf: 131
247+
*
248+
* If the disk's path is a /dev/disk/by-vdev/ path, then we can infer
249+
* the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name.
250+
*/
251+
if (physpath == NULL && path != NULL) {
252+
/* If path begins with "/dev/disk/by-vdev/" ... */
253+
if (strncmp(path, DEV_BYVDEV_PATH,
254+
strlen(DEV_BYVDEV_PATH)) == 0) {
255+
/* Set physpath to the char after "/dev/disk/by-vdev" */
256+
physpath = &path[strlen(DEV_BYVDEV_PATH)];
257+
}
258+
}
259+
260+
/*
261+
* We don't want to autoreplace offlined disks. However, we do want to
262+
* replace force-faulted disks (`zpool offline -f`). Force-faulted
263+
* disks have both offline=1 and faulted=1 in the nvlist.
264+
*/
265+
if (offline && !faulted) {
266+
zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace",
267+
__func__, path);
268+
return;
269+
}
218270

219-
is_dm = zfs_dev_is_dm(path);
271+
is_mpath_wholedisk = is_mpath_whole_disk(path);
220272
zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
221-
" wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path,
222-
physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not",
273+
" %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', "
274+
"(guid %llu)",
275+
zpool_get_name(zhp), path,
276+
physpath ? physpath : "NULL",
277+
wholedisk ? "is" : "not",
278+
is_mpath_wholedisk? "is" : "not",
279+
labeled ? "is" : "not",
280+
enc_sysfs_path,
223281
(long long unsigned int)guid);
224282

225283
/*
@@ -253,8 +311,9 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
253311
ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
254312
(newstate == VDEV_STATE_HEALTHY ||
255313
newstate == VDEV_STATE_DEGRADED)) {
256-
zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s",
257-
fullpath, (newstate == VDEV_STATE_HEALTHY) ?
314+
zed_log_msg(LOG_INFO,
315+
" zpool_vdev_online: vdev '%s' ('%s') is "
316+
"%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ?
258317
"HEALTHY" : "DEGRADED");
259318
return;
260319
}
@@ -271,11 +330,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
271330
* vdev online to trigger a FMA fault by posting an ereport.
272331
*/
273332
if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
274-
!(wholedisk || is_dm) || (physpath == NULL)) {
333+
!(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) {
275334
(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
276335
&newstate);
277336
zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
278-
"not a whole disk for '%s'", fullpath);
337+
"not a blank disk for '%s' ('%s')", fullpath,
338+
physpath);
279339
return;
280340
}
281341

@@ -287,7 +347,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
287347
(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
288348
is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
289349

290-
if (realpath(rawpath, devpath) == NULL && !is_dm) {
350+
if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
291351
zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
292352
rawpath, strerror(errno));
293353

@@ -303,12 +363,14 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
303363
if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
304364
(vs->vs_state != VDEV_STATE_FAULTED) &&
305365
(vs->vs_state != VDEV_STATE_CANT_OPEN)) {
366+
zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in "
367+
"a bad state (currently %d)", vs->vs_state);
306368
return;
307369
}
308370

309371
nvlist_lookup_string(vdev, "new_devid", &new_devid);
310372

311-
if (is_dm) {
373+
if (is_mpath_wholedisk) {
312374
/* Don't label device mapper or multipath disks. */
313375
} else if (!labeled) {
314376
/*
@@ -522,8 +584,11 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
522584
* the dp->dd_compare value.
523585
*/
524586
if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
525-
strcmp(dp->dd_compare, path) != 0)
587+
strcmp(dp->dd_compare, path) != 0) {
588+
zed_log_msg(LOG_INFO, " %s: no match (%s != vdev %s)",
589+
__func__, dp->dd_compare, path);
526590
return;
591+
}
527592

528593
zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s",
529594
dp->dd_prop, path);
@@ -571,6 +636,8 @@ zfs_iter_pool(zpool_handle_t *zhp, void *data)
571636
ZPOOL_CONFIG_VDEV_TREE, &nvl);
572637
zfs_iter_vdev(zhp, nvl, data);
573638
}
639+
} else {
640+
zed_log_msg(LOG_INFO, "%s: no config\n", __func__);
574641
}
575642

576643
/*
@@ -619,6 +686,72 @@ devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
619686
return (data.dd_found);
620687
}
621688

689+
/*
690+
* Given a device identifier, find any vdevs with a matching by-vdev
691+
* path. Normally we shouldn't need this as the comparison would be
692+
* made earlier in the devphys_iter(). For example, if we were replacing
693+
* /dev/disk/by-vdev/L28, normally devphys_iter() would match the
694+
* ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28"
695+
* of the new disk config. However, we've seen cases where
696+
* ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk. Here's
697+
* an example of a real 2-disk mirror pool where one disk was force
698+
* faulted:
699+
*
700+
* com.delphix:vdev_zap_top: 129
701+
* children[0]:
702+
* type: 'disk'
703+
* id: 0
704+
* guid: 14309659774640089719
705+
* path: '/dev/disk/by-vdev/L28'
706+
* whole_disk: 0
707+
* DTL: 654
708+
* create_txg: 4
709+
* com.delphix:vdev_zap_leaf: 1161
710+
* faulted: 1
711+
* aux_state: 'external'
712+
* children[1]:
713+
* type: 'disk'
714+
* id: 1
715+
* guid: 16002508084177980912
716+
* path: '/dev/disk/by-vdev/L29'
717+
* devid: 'dm-uuid-mpath-35000c500a61d68a3'
718+
* phys_path: 'L29'
719+
* vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
720+
* whole_disk: 0
721+
* DTL: 1028
722+
* create_txg: 4
723+
* com.delphix:vdev_zap_leaf: 131
724+
*
725+
* So in the case above, the only thing we could compare is the path.
726+
*
727+
* We can do this because we assume by-vdev paths are authoritative as physical
728+
* paths. We could not assume this for normal paths like /dev/sda since the
729+
* physical location /dev/sda points to could change over time.
730+
*/
731+
static boolean_t
732+
by_vdev_path_iter(const char *by_vdev_path, const char *devid,
733+
zfs_process_func_t func, boolean_t is_slice)
734+
{
735+
dev_data_t data = { 0 };
736+
737+
data.dd_compare = by_vdev_path;
738+
data.dd_func = func;
739+
data.dd_prop = ZPOOL_CONFIG_PATH;
740+
data.dd_found = B_FALSE;
741+
data.dd_islabeled = is_slice;
742+
data.dd_new_devid = devid;
743+
744+
if (strncmp(by_vdev_path, DEV_BYVDEV_PATH,
745+
strlen(DEV_BYVDEV_PATH)) != 0) {
746+
/* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */
747+
return (B_FALSE);
748+
}
749+
750+
(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
751+
752+
return (data.dd_found);
753+
}
754+
622755
/*
623756
* Given a device identifier, find any vdevs with a matching devid.
624757
* On Linux we can match devid directly which is always a whole disk.
@@ -683,15 +816,17 @@ guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid,
683816
static int
684817
zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
685818
{
686-
char *devpath = NULL, *devid;
819+
char *devpath = NULL, *devid = NULL;
687820
uint64_t pool_guid = 0, vdev_guid = 0;
688821
boolean_t is_slice;
689822

690823
/*
691824
* Expecting a devid string and an optional physical location and guid
692825
*/
693-
if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
826+
if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) {
827+
zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__);
694828
return (-1);
829+
}
695830

696831
(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
697832
(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
@@ -707,6 +842,8 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
707842
* 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
708843
* 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
709844
* 3. ZPOOL_CONFIG_GUID (identifies unique vdev).
845+
* 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since
846+
* by-vdev paths represent physical paths).
710847
*/
711848
if (devid_iter(devid, zfs_process_add, is_slice))
712849
return (0);
@@ -717,6 +854,16 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
717854
(void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add,
718855
is_slice);
719856

857+
if (devpath != NULL) {
858+
/* Can we match a /dev/disk/by-vdev/ path? */
859+
char by_vdev_path[MAXPATHLEN];
860+
snprintf(by_vdev_path, sizeof (by_vdev_path),
861+
"/dev/disk/by-vdev/%s", devpath);
862+
if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add,
863+
is_slice))
864+
return (0);
865+
}
866+
720867
return (0);
721868
}
722869

cmd/zed/zed_disk_event.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,11 @@ zed_udev_monitor(void *arg)
215215
if (type != NULL && type[0] != '\0' &&
216216
strcmp(type, "disk") == 0 &&
217217
part != NULL && part[0] != '\0') {
218+
zed_log_msg(LOG_INFO,
219+
"%s: skip %s since it has a %s partition already",
220+
__func__,
221+
udev_device_get_property_value(dev, "DEVNAME"),
222+
part);
218223
/* skip and wait for partition event */
219224
udev_device_unref(dev);
220225
continue;
@@ -229,6 +234,11 @@ zed_udev_monitor(void *arg)
229234
sectors = udev_device_get_sysattr_value(dev, "size");
230235
if (sectors != NULL &&
231236
strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
237+
zed_log_msg(LOG_INFO,
238+
"%s: %s sectors %s < %llu (minimum)",
239+
__func__,
240+
udev_device_get_property_value(dev, "DEVNAME"),
241+
sectors, MINIMUM_SECTORS);
232242
udev_device_unref(dev);
233243
continue;
234244
}

lib/libzutil/os/linux/zutil_device_path_os.c

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ zfs_dev_is_dm(const char *dev_name)
527527
boolean_t
528528
zfs_dev_is_whole_disk(const char *dev_name)
529529
{
530-
struct dk_gpt *label;
530+
struct dk_gpt *label = NULL;
531531
int fd;
532532

533533
if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0)
@@ -613,30 +613,39 @@ zfs_get_underlying_path(const char *dev_name)
613613
/*
614614
* A disk is considered a multipath whole disk when:
615615
* DEVNAME key value has "dm-"
616-
* DM_NAME key value has "mpath" prefix
616+
* MPATH_DEVICE_READY is present
617617
* DM_UUID key exists
618618
* ID_PART_TABLE_TYPE key does not exist or is not gpt
619+
* ID_FS_LABEL key does not exist (disk isn't labeled)
619620
*/
620621
static boolean_t
621-
udev_mpath_whole_disk(struct udev_device *dev)
622+
is_mpath_udev_sane(struct udev_device *dev)
622623
{
623-
const char *devname, *type, *uuid;
624+
const char *devname, *type, *uuid, *label, *mpath_ready;
624625

625626
devname = udev_device_get_property_value(dev, "DEVNAME");
626627
type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
627628
uuid = udev_device_get_property_value(dev, "DM_UUID");
629+
label = udev_device_get_property_value(dev, "ID_FS_LABEL");
630+
mpath_ready = udev_device_get_property_value(dev, "MPATH_DEVICE_READY");
628631

629632
if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
630633
((type == NULL) || (strcmp(type, "gpt") != 0)) &&
631-
(uuid != NULL)) {
634+
(uuid != NULL) &&
635+
(label == NULL) &&
636+
(mpath_ready != NULL && strncmp(mpath_ready, "1", 1) == 0)) {
632637
return (B_TRUE);
633638
}
634639

635640
return (B_FALSE);
636641
}
637642

638643
/*
639-
* Check if a disk is effectively a multipath whole disk
644+
* Check if a disk is a multipath "blank" disk:
645+
*
646+
* 1. The disk has udev values that suggest it's a multipath disk
647+
* 2. The disk is not currently labeled with a filesystem of any type
648+
* 3. There are no partitions on the disk
640649
*/
641650
boolean_t
642651
is_mpath_whole_disk(const char *path)
@@ -645,7 +654,6 @@ is_mpath_whole_disk(const char *path)
645654
struct udev_device *dev = NULL;
646655
char nodepath[MAXPATHLEN];
647656
char *sysname;
648-
boolean_t wholedisk = B_FALSE;
649657

650658
if (realpath(path, nodepath) == NULL)
651659
return (B_FALSE);
@@ -660,10 +668,11 @@ is_mpath_whole_disk(const char *path)
660668
return (B_FALSE);
661669
}
662670

663-
wholedisk = udev_mpath_whole_disk(dev);
664-
671+
/* Sanity check some udev values */
672+
boolean_t is_sane = is_mpath_udev_sane(dev);
665673
udev_device_unref(dev);
666-
return (wholedisk);
674+
675+
return (is_sane);
667676
}
668677

669678
#else /* HAVE_LIBUDEV */

0 commit comments

Comments
 (0)