@@ -183,14 +183,14 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	nvlist_t *nvroot, *newvd;
 	pendingdev_t *device;
 	uint64_t wholedisk = 0ULL;
-	uint64_t offline = 0ULL;
+	uint64_t offline = 0ULL, faulted = 0ULL;
 	uint64_t guid = 0ULL;
 	char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
 	char rawpath[PATH_MAX], fullpath[PATH_MAX];
 	char devpath[PATH_MAX];
 	int ret;
-	boolean_t is_dm = B_FALSE;
 	boolean_t is_sd = B_FALSE;
+	boolean_t is_mpath_wholedisk = B_FALSE;
 	uint_t c;
 	vdev_stat_t *vs;
 
@@ -211,15 +211,73 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	    &enc_sysfs_path);
 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted);
+
 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);
 
-	if (offline)
-		return; /* don't intervene if it was taken offline */
+	/*
+	 * Special case:
+	 *
+	 * We've seen times where a disk won't have a ZPOOL_CONFIG_PHYS_PATH
+	 * entry in its config. For example, on this force-faulted disk:
+	 *
+	 *	children[0]:
+	 *	   type: 'disk'
+	 *	   id: 0
+	 *	   guid: 14309659774640089719
+	 *	   path: '/dev/disk/by-vdev/L28'
+	 *	   whole_disk: 0
+	 *	   DTL: 654
+	 *	   create_txg: 4
+	 *	   com.delphix:vdev_zap_leaf: 1161
+	 *	   faulted: 1
+	 *	   aux_state: 'external'
+	 *	children[1]:
+	 *	   type: 'disk'
+	 *	   id: 1
+	 *	   guid: 16002508084177980912
+	 *	   path: '/dev/disk/by-vdev/L29'
+	 *	   devid: 'dm-uuid-mpath-35000c500a61d68a3'
+	 *	   phys_path: 'L29'
+	 *	   vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
+	 *	   whole_disk: 0
+	 *	   DTL: 1028
+	 *	   create_txg: 4
+	 *	   com.delphix:vdev_zap_leaf: 131
+	 *
+	 * If the disk's path is a /dev/disk/by-vdev/ path, then we can infer
+	 * the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name.
+	 */
+	if (physpath == NULL && path != NULL) {
+		/* If path begins with "/dev/disk/by-vdev/" ... */
+		if (strncmp(path, DEV_BYVDEV_PATH,
+		    strlen(DEV_BYVDEV_PATH)) == 0) {
+			/* Set physpath to the char after "/dev/disk/by-vdev" */
+			physpath = &path[strlen(DEV_BYVDEV_PATH)];
+		}
+	}
+
+	/*
+	 * We don't want to autoreplace offlined disks.  However, we do want to
+	 * replace force-faulted disks (`zpool offline -f`).  Force-faulted
+	 * disks have both offline=1 and faulted=1 in the nvlist.
+	 */
+	if (offline && !faulted) {
+		zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace",
+		    __func__, path);
+		return;
+	}
 
-	is_dm = zfs_dev_is_dm(path);
+	is_mpath_wholedisk = is_mpath_whole_disk(path);
 	zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
-	    " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path,
-	    physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not",
+	    " %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', "
+	    "(guid %llu)",
+	    zpool_get_name(zhp), path,
+	    physpath ? physpath : "NULL",
+	    wholedisk ? "is" : "not",
+	    is_mpath_wholedisk ? "is" : "not",
+	    labeled ? "is" : "not",
+	    enc_sysfs_path,
 	    (long long unsigned int)guid);
 
 	/*
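The hunk above changes two behaviors: a missing ZPOOL_CONFIG_PHYS_PATH can now be inferred from a /dev/disk/by-vdev/ path, and administratively offlined disks are skipped while force-faulted disks (`zpool offline -f`, which carry both offline=1 and faulted=1) remain autoreplace candidates. The standalone C sketch below condenses both decisions for illustration only; it is not part of the patch, and it assumes DEV_BYVDEV_PATH expands to "/dev/disk/by-vdev/" as in libzutil.

#include <stdio.h>
#include <string.h>

#define	DEV_BYVDEV_PATH	"/dev/disk/by-vdev/"	/* assumed value */

/* Return the by-vdev name (e.g. "L28") if path is a by-vdev path, else NULL. */
static const char *
infer_physpath(const char *path)
{
	if (path != NULL && strncmp(path, DEV_BYVDEV_PATH,
	    strlen(DEV_BYVDEV_PATH)) == 0)
		return (path + strlen(DEV_BYVDEV_PATH));
	return (NULL);
}

/* Skip only offlined disks; force-faulted disks have both flags set. */
static int
skip_autoreplace(int offline, int faulted)
{
	return (offline && !faulted);
}

int
main(void)
{
	printf("inferred physpath: %s\n",
	    infer_physpath("/dev/disk/by-vdev/L28"));	/* prints "L28" */
	printf("offline=1 faulted=0 -> skip=%d\n", skip_autoreplace(1, 0));
	printf("offline=1 faulted=1 -> skip=%d\n", skip_autoreplace(1, 1));
	return (0);
}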
@@ -253,8 +311,9 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
 	    (newstate == VDEV_STATE_HEALTHY ||
 	    newstate == VDEV_STATE_DEGRADED)) {
-		zed_log_msg(LOG_INFO, "  zpool_vdev_online: vdev %s is %s",
-		    fullpath, (newstate == VDEV_STATE_HEALTHY) ?
+		zed_log_msg(LOG_INFO,
+		    "  zpool_vdev_online: vdev '%s' ('%s') is "
+		    "%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ?
 		    "HEALTHY" : "DEGRADED");
 		return;
 	}
@@ -271,11 +330,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	 * vdev online to trigger a FMA fault by posting an ereport.
 	 */
 	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
-	    !(wholedisk || is_dm) || (physpath == NULL)) {
+	    !(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) {
 		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
 		    &newstate);
 		zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
-		    "not a whole disk for '%s'", fullpath);
+		    "not a blank disk for '%s' ('%s')", fullpath,
+		    physpath);
 		return;
 	}
 
@@ -287,7 +347,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
 	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
 
-	if (realpath(rawpath, devpath) == NULL && !is_dm) {
+	if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
 		zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
 		    rawpath, strerror(errno));
 
@@ -303,12 +363,14 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 	if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
 	    (vs->vs_state != VDEV_STATE_FAULTED) &&
 	    (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
+		zed_log_msg(LOG_INFO, "  not autoreplacing since disk isn't in "
+		    "a bad state (currently %d)", vs->vs_state);
 		return;
 	}
 
 	nvlist_lookup_string(vdev, "new_devid", &new_devid);
 
-	if (is_dm) {
+	if (is_mpath_wholedisk) {
 		/* Don't label device mapper or multipath disks. */
 	} else if (!labeled) {
 		/*
@@ -522,8 +584,11 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
 	 * the dp->dd_compare value.
 	 */
 	if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
-	    strcmp(dp->dd_compare, path) != 0)
+	    strcmp(dp->dd_compare, path) != 0) {
+		zed_log_msg(LOG_INFO, "  %s: no match (%s != vdev %s)",
+		    __func__, dp->dd_compare, path);
 		return;
+	}
 
 	zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched %s on %s",
 	    dp->dd_prop, path);
@@ -571,6 +636,8 @@ zfs_iter_pool(zpool_handle_t *zhp, void *data)
 			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
 			zfs_iter_vdev(zhp, nvl, data);
 		}
+	} else {
+		zed_log_msg(LOG_INFO, "%s: no config\n", __func__);
 	}
 
 	/*
@@ -619,6 +686,72 @@ devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
 	return (data.dd_found);
 }
 
+/*
+ * Given a device identifier, find any vdevs with a matching by-vdev
+ * path.  Normally we shouldn't need this as the comparison would be
+ * made earlier in the devphys_iter().  For example, if we were replacing
+ * /dev/disk/by-vdev/L28, normally devphys_iter() would match the
+ * ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28"
+ * in the new disk config.  However, we've seen cases where
+ * ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk.  Here's
+ * an example of a real 2-disk mirror pool where one disk was force
+ * faulted:
+ *
+ *	com.delphix:vdev_zap_top: 129
+ *	    children[0]:
+ *	       type: 'disk'
+ *	       id: 0
+ *	       guid: 14309659774640089719
+ *	       path: '/dev/disk/by-vdev/L28'
+ *	       whole_disk: 0
+ *	       DTL: 654
+ *	       create_txg: 4
+ *	       com.delphix:vdev_zap_leaf: 1161
+ *	       faulted: 1
+ *	       aux_state: 'external'
+ *	    children[1]:
+ *	       type: 'disk'
+ *	       id: 1
+ *	       guid: 16002508084177980912
+ *	       path: '/dev/disk/by-vdev/L29'
+ *	       devid: 'dm-uuid-mpath-35000c500a61d68a3'
+ *	       phys_path: 'L29'
+ *	       vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
+ *	       whole_disk: 0
+ *	       DTL: 1028
+ *	       create_txg: 4
+ *	       com.delphix:vdev_zap_leaf: 131
+ *
+ * So in the case above, the only thing we could compare is the path.
+ *
+ * We can do this because we assume by-vdev paths are authoritative as physical
+ * paths.  We could not assume this for normal paths like /dev/sda since the
+ * physical location /dev/sda points to could change over time.
+ */
+static boolean_t
+by_vdev_path_iter(const char *by_vdev_path, const char *devid,
+    zfs_process_func_t func, boolean_t is_slice)
+{
+	dev_data_t data = { 0 };
+
+	data.dd_compare = by_vdev_path;
+	data.dd_func = func;
+	data.dd_prop = ZPOOL_CONFIG_PATH;
+	data.dd_found = B_FALSE;
+	data.dd_islabeled = is_slice;
+	data.dd_new_devid = devid;
+
+	if (strncmp(by_vdev_path, DEV_BYVDEV_PATH,
+	    strlen(DEV_BYVDEV_PATH)) != 0) {
+		/* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */
+		return (B_FALSE);
+	}
+
+	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
+
+	return (data.dd_found);
+}
+
 /*
  * Given a device identifier, find any vdevs with a matching devid.
  * On Linux we can match devid directly which is always a whole disk.
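by_vdev_path_iter() reuses the existing zfs_iter_pool()/zfs_iter_vdev() machinery: it sets dd_prop to ZPOOL_CONFIG_PATH, so a vdev matches only when its "path" nvlist entry compares equal to the candidate by-vdev path. The libnvpair sketch below (not part of the patch; compile with -lnvpair) shows that comparison in isolation against a hand-built vdev nvlist, using the literal "path" key that ZPOOL_CONFIG_PATH names.

#include <stdio.h>
#include <string.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *vdev = NULL;
	char *path = NULL;
	const char *by_vdev_path = "/dev/disk/by-vdev/L28";

	/* Build a stand-in vdev config with only a "path" entry. */
	if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0)
		return (1);
	if (nvlist_add_string(vdev, "path", "/dev/disk/by-vdev/L28") != 0) {
		nvlist_free(vdev);
		return (1);
	}

	/* Same test zfs_iter_vdev() applies when dd_prop is ZPOOL_CONFIG_PATH. */
	if (nvlist_lookup_string(vdev, "path", &path) == 0 &&
	    strcmp(by_vdev_path, path) == 0)
		printf("matched vdev by path: %s\n", path);

	nvlist_free(vdev);
	return (0);
}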
@@ -683,15 +816,17 @@ guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid,
 static int
 zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
 {
-	char *devpath = NULL, *devid;
+	char *devpath = NULL, *devid = NULL;
 	uint64_t pool_guid = 0, vdev_guid = 0;
 	boolean_t is_slice;
 
 	/*
 	 * Expecting a devid string and an optional physical location and guid
 	 */
-	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
+	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) {
+		zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__);
 		return (-1);
+	}
 
 	(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
 	(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
@@ -707,6 +842,8 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
 	 * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
 	 * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
 	 * 3. ZPOOL_CONFIG_GUID (identifies unique vdev).
+	 * 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since
+	 *    by-vdev paths represent physical paths).
 	 */
 	if (devid_iter(devid, zfs_process_add, is_slice))
 		return (0);
@@ -717,6 +854,16 @@ zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
 	(void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add,
 	    is_slice);
 
+	if (devpath != NULL) {
+		/* Can we match a /dev/disk/by-vdev/ path? */
+		char by_vdev_path[MAXPATHLEN];
+		snprintf(by_vdev_path, sizeof (by_vdev_path),
+		    "/dev/disk/by-vdev/%s", devpath);
+		if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add,
+		    is_slice))
+			return (0);
+	}
+
 	return (0);
 }
 
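The final hunk adds matching step 4: when the devid, phys_path, and GUID lookups fail to identify the vdev, zfs_deliver_add() builds "/dev/disk/by-vdev/<DEV_PHYS_PATH>" and hands it to by_vdev_path_iter(). The minimal sketch below shows only that path construction; it is illustrative, and the "L28" value stands in for a hypothetical DEV_PHYS_PATH event field.

#include <stdio.h>
#include <limits.h>

int
main(void)
{
	const char *devpath = "L28";	/* hypothetical DEV_PHYS_PATH value */
	char by_vdev_path[PATH_MAX];

	/* Same construction as the new fallback in zfs_deliver_add(). */
	(void) snprintf(by_vdev_path, sizeof (by_vdev_path),
	    "/dev/disk/by-vdev/%s", devpath);

	/* In zed, this string would now be passed to by_vdev_path_iter(). */
	printf("candidate by-vdev path: %s\n", by_vdev_path);
	return (0);
}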