From fd9f319af1ce08db48b84b8bfd0392f2e6028f37 Mon Sep 17 00:00:00 2001 From: Nitin Jain Date: Fri, 17 Oct 2025 23:31:55 +0530 Subject: [PATCH] fix: resource cleanup and state reset for dcgm handle after failures or connectivity issues Signed-off-by: Nitin Jain --- .../gpu_health_monitor/dcgm_watcher/dcgm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/dcgm.py b/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/dcgm.py index 5a1bce483..ab65586e4 100644 --- a/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/dcgm.py +++ b/health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/dcgm.py @@ -260,6 +260,7 @@ def _cleanup_dcgm_resources( dcgm_group = None if dcgm_handle: # Clean up the handle + dcgm_handle.Shutdown() del dcgm_handle except Exception as e: log.error(f"Error cleaning up DCGM handle: {e}") @@ -281,12 +282,17 @@ def start(self, fields_to_monitor: list[str], exit: Event) -> None: log.error(f"Error getting DCGM handle: {e}") self._fire_callback_funcs(types.CallbackInterface.dcgm_connectivity_failed.__name__, []) self._cleanup_dcgm_resources(dcgm_group, dcgm_handle) + dcgm_handle = None + dcgm_group = None + gpu_ids = [] + gpu_serials = {} else: log.debug("Running health check") health_status, connectivity_success = self._perform_health_check(dcgm_group) if not connectivity_success: log.warning("DCGM connectivity failure detected") + self._cleanup_dcgm_resources(dcgm_group, dcgm_handle) dcgm_handle = None dcgm_group = None gpu_ids = []