Skip to content

Commit 1cc57eb

Browse files
authored
fix: resource cleanup and state reset for dcgm handle after failures … (#76)
Signed-off-by: Nitin Jain <[email protected]>
1 parent 0b05a55 commit 1cc57eb

File tree

1 file changed

+6
-0
lines changed
  • health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher

1 file changed

+6
-0
lines changed

health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/dcgm.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -260,6 +260,7 @@ def _cleanup_dcgm_resources(
260 260   dcgm_group = None
261 261   if dcgm_handle:
262 262   # Clean up the handle
    263 + dcgm_handle.Shutdown()
263 264   del dcgm_handle
264 265   except Exception as e:
265 266   log.error(f"Error cleaning up DCGM handle: {e}")
@@ -281,12 +282,17 @@ def start(self, fields_to_monitor: list[str], exit: Event) -> None:
281 282   log.error(f"Error getting DCGM handle: {e}")
282 283   self._fire_callback_funcs(types.CallbackInterface.dcgm_connectivity_failed.__name__, [])
283 284   self._cleanup_dcgm_resources(dcgm_group, dcgm_handle)
    285 + dcgm_handle = None
    286 + dcgm_group = None
    287 + gpu_ids = []
    288 + gpu_serials = {}
284 289   else:
285 290   log.debug("Running health check")
286 291   health_status, connectivity_success = self._perform_health_check(dcgm_group)
287 292
288 293   if not connectivity_success:
289 294   log.warning("DCGM connectivity failure detected")
    295 + self._cleanup_dcgm_resources(dcgm_group, dcgm_handle)
290 296   dcgm_handle = None
291 297   dcgm_group = None
292 298   gpu_ids = []

0 commit comments

Comments
 (0)