Skip to content

Commit 342c083

Browse files
authored
fix: health events are not sent for dcgm connectivity failures (#253)
Signed-off-by: Nitin Jain <[email protected]>
1 parent efaf95b commit 342c083

File tree

2 files changed

+178
-2
lines changed

2 files changed

+178
-2
lines changed

health-monitors/gpu-health-monitor/gpu_health_monitor/platform_connector/platform_connector.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,10 +278,20 @@ def send_health_event_with_retries(self, health_events: list[platformconnector_p
278278
metrics.health_events_insertion_to_uds_error.set(1.0)
279279
# Remove failed health events from entity cache
280280
for health_event in health_events:
281-
for entity in health_event.entitiesImpacted:
282-
cache_key = self._build_cache_key(health_event.checkName, entity.entityType, entity.entityValue)
281+
if health_event.checkName == "GpuDcgmConnectivityFailure":
282+
# Explicitly handle DCGM connectivity events
283+
cache_key = self._build_cache_key(health_event.checkName, "DCGM", "ALL")
283284
if cache_key in self.entity_cache:
284285
del self.entity_cache[cache_key]
286+
log.info(f"Removed DCGM connectivity event from cache after send failure: {cache_key}")
287+
elif len(health_event.entitiesImpacted) > 0:
288+
for entity in health_event.entitiesImpacted:
289+
cache_key = self._build_cache_key(health_event.checkName, entity.entityType, entity.entityValue)
290+
if cache_key in self.entity_cache:
291+
del self.entity_cache[cache_key]
292+
else:
293+
# Ideally should not come here, but just in case added a warning.
294+
log.warning(f"Unknown system-level event with empty entities detected: {health_event.checkName}")
285295
return False
286296

287297
def dcgm_connectivity_failed(self):

health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_platform_connector/test_platform_connector.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,3 +537,169 @@ def test_dcgm_connectivity_restored(self):
537537
assert restored_event.recommendedAction == platformconnector_pb2.NONE
538538

539539
server.stop(0)
540+
541+
def test_event_retry_and_cache_cleanup_when_platform_connector_down(self):
542+
"""Test when platform connector goes down and comes back up."""
543+
import tempfile
544+
import os
545+
546+
original_max_retries = platform_connector.MAX_RETRIES
547+
original_initial_delay = platform_connector.INITIAL_DELAY
548+
platform_connector.MAX_RETRIES = 3
549+
platform_connector.INITIAL_DELAY = 1
550+
551+
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="_test_state") as f:
552+
f.write("test_boot_id")
553+
state_file_path = f.name
554+
555+
try:
556+
watcher = dcgm.DCGMWatcher(
557+
addr="localhost:5555",
558+
poll_interval_seconds=10,
559+
callbacks=[],
560+
dcgm_k8s_service_enabled=False,
561+
)
562+
gpu_serials = {0: "1650924060039"}
563+
exit = Event()
564+
565+
dcgm_errors_info_dict = {"DCGM_FR_CORRUPT_INFOROM": "COMPONENT_RESET"}
566+
dcgm_health_conditions_categorization_mapping_config = {
567+
"DCGM_HEALTH_WATCH_INFOROM": "Fatal",
568+
"DCGM_HEALTH_WATCH_THERMAL": "NonFatal",
569+
"DCGM_HEALTH_WATCH_POWER": "NonFatal",
570+
"DCGM_HEALTH_WATCH_NVLINK": "Fatal",
571+
"DCGM_HEALTH_WATCH_SM": "Fatal",
572+
"DCGM_HEALTH_WATCH_MEM": "Fatal",
573+
"DCGM_HEALTH_WATCH_MCU": "Fatal",
574+
"DCGM_HEALTH_WATCH_DRIVER": "Fatal",
575+
"DCGM_HEALTH_WATCH_NVSWITCH_FATAL": "Fatal",
576+
"DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL": "NonFatal",
577+
"DCGM_HEALTH_WATCH_PCIE": "Fatal",
578+
"DCGM_HEALTH_WATCH_PMU": "Fatal",
579+
"DCGM_HEALTH_WATCH_CPUSET": "NonFatal",
580+
"DCGM_HEALTH_WATCH_NVSWITCH": "Fatal",
581+
}
582+
583+
platform_connector_processor = platform_connector.PlatformConnectorEventProcessor(
584+
socket_path=socket_path,
585+
node_name=node_name,
586+
exit=exit,
587+
dcgm_errors_info_dict=dcgm_errors_info_dict,
588+
state_file_path=state_file_path,
589+
dcgm_health_conditions_categorization_mapping_config=dcgm_health_conditions_categorization_mapping_config,
590+
)
591+
592+
# Verify cache is empty initially
593+
assert len(platform_connector_processor.entity_cache) == 0
594+
595+
# Test 1: DCGM Connectivity Failure (system-level event, no entities)
596+
# This tests the cache cleanup path: if checkName == "GpuDcgmConnectivityFailure"
597+
platform_connector_processor.dcgm_connectivity_failed()
598+
dcgm_failure_cache_key = platform_connector_processor._build_cache_key(
599+
"GpuDcgmConnectivityFailure", "DCGM", "ALL"
600+
)
601+
assert (
602+
dcgm_failure_cache_key not in platform_connector_processor.entity_cache
603+
), "DCGM failure cache entry should be removed after send failure"
604+
605+
# Test 2: DCGM Connectivity Restored
606+
timestamp = Timestamp()
607+
timestamp.GetCurrentTime()
608+
platform_connector_processor.clear_dcgm_connectivity_failure(timestamp)
609+
assert (
610+
dcgm_failure_cache_key not in platform_connector_processor.entity_cache
611+
), "DCGM restored cache entry should be removed after send failure"
612+
613+
# Test 3: GPU Health Event (entity-specific event)
614+
# This tests the cache cleanup path: elif len(entitiesImpacted) > 0
615+
dcgm_health_events = watcher._get_health_status_dict()
616+
dcgm_health_events["DCGM_HEALTH_WATCH_INFOROM"] = dcgmtypes.HealthDetails(
617+
status=dcgmtypes.HealthStatus.FAIL,
618+
entity_failures={
619+
0: dcgm.types.ErrorDetails(
620+
code="DCGM_FR_CORRUPT_INFOROM",
621+
message="A corrupt InfoROM has been detected in GPU 0.",
622+
)
623+
},
624+
)
625+
gpu_ids = [0]
626+
platform_connector_processor.health_event_occurred(dcgm_health_events, gpu_ids, gpu_serials)
627+
gpu_health_cache_key = platform_connector_processor._build_cache_key("GpuInforomWatch", "GPU", "0")
628+
assert (
629+
gpu_health_cache_key not in platform_connector_processor.entity_cache
630+
), "GPU health cache entry should be removed after send failure"
631+
632+
# Platform connector comes UP - Start server
633+
healthEventProcessor = PlatformConnectorServicer()
634+
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
635+
platformconnector_pb2_grpc.add_PlatformConnectorServicer_to_server(healthEventProcessor, server)
636+
server.add_insecure_port(f"unix://{socket_path}")
637+
server.start()
638+
639+
# Retry DCGM Connectivity Failure
640+
platform_connector_processor.dcgm_connectivity_failed()
641+
time.sleep(1)
642+
assert (
643+
dcgm_failure_cache_key in platform_connector_processor.entity_cache
644+
), "DCGM failure cache entry should be present after successful send"
645+
assert platform_connector_processor.entity_cache[dcgm_failure_cache_key].isFatal == True
646+
assert platform_connector_processor.entity_cache[dcgm_failure_cache_key].isHealthy == False
647+
648+
# Verify DCGM failure event was sent
649+
health_events = healthEventProcessor.health_events
650+
dcgm_failure_event = None
651+
for event in health_events:
652+
if event.checkName == "GpuDcgmConnectivityFailure" and not event.isHealthy:
653+
dcgm_failure_event = event
654+
break
655+
assert dcgm_failure_event is not None, "DCGM failure event should be sent"
656+
assert dcgm_failure_event.isFatal == True
657+
assert dcgm_failure_event.errorCode == ["DCGM_CONNECTIVITY_ERROR"]
658+
assert dcgm_failure_event.entitiesImpacted == []
659+
660+
# Retry DCGM Connectivity Restored
661+
timestamp = Timestamp()
662+
timestamp.GetCurrentTime()
663+
platform_connector_processor.clear_dcgm_connectivity_failure(timestamp)
664+
time.sleep(1)
665+
assert (
666+
platform_connector_processor.entity_cache[dcgm_failure_cache_key].isHealthy == True
667+
), "DCGM failure cache entry should be updated to healthy"
668+
669+
# Verify DCGM restored event was sent
670+
health_events = healthEventProcessor.health_events
671+
dcgm_restored_event = None
672+
for event in health_events:
673+
if event.checkName == "GpuDcgmConnectivityFailure" and event.isHealthy:
674+
dcgm_restored_event = event
675+
break
676+
assert dcgm_restored_event is not None, "DCGM restored event should be sent"
677+
assert dcgm_restored_event.isFatal == False
678+
assert dcgm_restored_event.isHealthy == True
679+
680+
# Retry GPU Health Event
681+
platform_connector_processor.health_event_occurred(dcgm_health_events, gpu_ids, gpu_serials)
682+
time.sleep(1)
683+
assert (
684+
gpu_health_cache_key in platform_connector_processor.entity_cache
685+
), "GPU health cache entry should be present after successful send"
686+
assert platform_connector_processor.entity_cache[gpu_health_cache_key].isFatal == True
687+
assert platform_connector_processor.entity_cache[gpu_health_cache_key].isHealthy == False
688+
689+
# Verify GPU health event was sent
690+
health_events = healthEventProcessor.health_events
691+
gpu_health_event = None
692+
for event in health_events:
693+
if event.checkName == "GpuInforomWatch" and not event.isHealthy:
694+
gpu_health_event = event
695+
break
696+
assert gpu_health_event is not None, "GPU health event should be sent"
697+
assert gpu_health_event.errorCode[0] == "DCGM_FR_CORRUPT_INFOROM"
698+
assert gpu_health_event.entitiesImpacted[0].entityValue == "0"
699+
700+
server.stop(0)
701+
finally:
702+
platform_connector.MAX_RETRIES = original_max_retries
703+
platform_connector.INITIAL_DELAY = original_initial_delay
704+
if os.path.exists(state_file_path):
705+
os.unlink(state_file_path)

0 commit comments

Comments
 (0)