@@ -537,3 +537,169 @@ def test_dcgm_connectivity_restored(self):
537537 assert restored_event .recommendedAction == platformconnector_pb2 .NONE
538538
539539 server .stop (0 )
540+
541+ def test_event_retry_and_cache_cleanup_when_platform_connector_down (self ):
542+ """Test when platform connector goes down and comes back up."""
543+ import tempfile
544+ import os
545+
546+ original_max_retries = platform_connector .MAX_RETRIES
547+ original_initial_delay = platform_connector .INITIAL_DELAY
548+ platform_connector .MAX_RETRIES = 3
549+ platform_connector .INITIAL_DELAY = 1
550+
551+ with tempfile .NamedTemporaryFile (mode = "w" , delete = False , suffix = "_test_state" ) as f :
552+ f .write ("test_boot_id" )
553+ state_file_path = f .name
554+
555+ try :
556+ watcher = dcgm .DCGMWatcher (
557+ addr = "localhost:5555" ,
558+ poll_interval_seconds = 10 ,
559+ callbacks = [],
560+ dcgm_k8s_service_enabled = False ,
561+ )
562+ gpu_serials = {0 : "1650924060039" }
563+ exit = Event ()
564+
565+ dcgm_errors_info_dict = {"DCGM_FR_CORRUPT_INFOROM" : "COMPONENT_RESET" }
566+ dcgm_health_conditions_categorization_mapping_config = {
567+ "DCGM_HEALTH_WATCH_INFOROM" : "Fatal" ,
568+ "DCGM_HEALTH_WATCH_THERMAL" : "NonFatal" ,
569+ "DCGM_HEALTH_WATCH_POWER" : "NonFatal" ,
570+ "DCGM_HEALTH_WATCH_NVLINK" : "Fatal" ,
571+ "DCGM_HEALTH_WATCH_SM" : "Fatal" ,
572+ "DCGM_HEALTH_WATCH_MEM" : "Fatal" ,
573+ "DCGM_HEALTH_WATCH_MCU" : "Fatal" ,
574+ "DCGM_HEALTH_WATCH_DRIVER" : "Fatal" ,
575+ "DCGM_HEALTH_WATCH_NVSWITCH_FATAL" : "Fatal" ,
576+ "DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL" : "NonFatal" ,
577+ "DCGM_HEALTH_WATCH_PCIE" : "Fatal" ,
578+ "DCGM_HEALTH_WATCH_PMU" : "Fatal" ,
579+ "DCGM_HEALTH_WATCH_CPUSET" : "NonFatal" ,
580+ "DCGM_HEALTH_WATCH_NVSWITCH" : "Fatal" ,
581+ }
582+
583+ platform_connector_processor = platform_connector .PlatformConnectorEventProcessor (
584+ socket_path = socket_path ,
585+ node_name = node_name ,
586+ exit = exit ,
587+ dcgm_errors_info_dict = dcgm_errors_info_dict ,
588+ state_file_path = state_file_path ,
589+ dcgm_health_conditions_categorization_mapping_config = dcgm_health_conditions_categorization_mapping_config ,
590+ )
591+
592+ # Verify cache is empty initially
593+ assert len (platform_connector_processor .entity_cache ) == 0
594+
595+ # Test 1: DCGM Connectivity Failure (system-level event, no entities)
596+ # This tests the cache cleanup path: if checkName == "GpuDcgmConnectivityFailure"
597+ platform_connector_processor .dcgm_connectivity_failed ()
598+ dcgm_failure_cache_key = platform_connector_processor ._build_cache_key (
599+ "GpuDcgmConnectivityFailure" , "DCGM" , "ALL"
600+ )
601+ assert (
602+ dcgm_failure_cache_key not in platform_connector_processor .entity_cache
603+ ), "DCGM failure cache entry should be removed after send failure"
604+
605+ # Test 2: DCGM Connectivity Restored
606+ timestamp = Timestamp ()
607+ timestamp .GetCurrentTime ()
608+ platform_connector_processor .clear_dcgm_connectivity_failure (timestamp )
609+ assert (
610+ dcgm_failure_cache_key not in platform_connector_processor .entity_cache
611+ ), "DCGM restored cache entry should be removed after send failure"
612+
613+ # Test 3: GPU Health Event (entity-specific event)
614+ # This tests the cache cleanup path: elif len(entitiesImpacted) > 0
615+ dcgm_health_events = watcher ._get_health_status_dict ()
616+ dcgm_health_events ["DCGM_HEALTH_WATCH_INFOROM" ] = dcgmtypes .HealthDetails (
617+ status = dcgmtypes .HealthStatus .FAIL ,
618+ entity_failures = {
619+ 0 : dcgm .types .ErrorDetails (
620+ code = "DCGM_FR_CORRUPT_INFOROM" ,
621+ message = "A corrupt InfoROM has been detected in GPU 0." ,
622+ )
623+ },
624+ )
625+ gpu_ids = [0 ]
626+ platform_connector_processor .health_event_occurred (dcgm_health_events , gpu_ids , gpu_serials )
627+ gpu_health_cache_key = platform_connector_processor ._build_cache_key ("GpuInforomWatch" , "GPU" , "0" )
628+ assert (
629+ gpu_health_cache_key not in platform_connector_processor .entity_cache
630+ ), "GPU health cache entry should be removed after send failure"
631+
632+ # Platform connector comes UP - Start server
633+ healthEventProcessor = PlatformConnectorServicer ()
634+ server = grpc .server (futures .ThreadPoolExecutor (max_workers = 10 ))
635+ platformconnector_pb2_grpc .add_PlatformConnectorServicer_to_server (healthEventProcessor , server )
636+ server .add_insecure_port (f"unix://{ socket_path } " )
637+ server .start ()
638+
639+ # Retry DCGM Connectivity Failure
640+ platform_connector_processor .dcgm_connectivity_failed ()
641+ time .sleep (1 )
642+ assert (
643+ dcgm_failure_cache_key in platform_connector_processor .entity_cache
644+ ), "DCGM failure cache entry should be present after successful send"
645+ assert platform_connector_processor .entity_cache [dcgm_failure_cache_key ].isFatal == True
646+ assert platform_connector_processor .entity_cache [dcgm_failure_cache_key ].isHealthy == False
647+
648+ # Verify DCGM failure event was sent
649+ health_events = healthEventProcessor .health_events
650+ dcgm_failure_event = None
651+ for event in health_events :
652+ if event .checkName == "GpuDcgmConnectivityFailure" and not event .isHealthy :
653+ dcgm_failure_event = event
654+ break
655+ assert dcgm_failure_event is not None , "DCGM failure event should be sent"
656+ assert dcgm_failure_event .isFatal == True
657+ assert dcgm_failure_event .errorCode == ["DCGM_CONNECTIVITY_ERROR" ]
658+ assert dcgm_failure_event .entitiesImpacted == []
659+
660+ # Retry DCGM Connectivity Restored
661+ timestamp = Timestamp ()
662+ timestamp .GetCurrentTime ()
663+ platform_connector_processor .clear_dcgm_connectivity_failure (timestamp )
664+ time .sleep (1 )
665+ assert (
666+ platform_connector_processor .entity_cache [dcgm_failure_cache_key ].isHealthy == True
667+ ), "DCGM failure cache entry should be updated to healthy"
668+
669+ # Verify DCGM restored event was sent
670+ health_events = healthEventProcessor .health_events
671+ dcgm_restored_event = None
672+ for event in health_events :
673+ if event .checkName == "GpuDcgmConnectivityFailure" and event .isHealthy :
674+ dcgm_restored_event = event
675+ break
676+ assert dcgm_restored_event is not None , "DCGM restored event should be sent"
677+ assert dcgm_restored_event .isFatal == False
678+ assert dcgm_restored_event .isHealthy == True
679+
680+ # Retry GPU Health Event
681+ platform_connector_processor .health_event_occurred (dcgm_health_events , gpu_ids , gpu_serials )
682+ time .sleep (1 )
683+ assert (
684+ gpu_health_cache_key in platform_connector_processor .entity_cache
685+ ), "GPU health cache entry should be present after successful send"
686+ assert platform_connector_processor .entity_cache [gpu_health_cache_key ].isFatal == True
687+ assert platform_connector_processor .entity_cache [gpu_health_cache_key ].isHealthy == False
688+
689+ # Verify GPU health event was sent
690+ health_events = healthEventProcessor .health_events
691+ gpu_health_event = None
692+ for event in health_events :
693+ if event .checkName == "GpuInforomWatch" and not event .isHealthy :
694+ gpu_health_event = event
695+ break
696+ assert gpu_health_event is not None , "GPU health event should be sent"
697+ assert gpu_health_event .errorCode [0 ] == "DCGM_FR_CORRUPT_INFOROM"
698+ assert gpu_health_event .entitiesImpacted [0 ].entityValue == "0"
699+
700+ server .stop (0 )
701+ finally :
702+ platform_connector .MAX_RETRIES = original_max_retries
703+ platform_connector .INITIAL_DELAY = original_initial_delay
704+ if os .path .exists (state_file_path ):
705+ os .unlink (state_file_path )
0 commit comments