
Commit 235cee6

Fix node failure reason reporting. (#1665)
* Add pkill when timeout in worker stopping. (#1659)
* add pkill when stopping worker timeout
* debug ut
* fix ut
* fix
* lint
* lint
1 parent 67e98b0 commit 235cee6
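The pkill fallback referenced in the commit message comes from PR #1659 and is not part of this diff. As a rough illustration only, a stop-with-timeout helper along those lines might look like the sketch below; the helper name, the 30-second timeout, and the pattern argument are assumptions, not dlrover's actual code.

# Rough sketch only: stop a worker gracefully, fall back to pkill on timeout.
# Names and values here are illustrative assumptions, not dlrover's implementation.
import subprocess


def stop_worker_process(proc: subprocess.Popen, pattern: str, timeout: int = 30) -> None:
    """Ask the worker to exit; if it does not stop in time, force-kill by name."""
    proc.terminate()  # graceful stop first
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        # Timed out: pkill any process whose command line matches the pattern,
        # so no orphaned training processes are left behind.
        subprocess.run(["pkill", "-9", "-f", pattern], check=False)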

File tree

4 files changed: +15 -3 lines changed

dlrover/python/common/constants.py
dlrover/python/elastic_agent/torch/training.py
dlrover/python/master/node/dist_job_manager.py
dlrover/python/tests/test_job_manager.py

dlrover/python/common/constants.py

Lines changed: 1 addition & 0 deletions
@@ -193,6 +193,7 @@ class NodeExitReason(object):
     NO_HEARTBEAT = "NoHeartBeat"
     DIAG_FAIL = "DiagnosticFailure"
     RELAUNCHED = "Relaunched"
+    CHECK_FAIL = "CheckFailure"
 
 
 class NodeExitDescription(object):

dlrover/python/elastic_agent/torch/training.py

Lines changed: 2 additions & 1 deletion
@@ -80,6 +80,7 @@
     TrainingExceptionLevel,
     EventReportConstants,
     ScriptPath,
+    NodeExitReason,
 )
 from dlrover.python.common.error import ProcessError
 from dlrover.python.common.log import default_logger as logger
@@ -1732,7 +1733,7 @@ def run(self, role: str = DEFAULT_ROLE) -> bool:
 
         if self._node_rank in fault_nodes:
             self._client.report_failures(
-                NodeEventType.NODE_CHECK_FAILED,
+                NodeExitReason.CHECK_FAIL,
                 level=TrainingExceptionLevel.NODE_ERROR,
             )
             raise NodeCheckFailedError(NodeExitDescription.NODE_FAILED_MSG)

dlrover/python/master/node/dist_job_manager.py

Lines changed: 8 additions & 1 deletion
@@ -1361,7 +1361,14 @@ def handle_training_failure(
     ):
         """Process the training failure reported by the node."""
         node = self._job_context.job_node(node_type, node_id)
-        logger.info(f"Handle failed node: {node}")
+
+        if error_data:
+            # self detected reason override the reason from k8s pod
+            node.set_exit_reason(error_data)
+        else:
+            # inherit the reason from k8s pod
+            error_data = node.exit_reason
+        logger.info(f"Handle failed node: {node} with reason: {error_data}")
         if node.is_released:
             return
         relaunch_node = self._process_error(
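The new branch above is the core of the fix: a reason reported by the agent (error_data, e.g. NodeExitReason.CHECK_FAIL from the agent-side change) takes precedence over the exit reason taken from the k8s pod, and the pod reason is only inherited when the agent reported nothing. A minimal standalone sketch of that precedence, using a hypothetical _NodeStub in place of dlrover's Node class, is:

# Sketch of the reason-override precedence from handle_training_failure.
# _NodeStub is a hypothetical stand-in for dlrover's Node object; only the
# exit_reason attribute and set_exit_reason method seen in the diff are assumed.
class _NodeStub:
    def __init__(self, exit_reason=""):
        self.exit_reason = exit_reason

    def set_exit_reason(self, reason):
        self.exit_reason = reason


def resolve_exit_reason(node, error_data):
    if error_data:
        # A self-detected reason reported by the agent overrides the k8s pod reason.
        node.set_exit_reason(error_data)
    else:
        # Otherwise inherit whatever reason the k8s pod already carries.
        error_data = node.exit_reason
    return error_data


# Example: the agent-reported "CheckFailure" wins over a pod-level "OOMKilled";
# with no agent report, the pod reason is kept.
assert resolve_exit_reason(_NodeStub("OOMKilled"), "CheckFailure") == "CheckFailure"
assert resolve_exit_reason(_NodeStub("OOMKilled"), "") == "OOMKilled"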

dlrover/python/tests/test_job_manager.py

Lines changed: 4 additions & 1 deletion
@@ -328,7 +328,10 @@ def test_relaunch_node(self):
         )
         self.assertEqual(self.job_context.get_failed_node_cnt(), 1)
         manager.handle_training_failure(
-            NodeType.WORKER, 1, level=TrainingExceptionLevel.NODE_ERROR
+            NodeType.WORKER,
+            1,
+            level=TrainingExceptionLevel.NODE_ERROR,
+            error_data="test_reason",
         )
         self.assertEqual(self.job_context.get_failed_node_cnt(), 2)
 