
Commit 235cee6

Fix node failure reason reporting. (#1665)
* Add pkill when timeout in worker stopping. (#1659)
* add pkill when stopping worker timeout
* debug ut
* fix ut
* fix
* lint
* lint
1 parent 67e98b0 commit 235cee6
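The pkill fallback referenced in the commit message comes from PR #1659 and is not part of this diff. As a rough illustration only, a stop-with-timeout helper along those lines might look like the sketch below; the helper name, the 30-second timeout, and the pattern argument are assumptions, not dlrover's actual code.

# Rough sketch only: stop a worker gracefully, fall back to pkill on timeout.
# Names and values here are illustrative assumptions, not dlrover's implementation.
import subprocess


def stop_worker_process(proc: subprocess.Popen, pattern: str, timeout: int = 30) -> None:
    """Ask the worker to exit; if it does not stop in time, force-kill by name."""
    proc.terminate()  # graceful stop first
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        # Timed out: pkill any process whose command line matches the pattern,
        # so no orphaned training processes are left behind.
        subprocess.run(["pkill", "-9", "-f", pattern], check=False)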

File tree

4 files changed: +15 -3 lines changed

dlrover/python/common/constants.py
dlrover/python/elastic_agent/torch/training.py
dlrover/python/master/node/dist_job_manager.py
dlrover/python/tests/test_job_manager.py

dlrover/python/common/constants.py

Lines changed: 1 addition & 0 deletions
@@ -193,6 +193,7 @@ class NodeExitReason(object):
     NO_HEARTBEAT = "NoHeartBeat"
     DIAG_FAIL = "DiagnosticFailure"
     RELAUNCHED = "Relaunched"
+    CHECK_FAIL = "CheckFailure"
 
 
 class NodeExitDescription(object):

dlrover/python/elastic_agent/torch/training.py

Lines changed: 2 additions & 1 deletion
@@ -80,6 +80,7 @@
     TrainingExceptionLevel,
     EventReportConstants,
     ScriptPath,
+    NodeExitReason,
 )
 from dlrover.python.common.error import ProcessError
 from dlrover.python.common.log import default_logger as logger
@@ -1732,7 +1733,7 @@ def run(self, role: str = DEFAULT_ROLE) -> bool:
 
         if self._node_rank in fault_nodes:
             self._client.report_failures(
-                NodeEventType.NODE_CHECK_FAILED,
+                NodeExitReason.CHECK_FAIL,
                 level=TrainingExceptionLevel.NODE_ERROR,
             )
             raise NodeCheckFailedError(NodeExitDescription.NODE_FAILED_MSG)

dlrover/python/master/node/dist_job_manager.py

Lines changed: 8 additions & 1 deletion
@@ -1361,7 +1361,14 @@ def handle_training_failure(
     ):
         """Process the training failure reported by the node."""
         node = self._job_context.job_node(node_type, node_id)
-        logger.info(f"Handle failed node: {node}")
+
+        if error_data:
+            # self detected reason override the reason from k8s pod
+            node.set_exit_reason(error_data)
+        else:
+            # inherit the reason from k8s pod
+            error_data = node.exit_reason
+        logger.info(f"Handle failed node: {node} with reason: {error_data}")
         if node.is_released:
             return
         relaunch_node = self._process_error(
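The new branch above is the core of the fix: a reason reported by the agent (error_data, e.g. NodeExitReason.CHECK_FAIL from the agent-side change) takes precedence over the exit reason taken from the k8s pod, and the pod reason is only inherited when the agent reported nothing. A minimal standalone sketch of that precedence, using a hypothetical _NodeStub in place of dlrover's Node class, is:

# Sketch of the reason-override precedence from handle_training_failure.
# _NodeStub is a hypothetical stand-in for dlrover's Node object; only the
# exit_reason attribute and set_exit_reason method seen in the diff are assumed.
class _NodeStub:
    def __init__(self, exit_reason=""):
        self.exit_reason = exit_reason

    def set_exit_reason(self, reason):
        self.exit_reason = reason


def resolve_exit_reason(node, error_data):
    if error_data:
        # A self-detected reason reported by the agent overrides the k8s pod reason.
        node.set_exit_reason(error_data)
    else:
        # Otherwise inherit whatever reason the k8s pod already carries.
        error_data = node.exit_reason
    return error_data


# Example: the agent-reported "CheckFailure" wins over a pod-level "OOMKilled";
# with no agent report, the pod reason is kept.
assert resolve_exit_reason(_NodeStub("OOMKilled"), "CheckFailure") == "CheckFailure"
assert resolve_exit_reason(_NodeStub("OOMKilled"), "") == "OOMKilled"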

dlrover/python/tests/test_job_manager.py

Lines changed: 4 additions & 1 deletion
@@ -328,7 +328,10 @@ def test_relaunch_node(self):
         )
         self.assertEqual(self.job_context.get_failed_node_cnt(), 1)
         manager.handle_training_failure(
-            NodeType.WORKER, 1, level=TrainingExceptionLevel.NODE_ERROR
+            NodeType.WORKER,
+            1,
+            level=TrainingExceptionLevel.NODE_ERROR,
+            error_data="test_reason",
         )
         self.assertEqual(self.job_context.get_failed_node_cnt(), 2)
 