Skip to content

Commit b61994d

Browse files
authored
add group relaunch limits (#1662)
* add group relaunch limits * add group relaunch limits unit test
1 parent 12dbc9b commit b61994d

File tree

3 files changed

+28
-1
lines changed

3 files changed

+28
-1
lines changed

dlrover/python/common/global_context.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,8 @@ class DefaultValues(object):
8181
MAX_CKPT_THRESHOLD = 900 # seconds
8282
MAX_AVG_STEPS = 50
8383
FIRST_GROUP_IDX = 1000 # group idx initial value for group relaunch
84-
MAX_RELAUNCH_COUNT = 3
84+
MAX_RELAUNCH_COUNT = 3 # maximum node relaunch count
85+
MAX_GROUP_RELAUNCH_COUNT = 3 # maximum node group relaunch count
8586

8687

8788
class Context(Singleton):
@@ -145,6 +146,7 @@ def __init__(self):
145146
# pre-check args
146147
self.pre_check_operators = DefaultValues.PRE_CHECK_OPS
147148
self.max_relaunch_count = DefaultValues.MAX_RELAUNCH_COUNT
149+
self.max_group_relaunch_count = DefaultValues.MAX_GROUP_RELAUNCH_COUNT
148150

149151
def set_params_from_brain(self):
150152
self.train_speed_record_num = self.get_param_value_from_brain(

dlrover/python/master/node/dist_job_manager.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,8 @@ def __init__(
195195
self._scaler: Scaler = job_scaler
196196
self._init_training_node_manager()
197197
self._relaunched_groups: List[int] = []
198+
self._group_relaunch_count = 0
199+
self._max_group_relaunch_count = _dlrover_context.max_relaunch_count
198200

199201
def start(self):
200202
self._scaler.start()
@@ -917,6 +919,13 @@ def _should_relaunch_node_group(self, node_group: int) -> bool:
917919
f"{self._enable_relaunch_node}, {node_check}, {job_ctx.get_job_stage()}"
918920
)
919921

922+
if self._group_relaunch_count > self._max_group_relaunch_count:
923+
logger.info(
924+
f"Node group {node_group} has exceeded max relaunch count: "
925+
f"{self._group_relaunch_count}/{self._max_group_relaunch_count}"
926+
)
927+
return False
928+
920929
return should_relaunch
921930

922931
def _should_relaunch(
@@ -1089,6 +1098,7 @@ def _relaunch_node_group(self, node_group: int):
10891098

10901099
self._relaunched_groups.append(node_group)
10911100
self._scaler.scale(plan)
1101+
self._group_relaunch_count += 1
10921102
return plan
10931103

10941104
def clear_exited_nodes(self):

dlrover/python/tests/test_job_manager.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,21 @@ def test_relaunch_node_group(self):
356356
manager._init_nodes()
357357
manager._scaler.scale = mock.MagicMock(return_value=None)
358358

359+
manager._max_group_relaunch_count = -1
360+
self.job_context.clear_job_node_groups()
361+
node = Node(
362+
NodeType.WORKER,
363+
0,
364+
rank_index=0,
365+
status=NodeStatus.PENDING,
366+
node_group=0,
367+
node_group_size=1,
368+
relaunchable=True,
369+
)
370+
self.job_context.update_job_node_by_group(node)
371+
self.assertFalse(manager._should_relaunch_node_group(0))
372+
manager._max_group_relaunch_count = 3
373+
359374
self.job_context.clear_job_node_groups()
360375
node = Node(
361376
NodeType.WORKER,

0 commit comments

Comments
 (0)