refactor: recovery after agent kill and rejoin

raresgaia123 · raresgaia123 · commit b7aa7cf19200 · 2025-12-11T12:50:28.000+02:00
When an agent that deployed workers is destroyed, the previous implementation skipped that agent (the one that had deployed the failed workers) if it was not found during the first iteration of recovery, and continued with the remaining agents. The code was refactored to remove this limitation, so when an agent fails and is restarted, it is now included in the recovery process. In addition, when an agent joins the controller, all existing job contexts are now updated with the details of that agent, so whenever an agent joins, every job context is able to use its resources.
diff --git a/infscale/controller/controller.py b/infscale/controller/controller.py
@@ -99,6 +99,11 @@ async def run(self):
 
         await self.apiserver.run()
 
+    def _manage_agent_in_job_ctx(self) -> None:
+        """Manage agent data in job contexts when a new agent registers."""
+        for ctx in self.job_contexts.values():
+            ctx.manage_agent_metadata()
+
     async def handle_register(self, req: pb2.RegReq) -> tuple[bool, str]:
         """Handle registration message."""
         if req.id in self.agent_contexts:
@@ -107,6 +112,7 @@ async def handle_register(self, req: pb2.RegReq) -> tuple[bool, str]:
         self.agent_contexts[req.id] = AgentContext(self, req.id, req.ip)
         # since registration is done, let's keep agent context alive
         self.agent_contexts[req.id].keep_alive()
+        self._manage_agent_in_job_ctx()
 
         return True, ""
 
diff --git a/infscale/controller/job_context.py b/infscale/controller/job_context.py
@@ -518,25 +518,9 @@ def _get_wrk_resources_map(self, wrk_ids: set[str]) -> dict[str, str]:
         agent_gpu_map: dict[str, set[int]] = {}
 
         for wrk_id in wrk_ids:
-            curr_agent = self._get_curr_agent_data(wrk_id)
-            assign_success = False
-            # current agent id might not be available in the case of
-            # recover due to agent failure
-            curr_agent_id = curr_agent.id if curr_agent else ""
-
-            if curr_agent:
-                assign_success = self._assign_available_gpu_to_worker(
-                    curr_agent_id,
-                    agent_resources[curr_agent_id],
-                    wrk_id,
-                    wrk_agent_map,
-                    agent_gpu_map,
-                )
-
-            if not assign_success:
-                assign_success = self._search_gpu_on_all_agents(
-                    agent_resources, curr_agent_id, wrk_id, wrk_agent_map, agent_gpu_map
-                )
+            assign_success = self._search_gpu_on_all_agents(
+                agent_resources, wrk_id, wrk_agent_map, agent_gpu_map
+            )
 
             if not assign_success:
                 # if no resources, return and let while loop continue
@@ -594,7 +578,6 @@ def _assign_available_gpu_to_worker(
     def _search_gpu_on_all_agents(
         self,
         agent_resources: dict[str, AgentResources],
-        curr_agent_id: str,
         wrk_id: str,
         wrk_agent_map: dict[str, tuple[str, int]],
         agent_gpu_map: dict[str, set[int]],
@@ -604,15 +587,16 @@ def _search_gpu_on_all_agents(
         Returns:
             bool: True if a GPU was successfully assigned, False otherwise.
         """
+        assign_success = False
         for agent_id, resources in agent_resources.items():
-            if agent_id == curr_agent_id:
-                continue
-
-            return self._assign_available_gpu_to_worker(
+            assign_success = self._assign_available_gpu_to_worker(
                 agent_id, resources, wrk_id, wrk_agent_map, agent_gpu_map
             )
+            
+            if assign_success:
+                break
 
-        return False
+        return assign_success
 
     def enum_(self) -> JobStateEnum:
         """Return recovery state enum."""
@@ -1207,7 +1191,7 @@ def _get_state_class(self, state_enum: JobStateEnum):
         }
         return state_mapping[state_enum]
 
-    def _manage_agent_metadata(self) -> None:
+    def manage_agent_metadata(self) -> None:
         """Manage agent metadata by create/update/delete."""
         agent_contexts = self.ctrl.agent_contexts
 
@@ -1380,7 +1364,7 @@ async def __update(self):
         # DO NOT call this method in job_context instance or any other places.
         # Call it only in methods of a state instance
         # (e.g., RunningState, RecoveryState, etc).
-        self._manage_agent_metadata()
+        self.manage_agent_metadata()
 
         try:
             self.process_cfg()
@@ -1448,7 +1432,7 @@ async def __start(self):
         # DO NOT call this method in job_context instance or any other places.
         # Call it only in methods of a state instance
         # (e.g., ReadyState, CompleteState, etc).
-        self._manage_agent_metadata()
+        self.manage_agent_metadata()
 
         self._check_agent_info()