@@ -169,21 +169,14 @@ async def _execute_with_semaphore(idx):
169169 max_tool_calls = getattr (policy , "max_tools_per_turn" , None ),
170170 )
171171 if trajectory .terminated :
172- if trajectory .termination_reason in {
173- TerminationReason .CONTROL_PLANE_SIGNAL ,
174- TerminationReason .USER_STOP ,
175- }:
176- evaluation_rows [idx ].rollout_status .status = "finished"
177- elif trajectory .termination_reason in {TerminationReason .MAX_STEPS , TerminationReason .INTERRUPTED }:
178- evaluation_rows [idx ].rollout_status .status = "stopped"
179- evaluation_rows [idx ].rollout_status .error_message = trajectory .control_plane_summary .get (
180- "termination_reason" , trajectory .termination_reason
181- )
182- else :
172+ if trajectory .termination_reason == TerminationReason .ERROR :
183173 evaluation_rows [idx ].rollout_status .status = "error"
184- evaluation_rows [idx ].rollout_status .error_message = trajectory .control_plane_summary .get (
174+ evaluation_rows [idx ].rollout_status .termination_reason = trajectory .control_plane_summary .get (
185175 "error_message" , None
186176 )
177+ else :
178+ evaluation_rows [idx ].rollout_status .status = "finished"
179+ evaluation_rows [idx ].rollout_status .termination_reason = trajectory .termination_reason
187180 else :
188181 evaluation_rows [idx ].rollout_status .status = "running"
189182
@@ -266,7 +259,7 @@ async def _execute_rollout(
266259
267260 # Run rollout loop for this specific environment
268261 step = 0
269- rollout_end = False
262+ env_end = False # if the env indicates the rollout reaches the goal
270263
271264 while step < steps and not trajectory .terminated :
272265 turn_completed = False
@@ -297,7 +290,9 @@ async def _execute_rollout(
297290
298291 # In each turn: keep looping until assistant is ready to provide final response
299292 while not turn_completed and not trajectory .terminated :
300- tool_calls , usage_stats = await policy (tool_schema , rollout_idx , conversation_history )
293+ tool_calls , usage_stats , finish_reason = await policy (
294+ tool_schema , rollout_idx , conversation_history
295+ )
301296
302297 # calc llm usage stats happened in this turn if there is any
303298 if usage_stats :
@@ -311,17 +306,17 @@ async def _execute_rollout(
311306 if tool_calls [0 ].tool_name == "_no_tool_call" and user_simulator :
312307 turn_completed = True
313308 break
314- # If there's no user simulator, no tool call means policy failed and we should terminate the rollout
309+ # If there's no user simulator, then it marks the end of the episode, as the LLM thinks no tool call is needed.
315310 elif tool_calls [0 ].tool_name in ["_playback_terminate" , "_no_tool_call" ]:
316311 trajectory .terminated = True
317- trajectory .termination_reason = TerminationReason .INTERRUPTED
312+ trajectory .termination_reason = TerminationReason .from_str ( finish_reason )
318313 break
319314
320315 # Execute each tool call sequentially
321316 for tool_call in tool_calls :
322317
323318 # Execute tool call for this environment
324- observation , reward , rollout_end , info = await envs .step (rollout_idx , tool_call )
319+ observation , reward , env_end , info = await envs .step (rollout_idx , tool_call )
325320
326321 tool_response = envs .format_tool_response (observation )
327322
@@ -331,7 +326,7 @@ async def _execute_rollout(
331326 tool_response ,
332327 conversation_history ,
333328 reward ,
334- rollout_end ,
329+ env_end ,
335330 info ,
336331 )
337332
@@ -354,7 +349,7 @@ async def _execute_rollout(
354349 control_plane_step = {
355350 "step" : step - 1 ,
356351 "reward" : reward ,
357- "terminated" : rollout_end ,
352+ "terminated" : env_end ,
358353 "info" : info .get ("control_plane" , {}),
359354 "tool_calls" : [f"{ tool_call .tool_name } ({ tool_call .arguments } )" ],
360355 "num_tool_calls" : 1 ,
@@ -367,11 +362,13 @@ async def _execute_rollout(
367362 if recording_mode :
368363 policy .log_conversation_state_for_playback (rollout_idx , step - 1 , conversation_history )
369364
370- if rollout_end :
365+ if env_end :
366+ # if the env marks the end of the rollout, break the tool call loop
367+ # but set the termination reason later after the final policy call
371368 trajectory .terminated = True
372- trajectory .termination_reason = TerminationReason .CONTROL_PLANE_SIGNAL
373369 break
374- elif step >= steps :
370+
371+ if step >= steps :
375372 trajectory .terminated = True
376373 trajectory .termination_reason = TerminationReason .MAX_STEPS
377374 break
@@ -392,7 +389,7 @@ async def _execute_rollout(
392389 control_plane_step = {
393390 "step" : step - 1 ,
394391 "reward" : reward ,
395- "terminated" : rollout_end ,
392+ "terminated" : env_end ,
396393 "info" : info .get ("control_plane" , {}),
397394 "tool_calls" : tool_calls_summary ,
398395 "num_tool_calls" : len (tool_calls ),
@@ -404,19 +401,16 @@ async def _execute_rollout(
404401 if recording_mode :
405402 policy .log_conversation_state_for_playback (rollout_idx , step - 1 , conversation_history )
406403
407- # Use control plane information for termination decision
408- if rollout_end :
409- trajectory .terminated = True
410- trajectory .termination_reason = TerminationReason .CONTROL_PLANE_SIGNAL
411-
412- # tool indicates rollout should be terminated, call policy one last time to get the final response
413- _ , usage_stats = await policy (tool_schema , rollout_idx , conversation_history )
404+ # if the env marks end, update control plane summary and do one last policy call, then break the agent loop
405+ # this is to ensure each turn ends with an assistant message, which will align with the actual agentic llm behavior
406+ if env_end :
407+ _ , usage_stats , finish_reason = await policy (tool_schema , rollout_idx , conversation_history )
414408 if usage_stats :
415409 trajectory .usage ["prompt_tokens" ] += usage_stats .prompt_tokens
416410 trajectory .usage ["completion_tokens" ] += usage_stats .completion_tokens
417411 trajectory .usage ["total_tokens" ] += usage_stats .total_tokens
418-
419- # Add final control plane summary
412+ trajectory . terminated = True
413+ trajectory . termination_reason = TerminationReason . from_str ( finish_reason )
420414 trajectory .control_plane_summary .update (
421415 {
422416 "total_reward" : trajectory .total_reward ,
@@ -445,7 +439,7 @@ async def _execute_rollout(
445439 )
446440
447441 logger .info (
448- f"🏁 Rollout { rollout_idx } terminated at step { step } (reward: { trajectory .total_reward } ) in thread { threading .current_thread ().name } "
442+ f"🏁 Environmnet indicates rollout { rollout_idx } terminated at step { step } (reward: { trajectory .total_reward } ) in thread { threading .current_thread ().name } "
449443 )
450444 break
451445
0 commit comments