@@ -60,26 +60,6 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
6060
6161 return rows
6262
63-
def save_single_trajectory(trajectory_record: Dict, row_id: str, output_dir: str = "trajectory_outputs") -> Path:
    """Save a single trajectory record to a JSON file and return its path.

    Args:
        trajectory_record: Trajectory payload; must contain a "model_id" key.
        row_id: Identifier for this row, embedded in the output filename.
        output_dir: Directory to write into; created (including parents) if missing.

    Returns:
        Path of the written JSON file.
    """
    output_path = Path(output_dir)
    # parents=True so a nested output_dir (e.g. "runs/2024/traj") doesn't raise
    # FileNotFoundError; exist_ok=True keeps repeated calls idempotent.
    output_path.mkdir(parents=True, exist_ok=True)

    # Sanitize model_id for use in a filename (forward and back slashes -> underscores).
    safe_model_id = trajectory_record["model_id"].replace("/", "_").replace("\\", "_")

    filepath = output_path / f"{safe_model_id}_{row_id}_trajectory.json"

    # default=str is deliberate: it makes datetimes and other non-JSON-native
    # values serializable rather than raising TypeError.
    with open(filepath, "w") as f:
        json.dump(trajectory_record, f, indent=2, default=str)

    print(f"💾 Saved trajectory: {filepath}")
    return filepath
81-
82-
8363@evaluation_test (
8464 input_dataset = ["tests/pytest/data/airline_dataset.jsonl" ],
8565 dataset_adapter = tau_bench_airline_to_evaluation_row ,
@@ -245,44 +225,6 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
245225 # If everything passed, show success
246226 reason = "\n " .join (failed_reasons ) if failed_reasons else "✅ All checks passed"
247227
248-
249- # # DELETE FROM HERE
250- # row_id = row.input_metadata.row_id
251-
252- # # Create trajectory record similar to test_entire_airline_dataset
253- # model_id = row.input_metadata.completion_params.model if row.input_metadata else "unknown"
254- # trajectory_record = {
255- # "model_id": model_id,
256- # "row_id": row_id,
257- # "messages": [
258- # {"role": msg.role, "content": msg.content, "tool_calls": getattr(msg, "tool_calls", None)}
259- # for msg in messages
260- # ],
261- # "evaluation": {
262- # "score": reward,
263- # "reason": reason,
264- # "metrics": {
265- # "env_reward": {"score": env_reward_info.reward, "success": env_reward_info.reward > 0, "reason": str(env_reward_info.reward_breakdown)},
266- # "action_reward": {"score": action_reward_info.reward, "success": action_reward_info.reward > 0, "reason": str(action_reward_info.reward_breakdown)},
267- # "nl_reward": {"score": nl_reward_info.reward, "success": nl_reward_info.reward > 0, "reason": str(nl_reward_info.reward_breakdown)},
268- # "comm_reward": {"score": communicate_reward_info.reward, "success": communicate_reward_info.reward > 0, "reason": str(communicate_reward_info.reward_breakdown)},
269- # },
270- # },
271- # "evaluation_criteria": evaluation_criteria,
272- # "conversation_length": len(messages),
273- # "trajectory_steps": len([msg for msg in messages if msg.role == "assistant"]), # Approximate step count
274- # "cost_info": {
275- # "total_cost": 0.0, # Could be extracted from usage stats if available
276- # "total_tokens": 0, # Could be extracted from usage stats if available
277- # "cost_source": "not_tracked",
278- # },
279- # "timestamp": datetime.now().isoformat(),
280- # }
281-
282- # # Save this individual trajectory immediately
283- # save_single_trajectory(trajectory_record, row_id=row_id)
284- # # DELETE UNTIL HERE
285-
286228 row .evaluation_result = EvaluateResult (
287229 score = reward ,
288230 reason = reason ,
0 commit comments