Commit 686ed67

Merge branch 'main' of github.com:eval-protocol/python-sdk
2 parents 86a0cc4 + 2e19cf7 commit 686ed67

15 files changed: +220 −83 lines

eval_protocol/integrations/tinker_rollout_processor.py

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
     # Update row
     new_messages = list(row.messages) + [Message(role="assistant", content=assistant_content)]
     row.messages = new_messages
-    row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+    row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time

     # Log usage (approximate since Tinker might not return usage stats in same format)
     # We can count tokens ourselves
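
Every rollout processor touched by this commit applies the same one-line rename: the rollout's wall-clock time now lands in rollout_duration_seconds instead of the deprecated duration_seconds. A minimal standalone sketch of that shared timing pattern (the ExecutionMetadata stand-in below is illustrative, not the SDK's real model):

import time
from dataclasses import dataclass
from typing import Optional

@dataclass
class ExecutionMetadata:
    # Illustrative stand-in; the real model lives in eval_protocol/models.py.
    rollout_duration_seconds: Optional[float] = None

meta = ExecutionMetadata()
start_time = time.perf_counter()
# ... produce the rollout here (model call, agent loop, tool use, ...) ...
meta.rollout_duration_seconds = time.perf_counter() - start_time
print(f"rollout took {meta.rollout_duration_seconds:.4f}s")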

eval_protocol/mcp/execution/manager.py

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ async def _execute_with_semaphore(idx):
         else:
             evaluation_row.rollout_status = Status.rollout_running()

-        evaluation_row.execution_metadata.duration_seconds = time.perf_counter() - row_start_time
+        evaluation_row.execution_metadata.rollout_duration_seconds = time.perf_counter() - row_start_time

         return evaluation_row

eval_protocol/models.py

Lines changed: 13 additions & 1 deletion
@@ -809,9 +809,21 @@ class ExecutionMetadata(BaseModel):

     cost_metrics: Optional[CostMetrics] = Field(default=None, description="Cost breakdown for LLM API calls.")

+    # deprecated: use rollout_duration_seconds and eval_duration_seconds instead
     duration_seconds: Optional[float] = Field(
         default=None,
-        description="Processing duration in seconds for this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
+        deprecated=True,
+        description="[Deprecated] Processing duration in seconds for this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
+    )
+
+    rollout_duration_seconds: Optional[float] = Field(
+        default=None,
+        description="Processing duration in seconds for the rollout of this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
+    )
+
+    eval_duration_seconds: Optional[float] = Field(
+        default=None,
+        description="Processing duration in seconds for the evaluation of this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
     )

     experiment_duration_seconds: Optional[float] = Field(
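
The deprecation leans on Pydantic's Field(deprecated=...) support, which emits a DeprecationWarning whenever the attribute is read on an instance. That parameter requires Pydantic v2.7+; whether the SDK pins that floor is an assumption here, so treat this as an illustrative sketch rather than the SDK's actual model:

import warnings
from typing import Optional
from pydantic import BaseModel, Field

class Meta(BaseModel):
    # Trimmed stand-in for ExecutionMetadata; only the duration fields.
    duration_seconds: Optional[float] = Field(default=None, deprecated=True)
    rollout_duration_seconds: Optional[float] = Field(default=None)
    eval_duration_seconds: Optional[float] = Field(default=None)

m = Meta(rollout_duration_seconds=1.25, eval_duration_seconds=0.4)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _ = m.duration_seconds  # reading the deprecated field warns
assert any(issubclass(w.category, DeprecationWarning) for w in caught)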

eval_protocol/pytest/default_agent_rollout_processor.py

Lines changed: 1 addition & 1 deletion
@@ -267,7 +267,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                 total_tokens=agent.usage["total_tokens"],
             )

-            agent.evaluation_row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+            agent.evaluation_row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time

             return agent.evaluation_row
         finally:

eval_protocol/pytest/default_pydantic_ai_rollout_processor.py

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
         #     total_tokens=usage_info.total_tokens or 0,
         # )

-        row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+        row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time

         return row

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 1 addition & 1 deletion
@@ -180,7 +180,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:

         row.messages = messages

-        row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+        row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time

         default_logger.log(row)
         return row

eval_protocol/pytest/evaluation_test_utils.py

Lines changed: 9 additions & 2 deletions
@@ -42,7 +42,7 @@


 async def run_tasks_with_eval_progress(
-    pointwise_tasks: list[asyncio.Task[EvaluationRow]], run_idx: int
+    pointwise_tasks: list[asyncio.Task[EvaluationRow]], run_idx: int, disable_tqdm: bool = False
 ) -> list[EvaluationRow]:
     """
     Run evaluation tasks with a progress bar and proper cancellation handling.
@@ -66,6 +66,7 @@ async def run_tasks_with_eval_progress(
         miniters=1,
         mininterval=0.1,
         bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
+        disable=disable_tqdm,
     ) as eval_pbar:

         async def task_with_progress(task: asyncio.Task[EvaluationRow]) -> EvaluationRow:
@@ -88,7 +89,10 @@ async def task_with_progress(task: asyncio.Task[EvaluationRow]) -> EvaluationRow


 async def run_tasks_with_run_progress(
-    execute_run_func: Callable[[int, RolloutProcessorConfig], Any], num_runs: int, config: RolloutProcessorConfig
+    execute_run_func: Callable[[int, RolloutProcessorConfig], Any],
+    num_runs: int,
+    config: RolloutProcessorConfig,
+    disable_tqdm: bool = False,
 ) -> None:
     """
     Run tasks with a parallel runs progress bar, preserving original logic.
@@ -108,6 +112,7 @@ async def run_tasks_with_run_progress(
         dynamic_ncols=True,
         miniters=1,
         bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
+        disable=disable_tqdm,
     ) as run_pbar:

         async def execute_run_with_progress(run_idx: int, config: RolloutProcessorConfig) -> Any:
@@ -330,6 +335,7 @@ async def rollout_processor_with_retry(
     fresh_dataset: list[EvaluationRow],
     config: RolloutProcessorConfig,
     run_idx: int = 0,
+    disable_tqdm: bool = False,
 ) -> AsyncGenerator[EvaluationRow, None]:
     """
     Wrapper around rollout_processor that handles retry logic using the Python backoff library.
@@ -449,6 +455,7 @@ async def execute_row_with_backoff_and_log(
         miniters=1,
         mininterval=0.1,
         bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
+        disable=disable_tqdm,
     ) as rollout_pbar:
         # Yield results as they complete
         for task in asyncio.as_completed(retry_tasks):
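
The new disable_tqdm parameter is threaded straight into tqdm's own disable keyword, so the progress machinery stays in place and only the terminal output is suppressed. A standalone sketch of the mechanism (the helper and task names are hypothetical, not the SDK's functions):

import asyncio
from tqdm import tqdm

async def run_with_progress(tasks: "list[asyncio.Task]", disable_tqdm: bool = False) -> list:
    results = []
    # disable=True makes tqdm a no-op wrapper: same API, no bar rendered.
    with tqdm(total=len(tasks), desc="Evals", disable=disable_tqdm) as pbar:
        for task in asyncio.as_completed(tasks):
            results.append(await task)
            pbar.update(1)
    return results

async def main() -> None:
    async def work(i: int) -> int:
        await asyncio.sleep(0.01)
        return i

    tasks = [asyncio.ensure_future(work(i)) for i in range(5)]
    print(await run_with_progress(tasks, disable_tqdm=True))  # bar suppressed

asyncio.run(main())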

eval_protocol/pytest/github_action_rollout_processor.py

Lines changed: 4 additions & 4 deletions
@@ -162,15 +162,15 @@ def _list_runs():
            row.rollout_status = Status.rollout_error(
                f"Failed to find workflow run in GHA with rollout_id {row.execution_metadata.rollout_id}"
            )
-           row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+           row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
            return row

        run_id = run.get("id")
        if not run_id:
            row.rollout_status = Status.rollout_error(
                f"Failed to find workflow run in GHA with rollout_id {row.execution_metadata.rollout_id}"
            )
-           row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+           row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
            return row

        # Poll the specific run until completion
@@ -194,10 +194,10 @@ def _get_run() -> Dict[str, Any]:
            row.rollout_status = Status.rollout_error(
                f"GitHub Actions run timed out after {self.timeout_seconds} seconds"
            )
-           row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+           row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
            return row

-       row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+       row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time

        def _update_with_trace() -> None:
            return update_row_with_remote_trace(row, self._output_data_loader, self.model_base_url)

eval_protocol/pytest/openenv_rollout_processor.py

Lines changed: 3 additions & 3 deletions
@@ -411,7 +411,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             completion_tokens=usage["completion_tokens"],
             total_tokens=usage["total_tokens"],
         )
-        row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+        row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time

         # Attach per-step rewards and accumulated token IDs to
         # execution_metadata.extra for downstream integrations
@@ -436,14 +436,14 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
         logger.info("[OpenEnvRolloutProcessor] Total reward: %.3f", total_reward)
         logger.info(
             "[OpenEnvRolloutProcessor] Duration: %.2fs",
-            row.execution_metadata.duration_seconds,
+            row.execution_metadata.rollout_duration_seconds,
         )
         logger.debug("[OpenEnvRolloutProcessor] Messages collected: %d", len(messages))

         logger.info(
             f"Rollout complete: {len(step_rewards)} steps, "
             f"total_reward={total_reward:.2f}, "
-            f"duration={row.execution_metadata.duration_seconds:.2f}s"
+            f"duration={row.execution_metadata.rollout_duration_seconds:.2f}s"
         )
         # Final log with complete message history
         if getattr(config, "logger", None):

0 commit comments