30 changes: 25 additions & 5 deletions eval_protocol/benchmarks/test_aime25.py
@@ -1,5 +1,7 @@
 from typing import Any, Dict, List, Optional
 
+from eval_protocol.common_utils import load_jsonl
+from eval_protocol.data_loader import DynamicDataLoader
 from eval_protocol.models import (
     EvaluateResult,
     EvaluationRow,
@@ -11,6 +13,7 @@
     SingleTurnRolloutProcessor,
 )
 from eval_protocol.pytest.evaluation_test import evaluation_test
+from eval_protocol.pytest.utils import parse_ep_max_rows
 
 SYSTEM_PROMPT = (
     "You are a helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}."
@@ -71,12 +74,29 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     return converted
 
 
+_AIME2025_DATASET_URLS: List[str] = [
+    "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
+    "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
+]
+
+
+def aime2025_data_generator() -> List[EvaluationRow]:
+    """Load the AIME 2025 datasets and convert them into evaluation rows."""
+    dataset_rows: List[Dict[str, Any]] = []
+    for dataset_url in _AIME2025_DATASET_URLS:
+        dataset_rows.extend(load_jsonl(dataset_url))
+
+    max_rows = parse_ep_max_rows(2)
+    if max_rows is not None:
+        dataset_rows = dataset_rows[:max_rows]
+
+    return aime2025_dataset_adapter(dataset_rows)
+
+
 @evaluation_test(
-    input_dataset=[
-        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
-        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
-    ],
-    dataset_adapter=aime2025_dataset_adapter,
+    data_loaders=DynamicDataLoader(
+        generators=[aime2025_data_generator],
+    ),
     completion_params=[
         {
             "max_tokens": 131000,
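Reviewer note: to poke at the new loading path outside of pytest, a minimal sketch along these lines should work. It only relies on names visible in this diff (aime2025_data_generator, EvaluationRow) and assumes the benchmarks package is importable; the printed row count is illustrative, since parse_ep_max_rows(2) may cap the dataset depending on how it is configured.

# Standalone sanity check for the new AIME 2025 generator (sketch, not part of the PR).
# Assumes eval_protocol/benchmarks/test_aime25.py shown above is importable as a module.
from typing import List

from eval_protocol.benchmarks.test_aime25 import aime2025_data_generator
from eval_protocol.models import EvaluationRow


def preview_aime2025(limit: int = 3) -> List[EvaluationRow]:
    """Materialize the rows the DynamicDataLoader would see and keep the first few."""
    rows = aime2025_data_generator()
    print(f"generator produced {len(rows)} rows")  # may be capped by parse_ep_max_rows(2)
    return rows[:limit]


if __name__ == "__main__":
    preview_aime2025()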
15 changes: 13 additions & 2 deletions eval_protocol/benchmarks/test_tau_bench_airline.py
@@ -10,6 +10,8 @@
 from pathlib import Path
 from typing import Any, Dict, List
 
+from eval_protocol.common_utils import load_jsonl
+from eval_protocol.data_loader import DynamicDataLoader
 from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
 from eval_protocol.pytest import evaluation_test, ExceptionHandlerConfig
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor
@@ -69,6 +71,14 @@ def _get_airline_dataset_path() -> str:
     return str(Path(__file__).parent / "data" / "airline_dataset.jsonl")
 
 
+def tau_bench_airline_data_generator() -> List[EvaluationRow]:
+    """Load and adapt the airline dataset into evaluation rows."""
+    dataset_rows: List[Dict[str, Any]] = []
+    for dataset_path in [_get_airline_dataset_path()]:
+        dataset_rows.extend(load_jsonl(dataset_path))
+    return tau_bench_airline_to_evaluation_row(dataset_rows)
+
+
 def _get_server_script_path() -> str:
     """Get the tau2 mcp server script path."""
     from eval_protocol.mcp_servers.tau2 import get_server_script_path
@@ -107,8 +117,9 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 
 @evaluation_test(
-    input_dataset=[_get_airline_dataset_path()],
-    dataset_adapter=tau_bench_airline_to_evaluation_row,
+    data_loaders=DynamicDataLoader(
+        generators=[tau_bench_airline_data_generator],
+    ),
     completion_params=[
         {
             "temperature": 0.8,
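Reviewer note: the airline change follows the same shape as the AIME one, folding the static input_dataset / dataset_adapter pair into a generator handed to DynamicDataLoader. As a rough template for migrating other benchmarks the same way (my_* names and the placeholder path are hypothetical; the adapter body is left dataset-specific):

# Sketch of the migration pattern used in this PR; my_* names are placeholders.
from typing import Any, Dict, List

from eval_protocol.common_utils import load_jsonl
from eval_protocol.data_loader import DynamicDataLoader
from eval_protocol.models import EvaluationRow

MY_DATASET_PATH = "eval_protocol/benchmarks/data/my_dataset.jsonl"  # placeholder path


def my_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Turn raw JSONL records into EvaluationRow objects (dataset-specific)."""
    raise NotImplementedError


def my_data_generator() -> List[EvaluationRow]:
    """Load the JSONL file and adapt it, mirroring tau_bench_airline_data_generator."""
    return my_dataset_adapter(load_jsonl(MY_DATASET_PATH))


# In the @evaluation_test decorator, the old input_dataset/dataset_adapter pair becomes:
#     data_loaders=DynamicDataLoader(generators=[my_data_generator]),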
15 changes: 13 additions & 2 deletions eval_protocol/benchmarks/test_tau_bench_retail.py
@@ -10,6 +10,8 @@
 from pathlib import Path
 from typing import Any, Dict, List
 
+from eval_protocol.common_utils import load_jsonl
+from eval_protocol.data_loader import DynamicDataLoader
 from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
 from eval_protocol.pytest import evaluation_test, ExceptionHandlerConfig
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor
@@ -69,6 +71,14 @@ def _get_retail_dataset_path() -> str:
     return str(Path(__file__).parent / "data" / "retail_dataset.jsonl")
 
 
+def tau_bench_retail_data_generator() -> List[EvaluationRow]:
+    """Load and adapt the retail dataset into evaluation rows."""
+    dataset_rows: List[Dict[str, Any]] = []
+    for dataset_path in [_get_retail_dataset_path()]:
+        dataset_rows.extend(load_jsonl(dataset_path))
+    return tau_bench_retail_to_evaluation_row(dataset_rows)
+
+
 def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
     """
     Convert entries from retail dataset to EvaluationRow objects.
@@ -98,8 +108,9 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 
 @evaluation_test(
-    input_dataset=[_get_retail_dataset_path()],
-    dataset_adapter=tau_bench_retail_to_evaluation_row,
+    data_loaders=DynamicDataLoader(
+        generators=[tau_bench_retail_data_generator],
+    ),
     completion_params=[
         {
             "temperature": 0.8,
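Reviewer note: since generators takes a list, it looks like a single DynamicDataLoader could also fan out over several of these benchmark generators at once. That is an assumption on my part (the PR only ever passes one generator per loader), but a sketch would be:

# Hypothetical: one loader over both tau-bench generators introduced in this PR.
# Assumes DynamicDataLoader accepts multiple callables in `generators`; the PR
# itself only ever passes a single generator, so treat this as an untested sketch.
from eval_protocol.benchmarks.test_tau_bench_airline import tau_bench_airline_data_generator
from eval_protocol.benchmarks.test_tau_bench_retail import tau_bench_retail_data_generator
from eval_protocol.data_loader import DynamicDataLoader

combined_loader = DynamicDataLoader(
    generators=[
        tau_bench_airline_data_generator,
        tau_bench_retail_data_generator,
    ],
)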