Skip to content

Commit d88e155

Browse files
committed
Add frozen lake to benchmarks
1 parent 2f19f5b commit d88e155

File tree

1 file changed

+76
-0
lines changed

1 file changed

+76
-0
lines changed
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""
2+
Pytest test for frozen lake evaluation using the evaluation_test decorator.
3+
4+
This test demonstrates how to use frozen lake environments within the pytest framework,
5+
similar to the test_frozen_lake_e2e test but integrated with the pytest evaluation system.
6+
"""
7+
8+
from typing import Any, Dict, List
9+
10+
from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
11+
from eval_protocol.pytest import evaluation_test
12+
from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor
13+
14+
15+
def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Convert entries from the frozen lake dataset into EvaluationRow objects.

    Each entry contributes a single system message plus per-row metadata:
    the row id, the environment context, and the user prompt template that
    downstream rollout processing consumes.
    """
    return [
        EvaluationRow(
            messages=[Message(role="system", content=entry["system_prompt"])],
            input_metadata=InputMetadata(
                row_id=entry["id"],
                dataset_info={
                    "environment_context": entry["environment_context"],
                    "user_prompt_template": entry["user_prompt_template"],
                },
            ),
        )
        for entry in data
    ]
36+
37+
38+
@evaluation_test(
    input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
    dataset_adapter=frozen_lake_to_evaluation_row,
    completion_params=[
        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
    ],
    rollout_processor=MCPGymRolloutProcessor(),
    passed_threshold=0.66,
    num_runs=1,
    max_concurrent_rollouts=3,
    mode="pointwise",
    server_script_path="examples/frozen_lake_mcp/server.py",
)
def test_frozen_lake_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Evaluate a frozen lake rollout within the pytest evaluation framework.

    The total reward accumulated on the row decides the outcome: a reward
    of exactly 1.0 means the agent reached the goal; anything else means
    it did not (fell in a hole or ran out of steps).

    Args:
        row: EvaluationRow object from frozen lake dataset

    Returns:
        EvaluationRow object with evaluation results
    """
    score = row.get_total_reward()

    # Binary success criterion: the environment awards 1.0 only on goal.
    reached_goal = score == 1.0
    row.evaluation_result = EvaluateResult(
        score=score,
        reason="Agent reached the goal" if reached_goal else "Agent did not reach the goal",
    )
    return row

0 commit comments

Comments (0)