Skip to content

Commit 0964562

Browse files
committed
Update to basic coding example
1 parent 2422f24 commit 0964562

File tree

4 files changed

+77
-6
lines changed

4 files changed

+77
-6
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"prompt": "Write a Python program that reads two integers from stdin and outputs their sum.", "input": "5\n3", "expected_output": "8"}
2+
{"prompt": "Write a Python program that reads a string from stdin and outputs its length.", "input": "hello", "expected_output": "5"}
3+
{"prompt": "Write a Python program that reads two integers from stdin and outputs the larger one.", "input": "10\n7", "expected_output": "10"}

tests/pytest/test_apps_coding.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""
2+
Pytest test for APPS coding evaluation using the evaluation_test decorator.
3+
4+
This test demonstrates how to evaluate code correctness for competitive programming problems
5+
using the actual evaluate_apps_solution function from apps_coding_reward.py.
6+
"""
7+
8+
import json
9+
from typing import Any, Dict, List
10+
11+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
12+
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
13+
from eval_protocol.rewards.apps_coding_reward import evaluate_apps_solution
14+
15+
16+
def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Convert entries from APPS dataset to EvaluationRow objects.

    Each entry supplies a "prompt" (becomes the user message), plus an
    "input"/"expected_output" pair that is serialized into the JSON
    ground-truth format consumed by evaluate_apps_solution.
    """
    rows: List[EvaluationRow] = []
    for entry in data:
        # evaluate_apps_solution expects ground_truth as a JSON string with
        # parallel "inputs"/"outputs" arrays; trailing newlines match the
        # stdin/stdout framing of the executed solutions.
        ground_truth = json.dumps(
            {
                "inputs": [entry["input"] + "\n"],
                "outputs": [entry["expected_output"] + "\n"],
            }
        )
        rows.append(
            EvaluationRow(
                messages=[Message(role="user", content=entry["prompt"])],
                ground_truth=ground_truth,
            )
        )
    return rows
30+
31+
32+
@evaluation_test(
    input_dataset=["tests/pytest/data/apps_dataset.jsonl"],
    dataset_adapter=apps_dataset_to_evaluation_row,
    model=["accounts/fireworks/models/kimi-k2-instruct"],
    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
    threshold_of_success=0.5,
    rollout_processor=default_single_turn_rollout_processor,
    num_runs=1,
    mode="pointwise",
    max_dataset_rows=3,  # Limit for testing
)
def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Evaluate a single APPS coding rollout with evaluate_apps_solution.

    The row's ground_truth is a JSON string holding "inputs" and "outputs"
    arrays (see apps_dataset_to_evaluation_row); the generated code in the
    row's messages is executed against those cases and the resulting
    EvaluateResult is attached to the row.

    Args:
        row: EvaluationRow with the conversation messages and JSON ground truth.

    Returns:
        The same EvaluationRow with evaluation_result populated.
    """
    # Delegate scoring to the shared APPS reward function; a 10-second
    # execution timeout bounds each candidate program's run time.
    row.evaluation_result = evaluate_apps_solution(
        messages=row.messages,
        ground_truth=row.ground_truth,
        execution_timeout=10,
    )
    return row
Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Pytest test for deepcoder code evaluation using the evaluation_test decorator.
2+
Pytest test for coding code evaluation using the evaluation_test decorator.
33
44
This test demonstrates how to evaluate code correctness by executing Python code locally
55
and comparing the output against expected results in a pointwise manner.
@@ -12,9 +12,9 @@
1212
from eval_protocol.rewards.code_execution import extract_code_blocks, execute_python_code
1313

1414

15-
def deepcoder_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
15+
def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
1616
"""
17-
Convert entries from deepcoder dataset to EvaluationRow objects.
17+
Convert entries from coding dataset to EvaluationRow objects.
1818
"""
1919
return [
2020
EvaluationRow(
@@ -26,16 +26,16 @@ def deepcoder_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
2626

2727

2828
@evaluation_test(
29-
input_dataset=["tests/pytest/data/deepcoder_dataset.jsonl"],
30-
dataset_adapter=deepcoder_dataset_to_evaluation_row,
29+
input_dataset=["tests/pytest/data/coding_dataset.jsonl"],
30+
dataset_adapter=coding_dataset_to_evaluation_row,
3131
model=["accounts/fireworks/models/kimi-k2-instruct"],
3232
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
3333
threshold_of_success=0.5,
3434
rollout_processor=default_single_turn_rollout_processor,
3535
num_runs=1,
3636
mode="pointwise",
3737
)
38-
def test_deepcoder_code_evaluation(row: EvaluationRow) -> EvaluationRow:
38+
def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
3939
"""
4040
Evaluation function that tests code correctness by executing it locally.
4141

0 commit comments

Comments (0)