"""
Pytest test for APPS coding evaluation using the evaluation_test decorator.

This test demonstrates how to evaluate code correctness for competitive
programming problems using the actual evaluate_apps_solution function from
apps_coding_reward.py.
"""
7+
8+ import json
9+ from typing import Any , Dict , List
10+
11+ from eval_protocol .models import EvaluateResult , EvaluationRow , Message
12+ from eval_protocol .pytest import default_single_turn_rollout_processor , evaluation_test
13+ from eval_protocol .rewards .apps_coding_reward import evaluate_apps_solution
14+
15+
def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Convert entries from the APPS dataset to EvaluationRow objects.

    Args:
        data: Parsed dataset rows, each expected to carry "prompt",
            "input", and "expected_output" string fields.

    Returns:
        One EvaluationRow per dataset row. ground_truth is a JSON string
        with "inputs" and "outputs" arrays, each entry newline-terminated
        to match the stdin/stdout format used by evaluate_apps_solution.
    """
    return [
        EvaluationRow(
            messages=[Message(role="user", content=row["prompt"])],
            ground_truth=json.dumps({
                # Terminate with a bare "\n" (no trailing space) so the
                # judged program's stdout compares equal to the expectation.
                "inputs": [row["input"] + "\n"],
                "outputs": [row["expected_output"] + "\n"],
            }),
        )
        for row in data
    ]
30+
31+
@evaluation_test(
    input_dataset=["tests/pytest/data/apps_dataset.jsonl"],
    dataset_adapter=apps_dataset_to_evaluation_row,
    model=["accounts/fireworks/models/kimi-k2-instruct"],
    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
    threshold_of_success=0.5,
    rollout_processor=default_single_turn_rollout_processor,
    num_runs=1,
    mode="pointwise",
    max_dataset_rows=3,  # Limit for testing
)
def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Score one APPS coding problem via evaluate_apps_solution.

    The row's ground_truth is a JSON string containing "inputs" and
    "outputs" arrays; the model's response in row.messages is evaluated
    against them by evaluate_apps_solution, and the resulting
    EvaluateResult is attached to the row.

    Args:
        row: EvaluationRow with the conversation messages and the
            ground truth encoded as a JSON string.

    Returns:
        The same EvaluationRow with evaluation_result populated.
    """
    # Delegate scoring entirely to the shared APPS reward implementation
    # and store its result on the row in one step.
    row.evaluation_result = evaluate_apps_solution(
        messages=row.messages,
        ground_truth=row.ground_truth,
        execution_timeout=10,
    )
    return row