"""
Pytest test for APPS coding evaluation using the evaluation_test decorator.

This test demonstrates how to evaluate code correctness for competitive
programming problems using the actual evaluate_apps_solution function from
apps_coding_reward.py.
"""
7+
8+ import json
9+ from typing import Any , Dict , List
10+
11+ from eval_protocol .models import EvaluateResult , EvaluationRow , Message
12+ from eval_protocol .pytest import default_single_turn_rollout_processor , evaluation_test
13+ from eval_protocol .rewards .apps_coding_reward import evaluate_apps_solution
14+
15+
def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Convert entries from the APPS dataset to EvaluationRow objects.

    Args:
        data: Parsed dataset rows, each expected to carry "prompt",
            "input", and "expected_output" string fields.

    Returns:
        One EvaluationRow per dataset row. ground_truth is a JSON string
        with "inputs" and "outputs" arrays, each entry newline-terminated
        to match the stdin/stdout format used by evaluate_apps_solution.
    """
    return [
        EvaluationRow(
            messages=[Message(role="user", content=row["prompt"])],
            ground_truth=json.dumps({
                # Terminate with a bare "\n" (no trailing space) so the
                # judged program's stdout compares equal to the expectation.
                "inputs": [row["input"] + "\n"],
                "outputs": [row["expected_output"] + "\n"],
            }),
        )
        for row in data
    ]
30+
31+
@evaluation_test(
    input_dataset=["tests/pytest/data/apps_dataset.jsonl"],
    dataset_adapter=apps_dataset_to_evaluation_row,
    model=["accounts/fireworks/models/kimi-k2-instruct"],
    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
    threshold_of_success=0.5,
    rollout_processor=default_single_turn_rollout_processor,
    num_runs=1,
    mode="pointwise",
    max_dataset_rows=3,  # Limit for testing
)
def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Score one APPS coding problem via evaluate_apps_solution.

    The row's ground_truth is a JSON string containing "inputs" and
    "outputs" arrays; the model's response in row.messages is evaluated
    against them by evaluate_apps_solution, and the resulting
    EvaluateResult is attached to the row.

    Args:
        row: EvaluationRow with the conversation messages and the
            ground truth encoded as a JSON string.

    Returns:
        The same EvaluationRow with evaluation_result populated.
    """
    # Delegate scoring entirely to the shared APPS reward implementation
    # and store its result on the row in one step.
    row.evaluation_result = evaluate_apps_solution(
        messages=row.messages,
        ground_truth=row.ground_truth,
        execution_timeout=10,
    )
    return row