
Commit 9f352ed

gsm8k math example (#294)

* gsm8k math example
* fix tests

1 parent 9902b0f, commit 9f352ed

File tree

10 files changed: +1204 -85 lines

.github/workflows/ci.yml
Lines changed: 0 additions & 3 deletions

@@ -41,9 +41,6 @@ jobs:
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

-      - name: Ruff format (check)
-        run: uv run ruff format --check .
-
      - name: Ruff lint
        run: uv run ruff check .

development/gsm8k_sample.jsonl

Lines changed: 1000 additions & 5 deletions
Large diffs are not rendered by default.
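The sample rows themselves are not rendered here. As a purely hypothetical illustration (field names mirror the EvaluationRow/Message models used by the tests in this commit; the system prompt, question, and numbers are invented, not taken from the file), a single-turn row could look like the sketch below. A system message is assumed only because the gsm8k example later in this commit reads the assistant reply at row.messages[2].

# Hypothetical row shape only -- the actual contents of development/gsm8k_sample.jsonl
# are not shown in this diff. Keys follow the EvaluationRow fields the tests use
# (messages, ground_truth); the question and answer are invented.
example_jsonl_row = {
    "messages": [
        {"role": "system", "content": "Answer inside <think>...</think><answer>...</answer> tags."},
        {"role": "user", "content": "A box holds 12 pencils. How many pencils are in 3 boxes?"},
    ],
    "ground_truth": "<think>12 * 3 = 36</think><answer>36</answer>",
}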

eval_protocol/pytest/default_single_turn_rollout_process.py
Lines changed: 18 additions & 2 deletions

@@ -21,6 +21,16 @@
class SingleTurnRolloutProcessor(RolloutProcessor):
    """Single turn rollout processor for direct LLM calls."""

+    def __init__(self, *, drop_trailing_assistant_messages: bool = True) -> None:
+        """
+        Args:
+            drop_trailing_assistant_messages: When True (default), strip any trailing
+                assistant messages from the input conversation before calling the model.
+                This helps when datasets include previous assistant turns and you want
+                the model to answer the latest user query.
+        """
+        self.drop_trailing_assistant_messages = drop_trailing_assistant_messages
+
    def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
        """Generate single turn rollout tasks and return them for external handling."""
        # Do not modify global LiteLLM cache. Disable caching per-request instead.

@@ -32,7 +42,13 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
            if len(row.messages) == 0:
                raise ValueError("Messages is empty. Please provide a non-empty dataset")

-            messages_payload = [message.model_dump() for message in row.messages]
+            # Optionally drop trailing assistant messages for single-turn prompts
+            messages_for_request: List[Message] = list(row.messages)
+            if self.drop_trailing_assistant_messages:
+                while messages_for_request and messages_for_request[-1].role == "assistant":
+                    messages_for_request.pop()
+
+            messages_payload = [message.model_dump() for message in messages_for_request]

            request_params = {"messages": messages_payload, **config.completion_params}
            # Ensure caching is disabled only for this request (review feedback)

@@ -114,7 +130,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
            except Exception:
                pass

-            messages = list(row.messages) + [
+            messages = list(messages_for_request) + [
                Message(
                    role="assistant",
                    content=assistant_content,
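
A minimal usage sketch of the new flag (the constructor signature and the import path come from this commit; the rest is illustrative and assumes the package is installed):

# Sketch, not part of the diff: shows the default vs. opt-out behaviour.
from eval_protocol.pytest import SingleTurnRolloutProcessor

# Default: trailing assistant messages are stripped before the model is called.
processor = SingleTurnRolloutProcessor()

# Opt out to keep any pre-existing assistant turns in the request payload.
processor_keep_history = SingleTurnRolloutProcessor(drop_trailing_assistant_messages=False)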

eval_protocol/pytest/exception_config.py
Lines changed: 1 addition & 0 deletions

@@ -12,6 +12,7 @@
import requests
import httpx

+
# Default exceptions that should be retried with backoff
DEFAULT_RETRYABLE_EXCEPTIONS: Set[Type[Exception]] = {
    # Standard library exceptions

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+eval-protocol

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+import re
+from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult, Message
+from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
+from typing import List, Dict, Any, Optional
+
+
+def extract_answer_digits(ground_truth: str) -> Optional[str]:
+    """
+    Extract the digits from the answer string.
+    """
+    answer_string = ground_truth.split("<answer>")[1].split("</answer>")[0]
+    return re.search(r"(\d+)", answer_string).group(1) if answer_string else None
+
+
+@evaluation_test(
+    input_dataset=["development/gsm8k_sample.jsonl"],
+    completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
+    max_dataset_rows=5,
+    passed_threshold=0.0,
+    rollout_processor=SingleTurnRolloutProcessor(),
+    mode="pointwise",
+    evaluation_test_kwargs=[
+        {"math_reward_kwargs": {"tolerance": 0.001, "absolute_tolerance": 1e-8, "require_units": False}}
+    ],
+)
+def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow:
+    """
+    Evaluate math problem solving considering both accuracy and format.
+
+    This function demonstrates how to combine multiple evaluation criteria:
+    - Numerical accuracy using built-in math evaluation (80% weight)
+    - Format compliance checking for <think>...</think><answer>...</answer> structure (20% weight)
+
+    Args:
+        row: EvaluationRow containing the conversation messages and ground truth
+        **kwargs: Additional parameters (like math_reward_kwargs)
+
+    Returns:
+        EvaluationRow with the evaluation result
+    """
+    #### Get predicted answer value
+    prediction = extract_answer_digits(str(row.messages[2].content))
+    gt = extract_answer_digits(str(row.ground_truth))
+
+    #### Get score
+    if prediction is None or gt is None:
+        score = 0
+        reason = "Missing answer tags in prediction or ground truth."
+
+    elif gt == prediction:
+        score = 1
+        reason = "Model answer is correct."
+
+    else:
+        score = 0
+        reason = "Model answer is not correct."
+
+    reason += f" Prediction: {prediction}, Ground Truth: {gt}"
+
+    evaluation_result = EvaluateResult(
+        score=score,  # Required: The final evaluation score
+        is_score_valid=True,  # Optional: Whether the score is valid, true by default
+        reason=reason,  # Optional: The reason for the score
+    )
+    row.evaluation_result = evaluation_result
+    return row
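
To make the tag handling concrete, a small sketch of what extract_answer_digits does to a made-up ground-truth string (the string is invented for illustration, not taken from the dataset):

import re

# Same extraction steps as extract_answer_digits above, applied to an invented string.
ground_truth = "<think>4 + 3 = 7 cookies</think><answer>7</answer>"
answer_string = ground_truth.split("<answer>")[1].split("</answer>")[0]  # -> "7"
digits = re.search(r"(\d+)", answer_string).group(1) if answer_string else None
assert digits == "7"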

tests/pytest/test_pytest_math_example.py

Lines changed: 0 additions & 71 deletions
This file was deleted.

tests/pytest/test_pytest_math_format_length.py
Lines changed: 0 additions & 2 deletions

@@ -5,12 +5,10 @@
from eval_protocol.rewards.length import count_tokens
from eval_protocol.rewards.math import math_reward
from examples.math_with_format_and_length.main import check_think_answer_format
-from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row


@evaluation_test(
    input_dataset=["development/gsm8k_sample.jsonl"],
-    dataset_adapter=gsm8k_to_evaluation_row,
    completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
    max_dataset_rows=5,
    passed_threshold=0.0,

tests/pytest/test_pytest_word_count_example.py
Lines changed: 0 additions & 2 deletions

@@ -2,12 +2,10 @@

from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
-from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row


@evaluation_test(
    input_dataset=["development/gsm8k_sample.jsonl"],
-    dataset_adapter=word_count_to_evaluation_row,
    completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
    max_dataset_rows=5,
    passed_threshold=0.3,  # Reasonable threshold for word count evaluation

Lines changed: 118 additions & 0 deletions

@@ -0,0 +1,118 @@
+import asyncio
+from types import SimpleNamespace
+
+import pytest
+
+from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest import SingleTurnRolloutProcessor
+
+
+class _DummyConfig:
+    def __init__(self):
+        self.completion_params = {"model": "fake-model", "temperature": 0}
+        self.semaphore = asyncio.Semaphore(10)
+
+
+@pytest.mark.asyncio
+async def test_single_turn_drops_trailing_assistant_by_default(monkeypatch):
+    # Arrange dataset row with trailing assistant message
+    row = EvaluationRow(
+        messages=[
+            Message(role="user", content="What is 2+2?"),
+            Message(role="assistant", content="Old response"),
+        ]
+    )
+
+    # Capture the messages payload passed to the LLM call
+    captured = {}
+
+    # Patch module-level imports in the processor module
+    import eval_protocol.pytest.default_single_turn_rollout_process as mod
+
+    class StubChoices:
+        pass
+
+    class StubModelResponse:
+        def __init__(self, text: str):
+            self.choices = [StubChoices()]
+            # Emulate OpenAI-like response.message fields
+            self.choices[0].message = SimpleNamespace(content=text, tool_calls=None)
+            # Minimal usage payload
+            self.usage = SimpleNamespace(prompt_tokens=1, completion_tokens=1, total_tokens=2)
+
+    async def fake_acompletion(**kwargs):
+        # Verify that trailing assistant was dropped before sending
+        msgs = kwargs.get("messages", [])
+        assert msgs, "Expected non-empty messages payload"
+        captured["messages"] = msgs
+        assert msgs[-1]["role"] != "assistant", "Trailing assistant should be dropped by default"
+        return StubModelResponse(text="4")
+
+    # Monkeypatch the processor module's symbols to avoid dependency on litellm types
+    monkeypatch.setattr(mod, "ModelResponse", StubModelResponse, raising=True)
+    monkeypatch.setattr(mod, "Choices", StubChoices, raising=True)
+    monkeypatch.setattr(mod, "acompletion", fake_acompletion, raising=True)
+
+    processor = SingleTurnRolloutProcessor()
+    config = _DummyConfig()
+
+    # Act
+    tasks = processor([row], config)
+    out = await tasks[0]
+
+    # Assert: request trimmed the trailing assistant
+    sent_msgs = captured["messages"]
+    assert len(sent_msgs) == 1
+    assert sent_msgs[0]["role"] == "user"
+    assert out.messages[-1].role == "assistant"
+    assert out.messages[-1].content == "4"
+    # Ensure previous trailing assistant was not duplicated
+    assert [m.role for m in out.messages] == ["user", "assistant"]
+
+
+@pytest.mark.asyncio
+async def test_single_turn_keeps_trailing_assistant_when_disabled(monkeypatch):
+    # Arrange dataset row with trailing assistant message
+    row = EvaluationRow(
+        messages=[
+            Message(role="user", content="Say hi"),
+            Message(role="assistant", content="Hi!"),
+        ]
+    )
+
+    captured = {}
+
+    import eval_protocol.pytest.default_single_turn_rollout_process as mod
+
+    class StubChoices:
+        pass
+
+    class StubModelResponse:
+        def __init__(self, text: str):
+            self.choices = [StubChoices()]
+            self.choices[0].message = SimpleNamespace(content=text, tool_calls=None)
+            self.usage = SimpleNamespace(prompt_tokens=1, completion_tokens=1, total_tokens=2)
+
+    async def fake_acompletion(**kwargs):
+        msgs = kwargs.get("messages", [])
+        captured["messages"] = msgs
+        # With opt-out, trailing assistant is preserved
+        assert msgs[-1]["role"] == "assistant"
+        return StubModelResponse(text="Hello again")
+
+    monkeypatch.setattr(mod, "ModelResponse", StubModelResponse, raising=True)
+    monkeypatch.setattr(mod, "Choices", StubChoices, raising=True)
+    monkeypatch.setattr(mod, "acompletion", fake_acompletion, raising=True)
+
+    processor = SingleTurnRolloutProcessor(drop_trailing_assistant_messages=False)
+    config = _DummyConfig()
+
+    # Act
+    tasks = processor([row], config)
+    out = await tasks[0]
+
+    # Assert: both original messages plus new assistant
+    sent_msgs = captured["messages"]
+    assert [m["role"] for m in sent_msgs] == ["user", "assistant"]
+    assert [m.role for m in out.messages] == ["user", "assistant", "assistant"]
+    assert out.messages[-1].content == "Hello again"
