Add GEval logprob artifacts for OpenAI and Fireworks

benjibc · benjibc · commit ff210d40d8c0 · 2025-12-15T22:20:16.000-08:00
diff --git a/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py b/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py
@@ -7,7 +7,7 @@
 
 from eval_protocol.common_utils import load_jsonl
 from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
-from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_datasets_dir
+from eval_protocol.directory_utils import find_eval_protocol_datasets_dir
 
 if TYPE_CHECKING:
     from eval_protocol.models import EvaluationRow
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
@@ -517,10 +517,20 @@ class Message(BaseModel):
     function_call: Optional[FunctionCall] = None
     control_plane_step: Optional[Dict[str, Any]] = None
     weight: Optional[int] = None
+    logprobs: Optional[Any] = Field(
+        default=None,
+        description=(
+            "Optional log probability metadata captured from the completion response. "
+            "When present, this typically mirrors the provider-specific logprob payload."
+        ),
+    )
 
     def dump_mdoel_for_chat_completion_request(self):
         """Only keep chat completion accepted fields"""
-        return self.model_dump(exclude_none=True, exclude={"control_plane_step", "reasoning_content", "weight"})
+        return self.model_dump(
+            exclude_none=True,
+            exclude={"control_plane_step", "reasoning_content", "weight", "logprobs"},
+        )
 
     @classmethod
     def model_validate(cls, obj, *args, **kwargs):
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -3,7 +3,9 @@
 import logging
 import os
 import time
-from typing import List
+from dataclasses import asdict, is_dataclass
+from types import SimpleNamespace
+from typing import Any, List
 
 import litellm
 from litellm import acompletion
@@ -19,6 +21,28 @@
 logger = logging.getLogger(__name__)
 
 
+def _serialize_logprobs(logprobs: Any) -> Any:
+    """Best-effort conversion of provider logprobs into JSON-serializable data.
+
+    The payload is normalized recursively: objects exposing ``model_dump``,
+    dataclass instances, ``SimpleNamespace`` objects and other objects with an
+    instance ``__dict__`` are converted to plain dicts wherever they appear,
+    including when nested inside dicts or lists.
+
+    Args:
+        logprobs: The ``logprobs`` value read from a completion choice, or
+            ``None`` when no logprobs were returned.
+
+    Returns:
+        ``None`` when ``logprobs`` is ``None``; otherwise a structure made only
+        of dicts, lists, strings, numbers, booleans and ``None``. If the payload
+        cannot be normalized (for example it contains a circular reference),
+        its ``str()`` form is returned so the result is still JSON-serializable.
+    """
+
+    if logprobs is None:
+        return None
+
+    def _to_plain(obj: Any) -> Any:
+        # ``json.dumps`` calls this hook for every value it cannot encode and
+        # then encodes whatever is returned, so nested objects are covered too.
+        dump = getattr(obj, "model_dump", None)
+        if callable(dump) and not isinstance(obj, type):
+            try:
+                return dump()
+            except Exception:
+                # Best effort: fall through to the generic conversions below.
+                logging.getLogger(__name__).debug(
+                    "model_dump() failed for %s; using generic conversion",
+                    type(obj).__name__,
+                    exc_info=True,
+                )
+        if is_dataclass(obj) and not isinstance(obj, type):
+            return asdict(obj)
+        if isinstance(obj, SimpleNamespace):
+            return vars(obj)
+        attrs = getattr(obj, "__dict__", None)
+        if isinstance(attrs, dict):
+            return attrs
+        return str(obj)
+
+    try:
+        # The round trip yields a detached copy made only of JSON-native types.
+        return json.loads(json.dumps(logprobs, default=_to_plain))
+    except Exception:
+        # Deliberately broad: logprob capture must never fail the rollout.
+        logging.getLogger(__name__).debug(
+            "Could not normalize logprobs of type %s; storing str() form",
+            type(logprobs).__name__,
+            exc_info=True,
+        )
+        return str(logprobs)
+
+
 class SingleTurnRolloutProcessor(RolloutProcessor):
     """Single turn rollout processor for direct LLM calls."""
 
@@ -105,6 +129,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
             assistant_message = response.choices[0].message
             finish_reason = getattr(response.choices[0], "finish_reason", None)
+            assistant_logprobs = _serialize_logprobs(getattr(response.choices[0], "logprobs", None))
 
             # Extract content
             assistant_content = assistant_message.content or ""
@@ -159,6 +184,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                     content=assistant_content,
                     reasoning_content=reasoning_content,
                     tool_calls=converted_tool_calls,
+                    logprobs=assistant_logprobs,
                 )
             ]
 
diff --git a/examples/deepeval/artifacts/geval_logprobs_combined.jsonl b/examples/deepeval/artifacts/geval_logprobs_combined.jsonl
diff --git a/examples/deepeval/artifacts/geval_logprobs_fireworks.jsonl b/examples/deepeval/artifacts/geval_logprobs_fireworks.jsonl
diff --git a/examples/deepeval/artifacts/geval_logprobs_openai.jsonl b/examples/deepeval/artifacts/geval_logprobs_openai.jsonl
@@ -0,0 +1 @@
+{"messages":[{"role":"user","content":"Say hello politely."},{"role":"assistant","content":"Hello, how are you today?","logprobs":{"content":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257,"top_logprobs":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257},{"token":"Good","bytes":[71,111,111,100],"logprob":-7.1408277},{"token":"\"","bytes":[34],"logprob":-7.267258}]},{"token":",","bytes":[44],"logprob":-0.011751671,"top_logprobs":[{"token":",","bytes":[44],"logprob":-0.011751671},{"token":" there","bytes":[32,116,104,101,114,101],"logprob":-5.0653806},{"token":"!","bytes":[33],"logprob":-5.312371}]},{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638,"top_logprobs":[{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638},{"token":" I","bytes":[32,73],"logprob":-3.3884728},{"token":" it","bytes":[32,105,116],"logprob":-3.5947793}]},{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342,"top_logprobs":[{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342},{"token":" may","bytes":[32,109,97,121],"logprob":-5.5837092},{"token":" do","bytes":[32,100,111],"logprob":-6.48237}]},{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6,"top_logprobs":[{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6},{"token":" your","bytes":[32,121,111,117,114],"logprob":-14.169239},{"token":" doing","bytes":[32,100,111,105,110,103],"logprob":-14.23296}]},{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095,"top_logprobs":[{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095},{"token":"?","bytes":[63],"logprob":-1.1978787},{"token":" 
doing","bytes":[32,100,111,105,110,103],"logprob":-1.263482}]},{"token":"?","bytes":[63],"logprob":-0.00011260267,"top_logprobs":[{"token":"?","bytes":[63],"logprob":-0.00011260267},{"token":"?\n","bytes":[63,10],"logprob":-9.57358},{"token":"?\n\n","bytes":[63,10,10],"logprob":-10.431084}]}],"refusal":null}}],"input_metadata":{"row_id":"liquid-school-081702","completion_params":{"model":"gpt-3.5-turbo","logprobs":true,"top_logprobs":3},"session_data":{"mode":"all"}},"rollout_status":{"code":100,"message":"Rollout finished","details":[]},"evaluation_result":{"score":0.9952574125139473,"is_score_valid":true,"reason":"The Actual Output directly addresses the Input by providing a polite greeting, 'Hello, how are you today?', which is both relevant and helpful. There is no missing or extraneous information, and the response aligns well with the request to say hello politely.","metrics":{"Helpful & Relevant [GEval]":{"is_score_valid":true,"score":0.9952574125139473,"reason":"The Actual Output directly addresses the Input by providing a polite greeting, 'Hello, how are you today?', which is both relevant and helpful. 
There is no missing or extraneous information, and the response aligns well with the request to say hello politely.","data":{"logprobs":{"content":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257,"top_logprobs":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257},{"token":"Good","bytes":[71,111,111,100],"logprob":-7.1408277},{"token":"\"","bytes":[34],"logprob":-7.267258}]},{"token":",","bytes":[44],"logprob":-0.011751671,"top_logprobs":[{"token":",","bytes":[44],"logprob":-0.011751671},{"token":" there","bytes":[32,116,104,101,114,101],"logprob":-5.0653806},{"token":"!","bytes":[33],"logprob":-5.312371}]},{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638,"top_logprobs":[{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638},{"token":" I","bytes":[32,73],"logprob":-3.3884728},{"token":" it","bytes":[32,105,116],"logprob":-3.5947793}]},{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342,"top_logprobs":[{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342},{"token":" may","bytes":[32,109,97,121],"logprob":-5.5837092},{"token":" do","bytes":[32,100,111],"logprob":-6.48237}]},{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6,"top_logprobs":[{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6},{"token":" your","bytes":[32,121,111,117,114],"logprob":-14.169239},{"token":" doing","bytes":[32,100,111,105,110,103],"logprob":-14.23296}]},{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095,"top_logprobs":[{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095},{"token":"?","bytes":[63],"logprob":-1.1978787},{"token":" 
doing","bytes":[32,100,111,105,110,103],"logprob":-1.263482}]},{"token":"?","bytes":[63],"logprob":-0.00011260267,"top_logprobs":[{"token":"?","bytes":[63],"logprob":-0.00011260267},{"token":"?\n","bytes":[63,10],"logprob":-9.57358},{"token":"?\n\n","bytes":[63,10,10],"logprob":-10.431084}]}],"refusal":null}}}},"agg_score":0.9952574125139473,"standard_error":0.06870295008214607},"execution_metadata":{"invocation_id":"private-thing-379551","experiment_id":"tidy-picture-368421","rollout_id":"smooth-concert-175178","run_id":"traditional-hour-630753","usage":{"completion_tokens":7,"prompt_tokens":11,"total_tokens":18},"cost_metrics":{"input_cost":5.5e-6,"output_cost":0.000010500000000000001,"total_cost_dollar":0.000016000000000000003},"duration_seconds":1.00264809600003,"experiment_duration_seconds":4.857228931000009,"finish_reason":"stop","tool_call_count":0},"created_at":"2025-12-15T16:33:31.522715Z","eval_metadata":{"name":"test_geval_with_logprobs","description":"Attach GEval scores while keeping the raw logprobs on the final message.","version":"0.0.0.dev112+g8dd93b8.dirty","status":{"code":100,"message":"Evaluation finished","details":[]},"num_runs":1,"aggregation_method":"mean"},"pid":8933}
diff --git a/examples/deepeval/artifacts/geval_logprobs_openai_fireworks.jsonl b/examples/deepeval/artifacts/geval_logprobs_openai_fireworks.jsonl
diff --git a/examples/deepeval/test_geval_with_logprobs.py b/examples/deepeval/test_geval_with_logprobs.py
@@ -0,0 +1,65 @@
+"""Example evaluation_test that wraps deepeval's GEval and captures logprobs.
+
+To run this example you will need `deepeval` installed and a compatible
+API key (e.g., OpenAI or Fireworks). You can override the base URL with
+``EP_LLM_API_BASE`` or ``EP_LLM_BASE_URL`` and pass provider-specific
+parameters through ``completion_params``. Logs are written to
+``~/.eval_protocol/datasets/<YYYY-MM-DD>.jsonl`` via the local filesystem
+logger so you can inspect the captured logprobs directly.
+"""
+
+from typing import List
+
+from eval_protocol.dataset_logger.local_fs_dataset_logger_adapter import LocalFSDatasetLoggerAdapter
+from eval_protocol.integrations.deepeval import adapt_metric
+from eval_protocol.models import EvaluationRow
+from eval_protocol.pytest import evaluation_test
+
+try:  # pragma: no cover - optional dependency for the example
+    from deepeval.metrics import GEval
+    from deepeval.test_case import LLMTestCaseParams
+except ImportError as exc:  # pragma: no cover - optional dependency for the example
+    raise ImportError("Install deepeval to run this example: pip install deepeval") from exc
+
+# Wrap deepeval's GEval with adapt_metric; the wrapped metric is invoked below
+# with ``messages=`` and ``ground_truth=`` keyword arguments. GEval is told to
+# consider the INPUT and ACTUAL_OUTPUT test-case parameters.
+# NOTE(review): how adapt_metric maps chat messages onto those parameters is
+# not visible in this file -- confirm before relying on "full chat context".
+wrapped_metric = adapt_metric(
+    GEval(
+        name="Helpful & Relevant",
+        criteria="Evaluate the helpfulness and relevance of the model output.",
+        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
+    )
+)
+
+
+@evaluation_test(
+    input_rows=[[EvaluationRow(messages=[{"role": "user", "content": "Say hello politely."}])]],
+    completion_params=[
+        {"model": "gpt-3.5-turbo", "logprobs": True, "top_logprobs": 3},
+        {
+            "model": "accounts/fireworks/models/qwen3-8b",
+            "logprobs": True,
+            "api_base": "https://api.fireworks.ai/inference/v1",
+            "custom_llm_provider": "fireworks_ai",
+        },
+    ],
+    logger=LocalFSDatasetLoggerAdapter(),
+    mode="all",
+)
+def test_geval_with_logprobs(rows: List[EvaluationRow]) -> List[EvaluationRow]:
+    """Attach GEval scores while keeping the raw logprobs on the final message."""
+    # NOTE(review): the docstring above is kept verbatim; the recorded artifact's
+    # eval_metadata.description matches it, which suggests it is read at runtime.
+
+    for row in rows:
+        # Score the whole conversation with the wrapped GEval metric.
+        eval_result = wrapped_metric(
+            messages=[message.model_dump(exclude_none=True) for message in row.messages],
+            ground_truth="Hello!",
+        )
+        row.evaluation_result = eval_result
+
+        # Pick the most recent assistant message by role rather than assuming it
+        # is the last entry; this also tolerates an empty message list.
+        last_assistant = next(
+            (message for message in reversed(row.messages) if message.role == "assistant"),
+            None,
+        )
+        if last_assistant is None or not last_assistant.logprobs:
+            continue
+        # Without any metric there is nowhere to attach the payload.
+        if not eval_result.metrics:
+            continue
+
+        # Forward the logprob payload to the first metric's metadata for
+        # debugging or analysis.
+        metric_key = next(iter(eval_result.metrics))
+        eval_result.metrics[metric_key].data["logprobs"] = last_assistant.logprobs
+
+    return rows
diff --git a/tests/test_rollout_logprobs.py b/tests/test_rollout_logprobs.py
@@ -0,0 +1,58 @@
+import asyncio
+
+import pytest
+from litellm.types.utils import Choices, Message as LLMMessage, ModelResponse
+
+from eval_protocol.dataset_logger import default_logger
+from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
+from eval_protocol.pytest.exception_config import get_default_exception_handler_config
+from eval_protocol.pytest.types import RolloutProcessorConfig
+
+
+def test_single_turn_rollout_captures_logprobs(monkeypatch):
+    """Logprobs returned on a completion choice are stored on the assistant message."""
+    rollout = SingleTurnRolloutProcessor(drop_trailing_assistant_messages=False)
+
+    rollout_config = RolloutProcessorConfig(
+        completion_params={"model": "test-model", "logprobs": True, "top_logprobs": 2},
+        mcp_config_path="",
+        semaphore=asyncio.Semaphore(1),
+        server_script_path=None,
+        steps=1,
+        logger=default_logger,
+        exception_handler_config=get_default_exception_handler_config(),
+    )
+
+    input_row = EvaluationRow(messages=[Message(role="user", content="hi")])
+
+    async def fake_acompletion(**kwargs):
+        # The processor must forward the logprob options to the completion call.
+        assert kwargs["logprobs"] is True
+        assert kwargs["top_logprobs"] == 2
+        choice = Choices(
+            index=0,
+            message=LLMMessage(role="assistant", content="hello"),
+            finish_reason="stop",
+            logprobs={"content": [{"token": "hello", "logprob": -0.1, "top_logprobs": []}]},
+        )
+        return ModelResponse(id="resp-1", choices=[choice], created=0, model="test-model")
+
+    # Replace the completion call used by the rollout module with the stub.
+    monkeypatch.setattr("eval_protocol.pytest.default_single_turn_rollout_process.acompletion", fake_acompletion)
+
+    async def _collect():
+        # The processor is invoked inside the running loop, then all rows are awaited.
+        return await asyncio.gather(*rollout([input_row], rollout_config))
+
+    final_message = asyncio.run(_collect())[0].messages[-1]
+
+    assert final_message.content == "hello"
+    captured = final_message.logprobs
+    assert isinstance(captured, dict)
+    first_token = captured["content"][0]
+    assert first_token["token"] == "hello"
+    assert first_token["logprob"] == -0.1

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"messages":[{"role":"user","content":"Say hello politely."},{"role":"assistant","content":"Hello, how are you today?","logprobs":{"content":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257,"top_logprobs":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257},{"token":"Good","bytes":[71,111,111,100],"logprob":-7.1408277},{"token":"\"","bytes":[34],"logprob":-7.267258}]},{"token":",","bytes":[44],"logprob":-0.011751671,"top_logprobs":[{"token":",","bytes":[44],"logprob":-0.011751671},{"token":" there","bytes":[32,116,104,101,114,101],"logprob":-5.0653806},{"token":"!","bytes":[33],"logprob":-5.312371}]},{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638,"top_logprobs":[{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638},{"token":" I","bytes":[32,73],"logprob":-3.3884728},{"token":" it","bytes":[32,105,116],"logprob":-3.5947793}]},{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342,"top_logprobs":[{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342},{"token":" may","bytes":[32,109,97,121],"logprob":-5.5837092},{"token":" do","bytes":[32,100,111],"logprob":-6.48237}]},{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6,"top_logprobs":[{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6},{"token":" your","bytes":[32,121,111,117,114],"logprob":-14.169239},{"token":" doing","bytes":[32,100,111,105,110,103],"logprob":-14.23296}]},{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095,"top_logprobs":[{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095},{"token":"?","bytes":[63],"logprob":-1.1978787},{"token":" 
doing","bytes":[32,100,111,105,110,103],"logprob":-1.263482}]},{"token":"?","bytes":[63],"logprob":-0.00011260267,"top_logprobs":[{"token":"?","bytes":[63],"logprob":-0.00011260267},{"token":"?\n","bytes":[63,10],"logprob":-9.57358},{"token":"?\n\n","bytes":[63,10,10],"logprob":-10.431084}]}],"refusal":null}}],"input_metadata":{"row_id":"liquid-school-081702","completion_params":{"model":"gpt-3.5-turbo","logprobs":true,"top_logprobs":3},"session_data":{"mode":"all"}},"rollout_status":{"code":100,"message":"Rollout finished","details":[]},"evaluation_result":{"score":0.9952574125139473,"is_score_valid":true,"reason":"The Actual Output directly addresses the Input by providing a polite greeting, 'Hello, how are you today?', which is both relevant and helpful. There is no missing or extraneous information, and the response aligns well with the request to say hello politely.","metrics":{"Helpful & Relevant [GEval]":{"is_score_valid":true,"score":0.9952574125139473,"reason":"The Actual Output directly addresses the Input by providing a polite greeting, 'Hello, how are you today?', which is both relevant and helpful. 
There is no missing or extraneous information, and the response aligns well with the request to say hello politely.","data":{"logprobs":{"content":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257,"top_logprobs":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257},{"token":"Good","bytes":[71,111,111,100],"logprob":-7.1408277},{"token":"\"","bytes":[34],"logprob":-7.267258}]},{"token":",","bytes":[44],"logprob":-0.011751671,"top_logprobs":[{"token":",","bytes":[44],"logprob":-0.011751671},{"token":" there","bytes":[32,116,104,101,114,101],"logprob":-5.0653806},{"token":"!","bytes":[33],"logprob":-5.312371}]},{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638,"top_logprobs":[{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638},{"token":" I","bytes":[32,73],"logprob":-3.3884728},{"token":" it","bytes":[32,105,116],"logprob":-3.5947793}]},{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342,"top_logprobs":[{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342},{"token":" may","bytes":[32,109,97,121],"logprob":-5.5837092},{"token":" do","bytes":[32,100,111],"logprob":-6.48237}]},{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6,"top_logprobs":[{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6},{"token":" your","bytes":[32,121,111,117,114],"logprob":-14.169239},{"token":" doing","bytes":[32,100,111,105,110,103],"logprob":-14.23296}]},{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095,"top_logprobs":[{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095},{"token":"?","bytes":[63],"logprob":-1.1978787},{"token":" 
doing","bytes":[32,100,111,105,110,103],"logprob":-1.263482}]},{"token":"?","bytes":[63],"logprob":-0.00011260267,"top_logprobs":[{"token":"?","bytes":[63],"logprob":-0.00011260267},{"token":"?\n","bytes":[63,10],"logprob":-9.57358},{"token":"?\n\n","bytes":[63,10,10],"logprob":-10.431084}]}],"refusal":null}}}},"agg_score":0.9952574125139473,"standard_error":0.06870295008214607},"execution_metadata":{"invocation_id":"private-thing-379551","experiment_id":"tidy-picture-368421","rollout_id":"smooth-concert-175178","run_id":"traditional-hour-630753","usage":{"completion_tokens":7,"prompt_tokens":11,"total_tokens":18},"cost_metrics":{"input_cost":5.5e-6,"output_cost":0.000010500000000000001,"total_cost_dollar":0.000016000000000000003},"duration_seconds":1.00264809600003,"experiment_duration_seconds":4.857228931000009,"finish_reason":"stop","tool_call_count":0},"created_at":"2025-12-15T16:33:31.522715Z","eval_metadata":{"name":"test_geval_with_logprobs","description":"Attach GEval scores while keeping the raw logprobs on the final message.","version":"0.0.0.dev112+g8dd93b8.dirty","status":{"code":100,"message":"Evaluation finished","details":[]},"num_runs":1,"aggregation_method":"mean"},"pid":8933}