Skip to content

Commit ff210d4

Browse files
committed
Add GEval logprob artifacts for OpenAI and Fireworks
1 parent 948961b commit ff210d4

File tree

9 files changed

+167
-3
lines changed

9 files changed

+167
-3
lines changed

eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from eval_protocol.common_utils import load_jsonl
99
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
10-
from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_datasets_dir
10+
from eval_protocol.directory_utils import find_eval_protocol_datasets_dir
1111

1212
if TYPE_CHECKING:
1313
from eval_protocol.models import EvaluationRow

eval_protocol/models.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -517,10 +517,20 @@ class Message(BaseModel):
517517
function_call: Optional[FunctionCall] = None
518518
control_plane_step: Optional[Dict[str, Any]] = None
519519
weight: Optional[int] = None
520+
logprobs: Optional[Any] = Field(
521+
default=None,
522+
description=(
523+
"Optional log probability metadata captured from the completion response. "
524+
"When present, this typically mirrors the provider-specific logprob payload."
525+
),
526+
)
520527

521528
def dump_mdoel_for_chat_completion_request(self):
    """Only keep chat completion accepted fields.

    Dumps the model with ``None`` values omitted and with the
    ``control_plane_step``, ``reasoning_content``, ``weight`` and
    ``logprobs`` fields excluded from the output.
    """
    # Fields left out of the dumped request payload.
    non_request_fields = {"control_plane_step", "reasoning_content", "weight", "logprobs"}
    return self.model_dump(exclude_none=True, exclude=non_request_fields)
524534

525535
@classmethod
526536
def model_validate(cls, obj, *args, **kwargs):

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import logging
44
import os
55
import time
6-
from typing import List
6+
from dataclasses import asdict, is_dataclass
7+
from types import SimpleNamespace
8+
from typing import Any, List
79

810
import litellm
911
from litellm import acompletion
@@ -19,6 +21,28 @@
1921
logger = logging.getLogger(__name__)
2022

2123

24+
def _serialize_logprobs(logprobs: Any) -> Any:
    """Best-effort conversion of provider logprobs into JSON-serializable data.

    The payload is first unwrapped into plain containers (``model_dump()``,
    ``dataclasses.asdict`` or ``vars``) and is then always passed through a
    JSON round-trip. The round-trip normalizes objects nested inside the
    unwrapped containers and guarantees the result does not alias the
    caller's object (``vars()`` returns the live ``__dict__``).

    Args:
        logprobs: Provider logprob payload: ``None``, a dict, a dataclass
            instance, a ``SimpleNamespace``, an object exposing
            ``model_dump()``, or any other object.

    Returns:
        ``None`` when ``logprobs`` is ``None``; otherwise a structure built
        from dicts, lists and scalars. Objects without a ``__dict__`` are
        rendered with ``str()``. If the JSON round-trip itself fails, the
        unwrapped payload is returned as-is instead of raising.
    """
    if logprobs is None:
        return None

    def _fallback(obj: Any) -> Any:
        # Called by json.dumps for values it cannot encode natively.
        return vars(obj) if hasattr(obj, "__dict__") else str(obj)

    payload = logprobs
    unwrapped = False
    if hasattr(logprobs, "model_dump"):
        try:
            payload = logprobs.model_dump()
            unwrapped = True
        except Exception:
            # Best-effort on purpose: a broken model_dump() must not fail the
            # rollout, but leave a trace instead of swallowing it silently.
            logging.getLogger(__name__).debug(
                "model_dump() failed while serializing logprobs; falling back",
                exc_info=True,
            )
    if not unwrapped:
        if is_dataclass(logprobs) and not isinstance(logprobs, type):
            payload = asdict(logprobs)
        elif isinstance(logprobs, SimpleNamespace):
            payload = vars(logprobs)

    try:
        return json.loads(json.dumps(payload, default=_fallback))
    except Exception:
        # E.g. circular references or unsupported dict keys: hand back the
        # unwrapped payload rather than raising from a logging-only path.
        return payload
44+
45+
2246
class SingleTurnRolloutProcessor(RolloutProcessor):
2347
"""Single turn rollout processor for direct LLM calls."""
2448

@@ -105,6 +129,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
105129

106130
assistant_message = response.choices[0].message
107131
finish_reason = getattr(response.choices[0], "finish_reason", None)
132+
assistant_logprobs = _serialize_logprobs(getattr(response.choices[0], "logprobs", None))
108133

109134
# Extract content
110135
assistant_content = assistant_message.content or ""
@@ -159,6 +184,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
159184
content=assistant_content,
160185
reasoning_content=reasoning_content,
161186
tool_calls=converted_tool_calls,
187+
logprobs=assistant_logprobs,
162188
)
163189
]
164190

examples/deepeval/artifacts/geval_logprobs_combined.jsonl

Lines changed: 2 additions & 0 deletions
Large diffs are not rendered by default.

examples/deepeval/artifacts/geval_logprobs_fireworks.jsonl

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"messages":[{"role":"user","content":"Say hello politely."},{"role":"assistant","content":"Hello, how are you today?","logprobs":{"content":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257,"top_logprobs":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257},{"token":"Good","bytes":[71,111,111,100],"logprob":-7.1408277},{"token":"\"","bytes":[34],"logprob":-7.267258}]},{"token":",","bytes":[44],"logprob":-0.011751671,"top_logprobs":[{"token":",","bytes":[44],"logprob":-0.011751671},{"token":" there","bytes":[32,116,104,101,114,101],"logprob":-5.0653806},{"token":"!","bytes":[33],"logprob":-5.312371}]},{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638,"top_logprobs":[{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638},{"token":" I","bytes":[32,73],"logprob":-3.3884728},{"token":" it","bytes":[32,105,116],"logprob":-3.5947793}]},{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342,"top_logprobs":[{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342},{"token":" may","bytes":[32,109,97,121],"logprob":-5.5837092},{"token":" do","bytes":[32,100,111],"logprob":-6.48237}]},{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6,"top_logprobs":[{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6},{"token":" your","bytes":[32,121,111,117,114],"logprob":-14.169239},{"token":" doing","bytes":[32,100,111,105,110,103],"logprob":-14.23296}]},{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095,"top_logprobs":[{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095},{"token":"?","bytes":[63],"logprob":-1.1978787},{"token":" 
doing","bytes":[32,100,111,105,110,103],"logprob":-1.263482}]},{"token":"?","bytes":[63],"logprob":-0.00011260267,"top_logprobs":[{"token":"?","bytes":[63],"logprob":-0.00011260267},{"token":"?\n","bytes":[63,10],"logprob":-9.57358},{"token":"?\n\n","bytes":[63,10,10],"logprob":-10.431084}]}],"refusal":null}}],"input_metadata":{"row_id":"liquid-school-081702","completion_params":{"model":"gpt-3.5-turbo","logprobs":true,"top_logprobs":3},"session_data":{"mode":"all"}},"rollout_status":{"code":100,"message":"Rollout finished","details":[]},"evaluation_result":{"score":0.9952574125139473,"is_score_valid":true,"reason":"The Actual Output directly addresses the Input by providing a polite greeting, 'Hello, how are you today?', which is both relevant and helpful. There is no missing or extraneous information, and the response aligns well with the request to say hello politely.","metrics":{"Helpful & Relevant [GEval]":{"is_score_valid":true,"score":0.9952574125139473,"reason":"The Actual Output directly addresses the Input by providing a polite greeting, 'Hello, how are you today?', which is both relevant and helpful. 
There is no missing or extraneous information, and the response aligns well with the request to say hello politely.","data":{"logprobs":{"content":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257,"top_logprobs":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257},{"token":"Good","bytes":[71,111,111,100],"logprob":-7.1408277},{"token":"\"","bytes":[34],"logprob":-7.267258}]},{"token":",","bytes":[44],"logprob":-0.011751671,"top_logprobs":[{"token":",","bytes":[44],"logprob":-0.011751671},{"token":" there","bytes":[32,116,104,101,114,101],"logprob":-5.0653806},{"token":"!","bytes":[33],"logprob":-5.312371}]},{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638,"top_logprobs":[{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638},{"token":" I","bytes":[32,73],"logprob":-3.3884728},{"token":" it","bytes":[32,105,116],"logprob":-3.5947793}]},{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342,"top_logprobs":[{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342},{"token":" may","bytes":[32,109,97,121],"logprob":-5.5837092},{"token":" do","bytes":[32,100,111],"logprob":-6.48237}]},{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6,"top_logprobs":[{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6},{"token":" your","bytes":[32,121,111,117,114],"logprob":-14.169239},{"token":" doing","bytes":[32,100,111,105,110,103],"logprob":-14.23296}]},{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095,"top_logprobs":[{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095},{"token":"?","bytes":[63],"logprob":-1.1978787},{"token":" 
doing","bytes":[32,100,111,105,110,103],"logprob":-1.263482}]},{"token":"?","bytes":[63],"logprob":-0.00011260267,"top_logprobs":[{"token":"?","bytes":[63],"logprob":-0.00011260267},{"token":"?\n","bytes":[63,10],"logprob":-9.57358},{"token":"?\n\n","bytes":[63,10,10],"logprob":-10.431084}]}],"refusal":null}}}},"agg_score":0.9952574125139473,"standard_error":0.06870295008214607},"execution_metadata":{"invocation_id":"private-thing-379551","experiment_id":"tidy-picture-368421","rollout_id":"smooth-concert-175178","run_id":"traditional-hour-630753","usage":{"completion_tokens":7,"prompt_tokens":11,"total_tokens":18},"cost_metrics":{"input_cost":5.5e-6,"output_cost":0.000010500000000000001,"total_cost_dollar":0.000016000000000000003},"duration_seconds":1.00264809600003,"experiment_duration_seconds":4.857228931000009,"finish_reason":"stop","tool_call_count":0},"created_at":"2025-12-15T16:33:31.522715Z","eval_metadata":{"name":"test_geval_with_logprobs","description":"Attach GEval scores while keeping the raw logprobs on the final message.","version":"0.0.0.dev112+g8dd93b8.dirty","status":{"code":100,"message":"Evaluation finished","details":[]},"num_runs":1,"aggregation_method":"mean"},"pid":8933}

examples/deepeval/artifacts/geval_logprobs_openai_fireworks.jsonl

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""Example evaluation_test that wraps deepeval's GEval and captures logprobs.
2+
3+
To run this example you will need `deepeval` installed and a compatible
4+
API key (e.g., OpenAI or Fireworks). You can override the base URL with
5+
``EP_LLM_API_BASE`` or ``EP_LLM_BASE_URL`` and pass provider-specific
6+
parameters through ``completion_params``. Logs are written to
7+
``~/.eval_protocol/datasets/<YYYY-MM-DD>.jsonl`` via the local filesystem
8+
logger so you can inspect the captured logprobs directly.
9+
"""
10+
11+
from typing import List
12+
13+
from eval_protocol.dataset_logger.local_fs_dataset_logger_adapter import LocalFSDatasetLoggerAdapter
14+
from eval_protocol.integrations.deepeval import adapt_metric
15+
from eval_protocol.models import EvaluationRow
16+
from eval_protocol.pytest import evaluation_test
17+
18+
try: # pragma: no cover - optional dependency for the example
19+
from deepeval.metrics import GEval
20+
from deepeval.test_case import LLMTestCaseParams
21+
except ImportError as exc: # pragma: no cover - optional dependency for the example
22+
raise ImportError("Install deepeval to run this example: pip install deepeval") from exc
23+
24+
# Configure GEval to judge the assistant response with the full chat context.
25+
# Build the GEval judge first, then adapt it. The judge scores the model
# output against the user input using the criteria below.
_helpfulness_judge = GEval(
    name="Helpful & Relevant",
    criteria="Evaluate the helpfulness and relevance of the model output.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)
wrapped_metric = adapt_metric(_helpfulness_judge)
32+
33+
34+
@evaluation_test(
    input_rows=[[EvaluationRow(messages=[{"role": "user", "content": "Say hello politely."}])]],
    completion_params=[
        {"model": "gpt-3.5-turbo", "logprobs": True, "top_logprobs": 3},
        {
            "model": "accounts/fireworks/models/qwen3-8b",
            "logprobs": True,
            "api_base": "https://api.fireworks.ai/inference/v1",
            "custom_llm_provider": "fireworks_ai",
        },
    ],
    logger=LocalFSDatasetLoggerAdapter(),
    mode="all",
)
def test_geval_with_logprobs(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """Attach GEval scores while keeping the raw logprobs on the final message.

    For every row the wrapped GEval metric is called with the row's dumped
    messages and a fixed ground truth, and its result is stored on
    ``row.evaluation_result``. When the final message carries a logprob
    payload, that payload is also copied into the ``data`` of the first
    metric in the result.

    Args:
        rows: Rows to score; each is mutated in place.

    Returns:
        The same ``rows`` list, with ``evaluation_result`` populated.
    """
    for row in rows:
        eval_result = wrapped_metric(
            messages=[message.model_dump(exclude_none=True) for message in row.messages],
            ground_truth="Hello!",
        )
        row.evaluation_result = eval_result

        # Logprob payload is available on the last assistant message after rollout
        # and can be forwarded to metric metadata for debugging or analysis.
        last_assistant = row.messages[-1]
        if not last_assistant.logprobs:
            continue

        # A result without any metrics has nowhere to attach the payload;
        # skip it instead of letting next() raise StopIteration.
        metric_key = next(iter(eval_result.metrics or {}), None)
        if metric_key is not None:
            eval_result.metrics[metric_key].data["logprobs"] = last_assistant.logprobs

    return rows

tests/test_rollout_logprobs.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import asyncio
2+
3+
import pytest
4+
from litellm.types.utils import Choices, Message as LLMMessage, ModelResponse
5+
6+
from eval_protocol.dataset_logger import default_logger
7+
from eval_protocol.models import EvaluationRow, Message
8+
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
9+
from eval_protocol.pytest.exception_config import get_default_exception_handler_config
10+
from eval_protocol.pytest.types import RolloutProcessorConfig
11+
12+
13+
def test_single_turn_rollout_captures_logprobs(monkeypatch):
    """Check that a single-turn rollout stores the choice's logprobs on the assistant message.

    ``acompletion`` is replaced by a stub that verifies the logprob request
    parameters were forwarded and returns a canned response carrying a
    logprob payload on its only choice.
    """
    rollout_processor = SingleTurnRolloutProcessor(drop_trailing_assistant_messages=False)

    rollout_config = RolloutProcessorConfig(
        completion_params={"model": "test-model", "logprobs": True, "top_logprobs": 2},
        mcp_config_path="",
        semaphore=asyncio.Semaphore(1),
        server_script_path=None,
        steps=1,
        logger=default_logger,
        exception_handler_config=get_default_exception_handler_config(),
    )

    input_row = EvaluationRow(messages=[Message(role="user", content="hi")])

    async def _stub_acompletion(**request_kwargs):
        # The completion params from the config must reach the LLM call.
        assert request_kwargs["logprobs"] is True
        assert request_kwargs["top_logprobs"] == 2
        canned_choice = Choices(
            index=0,
            message=LLMMessage(role="assistant", content="hello"),
            finish_reason="stop",
            logprobs={"content": [{"token": "hello", "logprob": -0.1, "top_logprobs": []}]},
        )
        return ModelResponse(
            id="resp-1",
            choices=[canned_choice],
            created=0,
            model="test-model",
        )

    monkeypatch.setattr(
        "eval_protocol.pytest.default_single_turn_rollout_process.acompletion",
        _stub_acompletion,
    )

    async def _drive() -> None:
        pending = rollout_processor([input_row], rollout_config)
        finished_rows = await asyncio.gather(*pending)

        final_message = finished_rows[0].messages[-1]
        assert final_message.content == "hello"

        captured = final_message.logprobs
        assert isinstance(captured, dict)
        first_entry = captured["content"][0]
        assert first_entry["token"] == "hello"
        assert first_entry["logprob"] == -0.1

    asyncio.run(_drive())

0 commit comments

Comments
 (0)