Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from eval_protocol.common_utils import load_jsonl
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_datasets_dir
from eval_protocol.directory_utils import find_eval_protocol_datasets_dir

if TYPE_CHECKING:
from eval_protocol.models import EvaluationRow
Expand Down
12 changes: 11 additions & 1 deletion eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,10 +517,20 @@ class Message(BaseModel):
function_call: Optional[FunctionCall] = None
control_plane_step: Optional[Dict[str, Any]] = None
weight: Optional[int] = None
logprobs: Optional[Any] = Field(
default=None,
description=(
"Optional log probability metadata captured from the completion response. "
"When present, this typically mirrors the provider-specific logprob payload."
),
)

def dump_mdoel_for_chat_completion_request(self):
    """Dump this message keeping only fields accepted by a chat completion request.

    Delegates to ``self.model_dump`` with ``exclude_none=True`` and an
    explicit exclusion set, so the following are never part of the result:
    ``control_plane_step``, ``reasoning_content``, ``weight`` and
    ``logprobs``.

    Returns:
        Whatever ``self.model_dump`` returns for those arguments.

    NOTE: the method name is misspelled ("mdoel") but is kept as-is because
    it is the public name; renaming it would break existing call sites.
    """
    # A single return: ``logprobs`` must be excluded as well, otherwise the
    # captured completion metadata would be echoed back to the provider.
    return self.model_dump(
        exclude_none=True,
        exclude={"control_plane_step", "reasoning_content", "weight", "logprobs"},
    )

@classmethod
def model_validate(cls, obj, *args, **kwargs):
Expand Down
28 changes: 27 additions & 1 deletion eval_protocol/pytest/default_single_turn_rollout_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import logging
import os
import time
from typing import List
from dataclasses import asdict, is_dataclass
from types import SimpleNamespace
from typing import Any, List

import litellm
from litellm import acompletion
Expand All @@ -19,6 +21,28 @@
logger = logging.getLogger(__name__)


def _serialize_logprobs(logprobs: Any) -> Any:
    """Best-effort conversion of provider logprobs into JSON-serializable data.

    Strategies are tried in order:

    1. ``None`` is returned unchanged.
    2. Objects exposing ``model_dump`` are dumped through it; if that call
       raises, the remaining strategies are tried instead.
    3. Dataclass *instances* are converted with ``dataclasses.asdict``.
    4. Plain ``dict`` payloads are returned as-is (same object).
    5. Anything else -- including ``SimpleNamespace`` -- goes through a JSON
       round trip in which non-serializable objects are replaced by their
       ``__dict__`` when they have one, or by ``str(obj)`` otherwise.

    Args:
        logprobs: The logprob payload taken from a completion choice.

    Returns:
        A plain-data representation of ``logprobs``. If the JSON round trip
        fails, the unconverted value is returned (for a ``SimpleNamespace``,
        a shallow ``dict`` copy of its attributes). This function is not
        expected to raise for conversion failures.
    """
    if logprobs is None:
        return None

    if hasattr(logprobs, "model_dump"):
        try:
            return logprobs.model_dump()
        except Exception:
            # Deliberate best-effort: a failing model_dump must not abort the
            # rollout, so fall through to the generic strategies below.
            pass

    if is_dataclass(logprobs) and not isinstance(logprobs, type):
        return asdict(logprobs)

    if isinstance(logprobs, dict):
        return logprobs

    if isinstance(logprobs, SimpleNamespace):
        # Copy instead of returning ``vars(...)`` directly: the latter is the
        # namespace's live ``__dict__``, so later edits to the result would
        # mutate the provider object. Nested namespaces are flattened by the
        # JSON round trip below.
        candidate: Any = dict(vars(logprobs))
    else:
        candidate = logprobs

    try:
        encoded = json.dumps(candidate, default=lambda o: getattr(o, "__dict__", str(o)))
        return json.loads(encoded)
    except Exception:
        # Circular references, non-string keys, etc.: hand back what we have
        # rather than failing the caller.
        return candidate


class SingleTurnRolloutProcessor(RolloutProcessor):
"""Single turn rollout processor for direct LLM calls."""

Expand Down Expand Up @@ -105,6 +129,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:

assistant_message = response.choices[0].message
finish_reason = getattr(response.choices[0], "finish_reason", None)
assistant_logprobs = _serialize_logprobs(getattr(response.choices[0], "logprobs", None))

# Extract content
assistant_content = assistant_message.content or ""
Expand Down Expand Up @@ -159,6 +184,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
content=assistant_content,
reasoning_content=reasoning_content,
tool_calls=converted_tool_calls,
logprobs=assistant_logprobs,
)
]

Expand Down
2 changes: 2 additions & 0 deletions examples/deepeval/artifacts/geval_logprobs_combined.jsonl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions examples/deepeval/artifacts/geval_logprobs_fireworks.jsonl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions examples/deepeval/artifacts/geval_logprobs_openai.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"messages":[{"role":"user","content":"Say hello politely."},{"role":"assistant","content":"Hello, how are you today?","logprobs":{"content":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257,"top_logprobs":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257},{"token":"Good","bytes":[71,111,111,100],"logprob":-7.1408277},{"token":"\"","bytes":[34],"logprob":-7.267258}]},{"token":",","bytes":[44],"logprob":-0.011751671,"top_logprobs":[{"token":",","bytes":[44],"logprob":-0.011751671},{"token":" there","bytes":[32,116,104,101,114,101],"logprob":-5.0653806},{"token":"!","bytes":[33],"logprob":-5.312371}]},{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638,"top_logprobs":[{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638},{"token":" I","bytes":[32,73],"logprob":-3.3884728},{"token":" it","bytes":[32,105,116],"logprob":-3.5947793}]},{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342,"top_logprobs":[{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342},{"token":" may","bytes":[32,109,97,121],"logprob":-5.5837092},{"token":" do","bytes":[32,100,111],"logprob":-6.48237}]},{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6,"top_logprobs":[{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6},{"token":" your","bytes":[32,121,111,117,114],"logprob":-14.169239},{"token":" doing","bytes":[32,100,111,105,110,103],"logprob":-14.23296}]},{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095,"top_logprobs":[{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095},{"token":"?","bytes":[63],"logprob":-1.1978787},{"token":" 
doing","bytes":[32,100,111,105,110,103],"logprob":-1.263482}]},{"token":"?","bytes":[63],"logprob":-0.00011260267,"top_logprobs":[{"token":"?","bytes":[63],"logprob":-0.00011260267},{"token":"?\n","bytes":[63,10],"logprob":-9.57358},{"token":"?\n\n","bytes":[63,10,10],"logprob":-10.431084}]}],"refusal":null}}],"input_metadata":{"row_id":"liquid-school-081702","completion_params":{"model":"gpt-3.5-turbo","logprobs":true,"top_logprobs":3},"session_data":{"mode":"all"}},"rollout_status":{"code":100,"message":"Rollout finished","details":[]},"evaluation_result":{"score":0.9952574125139473,"is_score_valid":true,"reason":"The Actual Output directly addresses the Input by providing a polite greeting, 'Hello, how are you today?', which is both relevant and helpful. There is no missing or extraneous information, and the response aligns well with the request to say hello politely.","metrics":{"Helpful & Relevant [GEval]":{"is_score_valid":true,"score":0.9952574125139473,"reason":"The Actual Output directly addresses the Input by providing a polite greeting, 'Hello, how are you today?', which is both relevant and helpful. 
There is no missing or extraneous information, and the response aligns well with the request to say hello politely.","data":{"logprobs":{"content":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257,"top_logprobs":[{"token":"Hello","bytes":[72,101,108,108,111],"logprob":-0.001780257},{"token":"Good","bytes":[71,111,111,100],"logprob":-7.1408277},{"token":"\"","bytes":[34],"logprob":-7.267258}]},{"token":",","bytes":[44],"logprob":-0.011751671,"top_logprobs":[{"token":",","bytes":[44],"logprob":-0.011751671},{"token":" there","bytes":[32,116,104,101,114,101],"logprob":-5.0653806},{"token":"!","bytes":[33],"logprob":-5.312371}]},{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638,"top_logprobs":[{"token":" how","bytes":[32,104,111,119],"logprob":-0.08952638},{"token":" I","bytes":[32,73],"logprob":-3.3884728},{"token":" it","bytes":[32,105,116],"logprob":-3.5947793}]},{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342,"top_logprobs":[{"token":" are","bytes":[32,97,114,101],"logprob":-0.005683342},{"token":" may","bytes":[32,109,97,121],"logprob":-5.5837092},{"token":" do","bytes":[32,100,111],"logprob":-6.48237}]},{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6,"top_logprobs":[{"token":" you","bytes":[32,121,111,117],"logprob":-3.7697225e-6},{"token":" your","bytes":[32,121,111,117,114],"logprob":-14.169239},{"token":" doing","bytes":[32,100,111,105,110,103],"logprob":-14.23296}]},{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095,"top_logprobs":[{"token":" today","bytes":[32,116,111,100,97,121],"logprob":-0.87895095},{"token":"?","bytes":[63],"logprob":-1.1978787},{"token":" 
doing","bytes":[32,100,111,105,110,103],"logprob":-1.263482}]},{"token":"?","bytes":[63],"logprob":-0.00011260267,"top_logprobs":[{"token":"?","bytes":[63],"logprob":-0.00011260267},{"token":"?\n","bytes":[63,10],"logprob":-9.57358},{"token":"?\n\n","bytes":[63,10,10],"logprob":-10.431084}]}],"refusal":null}}}},"agg_score":0.9952574125139473,"standard_error":0.06870295008214607},"execution_metadata":{"invocation_id":"private-thing-379551","experiment_id":"tidy-picture-368421","rollout_id":"smooth-concert-175178","run_id":"traditional-hour-630753","usage":{"completion_tokens":7,"prompt_tokens":11,"total_tokens":18},"cost_metrics":{"input_cost":5.5e-6,"output_cost":0.000010500000000000001,"total_cost_dollar":0.000016000000000000003},"duration_seconds":1.00264809600003,"experiment_duration_seconds":4.857228931000009,"finish_reason":"stop","tool_call_count":0},"created_at":"2025-12-15T16:33:31.522715Z","eval_metadata":{"name":"test_geval_with_logprobs","description":"Attach GEval scores while keeping the raw logprobs on the final message.","version":"0.0.0.dev112+g8dd93b8.dirty","status":{"code":100,"message":"Evaluation finished","details":[]},"num_runs":1,"aggregation_method":"mean"},"pid":8933}

Large diffs are not rendered by default.

65 changes: 65 additions & 0 deletions examples/deepeval/test_geval_with_logprobs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""Example evaluation_test that wraps deepeval's GEval and captures logprobs.

To run this example you will need `deepeval` installed and a compatible
API key (e.g., OpenAI or Fireworks). You can override the base URL with
``EP_LLM_API_BASE`` or ``EP_LLM_BASE_URL`` and pass provider-specific
parameters through ``completion_params``. Logs are written to
``~/.eval_protocol/datasets/<YYYY-MM-DD>.jsonl`` via the local filesystem
logger so you can inspect the captured logprobs directly.
"""

from typing import List

from eval_protocol.dataset_logger.local_fs_dataset_logger_adapter import LocalFSDatasetLoggerAdapter
from eval_protocol.integrations.deepeval import adapt_metric
from eval_protocol.models import EvaluationRow
from eval_protocol.pytest import evaluation_test

try: # pragma: no cover - optional dependency for the example
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
except ImportError as exc: # pragma: no cover - optional dependency for the example
raise ImportError("Install deepeval to run this example: pip install deepeval") from exc

# Build the GEval metric used by this example and wrap it with adapt_metric.
# The metric is named "Helpful & Relevant" and is configured to evaluate on
# the test case INPUT and ACTUAL_OUTPUT parameters. The wrapped metric is
# called further down with ``messages=`` and ``ground_truth=`` keyword
# arguments.
# NOTE(review): how adapt_metric maps chat messages onto INPUT/ACTUAL_OUTPUT
# is not visible in this file -- confirm in eval_protocol.integrations.deepeval.
wrapped_metric = adapt_metric(
GEval(
name="Helpful & Relevant",
criteria="Evaluate the helpfulness and relevance of the model output.",
evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)
)


@evaluation_test(
    input_rows=[[EvaluationRow(messages=[{"role": "user", "content": "Say hello politely."}])]],
    completion_params=[
        {"model": "gpt-3.5-turbo", "logprobs": True, "top_logprobs": 3},
        {
            "model": "accounts/fireworks/models/qwen3-8b",
            "logprobs": True,
            "api_base": "https://api.fireworks.ai/inference/v1",
            "custom_llm_provider": "fireworks_ai",
        },
    ],
    logger=LocalFSDatasetLoggerAdapter(),
    mode="all",
)
def test_geval_with_logprobs(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """Attach GEval scores while keeping the raw logprobs on the final message."""
    # For every row: score the conversation with the wrapped GEval metric,
    # store the result on the row, and -- when the final message carries a
    # logprob payload -- copy that payload into the first metric's ``data``
    # so it is available for debugging or analysis.
    for row in rows:
        eval_result = wrapped_metric(
            messages=[message.model_dump(exclude_none=True) for message in row.messages],
            ground_truth="Hello!",
        )
        row.evaluation_result = eval_result

        # The final message is presumably the assistant reply produced by the
        # rollout -- verify against the rollout processor in use.
        last_assistant = row.messages[-1]
        metrics = eval_result.metrics

        # Guard on ``metrics`` too: ``next(iter(...))`` on an empty mapping
        # raises StopIteration, which would fail the whole evaluation just
        # because there is nowhere to attach the optional debug payload.
        if last_assistant.logprobs and metrics:
            metric_key = next(iter(metrics))
            metrics[metric_key].data["logprobs"] = last_assistant.logprobs

    return rows
58 changes: 58 additions & 0 deletions tests/test_rollout_logprobs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import asyncio

import pytest
from litellm.types.utils import Choices, Message as LLMMessage, ModelResponse

from eval_protocol.dataset_logger import default_logger
from eval_protocol.models import EvaluationRow, Message
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
from eval_protocol.pytest.exception_config import get_default_exception_handler_config
from eval_protocol.pytest.types import RolloutProcessorConfig


def test_single_turn_rollout_captures_logprobs(monkeypatch):
    """The single-turn processor stores the choice's logprobs on the assistant message.

    ``acompletion`` is replaced with a stub that checks the logprob-related
    completion params were forwarded and returns one choice carrying a
    logprobs payload; the resulting row's last message must expose that
    payload as a plain dict.
    """

    async def fake_acompletion(**kwargs):
        # The completion params from the config must reach the LLM call.
        assert kwargs["logprobs"] is True
        assert kwargs["top_logprobs"] == 2

        logprobs = {"content": [{"token": "hello", "logprob": -0.1, "top_logprobs": []}]}
        choice = Choices(
            index=0,
            message=LLMMessage(role="assistant", content="hello"),
            finish_reason="stop",
            logprobs=logprobs,
        )
        return ModelResponse(id="resp-1", choices=[choice], created=0, model="test-model")

    rollout_processor = SingleTurnRolloutProcessor(drop_trailing_assistant_messages=False)

    rollout_config = RolloutProcessorConfig(
        completion_params={"model": "test-model", "logprobs": True, "top_logprobs": 2},
        mcp_config_path="",
        semaphore=asyncio.Semaphore(1),
        server_script_path=None,
        steps=1,
        logger=default_logger,
        exception_handler_config=get_default_exception_handler_config(),
    )

    input_row = EvaluationRow(messages=[Message(role="user", content="hi")])

    monkeypatch.setattr("eval_protocol.pytest.default_single_turn_rollout_process.acompletion", fake_acompletion)

    async def _run() -> None:
        pending = rollout_processor([input_row], rollout_config)
        completed_rows = await asyncio.gather(*pending)

        assert completed_rows[0].messages[-1].content == "hello"

        assistant_logprobs = completed_rows[0].messages[-1].logprobs
        assert isinstance(assistant_logprobs, dict)

        first_token = assistant_logprobs["content"][0]
        assert first_token["token"] == "hello"
        assert first_token["logprob"] == -0.1

    asyncio.run(_run())
Loading