
Commit 158eab8

Testbed judge prompt (#348)
* Introduce ability for users to adjust the testbed judge prompt

Co-authored-by: Lorenzo De Marchis <[email protected]>
1 parent d2e2fb0 commit 158eab8

File tree

3 files changed: +198, -13 lines

src/server/api/utils/testbed_metrics.py

Lines changed: 109 additions & 0 deletions

@@ -0,0 +1,109 @@
"""
Copyright (c) 2024, 2025, Oracle and/or its affiliates.
Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.

Custom metrics for testbed evaluation.

This module provides a customizable correctness metric for evaluating chatbot answers
against reference answers. Unlike Giskard's default CorrectnessMetric, which has a
hardcoded prompt, this allows the system prompt to be configured via MCP prompts.
"""
# spell-checker:ignore giskard

from giskard.rag.metrics import CorrectnessMetric
from giskard.llm.client import ChatMessage, LLMClient, get_default_client
from giskard.llm.errors import LLMGenerationError
from giskard.rag.base import AgentAnswer
from giskard.rag.question_generators.utils import parse_json_output


def format_conversation(conversation: list[dict]) -> str:
    """Format conversation history for the evaluation prompt."""
    return "\n\n".join([f"<{msg['role'].lower()}>{msg['content']}</{msg['role'].lower()}>" for msg in conversation])


CORRECTNESS_INPUT_TEMPLATE = """
### AGENT DESCRIPTION
{description}

### CONVERSATION
{conversation}

### AGENT ANSWER
{answer}

### EXPECTED ANSWER
{reference_answer}
"""


class CustomCorrectnessMetric(CorrectnessMetric):  # pylint: disable=too-few-public-methods
    """Custom correctness metric with configurable system prompt."""

    def __init__(
        self,
        name: str,
        system_prompt: str,
        llm_client: LLMClient = None,
        agent_description: str = None,
    ):
        """Initialize the custom correctness metric.

        Args:
            name: The metric name (typically "correctness").
            system_prompt: The system prompt for the judge LLM.
            llm_client: Optional LLM client. If not provided, uses Giskard's default.
            agent_description: Description of the agent being evaluated.
        """
        # Call parent with name and llm_client only (CorrectnessMetric signature)
        super().__init__(name=name, llm_client=llm_client)
        self.system_prompt = system_prompt
        self.agent_description = agent_description or "A chatbot answering questions."

    def __call__(self, question_sample: dict, answer: AgentAnswer) -> dict:
        """Evaluate correctness of agent answer vs reference."""
        llm_client = self._llm_client or get_default_client()
        try:
            out = llm_client.complete(
                messages=[
                    ChatMessage(role="system", content=self.system_prompt),
                    ChatMessage(
                        role="user",
                        content=CORRECTNESS_INPUT_TEMPLATE.format(
                            conversation=format_conversation(
                                question_sample.conversation_history
                                + [{"role": "user", "content": question_sample.question}]
                            ),
                            answer=answer.message,
                            reference_answer=question_sample.reference_answer,
                            description=self.agent_description,
                        ),
                    ),
                ],
                temperature=0,
                format="json_object",
            )

            json_output = parse_json_output(
                out.content,
                llm_client=llm_client,
                keys=["correctness", "correctness_reason"],
                caller_id=self.__class__.__name__,
            )

            if "correctness" in json_output and not isinstance(json_output["correctness"], bool):
                raise LLMGenerationError(
                    f"Error in correctness evaluation: {json_output['correctness']}. "
                    "Expected boolean value for 'correctness' key."
                )

            # Strip correctness_reason when correct (LLM sometimes includes it anyway)
            if json_output.get("correctness") is True:
                json_output.pop("correctness_reason", None)

            return json_output

        except LLMGenerationError:
            raise
        except Exception as err:
            raise LLMGenerationError("Error while evaluating the agent") from err
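
For reference, a minimal usage sketch of the new metric outside the API layer, assuming giskard is installed and stubbing the Giskard question sample with a SimpleNamespace; the sample fields, question text, and answers below are illustrative, not taken from this commit:

from types import SimpleNamespace

from server.api.utils.testbed_metrics import (
    CORRECTNESS_INPUT_TEMPLATE,
    CustomCorrectnessMetric,
    format_conversation,
)

# Stand-in for a Giskard question sample; the real objects come from a QATestset.
sample = SimpleNamespace(
    question="What is the default listener port?",          # hypothetical
    reference_answer="The default listener port is 1521.",  # hypothetical
    conversation_history=[],
)

# Inspect the user message the judge LLM would receive.
user_message = CORRECTNESS_INPUT_TEMPLATE.format(
    description="A chatbot answering questions.",
    conversation=format_conversation(
        sample.conversation_history + [{"role": "user", "content": sample.question}]
    ),
    answer="The default listener port is 1521; it can be changed later.",  # hypothetical agent answer
    reference_answer=sample.reference_answer,
)
print(user_message)

# Constructing the metric needs only a name and the judge system prompt;
# calling metric(sample, answer) would invoke Giskard's default LLM client.
metric = CustomCorrectnessMetric(name="correctness", system_prompt="...judge prompt text...")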

src/server/api/v1/testbed.py

Lines changed: 32 additions & 12 deletions
@@ -4,14 +4,14 @@
 """
 # spell-checker:ignore testsets testset giskard litellm

-import asyncio
 import pickle
 import shutil

 from datetime import datetime
 import json
 from typing import Optional
 from giskard.rag import evaluate, QATestset
+from giskard.rag.base import AgentAnswer
 from giskard.llm import set_llm_model
 from fastapi import APIRouter, HTTPException, Header, UploadFile
 from fastapi.responses import JSONResponse

@@ -24,6 +24,8 @@
 import server.api.utils.testbed as utils_testbed
 import server.api.utils.databases as utils_databases
 import server.api.utils.models as utils_models
+from server.api.utils.testbed_metrics import CustomCorrectnessMetric
+from server.mcp.prompts.defaults import get_prompt_with_override

 from server.api.v1 import chat

@@ -229,26 +231,29 @@ async def testbed_generate_qa(
     return testset_qa


+async def _collect_testbed_answers(loaded_testset: QATestset, client: str) -> list[AgentAnswer]:
+    """Collect answers from the chatbot for all questions in the testset."""
+    answers = []
+    for sample in loaded_testset.to_pandas().itertuples():
+        request = schema.ChatRequest(
+            messages=[ChatMessage(role="human", content=sample.question)],
+        )
+        ai_response = await chat.chat_post(client=client, request=request)
+        answers.append(AgentAnswer(message=ai_response["choices"][0]["message"]["content"]))
+    return answers
+
+
 @auth.post(
     "/evaluate",
     description="Evaluate Q&A Test Set.",
     response_model=schema.EvaluationReport,
 )
-def testbed_evaluate(
+async def testbed_evaluate(
     tid: schema.TestSetsIdType,
     judge: str,
     client: schema.ClientIdType = Header(default="server"),
 ) -> schema.EvaluationReport:
     """Run evaluate against a testset"""
-
-    def get_answer(question: str):
-        """Submit question against the chatbot"""
-        request = schema.ChatRequest(
-            messages=[ChatMessage(role="human", content=question)],
-        )
-        ai_response = asyncio.run(chat.chat_post(client=client, request=request))
-        return ai_response["choices"][0]["message"]["content"]
-
     evaluated = datetime.now().isoformat()
     client_settings = utils_settings.get_client(client)
     # Disable History

@@ -271,8 +276,23 @@ def get_answer(question: str):

     judge_config = utils_models.get_litellm_config(model_config={"model": judge}, oci_config=oci_config, giskard=True)
     set_llm_model(llm_model=judge, **judge_config)
+
+    # Get judge prompt from MCP (allows override via Prompt Engineering page)
+    judge_prompt_message = get_prompt_with_override("optimizer_testbed-judge")
+    judge_prompt = judge_prompt_message.content.text
+
+    # Create custom metric with the configurable prompt
+    custom_metric = CustomCorrectnessMetric(
+        name="correctness",
+        system_prompt=judge_prompt,
+        agent_description="A chatbot answering questions.",
+    )
+
+    # Pre-compute answers asynchronously to avoid event loop conflicts with LiteLLM
+    answers = await _collect_testbed_answers(loaded_testset, client)
+
     try:
-        report = evaluate(get_answer, testset=loaded_testset, metrics=None)
+        report = evaluate(answers, testset=loaded_testset, metrics=[custom_metric])
     except KeyError as ex:
         if str(ex) == "'correctness'":
             raise HTTPException(status_code=500, detail="Unable to determine the correctness; please retry.") from ex
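
For context, a hedged sketch of how a client might call the updated endpoint. The base URL, router prefix, judge model name, and auth scheme are assumptions, not taken from this diff; only the tid/judge query parameters and the client header come from the handler signature:

import requests

response = requests.post(
    "http://localhost:8000/v1/testbed/evaluate",  # assumed host/port and mount point
    params={"tid": "my-testset-id", "judge": "openai/gpt-4o-mini"},  # illustrative values
    headers={
        "Authorization": "Bearer <token>",  # assumed auth scheme for the `auth` router
        "client": "server",                 # client id header; defaults to "server"
    },
    timeout=600,  # the endpoint now pre-collects an answer for every testset question
)
response.raise_for_status()
report = response.json()  # schema.EvaluationReport payload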

src/server/mcp/prompts/defaults.py

Lines changed: 57 additions & 1 deletion
@@ -2,8 +2,8 @@
 Copyright (c) 2024, 2025, Oracle and/or its affiliates.
 Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.
 """
+# spell-checker:ignore fastmcp giskard

-# spell-checker:ignore fastmcp
 from fastmcp.prompts.prompt import PromptMessage, TextContent
 from server.mcp.prompts import cache

@@ -216,6 +216,51 @@ def optimizer_vs_rephrase() -> PromptMessage:
     return PromptMessage(role="assistant", content=TextContent(type="text", text=clean_prompt_string(content)))


+def optimizer_testbed_judge() -> PromptMessage:
+    """Prompt for testbed evaluation judge.
+
+    Used to evaluate whether a chatbot's answer correctly matches the reference answer.
+    This prompt is more lenient than the default Giskard prompt - it allows additional
+    context in answers and only marks as incorrect when essential information is missing
+    or contradicted.
+    """
+    content = """
+    You are evaluating whether an AI assistant correctly answered a question.
+
+    EVALUATION CRITERIA:
+    1. CORRECT if the agent's answer contains the essential information from the EXPECTED ANSWER
+    2. Additional context, elaboration, historical background, or helpful details beyond the expected answer should NOT be penalized
+    3. INCORRECT only if the agent's answer to the specific question asked contradicts or conflicts with the expected answer
+
+    Consider the answer CORRECT if:
+    - The core question is answered accurately with facts matching the expected answer
+    - The agent provides extra context, background, comparisons, or elaboration (this is GOOD)
+    - Additional information about related topics does not change the core answer
+
+    Consider the answer INCORRECT if:
+    - The direct answer to the question contradicts the expected answer
+    - Essential information from the expected answer is missing or wrong
+    - The agent admits it doesn't know or cannot answer
+
+    IMPORTANT: Additional context is NOT a contradiction. For example:
+    - Expected: "The new default is X"
+    - Agent says: "The new default is X. Previously it was Y."
+    - This is CORRECT - the agent answered the question correctly and added helpful context.
+
+    You will receive:
+    - AGENT DESCRIPTION: What the agent does
+    - CONVERSATION: The chat history
+    - AGENT ANSWER: What the agent responded
+    - EXPECTED ANSWER: The correct answer to compare against
+
+    Output ONLY valid JSON with no additional text:
+    - If correct: {"correctness": true}
+    - If incorrect: {"correctness": false, "correctness_reason": "brief explanation of what was wrong or missing"}
+    """
+
+    return PromptMessage(role="assistant", content=TextContent(type="text", text=clean_prompt_string(content)))
+
+
 # MCP Registration
 async def register(mcp):
     """Register Out-of-Box Prompts"""

@@ -282,3 +327,14 @@ def rephrase_mcp() -> PromptMessage:
         based on conversation history before performing retrieval.
         """
         return get_prompt_with_override("optimizer_vs-rephrase")
+
+    @mcp.prompt(name="optimizer_testbed-judge", title="Testbed Judge Prompt", tags=optimizer_tags)
+    def testbed_judge_mcp() -> PromptMessage:
+        """Prompt for testbed evaluation judge.
+
+        Used by the testbed to evaluate whether the chatbot's answer matches the reference.
+        Configurable to adjust evaluation strictness. The default prompt is lenient -
+        it allows additional context in answers and only fails on contradictions or
+        missing essential information.
+        """
+        return get_prompt_with_override("optimizer_testbed-judge")
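
As a sanity check on the contract between this prompt and CustomCorrectnessMetric, a small illustrative sketch of the two verdict shapes the judge is asked to emit, mirroring the boolean check the metric applies (the reason string is a made-up example):

import json

# The only two shapes the judge prompt allows.
correct_verdict = json.loads('{"correctness": true}')
incorrect_verdict = json.loads('{"correctness": false, "correctness_reason": "Essential detail missing."}')

for verdict in (correct_verdict, incorrect_verdict):
    # CustomCorrectnessMetric raises LLMGenerationError for non-boolean "correctness" values.
    assert isinstance(verdict["correctness"], bool)
    # correctness_reason is only kept when the answer is judged incorrect.
    if verdict["correctness"]:
        verdict.pop("correctness_reason", None)

print(correct_verdict)    # {'correctness': True}
print(incorrect_verdict)  # {'correctness': False, 'correctness_reason': 'Essential detail missing.'}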
