
Commit 158eab8

Testbed judge prompt (#348)
* Introduce ability for users to adjust the testbed judge prompt

Co-authored-by: Lorenzo De Marchis <[email protected]>
1 parent d2e2fb0 commit 158eab8

File tree

3 files changed: +198, -13 lines

src/server/api/utils/testbed_metrics.py

Lines changed: 109 additions & 0 deletions

@@ -0,0 +1,109 @@
"""
Copyright (c) 2024, 2025, Oracle and/or its affiliates.
Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.

Custom metrics for testbed evaluation.

This module provides a customizable correctness metric for evaluating chatbot answers
against reference answers. Unlike Giskard's default CorrectnessMetric, which has a
hardcoded prompt, this allows the system prompt to be configured via MCP prompts.
"""
# spell-checker:ignore giskard

from giskard.rag.metrics import CorrectnessMetric
from giskard.llm.client import ChatMessage, LLMClient, get_default_client
from giskard.llm.errors import LLMGenerationError
from giskard.rag.base import AgentAnswer
from giskard.rag.question_generators.utils import parse_json_output


def format_conversation(conversation: list[dict]) -> str:
    """Format conversation history for the evaluation prompt."""
    return "\n\n".join([f"<{msg['role'].lower()}>{msg['content']}</{msg['role'].lower()}>" for msg in conversation])


CORRECTNESS_INPUT_TEMPLATE = """
### AGENT DESCRIPTION
{description}

### CONVERSATION
{conversation}

### AGENT ANSWER
{answer}

### EXPECTED ANSWER
{reference_answer}
"""


class CustomCorrectnessMetric(CorrectnessMetric):  # pylint: disable=too-few-public-methods
    """Custom correctness metric with configurable system prompt."""

    def __init__(
        self,
        name: str,
        system_prompt: str,
        llm_client: LLMClient = None,
        agent_description: str = None,
    ):
        """Initialize the custom correctness metric.

        Args:
            name: The metric name (typically "correctness").
            system_prompt: The system prompt for the judge LLM.
            llm_client: Optional LLM client. If not provided, uses Giskard's default.
            agent_description: Description of the agent being evaluated.
        """
        # Call parent with name and llm_client only (CorrectnessMetric signature)
        super().__init__(name=name, llm_client=llm_client)
        self.system_prompt = system_prompt
        self.agent_description = agent_description or "A chatbot answering questions."

    def __call__(self, question_sample: dict, answer: AgentAnswer) -> dict:
        """Evaluate correctness of agent answer vs reference."""
        llm_client = self._llm_client or get_default_client()
        try:
            out = llm_client.complete(
                messages=[
                    ChatMessage(role="system", content=self.system_prompt),
                    ChatMessage(
                        role="user",
                        content=CORRECTNESS_INPUT_TEMPLATE.format(
                            conversation=format_conversation(
                                question_sample.conversation_history
                                + [{"role": "user", "content": question_sample.question}]
                            ),
                            answer=answer.message,
                            reference_answer=question_sample.reference_answer,
                            description=self.agent_description,
                        ),
                    ),
                ],
                temperature=0,
                format="json_object",
            )

            json_output = parse_json_output(
                out.content,
                llm_client=llm_client,
                keys=["correctness", "correctness_reason"],
                caller_id=self.__class__.__name__,
            )

            if "correctness" in json_output and not isinstance(json_output["correctness"], bool):
                raise LLMGenerationError(
                    f"Error in correctness evaluation: {json_output['correctness']}. "
                    "Expected boolean value for 'correctness' key."
                )

            # Strip correctness_reason when correct (LLM sometimes includes it anyway)
            if json_output.get("correctness") is True:
                json_output.pop("correctness_reason", None)

            return json_output

        except LLMGenerationError:
            raise
        except Exception as err:
            raise LLMGenerationError("Error while evaluating the agent") from err
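
For reference, a minimal usage sketch of the new metric outside the API layer, assuming giskard is installed and stubbing the Giskard question sample with a SimpleNamespace; the sample fields, question text, and answers below are illustrative, not taken from this commit:

from types import SimpleNamespace

from server.api.utils.testbed_metrics import (
    CORRECTNESS_INPUT_TEMPLATE,
    CustomCorrectnessMetric,
    format_conversation,
)

# Stand-in for a Giskard question sample; the real objects come from a QATestset.
sample = SimpleNamespace(
    question="What is the default listener port?",          # hypothetical
    reference_answer="The default listener port is 1521.",  # hypothetical
    conversation_history=[],
)

# Inspect the user message the judge LLM would receive.
user_message = CORRECTNESS_INPUT_TEMPLATE.format(
    description="A chatbot answering questions.",
    conversation=format_conversation(
        sample.conversation_history + [{"role": "user", "content": sample.question}]
    ),
    answer="The default listener port is 1521; it can be changed later.",  # hypothetical agent answer
    reference_answer=sample.reference_answer,
)
print(user_message)

# Constructing the metric needs only a name and the judge system prompt;
# calling metric(sample, answer) would invoke Giskard's default LLM client.
metric = CustomCorrectnessMetric(name="correctness", system_prompt="...judge prompt text...")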

src/server/api/v1/testbed.py

Lines changed: 32 additions & 12 deletions
@@ -4,14 +4,14 @@
 """
 # spell-checker:ignore testsets testset giskard litellm

-import asyncio
 import pickle
 import shutil

 from datetime import datetime
 import json
 from typing import Optional
 from giskard.rag import evaluate, QATestset
+from giskard.rag.base import AgentAnswer
 from giskard.llm import set_llm_model
 from fastapi import APIRouter, HTTPException, Header, UploadFile
 from fastapi.responses import JSONResponse

@@ -24,6 +24,8 @@
 import server.api.utils.testbed as utils_testbed
 import server.api.utils.databases as utils_databases
 import server.api.utils.models as utils_models
+from server.api.utils.testbed_metrics import CustomCorrectnessMetric
+from server.mcp.prompts.defaults import get_prompt_with_override

 from server.api.v1 import chat

@@ -229,26 +231,29 @@ async def testbed_generate_qa(
     return testset_qa


+async def _collect_testbed_answers(loaded_testset: QATestset, client: str) -> list[AgentAnswer]:
+    """Collect answers from the chatbot for all questions in the testset."""
+    answers = []
+    for sample in loaded_testset.to_pandas().itertuples():
+        request = schema.ChatRequest(
+            messages=[ChatMessage(role="human", content=sample.question)],
+        )
+        ai_response = await chat.chat_post(client=client, request=request)
+        answers.append(AgentAnswer(message=ai_response["choices"][0]["message"]["content"]))
+    return answers
+
+
 @auth.post(
     "/evaluate",
     description="Evaluate Q&A Test Set.",
     response_model=schema.EvaluationReport,
 )
-def testbed_evaluate(
+async def testbed_evaluate(
     tid: schema.TestSetsIdType,
     judge: str,
     client: schema.ClientIdType = Header(default="server"),
 ) -> schema.EvaluationReport:
     """Run evaluate against a testset"""
-
-    def get_answer(question: str):
-        """Submit question against the chatbot"""
-        request = schema.ChatRequest(
-            messages=[ChatMessage(role="human", content=question)],
-        )
-        ai_response = asyncio.run(chat.chat_post(client=client, request=request))
-        return ai_response["choices"][0]["message"]["content"]
-
     evaluated = datetime.now().isoformat()
     client_settings = utils_settings.get_client(client)
     # Disable History

@@ -271,8 +276,23 @@ def get_answer(question: str):

     judge_config = utils_models.get_litellm_config(model_config={"model": judge}, oci_config=oci_config, giskard=True)
     set_llm_model(llm_model=judge, **judge_config)
+
+    # Get judge prompt from MCP (allows override via Prompt Engineering page)
+    judge_prompt_message = get_prompt_with_override("optimizer_testbed-judge")
+    judge_prompt = judge_prompt_message.content.text
+
+    # Create custom metric with the configurable prompt
+    custom_metric = CustomCorrectnessMetric(
+        name="correctness",
+        system_prompt=judge_prompt,
+        agent_description="A chatbot answering questions.",
+    )
+
+    # Pre-compute answers asynchronously to avoid event loop conflicts with LiteLLM
+    answers = await _collect_testbed_answers(loaded_testset, client)
+
     try:
-        report = evaluate(get_answer, testset=loaded_testset, metrics=None)
+        report = evaluate(answers, testset=loaded_testset, metrics=[custom_metric])
     except KeyError as ex:
         if str(ex) == "'correctness'":
             raise HTTPException(status_code=500, detail="Unable to determine the correctness; please retry.") from ex
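
For context, a hedged sketch of how a client might call the updated endpoint. The base URL, router prefix, judge model name, and auth scheme are assumptions, not taken from this diff; only the tid/judge query parameters and the client header come from the handler signature:

import requests

response = requests.post(
    "http://localhost:8000/v1/testbed/evaluate",  # assumed host/port and mount point
    params={"tid": "my-testset-id", "judge": "openai/gpt-4o-mini"},  # illustrative values
    headers={
        "Authorization": "Bearer <token>",  # assumed auth scheme for the `auth` router
        "client": "server",                 # client id header; defaults to "server"
    },
    timeout=600,  # the endpoint now pre-collects an answer for every testset question
)
response.raise_for_status()
report = response.json()  # schema.EvaluationReport payload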

src/server/mcp/prompts/defaults.py

Lines changed: 57 additions & 1 deletion
@@ -2,8 +2,8 @@
 Copyright (c) 2024, 2025, Oracle and/or its affiliates.
 Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.
 """
+# spell-checker:ignore fastmcp giskard

-# spell-checker:ignore fastmcp
 from fastmcp.prompts.prompt import PromptMessage, TextContent
 from server.mcp.prompts import cache

@@ -216,6 +216,51 @@ def optimizer_vs_rephrase() -> PromptMessage:
     return PromptMessage(role="assistant", content=TextContent(type="text", text=clean_prompt_string(content)))


+def optimizer_testbed_judge() -> PromptMessage:
+    """Prompt for testbed evaluation judge.
+
+    Used to evaluate whether a chatbot's answer correctly matches the reference answer.
+    This prompt is more lenient than the default Giskard prompt - it allows additional
+    context in answers and only marks as incorrect when essential information is missing
+    or contradicted.
+    """
+    content = """
+    You are evaluating whether an AI assistant correctly answered a question.
+
+    EVALUATION CRITERIA:
+    1. CORRECT if the agent's answer contains the essential information from the EXPECTED ANSWER
+    2. Additional context, elaboration, historical background, or helpful details beyond the expected answer should NOT be penalized
+    3. INCORRECT only if the agent's answer to the specific question asked contradicts or conflicts with the expected answer
+
+    Consider the answer CORRECT if:
+    - The core question is answered accurately with facts matching the expected answer
+    - The agent provides extra context, background, comparisons, or elaboration (this is GOOD)
+    - Additional information about related topics does not change the core answer
+
+    Consider the answer INCORRECT if:
+    - The direct answer to the question contradicts the expected answer
+    - Essential information from the expected answer is missing or wrong
+    - The agent admits it doesn't know or cannot answer
+
+    IMPORTANT: Additional context is NOT a contradiction. For example:
+    - Expected: "The new default is X"
+    - Agent says: "The new default is X. Previously it was Y."
+    - This is CORRECT - the agent answered the question correctly and added helpful context.
+
+    You will receive:
+    - AGENT DESCRIPTION: What the agent does
+    - CONVERSATION: The chat history
+    - AGENT ANSWER: What the agent responded
+    - EXPECTED ANSWER: The correct answer to compare against
+
+    Output ONLY valid JSON with no additional text:
+    - If correct: {"correctness": true}
+    - If incorrect: {"correctness": false, "correctness_reason": "brief explanation of what was wrong or missing"}
+    """
+
+    return PromptMessage(role="assistant", content=TextContent(type="text", text=clean_prompt_string(content)))
+
+
 # MCP Registration
 async def register(mcp):
     """Register Out-of-Box Prompts"""

@@ -282,3 +327,14 @@ def rephrase_mcp() -> PromptMessage:
         based on conversation history before performing retrieval.
         """
         return get_prompt_with_override("optimizer_vs-rephrase")
+
+    @mcp.prompt(name="optimizer_testbed-judge", title="Testbed Judge Prompt", tags=optimizer_tags)
+    def testbed_judge_mcp() -> PromptMessage:
+        """Prompt for testbed evaluation judge.
+
+        Used by the testbed to evaluate whether the chatbot's answer matches the reference.
+        Configurable to adjust evaluation strictness. The default prompt is lenient -
+        it allows additional context in answers and only fails on contradictions or
+        missing essential information.
+        """
+        return get_prompt_with_override("optimizer_testbed-judge")
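
As a sanity check on the contract between this prompt and CustomCorrectnessMetric, a small illustrative sketch of the two verdict shapes the judge is asked to emit, mirroring the boolean check the metric applies (the reason string is a made-up example):

import json

# The only two shapes the judge prompt allows.
correct_verdict = json.loads('{"correctness": true}')
incorrect_verdict = json.loads('{"correctness": false, "correctness_reason": "Essential detail missing."}')

for verdict in (correct_verdict, incorrect_verdict):
    # CustomCorrectnessMetric raises LLMGenerationError for non-boolean "correctness" values.
    assert isinstance(verdict["correctness"], bool)
    # correctness_reason is only kept when the answer is judged incorrect.
    if verdict["correctness"]:
        verdict.pop("correctness_reason", None)

print(correct_verdict)    # {'correctness': True}
print(incorrect_verdict)  # {'correctness': False, 'correctness_reason': 'Essential detail missing.'}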
