Skip to content

feat: Implement GcsEvalSetResultsManager to handle storage of eval sets on GCS, and refactor eval set results manager #1200

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions src/google/adk/evaluation/_eval_set_results_manager_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import time

from .eval_result import EvalCaseResult
from .eval_result import EvalSetResult


def _sanitize_eval_set_result_name(eval_set_result_name: str) -> str:
  """Returns the given name with every "/" replaced by "_".

  Args:
    eval_set_result_name: The raw eval set result name.

  Returns:
    A copy of the name in which each forward slash is an underscore.
  """
  # Splitting on "/" and re-joining with "_" swaps every slash in one pass.
  return "_".join(eval_set_result_name.split("/"))


def create_eval_set_result(
    app_name: str,
    eval_set_id: str,
    eval_case_results: list[EvalCaseResult],
) -> EvalSetResult:
  """Builds a new EvalSetResult wrapping the given eval case results.

  The result id has the form `{app_name}_{eval_set_id}_{timestamp}`, where the
  timestamp is the current `time.time()` value. The result name is that same
  id passed through `_sanitize_eval_set_result_name`.

  Args:
    app_name: The name of the app the results belong to.
    eval_set_id: The id of the eval set that was evaluated.
    eval_case_results: The per-case results to store in the new object.

  Returns:
    The newly constructed EvalSetResult.
  """
  # One clock reading feeds both the id and the creation timestamp so the two
  # always agree.
  created_at = time.time()
  result_id = f"{app_name}_{eval_set_id}_{created_at}"
  return EvalSetResult(
      eval_set_result_id=result_id,
      eval_set_result_name=_sanitize_eval_set_result_name(result_id),
      eval_set_id=eval_set_id,
      eval_case_results=eval_case_results,
      creation_timestamp=created_at,
  )
108 changes: 108 additions & 0 deletions src/google/adk/evaluation/_eval_sets_manager_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging
from typing import Optional

from ..errors.not_found_error import NotFoundError
from .eval_case import EvalCase
from .eval_set import EvalSet
from .eval_sets_manager import EvalSetsManager

logger = logging.getLogger("google_adk." + __name__)


def get_eval_set_from_app_and_id(
    eval_sets_manager: EvalSetsManager, app_name: str, eval_set_id: str
) -> EvalSet:
  """Fetches an eval set through the manager, raising if it is missing.

  Args:
    eval_sets_manager: The manager whose `get_eval_set` is queried.
    app_name: The name of the app that owns the eval set.
    eval_set_id: The id of the eval set to fetch.

  Returns:
    The value returned by `eval_sets_manager.get_eval_set`.

  Raises:
    NotFoundError: If the manager returns a falsy value for the lookup.
  """
  found = eval_sets_manager.get_eval_set(app_name, eval_set_id)
  if found:
    return found
  raise NotFoundError(f"Eval set `{eval_set_id}` not found.")


def get_eval_case_from_eval_set(
    eval_set: EvalSet, eval_case_id: str
) -> Optional[EvalCase]:
  """Looks up an eval case in an eval set by its id.

  Args:
    eval_set: The eval set whose `eval_cases` are searched.
    eval_case_id: The `eval_id` to look for.

  Returns:
    The first eval case whose `eval_id` equals `eval_case_id`, or None when no
    case matches.
  """
  # The generator is lazy, so the scan stops at the first match.
  matching_cases = (
      candidate
      for candidate in eval_set.eval_cases
      if candidate.eval_id == eval_case_id
  )
  return next(matching_cases, None)


def add_eval_case_to_eval_set(
    eval_set: EvalSet, eval_case: EvalCase
) -> EvalSet:
  """Appends an eval case to an eval set, rejecting duplicate ids.

  The eval set is modified in place; the same object is returned.

  Args:
    eval_set: The eval set to extend.
    eval_case: The eval case to append.

  Returns:
    The same `eval_set`, now containing `eval_case` as its last case.

  Raises:
    ValueError: If a case with the same `eval_id` is already in the eval set.
  """
  new_id = eval_case.eval_id
  id_already_used = any(
      existing.eval_id == new_id for existing in eval_set.eval_cases
  )

  if id_already_used:
    raise ValueError(
        f"Eval id `{new_id}` already exists in `{eval_set.eval_set_id}`"
        " eval set.",
    )

  eval_set.eval_cases.append(eval_case)
  return eval_set


def update_eval_case_in_eval_set(
    eval_set: EvalSet, updated_eval_case: EvalCase
) -> EvalSet:
  """Replaces the eval case that shares an id with `updated_eval_case`.

  The eval set is modified in place. The existing case is removed and the
  updated case is appended, so the updated case ends up last in `eval_cases`.

  Args:
    eval_set: The eval set holding the case to replace.
    updated_eval_case: The new version of the case; its `eval_id` selects the
      case to replace.

  Returns:
    The same `eval_set`, with the case replaced.

  Raises:
    NotFoundError: If no case with that `eval_id` is found in the eval set.
  """
  target_id = updated_eval_case.eval_id
  existing_case = get_eval_case_from_eval_set(eval_set, target_id)

  if existing_case:
    # Swap the old case out and put the new one at the end of the list.
    cases = eval_set.eval_cases
    cases.remove(existing_case)
    cases.append(updated_eval_case)
    return eval_set

  raise NotFoundError(
      f"Eval case `{target_id}` not found in eval set"
      f" `{eval_set.eval_set_id}`."
  )


def delete_eval_case_from_eval_set(
    eval_set: EvalSet, eval_case_id: str
) -> EvalSet:
  """Removes the eval case with the given id from an eval set.

  The eval set is modified in place; an info-level log line is emitted before
  the case is removed.

  Args:
    eval_set: The eval set to remove the case from.
    eval_case_id: The `eval_id` of the case to remove.

  Returns:
    The same `eval_set`, without the removed case.

  Raises:
    NotFoundError: If no case with that `eval_id` is found in the eval set.
  """
  doomed_case = get_eval_case_from_eval_set(eval_set, eval_case_id)

  if doomed_case:
    logger.info(
        "EvalCase`%s` was found in the eval set. It will be removed permanently.",
        eval_case_id,
    )
    eval_set.eval_cases.remove(doomed_case)
    return eval_set

  raise NotFoundError(
      f"Eval case `{eval_case_id}` not found in eval set"
      f" `{eval_set.eval_set_id}`."
  )
19 changes: 12 additions & 7 deletions src/google/adk/evaluation/eval_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@ class EvalCaseResult(BaseModel):
populate_by_name=True,
)

eval_set_file: str = Field(
eval_set_file: Optional[str] = Field(
deprecated=True,
default=None,
description="This field is deprecated, use eval_set_id instead.",
)
eval_set_id: str = ""
Expand All @@ -49,11 +50,15 @@ class EvalCaseResult(BaseModel):
final_eval_status: EvalStatus
"""Final eval status for this eval case."""

eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]] = Field(
deprecated=True,
description=(
"This field is deprecated, use overall_eval_metric_results instead."
),
eval_metric_results: Optional[list[tuple[EvalMetric, EvalMetricResult]]] = (
Field(
deprecated=True,
default=None,
description=(
"This field is deprecated, use overall_eval_metric_results"
" instead."
),
)
)

overall_eval_metric_results: list[EvalMetricResult]
Expand All @@ -80,7 +85,7 @@ class EvalSetResult(BaseModel):
populate_by_name=True,
)
eval_set_result_id: str
eval_set_result_name: str
eval_set_result_name: Optional[str] = None
eval_set_id: str
eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
creation_timestamp: float = 0.0
1 change: 1 addition & 0 deletions src/google/adk/evaluation/eval_set_results_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from abc import ABC
from abc import abstractmethod
import time

from .eval_result import EvalCaseResult
from .eval_result import EvalSetResult
Expand Down
3 changes: 3 additions & 0 deletions src/google/adk/evaluation/eval_sets_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@

from abc import ABC
from abc import abstractmethod
import logging
from typing import Optional

from ..errors.not_found_error import NotFoundError
from .eval_case import EvalCase
from .eval_set import EvalSet

logger = logging.getLogger("google_adk." + __name__)


class EvalSetsManager(ABC):
"""An interface to manage an Eval Sets."""
Expand Down
121 changes: 121 additions & 0 deletions src/google/adk/evaluation/gcs_eval_set_results_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging

from google.cloud import exceptions as cloud_exceptions
from google.cloud import storage
from typing_extensions import override

from ._eval_set_results_manager_utils import create_eval_set_result
from .eval_result import EvalCaseResult
from .eval_result import EvalSetResult
from .eval_set_results_manager import EvalSetResultsManager

logger = logging.getLogger("google_adk." + __name__)

_EVAL_HISTORY_DIR = "evals/eval_history"
_EVAL_SET_RESULT_FILE_EXTENSION = ".evalset_result.json"


class GcsEvalSetResultsManager(EvalSetResultsManager):
  """An EvalSetResultsManager that stores eval results in a GCS bucket.

  Each eval set result is written as a JSON blob named
  `{app_name}/{_EVAL_HISTORY_DIR}/{eval_set_result_id}` followed by
  `_EVAL_SET_RESULT_FILE_EXTENSION`.
  """

  def __init__(self, bucket_name: str, **kwargs):
    """Initializes the GcsEvalSetResultsManager.

    Args:
      bucket_name: The name of the bucket to use.
      **kwargs: Keyword arguments to pass to the Google Cloud Storage client.

    Raises:
      ValueError: If the bucket does not exist.
    """
    self.bucket_name = bucket_name
    self.storage_client = storage.Client(**kwargs)
    self.bucket = self.storage_client.bucket(self.bucket_name)
    # Check if the bucket exists.
    if not self.bucket.exists():
      raise ValueError(
          f"Bucket `{self.bucket_name}` does not exist. Please create it before"
          " using the GcsEvalSetResultsManager."
      )

  def _get_eval_history_dir(self, app_name: str) -> str:
    """Returns the blob-name prefix (no trailing slash) for an app's results."""
    return f"{app_name}/{_EVAL_HISTORY_DIR}"

  def _get_eval_set_result_blob_name(
      self, app_name: str, eval_set_result_id: str
  ) -> str:
    """Returns the full blob name under which an eval set result is stored."""
    eval_history_dir = self._get_eval_history_dir(app_name)
    return f"{eval_history_dir}/{eval_set_result_id}{_EVAL_SET_RESULT_FILE_EXTENSION}"

  def _write_eval_set_result(
      self, blob_name: str, eval_set_result: EvalSetResult
  ) -> None:
    """Writes an EvalSetResult to GCS as indented JSON."""
    blob = self.bucket.blob(blob_name)
    blob.upload_from_string(
        eval_set_result.model_dump_json(indent=2),
        content_type="application/json",
    )

  @override
  def save_eval_set_result(
      self,
      app_name: str,
      eval_set_id: str,
      eval_case_results: list[EvalCaseResult],
  ) -> None:
    """Creates and saves a new EvalSetResult given eval_case_results.

    Args:
      app_name: The name of the app the results belong to.
      eval_set_id: The id of the eval set that was evaluated.
      eval_case_results: The per-case results to persist.
    """
    eval_set_result = create_eval_set_result(
        app_name, eval_set_id, eval_case_results
    )

    eval_set_result_blob_name = self._get_eval_set_result_blob_name(
        app_name, eval_set_result.eval_set_result_id
    )
    logger.info("Writing eval result to blob: %s", eval_set_result_blob_name)
    self._write_eval_set_result(eval_set_result_blob_name, eval_set_result)

  @override
  def get_eval_set_result(
      self, app_name: str, eval_set_result_id: str
  ) -> EvalSetResult | None:
    """Returns an EvalSetResult from app_name and eval_set_result_id.

    Args:
      app_name: The name of the app the result belongs to.
      eval_set_result_id: The id of the eval set result to load.

    Returns:
      The parsed EvalSetResult, or None if the download raises a GCS NotFound
      error.
    """
    eval_set_result_blob_name = self._get_eval_set_result_blob_name(
        app_name, eval_set_result_id
    )
    blob = self.bucket.blob(eval_set_result_blob_name)
    # Only the download is guarded: it is the call expected to raise NotFound.
    try:
      eval_set_result_data = blob.download_as_text()
    except cloud_exceptions.NotFound:
      return None
    return EvalSetResult.model_validate_json(eval_set_result_data)

  @override
  def list_eval_set_results(self, app_name: str) -> list[str]:
    """Returns the eval result ids that belong to the given app_name.

    Args:
      app_name: The name of the app whose results are listed.

    Returns:
      The sorted ids of all eval set result blobs stored for the app.

    Raises:
      ValueError: If listing the blobs raises a GCS NotFound error.
    """
    # The trailing "/" keeps the prefix from also matching sibling prefixes
    # that merely start with the same characters (e.g. `eval_history_old/`).
    blob_prefix = f"{self._get_eval_history_dir(app_name)}/"
    eval_set_result_ids = []
    try:
      for blob in self.bucket.list_blobs(prefix=blob_prefix):
        file_name = blob.name.rsplit("/", 1)[-1]
        # Skip anything that is not an eval set result file (for example a
        # folder placeholder object), which would otherwise yield a bogus id.
        if not file_name.endswith(_EVAL_SET_RESULT_FILE_EXTENSION):
          continue
        eval_set_result_ids.append(
            file_name.removesuffix(_EVAL_SET_RESULT_FILE_EXTENSION)
        )
    except cloud_exceptions.NotFound as e:
      raise ValueError(
          f"App `{app_name}` not found in GCS bucket `{self.bucket_name}`."
      ) from e
    return sorted(eval_set_result_ids)
Loading