Skip to content

feat: Implement GcsEvalSetResultsManager to handle storage of eval sets on GCS, and refactor eval set results manager #1200

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions src/google/adk/evaluation/_eval_set_results_manager_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import time

from .eval_result import EvalCaseResult
from .eval_result import EvalSetResult


def _sanitize_eval_set_result_name(eval_set_result_name: str) -> str:
"""Sanitizes the eval set result name."""
return eval_set_result_name.replace("/", "_")


def create_eval_set_result(
    app_name: str,
    eval_set_id: str,
    eval_case_results: list[EvalCaseResult],
) -> EvalSetResult:
  """Builds a fresh EvalSetResult wrapping the given eval case results.

  The result id is `{app_name}_{eval_set_id}_{timestamp}`, where the timestamp
  is the current `time.time()` value; the result name is that id with "/"
  replaced by "_".

  Args:
    app_name: The app name, used as the first component of the result id.
    eval_set_id: The id of the eval set the results belong to.
    eval_case_results: The per-case results to store on the EvalSetResult.

  Returns:
    The newly constructed EvalSetResult, stamped with the creation time.
  """
  created_at = time.time()
  # The same timestamp value is used for both the id and creation_timestamp.
  result_id = "_".join((app_name, eval_set_id, str(created_at)))
  return EvalSetResult(
      eval_set_result_id=result_id,
      eval_set_result_name=_sanitize_eval_set_result_name(result_id),
      eval_set_id=eval_set_id,
      eval_case_results=eval_case_results,
      creation_timestamp=created_at,
  )
19 changes: 12 additions & 7 deletions src/google/adk/evaluation/eval_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@ class EvalCaseResult(BaseModel):
populate_by_name=True,
)

eval_set_file: str = Field(
eval_set_file: Optional[str] = Field(
deprecated=True,
default=None,
description="This field is deprecated, use eval_set_id instead.",
)
eval_set_id: str = ""
Expand All @@ -49,11 +50,15 @@ class EvalCaseResult(BaseModel):
final_eval_status: EvalStatus
"""Final eval status for this eval case."""

eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]] = Field(
deprecated=True,
description=(
"This field is deprecated, use overall_eval_metric_results instead."
),
eval_metric_results: Optional[list[tuple[EvalMetric, EvalMetricResult]]] = (
Field(
deprecated=True,
default=None,
description=(
"This field is deprecated, use overall_eval_metric_results"
" instead."
),
)
)

overall_eval_metric_results: list[EvalMetricResult]
Expand All @@ -80,7 +85,7 @@ class EvalSetResult(BaseModel):
populate_by_name=True,
)
eval_set_result_id: str
eval_set_result_name: str
eval_set_result_name: Optional[str] = None
eval_set_id: str
eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
creation_timestamp: float = 0.0
7 changes: 6 additions & 1 deletion src/google/adk/evaluation/eval_set_results_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from abc import ABC
from abc import abstractmethod
from typing import Optional

from .eval_result import EvalCaseResult
from .eval_result import EvalSetResult
Expand All @@ -38,7 +39,11 @@ def save_eval_set_result(
def get_eval_set_result(
self, app_name: str, eval_set_result_id: str
) -> EvalSetResult:
"""Returns an EvalSetResult identified by app_name and eval_set_result_id."""
"""Returns the EvalSetResult from app_name and eval_set_result_id.

Raises:
NotFoundError: If the EvalSetResult is not found.
"""
raise NotImplementedError()

@abstractmethod
Expand Down
121 changes: 121 additions & 0 deletions src/google/adk/evaluation/gcs_eval_set_results_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging

from google.cloud import exceptions as cloud_exceptions
from google.cloud import storage
from typing_extensions import override

from ..errors.not_found_error import NotFoundError
from ._eval_set_results_manager_utils import create_eval_set_result
from .eval_result import EvalCaseResult
from .eval_result import EvalSetResult
from .eval_set_results_manager import EvalSetResultsManager

# Module logger, namespaced under the "google_adk." prefix.
logger = logging.getLogger("google_adk." + __name__)

# Path segment appended to the app name to form the blob-name prefix under
# which eval set results are stored.
_EVAL_HISTORY_DIR = "evals/eval_history"
# Suffix appended to an eval set result id to form its blob name.
_EVAL_SET_RESULT_FILE_EXTENSION = ".evalset_result.json"


class GcsEvalSetResultsManager(EvalSetResultsManager):
  """An EvalSetResultsManager that stores eval results in a GCS bucket.

  Each result is stored as a JSON blob named
  `{app_name}/evals/eval_history/{eval_set_result_id}.evalset_result.json`.
  """

  def __init__(self, bucket_name: str, **kwargs):
    """Initializes the GcsEvalSetResultsManager.

    Args:
      bucket_name: The name of the bucket to use.
      **kwargs: Keyword arguments to pass to the Google Cloud Storage client.

    Raises:
      ValueError: If the bucket does not exist.
    """
    self.bucket_name = bucket_name
    self.storage_client = storage.Client(**kwargs)
    self.bucket = self.storage_client.bucket(self.bucket_name)
    # Fail fast: every other method assumes the bucket is already there.
    if not self.bucket.exists():
      raise ValueError(
          f"Bucket `{self.bucket_name}` does not exist. Please create it before"
          " using the GcsEvalSetResultsManager."
      )

  def _get_eval_history_dir(self, app_name: str) -> str:
    """Returns the blob-name prefix (no trailing slash) for an app's results."""
    return f"{app_name}/{_EVAL_HISTORY_DIR}"

  def _get_eval_set_result_blob_name(
      self, app_name: str, eval_set_result_id: str
  ) -> str:
    """Returns the full blob name for the given app and eval set result id."""
    eval_history_dir = self._get_eval_history_dir(app_name)
    return (
        f"{eval_history_dir}/{eval_set_result_id}"
        f"{_EVAL_SET_RESULT_FILE_EXTENSION}"
    )

  def _write_eval_set_result(
      self, blob_name: str, eval_set_result: EvalSetResult
  ):
    """Writes an EvalSetResult to GCS as indented JSON."""
    blob = self.bucket.blob(blob_name)
    blob.upload_from_string(
        eval_set_result.model_dump_json(indent=2),
        content_type="application/json",
    )

  @override
  def save_eval_set_result(
      self,
      app_name: str,
      eval_set_id: str,
      eval_case_results: list[EvalCaseResult],
  ) -> None:
    """Creates and saves a new EvalSetResult given eval_case_results.

    Args:
      app_name: The app the results belong to; first component of the blob
        name.
      eval_set_id: The id of the eval set that was evaluated.
      eval_case_results: The per-case results to persist.
    """
    eval_set_result = create_eval_set_result(
        app_name, eval_set_id, eval_case_results
    )

    eval_set_result_blob_name = self._get_eval_set_result_blob_name(
        app_name, eval_set_result.eval_set_result_id
    )
    logger.info("Writing eval result to blob: %s", eval_set_result_blob_name)
    self._write_eval_set_result(eval_set_result_blob_name, eval_set_result)

  @override
  def get_eval_set_result(
      self, app_name: str, eval_set_result_id: str
  ) -> EvalSetResult:
    """Returns an EvalSetResult from app_name and eval_set_result_id.

    Raises:
      NotFoundError: If no blob exists for the given eval_set_result_id.
    """
    eval_set_result_blob_name = self._get_eval_set_result_blob_name(
        app_name, eval_set_result_id
    )
    blob = self.bucket.blob(eval_set_result_blob_name)
    if not blob.exists():
      raise NotFoundError(f"Eval set result `{eval_set_result_id}` not found.")
    eval_set_result_data = blob.download_as_text()
    return EvalSetResult.model_validate_json(eval_set_result_data)

  @override
  def list_eval_set_results(self, app_name: str) -> list[str]:
    """Returns the eval result ids that belong to the given app_name.

    Only blobs directly under the app's eval history prefix whose names end in
    the eval-set-result extension are reported, so folder placeholder objects
    and unrelated files are ignored.

    Returns:
      The eval set result ids, sorted lexicographically.

    Raises:
      ValueError: If GCS reports NotFound while listing.
    """
    # The trailing "/" keeps sibling prefixes (e.g. ".../eval_history_old/")
    # from matching this app's history directory.
    prefix = self._get_eval_history_dir(app_name) + "/"
    eval_set_result_ids = []
    try:
      for blob in self.bucket.list_blobs(prefix=prefix):
        if not blob.name.endswith(_EVAL_SET_RESULT_FILE_EXTENSION):
          continue
        # Strip the prefix rather than taking the last path segment, so an id
        # that itself contains "/" is returned whole and can be passed back
        # to get_eval_set_result.
        eval_set_result_ids.append(
            blob.name[len(prefix) :].removesuffix(
                _EVAL_SET_RESULT_FILE_EXTENSION
            )
        )
      return sorted(eval_set_result_ids)
    except cloud_exceptions.NotFound as e:
      raise ValueError(
          f"App `{app_name}` not found in GCS bucket `{self.bucket_name}`."
      ) from e
24 changes: 6 additions & 18 deletions src/google/adk/evaluation/local_eval_set_results_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@
import json
import logging
import os
import time

from typing_extensions import override

from ..errors.not_found_error import NotFoundError
from ._eval_set_results_manager_utils import create_eval_set_result
from .eval_result import EvalCaseResult
from .eval_result import EvalSetResult
from .eval_set_results_manager import EvalSetResultsManager
Expand All @@ -31,10 +32,6 @@
_EVAL_SET_RESULT_FILE_EXTENSION = ".evalset_result.json"


def _sanitize_eval_set_result_name(eval_set_result_name: str) -> str:
return eval_set_result_name.replace("/", "_")


class LocalEvalSetResultsManager(EvalSetResultsManager):
"""An EvalSetResult manager that stores eval set results locally on disk."""

Expand All @@ -49,15 +46,8 @@ def save_eval_set_result(
eval_case_results: list[EvalCaseResult],
) -> None:
"""Creates and saves a new EvalSetResult given eval_case_results."""
timestamp = time.time()
eval_set_result_id = app_name + "_" + eval_set_id + "_" + str(timestamp)
eval_set_result_name = _sanitize_eval_set_result_name(eval_set_result_id)
eval_set_result = EvalSetResult(
eval_set_result_id=eval_set_result_id,
eval_set_result_name=eval_set_result_name,
eval_set_id=eval_set_id,
eval_case_results=eval_case_results,
creation_timestamp=timestamp,
eval_set_result = create_eval_set_result(
app_name, eval_set_id, eval_case_results
)
# Write eval result file, with eval_set_result_name.
app_eval_history_dir = self._get_eval_history_dir(app_name)
Expand All @@ -67,7 +57,7 @@ def save_eval_set_result(
eval_set_result_json = eval_set_result.model_dump_json()
eval_set_result_file_path = os.path.join(
app_eval_history_dir,
eval_set_result_name + _EVAL_SET_RESULT_FILE_EXTENSION,
eval_set_result.eval_set_result_name + _EVAL_SET_RESULT_FILE_EXTENSION,
)
logger.info("Writing eval result to file: %s", eval_set_result_file_path)
with open(eval_set_result_file_path, "w") as f:
Expand All @@ -87,9 +77,7 @@ def get_eval_set_result(
+ _EVAL_SET_RESULT_FILE_EXTENSION
)
if not os.path.exists(maybe_eval_result_file_path):
raise ValueError(
f"Eval set result `{eval_set_result_id}` does not exist."
)
raise NotFoundError(f"Eval set result `{eval_set_result_id}` not found.")
with open(maybe_eval_result_file_path, "r") as file:
eval_result_data = json.load(file)
return EvalSetResult.model_validate_json(eval_result_data)
Expand Down
Loading