api/endpoints/evaluation_run.py (15 additions, 3 deletions)

@@ -6,13 +6,26 @@
 from queries.evaluation_run import (
     get_evaluation_run_by_id,
     get_evaluation_run_logs_by_id,
-    get_evaluation_run_metrics_by_id,
 )
 from utils.problem_alias import add_test_aliases, make_problem_alias

 router = APIRouter()


+def _temp_evaluation_run_metrics(run: EvaluationRun) -> dict:
+    run_time_seconds = None
+    if run.started_initializing_agent_at is not None and run.finished_or_errored_at is not None:
+        run_time_seconds = (run.finished_or_errored_at - run.started_initializing_agent_at).total_seconds()
+
+    return {
+        "run_time_seconds": run_time_seconds,
+        "run_cost_usd": None,
+        "problem_total_runs": 0,
+        "problem_average_time_seconds": None,
+        "problem_average_cost_usd": None,
+    }
+
+
 # /evaluation-run/get-by-id?evaluation_run_id=
 @router.get("/get-by-id")
 async def evaluation_run_get_by_id(evaluation_run_id: UUID) -> EvaluationRun:
@@ -21,7 +34,6 @@ async def evaluation_run_get_by_id(evaluation_run_id: UUID) -> EvaluationRun:
     if evaluation_run is None:
         raise HTTPException(status_code=404, detail=f"Evaluation run with ID {evaluation_run_id} does not exist.")

-    metrics = await get_evaluation_run_metrics_by_id(evaluation_run_id)
     alias = make_problem_alias(evaluation_run.problem_name, evaluation_run.benchmark_family)
     test_results = add_test_aliases(
         evaluation_run.test_results,
@@ -33,7 +45,7 @@ async def evaluation_run_get_by_id(evaluation_run_id: UUID) -> EvaluationRun:
         update={
             "problem_alias": alias,
             "test_results": test_results,
-            **(metrics or {}),
+            **_temp_evaluation_run_metrics(evaluation_run),
         }
     )
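Note: a minimal sketch of how the temporary helper behaves. The stub run object below is hypothetical and carries only the two timestamps the helper reads; values are illustrative.

from datetime import datetime, timedelta, timezone
from types import SimpleNamespace

# Hypothetical stand-in for an EvaluationRun (duck-typed; the real model is a Pydantic class).
started = datetime(2024, 1, 1, tzinfo=timezone.utc)
fake_run = SimpleNamespace(
    started_initializing_agent_at=started,
    finished_or_errored_at=started + timedelta(seconds=12.5),
)

metrics = _temp_evaluation_run_metrics(fake_run)
assert metrics["run_time_seconds"] == 12.5   # derived from the two timestamps
assert metrics["run_cost_usd"] is None       # cost fields are stubbed for now
assert metrics["problem_total_runs"] == 0    # aggregate fields default to placeholders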
api/endpoints/retrieval.py (17 additions, 4 deletions)

@@ -7,6 +7,7 @@

 from models.agent import Agent, AgentScored, AgentStatus, BenchmarkAgentScored, PossiblyBenchmarkAgent
 from models.evaluation import Evaluation, EvaluationWithRuns
+from models.evaluation_run import EvaluationRun
 from models.evaluation_set import EvaluationSetGroup
 from queries.agent import (
     get_agent_by_id,
@@ -18,7 +19,7 @@
     get_top_agents,
 )
 from queries.evaluation import get_evaluations_for_agent_id
-from queries.evaluation_run import get_all_evaluation_runs_in_evaluation_id, get_evaluation_run_metrics_by_ids
+from queries.evaluation_run import get_all_evaluation_runs_in_evaluation_id
 from queries.statistics import (
     PerfectlySolvedOverTime,
     ProblemSetCreationTime,
@@ -37,6 +38,20 @@
 router = APIRouter()


+def _temp_evaluation_run_metrics(run: EvaluationRun) -> dict:
+    run_time_seconds = None
+    if run.started_initializing_agent_at is not None and run.finished_or_errored_at is not None:
+        run_time_seconds = (run.finished_or_errored_at - run.started_initializing_agent_at).total_seconds()
+
+    return {
+        "run_time_seconds": run_time_seconds,
+        "run_cost_usd": None,
+        "problem_total_runs": 0,
+        "problem_average_time_seconds": None,
+        "problem_average_cost_usd": None,
+    }
+
+
 # /retrieval/queue?stage={screener_1|screener_2|validator}
 @router.get("/queue")
 @ttl_cache(ttl_seconds=60)  # 1 minute
@@ -96,8 +111,6 @@ async def evaluations_for_agent(agent_id: UUID) -> List[EvaluationWithRuns]:
     runs_per_eval = await asyncio.gather(
         *[get_all_evaluation_runs_in_evaluation_id(evaluation_id=e.evaluation_id) for e in evaluations]
     )
-    all_run_ids = [run.evaluation_run_id for runs in runs_per_eval for run in runs]
-    metrics_by_run_id = await get_evaluation_run_metrics_by_ids(all_run_ids)

     enriched_runs = [
         [
@@ -109,7 +122,7 @@ async def evaluations_for_agent(agent_id: UUID) -> List[EvaluationWithRuns]:
                         problem_name=run.problem_name,
                         benchmark_family=run.benchmark_family,
                     ),
-                    **metrics_by_run_id.get(run.evaluation_run_id, {}),
+                    **_temp_evaluation_run_metrics(run),
                 }
             )
             for run in runs
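For reference, the per-run enrichment relies on plain dict unpacking into the model update. A rough sketch of the pattern, assuming a Pydantic-style model_copy(update=...) call and the same make_problem_alias helper used in evaluation_run.py (both are assumptions; the enclosing call is hidden in the collapsed context):

# Hypothetical shape of the enrichment inside the list comprehension.
enriched = run.model_copy(
    update={
        "problem_alias": make_problem_alias(
            problem_name=run.problem_name,
            benchmark_family=run.benchmark_family,
        ),
        **_temp_evaluation_run_metrics(run),  # fills run_time_seconds plus stubbed cost/aggregate fields
    }
)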
tests/api/test_evaluation_run_aliases.py (9 additions, 9 deletions)

@@ -1,6 +1,6 @@
 from __future__ import annotations

-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
 from uuid import uuid4

 import pytest
@@ -15,6 +15,7 @@
 async def test_evaluation_run_get_by_id_adds_test_aliases(monkeypatch) -> None:
     monkeypatch.delenv("PROBLEM_ALIAS_SALT", raising=False)
     evaluation_run_id = uuid4()
+    started_at = datetime.now(timezone.utc)
     evaluation_run = EvaluationRun(
         evaluation_run_id=evaluation_run_id,
         evaluation_id=uuid4(),
@@ -29,24 +30,23 @@ async def test_evaluation_run_get_by_id_adds_test_aliases(monkeypatch) -> None:
             )
         ],
         created_at=datetime.now(timezone.utc),
+        started_initializing_agent_at=started_at,
+        finished_or_errored_at=started_at + timedelta(seconds=12.5),
     )

     async def fake_get_evaluation_run_by_id(_evaluation_run_id):
         return evaluation_run

-    async def fake_get_evaluation_run_metrics_by_id(_evaluation_run_id):
-        return {}
-
     monkeypatch.setattr(evaluation_run_endpoint, "get_evaluation_run_by_id", fake_get_evaluation_run_by_id)
-    monkeypatch.setattr(
-        evaluation_run_endpoint,
-        "get_evaluation_run_metrics_by_id",
-        fake_get_evaluation_run_metrics_by_id,
-    )

     response = await evaluation_run_endpoint.evaluation_run_get_by_id(evaluation_run_id)

     assert response.problem_alias == make_problem_alias("acronym-py", "polyglot_py")
+    assert response.run_time_seconds == 12.5
+    assert response.run_cost_usd is None
+    assert response.problem_total_runs == 0
+    assert response.problem_average_time_seconds is None
+    assert response.problem_average_cost_usd is None
     assert response.test_results is not None
     assert response.test_results[0].test_alias == make_test_alias(
         benchmark_family="polyglot_py",