98 changes: 87 additions & 11 deletions src/aggregators/aggregate_results.py
@@ -17,6 +17,7 @@
import sys
sys.path.append(str(Path(__file__).parent.parent.parent))
from src.errors import is_retryable_error
from src.aggregators.pricing import compute_cost_usd


def discover_tasks() -> Dict[str, List[str]]:
@@ -280,6 +281,12 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
avg_pass1 = 0.0
std_pass1 = 0.0

# Compute per-run tokens and cost
per_run_input_tokens = total_input_tokens / runs_count if runs_count else 0
per_run_output_tokens = total_output_tokens / runs_count if runs_count else 0
model_for_pricing = actual_model_name or model
computed_per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens)

overall_metrics = {
"total_tasks": total_tasks,
"total_agent_execution_time": total_agent_execution_time,
@@ -292,9 +299,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
"avg_output_tokens": round(avg_output_tokens, 4),
"avg_total_tokens": round(avg_total_tokens, 4),
"avg_turns": round(avg_turns, 4),
"per_run_input_tokens": total_input_tokens / runs_count if runs_count else 0,
"per_run_output_tokens": total_output_tokens / runs_count if runs_count else 0,
"per_run_cost": per_run_cost if per_run_cost is not None else None,
"per_run_input_tokens": per_run_input_tokens,
"per_run_output_tokens": per_run_output_tokens,
"per_run_cost": computed_per_run_cost if computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
"actual_model_name": actual_model_name or "",
"pass@1": {
"avg": round(avg_pass1, 4),
@@ -386,6 +393,11 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
s_mean = 0.0
s_std = 0.0

# Compute per-run tokens and cost for this service
s_per_run_input_tokens = s_total_input_tokens / runs_count if runs_count else 0
s_per_run_output_tokens = s_total_output_tokens / runs_count if runs_count else 0
s_computed_per_run_cost = compute_cost_usd(model_for_pricing, s_per_run_input_tokens, s_per_run_output_tokens)

service_metrics = {
"total_tasks": service_total_tasks,
"total_agent_execution_time": s_total_agent_execution_time,
@@ -398,9 +410,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
"avg_output_tokens": round(s_avg_output_tokens, 4),
"avg_total_tokens": round(s_avg_total_tokens, 4),
"avg_turns": round(s_avg_turns, 4),
"per_run_input_tokens": s_total_input_tokens / runs_count if runs_count else 0,
"per_run_output_tokens": s_total_output_tokens / runs_count if runs_count else 0,
"per_run_cost": per_run_cost if per_run_cost is not None else None,
"per_run_input_tokens": s_per_run_input_tokens,
"per_run_output_tokens": s_per_run_output_tokens,
"per_run_cost": s_computed_per_run_cost if s_computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
"actual_model_name": actual_model_name or "",
"pass@1": {
"avg": round(s_mean, 4),
@@ -484,14 +496,67 @@ def generate_task_results(exp_dir: Path, complete_models: Dict, all_tasks: Dict)
for run_name, run_data in model_data[service].items():
if task in run_data:
meta = run_data[task]
agent_time = float(meta.get("agent_execution_time", 0.0) or 0.0)
token_usage = meta.get("token_usage", {}) or {}
turn_count = int(meta.get("turn_count", 0) or 0)
success = bool(meta.get("execution_result", {}).get("success", False))
model_task_data["runs"].append({
"run": run_name,
"success": meta.get("execution_result", {}).get("success", False),
"execution_time": meta.get("agent_execution_time", 0),
"token_usage": meta.get("token_usage", {})
"success": success,
"execution_time": agent_time,
"agent_execution_time": agent_time,
"token_usage": token_usage,
"turn_count": turn_count,
})

if model_task_data["runs"]:
# Compute per-model summary across runs for this task
runs_list = model_task_data["runs"]
runs_count = len(runs_list)
successful_runs = sum(1 for r in runs_list if r.get("success"))

# Averages
total_agent_time = sum(float(r.get("agent_execution_time", r.get("execution_time", 0.0)) or 0.0) for r in runs_list)
avg_agent_time = round(total_agent_time / runs_count, 2)

def _tok(r, key):
tu = r.get("token_usage") or {}
return int(tu.get(key, 0) or 0)

total_input_tokens = 0
total_output_tokens = 0
total_total_tokens = 0
for r in runs_list:
in_tok = _tok(r, "input_tokens")
out_tok = _tok(r, "output_tokens")
# Fall back to input + output when total_tokens is missing, None, or 0
ttl_tok = int((r.get("token_usage") or {}).get("total_tokens", in_tok + out_tok) or (in_tok + out_tok))
total_input_tokens += in_tok
total_output_tokens += out_tok
total_total_tokens += ttl_tok

avg_input_tokens = round(total_input_tokens / runs_count, 1)
avg_output_tokens = round(total_output_tokens / runs_count, 1)
avg_total_tokens = round(total_total_tokens / runs_count, 1)

total_turns = sum(int(r.get("turn_count", 0) or 0) for r in runs_list)
avg_turn_count = round(total_turns / runs_count, 2)

summary_obj = {
"total_runs": runs_count,
"successful_runs": successful_runs,
"avg_agent_execution_time": avg_agent_time,
"avg_input_tokens": avg_input_tokens,
"avg_output_tokens": avg_output_tokens,
"avg_total_tokens": avg_total_tokens,
"avg_turn_count": avg_turn_count,
}

# Include pass@k and pass^k only for multi-run models
if runs_count > 1:
summary_obj[f"pass@{runs_count}"] = 1.0 if successful_runs > 0 else 0.0
summary_obj[f"pass^{runs_count}"] = 1.0 if successful_runs == runs_count else 0.0

model_task_data["summary"] = summary_obj
task_data["models"][model] = model_task_data

# Save task file
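
To make the pass@k / pass^k convention concrete, here is a minimal sketch with fabricated run outcomes, mirroring the logic above: pass@k is 1.0 if at least one of the k runs succeeded, while pass^k requires all k runs to succeed.

```python
# Three hypothetical runs of a single task; two succeed.
runs_list = [{"success": True}, {"success": False}, {"success": True}]
runs_count = len(runs_list)                                      # 3
successful_runs = sum(1 for r in runs_list if r.get("success"))  # 2

pass_at_k = 1.0 if successful_runs > 0 else 0.0             # pass@3 = 1.0
pass_hat_k = 1.0 if successful_runs == runs_count else 0.0  # pass^3 = 0.0
```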
@@ -525,7 +590,9 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
if include_k:
header += f" Pass@{k} | Pass^{k} |"
sep += "----------|----------|"
- # Add Avg Turns and Avg Agent Time (s) at the end
+ # Add Per-Run Cost (USD) and Avg Agent Time (s) at the end
+ header += " Per-Run Cost (USD) |"
+ sep += "---------------------|"
header += " Avg Agent Time (s) |"
sep += "--------------------|"

@@ -542,6 +609,14 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
for model, metrics in sorted_items:
pass1_avg, pass1_std = get_pass1_avg_std(metrics)
avg_time = float(metrics.get("avg_agent_execution_time", 0.0) or 0.0)
# Format per-run cost (up to 2 decimal places, trim trailing zeros)
cost_val = metrics.get("per_run_cost")
if isinstance(cost_val, (int, float)):
rounded_cost = round(float(cost_val), 2)
formatted_cost = f"{rounded_cost:.2f}".rstrip('0').rstrip('.')
cost_str = f"${formatted_cost}"
else:
cost_str = "/"
row = (
f"| {model} | {metrics.get('total_tasks', 0)} | "
f"{pass1_avg * 100:.1f}% ± {pass1_std * 100:.1f}% |"
@@ -552,7 +627,8 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
else:
# Single-run models do not have pass@k or pass^k; show placeholders
row += " / | / |"
- # Append avg agent time at the end
+ # Append cost and avg agent time at the end
+ row += f" {cost_str} |"
row += f" {avg_time:.1f} |"
lines_sec.append(row)

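The cost formatter renders at most two decimal places and trims trailing zeros, falling back to "/" when pricing is unknown; a small self-contained sketch of the same rule:

```python
def fmt_cost(cost_val):
    # Non-numeric input (e.g. None when pricing is unknown) renders as "/".
    if not isinstance(cost_val, (int, float)):
        return "/"
    rounded = round(float(cost_val), 2)
    return "$" + f"{rounded:.2f}".rstrip("0").rstrip(".")

print(fmt_cost(4.50))  # $4.5
print(fmt_cost(12.0))  # $12
print(fmt_cost(None))  # /
```
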
79 changes: 79 additions & 0 deletions src/aggregators/pricing.py
@@ -0,0 +1,79 @@
"""
Pricing utilities for computing per-run cost from token usage.

All prices are specified per 1,000,000 tokens (M tokens) in USD.
"""

from __future__ import annotations

from typing import Dict, Optional


# Price map keyed by canonical model name (lowercased)
# Values are dicts with per-M token prices for input and output tokens
MODEL_PRICES_PER_M: Dict[str, Dict[str, float]] = {
# Use exact actual_model_name keys (lowercased) provided by the user
# Anthropic
"claude-opus-4-1-20250805": {"input": 15.0, "output": 75.0},
"claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0},

# DeepSeek
"deepseek-v3.1-non-think": {"input": 0.56, "output": 1.68},

# Google Gemini
"gemini-2.5-pro": {"input": 2.5, "output": 15.0},
"gemini-2.5-flash": {"input": 0.3, "output": 2.5},

# Z.AI
"glm-4.5": {"input": 0.33, "output": 1.32},

# OpenAI
"gpt-5-2025-08-07": {"input": 1.25, "output": 10.0},
"gpt-5-mini-2025-08-07": {"input": 0.25, "output": 2.0},
"gpt-5-nano-2025-08-07": {"input": 0.05, "output": 0.4},
"gpt-4.1-2025-04-14": {"input": 2.0, "output": 8.0},
"gpt-4.1-mini-2025-04-14": {"input": 0.4, "output": 1.6},
"gpt-4.1-nano-2025-04-14": {"input": 0.1, "output": 0.4},
"o3-2025-04-16": {"input": 2.0, "output": 8.0},
"o4-mini-2025-04-16": {"input": 1.1, "output": 4.4},
"gpt-oss-120b": {"input": 0.072, "output": 0.28},

# Qwen
"qwen3-coder-480b-a35b-instruct": {"input": 0.2, "output": 0.8},

# xAI
"grok-4-0709": {"input": 3.0, "output": 15.0},
"grok-code-fast-1": {"input": 0.2, "output": 1.5},

# Moonshot
"kimi-k2-0711-preview": {"input": 0.6, "output": 2.5},
}


def normalize_model_name(model_name: str) -> str:
"""Normalize model name for pricing lookup.

Strips surrounding whitespace and lowercases.
"""
return (model_name or "").strip().lower()


def get_price_per_m(model_name: str) -> Optional[Dict[str, float]]:
"""Return per-M token prices for given model, or None if unknown."""
key = normalize_model_name(model_name)
return MODEL_PRICES_PER_M.get(key)


def compute_cost_usd(model_name: str, input_tokens: float, output_tokens: float) -> Optional[float]:
"""Compute cost in USD given token usage and model pricing.

Prices are per 1,000,000 tokens. If pricing unknown, returns None.
"""
prices = get_price_per_m(model_name)
if not prices:
return None
input_cost = (input_tokens / 1_000_000.0) * prices["input"]
output_cost = (output_tokens / 1_000_000.0) * prices["output"]
return float(round(input_cost + output_cost, 6))
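
A quick usage sketch for the module (the token counts are illustrative, not from a real run):

```python
from src.aggregators.pricing import compute_cost_usd

# Known model: (1.0M * $3.00) + (0.2M * $15.00) = $6.00;
# the lookup is case-insensitive via normalize_model_name.
print(compute_cost_usd("Claude-Sonnet-4-20250514", 1_000_000, 200_000))  # 6.0

# Unlisted model: no pricing entry, so the function returns None
# and callers fall back to any recorded per-run cost.
print(compute_cost_usd("some-unlisted-model", 1_000_000, 200_000))  # None
```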