From 1b82b713eb61fc7e536ede80243de8f1075a340e Mon Sep 17 00:00:00 2001 From: zjwu0522 Date: Fri, 5 Sep 2025 17:44:31 +0000 Subject: [PATCH 1/2] feat: support price cost caluating per model --- src/aggregators/aggregate_results.py | 83 +++++++++++++++++++++++++--- src/aggregators/pricing.py | 79 ++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 9 deletions(-) create mode 100644 src/aggregators/pricing.py diff --git a/src/aggregators/aggregate_results.py b/src/aggregators/aggregate_results.py index 72777565..d6e5be89 100755 --- a/src/aggregators/aggregate_results.py +++ b/src/aggregators/aggregate_results.py @@ -17,6 +17,7 @@ import sys sys.path.append(str(Path(__file__).parent.parent.parent)) from src.errors import is_retryable_error +from src.aggregators.pricing import compute_cost_usd def discover_tasks() -> Dict[str, List[str]]: @@ -280,6 +281,12 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]: avg_pass1 = 0.0 std_pass1 = 0.0 + # Compute per-run tokens and cost + per_run_input_tokens = total_input_tokens / runs_count if runs_count else 0 + per_run_output_tokens = total_output_tokens / runs_count if runs_count else 0 + model_for_pricing = actual_model_name or model + computed_per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens) + overall_metrics = { "total_tasks": total_tasks, "total_agent_execution_time": total_agent_execution_time, @@ -292,9 +299,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]: "avg_output_tokens": round(avg_output_tokens, 4), "avg_total_tokens": round(avg_total_tokens, 4), "avg_turns": round(avg_turns, 4), - "per_run_input_tokens": total_input_tokens / runs_count if runs_count else 0, - "per_run_output_tokens": total_output_tokens / runs_count if runs_count else 0, - "per_run_cost": per_run_cost if per_run_cost is not None else None, + "per_run_input_tokens": per_run_input_tokens, + "per_run_output_tokens": per_run_output_tokens, + "per_run_cost": computed_per_run_cost if computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None), "actual_model_name": actual_model_name or "", "pass@1": { "avg": round(avg_pass1, 4), @@ -386,6 +393,11 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]: s_mean = 0.0 s_std = 0.0 + # Compute per-run tokens and cost for this service + s_per_run_input_tokens = s_total_input_tokens / runs_count if runs_count else 0 + s_per_run_output_tokens = s_total_output_tokens / runs_count if runs_count else 0 + s_computed_per_run_cost = compute_cost_usd(model_for_pricing, s_per_run_input_tokens, s_per_run_output_tokens) + service_metrics = { "total_tasks": service_total_tasks, "total_agent_execution_time": s_total_agent_execution_time, @@ -398,9 +410,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]: "avg_output_tokens": round(s_avg_output_tokens, 4), "avg_total_tokens": round(s_avg_total_tokens, 4), "avg_turns": round(s_avg_turns, 4), - "per_run_input_tokens": s_total_input_tokens / runs_count if runs_count else 0, - "per_run_output_tokens": s_total_output_tokens / runs_count if runs_count else 0, - "per_run_cost": per_run_cost if per_run_cost is not None else None, + "per_run_input_tokens": s_per_run_input_tokens, + "per_run_output_tokens": s_per_run_output_tokens, + "per_run_cost": s_computed_per_run_cost if s_computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None), "actual_model_name": actual_model_name or "", "pass@1": { "avg": round(s_mean, 4), @@ -484,14 +496,67 @@ def generate_task_results(exp_dir: Path, complete_models: Dict, all_tasks: Dict) for run_name, run_data in model_data[service].items(): if task in run_data: meta = run_data[task] + agent_time = float(meta.get("agent_execution_time", 0.0) or 0.0) + token_usage = meta.get("token_usage", {}) or {} + turn_count = int(meta.get("turn_count", 0) or 0) + success = bool(meta.get("execution_result", {}).get("success", False)) model_task_data["runs"].append({ "run": run_name, - "success": meta.get("execution_result", {}).get("success", False), - "execution_time": meta.get("agent_execution_time", 0), - "token_usage": meta.get("token_usage", {}) + "success": success, + "execution_time": agent_time, + "agent_execution_time": agent_time, + "token_usage": token_usage, + "turn_count": turn_count, }) if model_task_data["runs"]: + # Compute per-model summary across runs for this task + runs_list = model_task_data["runs"] + runs_count = len(runs_list) + successful_runs = sum(1 for r in runs_list if r.get("success")) + + # Averages + total_agent_time = sum(float(r.get("agent_execution_time", r.get("execution_time", 0.0)) or 0.0) for r in runs_list) + avg_agent_time = round(total_agent_time / runs_count, 2) + + def _tok(r, key): + tu = r.get("token_usage") or {} + return int(tu.get(key, 0) or 0) + + total_input_tokens = 0 + total_output_tokens = 0 + total_total_tokens = 0 + for r in runs_list: + in_tok = _tok(r, "input_tokens") + out_tok = _tok(r, "output_tokens") + ttl_tok = int((r.get("token_usage") or {}).get("total_tokens", in_tok + out_tok) or (in_tok + out_tok)) + total_input_tokens += in_tok + total_output_tokens += out_tok + total_total_tokens += ttl_tok + + avg_input_tokens = round(total_input_tokens / runs_count, 1) + avg_output_tokens = round(total_output_tokens / runs_count, 1) + avg_total_tokens = round(total_total_tokens / runs_count, 1) + + total_turns = sum(int(r.get("turn_count", 0) or 0) for r in runs_list) + avg_turn_count = round(total_turns / runs_count, 2) + + summary_obj = { + "total_runs": runs_count, + "successful_runs": successful_runs, + "avg_agent_execution_time": avg_agent_time, + "avg_input_tokens": avg_input_tokens, + "avg_output_tokens": avg_output_tokens, + "avg_total_tokens": avg_total_tokens, + "avg_turn_count": avg_turn_count, + } + + # Include pass@k and pass^k only for multi-run models + if runs_count > 1: + summary_obj[f"pass@{runs_count}"] = 1.0 if successful_runs > 0 else 0.0 + summary_obj[f"pass^{runs_count}"] = 1.0 if successful_runs == runs_count else 0.0 + + model_task_data["summary"] = summary_obj task_data["models"][model] = model_task_data # Save task file diff --git a/src/aggregators/pricing.py b/src/aggregators/pricing.py new file mode 100644 index 00000000..343f03af --- /dev/null +++ b/src/aggregators/pricing.py @@ -0,0 +1,79 @@ +""" +Pricing utilities for computing per-run cost from token usage. + +All prices are specified per 1,000,000 tokens (M tokens) in USD. +""" + +from __future__ import annotations + +from typing import Dict, Optional + + +# Price map keyed by canonical model name (lowercased) +# Values are dicts with per-M token prices for input and output tokens +MODEL_PRICES_PER_M: Dict[str, Dict[str, float]] = { + # Use exact actual_model_name keys (lowercased) provided by the user + # Anthropic + "claude-opus-4-1-20250805": {"input": 15.0, "output": 75.0}, + "claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0}, + + # DeepSeek + "deepseek-v3.1-non-think": {"input": 0.56, "output": 1.68}, + + # Google Gemini + "gemini-2.5-pro": {"input": 2.5, "output": 15.0}, + "gemini-2.5-flash": {"input": 0.3, "output": 2.5}, + + # Z.AI + "glm-4.5": {"input": 0.33, "output": 1.32}, + + # OpenAI + "gpt-5-2025-08-07": {"input": 1.25, "output": 10.0}, + "gpt-5-mini-2025-08-07": {"input": 0.25, "output": 2.0}, + "gpt-5-nano-2025-08-07": {"input": 0.05, "output": 0.4}, + "gpt-4.1-2025-04-14": {"input": 2.0, "output": 8.0}, + "gpt-4.1-mini-2025-04-14": {"input": 0.4, "output": 1.6}, + "gpt-4.1-nano-2025-04-14": {"input": 0.1, "output": 0.4}, + "o3-2025-04-16": {"input": 2.0, "output": 8.0}, + "o4-mini-2025-04-16": {"input": 1.1, "output": 4.4}, + "gpt-oss-120b": {"input": 0.072, "output": 0.28}, + + # Qwen + "qwen3-coder-480b-a35b-instruct": {"input": 0.2, "output": 0.8}, + + # Xai + "grok-4-0709": {"input": 3.0, "output": 15.0}, + "grok-code-fast-1": {"input": 0.2, "output": 1.5}, + + # Moonshot + "kimi-k2-0711-preview": {"input": 0.6, "output": 2.5}, +} + + +def normalize_model_name(model_name: str) -> str: + """Normalize model name for pricing lookup. + + Lowercases only. + """ + return (model_name or "").strip().lower() + + +def get_price_per_m(model_name: str) -> Optional[Dict[str, float]]: + """Return per-M token prices for given model, or None if unknown.""" + key = normalize_model_name(model_name) + return MODEL_PRICES_PER_M.get(key) + + +def compute_cost_usd(model_name: str, input_tokens: float, output_tokens: float) -> Optional[float]: + """Compute cost in USD given token usage and model pricing. + + Prices are per 1,000,000 tokens. If pricing unknown, returns None. + """ + prices = get_price_per_m(model_name) + if not prices: + return None + input_cost = (input_tokens / 1_000_000.0) * prices["input"] + output_cost = (output_tokens / 1_000_000.0) * prices["output"] + return float(round(input_cost + output_cost, 6)) + + From fc77c79a494b7f645efbc970745de3f6a6dda822 Mon Sep 17 00:00:00 2001 From: zjwu0522 Date: Fri, 5 Sep 2025 18:21:37 +0000 Subject: [PATCH 2/2] minor --- src/aggregators/aggregate_results.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/aggregators/aggregate_results.py b/src/aggregators/aggregate_results.py index d6e5be89..3e28b4cc 100755 --- a/src/aggregators/aggregate_results.py +++ b/src/aggregators/aggregate_results.py @@ -590,7 +590,9 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]: if include_k: header += f" Pass@{k} | Pass^{k} |" sep += "----------|----------|" - # Add Avg Turns and Avg Agent Time (s) at the end + # Add Per-Run Cost (USD) and Avg Agent Time (s) at the end + header += " Per-Run Cost (USD) |" + sep += "---------------------|" header += " Avg Agent Time (s) |" sep += "--------------------|" @@ -607,6 +609,14 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]: for model, metrics in sorted_items: pass1_avg, pass1_std = get_pass1_avg_std(metrics) avg_time = float(metrics.get("avg_agent_execution_time", 0.0) or 0.0) + # Format per-run cost (up to 2 decimal places, trim trailing zeros) + cost_val = metrics.get("per_run_cost") + if isinstance(cost_val, (int, float)): + rounded_cost = round(float(cost_val), 2) + formatted_cost = f"{rounded_cost:.2f}".rstrip('0').rstrip('.') + cost_str = f"${formatted_cost}" + else: + cost_str = "/" row = ( f"| {model} | {metrics.get('total_tasks', 0)} | " f"{pass1_avg * 100:.1f}% ± {pass1_std * 100:.1f}% |" @@ -617,7 +627,8 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]: else: # Single-run models do not have pass@k or pass^k; show placeholders row += " / | / |" - # Append avg agent time at the end + # Append cost and avg agent time at the end + row += f" {cost_str} |" row += f" {avg_time:.1f} |" lines_sec.append(row)