From f848d0e139f22fe6f02bbe22c0ddcf86200a7741 Mon Sep 17 00:00:00 2001 From: Eric Campbell Date: Thu, 19 Mar 2026 14:55:08 -0500 Subject: [PATCH 1/7] New model configs + Makefile/score_table updates --- .gitignore | 7 +- Makefile | 22 +- cli/score_table.py | 131 ++++++ .../adapters/fireworks.py | 83 +++- src/arc_agi_benchmarking/models.yml | 410 +++++++++++++++++- 5 files changed, 647 insertions(+), 6 deletions(-) create mode 100644 cli/score_table.py diff --git a/.gitignore b/.gitignore index 3cea7601..7a8b7cb4 100644 --- a/.gitignore +++ b/.gitignore @@ -182,4 +182,9 @@ analyze_submission_times.py /logs/* /test_logs/* .codex_* -dummy_submissions \ No newline at end of file +dummy_submissions + +data/semiprivate-v1 +data/semiprivate-v2 +data/public-v1 +data/public-v2 \ No newline at end of file diff --git a/Makefile b/Makefile index 1a1d9898..8a7eec95 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help install test test-verbose run-sample run-batch clean score +.PHONY: help install test test-verbose run-sample run-batch run-benchmark clean score upload help: @echo "Available commands:" @@ -7,7 +7,9 @@ help: @echo " make test-verbose - Run tests with verbose output" @echo " make run-sample - Run random baseline on sample task" @echo " make run-batch - Run random baseline on all sample tasks" - @echo " make score - Score submissions against ground truth" + @echo " make run-benchmark CONFIG= DATA_SOURCE= - Run a full local benchmark" + @echo " make score CONFIGS=\"config1,config2\" - Score configs across all datasets" + @echo " make upload CONFIG= DATASET= - Upload submissions to Hugging Face" @echo " make clean - Remove generated files and caches" install: @@ -43,6 +45,22 @@ score: --submission_dir submissions/random-batch \ --print_logs +# Upload PUBLIC submissions to Hugging Face (NEVER semiprivate or private) +# Usage: make upload CONFIG=minimax-m2.5 DATASET=v1 +# Usage: make upload CONFIG=minimax-m2.5 DATASET=v2 +DATASET ?= +upload: +ifndef CONFIG 
+ $(error CONFIG is required. Usage: make upload CONFIG= DATASET=) +endif +ifndef DATASET + $(error DATASET is required (v1 or v2). Usage: make upload CONFIG= DATASET=) +endif + python cli/submission_cli.py upload \ + submissions/$(CONFIG)/public-$(DATASET)/evaluation \ + --model-name $(CONFIG) \ + --task-set arc_agi_$(DATASET)_public_eval + clean: rm -rf __pycache__ .pytest_cache rm -rf src/arc_agi_benchmarking/__pycache__ diff --git a/cli/score_table.py b/cli/score_table.py new file mode 100644 index 00000000..74343dc8 --- /dev/null +++ b/cli/score_table.py @@ -0,0 +1,131 @@ +"""Score multiple configs across datasets and display as a table.""" + +import argparse +import io +import os +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src")) + +from arc_agi_benchmarking.scoring.scoring import ARCScorer +from arc_agi_benchmarking.utils.task_utils import read_models_config + +DATASETS = [ + ("public-v1/evaluation", "Public Eval v1"), + ("public-v2/evaluation", "Public Eval v2"), + ("semiprivate-v1/evaluation", "Semi-Private v1"), + ("semiprivate-v2/evaluation", "Semi-Private v2"), +] + + +def score_config_dataset(config: str, dataset_path: str, base_dir: Path) -> dict | None: + task_dir = base_dir / "data" / dataset_path + submission_dir = base_dir / "submissions" / config / dataset_path + + if not submission_dir.exists() or not any(submission_dir.glob("*.json")): + return None + + if not task_dir.exists(): + return None + + scorer = ARCScorer( + task_dir=str(task_dir), + submission_dir=str(submission_dir), + print_logs=False, + ) + # Suppress the scorer's built-in print output + old_stdout = sys.stdout + sys.stdout = io.StringIO() + try: + total_score, total_tasks = scorer.score_submission() + finally: + sys.stdout = old_stdout + return { + "score": total_score, + "total_tasks": total_tasks, + "cost": scorer.total_cost, + "cost_per_task": scorer.total_cost / total_tasks, + } + + +def format_cell(result: dict | 
None) -> str:
+    if result is None:
+        return "-"
+    pct = (
+        (result["score"] / result["total_tasks"] * 100)
+        if result["total_tasks"] > 0
+        else 0
+    )
+    return f"{pct:.2f}% / {result['score']:.4g}/{result['total_tasks']} / ${result['cost_per_task']:.2f}"
+
+
+DISPLAY_PARAMS = ["reasoning_effort", "thinking", "thinking_config", "reasoning", "stream", "max_tokens"]
+
+
+def get_model_params(config: str) -> str:
+    """Get key model params from models.yml for display."""
+    try:
+        model_config = read_models_config(config)
+        parts = [f"provider={model_config.provider}"]
+        for key in DISPLAY_PARAMS:
+            if key in model_config.kwargs:
+                val = model_config.kwargs[key]
+                parts.append(f"{key}={val}")
+        return ", ".join(parts)
+    except Exception:
+        return ""
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Score multiple configs in a table")
+    parser.add_argument(
+        "configs",
+        type=lambda s: [c.strip() for c in s.split(",")],
+        help="Comma-separated model config names to score",
+    )
+    parser.add_argument(
+        "--base_dir",
+        type=str,
+        default=str(Path(__file__).resolve().parent.parent),
+        help="Base directory of arc-agi-benchmarking",
+    )
+    args = parser.parse_args()
+
+    base_dir = Path(args.base_dir)
+
+    # Collect results
+    rows = []
+    for config in args.configs:
+        row = {"config": config, "params": get_model_params(config)}
+        for dataset_path, _ in DATASETS:
+            row[dataset_path] = score_config_dataset(config, dataset_path, base_dir)
+        rows.append(row)
+
+    # Build table
+    headers = ["Model Config"] + [label for _, label in DATASETS] + ["Params"]
+    config_col_width = max(len(headers[0]), max(len(r["config"]) for r in rows))
+    col_widths = [config_col_width]
+    for i, (dataset_path, label) in enumerate(DATASETS):
+        cells = [format_cell(r[dataset_path]) for r in rows]
+        col_widths.append(max(len(label), max(len(c) for c in cells)))
+    params_width = max(len("Params"), max(len(r["params"]) for r in rows))
+    col_widths.append(params_width)
+
+    def 
fmt_row(values): + return "| " + " | ".join(v.ljust(w) for v, w in zip(values, col_widths)) + " |" + + sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|" + + print(fmt_row(headers)) + print(sep) + for row in rows: + cells = [row["config"]] + for dataset_path, _ in DATASETS: + cells.append(format_cell(row[dataset_path])) + cells.append(row["params"]) + print(fmt_row(cells)) + + +if __name__ == "__main__": + main() diff --git a/src/arc_agi_benchmarking/adapters/fireworks.py b/src/arc_agi_benchmarking/adapters/fireworks.py index 7a9cd7b3..0553ba74 100644 --- a/src/arc_agi_benchmarking/adapters/fireworks.py +++ b/src/arc_agi_benchmarking/adapters/fireworks.py @@ -18,12 +18,91 @@ def init_client(self): return OpenAI(api_key=api_key, base_url="https://api.fireworks.ai/inference/v1") - def _chat_completion(self, messages: List[Dict[str, str]]) -> Any: + def _prepare_kwargs(self) -> dict: api_kwargs = _filter_api_kwargs(self.model_config.kwargs) api_kwargs["store"] = False + return api_kwargs + + def _chat_completion(self, messages: List[Dict[str, str]]) -> Any: + api_kwargs = self._prepare_kwargs() + logger.debug( f"Calling Fireworks API with model: {self.model_config.model_name} and kwargs: {api_kwargs}" ) return self.client.chat.completions.create( - model=self.model_config.model_name, messages=messages, **api_kwargs + model=self.model_config.model_name, + messages=messages, + **api_kwargs, ) + + def _chat_completion_stream(self, messages: List[Dict[str, str]]) -> Any: + api_kwargs = self._prepare_kwargs() + stream_kwargs = {k: v for k, v in api_kwargs.items() if k != "stream"} + + logger.debug( + f"Starting streaming Fireworks API call with model: {self.model_config.model_name}" + ) + try: + stream = self.client.chat.completions.create( + model=self.model_config.model_name, + messages=messages, + stream=True, + stream_options={"include_usage": True}, + **stream_kwargs, + ) + + from openai.types.chat import ChatCompletion, ChatCompletionMessage + from 
openai.types.chat.chat_completion import Choice as OpenAIChoice + from openai.types import CompletionUsage + import time + + content_chunks = [] + last_chunk = None + finish_reason = "stop" + chunk_count = 0 + + for chunk in stream: + last_chunk = chunk + chunk_count += 1 + + if chunk.choices: + delta_content = chunk.choices[0].delta.content or "" + if delta_content: + content_chunks.append(delta_content) + + if chunk.choices and chunk.choices[0].finish_reason: + finish_reason = chunk.choices[0].finish_reason + + final_content = "".join(content_chunks) + usage_data = ( + last_chunk.usage if last_chunk and hasattr(last_chunk, "usage") else None + ) + response_id = last_chunk.id if last_chunk else f"stream-{int(time.time())}" + + if not usage_data: + logger.warning("No usage data received from streaming response") + usage_data = CompletionUsage( + prompt_tokens=0, completion_tokens=0, total_tokens=0 + ) + + return ChatCompletion( + id=response_id, + choices=[ + OpenAIChoice( + finish_reason=finish_reason, + index=0, + message=ChatCompletionMessage( + content=final_content, role="assistant" + ), + logprobs=None, + ) + ], + created=int(time.time()), + model=self.model_config.model_name, + object="chat.completion", + usage=usage_data, + ) + + except Exception as e: + logger.error(f"Error during streaming: {e}") + raise diff --git a/src/arc_agi_benchmarking/models.yml b/src/arc_agi_benchmarking/models.yml index b29dd95d..851c1588 100644 --- a/src/arc_agi_benchmarking/models.yml +++ b/src/arc_agi_benchmarking/models.yml @@ -9,10 +9,278 @@ models: input: 0.0 output: 0.0 +################ +#### Ollama #### +################ + + - name: "qwen3.5-0.8b" + model_name: "qwen3.5:0.8b" + provider: "ollama" + api_type: "chat_completions" + max_completion_tokens: 16000 + temperature: 1.0 + top_p: 0.95 + presence_penalty: 1.5 + enable_thinking: true + pricing: + date: "2026-03-03" + input: 0.00 + output: 0.00 + + - name: "qwen3.5-2b" + model_name: "qwen3.5:2b" + provider: "ollama" + 
api_type: "chat_completions" + max_completion_tokens: 16000 + temperature: 1.0 + top_p: 0.95 + presence_penalty: 1.5 + enable_thinking: true + pricing: + date: "2026-03-03" + input: 0.00 + output: 0.00 + + - name: "qwen3.5-4b" + model_name: "qwen3.5:4b" + provider: "ollama" + api_type: "chat_completions" + max_completion_tokens: 16000 + temperature: 1.0 + top_p: 0.95 + presence_penalty: 1.5 + enable_thinking: true + pricing: + date: "2026-03-03" + input: 0.00 + output: 0.00 + + - name: "qwen3.5-9b" + model_name: "qwen3.5:9b" + provider: "ollama" + api_type: "chat_completions" + max_completion_tokens: 16000 + temperature: 1.0 + top_p: 0.95 + presence_penalty: 1.5 + enable_thinking: true + pricing: + date: "2026-03-03" + input: 0.00 + output: 0.00 + ################ #### OpenAI #### ################ + - name: "gpt-5-4-pro-low" + model_name: "gpt-5.4-pro" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "low" + max_completion_tokens: 128000 + pricing: + date: "2026-03-04" + input: 30.00 + output: 180.00 + + - name: "gpt-5-4-pro-medium" + model_name: "gpt-5.4-pro" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "medium" + max_completion_tokens: 128000 + pricing: + date: "2026-03-04" + input: 30.00 + output: 180.00 + + - name: "gpt-5-4-pro-high" + model_name: "gpt-5.4-pro" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "high" + max_completion_tokens: 128000 + pricing: + date: "2026-03-04" + input: 30.00 + output: 180.00 + + - name: "gpt-5-4-pro-xhigh" + model_name: "gpt-5.4-pro" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "xhigh" + max_completion_tokens: 128000 + pricing: + date: "2026-03-04" + input: 30.00 + output: 180.00 + + - name: "gpt-5-4-mini-low" + model_name: "gpt-5.4-mini" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "low" + max_completion_tokens: 128000 + 
pricing: + date: "2026-03-17" + input: 0.75 + output: 4.50 + + - name: "gpt-5-4-mini-medium" + model_name: "gpt-5.4-mini" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "medium" + max_completion_tokens: 128000 + pricing: + date: "2026-03-17" + input: 0.75 + output: 4.50 + + - name: "gpt-5-4-mini-high" + model_name: "gpt-5.4-mini" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "high" + max_completion_tokens: 128000 + pricing: + date: "2026-03-17" + input: 0.75 + output: 4.50 + + - name: "gpt-5-4-mini-xhigh" + model_name: "gpt-5.4-mini" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "xhigh" + max_completion_tokens: 128000 + pricing: + date: "2026-03-17" + input: 0.75 + output: 4.50 + + - name: "gpt-5-4-nano-low" + model_name: "gpt-5.4-nano" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "low" + max_completion_tokens: 128000 + pricing: + date: "2026-03-17" + input: 0.20 + output: 1.25 + + - name: "gpt-5-4-nano-medium" + model_name: "gpt-5.4-nano" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "medium" + max_completion_tokens: 128000 + pricing: + date: "2026-03-17" + input: 0.20 + output: 1.25 + + - name: "gpt-5-4-nano-high" + model_name: "gpt-5.4-nano" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "high" + max_completion_tokens: 128000 + pricing: + date: "2026-03-17" + input: 0.20 + output: 1.25 + + - name: "gpt-5-4-nano-xhigh" + model_name: "gpt-5.4-nano" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "xhigh" + max_completion_tokens: 128000 + pricing: + date: "2026-03-17" + input: 0.20 + output: 1.25 + + - name: "gpt-5-4-low" + model_name: "gpt-5.4" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "low" + max_completion_tokens: 128000 + pricing: + date: 
"2026-03-04" + input: 2.50 + output: 15.00 + + - name: "gpt-5-4-medium" + model_name: "gpt-5.4" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "medium" + max_completion_tokens: 128000 + pricing: + date: "2026-03-04" + input: 2.50 + output: 15.00 + + - name: "gpt-5-4-high" + model_name: "gpt-5.4" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "high" + max_completion_tokens: 128000 + pricing: + date: "2026-03-04" + input: 2.50 + output: 15.00 + + - name: "gpt-5-4-xhigh" + model_name: "gpt-5.4" + provider: "openai" + api_type: "responses" + background: true + reasoning: + effort: "xhigh" + max_completion_tokens: 128000 + pricing: + date: "2026-03-04" + input: 2.50 + output: 15.00 + - name: "gpt-5-2-pro-2025-12-11-medium" model_name: "gpt-5.2-pro-2025-12-11" provider: "openai" @@ -547,6 +815,17 @@ models: input: 2.00 output: 8.00 + - name: "gpt-4o" + model_name: "gpt-4o" + provider: "openai" + api_type: "responses" + max_completion_tokens: 8000 + stream: true + pricing: + date: "2025-02-23" + input: 2.50 # Standard input price; note: a cached input rate of 1.25 exists. + output: 10.00 + - name: "gpt-4o-2024-11-20" model_name: "gpt-4o-2024-11-20" provider: "openai" @@ -724,9 +1003,28 @@ models: max_completion_tokens: 65536 pricing: date: "2025-04-28" - input: 1.10 + input: 1.10 output: 4.40 + # 2023-era models + - name: "gpt-3-5-turbo" + model_name: "gpt-3.5-turbo" + provider: "openai" + max_completion_tokens: 4096 + pricing: + date: "2023-03-01" + input: 0.50 + output: 1.50 + + - name: "gpt-4" + model_name: "gpt-4-0613" + provider: "openai" + max_completion_tokens: 8192 + pricing: + date: "2023-03-14" + input: 30.00 + output: 60.00 + ################### #### Grok (X.AI) #### ################### @@ -1096,6 +1394,16 @@ models: input: 15.00 # Single input price per 1M tokens. output: 75.00 # Single output price per 1M tokens. 
+ # 2024-era model + - name: "claude-3-haiku" + model_name: "anthropic/claude-3-haiku" + provider: "openrouter" + max_tokens: 4096 + pricing: + date: "2024-03-13" + input: 0.25 + output: 1.25 + ########################## #### Claude Agent SDK #### ########################## @@ -1488,6 +1796,60 @@ models: input: 3.00 output: 15.00 + - name: "grok-4.20-beta-0309b-reasoning" + model_name: "grok-4.20-beta-0309b-reasoning" + provider: "xai" + api_type: "responses" + max_completion_tokens: 128000 + temperature: 0.7 + top_p: 1.0 + stream: true + pricing: + date: "2026-03-11" + input: 2.00 + output: 6.00 + + - name: "grok-4.20-multi-agent-beta-0309" + model_name: "grok-4.20-multi-agent-beta-0309" + provider: "xai" + api_type: "responses" + max_completion_tokens: 128000 + temperature: 0.7 + top_p: 1.0 + stream: true + pricing: + date: "2026-03-11" + input: 2.00 + output: 6.00 + + - name: "grok-4.20-multi-agent-beta-0309-high" + model_name: "grok-4.20-multi-agent-beta-0309" + provider: "xai" + api_type: "responses" + max_completion_tokens: 128000 + reasoning_effort: "high" + temperature: 0.7 + top_p: 1.0 + stream: true + pricing: + date: "2026-03-11" + input: 2.00 + output: 6.00 + + - name: "grok-4.20-multi-agent-beta-0309-xhigh" + model_name: "grok-4.20-multi-agent-beta-0309" + provider: "xai" + api_type: "responses" + max_completion_tokens: 128000 + reasoning_effort: "xhigh" + temperature: 0.7 + top_p: 1.0 + stream: true + pricing: + date: "2026-03-11" + input: 2.00 + output: 6.00 + - name: "grok-3-mini-xai-high" model_name: "grok-3-mini" provider: "xai" @@ -1509,6 +1871,8 @@ models: model_name: "accounts/fireworks/models/kimi-k2p5" provider: "fireworks" max_tokens: 100000 + stream: true + reasoning_effort: "high" pricing: date: "2025-01-27" input: 0.60 @@ -1536,11 +1900,55 @@ models: model_name: "accounts/fireworks/models/qwen3-235b-a22b-instruct-2507" provider: "fireworks" max_tokens: 100000 + stream: true + reasoning_effort: "high" pricing: date: "2025-07-21" input: 
0.12 output: 0.59 + - name: "deepseek-v3.2" + model_name: "accounts/fireworks/models/deepseek-v3p2" + provider: "fireworks" + max_tokens: 100000 + stream: true + reasoning_effort: "high" + rate_limit: + rate: 2 + period: 60 + pricing: + date: "2026-02-20" + input: 0.56 + output: 1.68 + + - name: "glm-5" + model_name: "accounts/fireworks/models/glm-5" + provider: "fireworks" + max_tokens: 100000 + stream: true + reasoning_effort: "high" + rate_limit: + rate: 2 + period: 60 + pricing: + date: "2026-02-20" + input: 1.00 + output: 3.20 + + - name: "minimax-m2.5" + model_name: "accounts/fireworks/models/minimax-m2p5" + provider: "fireworks" + max_tokens: 100000 + stream: true + reasoning_effort: "high" + rate_limit: + rate: 2 + period: 60 + pricing: + date: "2026-02-12" + input: 0.30 + output: 1.20 + ######################## ###### DashScope ####### ######################## From 1fe1bce5f3f6394ff9f1aeb86667dce6cda506bc Mon Sep 17 00:00:00 2001 From: Eric Campbell Date: Thu, 19 Mar 2026 14:55:47 -0500 Subject: [PATCH 2/7] Requeue failed/in-progress tasks on resume instead of skipping them. --- cli/run_all.py | 34 +++++++-- .../checkpoint/batch_progress.py | 30 ++++---- .../tests/test_checkpoint.py | 71 +++++++++++++++++++ 3 files changed, 116 insertions(+), 19 deletions(-) diff --git a/cli/run_all.py b/cli/run_all.py index 103a202e..9c83a6b5 100644 --- a/cli/run_all.py +++ b/cli/run_all.py @@ -385,29 +385,53 @@ async def main(task_list_file: Optional[str], # Determine which tasks to run if resume: - # Only run pending tasks that don't already have submission files on disk + # Run any unfinished task that doesn't already have a submission on disk. + # This lets interrupted runs pick up failed/in-progress tasks automatically. 
tasks_to_run = [] skipped_existing_submissions = 0 + requeued_unfinished_tasks = 0 for task_id in task_ids: task_progress = progress_manager.progress.tasks.get(task_id) - if not task_progress or task_progress.status != TaskStatus.PENDING: + if not task_progress: continue + # Check if submission already exists on disk (handles pre-checkpoint runs) if submission_exists(save_submission_dir, task_id): - progress_manager.mark_completed(task_id) - skipped_existing_submissions += 1 + if task_progress.status != TaskStatus.COMPLETED: + progress_manager.mark_completed(task_id) + skipped_existing_submissions += 1 + continue + + if task_progress.status == TaskStatus.COMPLETED: + # Checkpoint says completed but no submission file — requeue + logger.warning(f"Task {task_id} marked completed but no submission found on disk. Requeuing.") + progress_manager.requeue_task(task_id) + requeued_unfinished_tasks += 1 + tasks_to_run.append(task_id) continue + + if task_progress.status != TaskStatus.PENDING: + progress_manager.requeue_task(task_id) + requeued_unfinished_tasks += 1 + tasks_to_run.append(task_id) completed_count = progress_manager.progress.completed_count failed_count = progress_manager.progress.failed_count - if completed_count > 0 or failed_count > 0 or skipped_existing_submissions > 0: + if ( + completed_count > 0 + or failed_count > 0 + or skipped_existing_submissions > 0 + or requeued_unfinished_tasks > 0 + ): logger.info( f"Resuming: {completed_count} completed, {failed_count} failed, " f"{len(tasks_to_run)} remaining" ) if skipped_existing_submissions > 0: logger.info(f"Marked {skipped_existing_submissions} task(s) as completed (existing submissions found on disk)") + if requeued_unfinished_tasks > 0: + logger.info(f"Requeued {requeued_unfinished_tasks} unfinished task(s) with no submission file") else: tasks_to_run = task_ids logger.info("Resume disabled - running all tasks") diff --git a/src/arc_agi_benchmarking/checkpoint/batch_progress.py 
b/src/arc_agi_benchmarking/checkpoint/batch_progress.py index 7bc9807f..05cc8bf4 100644 --- a/src/arc_agi_benchmarking/checkpoint/batch_progress.py +++ b/src/arc_agi_benchmarking/checkpoint/batch_progress.py @@ -176,6 +176,20 @@ def update_task_progress( task.cost_usd = cost_usd self._save() + def requeue_task(self, task_id: str) -> bool: + """Reset a task back to pending so it can be picked up again.""" + task = self.progress.tasks.get(task_id) + if not task: + return False + + task.status = TaskStatus.PENDING + task.error = None + task.worker_id = None + task.started_at = None + task.completed_at = None + self._save() + return True + def reset_stale_tasks(self, max_age_seconds: int = 3600) -> int: """Reset tasks that have been in-progress too long (stale workers). @@ -196,14 +210,9 @@ def reset_stale_tasks(self, max_age_seconds: int = 3600) -> int: f"Resetting stale task {task.task_id} " f"(age: {age:.0f}s, worker: {task.worker_id})" ) - task.status = TaskStatus.PENDING - task.worker_id = None - task.started_at = None + self.requeue_task(task.task_id) reset_count += 1 - if reset_count > 0: - self._save() - return reset_count def is_complete(self) -> bool: @@ -221,16 +230,9 @@ def retry_failed_tasks(self) -> int: reset_count = 0 for task in self.progress.tasks.values(): if task.status == TaskStatus.FAILED: - task.status = TaskStatus.PENDING - task.error = None - task.worker_id = None - task.started_at = None - task.completed_at = None + self.requeue_task(task.task_id) reset_count += 1 - if reset_count > 0: - self._save() - return reset_count def get_summary(self) -> dict: diff --git a/src/arc_agi_benchmarking/tests/test_checkpoint.py b/src/arc_agi_benchmarking/tests/test_checkpoint.py index 226b97c7..416a1f9d 100644 --- a/src/arc_agi_benchmarking/tests/test_checkpoint.py +++ b/src/arc_agi_benchmarking/tests/test_checkpoint.py @@ -292,6 +292,24 @@ def test_mark_failed(self, manager: BatchProgressManager): assert task.status == TaskStatus.FAILED assert task.error == 
"API error" + def test_requeue_task(self, manager: BatchProgressManager): + manager.initialize_tasks(["task_1", "task_2"]) + manager.claim_task("task_1") + manager.mark_failed("task_1", error="API error") + manager.claim_task("task_2") + + assert manager.requeue_task("task_1") + assert manager.requeue_task("task_2") + + failed_task = manager.progress.tasks["task_1"] + in_progress_task = manager.progress.tasks["task_2"] + assert failed_task.status == TaskStatus.PENDING + assert failed_task.error is None + assert failed_task.worker_id is None + assert failed_task.started_at is None + assert failed_task.completed_at is None + assert in_progress_task.status == TaskStatus.PENDING + def test_mark_failed_accumulates_costs(self, manager: BatchProgressManager): manager.initialize_tasks(["task_1", "task_2"]) manager.claim_task("task_1") @@ -400,6 +418,59 @@ def test_retry_failed_tasks(self, manager: BatchProgressManager): assert manager.progress.tasks["task_2"].error is None assert manager.progress.tasks["task_3"].status == TaskStatus.PENDING + def test_resume_requeues_unfinished_tasks_without_submissions( + self, tmp_path: Path + ): + """ + Regression test: resume should rerun failed and interrupted tasks when + no submission file exists, instead of requiring checkpoint deletion. 
+ """ + from arc_agi_benchmarking.utils.submission_exists import submission_exists + + submission_dir = tmp_path / "submissions" + submission_dir.mkdir(parents=True) + + completed_submission = submission_dir / "task_1.json" + completed_submission.write_text('{"attempt_1": [[1, 2]], "attempt_2": [[3, 4]]}') + + checkpoint_dir = submission_dir / ".checkpoints" + storage = LocalStorageBackend(checkpoint_dir) + manager = BatchProgressManager(storage, run_id="test_config") + + all_task_ids = ["task_1", "task_2", "task_3", "task_4"] + manager.initialize_tasks(all_task_ids, attempts_per_task=2) + manager.claim_task("task_1") + manager.mark_completed("task_1") + manager.claim_task("task_2") + manager.mark_failed("task_2", "API error") + manager.claim_task("task_3") + # task_3 remains in progress to simulate interruption + + tasks_to_run = [] + for task_id in all_task_ids: + task_progress = manager.progress.tasks.get(task_id) + if not task_progress: + continue + + if submission_exists(str(submission_dir), task_id): + if task_progress.status != TaskStatus.COMPLETED: + manager.mark_completed(task_id) + continue + + if task_progress.status == TaskStatus.COMPLETED: + continue + + if task_progress.status != TaskStatus.PENDING: + manager.requeue_task(task_id) + + tasks_to_run.append(task_id) + + assert tasks_to_run == ["task_2", "task_3", "task_4"] + assert manager.progress.tasks["task_1"].status == TaskStatus.COMPLETED + assert manager.progress.tasks["task_2"].status == TaskStatus.PENDING + assert manager.progress.tasks["task_3"].status == TaskStatus.PENDING + assert manager.progress.tasks["task_4"].status == TaskStatus.PENDING + class TestTaskCheckpointManager: @pytest.fixture From 1b158897b6bb42a8f976bfbf04c9014c6b830c32 Mon Sep 17 00:00:00 2001 From: Eric Campbell Date: Thu, 19 Mar 2026 14:56:25 -0500 Subject: [PATCH 3/7] Extract _prepare_responses_kwargs(), default store=False, map reasoning_effort, fix response parsing. 
--- .../adapters/openai_base.py | 42 +++++++++----- .../tests/test_openai_providers.py | 57 +++++++++++++++++++ 2 files changed, 86 insertions(+), 13 deletions(-) diff --git a/src/arc_agi_benchmarking/adapters/openai_base.py b/src/arc_agi_benchmarking/adapters/openai_base.py index 1c2cd3ad..3754020b 100644 --- a/src/arc_agi_benchmarking/adapters/openai_base.py +++ b/src/arc_agi_benchmarking/adapters/openai_base.py @@ -26,7 +26,7 @@ # Keys in model_config.kwargs that are for internal use only and should NOT be passed to the API # Note: 'reasoning' and 'background' ARE valid Responses API parameters and should pass through -_CONFIG_ONLY_KWARGS = {"rate_limit", "pricing", "enable_thinking"} +_CONFIG_ONLY_KWARGS = {"rate_limit", "pricing", "enable_thinking", "reasoning_effort"} def _filter_api_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]: @@ -61,6 +61,25 @@ def __init__( class OpenAIBaseAdapter(ProviderAdapter, abc.ABC): + def _prepare_responses_kwargs(self) -> Dict[str, Any]: + """Prepare kwargs for Responses API calls.""" + api_kwargs = _filter_api_kwargs(self.model_config.kwargs) + + # Default to privacy-first behavior for Responses API providers unless the + # model config explicitly opts into server-side storage. 
+ if "store" not in api_kwargs: + api_kwargs["store"] = False + + # reasoning is valid for OpenAI Responses API - re-add if present + if "reasoning" in self.model_config.kwargs: + api_kwargs["reasoning"] = self.model_config.kwargs["reasoning"] + elif "reasoning_effort" in self.model_config.kwargs: + api_kwargs["reasoning"] = { + "effort": self.model_config.kwargs["reasoning_effort"] + } + + return api_kwargs + def make_prediction( self, prompt: str, @@ -256,11 +275,7 @@ def _responses(self, messages: List[Dict[str, str]]) -> Any: """ Make a call to the OpenAI Responses API """ - api_kwargs = _filter_api_kwargs(self.model_config.kwargs) - - # reasoning is valid for OpenAI Responses API - re-add if present - if "reasoning" in self.model_config.kwargs: - api_kwargs["reasoning"] = self.model_config.kwargs["reasoning"] + api_kwargs = self._prepare_responses_kwargs() resp = self.client.responses.create( model=self.model_config.model_name, input=messages, **api_kwargs @@ -292,13 +307,9 @@ def _responses_stream(self, messages: List[Dict[str, str]]) -> Any: ) # Prepare kwargs for streaming, removing 'stream' and config-only keys - api_kwargs = _filter_api_kwargs(self.model_config.kwargs) + api_kwargs = self._prepare_responses_kwargs() stream_kwargs = {k: v for k, v in api_kwargs.items() if k != "stream"} - # reasoning is valid for OpenAI Responses API - re-add if present - if "reasoning" in self.model_config.kwargs: - stream_kwargs["reasoning"] = self.model_config.kwargs["reasoning"] - try: # Create the stream stream = self.client.responses.create( @@ -536,8 +547,13 @@ def _get_content(self, response: Any) -> str: else: # APIType.RESPONSES content = getattr(response, "output_text", "") if not content and getattr(response, "output", None): - # Fallback to first text block - content = response.output[0].content[0].text or "" + # Fallback: find the first message item (skip reasoning items) + for item in response.output: + if getattr(item, "type", None) == "message" and 
hasattr( + item, "content" + ): + content = item.content[0].text or "" + break return content.strip() def _get_role(self, response: Any) -> str: diff --git a/src/arc_agi_benchmarking/tests/test_openai_providers.py b/src/arc_agi_benchmarking/tests/test_openai_providers.py index 7a1b6bfa..f00665e4 100644 --- a/src/arc_agi_benchmarking/tests/test_openai_providers.py +++ b/src/arc_agi_benchmarking/tests/test_openai_providers.py @@ -515,6 +515,63 @@ def test_responses_api_passes_background_and_reasoning_to_api(self, adapter_inst assert passed_kwargs['reasoning'] == {'effort': 'high'}, \ "reasoning value was not passed correctly" + def test_responses_default_store_false(self, adapter_instance): + """Responses API should default to store=false unless explicitly overridden.""" + adapter_instance.model_config.api_type = APIType.RESPONSES + adapter_instance.model_config.kwargs = { + 'max_output_tokens': 100000, + } + + mock_response = MagicMock() + mock_response.status = "completed" + adapter_instance.client.responses.create = MagicMock(return_value=mock_response) + + messages = [{"role": "user", "content": "Test prompt"}] + adapter_instance._responses(messages) + + adapter_instance.client.responses.create.assert_called_once() + passed_kwargs = adapter_instance.client.responses.create.call_args.kwargs + assert passed_kwargs["store"] is False + + def test_responses_respect_explicit_store_override(self, adapter_instance): + """Explicit store config should override the Responses API default.""" + adapter_instance.model_config.api_type = APIType.RESPONSES + adapter_instance.model_config.kwargs = { + 'store': True, + 'max_output_tokens': 100000, + } + + mock_response = MagicMock() + mock_response.status = "completed" + adapter_instance.client.responses.create = MagicMock(return_value=mock_response) + + messages = [{"role": "user", "content": "Test prompt"}] + adapter_instance._responses(messages) + + adapter_instance.client.responses.create.assert_called_once() + passed_kwargs = 
adapter_instance.client.responses.create.call_args.kwargs + assert passed_kwargs["store"] is True + + def test_responses_maps_reasoning_effort_to_reasoning_payload(self, adapter_instance): + """Responses API should translate config reasoning_effort into reasoning.effort.""" + adapter_instance.model_config.api_type = APIType.RESPONSES + adapter_instance.model_config.kwargs = { + 'reasoning_effort': 'xhigh', + 'max_output_tokens': 100000, + } + + mock_response = MagicMock() + mock_response.status = "completed" + adapter_instance.client.responses.create = MagicMock(return_value=mock_response) + + messages = [{"role": "user", "content": "Test prompt"}] + adapter_instance._responses(messages) + + adapter_instance.client.responses.create.assert_called_once() + passed_kwargs = adapter_instance.client.responses.create.call_args.kwargs + assert passed_kwargs["reasoning"] == {"effort": "xhigh"} + assert "reasoning_effort" not in passed_kwargs + # --- Tests for extract_json_from_response --- def test_extract_json_direct_array(self, adapter_class, adapter_instance): From c18212b9bcdc1315bfed168692a66c350de2e747 Mon Sep 17 00:00:00 2001 From: Eric Campbell Date: Thu, 19 Mar 2026 15:00:07 -0500 Subject: [PATCH 4/7] upload filter + preflight fix --- Makefile | 16 ++++++++++++++++ cli/submission_cli.py | 6 ++++-- src/arc_agi_benchmarking/adapters/__init__.py | 1 + src/arc_agi_benchmarking/utils/preflight.py | 13 +++++++------ 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 8a7eec95..a083c48f 100644 --- a/Makefile +++ b/Makefile @@ -61,6 +61,22 @@ endif --model-name $(CONFIG) \ --task-set arc_agi_$(DATASET)_public_eval +# Upload PUBLIC submissions to Hugging Face (NEVER semiprivate or private) +# Usage: make upload CONFIG=minimax-m2.5 DATASET=v1 +# Usage: make upload CONFIG=minimax-m2.5 DATASET=v2 +DATASET ?= +upload: +ifndef CONFIG + $(error CONFIG is required. 
Usage: make upload CONFIG= DATASET=) +endif +ifndef DATASET + $(error DATASET is required (v1 or v2). Usage: make upload CONFIG= DATASET=) +endif + python cli/submission_cli.py upload \ + submissions/$(CONFIG)/public-$(DATASET)/evaluation \ + --model-name $(CONFIG) \ + --task-set arc_agi_$(DATASET)_public_eval + clean: rm -rf __pycache__ .pytest_cache rm -rf src/arc_agi_benchmarking/__pycache__ diff --git a/cli/submission_cli.py b/cli/submission_cli.py index 27c1743f..aeff34e9 100644 --- a/cli/submission_cli.py +++ b/cli/submission_cli.py @@ -104,7 +104,8 @@ def upload(output_dir, model_name, task_set, org, public): folder_path=str(output_path), path_in_repo=model_name, # Places files in task_set/model_name/ repo_id=repo_id, - repo_type="dataset" + repo_type="dataset", + allow_patterns=["*.json"], ) click.echo(f"\nāœ… Successfully uploaded files to {repo_id}/{model_name}") @@ -153,7 +154,8 @@ def bulk_upload(submissions_dir, task_set, org, public): folder_path=str(model_dir), path_in_repo=model_name, repo_id=repo_id, - repo_type="dataset" + repo_type="dataset", + allow_patterns=["*.json"], ) click.echo(f"āœ… Successfully uploaded {model_name}") success_count += 1 diff --git a/src/arc_agi_benchmarking/adapters/__init__.py b/src/arc_agi_benchmarking/adapters/__init__.py index bd426008..971134db 100644 --- a/src/arc_agi_benchmarking/adapters/__init__.py +++ b/src/arc_agi_benchmarking/adapters/__init__.py @@ -16,3 +16,4 @@ from .random import RandomAdapter from .claudeagentsdk import ClaudeagentsdkAdapter from .codexcli import CodexcliAdapter +from .ollama import OllamaAdapter diff --git a/src/arc_agi_benchmarking/utils/preflight.py b/src/arc_agi_benchmarking/utils/preflight.py index fd961323..de94e345 100644 --- a/src/arc_agi_benchmarking/utils/preflight.py +++ b/src/arc_agi_benchmarking/utils/preflight.py @@ -32,6 +32,7 @@ "openrouter": ["OPENROUTER_API_KEY"], "codex": ["OPENAI_API_KEY", "CODEX_API_KEY"], # Either works "random": [], # No API key needed + "ollama": 
[], # Local inference, no API key needed } # Average tokens per ARC task (empirically estimated) @@ -243,20 +244,20 @@ def validate_output_dir(output_dir: str) -> ValidationResult: """Check if the output directory is writable.""" path = Path(output_dir) - # If it doesn't exist, check if parent is writable + # If it doesn't exist, create the full directory tree if not path.exists(): - parent = path.parent - if parent.exists() and os.access(parent, os.W_OK): + try: + path.mkdir(parents=True, exist_ok=True) return ValidationResult( passed=True, - message=f"Output directory will be created", + message=f"Output directory created", details=str(path.absolute()) ) - else: + except OSError as e: return ValidationResult( passed=False, message=f"Cannot create output directory", - details=f"Parent not writable: {parent.absolute()}" + details=str(e) ) if not path.is_dir(): From 535a536b75b3caf45c9360c3157bc086d2b72af0 Mon Sep 17 00:00:00 2001 From: Eric Campbell Date: Thu, 19 Mar 2026 15:07:49 -0500 Subject: [PATCH 5/7] remove unused ollama adapter --- src/arc_agi_benchmarking/adapters/__init__.py | 1 - src/arc_agi_benchmarking/models.yml | 60 ------------- src/arc_agi_benchmarking/utils/preflight.py | 88 ++++++++++--------- 3 files changed, 46 insertions(+), 103 deletions(-) diff --git a/src/arc_agi_benchmarking/adapters/__init__.py b/src/arc_agi_benchmarking/adapters/__init__.py index 971134db..bd426008 100644 --- a/src/arc_agi_benchmarking/adapters/__init__.py +++ b/src/arc_agi_benchmarking/adapters/__init__.py @@ -16,4 +16,3 @@ from .random import RandomAdapter from .claudeagentsdk import ClaudeagentsdkAdapter from .codexcli import CodexcliAdapter -from .ollama import OllamaAdapter diff --git a/src/arc_agi_benchmarking/models.yml b/src/arc_agi_benchmarking/models.yml index 851c1588..437c49e9 100644 --- a/src/arc_agi_benchmarking/models.yml +++ b/src/arc_agi_benchmarking/models.yml @@ -9,66 +9,6 @@ models: input: 0.0 output: 0.0 -################ -#### Ollama #### 
-################ - - - name: "qwen3.5-0.8b" - model_name: "qwen3.5:0.8b" - provider: "ollama" - api_type: "chat_completions" - max_completion_tokens: 16000 - temperature: 1.0 - top_p: 0.95 - presence_penalty: 1.5 - enable_thinking: true - pricing: - date: "2026-03-03" - input: 0.00 - output: 0.00 - - - name: "qwen3.5-2b" - model_name: "qwen3.5:2b" - provider: "ollama" - api_type: "chat_completions" - max_completion_tokens: 16000 - temperature: 1.0 - top_p: 0.95 - presence_penalty: 1.5 - enable_thinking: true - pricing: - date: "2026-03-03" - input: 0.00 - output: 0.00 - - - name: "qwen3.5-4b" - model_name: "qwen3.5:4b" - provider: "ollama" - api_type: "chat_completions" - max_completion_tokens: 16000 - temperature: 1.0 - top_p: 0.95 - presence_penalty: 1.5 - enable_thinking: true - pricing: - date: "2026-03-03" - input: 0.00 - output: 0.00 - - - name: "qwen3.5-9b" - model_name: "qwen3.5:9b" - provider: "ollama" - api_type: "chat_completions" - max_completion_tokens: 16000 - temperature: 1.0 - top_p: 0.95 - presence_penalty: 1.5 - enable_thinking: true - pricing: - date: "2026-03-03" - input: 0.00 - output: 0.00 - ################ #### OpenAI #### ################ diff --git a/src/arc_agi_benchmarking/utils/preflight.py b/src/arc_agi_benchmarking/utils/preflight.py index de94e345..f301b075 100644 --- a/src/arc_agi_benchmarking/utils/preflight.py +++ b/src/arc_agi_benchmarking/utils/preflight.py @@ -32,7 +32,6 @@ "openrouter": ["OPENROUTER_API_KEY"], "codex": ["OPENAI_API_KEY", "CODEX_API_KEY"], # Either works "random": [], # No API key needed - "ollama": [], # Local inference, no API key needed } # Average tokens per ARC task (empirically estimated) @@ -44,6 +43,7 @@ @dataclass class ValidationResult: """Result of a single validation check.""" + passed: bool message: str details: Optional[str] = None @@ -52,6 +52,7 @@ class ValidationResult: @dataclass class CostEstimate: """Estimated cost breakdown for a benchmark run.""" + num_tasks: int num_attempts_per_task: 
int total_attempts: int @@ -78,6 +79,7 @@ def __str__(self) -> str: @dataclass class PreflightReport: """Complete preflight validation report.""" + config_name: str validations: List[ValidationResult] cost_estimate: Optional[CostEstimate] @@ -124,19 +126,17 @@ def validate_config_exists(config_name: str) -> ValidationResult: return ValidationResult( passed=True, message=f"Config '{config_name}' found", - details=f"Model: {config.model_name}, Provider: {config.provider}" + details=f"Model: {config.model_name}, Provider: {config.provider}", ) except ValueError as e: return ValidationResult( - passed=False, - message=f"Config '{config_name}' not found", - details=str(e) + passed=False, message=f"Config '{config_name}' not found", details=str(e) ) except Exception as e: return ValidationResult( passed=False, message=f"Error reading config '{config_name}'", - details=str(e) + details=str(e), ) @@ -146,15 +146,14 @@ def validate_api_key(provider: str) -> ValidationResult: return ValidationResult( passed=False, message=f"Unknown provider '{provider}'", - details=f"Known providers: {', '.join(PROVIDER_API_KEYS.keys())}" + details=f"Known providers: {', '.join(PROVIDER_API_KEYS.keys())}", ) required_keys = PROVIDER_API_KEYS[provider] if not required_keys: return ValidationResult( - passed=True, - message=f"No API key required for '{provider}'" + passed=True, message=f"No API key required for '{provider}'" ) # Check if any of the valid keys exist @@ -162,17 +161,19 @@ def validate_api_key(provider: str) -> ValidationResult: if os.environ.get(key_name): # Mask the key for security key_value = os.environ.get(key_name, "") - masked = key_value[:4] + "..." + key_value[-4:] if len(key_value) > 8 else "***" + masked = ( + key_value[:4] + "..." 
+ key_value[-4:] if len(key_value) > 8 else "***" + ) return ValidationResult( passed=True, message=f"API key '{key_name}' found", - details=f"Value: {masked}" + details=f"Value: {masked}", ) return ValidationResult( passed=False, message=f"API key not found for '{provider}'", - details=f"Set one of: {', '.join(required_keys)}" + details=f"Set one of: {', '.join(required_keys)}", ) @@ -185,14 +186,14 @@ def validate_data_dir(data_dir: str) -> Tuple[ValidationResult, List[str]]: return ValidationResult( passed=False, message=f"Data directory not found", - details=str(path.absolute()) + details=str(path.absolute()), ), task_ids if not path.is_dir(): return ValidationResult( passed=False, message=f"Data path is not a directory", - details=str(path.absolute()) + details=str(path.absolute()), ), task_ids # Find all JSON files @@ -202,7 +203,7 @@ def validate_data_dir(data_dir: str) -> Tuple[ValidationResult, List[str]]: return ValidationResult( passed=False, message=f"No task files found in data directory", - details=str(path.absolute()) + details=str(path.absolute()), ), task_ids # Validate each file @@ -211,11 +212,11 @@ def validate_data_dir(data_dir: str) -> Tuple[ValidationResult, List[str]]: for json_file in json_files: try: - with open(json_file, 'r') as f: + with open(json_file, "r") as f: data = json.load(f) # Check for required keys - if 'train' in data and 'test' in data: + if "train" in data and "test" in data: valid_count += 1 task_ids.append(json_file.stem) else: @@ -226,17 +227,24 @@ def validate_data_dir(data_dir: str) -> Tuple[ValidationResult, List[str]]: invalid_files.append(f"{json_file.name} ({str(e)})") if invalid_files: - return ValidationResult( - passed=valid_count > 0, # Partial pass if some files are valid - message=f"Found {valid_count} valid tasks, {len(invalid_files)} invalid", - details=f"Invalid: {', '.join(invalid_files[:5])}" + - (f" (+{len(invalid_files)-5} more)" if len(invalid_files) > 5 else "") - ), task_ids + return ( + 
ValidationResult( + passed=valid_count > 0, # Partial pass if some files are valid + message=f"Found {valid_count} valid tasks, {len(invalid_files)} invalid", + details=f"Invalid: {', '.join(invalid_files[:5])}" + + ( + f" (+{len(invalid_files) - 5} more)" + if len(invalid_files) > 5 + else "" + ), + ), + task_ids, + ) return ValidationResult( passed=True, message=f"Found {valid_count} valid task files", - details=str(path.absolute()) + details=str(path.absolute()), ), task_ids @@ -251,33 +259,31 @@ def validate_output_dir(output_dir: str) -> ValidationResult: return ValidationResult( passed=True, message=f"Output directory created", - details=str(path.absolute()) + details=str(path.absolute()), ) except OSError as e: return ValidationResult( - passed=False, - message=f"Cannot create output directory", - details=str(e) + passed=False, message=f"Cannot create output directory", details=str(e) ) if not path.is_dir(): return ValidationResult( passed=False, message=f"Output path exists but is not a directory", - details=str(path.absolute()) + details=str(path.absolute()), ) if not os.access(path, os.W_OK): return ValidationResult( passed=False, message=f"Output directory not writable", - details=str(path.absolute()) + details=str(path.absolute()), ) return ValidationResult( passed=True, message=f"Output directory exists and is writable", - details=str(path.absolute()) + details=str(path.absolute()), ) @@ -348,10 +354,11 @@ def run_preflight( api_result = validate_api_key(model_config.provider) validations.append(api_result) else: - validations.append(ValidationResult( - passed=False, - message="Skipping API key validation (config not found)" - )) + validations.append( + ValidationResult( + passed=False, message="Skipping API key validation (config not found)" + ) + ) # 3. 
Validate data directory data_result, task_ids = validate_data_dir(data_dir) @@ -394,25 +401,22 @@ def main(): "--config", type=str, required=True, - help="Model configuration name from models.yml" + help="Model configuration name from models.yml", ) parser.add_argument( "--data_dir", type=str, default="data/sample/tasks", - help="Directory containing task JSON files" + help="Directory containing task JSON files", ) parser.add_argument( "--output_dir", type=str, default="submissions", - help="Directory for saving submissions" + help="Directory for saving submissions", ) parser.add_argument( - "--num_attempts", - type=int, - default=2, - help="Number of attempts per task" + "--num_attempts", type=int, default=2, help="Number of attempts per task" ) args = parser.parse_args() From 1dbb0f5b603e34306a2e7413dee927e688beacb6 Mon Sep 17 00:00:00 2001 From: Eric Campbell Date: Thu, 19 Mar 2026 15:48:27 -0500 Subject: [PATCH 6/7] fix preflight test --- src/arc_agi_benchmarking/tests/test_preflight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arc_agi_benchmarking/tests/test_preflight.py b/src/arc_agi_benchmarking/tests/test_preflight.py index 4cbbcca7..f6c68812 100644 --- a/src/arc_agi_benchmarking/tests/test_preflight.py +++ b/src/arc_agi_benchmarking/tests/test_preflight.py @@ -155,7 +155,7 @@ def test_nonexistent_dir_with_writable_parent(self): new_dir = os.path.join(tmpdir, "new_subdir") result = validate_output_dir(new_dir) assert result.passed is True - assert "will be created" in result.message.lower() + assert "created" in result.message.lower() def test_file_instead_of_dir(self): """Test validation when path is a file, not a directory.""" From 09d2cf86ac2a4734263708f52c12b1dd6784a58a Mon Sep 17 00:00:00 2001 From: Eric Campbell Date: Fri, 20 Mar 2026 08:41:58 -0500 Subject: [PATCH 7/7] add score table back in --- Makefile | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/Makefile 
b/Makefile index a083c48f..a2d31500 100644 --- a/Makefile +++ b/Makefile @@ -38,28 +38,28 @@ run-batch: --save_submission_dir submissions/random-batch \ --log-level INFO -# Score submissions -score: - python -m arc_agi_benchmarking.scoring.scoring \ - --task_dir data/sample/tasks \ - --submission_dir submissions/random-batch \ - --print_logs - -# Upload PUBLIC submissions to Hugging Face (NEVER semiprivate or private) -# Usage: make upload CONFIG=minimax-m2.5 DATASET=v1 -# Usage: make upload CONFIG=minimax-m2.5 DATASET=v2 -DATASET ?= -upload: +# Run a full benchmark locally +# Usage: make run-benchmark CONFIG=kimi-k2.5 DATA_SOURCE=semiprivate-v1/evaluation +CONFIG ?= +DATA_SOURCE ?= public-v2/evaluation +run-benchmark: ifndef CONFIG - $(error CONFIG is required. Usage: make upload CONFIG= DATASET=) + $(error CONFIG is required. Usage: make run-benchmark CONFIG= [DATA_SOURCE=]) endif -ifndef DATASET - $(error DATASET is required (v1 or v2). Usage: make upload CONFIG= DATASET=) + python cli/run_all.py \ + --data_dir data/$(DATA_SOURCE) \ + --config $(CONFIG) \ + --save_submission_dir submissions/$(CONFIG)/$(DATA_SOURCE) \ + --log-level INFO + +# Score one or more configs across all datasets in a table +# Usage: make score CONFIGS="kimi-k2.5,gpt-5-2-thinking-low-v1" +CONFIGS ?= +score: +ifndef CONFIGS + $(error CONFIGS is required. Usage: make score CONFIGS="config1,config2,...") endif - python cli/submission_cli.py upload \ - submissions/$(CONFIG)/public-$(DATASET)/evaluation \ - --model-name $(CONFIG) \ - --task-set arc_agi_$(DATASET)_public_eval + python cli/score_table.py "$(CONFIGS)" # Upload PUBLIC submissions to Hugging Face (NEVER semiprivate or private) # Usage: make upload CONFIG=minimax-m2.5 DATASET=v1