Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,9 @@ analyze_submission_times.py
/logs/*
/test_logs/*
.codex_*
dummy_submissions
dummy_submissions

data/semiprivate-v1
data/semiprivate-v2
data/public-v1
data/public-v2
48 changes: 41 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: help install test test-verbose run-sample run-batch clean score
.PHONY: help install test test-verbose run-sample run-batch run-benchmark clean score upload

help:
@echo "Available commands:"
Expand All @@ -7,7 +7,9 @@ help:
@echo " make test-verbose - Run tests with verbose output"
@echo " make run-sample - Run random baseline on sample task"
@echo " make run-batch - Run random baseline on all sample tasks"
@echo " make score - Score submissions against ground truth"
@echo " make run-benchmark CONFIG=<name> DATA_SOURCE=<path> - Run a full local benchmark"
@echo " make score CONFIGS=\"config1,config2\" - Score configs across all datasets"
@echo " make upload CONFIG=<name> DATASET=<v1|v2> - Upload submissions to Hugging Face"
@echo " make clean - Remove generated files and caches"

install:
Expand Down Expand Up @@ -36,12 +38,44 @@ run-batch:
--save_submission_dir submissions/random-batch \
--log-level INFO

# Score submissions
# Run a full benchmark locally
# Usage: make run-benchmark CONFIG=kimi-k2.5 DATA_SOURCE=semiprivate-v1/evaluation
CONFIG ?=
DATA_SOURCE ?= public-v2/evaluation
run-benchmark:
ifndef CONFIG
$(error CONFIG is required. Usage: make run-benchmark CONFIG=<model-config> [DATA_SOURCE=<path>])
endif
python cli/run_all.py \
--data_dir data/$(DATA_SOURCE) \
--config $(CONFIG) \
--save_submission_dir submissions/$(CONFIG)/$(DATA_SOURCE) \
--log-level INFO

# Score one or more configs across all datasets in a table
# Usage: make score CONFIGS="kimi-k2.5,gpt-5-2-thinking-low-v1"
CONFIGS ?=
score:
python -m arc_agi_benchmarking.scoring.scoring \
--task_dir data/sample/tasks \
--submission_dir submissions/random-batch \
--print_logs
ifndef CONFIGS
$(error CONFIGS is required. Usage: make score CONFIGS="config1,config2,...")
endif
python cli/score_table.py "$(CONFIGS)"

# Upload PUBLIC submissions to Hugging Face (NEVER semiprivate or private)
# Usage: make upload CONFIG=minimax-m2.5 DATASET=v1
# Usage: make upload CONFIG=minimax-m2.5 DATASET=v2
DATASET ?=
upload:
ifndef CONFIG
$(error CONFIG is required. Usage: make upload CONFIG=<model-config> DATASET=<v1|v2>)
endif
ifndef DATASET
$(error DATASET is required (v1 or v2). Usage: make upload CONFIG=<model-config> DATASET=<v1|v2>)
endif
python cli/submission_cli.py upload \
submissions/$(CONFIG)/public-$(DATASET)/evaluation \
--model-name $(CONFIG) \
--task-set arc_agi_$(DATASET)_public_eval

clean:
rm -rf __pycache__ .pytest_cache
Expand Down
34 changes: 29 additions & 5 deletions cli/run_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,29 +385,53 @@ async def main(task_list_file: Optional[str],

# Determine which tasks to run
if resume:
# Only run pending tasks that don't already have submission files on disk
# Run any unfinished task that doesn't already have a submission on disk.
# This lets interrupted runs pick up failed/in-progress tasks automatically.
tasks_to_run = []
skipped_existing_submissions = 0
requeued_unfinished_tasks = 0
for task_id in task_ids:
task_progress = progress_manager.progress.tasks.get(task_id)
if not task_progress or task_progress.status != TaskStatus.PENDING:
if not task_progress:
continue

# Check if submission already exists on disk (handles pre-checkpoint runs)
if submission_exists(save_submission_dir, task_id):
progress_manager.mark_completed(task_id)
skipped_existing_submissions += 1
if task_progress.status != TaskStatus.COMPLETED:
progress_manager.mark_completed(task_id)
skipped_existing_submissions += 1
continue

if task_progress.status == TaskStatus.COMPLETED:
# Checkpoint says completed but no submission file — requeue
logger.warning(f"Task {task_id} marked completed but no submission found on disk. Requeuing.")
progress_manager.requeue_task(task_id)
requeued_unfinished_tasks += 1
tasks_to_run.append(task_id)
continue

if task_progress.status != TaskStatus.PENDING:
progress_manager.requeue_task(task_id)
requeued_unfinished_tasks += 1

tasks_to_run.append(task_id)

completed_count = progress_manager.progress.completed_count
failed_count = progress_manager.progress.failed_count
if completed_count > 0 or failed_count > 0 or skipped_existing_submissions > 0:
if (
completed_count > 0
or failed_count > 0
or skipped_existing_submissions > 0
or requeued_unfinished_tasks > 0
):
logger.info(
f"Resuming: {completed_count} completed, {failed_count} failed, "
f"{len(tasks_to_run)} remaining"
)
if skipped_existing_submissions > 0:
logger.info(f"Marked {skipped_existing_submissions} task(s) as completed (existing submissions found on disk)")
if requeued_unfinished_tasks > 0:
logger.info(f"Requeued {requeued_unfinished_tasks} unfinished task(s) with no submission file")
else:
tasks_to_run = task_ids
logger.info("Resume disabled - running all tasks")
Expand Down
131 changes: 131 additions & 0 deletions cli/score_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"""Score multiple configs across datasets and display as a table."""

import argparse
import io
import os
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

from arc_agi_benchmarking.scoring.scoring import ARCScorer
from arc_agi_benchmarking.utils.task_utils import read_models_config

DATASETS = [
("public-v1/evaluation", "Public Eval v1"),
("public-v2/evaluation", "Public Eval v2"),
("semiprivate-v1/evaluation", "Semi-Private v1"),
("semiprivate-v2/evaluation", "Semi-Private v2"),
]


def score_config_dataset(config: str, dataset_path: str, base_dir: Path) -> dict | None:
    """Score one config's submissions against one dataset's ground truth.

    Args:
        config: Model config name (subdirectory under ``submissions/``).
        dataset_path: Dataset subpath, e.g. ``"public-v2/evaluation"``.
        base_dir: Repository root containing ``data/`` and ``submissions/``.

    Returns:
        A dict with ``score``, ``total_tasks``, ``cost`` and ``cost_per_task``,
        or ``None`` when there are no submissions or no task data for this pair.
    """
    task_dir = base_dir / "data" / dataset_path
    submission_dir = base_dir / "submissions" / config / dataset_path

    # Nothing to score: no submission dir, no JSON submissions, or no tasks.
    if not submission_dir.exists() or not any(submission_dir.glob("*.json")):
        return None

    if not task_dir.exists():
        return None

    scorer = ARCScorer(
        task_dir=str(task_dir),
        submission_dir=str(submission_dir),
        print_logs=False,
    )
    # Suppress the scorer's built-in print output while scoring.
    old_stdout = sys.stdout
    sys.stdout = io.StringIO()
    try:
        total_score, total_tasks = scorer.score_submission()
    finally:
        sys.stdout = old_stdout
    return {
        "score": total_score,
        "total_tasks": total_tasks,
        "cost": scorer.total_cost,
        # Guard division: a submission dir can contain JSON files yet yield
        # zero scoreable tasks, which previously raised ZeroDivisionError.
        "cost_per_task": scorer.total_cost / total_tasks if total_tasks else 0.0,
    }


def format_cell(result: dict | None) -> str:
if result is None:
return "-"
pct = (
(result["score"] / result["total_tasks"] * 100)
if result["total_tasks"] > 0
else 0
)
return f"{pct:.2f}% / {result['score']:.4g}/{result['total_tasks']} / ${result['cost_per_task']:.2f}"


# Model-config kwargs worth surfacing in the summary table.
DISPLAY_PARAMS = ["reasoning_effort", "thinking", "thinking_config", "reasoning", "stream", "max_tokens"]


def get_model_params(config: str) -> str:
    """Summarize key model parameters from models.yml for table display.

    Returns a comma-separated "key=value" string (always starting with the
    provider), or "" when the config cannot be read or is malformed — a bad
    config should not abort the whole table.
    """
    try:
        model_config = read_models_config(config)
        parts = [f"provider={model_config.provider}"]
        for key in DISPLAY_PARAMS:
            if key in model_config.kwargs:
                parts.append(f"{key}={model_config.kwargs[key]}")
        return ", ".join(parts)
    # Original caught `(ValueError, Exception)`; `Exception` already subsumes
    # `ValueError`, so a single broad catch is equivalent and clearer.
    except Exception:
        return ""


def main():
    """Score each requested config on every known dataset and print a table.

    Reads config names from the CLI (comma-separated), scores each against
    all entries in DATASETS, then prints a Markdown-style table whose column
    widths fit the widest cell or header.
    """
    parser = argparse.ArgumentParser(description="Score multiple configs in a table")
    parser.add_argument(
        "configs",
        type=lambda s: [c.strip() for c in s.split(",")],
        help="Comma-separated model config names to score",
    )
    parser.add_argument(
        "--base_dir",
        type=str,
        default=str(Path(__file__).resolve().parent.parent),
        help="Base directory of arc-agi-benchmarking",
    )
    args = parser.parse_args()

    base_dir = Path(args.base_dir)

    # Collect results: one row per config, one cell per dataset.
    rows = []
    for config in args.configs:
        row = {"config": config, "params": get_model_params(config)}
        for dataset_path, _ in DATASETS:
            row[dataset_path] = score_config_dataset(config, dataset_path, base_dir)
        rows.append(row)

    # Compute per-column widths (config, one per dataset, params).
    headers = ["Model Config"] + [label for _, label in DATASETS] + ["Params"]
    col_widths = [max(len(headers[0]), max(len(r["config"]) for r in rows))]
    # Fixed: the original used `enumerate` here but never used the index.
    for dataset_path, label in DATASETS:
        cells = [format_cell(r[dataset_path]) for r in rows]
        col_widths.append(max(len(label), max(len(c) for c in cells)))
    col_widths.append(max(len("Params"), max(len(r["params"]) for r in rows)))

    def fmt_row(values):
        # Pad each cell to its column width, pipe-separated.
        return "| " + " | ".join(v.ljust(w) for v, w in zip(values, col_widths)) + " |"

    sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|"

    print(fmt_row(headers))
    print(sep)
    for row in rows:
        cells = [row["config"]]
        for dataset_path, _ in DATASETS:
            cells.append(format_cell(row[dataset_path]))
        cells.append(row["params"])
        print(fmt_row(cells))


if __name__ == "__main__":
main()
6 changes: 4 additions & 2 deletions cli/submission_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ def upload(output_dir, model_name, task_set, org, public):
folder_path=str(output_path),
path_in_repo=model_name, # Places files in task_set/model_name/
repo_id=repo_id,
repo_type="dataset"
repo_type="dataset",
allow_patterns=["*.json"],
)

click.echo(f"\n✅ Successfully uploaded files to {repo_id}/{model_name}")
Expand Down Expand Up @@ -153,7 +154,8 @@ def bulk_upload(submissions_dir, task_set, org, public):
folder_path=str(model_dir),
path_in_repo=model_name,
repo_id=repo_id,
repo_type="dataset"
repo_type="dataset",
allow_patterns=["*.json"],
)
click.echo(f"✅ Successfully uploaded {model_name}")
success_count += 1
Expand Down
83 changes: 81 additions & 2 deletions src/arc_agi_benchmarking/adapters/fireworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,91 @@ def init_client(self):

return OpenAI(api_key=api_key, base_url="https://api.fireworks.ai/inference/v1")

def _chat_completion(self, messages: List[Dict[str, str]]) -> Any:
def _prepare_kwargs(self) -> dict:
    """Assemble the keyword arguments for a Fireworks chat-completions call.

    Runs the model config's kwargs through ``_filter_api_kwargs`` and forces
    ``store=False`` (OpenAI-compatible parameter) on the result.
    """
    prepared = _filter_api_kwargs(self.model_config.kwargs)
    prepared["store"] = False
    return prepared

def _chat_completion(self, messages: List[Dict[str, str]]) -> Any:
    """Issue a non-streaming chat-completions request and return the raw response."""
    api_kwargs = self._prepare_kwargs()

    logger.debug(
        f"Calling Fireworks API with model: {self.model_config.model_name} and kwargs: {api_kwargs}"
    )

    request = dict(model=self.model_config.model_name, messages=messages)
    return self.client.chat.completions.create(**request, **api_kwargs)

def _chat_completion_stream(self, messages: List[Dict[str, str]]) -> Any:
    """Stream a chat completion and reassemble it into a ChatCompletion.

    Consumes the stream chunk by chunk, concatenating delta content, then
    returns a synthetic non-streaming ``ChatCompletion`` so callers can
    handle streamed and non-streamed responses identically.
    """
    api_kwargs = self._prepare_kwargs()
    # Drop any caller-supplied "stream" flag: stream=True is passed
    # explicitly below and a duplicate kwarg would raise TypeError.
    stream_kwargs = {k: v for k, v in api_kwargs.items() if k != "stream"}

    logger.debug(
        f"Starting streaming Fireworks API call with model: {self.model_config.model_name}"
    )
    try:
        stream = self.client.chat.completions.create(
            model=self.model_config.model_name,
            messages=messages,
            stream=True,
            # Ask for a final usage chunk (OpenAI-compatible stream option).
            stream_options={"include_usage": True},
            **stream_kwargs,
        )

        # Imported here, on the streaming path only.
        from openai.types.chat import ChatCompletion, ChatCompletionMessage
        from openai.types.chat.chat_completion import Choice as OpenAIChoice
        from openai.types import CompletionUsage
        import time

        content_chunks = []   # accumulated delta text, joined at the end
        last_chunk = None     # the final chunk carries usage, when provided
        finish_reason = "stop"  # fallback if the stream never reports one
        chunk_count = 0

        for chunk in stream:
            last_chunk = chunk
            chunk_count += 1

            if chunk.choices:
                delta_content = chunk.choices[0].delta.content or ""
                if delta_content:
                    content_chunks.append(delta_content)

            # Keep the most recent finish_reason reported by the stream.
            if chunk.choices and chunk.choices[0].finish_reason:
                finish_reason = chunk.choices[0].finish_reason

        final_content = "".join(content_chunks)
        usage_data = (
            last_chunk.usage if last_chunk and hasattr(last_chunk, "usage") else None
        )
        # Empty stream: synthesize an id so the response is still well-formed.
        response_id = last_chunk.id if last_chunk else f"stream-{int(time.time())}"

        if not usage_data:
            # Usage may be absent if the provider ignores include_usage;
            # zero it out so downstream accounting still gets a valid object.
            logger.warning("No usage data received from streaming response")
            usage_data = CompletionUsage(
                prompt_tokens=0, completion_tokens=0, total_tokens=0
            )

        # Rebuild a standard ChatCompletion from the accumulated pieces.
        return ChatCompletion(
            id=response_id,
            choices=[
                OpenAIChoice(
                    finish_reason=finish_reason,
                    index=0,
                    message=ChatCompletionMessage(
                        content=final_content, role="assistant"
                    ),
                    logprobs=None,
                )
            ],
            created=int(time.time()),
            model=self.model_config.model_name,
            object="chat.completion",
            usage=usage_data,
        )

    except Exception as e:
        logger.error(f"Error during streaming: {e}")
        raise
Loading
Loading