Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,9 @@ analyze_submission_times.py
/logs/*
/test_logs/*
.codex_*
dummy_submissions
dummy_submissions

data/semiprivate-v1
data/semiprivate-v2
data/public-v1
data/public-v2
48 changes: 41 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: help install test test-verbose run-sample run-batch clean score
.PHONY: help install test test-verbose run-sample run-batch run-benchmark clean score upload

help:
@echo "Available commands:"
Expand All @@ -7,7 +7,9 @@ help:
@echo " make test-verbose - Run tests with verbose output"
@echo " make run-sample - Run random baseline on sample task"
@echo " make run-batch - Run random baseline on all sample tasks"
@echo " make score - Score submissions against ground truth"
@echo " make run-benchmark CONFIG=<name> DATA_SOURCE=<path> - Run a full local benchmark"
@echo " make score CONFIGS=\"config1,config2\" - Score configs across all datasets"
@echo " make upload CONFIG=<name> DATASET=<v1|v2> - Upload submissions to Hugging Face"
@echo " make clean - Remove generated files and caches"

install:
Expand Down Expand Up @@ -36,12 +38,44 @@ run-batch:
--save_submission_dir submissions/random-batch \
--log-level INFO

# Score submissions
# Run a full benchmark locally
# Usage: make run-benchmark CONFIG=kimi-k2.5 DATA_SOURCE=semiprivate-v1/evaluation
CONFIG ?=
DATA_SOURCE ?= public-v2/evaluation
run-benchmark:
ifndef CONFIG
$(error CONFIG is required. Usage: make run-benchmark CONFIG=<model-config> [DATA_SOURCE=<path>])
endif
python cli/run_all.py \
--data_dir data/$(DATA_SOURCE) \
--config $(CONFIG) \
--save_submission_dir submissions/$(CONFIG)/$(DATA_SOURCE) \
--log-level INFO

# Score one or more configs across all datasets in a table
# Usage: make score CONFIGS="kimi-k2.5,gpt-5-2-thinking-low-v1"
CONFIGS ?=
score:
python -m arc_agi_benchmarking.scoring.scoring \
--task_dir data/sample/tasks \
--submission_dir submissions/random-batch \
--print_logs
ifndef CONFIGS
$(error CONFIGS is required. Usage: make score CONFIGS="config1,config2,...")
endif
python cli/score_table.py "$(CONFIGS)"

# Upload PUBLIC submissions to Hugging Face (NEVER semiprivate or private)
# Usage: make upload CONFIG=minimax-m2.5 DATASET=v1
# Usage: make upload CONFIG=minimax-m2.5 DATASET=v2
DATASET ?=
upload:
ifndef CONFIG
$(error CONFIG is required. Usage: make upload CONFIG=<model-config> DATASET=<v1|v2>)
endif
ifndef DATASET
$(error DATASET is required (v1 or v2). Usage: make upload CONFIG=<model-config> DATASET=<v1|v2>)
endif
python cli/submission_cli.py upload \
submissions/$(CONFIG)/public-$(DATASET)/evaluation \
--model-name $(CONFIG) \
--task-set arc_agi_$(DATASET)_public_eval

clean:
rm -rf __pycache__ .pytest_cache
Expand Down
34 changes: 29 additions & 5 deletions cli/run_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,29 +385,53 @@ async def main(task_list_file: Optional[str],

# Determine which tasks to run
if resume:
# Only run pending tasks that don't already have submission files on disk
# Run any unfinished task that doesn't already have a submission on disk.
# This lets interrupted runs pick up failed/in-progress tasks automatically.
tasks_to_run = []
skipped_existing_submissions = 0
requeued_unfinished_tasks = 0
for task_id in task_ids:
task_progress = progress_manager.progress.tasks.get(task_id)
if not task_progress or task_progress.status != TaskStatus.PENDING:
if not task_progress:
continue

# Check if submission already exists on disk (handles pre-checkpoint runs)
if submission_exists(save_submission_dir, task_id):
progress_manager.mark_completed(task_id)
skipped_existing_submissions += 1
if task_progress.status != TaskStatus.COMPLETED:
progress_manager.mark_completed(task_id)
skipped_existing_submissions += 1
continue

if task_progress.status == TaskStatus.COMPLETED:
# Checkpoint says completed but no submission file — requeue
logger.warning(f"Task {task_id} marked completed but no submission found on disk. Requeuing.")
progress_manager.requeue_task(task_id)
requeued_unfinished_tasks += 1
tasks_to_run.append(task_id)
continue

if task_progress.status != TaskStatus.PENDING:
progress_manager.requeue_task(task_id)
requeued_unfinished_tasks += 1

tasks_to_run.append(task_id)

completed_count = progress_manager.progress.completed_count
failed_count = progress_manager.progress.failed_count
if completed_count > 0 or failed_count > 0 or skipped_existing_submissions > 0:
if (
completed_count > 0
or failed_count > 0
or skipped_existing_submissions > 0
or requeued_unfinished_tasks > 0
):
logger.info(
f"Resuming: {completed_count} completed, {failed_count} failed, "
f"{len(tasks_to_run)} remaining"
)
if skipped_existing_submissions > 0:
logger.info(f"Marked {skipped_existing_submissions} task(s) as completed (existing submissions found on disk)")
if requeued_unfinished_tasks > 0:
logger.info(f"Requeued {requeued_unfinished_tasks} unfinished task(s) with no submission file")
else:
tasks_to_run = task_ids
logger.info("Resume disabled - running all tasks")
Expand Down
131 changes: 131 additions & 0 deletions cli/score_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"""Score multiple configs across datasets and display as a table."""

import argparse
import io
import os
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

from arc_agi_benchmarking.scoring.scoring import ARCScorer
from arc_agi_benchmarking.utils.task_utils import read_models_config

DATASETS = [
("public-v1/evaluation", "Public Eval v1"),
("public-v2/evaluation", "Public Eval v2"),
("semiprivate-v1/evaluation", "Semi-Private v1"),
("semiprivate-v2/evaluation", "Semi-Private v2"),
]


def score_config_dataset(config: str, dataset_path: str, base_dir: Path) -> dict | None:
    """Score one config's submissions against one dataset's ground truth.

    Args:
        config: Model config name (subdirectory under ``submissions/``).
        dataset_path: Dataset subpath, e.g. ``"public-v2/evaluation"``.
        base_dir: Repository root containing ``data/`` and ``submissions/``.

    Returns:
        A dict with ``score``, ``total_tasks``, ``cost`` and ``cost_per_task``,
        or ``None`` when there are no submissions or no task data for this pair.
    """
    task_dir = base_dir / "data" / dataset_path
    submission_dir = base_dir / "submissions" / config / dataset_path

    # Nothing to score: no submission dir, no JSON submissions, or no tasks.
    if not submission_dir.exists() or not any(submission_dir.glob("*.json")):
        return None

    if not task_dir.exists():
        return None

    scorer = ARCScorer(
        task_dir=str(task_dir),
        submission_dir=str(submission_dir),
        print_logs=False,
    )
    # Suppress the scorer's built-in print output while scoring.
    old_stdout = sys.stdout
    sys.stdout = io.StringIO()
    try:
        total_score, total_tasks = scorer.score_submission()
    finally:
        sys.stdout = old_stdout
    return {
        "score": total_score,
        "total_tasks": total_tasks,
        "cost": scorer.total_cost,
        # Guard division: a submission dir can contain JSON files yet yield
        # zero scoreable tasks, which previously raised ZeroDivisionError.
        "cost_per_task": scorer.total_cost / total_tasks if total_tasks else 0.0,
    }


def format_cell(result: dict | None) -> str:
if result is None:
return "-"
pct = (
(result["score"] / result["total_tasks"] * 100)
if result["total_tasks"] > 0
else 0
)
return f"{pct:.2f}% / {result['score']:.4g}/{result['total_tasks']} / ${result['cost_per_task']:.2f}"


# Model-config kwargs worth surfacing in the summary table.
DISPLAY_PARAMS = ["reasoning_effort", "thinking", "thinking_config", "reasoning", "stream", "max_tokens"]


def get_model_params(config: str) -> str:
    """Summarize key model parameters from models.yml for table display.

    Returns a comma-separated "key=value" string (always starting with the
    provider), or "" when the config cannot be read or is malformed — a bad
    config should not abort the whole table.
    """
    try:
        model_config = read_models_config(config)
        parts = [f"provider={model_config.provider}"]
        for key in DISPLAY_PARAMS:
            if key in model_config.kwargs:
                parts.append(f"{key}={model_config.kwargs[key]}")
        return ", ".join(parts)
    # Original caught `(ValueError, Exception)`; `Exception` already subsumes
    # `ValueError`, so a single broad catch is equivalent and clearer.
    except Exception:
        return ""


def main():
    """Score each requested config on every known dataset and print a table.

    Reads config names from the CLI (comma-separated), scores each against
    all entries in DATASETS, then prints a Markdown-style table whose column
    widths fit the widest cell or header.
    """
    parser = argparse.ArgumentParser(description="Score multiple configs in a table")
    parser.add_argument(
        "configs",
        type=lambda s: [c.strip() for c in s.split(",")],
        help="Comma-separated model config names to score",
    )
    parser.add_argument(
        "--base_dir",
        type=str,
        default=str(Path(__file__).resolve().parent.parent),
        help="Base directory of arc-agi-benchmarking",
    )
    args = parser.parse_args()

    base_dir = Path(args.base_dir)

    # Collect results: one row per config, one cell per dataset.
    rows = []
    for config in args.configs:
        row = {"config": config, "params": get_model_params(config)}
        for dataset_path, _ in DATASETS:
            row[dataset_path] = score_config_dataset(config, dataset_path, base_dir)
        rows.append(row)

    # Compute per-column widths (config, one per dataset, params).
    headers = ["Model Config"] + [label for _, label in DATASETS] + ["Params"]
    col_widths = [max(len(headers[0]), max(len(r["config"]) for r in rows))]
    # Fixed: the original used `enumerate` here but never used the index.
    for dataset_path, label in DATASETS:
        cells = [format_cell(r[dataset_path]) for r in rows]
        col_widths.append(max(len(label), max(len(c) for c in cells)))
    col_widths.append(max(len("Params"), max(len(r["params"]) for r in rows)))

    def fmt_row(values):
        # Pad each cell to its column width, pipe-separated.
        return "| " + " | ".join(v.ljust(w) for v, w in zip(values, col_widths)) + " |"

    sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|"

    print(fmt_row(headers))
    print(sep)
    for row in rows:
        cells = [row["config"]]
        for dataset_path, _ in DATASETS:
            cells.append(format_cell(row[dataset_path]))
        cells.append(row["params"])
        print(fmt_row(cells))


if __name__ == "__main__":
main()
6 changes: 4 additions & 2 deletions cli/submission_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ def upload(output_dir, model_name, task_set, org, public):
folder_path=str(output_path),
path_in_repo=model_name, # Places files in task_set/model_name/
repo_id=repo_id,
repo_type="dataset"
repo_type="dataset",
allow_patterns=["*.json"],
)

click.echo(f"\n✅ Successfully uploaded files to {repo_id}/{model_name}")
Expand Down Expand Up @@ -153,7 +154,8 @@ def bulk_upload(submissions_dir, task_set, org, public):
folder_path=str(model_dir),
path_in_repo=model_name,
repo_id=repo_id,
repo_type="dataset"
repo_type="dataset",
allow_patterns=["*.json"],
)
click.echo(f"✅ Successfully uploaded {model_name}")
success_count += 1
Expand Down
83 changes: 81 additions & 2 deletions src/arc_agi_benchmarking/adapters/fireworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,91 @@ def init_client(self):

return OpenAI(api_key=api_key, base_url="https://api.fireworks.ai/inference/v1")

def _chat_completion(self, messages: List[Dict[str, str]]) -> Any:
def _prepare_kwargs(self) -> dict:
    """Assemble the keyword arguments for a Fireworks chat-completions call.

    Runs the model config's kwargs through ``_filter_api_kwargs`` and forces
    ``store=False`` (OpenAI-compatible parameter) on the result.
    """
    prepared = _filter_api_kwargs(self.model_config.kwargs)
    prepared["store"] = False
    return prepared

def _chat_completion(self, messages: List[Dict[str, str]]) -> Any:
    """Issue a non-streaming chat-completions request and return the raw response."""
    api_kwargs = self._prepare_kwargs()

    logger.debug(
        f"Calling Fireworks API with model: {self.model_config.model_name} and kwargs: {api_kwargs}"
    )

    request = dict(model=self.model_config.model_name, messages=messages)
    return self.client.chat.completions.create(**request, **api_kwargs)

def _chat_completion_stream(self, messages: List[Dict[str, str]]) -> Any:
    """Stream a chat completion and reassemble it into a ChatCompletion.

    Consumes the stream chunk by chunk, concatenating delta content, then
    returns a synthetic non-streaming ``ChatCompletion`` so callers can
    handle streamed and non-streamed responses identically.
    """
    api_kwargs = self._prepare_kwargs()
    # Drop any caller-supplied "stream" flag: stream=True is passed
    # explicitly below and a duplicate kwarg would raise TypeError.
    stream_kwargs = {k: v for k, v in api_kwargs.items() if k != "stream"}

    logger.debug(
        f"Starting streaming Fireworks API call with model: {self.model_config.model_name}"
    )
    try:
        stream = self.client.chat.completions.create(
            model=self.model_config.model_name,
            messages=messages,
            stream=True,
            # Ask for a final usage chunk (OpenAI-compatible stream option).
            stream_options={"include_usage": True},
            **stream_kwargs,
        )

        # Imported here, on the streaming path only.
        from openai.types.chat import ChatCompletion, ChatCompletionMessage
        from openai.types.chat.chat_completion import Choice as OpenAIChoice
        from openai.types import CompletionUsage
        import time

        content_chunks = []   # accumulated delta text, joined at the end
        last_chunk = None     # the final chunk carries usage, when provided
        finish_reason = "stop"  # fallback if the stream never reports one
        chunk_count = 0

        for chunk in stream:
            last_chunk = chunk
            chunk_count += 1

            if chunk.choices:
                delta_content = chunk.choices[0].delta.content or ""
                if delta_content:
                    content_chunks.append(delta_content)

            # Keep the most recent finish_reason reported by the stream.
            if chunk.choices and chunk.choices[0].finish_reason:
                finish_reason = chunk.choices[0].finish_reason

        final_content = "".join(content_chunks)
        usage_data = (
            last_chunk.usage if last_chunk and hasattr(last_chunk, "usage") else None
        )
        # Empty stream: synthesize an id so the response is still well-formed.
        response_id = last_chunk.id if last_chunk else f"stream-{int(time.time())}"

        if not usage_data:
            # Usage may be absent if the provider ignores include_usage;
            # zero it out so downstream accounting still gets a valid object.
            logger.warning("No usage data received from streaming response")
            usage_data = CompletionUsage(
                prompt_tokens=0, completion_tokens=0, total_tokens=0
            )

        # Rebuild a standard ChatCompletion from the accumulated pieces.
        return ChatCompletion(
            id=response_id,
            choices=[
                OpenAIChoice(
                    finish_reason=finish_reason,
                    index=0,
                    message=ChatCompletionMessage(
                        content=final_content, role="assistant"
                    ),
                    logprobs=None,
                )
            ],
            created=int(time.time()),
            model=self.model_config.model_name,
            object="chat.completion",
            usage=usage_data,
        )

    except Exception as e:
        logger.error(f"Error during streaming: {e}")
        raise
Loading
Loading