Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 108 additions & 134 deletions examples/hf_auto_eval.py
Original file line number Diff line number Diff line change
@@ -1,147 +1,121 @@
#!/usr/bin/env python3
"""
Batch‑evaluate all model/dataset/metric triples from perfect_model_dataset_metrics.json

Usage:
python hf_auto_eval.py --llm-model gpt-4o --runs 1 --max-fixes 5 --limit 20
"""

import argparse
import json
import os
import sys
from pathlib import Path

from tiny_scientist import TinyScientist


def test_docker_availability() -> bool:
    """Report whether Docker-backed experiment execution is available.

    Builds a ``DockerExperimentRunner`` (from ``tiny_scientist.tool``), reads
    its ``use_docker`` attribute, and prints a one-line status message.

    NOTE: despite the ``test_`` prefix this is a runtime probe that returns a
    value, not a unit test.

    Returns:
        True when the runner's ``use_docker`` is truthy. False when it is
        falsy, or when importing, constructing, or querying the runner raises.
    """
    try:
        # The import lives inside the try so that an ImportError is reported
        # by the except clause below rather than propagating to the caller.
        from tiny_scientist.tool import DockerExperimentRunner

        docker_enabled = bool(DockerExperimentRunner().use_docker)
    except Exception as e:  # best-effort probe: any failure means "no Docker"
        print(f"❌ Error checking Docker: {e}")
        return False

    if docker_enabled:
        print("✅ Docker is available and will be used")
    else:
        print("⚠️ Docker is not available, will use local execution")
    return docker_enabled


def create_formatted_idea(model: str, dataset: str, metric: str) -> dict:
    """Build the idea dictionary describing one model/dataset/metric evaluation.

    Args:
        model: Model identifier, interpolated verbatim into the text fields.
        dataset: Dataset identifier, interpolated verbatim into the text fields.
        metric: Name of the metric the evaluation focuses on.

    Returns:
        A dict with descriptive text fields, fixed numeric scores, and an
        ``"Experiment"`` sub-dict holding the three inputs unchanged.
    """
    # "/" and "-" are both mapped to "_" when forming the idea name.
    to_underscore = str.maketrans("/-", "__")
    model_slug = model.translate(to_underscore)
    dataset_slug = dataset.translate(to_underscore)

    idea = {"Name": f"evaluate_{model_slug}_{dataset_slug}"}
    idea["Title"] = f"Evaluating {model} on {dataset} using {metric} Metric"
    idea["Description"] = (
        f"Reproduce and evaluate the performance of the Hugging Face model {model} "
        f"on the {dataset} dataset, specifically measuring the {metric} metric "
        "to establish baseline performance."
    )
    idea["Problem"] = (
        f"Need to reproduce and validate the evaluation of {model} on {dataset} "
        f"with focus on {metric} metric for performance verification and comparison."
    )
    idea["Importance"] = (
        "Reproducing model evaluations is crucial for scientific reproducibility "
        "and establishing reliable baselines. "
        f"The {metric} metric provides key insights into model performance on {dataset}."
    )
    idea["Difficulty"] = (
        "Moderate - requires proper model loading, dataset preprocessing, "
        "and evaluation setup, but uses standard HuggingFace libraries."
    )
    idea["NoveltyComparison"] = (
        "While model evaluation is standard practice, "
        f"this specific reproduction of {model} on {dataset} focusing on {metric} "
        "provides valuable validation and baseline establishment."
    )
    idea["Approach"] = (
        f"Load the pre-trained {model} from HuggingFace, prepare the {dataset} dataset, "
        f"implement evaluation pipeline, and compute {metric} "
        "along with other relevant metrics."
    )

    # Fixed flags and scores, in the same key order as before.
    idea.update(
        is_experimental=True,
        Interestingness=6,
        Feasibility=9,
        Novelty=4,
        IntentAlignment=10,
        Score=7,
    )
    idea["Experiment"] = {"Model": model, "Dataset": dataset, "Metric": metric}
    return idea
# Allow importing tiny_scientist
sys.path.insert(0, str(Path(__file__).parent.parent))
from tiny_scientist.coder_docker import DockerCoder


def main():
def load_combinations(json_path: Path) -> list:
    """Load the model/dataset combinations stored in a JSON config file.

    Args:
        json_path: Path to a JSON file whose top-level object keeps the
            combinations under the ``"results"`` key.

    Returns:
        The value stored under ``"results"``, or an empty list when that key
        is absent or null.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the locale default.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Config file not found: {json_path}") from None
    # `or []` also covers an explicit `"results": null`, so callers can
    # always iterate the return value.
    return data.get("results") or []


def iter_triples(combos):
    """Yield (model_id, dataset_id, metric_name) for every simple metric.

    Skips metrics whose value is ``None`` or a dict (nested/complex), and
    skips combos whose ``"metrics"`` entry is missing, null, or not a dict.

    Args:
        combos: Iterable of dicts read with the keys ``"model_id"``,
            ``"dataset_id"`` and ``"metrics"``.

    Yields:
        Tuples ``(model, dataset, metric_name)``. ``model`` and ``dataset``
        are ``None`` when the corresponding key is absent from the combo.
    """
    for combo in combos:
        metrics = combo.get("metrics") or {}
        if not isinstance(metrics, dict):
            # A malformed metrics entry has nothing to iterate; skip the combo
            # instead of failing the whole batch.
            continue
        model = combo.get("model_id")
        dataset = combo.get("dataset_id")
        for metric_name, value in metrics.items():
            if value is None or isinstance(value, dict):
                continue
            yield model, dataset, metric_name


def main():
    """Batch-evaluate model/dataset/metric triples and write a JSON summary.

    Reads the combinations file named by ``--json-file``, expands it into
    triples with ``iter_triples``, optionally truncates the list to
    ``--limit`` entries, and runs ``DockerCoder.evaluate_model`` once per
    triple. Each triple gets its own output directory under
    ``simple_results/``; the per-triple outcomes and overall counts are
    written to ``simple_results/batch_summary.json``.

    Command-line options:
        --json-file: Combinations file (default: the
            ``perfect_model_dataset_metrics.json`` next to this script).
        --llm-model: Model name passed to ``DockerCoder`` (default "gpt-4o").
        --runs: Passed as ``max_runs`` to ``evaluate_model`` (default 1).
        --max-fixes: Passed as ``max_fixes`` to ``evaluate_model`` (default 15).
        --limit: Evaluate only the first N triples; 0 means all (default 30).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--json-file",
        default=str(Path(__file__).parent / "perfect_model_dataset_metrics.json"),
    )
    parser.add_argument("--llm-model", default="gpt-4o")
    parser.add_argument("--runs", type=int, default=1)
    parser.add_argument("--max-fixes", type=int, default=15)
    parser.add_argument(
        "--limit",
        type=int,
        default=30,
        help="Evaluate only first N triples (0 = all)",
    )
    args = parser.parse_args()

    combos = load_combinations(Path(args.json_file))
    triples = list(iter_triples(combos))
    if args.limit > 0:
        triples = triples[: args.limit]

    # Returning early here also guarantees the success-rate divisions below
    # never see a zero denominator.
    if not triples:
        print("No evaluable triples found.")
        return

    results_dir = "simple_results"
    os.makedirs(results_dir, exist_ok=True)

    total = len(triples)
    summary = []
    success_count = 0

    print(f"Starting evaluation of {total} triples...\n")
    for i, (model, dataset, metric) in enumerate(triples, 1):
        print("=" * 50)
        print(f"[{i}/{total}] {model} | {dataset} | {metric}")
        print("=" * 50)
        # "/" and "-" are replaced with "_" to form the directory name.
        safe_dir = f"{model}_{dataset}_{metric}".replace("/", "_").replace("-", "_")
        out_dir = f"{results_dir}/{safe_dir}"

        try:
            # Create a new DockerCoder instance for each evaluation with its
            # own output directory.
            coder = DockerCoder(model=args.llm_model, output_dir=out_dir)
            success, message = coder.evaluate_model(
                model_name=model,
                dataset_name=dataset,
                metric=metric,
                max_runs=args.runs,
                max_fixes=args.max_fixes,
            )
        except Exception as e:
            # Batch boundary: one failing triple is recorded in the summary
            # and must not abort the remaining evaluations.
            success, message = False, f"Exception: {e}"

        summary.append(
            {
                "model": model,
                "dataset": dataset,
                "metric": metric,
                "success": success,
                "message": message,
                "output_dir": out_dir if success else None,
            }
        )
        if success:
            success_count += 1

    # Save summary
    summary_path = f"{results_dir}/batch_summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "total": total,
                "success": success_count,
                "failed": total - success_count,
                "success_rate": success_count / total,
                "results": summary,
            },
            f,
            indent=2,
        )

    print("\nDone.")
    print(f"Success: {success_count} / {total} "
          f"({success_count / total * 100:.1f}%)")
    print(f"Summary written to {summary_path}")


if __name__ == "__main__":
Expand Down
Loading
Loading