ulab-uiuc · lwaekfjlk · Jul 25, 2025 · Jul 25, 2025 · Aug 1, 2025
diff --git a/examples/hf_auto_eval.py b/examples/hf_auto_eval.py
@@ -1,147 +1,121 @@
+#!/usr/bin/env python3
+"""
+Batch-evaluate all model/dataset/metric triples from perfect_model_dataset_metrics.json.
+
+Usage:
+    python hf_auto_eval.py --llm-model gpt-4o --runs 1 --max-fixes 5 --limit 20
+"""
+
 import argparse
+import json
+import os
+import sys
+from pathlib import Path
 
-from tiny_scientist import TinyScientist
-
-
-def test_docker_availability() -> bool:
-    """Test if Docker is available."""
-    try:
-        from tiny_scientist.tool import DockerExperimentRunner
-
-        runner = DockerExperimentRunner()
-        if runner.use_docker:
-            print("✅ Docker is available and will be used")
-            return True
-        else:
-            print("⚠️  Docker is not available, will use local execution")
-            return False
-    except Exception as e:
-        print(f"❌ Error checking Docker: {e}")
-        return False
-
-
-def create_formatted_idea(model: str, dataset: str, metric: str) -> dict:
-    """Create a formatted idea dictionary that matches TinyScientist's expected structure."""
-    return {
-        "Name": f"evaluate_{model.replace('/', '_').replace('-', '_')}_{dataset.replace('/', '_').replace('-', '_')}",
-        "Title": f"Evaluating {model} on {dataset} using {metric} Metric",
-        "Description": f"Reproduce and evaluate the performance of the Hugging Face model {model} on the {dataset} dataset, specifically measuring the {metric} metric to establish baseline performance.",
-        "Problem": f"Need to reproduce and validate the evaluation of {model} on {dataset} with focus on {metric} metric for performance verification and comparison.",
-        "Importance": f"Reproducing model evaluations is crucial for scientific reproducibility and establishing reliable baselines. The {metric} metric provides key insights into model performance on {dataset}.",
-        "Difficulty": "Moderate - requires proper model loading, dataset preprocessing, and evaluation setup, but uses standard HuggingFace libraries.",
-        "NoveltyComparison": f"While model evaluation is standard practice, this specific reproduction of {model} on {dataset} focusing on {metric} provides valuable validation and baseline establishment.",
-        "Approach": f"Load the pre-trained {model} from HuggingFace, prepare the {dataset} dataset, implement evaluation pipeline, and compute {metric} along with other relevant metrics.",
-        "is_experimental": True,
-        "Interestingness": 6,
-        "Feasibility": 9,
-        "Novelty": 4,
-        "IntentAlignment": 10,
-        "Score": 7,
-        "Experiment": {"Model": model, "Dataset": dataset, "Metric": metric},
-    }
+# Allow importing tiny_scientist
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from tiny_scientist.coder_docker import DockerCoder
 
 
-def main():
+def load_combinations(json_path: Path) -> list:
+    """Return the "results" list of the JSON config at json_path; raise FileNotFoundError if it is missing."""
+    if not json_path.exists():
+        raise FileNotFoundError(f"Config file not found: {json_path}")
+    with open(json_path, "r", encoding="utf-8") as f:  # JSON is UTF-8; do not rely on the locale default
+        return json.load(f).get("results") or []  # a missing or null "results" both yield []
+
+
+def iter_triples(combos):
     """
-    This script uses TinyScientist to automate the process of reproducing
-    a model evaluation on a given dataset for a specific task.
+    Yield (model_id, dataset_id, metric_name) for every simple metric in combos.
+    Skips entries lacking either id and metrics whose value is None or a dict (nested/complex).
     """
-    parser = argparse.ArgumentParser(
-        description="Reproduce a model evaluation using TinyScientist."
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        required=True,
-        help="The Hugging Face model name (e.g., 'dslim/bert-base-NER').",
-    )
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        required=True,
-        help="The Hugging Face dataset name (e.g., 'eriktks/conll2003').",
-    )
-    parser.add_argument(
-        "--metric",
-        type=str,
-        required=True,
-        help="The specific metric to evaluate (e.g., 'F1', 'accuracy', 'BLEU', 'ROUGE', 'precision', 'recall').",
-    )
-    parser.add_argument(
-        "--gpt_model",
-        type=str,
-        default="gpt-4o",
-        help="The GPT model to use for TinyScientist.",
-    )
-    parser.add_argument(
-        "--use_docker",
-        action="store_true",
-        default=True,
-        help="Use Docker for experiment execution (default: True)",
-    )
+    for combo in combos:
+        model, dataset = combo.get("model_id"), combo.get("dataset_id")
+        if not model or not dataset:
+            continue  # incomplete entry: it cannot be evaluated, so never yield a None id
+        for metric_name, value in (combo.get("metrics") or {}).items():
+            # None means the metric has no value; a dict is a nested/complex metric.
+            if value is not None and not isinstance(value, dict):
+                yield model, dataset, metric_name
 
+
+def main():
+    """Evaluate every (model, dataset, metric) triple from the JSON config and write a JSON summary."""
+    # Default config: perfect_model_dataset_metrics.json in the same directory as this script.
+    default_json = Path(__file__).parent / "perfect_model_dataset_metrics.json"
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--json-file", default=str(default_json))
+    parser.add_argument("--llm-model", default="gpt-4o")
+    parser.add_argument("--runs", type=int, default=1)
+    parser.add_argument("--max-fixes", type=int, default=15)
+    parser.add_argument("--limit", type=int, default=30, help="Evaluate only first N triples (0 = all)")
     args = parser.parse_args()
 
-    # Test Docker availability
-    docker_available = test_docker_availability()
-
-    if args.use_docker and not docker_available:
-        print("⚠️  Docker requested but not available, falling back to local execution")
-        args.use_docker = False
-
-    # Before running, ensure you have tiny_scientist installed:
-    # pip install tiny-scientist
-
-    # Initialize TinyScientist with the specified model and Docker configuration
-    print(f"Initializing TinyScientist with model: {args.gpt_model}")
-    print(f"Docker enabled: {args.use_docker}")
-    scientist = TinyScientist(model=args.gpt_model, use_docker=args.use_docker)
-
-    # 1. Define the research intent based on user input.
-    # This string is the core instruction for TinyScientist.
-    intent = (
-        f"I want to write a script to reproduce the evaluation of the Hugging Face model '{args.model}' "
-        f"on the dataset '{args.dataset}'. I want to specifically measure the {args.metric} metric. "
-        f"The script should load the model and dataset, run the evaluation, "
-        f"and report the {args.metric} metric along with other relevant evaluation metrics."
-    )
-
-    print(f"🔬 Intent: {intent}")
-
-    # Step 1: Create a formatted idea directly (skipping scientist.think)
-    print("\nStep 1: Creating formatted research idea...")
-    idea = create_formatted_idea(args.model, args.dataset, args.metric)
-    print("✅ Research idea created.")
-    print(f"📋 Idea Title: {idea['Title']}")
-    print(f"📊 Target Metric: {idea['Experiment']['Metric']}")
-
-    # Step 2: Generate and run the experiment code
-    print("\nStep 2: Generating and running experiment code...")
-    status, experiment_dir = scientist.code(idea=idea)
-
-    # If the experiments run successfully, proceed to writing the paper
-    if status is True:
-        print(f"✅ Experiments completed successfully. Results are in: {experiment_dir}")
-
-        # Step 3: Write a research paper based on the findings
-        print("\nStep 3: Writing a research paper...")
-        pdf_path = scientist.write(idea=idea, experiment_dir=experiment_dir)
-        if not pdf_path:
-            print("❌ Failed to write the paper.")
-            return
-        print(f"✅ Paper written and saved to: {pdf_path}")
-
-        # Step 4: Review the generated paper
-        print("\nStep 4: Reviewing the paper...")
-        review = scientist.review(pdf_path=pdf_path)
-        print("✅ Review complete.")
-        print("\n--- Paper Review ---")
-        print(review)
-        print("--------------------")
-    else:
-        print(
-            f"❌ Experiments failed. Check the logs in the experiment directory: {experiment_dir}"
+    triples = list(iter_triples(load_combinations(Path(args.json_file))))
+    if args.limit > 0:  # zero or a negative value keeps every triple
+        triples = triples[: args.limit]
+    total = len(triples)
+
+    if total == 0:
+        print("No evaluable triples found.")
+        return
+
+    os.makedirs("simple_results", exist_ok=True)
+    # "/" and "-" both become "_" so each triple maps to one flat directory name.
+    sanitize = str.maketrans("/-", "__")
+    rule = "=" * 50
+    summary = []
+
+    print(f"Starting evaluation of {total} triples...\n")
+    for idx, (model, dataset, metric) in enumerate(triples, start=1):
+        print(rule)
+        print(f"[{idx}/{total}] {model} | {dataset} | {metric}")
+        print(rule)
+        dir_name = f"{model}_{dataset}_{metric}".translate(sanitize)
+        out_dir = f"simple_results/{dir_name}"
+
+        try:
+            # Each triple gets its own DockerCoder bound to its own output directory.
+            coder = DockerCoder(model=args.llm_model, output_dir=out_dir)
+            success, message = coder.evaluate_model(
+                model_name=model,
+                dataset_name=dataset,
+                metric=metric,
+                max_runs=args.runs,
+                max_fixes=args.max_fixes,
+            )
+        except Exception as exc:  # record the failure and carry on with the next triple
+            success, message = False, f"Exception: {exc}"
+
+        summary.append(
+            {
+                "model": model,
+                "dataset": dataset,
+                "metric": metric,
+                "success": success,
+                "message": message,
+                "output_dir": out_dir if success else None,
+            }
         )
+
+    # Save summary
+    success_count = sum(1 for entry in summary if entry["success"])
+    success_rate = success_count / total
+    report = {
+        "total": total,
+        "success": success_count,
+        "failed": total - success_count,
+        "success_rate": success_rate,
+        "results": summary,
+    }
+    summary_path = "simple_results/batch_summary.json"
+    with open(summary_path, "w") as fh:
+        json.dump(report, fh, indent=2)
+
+    print("\nDone.")
+    print(f"Success: {success_count} / {total} ({success_rate * 100:.1f}%)")
+    print(f"Summary written to {summary_path}")
 
 
 if __name__ == "__main__":