12 changes: 12 additions & 0 deletions hud/cli/__init__.py
@@ -31,6 +31,7 @@
from .utils.config import set_env_values
from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
from .utils.logging import CaptureLogger
from .validate import validate_command

# Create the main Typer app
app = typer.Typer(
@@ -978,6 +979,17 @@ def quickstart() -> None:
clone("https://github.com/hud-evals/quickstart.git")


@app.command()
def validate(
source: str = typer.Argument( # type: ignore[arg-type]
...,
help="Tasks file path or dataset slug (e.g. ./tasks.json or hud-evals/SheetBench-50)",
),
) -> None:
"""Validate task files or datasets without running them."""
validate_command(source)


app.command(name="eval")(eval_command)


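For a quick smoke test of the new subcommand, Typer's test runner can invoke it in-process. A minimal sketch (the ./tasks.json path is illustrative):

from typer.testing import CliRunner

from hud.cli import app

runner = CliRunner()
# Invoke `hud validate <source>` without spawning a subprocess.
result = runner.invoke(app, ["validate", "./tasks.json"])
print(result.exit_code, result.output)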
117 changes: 117 additions & 0 deletions hud/cli/validate.py
@@ -0,0 +1,117 @@
"""Validate task files or datasets."""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any, cast

import typer
from pydantic import ValidationError

from hud.datasets import load_tasks
from hud.eval.utils import validate_v4_task
from hud.types import Task
from hud.utils.hud_console import hud_console


def validate_command(source: str) -> None:
"""Validate tasks from a file or HuggingFace dataset."""
try:
raw_tasks, type_errors = _load_raw_tasks(source)
except Exception as e:
hud_console.error(f"Failed to load tasks: {e}")
raise typer.Exit(1) from e

errors: list[str] = []
errors.extend(type_errors)
for idx, task in enumerate(raw_tasks):
label = task.get("id") or f"index {idx}"
try:
if _looks_like_v4(task):
validate_v4_task(task)
Task(**_as_dict(task))
except ValidationError as e:
errors.append(f"{label}: {e}")
except Exception as e:
errors.append(f"{label}: {e}")

if errors:
hud_console.error(f"Found {len(errors)} invalid task(s).")
for err in errors:
hud_console.error(f"- {err}")
raise typer.Exit(1)

hud_console.success(f"Validated {len(raw_tasks)} task(s).")


def _as_dict(task: Any) -> dict[str, Any]:
if isinstance(task, dict):
return task
try:
return dict(task)
except Exception:
return {}


def _looks_like_v4(task: dict[str, Any]) -> bool:
return any(
key in task
for key in ("prompt", "mcp_config", "evaluate_tool", "setup_tool", "integration_test_tool")
)


def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]:
path = Path(source)
if path.exists() and path.suffix.lower() in {".json", ".jsonl"}:
Review comment (Low Severity): Case sensitivity mismatch between validation and loading

The _load_raw_tasks and _load_raw_from_file functions use case-insensitive extension matching via .suffix.lower(), while the existing load_tasks function in loader.py uses case-sensitive matching. This means a file like tasks.JSONL would pass validation but fail when actually loaded via load_tasks, because loader.py wouldn't recognize the uppercase extension and would incorrectly try to fetch it as a HuggingFace dataset.
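One possible fix, sketched below: route both code paths through a single case-insensitive check so they cannot drift apart. The is_task_file helper name and its placement are assumptions, not part of this PR:

from pathlib import Path

_TASK_FILE_SUFFIXES = {".json", ".jsonl"}


def is_task_file(source: str) -> bool:
    """Return True if `source` is an existing .json/.jsonl file, any case."""
    path = Path(source)
    # Lowercasing here means tasks.JSONL behaves the same in validate.py
    # and loader.py once both call this helper.
    return path.exists() and path.suffix.lower() in _TASK_FILE_SUFFIXES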

return _load_raw_from_file(path)
return cast("list[dict[str, Any]]", load_tasks(source, raw=True)), []


def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]:
errors: list[str] = []
items: list[dict[str, Any]] = []

if path.suffix.lower() == ".jsonl":
with open(path, encoding="utf-8") as f:
for line_no, line in enumerate(f, start=1):
line = line.strip()
if not line:
continue
try:
value = json.loads(line)
except json.JSONDecodeError as e:
errors.append(f"line {line_no}: invalid JSON ({e.msg})")
continue
if isinstance(value, dict):
items.append(value)
continue
if isinstance(value, list):
for idx, entry in enumerate(value):
if isinstance(entry, dict):
items.append(entry)
else:
entry_type = type(entry).__name__
errors.append(
f"line {line_no} item {idx}: expected object, got {entry_type}"
)
continue
errors.append(
f"line {line_no}: expected object or list, got {type(value).__name__}"
)
return items, errors

with open(path, encoding="utf-8") as f:
value = json.load(f)

if isinstance(value, dict):
return [value], errors
if isinstance(value, list):
for idx, entry in enumerate(value):
if isinstance(entry, dict):
items.append(entry)
else:
errors.append(f"index {idx}: expected object, got {type(entry).__name__}")
return items, errors

raise ValueError(f"JSON file must contain an object or array, got {type(value).__name__}")
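To see the three per-line JSONL branches above in action (a task object, a list of task objects, anything else), a small demo; importing the leading-underscore helper is for illustration only:

import json
from pathlib import Path

from hud.cli.validate import _load_raw_from_file  # private helper; demo only

# One line per branch: a dict, a list containing a dict, and a bare string.
lines = [
    json.dumps({"prompt": "a", "mcp_config": {}, "evaluate_tool": {"name": "done", "arguments": {}}}),
    json.dumps([{"prompt": "b", "mcp_config": {}, "evaluate_tool": {"name": "done", "arguments": {}}}]),
    json.dumps("not a task"),
]
Path("demo.jsonl").write_text("\n".join(lines), encoding="utf-8")

items, errors = _load_raw_from_file(Path("demo.jsonl"))
assert len(items) == 2   # the dict plus the dict inside the list
assert len(errors) == 1  # line 3: expected object or list, got str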
58 changes: 58 additions & 0 deletions hud/tests/test_validate_cli.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import importlib.util
import json
from pathlib import Path

import pytest
import typer


def _load_validate_command():
module_path = Path(__file__).resolve().parents[1] / "cli" / "validate.py"
spec = importlib.util.spec_from_file_location("hud.cli.validate", module_path)
module = importlib.util.module_from_spec(spec) # type: ignore[arg-type]
assert spec and spec.loader
spec.loader.exec_module(module)
return module.validate_command
Review comment (Medium Severity): Test uses unnecessarily complex importlib module loading

The _load_validate_command() function uses importlib.util.spec_from_file_location to manually load the module when a simple import would work: from hud.cli.validate import validate_command. This pattern is inconsistent with other tests in hud/cli/tests/ which use standard imports.
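Rewritten with a plain import, the first test would look roughly like this (assuming hud.cli.validate imports cleanly in the test environment):

import json
from pathlib import Path

from hud.cli.validate import validate_command  # plain import, no importlib


def test_validate_command_valid(tmp_path: Path) -> None:
    tasks = [
        {
            "prompt": "Say hello",
            "mcp_config": {"local": {"command": "echo", "args": ["hi"]}},
            "evaluate_tool": {"name": "done", "arguments": {}},
        }
    ]
    path = tmp_path / "tasks.json"
    path.write_text(json.dumps(tasks), encoding="utf-8")
    validate_command(str(path))  # should not raise for a valid task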



def _write_tasks(path: Path, tasks: list[dict]) -> str:
path.write_text(json.dumps(tasks), encoding="utf-8")
return str(path)


def test_validate_command_valid(tmp_path: Path) -> None:
validate_command = _load_validate_command()
tasks = [
{
"prompt": "Say hello",
"mcp_config": {"local": {"command": "echo", "args": ["hi"]}},
"evaluate_tool": {"name": "done", "arguments": {}},
}
]
path = _write_tasks(tmp_path / "tasks.json", tasks)
validate_command(path)


def test_validate_command_invalid(tmp_path: Path) -> None:
validate_command = _load_validate_command()
tasks = [{"mcp_config": {"local": {"command": "echo", "args": ["hi"]}}}]
path = _write_tasks(tmp_path / "tasks.json", tasks)
with pytest.raises(typer.Exit):
validate_command(path)


def test_validate_command_flags_non_dict_entries(tmp_path: Path) -> None:
validate_command = _load_validate_command()
tasks = [
{
"prompt": "ok",
"mcp_config": {"local": {"command": "echo", "args": ["hi"]}},
"evaluate_tool": {"name": "done", "arguments": {}},
},
"not a task",
]
path = _write_tasks(tmp_path / "tasks.json", tasks)
with pytest.raises(typer.Exit):
validate_command(path)