12 changes: 12 additions & 0 deletions hud/cli/__init__.py
@@ -31,6 +31,7 @@
from .utils.config import set_env_values
from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
from .utils.logging import CaptureLogger
from .validate import validate_command

# Create the main Typer app
app = typer.Typer(
@@ -978,6 +979,17 @@ def quickstart() -> None:
clone("https://github.com/hud-evals/quickstart.git")


@app.command()
def validate(
source: str = typer.Argument( # type: ignore[arg-type]
...,
help="Tasks file path or dataset slug (e.g. ./tasks.json or hud-evals/SheetBench-50)",
),
) -> None:
"""Validate task files or datasets without running them."""
validate_command(source)


app.command(name="eval")(eval_command)


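For a quick smoke test of the new subcommand, Typer's test runner can invoke it in-process. A minimal sketch (the ./tasks.json path is illustrative):

from typer.testing import CliRunner

from hud.cli import app

runner = CliRunner()
# Invoke `hud validate <source>` without spawning a subprocess.
result = runner.invoke(app, ["validate", "./tasks.json"])
print(result.exit_code, result.output)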
117 changes: 117 additions & 0 deletions hud/cli/validate.py
@@ -0,0 +1,117 @@
"""Validate task files or datasets."""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any, cast

import typer
from pydantic import ValidationError

from hud.datasets import load_tasks
from hud.eval.utils import validate_v4_task
from hud.types import Task
from hud.utils.hud_console import hud_console


def validate_command(source: str) -> None:
"""Validate tasks from a file or HuggingFace dataset."""
try:
raw_tasks, type_errors = _load_raw_tasks(source)
except Exception as e:
hud_console.error(f"Failed to load tasks: {e}")
raise typer.Exit(1) from e

errors: list[str] = []
errors.extend(type_errors)
for idx, task in enumerate(raw_tasks):
label = task.get("id") or f"index {idx}"
try:
if _looks_like_v4(task):
validate_v4_task(task)
Task(**_as_dict(task))
except ValidationError as e:
errors.append(f"{label}: {e}")
except Exception as e:
errors.append(f"{label}: {e}")

if errors:
hud_console.error(f"Found {len(errors)} invalid task(s).")
for err in errors:
hud_console.error(f"- {err}")
raise typer.Exit(1)

hud_console.success(f"Validated {len(raw_tasks)} task(s).")


def _as_dict(task: Any) -> dict[str, Any]:
if isinstance(task, dict):
return task
try:
return dict(task)
except Exception:
return {}


def _looks_like_v4(task: dict[str, Any]) -> bool:
return any(
key in task
for key in ("prompt", "mcp_config", "evaluate_tool", "setup_tool", "integration_test_tool")
)


def _load_raw_tasks(source: str) -> tuple[list[dict[str, Any]], list[str]]:
path = Path(source)
if path.exists() and path.suffix.lower() in {".json", ".jsonl"}:
Review comment (Low Severity): Case sensitivity mismatch between validation and loading

The _load_raw_tasks and _load_raw_from_file functions use case-insensitive extension matching via .suffix.lower(), while the existing load_tasks function in loader.py uses case-sensitive matching. This means a file like tasks.JSONL would pass validation but fail when actually loaded via load_tasks, because loader.py wouldn't recognize the uppercase extension and would incorrectly try to fetch it as a HuggingFace dataset.
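One possible fix, sketched below: route both code paths through a single case-insensitive check so they cannot drift apart. The is_task_file helper name and its placement are assumptions, not part of this PR:

from pathlib import Path

_TASK_FILE_SUFFIXES = {".json", ".jsonl"}


def is_task_file(source: str) -> bool:
    """Return True if `source` is an existing .json/.jsonl file, any case."""
    path = Path(source)
    # Lowercasing here means tasks.JSONL behaves the same in validate.py
    # and loader.py once both call this helper.
    return path.exists() and path.suffix.lower() in _TASK_FILE_SUFFIXES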

return _load_raw_from_file(path)
return cast("list[dict[str, Any]]", load_tasks(source, raw=True)), []


def _load_raw_from_file(path: Path) -> tuple[list[dict[str, Any]], list[str]]:
errors: list[str] = []
items: list[dict[str, Any]] = []

if path.suffix.lower() == ".jsonl":
with open(path, encoding="utf-8") as f:
for line_no, line in enumerate(f, start=1):
line = line.strip()
if not line:
continue
try:
value = json.loads(line)
except json.JSONDecodeError as e:
errors.append(f"line {line_no}: invalid JSON ({e.msg})")
continue
if isinstance(value, dict):
items.append(value)
continue
if isinstance(value, list):
for idx, entry in enumerate(value):
if isinstance(entry, dict):
items.append(entry)
else:
entry_type = type(entry).__name__
errors.append(
f"line {line_no} item {idx}: expected object, got {entry_type}"
)
continue
errors.append(
f"line {line_no}: expected object or list, got {type(value).__name__}"
)
return items, errors

with open(path, encoding="utf-8") as f:
value = json.load(f)

if isinstance(value, dict):
return [value], errors
if isinstance(value, list):
for idx, entry in enumerate(value):
if isinstance(entry, dict):
items.append(entry)
else:
errors.append(f"index {idx}: expected object, got {type(entry).__name__}")
return items, errors

raise ValueError(f"JSON file must contain an object or array, got {type(value).__name__}")
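To see the three per-line JSONL branches above in action (a task object, a list of task objects, anything else), a small demo; importing the leading-underscore helper is for illustration only:

import json
from pathlib import Path

from hud.cli.validate import _load_raw_from_file  # private helper; demo only

# One line per branch: a dict, a list containing a dict, and a bare string.
lines = [
    json.dumps({"prompt": "a", "mcp_config": {}, "evaluate_tool": {"name": "done", "arguments": {}}}),
    json.dumps([{"prompt": "b", "mcp_config": {}, "evaluate_tool": {"name": "done", "arguments": {}}}]),
    json.dumps("not a task"),
]
Path("demo.jsonl").write_text("\n".join(lines), encoding="utf-8")

items, errors = _load_raw_from_file(Path("demo.jsonl"))
assert len(items) == 2   # the dict plus the dict inside the list
assert len(errors) == 1  # line 3: expected object or list, got str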
58 changes: 58 additions & 0 deletions hud/tests/test_validate_cli.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import importlib.util
import json
from pathlib import Path

import pytest
import typer


def _load_validate_command():
module_path = Path(__file__).resolve().parents[1] / "cli" / "validate.py"
spec = importlib.util.spec_from_file_location("hud.cli.validate", module_path)
module = importlib.util.module_from_spec(spec) # type: ignore[arg-type]
assert spec and spec.loader
spec.loader.exec_module(module)
return module.validate_command
Review comment (Medium Severity): Test uses unnecessarily complex importlib module loading

The _load_validate_command() function uses importlib.util.spec_from_file_location to manually load the module when a simple import would work: from hud.cli.validate import validate_command. This pattern is inconsistent with other tests in hud/cli/tests/ which use standard imports.
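Rewritten with a plain import, the first test would look roughly like this (assuming hud.cli.validate imports cleanly in the test environment):

import json
from pathlib import Path

from hud.cli.validate import validate_command  # plain import, no importlib


def test_validate_command_valid(tmp_path: Path) -> None:
    tasks = [
        {
            "prompt": "Say hello",
            "mcp_config": {"local": {"command": "echo", "args": ["hi"]}},
            "evaluate_tool": {"name": "done", "arguments": {}},
        }
    ]
    path = tmp_path / "tasks.json"
    path.write_text(json.dumps(tasks), encoding="utf-8")
    validate_command(str(path))  # should not raise for a valid task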



def _write_tasks(path: Path, tasks: list[dict]) -> str:
path.write_text(json.dumps(tasks), encoding="utf-8")
return str(path)


def test_validate_command_valid(tmp_path: Path) -> None:
validate_command = _load_validate_command()
tasks = [
{
"prompt": "Say hello",
"mcp_config": {"local": {"command": "echo", "args": ["hi"]}},
"evaluate_tool": {"name": "done", "arguments": {}},
}
]
path = _write_tasks(tmp_path / "tasks.json", tasks)
validate_command(path)


def test_validate_command_invalid(tmp_path: Path) -> None:
validate_command = _load_validate_command()
tasks = [{"mcp_config": {"local": {"command": "echo", "args": ["hi"]}}}]
path = _write_tasks(tmp_path / "tasks.json", tasks)
with pytest.raises(typer.Exit):
validate_command(path)


def test_validate_command_flags_non_dict_entries(tmp_path: Path) -> None:
validate_command = _load_validate_command()
tasks = [
{
"prompt": "ok",
"mcp_config": {"local": {"command": "echo", "args": ["hi"]}},
"evaluate_tool": {"name": "done", "arguments": {}},
},
"not a task",
]
path = _write_tasks(tmp_path / "tasks.json", tasks)
with pytest.raises(typer.Exit):
validate_command(path)