diff --git a/docs/advanced/harbor-convert.mdx b/docs/advanced/harbor-convert.mdx
index 358335df..bfdd9ef9 100644
--- a/docs/advanced/harbor-convert.mdx
+++ b/docs/advanced/harbor-convert.mdx
@@ -15,7 +15,7 @@ git clone https://github.com/laude-institute/terminal-bench-2.git
 # 2. Convert to HUD format
 hud convert ./terminal-bench-2/ --output ./tb2-hud
 
-# 3. Deploy all environments
+# 3. Deploy all environments (~3 min per environment, leave it running)
 hud deploy ./tb2-hud --all
 
 # 4. Run evaluation
@@ -24,6 +24,11 @@ hud eval ./tb2-hud/taskset.json
 
 That's it. The converter handles Dockerfile adaptation, build context, test scripts, and reward parsing automatically.
 
+Each environment takes roughly 3 minutes to build and deploy. For datasets with many environments,
+`hud deploy --all` runs them sequentially -- just leave it running and check back when it's done.
+
+
 ## What Gets Converted
 
 A Harbor task directory:
@@ -81,9 +86,29 @@ Harbor test scripts write results to `/logs/verifier/`. The converter supports b
 - `reward.txt` -- a single float (`1.0` for pass, `0.0` for fail)
 - `reward.json` -- `{"reward": 1.0}` or just a float
 
-## Running Programmatically
+## Running Tasks
+
+### Option 1: Upload as a Taskset (recommended)
+
+The generated `taskset.json` can be uploaded directly to the HUD platform for managed evaluation, leaderboards, and comparison across models:
+
+1. Go to [hud.ai/evalsets](https://hud.ai/evalsets) and create a new taskset
+2. Click **Upload Tasks** and paste the contents of `taskset.json`
+3. Run evaluations from the platform UI or via `hud eval`
+
+See the [Tasksets guide](/platform/tasksets) for full details on creating and managing tasksets.
+
+### Option 2: CLI eval
+
+Run the taskset directly from the command line:
+
+```bash
+hud eval ./tb2-hud/taskset.json
+```
+
+### Option 3: Python SDK
 
-You can also run converted tasks from Python using the SDK:
+Run tasks programmatically with any agent:
 
 ```python
 import asyncio
@@ -108,7 +133,7 @@ async def main():
 
 asyncio.run(main())
 ```
 
-Or load the full taskset:
+Or load the full taskset as Task objects:
 
 ```python
 import json
diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py
index daf9ff8c..ab22eea5 100644
--- a/hud/cli/__init__.py
+++ b/hud/cli/__init__.py
@@ -1026,46 +1026,153 @@ def get(
 @app.command()
 def convert(
-    tasks_file: str = typer.Argument(
-        ..., help="Path to tasks file (JSON/JSONL) to convert to remote MCP configuration"
+    path: str = typer.Argument(
+        ..., help="Path to source tasks/dataset directory to convert to HUD format"
+    ),
+    from_format: str = typer.Option(
+        "auto",
+        "--from",
+        "-f",
+        help="Source format (auto, harbor). Use 'auto' to detect automatically.",
+    ),
+    output: str | None = typer.Option(
+        None,
+        "--output",
+        "-o",
+        help="Output directory (default: ./hud_converted)",
     ),
 ) -> None:
-    """Convert local MCP task configs to remote (mcp.hud.ai) format.
+    """Convert external benchmark formats to HUD environments + tasksets.
 
-    This mirrors the implicit conversion flow used by 'hud rl' and writes a new
-    remote_.json next to the source file when needed.
+    [not dim]Converts tasks from frameworks like Harbor into HUD-compatible
+    environments (env.py + Dockerfile.hud) and v5 taskset files.
+
+    Supports pluggable formats. Currently: harbor.
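+
+    Output: one directory per generated environment (env.py, Dockerfile.hud,
+    pyproject.toml, tasks/) plus a single taskset.json at the output root.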
+
+    Examples:
+        hud convert ./algotune/               # Auto-detect, convert dataset
+        hud convert ./my-task/ --from harbor  # Explicit format
+        hud convert ./dataset/ --output ./out # Custom output directory[/not dim]
     """
     from pathlib import Path
 
+    from .convert import detect_format, get_converter, list_formats, write_result
+
     hud_console = HUDConsole()
+    source_path = Path(path).resolve()
 
-    try:
-        from .flows.tasks import convert_tasks_to_remote
+    if not source_path.exists():
+        hud_console.error(f"Path does not exist: {path}")
+        raise typer.Exit(1)
 
-        result_path = convert_tasks_to_remote(tasks_file)
+    # Resolve converter
+    if from_format == "auto":
+        converter = detect_format(source_path)
+        if converter is None:
+            # Auto-detect failed — prompt user to pick a format
+            available = list_formats()
+            if not available:
+                hud_console.error("No converters registered.")
+                raise typer.Exit(1)
+
+            if len(available) == 1:
+                # Only one format exists, just use it
+                converter = get_converter(available[0][0])
+                if converter:
+                    hud_console.info(f"Using format: {converter.name}")
+            else:
+                import questionary
+
+                choices = [
+                    questionary.Choice(title=f"{name} — {desc}", value=name)
+                    for name, desc in available
+                ]
+                picked = questionary.select(
+                    "Could not auto-detect format. Which format is this?",
+                    choices=choices,
+                ).ask()
+                if not picked:
+                    raise typer.Exit(1)
+                converter = get_converter(picked)
 
-        # If nothing changed, inform the user
-        try:
-            if Path(result_path).resolve() == Path(tasks_file).resolve():
-                hud_console.success(
-                    "Tasks already reference remote MCP URLs. No conversion needed."
-                )
-                hud_console.hint("You can run them directly with: hud eval --full")
-                return
-        except Exception as e:
-            # Best effort; continue with success message
-            hud_console.debug(f"Path comparison failed, continuing: {e}")
-
-        hud_console.success(f"Converted tasks written to: {result_path}")
-        hud_console.hint(
-            "You can now run remote flows: hud rl or hud eval "
-        )
-    except typer.Exit:
-        raise
+            if converter is None:
+                hud_console.error("No converter selected.")
+                raise typer.Exit(1)
+        else:
+            hud_console.info(f"Detected format: {converter.name}")
+    else:
+        converter = get_converter(from_format)
+        if converter is None:
+            hud_console.error(f"Unknown format: {from_format}")
+            available = list_formats()
+            if available:
+                hud_console.info("Available formats:")
+                for name, desc in available:
+                    hud_console.info(f"  {name}: {desc}")
+            raise typer.Exit(1)
+
+    # Run conversion
+    try:
+        result = converter.convert(source_path)
+    except ValueError as e:
+        hud_console.error(str(e))
+        raise typer.Exit(1) from e
+    except Exception as e:
+        hud_console.error(f"Conversion failed: {e}")
+        raise typer.Exit(1) from e
+
+    # Write output
+    output_dir = Path(output) if output else Path("./hud_converted")
+    try:
+        taskset_path = write_result(result, output_dir.resolve())
     except Exception as e:
-        hud_console.error(f"Failed to convert tasks: {e}")
+        hud_console.error(f"Failed to write output: {e}")
         raise typer.Exit(1) from e
 
+    # Display results
+    hud_console.header("Convert Complete")
+    hud_console.info("")
+
+    total_tasks = len(result.taskset)
+    total_envs = len(result.environments)
+    hud_console.success(f"Converted {total_tasks} task(s) into {total_envs} environment(s).")
+    hud_console.info("")
+
+    # Show each environment
+    hud_console.section_title("Environments")
+    for env_gen in result.environments:
+        task_count = len(env_gen.task_dirs)
+        hud_console.status_item(env_gen.name, f"{task_count} tasks")
+    hud_console.info("")
+
+    # Show output paths
+    hud_console.section_title("Output")
+    hud_console.status_item("Directory", str(output_dir.resolve()))
+    hud_console.status_item("Taskset", str(taskset_path))
+    hud_console.info("")
+
+    # Show next steps with numbered commands
+    hud_console.section_title("Next Steps")
+    hud_console.info("")
+
+    hud_console.info("1. Deploy environment(s):")
+    if total_envs > 1:
+        hud_console.command_example(
+            f"hud deploy {output_dir.resolve()} --all",
+            f"Deploy all {total_envs} environments",
+        )
+    else:
+        first_env = result.environments[0].name if result.environments else ""
+        hud_console.command_example(
+            f"hud deploy {output_dir.resolve() / first_env}",
+            "Build & deploy to HUD platform",
+        )
+    hud_console.info("")
+
+    hud_console.info("2. Run evaluation:")
+    hud_console.command_example(f"hud eval {taskset_path}", "Run agent against tasks")
+    hud_console.info("")
+
 
 @app.command()
 def cancel(
diff --git a/hud/cli/convert/__init__.py b/hud/cli/convert/__init__.py
new file mode 100644
index 00000000..c30ef455
--- /dev/null
+++ b/hud/cli/convert/__init__.py
@@ -0,0 +1,177 @@
+"""Pluggable format conversion system for HUD.
+
+Converts external benchmark formats (Harbor, Inspect AI, etc.) into
+HUD environments + tasksets.
+
+Usage:
+    hud convert <path>                  # Auto-detect format
+    hud convert <path> --from harbor    # Explicit format
+    hud convert <path> --output ./out   # Custom output directory
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import shutil
+from pathlib import Path  # noqa: TC003 - used at runtime
+
+from .base import BaseConverter, ConvertResult, GeneratedEnvironment
+
+__all__ = [
+    "BaseConverter",
+    "ConvertResult",
+    "GeneratedEnvironment",
+    "detect_format",
+    "get_converter",
+    "list_formats",
+    "write_result",
+]
+
+LOGGER = logging.getLogger(__name__)
+
+# Shell script extensions that need CRLF -> LF normalization
+_SHELL_EXTENSIONS = frozenset({".sh", ".bash", ".zsh", ".ksh"})
+
+
+def _normalize_line_endings(directory: Path) -> None:
+    """Convert CRLF to LF in all shell scripts under a directory.
+
+    Git on Windows with autocrlf=true converts LF to CRLF on checkout.
+    Shell scripts with CRLF break on Linux (e.g., shebang errors,
+    'set: pipefail\\r: invalid option name').
+    """
+    for path in directory.rglob("*"):
+        if path.is_file() and path.suffix in _SHELL_EXTENSIONS:
+            raw = path.read_bytes()
+            if b"\r" in raw:
+                path.write_bytes(raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n"))
+                LOGGER.debug("Normalized line endings: %s", path)
+
+
+# ---------------------------------------------------------------------------
+# Converter registry
+# ---------------------------------------------------------------------------
+
+# Lazy-loaded to avoid import cost on unrelated CLI commands
+_converters: list[BaseConverter] | None = None
+
+
+def _load_converters() -> list[BaseConverter]:
+    global _converters
+    if _converters is None:
+        from .harbor import HarborConverter
+
+        _converters = [
+            HarborConverter(),
+            # Future: InspectConverter(), METRConverter(), ...
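+            # Order matters: detect_format() returns the first converter whose detect() passes.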
+ ] + return _converters + + +def get_converter(name: str) -> BaseConverter | None: + """Get a converter by its short name (e.g., 'harbor').""" + for c in _load_converters(): + if c.name == name: + return c + return None + + +def detect_format(path: Path) -> BaseConverter | None: + """Auto-detect which converter can handle the given path.""" + for c in _load_converters(): + if c.detect(path): + return c + return None + + +def list_formats() -> list[tuple[str, str]]: + """Return (name, description) pairs for all registered converters.""" + return [(c.name, c.description) for c in _load_converters()] + + +# --------------------------------------------------------------------------- +# Output writer +# --------------------------------------------------------------------------- + + +def write_result(result: ConvertResult, output_dir: Path) -> Path: + """Write conversion results to disk. + + Creates the output directory structure: + output_dir/ + ├── env-name-a/ + │ ├── env.py + │ ├── Dockerfile.hud + │ ├── pyproject.toml + │ └── tasks/ + │ └── / (copied from source, minus environment/ & solution/) + └── taskset.json + + Returns the path to the generated taskset.json. + """ + output_dir.mkdir(parents=True, exist_ok=True) + + for env_gen in result.environments: + env_dir = output_dir / env_gen.name + env_dir.mkdir(parents=True, exist_ok=True) + + # Write generated files + (env_dir / "env.py").write_text(env_gen.env_py, encoding="utf-8") + (env_dir / "Dockerfile.hud").write_text(env_gen.dockerfile, encoding="utf-8") + (env_dir / "pyproject.toml").write_text(env_gen.pyproject_toml, encoding="utf-8") + + # Copy build context files from source environment/ directory + # (e.g., warriors/*.red that Harbor Dockerfiles reference via COPY) + if env_gen.build_context_source and env_gen.build_context_source.is_dir(): + for item in env_gen.build_context_source.iterdir(): + # Skip the Dockerfile itself (we already generated Dockerfile.hud) + if item.name.lower() in ("dockerfile", "dockerfile.hud"): + continue + dest_item = env_dir / item.name + if dest_item.exists(): + if dest_item.is_dir(): + shutil.rmtree(dest_item) + else: + dest_item.unlink() + if item.is_dir(): + shutil.copytree(item, dest_item) + else: + shutil.copy2(item, dest_item) + + # Copy task data directories (skip environment/ and solution/) + tasks_dir = env_dir / "tasks" + tasks_dir.mkdir(parents=True, exist_ok=True) + + for task_id, source_dir in env_gen.task_dirs.items(): + dest = tasks_dir / task_id + if dest.exists(): + shutil.rmtree(dest) + dest.mkdir(parents=True, exist_ok=True) + + for item in source_dir.iterdir(): + # Skip dirs that are handled by the Dockerfile or ignored + if item.name in ("environment", "solution"): + continue + if item.is_dir(): + shutil.copytree(item, dest / item.name) + else: + shutil.copy2(item, dest / item.name) + + # Normalize CRLF -> LF in all shell scripts (fixes Windows git checkout) + _normalize_line_endings(env_dir) + + LOGGER.info( + "Wrote environment '%s' with %d task(s)", + env_gen.name, + len(env_gen.task_dirs), + ) + + # Write taskset + taskset_path = output_dir / "taskset.json" + with open(taskset_path, "w", encoding="utf-8") as f: + json.dump(result.taskset, f, ensure_ascii=False, indent=2) + f.write("\n") + + LOGGER.info("Wrote taskset with %d task(s) to %s", len(result.taskset), taskset_path) + return taskset_path diff --git a/hud/cli/convert/base.py b/hud/cli/convert/base.py new file mode 100644 index 00000000..4fa86f09 --- /dev/null +++ b/hud/cli/convert/base.py @@ -0,0 +1,78 @@ +"""Abstract 
+
+The converter system is pluggable: each format (Harbor, Inspect AI, etc.)
+implements BaseConverter with detect() and convert() methods. The CLI
+auto-detects format or lets the user specify explicitly.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, Field
+
+__all__ = ["BaseConverter", "ConvertResult", "GeneratedEnvironment"]
+
+
+class GeneratedEnvironment(BaseModel):
+    """A generated HUD environment ready to be written to disk.
+
+    Attributes:
+        name: Environment name (e.g., "hud-harbor-algotune")
+        env_py: Generated env.py file content
+        dockerfile: Generated Dockerfile.hud content
+        pyproject_toml: Generated pyproject.toml content
+        task_dirs: Mapping of task_id -> source directory path.
+            Files from these directories (minus environment/ and solution/)
+            are copied into the output's tasks/ subdirectory.
+        build_context_source: Optional path to a source directory whose
+            non-Dockerfile contents should be copied into the environment
+            root as Docker build context (e.g., Harbor's environment/ dir).
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    name: str
+    env_py: str
+    dockerfile: str
+    pyproject_toml: str
+    task_dirs: dict[str, Path]
+    build_context_source: Path | None = None
+
+
+class ConvertResult(BaseModel):
+    """Result of converting a source format to HUD.
+
+    Attributes:
+        environments: Generated environment definitions (one per unique env group)
+        taskset: List of v5 Task dicts ready for taskset.json
+        summary: Human-readable summary lines for CLI output
+    """
+
+    environments: list[GeneratedEnvironment]
+    taskset: list[dict[str, Any]]
+    summary: list[str] = Field(default_factory=list)
+
+
+class BaseConverter(ABC):
+    """Abstract base for format converters.
+
+    Subclasses must define:
+        name: Short identifier (used with --from flag)
+        description: Human-readable description (shown in CLI help)
+        detect(): Check if a path matches this format
+        convert(): Perform the conversion
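+
+    Example (sketch of a hypothetical converter):
+        class MyFormatConverter(BaseConverter):
+            name = "myformat"
+            description = "My benchmark format"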
+    """
+
+    name: str
+    description: str
+
+    @abstractmethod
+    def detect(self, path: Path) -> bool:
+        """Return True if this converter can handle the given path."""
+
+    @abstractmethod
+    def convert(self, path: Path) -> ConvertResult:
+        """Convert the source at path to HUD format."""
diff --git a/hud/cli/convert/harbor.py b/hud/cli/convert/harbor.py
new file mode 100644
index 00000000..dc745bc9
--- /dev/null
+++ b/hud/cli/convert/harbor.py
@@ -0,0 +1,565 @@
+"""Harbor → HUD converter.
+
+Converts Harbor framework tasks (task.toml + instruction.md + environment/ + tests/)
+into HUD environments with scenarios and tasksets.
+
+Harbor task structure:
+    task_name/
+    ├── instruction.md      # Agent prompt
+    ├── task.toml           # Config: timeouts, metadata, resources
+    ├── environment/
+    │   └── Dockerfile      # Container the agent runs in
+    ├── tests/
+    │   └── test.sh         # Verification → writes reward.txt
+    └── solution/           # Optional (ignored)
+
+HUD output:
+    hud-harbor-{dataset}/
+    ├── env.py              # Environment with run-task scenario
+    ├── Dockerfile.hud      # Harbor Dockerfile + HUD MCP layer
+    ├── pyproject.toml
+    └── tasks/              # All task data baked into image
+        ├── task-a/
+        │   ├── instruction.md
+        │   └── tests/test.sh
+        └── task-b/
+            ├── instruction.md
+            └── tests/test.sh
+    taskset.json            # v5 taskset referencing the env
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+import tomllib
+from dataclasses import dataclass
+from pathlib import Path  # noqa: TC003 - used at runtime
+from typing import Any
+
+from .base import BaseConverter, ConvertResult, GeneratedEnvironment
+
+__all__ = ["HarborConverter"]
+
+LOGGER = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Helpers
+# =============================================================================
+
+
+def _is_harbor_task(path: Path) -> bool:
+    """Check if a directory looks like a valid Harbor task."""
+    return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists()
+
+
+def _hash_directory(path: Path) -> str:
+    """Content-hash a directory for grouping tasks by identical environments."""
+    hasher = hashlib.sha256()
+    if not path.exists():
+        return "empty"
+    for file_path in sorted(path.rglob("*")):
+        if file_path.is_file():
+            hasher.update(str(file_path.relative_to(path)).encode())
+            hasher.update(file_path.read_bytes())
+    return hasher.hexdigest()[:16]
+
+
+def _normalize_name(name: str) -> str:
+    """Normalize a dataset name to a valid HUD environment name."""
+    normalized = name.strip().lower()
+    normalized = normalized.replace(" ", "-").replace("_", "-")
+    normalized = re.sub(r"[^a-z0-9-]", "", normalized)
+    normalized = re.sub(r"-+", "-", normalized)
+    return normalized.strip("-") or "converted"
+
+
+def _find_dockerfile(env_dir: Path) -> str | None:
+    """Read the Dockerfile from a Harbor environment directory."""
+    for name in ("Dockerfile", "dockerfile"):
+        path = env_dir / name
+        if path.exists():
+            return path.read_text(encoding="utf-8")
+    return None
+
+
+def _adapt_harbor_dockerfile(content: str) -> str:
+    """Comment out CMD/ENTRYPOINT lines from a Harbor Dockerfile.
+
+    These are replaced by the HUD MCP server entrypoint.
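+
+    Example:
+        'CMD ["bash"]'  ->  '# [harbor original] CMD ["bash"]'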
+ """ + lines = content.splitlines() + adapted: list[str] = [] + for line in lines: + stripped = line.strip().upper() + if stripped.startswith(("CMD ", "CMD[", "ENTRYPOINT ", "ENTRYPOINT[")): + adapted.append(f"# [harbor original] {line}") + else: + adapted.append(line) + return "\n".join(adapted) + + +# ============================================================================= +# Data classes +# ============================================================================= + + +@dataclass +class HarborTask: + """Parsed Harbor task.""" + + task_id: str + directory: Path + instruction: str + config: dict[str, Any] + env_hash: str + + +def _parse_task(task_dir: Path) -> HarborTask | None: + """Parse a Harbor task directory into a HarborTask.""" + try: + instruction = (task_dir / "instruction.md").read_text(encoding="utf-8") + except Exception: + LOGGER.warning("Failed to read instruction.md in %s", task_dir) + return None + + try: + raw = (task_dir / "task.toml").read_text(encoding="utf-8") + config: dict[str, Any] = tomllib.loads(raw) + except Exception: + LOGGER.warning("Failed to parse task.toml in %s", task_dir) + config = {} + + env_dir = task_dir / "environment" + env_hash = _hash_directory(env_dir) if env_dir.exists() else "no-env" + + return HarborTask( + task_id=task_dir.name, + directory=task_dir, + instruction=instruction, + config=config, + env_hash=env_hash, + ) + + +# ============================================================================= +# Templates +# ============================================================================= + +# fmt: off + +# Header + shared body split so the scenario signature can vary. +_ENV_PY_HEADER = '''\ +"""{env_name} - HUD environment converted from Harbor. + +Source: {source_path} +Tasks: {task_count} + +This environment runs Harbor-format tasks. Each task has: +- instruction.md: the agent prompt +- tests/test.sh: verification script that writes reward to /logs/verifier/ + +The run-task scenario reads the instruction, lets the agent work, +then executes the test script and parses the reward. +""" + +import json +import logging +import subprocess +from pathlib import Path +{extra_imports} +from hud import Environment +from hud.tools import BashTool, EditTool +from hud.tools.filesystem import GlobTool, GrepTool, ListTool, ReadTool + +LOGGER = logging.getLogger(__name__) + +TASKS_DIR = Path("/harbor/tasks") + +env = Environment("{env_name}") + +# Standard coding tools - agents interact via bash (matching Harbor's model) +env.add_tool(BashTool()) +env.add_tool(EditTool()) +env.add_tool(ReadTool()) +env.add_tool(GrepTool()) +env.add_tool(GlobTool()) +env.add_tool(ListTool()) + +''' + +# Single task: task_id is optional, defaults to the only task. +_SCENARIO_SINGLE = """\ +@env.scenario("run-task") +async def run_task(task_id: str = "{default_task_id}"): +""" + +# Multiple tasks: task_id is required, typed as a Literal. +_SCENARIO_MULTI = """\ +TaskId = Literal[{task_id_literal}] + + +@env.scenario("run-task") +async def run_task(task_id: TaskId): +""" + +_SCENARIO_BODY = '''\ + """Run a Harbor task by ID. + + Reads /harbor/tasks//instruction.md as the prompt. + After the agent works, runs tests/test.sh and parses + /logs/verifier/reward.txt or reward.json for the reward. + """ + task_dir = TASKS_DIR / str(task_id) + if not task_dir.exists(): + available = [d.name for d in TASKS_DIR.iterdir() if d.is_dir()] + raise ValueError( + f"Task '{{task_id}}' not found. 
+
+    # Read the task instruction
+    instruction = (task_dir / "instruction.md").read_text(encoding="utf-8")
+
+    # Setup: yield prompt to the agent
+    answer = yield instruction
+
+    # Ensure log output directory exists
+    logs_dir = Path("/logs/verifier")
+    logs_dir.mkdir(parents=True, exist_ok=True)
+
+    # Harbor mounts the task's tests/ directory at /tests/ — replicate that
+    tests_link = Path("/tests")
+    task_tests = task_dir / "tests"
+    if task_tests.is_dir():
+        if tests_link.is_symlink() or tests_link.exists():
+            tests_link.unlink()
+        tests_link.symlink_to(task_tests)
+
+    # Evaluate: run the test script
+    test_script = task_dir / "tests" / "test.sh"
+    if test_script.exists():
+        try:
+            result = subprocess.run(
+                ["bash", str(test_script)],
+                cwd="/app",
+                capture_output=True,
+                text=True,
+                timeout={verifier_timeout},
+                check=False,
+            )
+            if result.stdout:
+                LOGGER.info("test.sh stdout for %s:\\n%s", task_id, result.stdout[-2000:])
+            if result.stderr:
+                LOGGER.info("test.sh stderr for %s:\\n%s", task_id, result.stderr[-2000:])
+            if result.returncode != 0:
+                LOGGER.warning(
+                    "test.sh exited with code %d for task %s",
+                    result.returncode, task_id,
+                )
+        except subprocess.TimeoutExpired:
+            LOGGER.warning("Test script timed out for task %s", task_id)
+        except Exception as exc:
+            LOGGER.warning("Test script failed for task %s: %s", task_id, exc)
+    else:
+        LOGGER.warning("No test script found at %s", test_script)
+
+    # Parse and yield reward
+    yield _parse_harbor_reward()
+
+
+def _parse_harbor_reward() -> float:
+    """Parse reward from Harbor standard output locations.
+
+    Harbor test scripts write results to /logs/verifier/ as either:
+    - reward.txt: a single float value
+    - reward.json: {{"reward": float}} or just a float
+    """
+    reward_txt = Path("/logs/verifier/reward.txt")
+    reward_json = Path("/logs/verifier/reward.json")
+
+    if reward_txt.exists():
+        try:
+            return float(reward_txt.read_text(encoding="utf-8").strip())
+        except ValueError:
+            pass
+
+    if reward_json.exists():
+        try:
+            data = json.loads(reward_json.read_text(encoding="utf-8"))
+            if isinstance(data, dict):
+                return float(data.get("reward", 0.0))
+            return float(data)
+        except (ValueError, json.JSONDecodeError):
+            pass
+
+    return 0.0
+'''
+
+
+def _build_env_py(
+    env_name: str,
+    source_path: str,
+    task_ids: list[str],
+    verifier_timeout: int,
+) -> str:
+    """Build the env.py content, adapting the scenario signature to task count."""
+    if len(task_ids) == 1:
+        extra_imports = ""
+        scenario = _SCENARIO_SINGLE.format(default_task_id=task_ids[0])
+    else:
+        extra_imports = "\nfrom typing import Literal\n"
+        literal_values = ", ".join(f'"{tid}"' for tid in sorted(task_ids))
+        scenario = _SCENARIO_MULTI.format(task_id_literal=literal_values)
+
+    header = _ENV_PY_HEADER.format(
+        env_name=env_name,
+        source_path=source_path,
+        task_count=len(task_ids),
+        extra_imports=extra_imports,
+    )
+    body = _SCENARIO_BODY.format(verifier_timeout=verifier_timeout)
+    return header + scenario + body
+
+# fmt: on
+
+# Shared snippet: install uv standalone (works on any base image with curl or
+# apt), then use uv to bootstrap Python and sync dependencies.
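+# The apt-get fallback assumes a Debian-based image; bases that already ship
+# curl skip it entirely.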
+_HUD_LAYER = """\
+# ============================================================
+# HUD MCP server layer
+# ============================================================
+WORKDIR /hud
+
+# Install uv standalone (no pip/python required on the base image)
+RUN command -v curl >/dev/null 2>&1 || \\
+    (apt-get update -qq && \\
+     apt-get install -y -qq --no-install-recommends curl ca-certificates && \\
+     rm -rf /var/lib/apt/lists/*) && \\
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:$PATH"
+
+COPY pyproject.toml uv.lock* ./
+RUN uv sync --frozen --no-dev --no-install-project 2>/dev/null || \\
+    uv sync --no-dev --no-install-project
+
+# Harbor task data (instructions + test scripts baked into image)
+COPY tasks/ /harbor/tasks/
+
+# Ensure standard directories exist and are writable at runtime
+# (MCP server may run as non-root; Harbor tasks expect /app writable)
+RUN mkdir -p /logs/verifier /workspace /app && chmod 777 /logs/verifier /workspace /app
+
+COPY env.py ./
+
+CMD ["uv", "run", "--no-project", "python", "-m", "hud", "dev", "env:env", "--stdio"]
+"""
+
+DOCKERFILE_WITH_BASE_TEMPLATE = (
+    """\
+# ============================================================
+# Harbor environment base
+# Source: {source}
+# ============================================================
+{base_dockerfile}
+"""
+    + _HUD_LAYER
+)
+
+DOCKERFILE_FALLBACK_TEMPLATE = (
+    """\
+FROM python:3.11-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends \\
+    curl git build-essential && rm -rf /var/lib/apt/lists/*
+"""
+    + _HUD_LAYER
+)
+
+PYPROJECT_TEMPLATE = """\
+[project]
+name = "{name}"
+version = "0.1.0"
+requires-python = ">=3.10"
+dependencies = ["hud-python", "openai"]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+"""
+
+
+# =============================================================================
+# Converter
+# =============================================================================
+
+
+class HarborConverter(BaseConverter):
+    """Convert Harbor tasks/datasets to HUD format.
+
+    Handles:
+    - Single task directory (has task.toml directly)
+    - Dataset directory (subdirectories are Harbor tasks)
+    - Multi-environment datasets (tasks grouped by Dockerfile hash)
+    """
+
+    name = "harbor"
+    description = "Harbor framework (task.toml + instruction.md + environment/ + tests/)"
+
+    def detect(self, path: Path) -> bool:
+        if _is_harbor_task(path):
+            return True
+        # Check for dataset (directory containing task subdirectories)
+        if path.is_dir():
+            return any(_is_harbor_task(d) for d in path.iterdir() if d.is_dir())
+        return False
+
+    def convert(self, path: Path) -> ConvertResult:
+        path = path.resolve()
+
+        # Discover tasks
+        if _is_harbor_task(path):
+            task_dirs = [path]
+            dataset_name = path.parent.name
+        else:
+            task_dirs = sorted(d for d in path.iterdir() if d.is_dir() and _is_harbor_task(d))
+            dataset_name = path.name
+
+        if not task_dirs:
+            raise ValueError(f"No Harbor tasks found in {path}")
+
+        # Parse all tasks
+        tasks: list[HarborTask] = []
+        skipped = 0
+        for td in task_dirs:
+            parsed = _parse_task(td)
+            if parsed:
+                tasks.append(parsed)
+            else:
+                skipped += 1
+
+        if not tasks:
+            raise ValueError("All Harbor tasks failed to parse")
+
+        if skipped:
+            LOGGER.warning("Skipped %d task(s) that failed to parse", skipped)
+
+        LOGGER.info("Parsed %d Harbor task(s) from %s", len(tasks), path)
+
+        # Group by environment Dockerfile hash
+        groups: dict[str, list[HarborTask]] = {}
+        for task in tasks:
+            groups.setdefault(task.env_hash, []).append(task)
+
+        LOGGER.info("Found %d unique environment group(s)", len(groups))
+
+        # Generate environments and taskset
+        environments: list[GeneratedEnvironment] = []
+        taskset: list[dict[str, Any]] = []
+        base_name = f"hud-harbor-{_normalize_name(dataset_name)}"
+
+        # Sort groups by size (largest first) for consistent naming
+        sorted_groups = sorted(groups.items(), key=lambda x: -len(x[1]))
+
+        for idx, (_env_hash, group_tasks) in enumerate(sorted_groups, start=1):
+            # Naming: single group gets base_name, multiple get suffix
+            env_name = base_name if len(sorted_groups) == 1 else f"{base_name}-g{idx}"
+
+            # Use representative task for shared config
+            rep_task = group_tasks[0]
+            env_dir = rep_task.directory / "environment"
+            dockerfile_content = _find_dockerfile(env_dir) if env_dir.exists() else None
+
+            # Extract verifier timeout from config
+            verifier_timeout = 600
+            verifier_cfg = rep_task.config.get("verifier", {})
+            if isinstance(verifier_cfg, dict):
+                timeout_val = verifier_cfg.get("timeout_sec")
+                if timeout_val is not None:
+                    verifier_timeout = int(timeout_val)
+
+            # --- Generate env.py ---
+            # Use forward slashes in source_path to avoid unicode escape issues on Windows
+            task_ids = [t.task_id for t in group_tasks]
+            env_py = _build_env_py(
+                env_name=env_name,
+                source_path=path.as_posix(),
+                task_ids=task_ids,
+                verifier_timeout=verifier_timeout,
+            )
+
+            # --- Generate Dockerfile.hud ---
+            if dockerfile_content:
+                adapted = _adapt_harbor_dockerfile(dockerfile_content)
+                dockerfile = DOCKERFILE_WITH_BASE_TEMPLATE.format(
+                    source=env_dir.as_posix(),
+                    base_dockerfile=adapted,
+                )
+            else:
+                dockerfile = DOCKERFILE_FALLBACK_TEMPLATE
+
+            # --- Generate pyproject.toml ---
+            pyproject = PYPROJECT_TEMPLATE.format(name=env_name)
+
+            # --- Map task IDs to source directories ---
+            task_dir_map = {t.task_id: t.directory for t in group_tasks}
+
+            # Build context: non-Dockerfile files from environment/ dir
+            # (e.g., warriors/*.red that the Dockerfile COPYs)
+            build_ctx = env_dir if env_dir.exists() else None
+
+            environments.append(
+                GeneratedEnvironment(
+                    name=env_name,
+                    env_py=env_py,
+                    dockerfile=dockerfile,
+                    pyproject_toml=pyproject,
+                    task_dirs=task_dir_map,
+                    build_context_source=build_ctx,
+                )
+            )
+
+            # --- Generate v5 taskset entries ---
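+            # Each entry is a v5 task dict, e.g. (illustrative values):
+            #   {"env": {"name": "hud-harbor-tb2"},
+            #    "scenario": "hud-harbor-tb2:run-task",
+            #    "args": {"task_id": "build-pmars"},
+            #    "metadata": {"category": "systems", ...}}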
task.", + task_toml: str = _DEFAULT_TASK_TOML, + dockerfile: str | None = _SIMPLE_DOCKERFILE, + test_script: str = '#!/bin/bash\necho "1.0" > /logs/verifier/reward.txt\n', + include_solution: bool = False, +) -> Path: + """Create a synthetic Harbor task directory under *parent*. + + Returns the task directory path. + """ + task_dir = parent / name + task_dir.mkdir(parents=True, exist_ok=True) + + (task_dir / "instruction.md").write_text(instruction, encoding="utf-8") + (task_dir / "task.toml").write_text(task_toml, encoding="utf-8") + + if dockerfile is not None: + env_dir = task_dir / "environment" + env_dir.mkdir(exist_ok=True) + (env_dir / "Dockerfile").write_text(dockerfile, encoding="utf-8") + + tests_dir = task_dir / "tests" + tests_dir.mkdir(exist_ok=True) + (tests_dir / "test.sh").write_text(test_script, encoding="utf-8") + + if include_solution: + sol_dir = task_dir / "solution" + sol_dir.mkdir(exist_ok=True) + (sol_dir / "solve.sh").write_text("#!/bin/bash\necho done\n", encoding="utf-8") + + return task_dir + + +# --------------------------------------------------------------------------- +# Pytest fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture() +def single_task(tmp_path: Path) -> Path: + """A single Harbor task directory (like a standalone task).""" + return make_harbor_task( + tmp_path, + "cancel-async-tasks", + instruction=( + "# Cancel Async Tasks\n\n" + "Write a Python script that launches 5 asyncio tasks and cancels " + "all of them within 2 seconds.\n" + ), + ) + + +@pytest.fixture() +def dataset_same_env(tmp_path: Path) -> Path: + """A dataset directory with 3 tasks sharing the same Dockerfile.""" + dataset = tmp_path / "terminal-bench-sample" + dataset.mkdir() + + for name in ("cancel-async-tasks", "build-pmars", "chess-best-move"): + make_harbor_task( + dataset, + name, + instruction=f"# {name}\n\nSolve the {name} task.\n", + ) + + return dataset + + +@pytest.fixture() +def dataset_multi_env(tmp_path: Path) -> Path: + """A dataset directory with tasks split across 2 different Dockerfiles.""" + dataset = tmp_path / "mixed-bench" + dataset.mkdir() + + # Group 1: simple python tasks (same Dockerfile) + for name in ("cancel-async-tasks", "build-pmars"): + make_harbor_task( + dataset, + name, + instruction=f"# {name}\n\nDo the thing.\n", + dockerfile=_SIMPLE_DOCKERFILE, + ) + + # Group 2: ML tasks (different Dockerfile) + for name in ("caffe-cifar-10", "sam-cell-seg"): + make_harbor_task( + dataset, + name, + instruction=f"# {name}\n\nTrain the model.\n", + task_toml=_TASK_TOML_WITH_IMAGE, + dockerfile=_ML_DOCKERFILE, + ) + + return dataset + + +@pytest.fixture() +def dataset_no_dockerfile(tmp_path: Path) -> Path: + """A dataset where tasks have no environment/Dockerfile.""" + dataset = tmp_path / "no-docker-bench" + dataset.mkdir() + + for name in ("task-a", "task-b"): + make_harbor_task( + dataset, + name, + instruction=f"# {name}\n\nSimple task.\n", + dockerfile=None, # No Dockerfile + ) + + return dataset + + +@pytest.fixture() +def dataset_with_solutions(tmp_path: Path) -> Path: + """A dataset where tasks include solution/ directories.""" + dataset = tmp_path / "solved-bench" + dataset.mkdir() + + for name in ("task-x", "task-y"): + make_harbor_task( + dataset, + name, + instruction=f"# {name}\n\nSolve it.\n", + include_solution=True, + ) + + return dataset + + +@pytest.fixture() +def task_with_build_context(tmp_path: Path) -> Path: + """A single task whose environment/ dir has extra build context files. 
+    task_toml: str = _DEFAULT_TASK_TOML,
+    dockerfile: str | None = _SIMPLE_DOCKERFILE,
+    test_script: str = '#!/bin/bash\necho "1.0" > /logs/verifier/reward.txt\n',
+    include_solution: bool = False,
+) -> Path:
+    """Create a synthetic Harbor task directory under *parent*.
+
+    Returns the task directory path.
+    """
+    task_dir = parent / name
+    task_dir.mkdir(parents=True, exist_ok=True)
+
+    (task_dir / "instruction.md").write_text(instruction, encoding="utf-8")
+    (task_dir / "task.toml").write_text(task_toml, encoding="utf-8")
+
+    if dockerfile is not None:
+        env_dir = task_dir / "environment"
+        env_dir.mkdir(exist_ok=True)
+        (env_dir / "Dockerfile").write_text(dockerfile, encoding="utf-8")
+
+    tests_dir = task_dir / "tests"
+    tests_dir.mkdir(exist_ok=True)
+    (tests_dir / "test.sh").write_text(test_script, encoding="utf-8")
+
+    if include_solution:
+        sol_dir = task_dir / "solution"
+        sol_dir.mkdir(exist_ok=True)
+        (sol_dir / "solve.sh").write_text("#!/bin/bash\necho done\n", encoding="utf-8")
+
+    return task_dir
+
+
+# ---------------------------------------------------------------------------
+# Pytest fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture()
+def single_task(tmp_path: Path) -> Path:
+    """A single Harbor task directory (like a standalone task)."""
+    return make_harbor_task(
+        tmp_path,
+        "cancel-async-tasks",
+        instruction=(
+            "# Cancel Async Tasks\n\n"
+            "Write a Python script that launches 5 asyncio tasks and cancels "
+            "all of them within 2 seconds.\n"
+        ),
+    )
+
+
+@pytest.fixture()
+def dataset_same_env(tmp_path: Path) -> Path:
+    """A dataset directory with 3 tasks sharing the same Dockerfile."""
+    dataset = tmp_path / "terminal-bench-sample"
+    dataset.mkdir()
+
+    for name in ("cancel-async-tasks", "build-pmars", "chess-best-move"):
+        make_harbor_task(
+            dataset,
+            name,
+            instruction=f"# {name}\n\nSolve the {name} task.\n",
+        )
+
+    return dataset
+
+
+@pytest.fixture()
+def dataset_multi_env(tmp_path: Path) -> Path:
+    """A dataset directory with tasks split across 2 different Dockerfiles."""
+    dataset = tmp_path / "mixed-bench"
+    dataset.mkdir()
+
+    # Group 1: simple python tasks (same Dockerfile)
+    for name in ("cancel-async-tasks", "build-pmars"):
+        make_harbor_task(
+            dataset,
+            name,
+            instruction=f"# {name}\n\nDo the thing.\n",
+            dockerfile=_SIMPLE_DOCKERFILE,
+        )
+
+    # Group 2: ML tasks (different Dockerfile)
+    for name in ("caffe-cifar-10", "sam-cell-seg"):
+        make_harbor_task(
+            dataset,
+            name,
+            instruction=f"# {name}\n\nTrain the model.\n",
+            task_toml=_TASK_TOML_WITH_IMAGE,
+            dockerfile=_ML_DOCKERFILE,
+        )
+
+    return dataset
+
+
+@pytest.fixture()
+def dataset_no_dockerfile(tmp_path: Path) -> Path:
+    """A dataset where tasks have no environment/Dockerfile."""
+    dataset = tmp_path / "no-docker-bench"
+    dataset.mkdir()
+
+    for name in ("task-a", "task-b"):
+        make_harbor_task(
+            dataset,
+            name,
+            instruction=f"# {name}\n\nSimple task.\n",
+            dockerfile=None,  # No Dockerfile
+        )
+
+    return dataset
+
+
+@pytest.fixture()
+def dataset_with_solutions(tmp_path: Path) -> Path:
+    """A dataset where tasks include solution/ directories."""
+    dataset = tmp_path / "solved-bench"
+    dataset.mkdir()
+
+    for name in ("task-x", "task-y"):
+        make_harbor_task(
+            dataset,
+            name,
+            instruction=f"# {name}\n\nSolve it.\n",
+            include_solution=True,
+        )
+
+    return dataset
+
+
+@pytest.fixture()
+def task_with_build_context(tmp_path: Path) -> Path:
+    """A single task whose environment/ dir has extra build context files.
+
+    Mimics build-pmars which has warriors/*.red files that the
+    Dockerfile COPYs into the image.
+    """
+    task_dir = tmp_path / "build-pmars"
+    task_dir.mkdir()
+
+    (task_dir / "instruction.md").write_text(
+        "# Build pMARS\n\nBuild the pMARS simulator.\n", encoding="utf-8"
+    )
+    (task_dir / "task.toml").write_text(
+        textwrap.dedent("""\
+            [metadata]
+            category = "software-engineering"
+            difficulty = "medium"
+
+            [verifier]
+            timeout_sec = 900
+        """),
+        encoding="utf-8",
+    )
+
+    # environment/ with Dockerfile AND extra build context files
+    env_dir = task_dir / "environment"
+    env_dir.mkdir()
+    (env_dir / "Dockerfile").write_text(
+        textwrap.dedent("""\
+            FROM debian:13.0-slim
+            RUN apt-get update && apt-get install -y tmux
+            WORKDIR /app
+            COPY warriors/flashpaper.red warriors/rave.red /app/
+        """),
+        encoding="utf-8",
+    )
+    warriors = env_dir / "warriors"
+    warriors.mkdir()
+    (warriors / "flashpaper.red").write_text(";redcode\nMOV 0, 1\n", encoding="utf-8")
+    (warriors / "rave.red").write_text(";redcode\nSPL 0, 0\n", encoding="utf-8")
+
+    # tests/
+    tests_dir = task_dir / "tests"
+    tests_dir.mkdir()
+    (tests_dir / "test.sh").write_text(
+        '#!/bin/bash\necho "1.0" > /logs/verifier/reward.txt\n', encoding="utf-8"
+    )
+
+    return task_dir
diff --git a/hud/cli/convert/tests/test_harbor.py b/hud/cli/convert/tests/test_harbor.py
new file mode 100644
index 00000000..64c6c6b2
--- /dev/null
+++ b/hud/cli/convert/tests/test_harbor.py
@@ -0,0 +1,751 @@
+"""Tests for the Harbor → HUD converter.
+
+Exercises HarborConverter.detect(), HarborConverter.convert(), and the
+write_result() writer using synthetic terminal-bench-style fixtures
+defined in conftest.py.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING
+
+import pytest
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+from hud.cli.convert import detect_format, get_converter, list_formats, write_result
+from hud.cli.convert.harbor import (
+    HarborConverter,
+    _adapt_harbor_dockerfile,
+    _find_dockerfile,
+    _hash_directory,
+    _is_harbor_task,
+    _normalize_name,
+    _parse_task,
+)
+
+from .conftest import make_harbor_task
+
+# ============================================================================
+# Helper unit tests
+# ============================================================================
+
+
+class TestNormalizeName:
+    def test_simple(self) -> None:
+        assert _normalize_name("terminal-bench") == "terminal-bench"
+
+    def test_underscores(self) -> None:
+        assert _normalize_name("my_cool_bench") == "my-cool-bench"
+
+    def test_spaces(self) -> None:
+        assert _normalize_name("My Cool Bench") == "my-cool-bench"
+
+    def test_special_chars(self) -> None:
+        assert _normalize_name("bench@2.0!") == "bench20"
+
+    def test_empty(self) -> None:
+        assert _normalize_name("") == "converted"
+
+    def test_only_special_chars(self) -> None:
+        assert _normalize_name("@#$") == "converted"
+
+    def test_leading_trailing_dashes(self) -> None:
+        assert _normalize_name("--hello--") == "hello"
+
+    def test_consecutive_dashes(self) -> None:
+        assert _normalize_name("a---b") == "a-b"
+
+
+class TestAdaptDockerfile:
+    def test_comments_cmd(self) -> None:
+        result = _adapt_harbor_dockerfile('CMD ["bash"]')
+        assert result == '# [harbor original] CMD ["bash"]'
+
+    def test_comments_entrypoint(self) -> None:
+        result = _adapt_harbor_dockerfile('ENTRYPOINT ["/bin/bash"]')
+        assert result == '# [harbor original] ENTRYPOINT ["/bin/bash"]'
+
+    def test_preserves_other_lines(self) -> None:
+        dockerfile = "FROM python:3.11\nRUN echo hi\nCMD bash"
+        result = _adapt_harbor_dockerfile(dockerfile)
+        lines = result.splitlines()
+        assert lines[0] == "FROM python:3.11"
+        assert lines[1] == "RUN echo hi"
+        assert lines[2] == "# [harbor original] CMD bash"
+
+    def test_case_insensitive_match(self) -> None:
+        # The implementation uses .upper() so indented CMD should match
+        result = _adapt_harbor_dockerfile(" CMD bash")
+        assert result == "# [harbor original]  CMD bash"
+
+    def test_no_cmd_or_entrypoint(self) -> None:
+        dockerfile = "FROM python:3.11\nRUN apt-get update"
+        assert _adapt_harbor_dockerfile(dockerfile) == dockerfile
+
+
+class TestHashDirectory:
+    def test_same_content_same_hash(self, tmp_path: Path) -> None:
+        dir_a = tmp_path / "a"
+        dir_a.mkdir()
+        (dir_a / "file.txt").write_text("hello")
+
+        dir_b = tmp_path / "b"
+        dir_b.mkdir()
+        (dir_b / "file.txt").write_text("hello")
+
+        assert _hash_directory(dir_a) == _hash_directory(dir_b)
+
+    def test_different_content_different_hash(self, tmp_path: Path) -> None:
+        dir_a = tmp_path / "a"
+        dir_a.mkdir()
+        (dir_a / "file.txt").write_text("hello")
+
+        dir_b = tmp_path / "b"
+        dir_b.mkdir()
+        (dir_b / "file.txt").write_text("world")
+
+        assert _hash_directory(dir_a) != _hash_directory(dir_b)
+
+    def test_nonexistent_returns_empty(self, tmp_path: Path) -> None:
+        assert _hash_directory(tmp_path / "nonexistent") == "empty"
+
+    def test_empty_directory(self, tmp_path: Path) -> None:
+        empty = tmp_path / "empty"
+        empty.mkdir()
+        # Empty dir has a deterministic hash (sha256 of nothing)
+        result = _hash_directory(empty)
+        assert isinstance(result, str)
+        assert len(result) == 16
+
+
+class TestFindDockerfile:
+    def test_finds_dockerfile(self, tmp_path: Path) -> None:
+        (tmp_path / "Dockerfile").write_text("FROM python:3.11")
+        assert _find_dockerfile(tmp_path) == "FROM python:3.11"
+
+    def test_finds_lowercase(self, tmp_path: Path) -> None:
+        (tmp_path / "dockerfile").write_text("FROM alpine")
+        assert _find_dockerfile(tmp_path) == "FROM alpine"
+
+    def test_returns_none_when_missing(self, tmp_path: Path) -> None:
+        assert _find_dockerfile(tmp_path) is None
+
+
+class TestIsHarborTask:
+    def test_valid_task(self, single_task: Path) -> None:
+        assert _is_harbor_task(single_task) is True
+
+    def test_missing_instruction(self, tmp_path: Path) -> None:
+        task = tmp_path / "bad-task"
+        task.mkdir()
+        (task / "task.toml").write_text("[metadata]\n")
+        assert _is_harbor_task(task) is False
+
+    def test_missing_task_toml(self, tmp_path: Path) -> None:
+        task = tmp_path / "bad-task"
+        task.mkdir()
+        (task / "instruction.md").write_text("# Do something")
+        assert _is_harbor_task(task) is False
+
+    def test_not_a_directory(self, tmp_path: Path) -> None:
+        f = tmp_path / "file.txt"
+        f.write_text("not a dir")
+        assert _is_harbor_task(f) is False
+
+
+class TestParseTask:
+    def test_parses_valid_task(self, single_task: Path) -> None:
+        result = _parse_task(single_task)
+        assert result is not None
+        assert result.task_id == "cancel-async-tasks"
+        assert "Cancel Async Tasks" in result.instruction
+        assert result.config.get("metadata", {}).get("category") == "systems"
+
+    def test_parses_verifier_timeout(self, single_task: Path) -> None:
+        result = _parse_task(single_task)
+        assert result is not None
+        assert result.config["verifier"]["timeout_sec"] == 120
+
+    def test_returns_none_for_bad_instruction(self, tmp_path: Path) -> None:
+        task_dir = tmp_path / "bad"
+        task_dir.mkdir()
+        (task_dir / "task.toml").write_text("[metadata]\n")
+        # instruction.md missing
+        assert _parse_task(task_dir) is None
+
+    def test_handles_bad_toml_gracefully(self, tmp_path: Path) -> None:
+        task_dir = tmp_path / "broken-toml"
+        task_dir.mkdir()
+        (task_dir / "instruction.md").write_text("# Hello")
+        (task_dir / "task.toml").write_text("this is not valid toml {{{")
+        result = _parse_task(task_dir)
+        assert result is not None
+        # Config should be empty dict when toml fails
+        assert result.config == {}
+
+
+# ============================================================================
+# HarborConverter.detect()
+# ============================================================================
+
+
+class TestHarborConverterDetect:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_detects_single_task(self, single_task: Path) -> None:
+        assert self.converter.detect(single_task) is True
+
+    def test_detects_dataset(self, dataset_same_env: Path) -> None:
+        assert self.converter.detect(dataset_same_env) is True
+
+    def test_rejects_empty_dir(self, tmp_path: Path) -> None:
+        assert self.converter.detect(tmp_path) is False
+
+    def test_rejects_non_harbor_dir(self, tmp_path: Path) -> None:
+        (tmp_path / "random.txt").write_text("nope")
+        assert self.converter.detect(tmp_path) is False
+
+
+# ============================================================================
+# HarborConverter.convert()
+# ============================================================================
+
+
+class TestHarborConverterConvertSingleTask:
+    """Convert a single Harbor task directory."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_single_task_produces_one_env(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        assert len(result.environments) == 1
+        assert len(result.taskset) == 1
+
+    def test_env_name_uses_parent_dir(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env = result.environments[0]
+        # Parent dir name is the tmp_path random name, but it gets normalized
+        assert env.name.startswith("hud-harbor-")
+
+    def test_env_py_contains_scenario(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "@env.scenario" in env_py
+        assert "run-task" in env_py
+
+    def test_env_py_has_correct_timeout(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "timeout=120" in env_py
+
+    def test_taskset_references_env(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        entry = result.taskset[0]
+        env_name = result.environments[0].name
+        assert entry["scenario"] == f"{env_name}:run-task"
+        assert entry["args"]["task_id"] == "cancel-async-tasks"
+
+    def test_task_dirs_map(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env = result.environments[0]
+        assert "cancel-async-tasks" in env.task_dirs
+        assert env.task_dirs["cancel-async-tasks"] == single_task
+
+    def test_summary_not_empty(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        assert len(result.summary) > 0
+        assert any("1" in line for line in result.summary)
+
+
+class TestHarborConverterConvertDataset:
+    """Convert a dataset directory with multiple tasks sharing the same env."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_same_env_groups_into_one(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        assert len(result.environments) == 1
+        assert len(result.taskset) == 3
+
+    def test_all_task_ids_present(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        task_ids = {e["args"]["task_id"] for e in result.taskset}
+        assert task_ids == {"cancel-async-tasks", "build-pmars", "chess-best-move"}
+
+    def test_env_name_from_dataset(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        env = result.environments[0]
+        assert env.name == "hud-harbor-terminal-bench-sample"
+
+
+class TestHarborConverterConvertMultiEnv:
+    """Convert a dataset with tasks split across different Dockerfiles."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_creates_two_envs(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        assert len(result.environments) == 2
+        assert len(result.taskset) == 4
+
+    def test_env_names_have_group_suffix(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        names = {e.name for e in result.environments}
+        assert all(n.startswith("hud-harbor-mixed-bench") for n in names)
+        # With multiple groups, names should have -g1, -g2 suffixes
+        assert any("-g1" in n for n in names)
+        assert any("-g2" in n for n in names)
+
+    def test_each_env_has_correct_tasks(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        for env in result.environments:
+            task_ids = set(env.task_dirs.keys())
+            # Each group should have exactly 2 tasks
+            assert len(task_ids) == 2
+
+    def test_ml_env_has_nvidia_dockerfile(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        # One of the environments should reference nvidia in its dockerfile
+        dockerfiles = [e.dockerfile for e in result.environments]
+        assert any("nvidia" in d for d in dockerfiles)
+
+    def test_simple_env_has_python_dockerfile(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        dockerfiles = [e.dockerfile for e in result.environments]
+        assert any("python:3.11-slim" in d for d in dockerfiles)
+
+
+class TestBuildContextSource:
+    """Verify build_context_source is set for tasks with environment/ dirs."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_build_context_source_set(self, task_with_build_context: Path) -> None:
+        result = self.converter.convert(task_with_build_context)
+        env = result.environments[0]
+        assert env.build_context_source is not None
+        assert env.build_context_source.is_dir()
+
+    def test_build_context_source_none_when_no_env_dir(self, dataset_no_dockerfile: Path) -> None:
+        result = self.converter.convert(dataset_no_dockerfile)
+        env = result.environments[0]
+        assert env.build_context_source is None
+
+
+class TestWriteBuildContext:
+    """Verify that build context files from environment/ are copied to env root."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_warriors_copied_to_env_root(
+        self, task_with_build_context: Path, tmp_path: Path
+    ) -> None:
+        result = self.converter.convert(task_with_build_context)
+        out = tmp_path / "output"
+        write_result(result, out)
+
+        env = result.environments[0]
+        env_dir = out / env.name
+
+        # warriors/ dir should exist at env root (Docker build context)
+        assert (env_dir / "warriors").is_dir()
+        assert (env_dir / "warriors" / "flashpaper.red").is_file()
+        assert (env_dir / "warriors" / "rave.red").is_file()
+
+    def test_dockerfile_not_duplicated(self, task_with_build_context: Path, tmp_path: Path) -> None:
+        result = self.converter.convert(task_with_build_context)
+        out = tmp_path / "output"
+        write_result(result, out)
+
+        env = result.environments[0]
+        env_dir = out / env.name
+
+        # Should have Dockerfile.hud (generated), NOT a raw Dockerfile copy
+        assert (env_dir / "Dockerfile.hud").is_file()
+        assert not (env_dir / "Dockerfile").exists()
+
+    def test_build_context_content_correct(
+        self, task_with_build_context: Path, tmp_path: Path
+    ) -> None:
+        result = self.converter.convert(task_with_build_context)
+        out = tmp_path / "output"
+        write_result(result, out)
+
+        env = result.environments[0]
+        content = (out / env.name / "warriors" / "flashpaper.red").read_text(encoding="utf-8")
+        assert "MOV 0, 1" in content
+
+
+class TestHarborConverterConvertNoDockerfile:
+    """Tasks without environment/Dockerfile should use fallback."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_fallback_dockerfile(self, dataset_no_dockerfile: Path) -> None:
+        result = self.converter.convert(dataset_no_dockerfile)
+        assert len(result.environments) == 1
+        # Fallback dockerfile starts with FROM python:3.11-slim
+        assert "FROM python:3.11-slim" in result.environments[0].dockerfile
+
+    def test_no_harbor_original_comments(self, dataset_no_dockerfile: Path) -> None:
+        result = self.converter.convert(dataset_no_dockerfile)
+        # Fallback dockerfile should NOT have commented-out lines
+        assert "# [harbor original]" not in result.environments[0].dockerfile
+
+
+class TestHarborConverterConvertWithSolutions:
+    """Verify that solution/ dirs show up in task_dirs but write_result skips them."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_solutions_present_in_source(self, dataset_with_solutions: Path) -> None:
+        # Verify the fixture has solution dirs
+        for name in ("task-x", "task-y"):
+            assert (dataset_with_solutions / name / "solution").is_dir()
+
+    def test_convert_succeeds(self, dataset_with_solutions: Path) -> None:
+        result = self.converter.convert(dataset_with_solutions)
+        assert len(result.environments) == 1
+        assert len(result.taskset) == 2
+
+
+class TestHarborConverterEdgeCases:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_no_tasks_raises(self, tmp_path: Path) -> None:
+        empty = tmp_path / "empty-dataset"
+        empty.mkdir()
+        with pytest.raises(ValueError, match="No Harbor tasks found"):
+            self.converter.convert(empty)
+
+    def test_all_tasks_fail_raises(self, tmp_path: Path) -> None:
+        dataset = tmp_path / "bad-dataset"
+        dataset.mkdir()
+        # Create subdirs that look like tasks but have no instruction.md
+        for name in ("a", "b"):
+            d = dataset / name
+            d.mkdir()
+            (d / "task.toml").write_text("[metadata]\n")
+        # Missing instruction.md -> will fail detect, so not even found as task
+        with pytest.raises(ValueError, match="No Harbor tasks found"):
+            self.converter.convert(dataset)
+
+    def test_partial_failure_skips_bad_tasks(self, tmp_path: Path) -> None:
+        dataset = tmp_path / "partial"
+        dataset.mkdir()
+
+        # One good task
+        make_harbor_task(dataset, "good-task")
+
+        # A second, minimal task (task.toml + instruction.md -- still parses as valid)
+        bad = dataset / "bad-task"
+        bad.mkdir()
+        (bad / "task.toml").write_text("[metadata]\n")
+        (bad / "instruction.md").write_text("# OK")  # actually valid
+
+        result = self.converter.convert(dataset)
+        # Both should parse, so 2 tasks
+        assert len(result.taskset) == 2
+
+
+# ============================================================================
+# Taskset metadata
+# ============================================================================
+
+
+class TestTasksetMetadata:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_metadata_includes_harbor_source(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        entry = result.taskset[0]
+        assert "harbor_source" in entry["metadata"]
+
+    def test_metadata_includes_toml_metadata(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        entry = result.taskset[0]
+        meta = entry["metadata"]
+        assert meta.get("category") == "systems"
+        assert meta.get("difficulty") == "medium"
+
+
+# ============================================================================
+# Dockerfile generation
+# ============================================================================
+
+
+class TestDockerfileGeneration:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_cmd_commented_out(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        dockerfile = result.environments[0].dockerfile
+        # Original CMD ["bash"] should be commented out
+        assert "# [harbor original]" in dockerfile
+
+    def test_hud_layer_present(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        dockerfile = result.environments[0].dockerfile
+        assert "COPY env.py" in dockerfile
+        assert "uv" in dockerfile
+        assert "hud" in dockerfile
+
+    def test_tasks_copied_into_image(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        dockerfile = result.environments[0].dockerfile
+        assert "COPY tasks/ /harbor/tasks/" in dockerfile
+
+    def test_logs_dir_created(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        dockerfile = result.environments[0].dockerfile
+        assert "/logs/verifier" in dockerfile
+
+
+# ============================================================================
+# env.py generation
+# ============================================================================
+
+
+class TestEnvPyGeneration:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_imports_present(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "from hud import Environment" in env_py
+        assert "from hud.tools import BashTool" in env_py
+
+    def test_tools_added(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "env.add_tool(BashTool())" in env_py
+        assert "env.add_tool(EditTool())" in env_py
+
+    def test_reward_parsing_logic(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "_parse_harbor_reward" in env_py
+        assert "reward.txt" in env_py
+        assert "reward.json" in env_py
+
+
+# ============================================================================
+# Scenario signature: single-task default vs multi-task Literal
+# ============================================================================
+
+
+class TestScenarioSignature:
+    """Verify that single-task envs get a default and multi-task envs get a Literal."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    # --- single task: optional with default ---
+
+    def test_single_task_has_default(self, single_task: Path) -> None:
-> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert 'task_id: str = "cancel-async-tasks"' in env_py
+
+    def test_single_task_no_literal_import(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "from typing import Literal" not in env_py
+        assert "TaskId" not in env_py
+
+    # --- multi-task (same env): Literal type ---
+
+    def test_multi_task_has_literal(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        env_py = result.environments[0].env_py
+        assert "from typing import Literal" in env_py
+        assert "TaskId = Literal[" in env_py
+
+    def test_multi_task_literal_lists_all_ids(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        env_py = result.environments[0].env_py
+        for name in ("cancel-async-tasks", "build-pmars", "chess-best-move"):
+            assert f'"{name}"' in env_py
+
+    def test_multi_task_signature_uses_literal(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        env_py = result.environments[0].env_py
+        assert "def run_task(task_id: TaskId):" in env_py
+
+    def test_multi_task_no_default(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        env_py = result.environments[0].env_py
+        # Should NOT have a default value
+        assert "task_id: TaskId):" in env_py
+        assert "= " not in env_py.split("def run_task(")[1].split("):")[0]
+
+    # --- multi-env dataset: each env gets the right variant ---
+
+    def test_multi_env_all_envs_use_literal(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        # Each env has 2 tasks, so every env should use the Literal variant
+        for env in result.environments:
+            assert "TaskId = Literal[" in env.env_py
+
+    def test_single_task_build_context_fixture(self, task_with_build_context: Path) -> None:
+        result = self.converter.convert(task_with_build_context)
+        env_py = result.environments[0].env_py
+        assert 'task_id: str = "build-pmars"' in env_py
+
+
+# ============================================================================
+# pyproject.toml generation
+# ============================================================================
+
+
+class TestPyprojectGeneration:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_has_hud_dependency(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        pyproject = result.environments[0].pyproject_toml
+        assert "hud-python" in pyproject
+
+    def test_name_matches_env(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env = result.environments[0]
+        assert env.name in env.pyproject_toml
+
+
+# ============================================================================
+# write_result()
+# ============================================================================
+
+
+class TestWriteResult:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_creates_directory_structure(self, single_task: Path, tmp_path: Path) -> None:
+        result = self.converter.convert(single_task)
+        out = tmp_path / "output"
+        write_result(result, out)
+
+        env = result.environments[0]
+        env_dir = out / env.name
+
+        assert env_dir.is_dir()
+        assert (env_dir / "env.py").is_file()
+        assert (env_dir / "Dockerfile.hud").is_file()
+        assert (env_dir / "pyproject.toml").is_file()
+        assert (env_dir / "tasks").is_dir()
+        assert (out /
"taskset.json").is_file() + + def test_taskset_json_valid(self, single_task: Path, tmp_path: Path) -> None: + result = self.converter.convert(single_task) + out = tmp_path / "output" + taskset_path = write_result(result, out) + + with open(taskset_path, encoding="utf-8") as f: + data = json.load(f) + + assert isinstance(data, list) + assert len(data) == 1 + assert data[0]["args"]["task_id"] == "cancel-async-tasks" + + def test_task_files_copied(self, single_task: Path, tmp_path: Path) -> None: + result = self.converter.convert(single_task) + out = tmp_path / "output" + write_result(result, out) + + env = result.environments[0] + task_out = out / env.name / "tasks" / "cancel-async-tasks" + + assert (task_out / "instruction.md").is_file() + assert (task_out / "task.toml").is_file() + assert (task_out / "tests" / "test.sh").is_file() + + def test_environment_dir_not_copied(self, single_task: Path, tmp_path: Path) -> None: + result = self.converter.convert(single_task) + out = tmp_path / "output" + write_result(result, out) + + env = result.environments[0] + task_out = out / env.name / "tasks" / "cancel-async-tasks" + + # environment/ should be excluded from the copy + assert not (task_out / "environment").exists() + + def test_solution_dir_not_copied(self, dataset_with_solutions: Path, tmp_path: Path) -> None: + result = self.converter.convert(dataset_with_solutions) + out = tmp_path / "output" + write_result(result, out) + + env = result.environments[0] + for task_id in env.task_dirs: + task_out = out / env.name / "tasks" / task_id + assert not (task_out / "solution").exists() + + def test_multi_env_write(self, dataset_multi_env: Path, tmp_path: Path) -> None: + result = self.converter.convert(dataset_multi_env) + out = tmp_path / "output" + write_result(result, out) + + # Both environments should be written + for env in result.environments: + assert (out / env.name).is_dir() + assert (out / env.name / "env.py").is_file() + + # Single taskset.json with all tasks + with open(out / "taskset.json", encoding="utf-8") as f: + data = json.load(f) + assert len(data) == 4 + + def test_overwrites_existing(self, single_task: Path, tmp_path: Path) -> None: + result = self.converter.convert(single_task) + out = tmp_path / "output" + + # Write twice — should not error + write_result(result, out) + write_result(result, out) + + assert (out / "taskset.json").is_file() + + +# ============================================================================ +# Registry integration (detect_format, get_converter, list_formats) +# ============================================================================ + + +class TestConverterRegistry: + def test_get_converter_by_name(self) -> None: + converter = get_converter("harbor") + assert converter is not None + assert isinstance(converter, HarborConverter) + + def test_get_converter_unknown(self) -> None: + assert get_converter("nonexistent") is None + + def test_detect_format_harbor(self, single_task: Path) -> None: + converter = detect_format(single_task) + assert converter is not None + assert converter.name == "harbor" + + def test_detect_format_unknown(self, tmp_path: Path) -> None: + assert detect_format(tmp_path) is None + + def test_list_formats_includes_harbor(self) -> None: + formats = list_formats() + names = [name for name, _desc in formats] + assert "harbor" in names diff --git a/hud/cli/deploy.py b/hud/cli/deploy.py index 1cb77afa..3e354c34 100644 --- a/hud/cli/deploy.py +++ b/hud/cli/deploy.py @@ -3,6 +3,7 @@ from __future__ import annotations import 
asyncio +import logging import os import time from pathlib import Path @@ -14,10 +15,16 @@ from hud.cli.utils.build_logs import poll_build_status, stream_build_logs from hud.cli.utils.config import parse_env_file from hud.cli.utils.context import create_build_context_tarball, format_size -from hud.cli.utils.environment import find_dockerfile, get_environment_name +from hud.cli.utils.environment import ( + find_dockerfile, + get_environment_name, + is_environment_directory, +) from hud.cli.utils.validation import validate_environment from hud.utils.hud_console import HUDConsole +LOGGER = logging.getLogger(__name__) + def collect_environment_variables( directory: Path, @@ -505,6 +512,96 @@ def _save_deploy_link( console.warning(f"Failed to save deploy link: {e}") +def discover_environments(directory: Path) -> list[Path]: + """Find all HUD environment subdirectories within a parent directory. + + Scans immediate children for directories containing a Dockerfile + (Dockerfile.hud or Dockerfile) and pyproject.toml. + + Returns sorted list of environment directory paths. + """ + if not directory.is_dir(): + return [] + return [ + child + for child in sorted(directory.iterdir()) + if child.is_dir() and is_environment_directory(child) + ] + + +def deploy_all( + directory: str, + env: list[str] | None = None, + env_file: str | None = None, + no_cache: bool = False, + verbose: bool = False, + build_args: list[str] | None = None, + build_secrets: list[str] | None = None, +) -> None: + """Deploy all HUD environments found in a directory. + + Discovers subdirectories that are valid HUD environments and deploys + each one sequentially. + """ + hud_console = HUDConsole() + parent = Path(directory).resolve() + + if not parent.is_dir(): + hud_console.error(f"Directory does not exist: {directory}") + raise typer.Exit(1) + + envs = discover_environments(parent) + if not envs: + hud_console.error(f"No HUD environments found in {parent}") + hud_console.info("Expected subdirectories containing Dockerfile.hud + pyproject.toml") + raise typer.Exit(1) + + hud_console.header("Deploy All Environments") + hud_console.info(f"Found {len(envs)} environment(s) in {parent}:") + for env_dir in envs: + hud_console.info(f" {env_dir.name}/") + hud_console.info("") + + succeeded: list[str] = [] + failed: list[str] = [] + + for i, env_dir in enumerate(envs, start=1): + hud_console.section_title(f"[{i}/{len(envs)}] Deploying {env_dir.name}") + + try: + deploy_environment( + directory=str(env_dir), + name=None, + env=env, + env_file=env_file, + no_cache=no_cache, + verbose=verbose, + registry_id=None, + build_args=build_args, + build_secrets=build_secrets, + ) + succeeded.append(env_dir.name) + except (typer.Exit, SystemExit): + LOGGER.warning("Deploy failed for environment %s", env_dir.name) + failed.append(env_dir.name) + except Exception: + LOGGER.exception("Unexpected error deploying %s", env_dir.name) + failed.append(env_dir.name) + + # Summary + hud_console.info("") + hud_console.header("Deploy All Summary") + if succeeded: + hud_console.success(f"{len(succeeded)} environment(s) deployed successfully:") + for name in succeeded: + hud_console.info(f" {name}") + if failed: + hud_console.error(f"{len(failed)} environment(s) failed:") + for name in failed: + hud_console.info(f" {name}") + raise typer.Exit(1) + + def deploy_command( directory: str = typer.Argument(".", help="Environment directory"), name: str | None = typer.Option( @@ -513,6 +610,12 @@ def deploy_command( "-n", help="Environment display name (defaults to 
directory name)", ), + all_envs: bool = typer.Option( + False, + "--all", + "-a", + help="Deploy all HUD environments found in directory", + ), env: list[str] | None = typer.Option( # noqa: B008 None, "--env", @@ -568,11 +671,24 @@ def deploy_command( hud deploy environments/browser hud deploy . --name my-env # Custom name hud deploy . -e API_KEY=xxx # With env vars + hud deploy ./converted --all # Deploy all envs in directory hud deploy . --build-arg NODE_ENV=production # With build args hud deploy . --secret id=MY_KEY,env=MY_KEY # With build secrets (will be encrypted at rest) hud deploy . --secret id=MY_KEY,src=./my_key.txt # Secret from file hud deploy . --no-cache # Force rebuild[/not dim] """ + if all_envs: + deploy_all( + directory=directory, + env=env, + env_file=env_file, + no_cache=no_cache, + verbose=verbose, + build_args=build_args, + build_secrets=secrets, + ) + return + deploy_environment( directory=directory, name=name, diff --git a/hud/cli/tests/test_build.py b/hud/cli/tests/test_build.py index 1c7be8eb..f1efbbf8 100644 --- a/hud/cli/tests/test_build.py +++ b/hud/cli/tests/test_build.py @@ -60,12 +60,12 @@ def test_increment_patch(self): def test_increment_minor(self): """Test incrementing minor version.""" assert increment_version("1.2.3", "minor") == "1.3.0" - assert increment_version("0.5.20", "minor") == "0.6.0" + assert increment_version("0.5.21", "minor") == "0.6.0" def test_increment_major(self): """Test incrementing major version.""" assert increment_version("1.2.3", "major") == "2.0.0" - assert increment_version("0.5.20", "major") == "1.0.0" + assert increment_version("0.5.21", "major") == "1.0.0" def test_increment_with_v_prefix(self): """Test incrementing version with v prefix.""" diff --git a/hud/environment/scenarios.py b/hud/environment/scenarios.py index bec337b2..e33627b9 100644 --- a/hud/environment/scenarios.py +++ b/hud/environment/scenarios.py @@ -628,12 +628,35 @@ async def prompt_handler(**handler_args: Any) -> list[str]: if annotation is not None: try: adapter = TypeAdapter(annotation) - deserialized_args[arg_name] = adapter.validate_json(arg_value) + except Exception: + # Unresolvable annotation (e.g. 
raw string from + # PEP 563 fallback) -- treat as untyped + adapter = None + + if adapter is not None: + # Try validate_json first (handles Pydantic models, + # lists, enums, datetimes from JSON-encoded strings) + try: + deserialized_args[arg_name] = adapter.validate_json(arg_value) + continue + except Exception: # noqa: S110 + pass + + # Fall back to validate_python (handles Literal[str] + # where validate_json("0") would parse as int 0, + # losing the string type) + try: + deserialized_args[arg_name] = adapter.validate_python(arg_value) + continue + except Exception: # noqa: S110 + pass + + # TypeAdapter couldn't handle it -- skip generic + # heuristics that would lose type information + deserialized_args[arg_name] = arg_value continue - except Exception: # noqa: S110 - pass # Fall through to generic JSON decode - # Try JSON decode for strings that look like JSON + # No annotation (or unresolvable): try generic JSON decode heuristics stripped = arg_value.strip() if (stripped and stripped[0] in "[{") or stripped in ("true", "false", "null"): try: diff --git a/hud/environment/tests/test_scenarios.py b/hud/environment/tests/test_scenarios.py index 048b893e..74ac9355 100644 --- a/hud/environment/tests/test_scenarios.py +++ b/hud/environment/tests/test_scenarios.py @@ -4,7 +4,7 @@ from datetime import datetime from enum import Enum -from typing import Any +from typing import Any, Literal import pytest from pydantic import BaseModel @@ -792,6 +792,239 @@ async def list_pydantic_scenario(items: list[_Item]): assert received_items[1].name == "Banana" +class TestLiteralDeserialization: + """Tests for Literal type deserialization edge cases. + + The MCP protocol sends all arguments as strings. When the scenario + function uses Literal types, the deserializer must correctly match + string values -- especially numeric-looking strings like "0", "1". + """ + + @pytest.mark.asyncio + async def test_literal_string_kept_as_string(self) -> None: + """Literal["a", "b"] receives string values correctly.""" + env = Environment("test-env") + received: str | None = None + + @env.scenario("literal_str") + async def literal_str_scenario(choice: Literal["a", "b"]): + nonlocal received + received = choice + yield f"Got {choice}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_str") + assert prompt is not None + + await prompt.render({"choice": "a"}) + assert received == "a" + assert isinstance(received, str) + + @pytest.mark.asyncio + async def test_literal_numeric_string_not_coerced_to_int(self) -> None: + """Literal["0", "1", "2"] keeps "0" as string, not int 0. + + This is the GPQA Diamond bug: task IDs are "0", "1", etc. + and must stay as strings for Path operations. 
+ """ + env = Environment("test-env") + received: Any = None + + @env.scenario("literal_numeric") + async def literal_numeric_scenario(task_id: Literal["0", "1", "2"]): + nonlocal received + received = task_id + yield f"Task {task_id}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_numeric") + assert prompt is not None + + await prompt.render({"task_id": "0"}) + assert received == "0" + assert isinstance(received, str) + + @pytest.mark.asyncio + async def test_literal_numeric_string_various_values(self) -> None: + """All numeric-looking Literal string values stay as strings.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("literal_nums") + async def literal_nums_scenario(idx: Literal["0", "42", "197"]): + nonlocal received + received = idx + yield f"Index {idx}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_nums") + assert prompt is not None + + for val in ("0", "42", "197"): + await prompt.render({"idx": val}) + assert received == val, f"Expected {val!r}, got {received!r}" + assert isinstance(received, str), f"Expected str, got {type(received)}" + + @pytest.mark.asyncio + async def test_literal_int_coerces_correctly(self) -> None: + """Literal[1, 2, 3] with int values coerces string "1" to int 1.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("literal_int") + async def literal_int_scenario(level: Literal[1, 2, 3]): + nonlocal received + received = level + yield f"Level {level}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_int") + assert prompt is not None + + await prompt.render({"level": "2"}) + assert received == 2 + assert isinstance(received, int) + + @pytest.mark.asyncio + async def test_literal_mixed_types(self) -> None: + """Literal["auto", 0, 1] handles mixed string/int literal values.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("literal_mixed") + async def literal_mixed_scenario(mode: Literal["auto", 0, 1]): + nonlocal received + received = mode + yield f"Mode {mode}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_mixed") + assert prompt is not None + + await prompt.render({"mode": "auto"}) + assert received == "auto" + + @pytest.mark.asyncio + async def test_literal_with_default(self) -> None: + """Literal with default value works when arg is provided.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("literal_default") + async def literal_default_scenario( + task_id: Literal["build-pmars"] = "build-pmars", + ): + nonlocal received + received = task_id + yield f"Task {task_id}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_default") + assert prompt is not None + + await prompt.render({"task_id": "build-pmars"}) + assert received == "build-pmars" + + @pytest.mark.asyncio + async def test_int_annotation_coerces_numeric_string(self) -> None: + """Plain int annotation coerces "42" to 42.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("int_arg") + async def int_arg_scenario(count: int): + nonlocal received + received = count + yield f"Count {count}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:int_arg") + assert prompt is not None + + await prompt.render({"count": "42"}) + assert received == 42 + assert isinstance(received, int) + + @pytest.mark.asyncio + async def test_float_annotation_coerces_numeric_string(self) -> None: + """Plain float annotation coerces "3.14" to 
3.14.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("float_arg") + async def float_arg_scenario(rate: float): + nonlocal received + received = rate + yield f"Rate {rate}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:float_arg") + assert prompt is not None + + await prompt.render({"rate": "3.14"}) + assert received == pytest.approx(3.14) + assert isinstance(received, float) + + @pytest.mark.asyncio + async def test_bool_annotation_coerces_string(self) -> None: + """Bool annotation coerces "true"/"false" correctly.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("bool_arg") + async def bool_arg_scenario(verbose: bool): + nonlocal received + received = verbose + yield f"Verbose {verbose}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:bool_arg") + assert prompt is not None + + await prompt.render({"verbose": "true"}) + assert received is True + + @pytest.mark.asyncio + async def test_str_annotation_preserves_numeric_string(self) -> None: + """Plain str annotation keeps "42" as string "42".""" + env = Environment("test-env") + received: Any = None + + @env.scenario("str_numeric") + async def str_numeric_scenario(name: str): + nonlocal received + received = name + yield f"Name {name}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:str_numeric") + assert prompt is not None + + await prompt.render({"name": "42"}) + assert received == "42" + assert isinstance(received, str) + + @pytest.mark.asyncio + async def test_no_annotation_numeric_becomes_int(self) -> None: + """Untyped arg with numeric-looking string falls through to json.loads.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("untyped_num") + async def untyped_num_scenario(val): + nonlocal received + received = val + yield f"Val {val}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:untyped_num") + assert prompt is not None + + await prompt.render({"val": "42"}) + # Without annotation, generic heuristic converts to int + assert received == 42 + + class TestScenarioNameNormalization: """Test edge cases for environment and scenario name handling.""" diff --git a/hud/patches/mcp_patches.py b/hud/patches/mcp_patches.py index fbac6bd7..d8d73fa7 100644 --- a/hud/patches/mcp_patches.py +++ b/hud/patches/mcp_patches.py @@ -8,11 +8,60 @@ from __future__ import annotations import logging -from typing import Any +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + import httpx + from mcp.client.streamable_http import StreamWriter logger = logging.getLogger(__name__) +def patch_json_response_error_propagation() -> None: + """ + Patch _handle_json_response to re-raise exceptions instead of swallowing them. + + The original implementation catches all exceptions (e.g. ReadError during + response.aread(), ValidationError during JSON parsing) and sends them as raw + Exception objects to the read stream — where BaseSession._handle_incoming + silently drops them. This causes the caller (call_tool / send_request) to + hang forever waiting for a response that will never arrive. + + By re-raising, exceptions propagate to the retry loop in our patched + post_writer, which already distinguishes retryable errors (ReadError → + retry with backoff) from non-retryable ones (ValidationError → send + proper JSONRPCError to resolve the pending request). 
+ """ + try: + from mcp.client.streamable_http import StreamableHTTPTransport + from mcp.shared.message import SessionMessage + from mcp.types import JSONRPCMessage + + async def patched_handle_json_response( + self: Any, + response: httpx.Response, + read_stream_writer: StreamWriter, + is_initialization: bool = False, + ) -> None: + try: + content = await response.aread() + message = JSONRPCMessage.model_validate_json(content) + if is_initialization: + self._maybe_extract_protocol_version_from_message(message) + await read_stream_writer.send(SessionMessage(message)) + except Exception: + logger.exception("Error in _handle_json_response") + raise + + StreamableHTTPTransport._handle_json_response = patched_handle_json_response + logger.debug("Patched StreamableHTTPTransport._handle_json_response to re-raise errors") + + except ImportError: + logger.debug("mcp.client.streamable_http not available, skipping patch") + except Exception as e: + logger.warning("Failed to patch _handle_json_response: %s", e) + + def patch_streamable_http_error_handling() -> None: """ Patch StreamableHTTPTransport.post_writer to handle request errors properly. @@ -313,6 +362,7 @@ def suppress_fastmcp_logging(level: int = logging.WARNING) -> None: def apply_all_patches() -> None: """Apply all MCP patches.""" + patch_json_response_error_propagation() patch_streamable_http_error_handling() patch_client_session_validation() patch_server_output_validation() diff --git a/hud/utils/tests/test_version.py b/hud/utils/tests/test_version.py index 4a70e3b6..014478d7 100644 --- a/hud/utils/tests/test_version.py +++ b/hud/utils/tests/test_version.py @@ -5,4 +5,4 @@ def test_import(): """Test that the package can be imported.""" import hud - assert hud.__version__ == "0.5.20" + assert hud.__version__ == "0.5.21" diff --git a/hud/version.py b/hud/version.py index de07e468..65160cef 100644 --- a/hud/version.py +++ b/hud/version.py @@ -4,4 +4,4 @@ from __future__ import annotations -__version__ = "0.5.20" +__version__ = "0.5.21" diff --git a/pyproject.toml b/pyproject.toml index fb785572..3b059cf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "hud-python" -version = "0.5.20" +version = "0.5.21" description = "SDK for the HUD platform." readme = "README.md" requires-python = ">=3.11, <3.13"