diff --git a/docs/advanced/harbor-convert.mdx b/docs/advanced/harbor-convert.mdx
index 358335df..bfdd9ef9 100644
--- a/docs/advanced/harbor-convert.mdx
+++ b/docs/advanced/harbor-convert.mdx
@@ -15,7 +15,7 @@ git clone https://github.com/laude-institute/terminal-bench-2.git
# 2. Convert to HUD format
hud convert ./terminal-bench-2/ --output ./tb2-hud
-# 3. Deploy all environments
+# 3. Deploy all environments (~3 min per environment, leave it running)
hud deploy ./tb2-hud --all
# 4. Run evaluation
@@ -24,6 +24,11 @@ hud eval ./tb2-hud/taskset.json
That's it. The converter handles Dockerfile adaptation, build context, test scripts, and reward parsing automatically.
+
+Each environment takes roughly 3 minutes to build and deploy. For datasets with many environments,
+`hud deploy --all` runs them sequentially -- just leave it running and check back when it's done.
+
## What Gets Converted
A Harbor task directory:
@@ -81,9 +86,29 @@ Harbor test scripts write results to `/logs/verifier/`. The converter supports b
- `reward.txt` -- a single float (`1.0` for pass, `0.0` for fail)
- `reward.json` -- `{"reward": 1.0}` or just a float
-## Running Programmatically
+## Running Tasks
+
+### Option 1: Upload as a Taskset (recommended)
+
+The generated `taskset.json` can be uploaded directly to the HUD platform for managed evaluation, leaderboards, and comparison across models:
+
+1. Go to [hud.ai/evalsets](https://hud.ai/evalsets) and create a new taskset
+2. Click **Upload Tasks** and paste the contents of `taskset.json`
+3. Run evaluations from the platform UI or via `hud eval`
+
+See the [Tasksets guide](/platform/tasksets) for full details on creating and managing tasksets.
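+
+To sanity-check the file before uploading, you can inspect it from Python. A quick sketch (the path matches the quickstart above; field names follow the converter's output):
+
+```python
+import json
+from pathlib import Path
+
+entries = json.loads(Path("./tb2-hud/taskset.json").read_text())
+print(f"{len(entries)} tasks")
+for entry in entries[:3]:
+    print(entry["scenario"], entry["args"]["task_id"])
+```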
+
+### Option 2: CLI eval
+
+Run the taskset directly from the command line:
+
+```bash
+hud eval ./tb2-hud/taskset.json
+```
+
+### Option 3: Python SDK
-You can also run converted tasks from Python using the SDK:
+Run tasks programmatically with any agent:
```python
import asyncio
@@ -108,7 +133,7 @@ async def main():
asyncio.run(main())
```
-Or load the full taskset:
+Or load the full taskset as Task objects:
```python
import json
diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py
index daf9ff8c..ab22eea5 100644
--- a/hud/cli/__init__.py
+++ b/hud/cli/__init__.py
@@ -1026,46 +1026,153 @@ def get(
@app.command()
def convert(
- tasks_file: str = typer.Argument(
- ..., help="Path to tasks file (JSON/JSONL) to convert to remote MCP configuration"
+ path: str = typer.Argument(
+ ..., help="Path to source tasks/dataset directory to convert to HUD format"
+ ),
+ from_format: str = typer.Option(
+ "auto",
+ "--from",
+ "-f",
+ help="Source format (auto, harbor). Use 'auto' to detect automatically.",
+ ),
+ output: str | None = typer.Option(
+ None,
+ "--output",
+ "-o",
+ help="Output directory (default: ./hud_converted)",
),
) -> None:
- """Convert local MCP task configs to remote (mcp.hud.ai) format.
+ """Convert external benchmark formats to HUD environments + tasksets.
- This mirrors the implicit conversion flow used by 'hud rl' and writes a new
- remote_.json next to the source file when needed.
+ [not dim]Converts tasks from frameworks like Harbor into HUD-compatible
+ environments (env.py + Dockerfile.hud) and v5 taskset files.
+
+ Supports pluggable formats. Currently: harbor.
+
+ Examples:
+ hud convert ./algotune/ # Auto-detect, convert dataset
+ hud convert ./my-task/ --from harbor # Explicit format
+ hud convert ./dataset/ --output ./out # Custom output directory[/not dim]
"""
from pathlib import Path
+ from .convert import detect_format, get_converter, list_formats, write_result
+
hud_console = HUDConsole()
+ source_path = Path(path).resolve()
- try:
- from .flows.tasks import convert_tasks_to_remote
+ if not source_path.exists():
+ hud_console.error(f"Path does not exist: {path}")
+ raise typer.Exit(1)
- result_path = convert_tasks_to_remote(tasks_file)
+ # Resolve converter
+ if from_format == "auto":
+ converter = detect_format(source_path)
+ if converter is None:
+ # Auto-detect failed — prompt user to pick a format
+ available = list_formats()
+ if not available:
+ hud_console.error("No converters registered.")
+ raise typer.Exit(1)
+
+ if len(available) == 1:
+ # Only one format exists, just use it
+ converter = get_converter(available[0][0])
+ if converter:
+ hud_console.info(f"Using format: {converter.name}")
+ else:
+ import questionary
+
+ choices = [
+ questionary.Choice(title=f"{name} — {desc}", value=name)
+ for name, desc in available
+ ]
+ picked = questionary.select(
+ "Could not auto-detect format. Which format is this?",
+ choices=choices,
+ ).ask()
+ if not picked:
+ raise typer.Exit(1)
+ converter = get_converter(picked)
- # If nothing changed, inform the user
- try:
- if Path(result_path).resolve() == Path(tasks_file).resolve():
- hud_console.success(
- "Tasks already reference remote MCP URLs. No conversion needed."
- )
- hud_console.hint("You can run them directly with: hud eval --full")
- return
- except Exception as e:
- # Best effort; continue with success message
- hud_console.debug(f"Path comparison failed, continuing: {e}")
-
- hud_console.success(f"Converted tasks written to: {result_path}")
- hud_console.hint(
- "You can now run remote flows: hud rl or hud eval "
- )
- except typer.Exit:
- raise
+ if converter is None:
+ hud_console.error("No converter selected.")
+ raise typer.Exit(1)
+ else:
+ hud_console.info(f"Detected format: {converter.name}")
+ else:
+ converter = get_converter(from_format)
+ if converter is None:
+ hud_console.error(f"Unknown format: {from_format}")
+ available = list_formats()
+ if available:
+ hud_console.info("Available formats:")
+ for name, desc in available:
+ hud_console.info(f" {name}: {desc}")
+ raise typer.Exit(1)
+
+ # Run conversion
+ try:
+ result = converter.convert(source_path)
+ except ValueError as e:
+ hud_console.error(str(e))
+ raise typer.Exit(1) from e
+ except Exception as e:
+ hud_console.error(f"Conversion failed: {e}")
+ raise typer.Exit(1) from e
+
+ # Write output
+ output_dir = Path(output) if output else Path("./hud_converted")
+ try:
+ taskset_path = write_result(result, output_dir.resolve())
except Exception as e:
- hud_console.error(f"Failed to convert tasks: {e}")
+ hud_console.error(f"Failed to write output: {e}")
raise typer.Exit(1) from e
+ # Display results
+ hud_console.header("Convert Complete")
+ hud_console.info("")
+
+ total_tasks = len(result.taskset)
+ total_envs = len(result.environments)
+ hud_console.success(f"Converted {total_tasks} task(s) into {total_envs} environment(s).")
+ hud_console.info("")
+
+ # Show each environment
+ hud_console.section_title("Environments")
+ for env_gen in result.environments:
+ task_count = len(env_gen.task_dirs)
+ hud_console.status_item(env_gen.name, f"{task_count} tasks")
+ hud_console.info("")
+
+ # Show output paths
+ hud_console.section_title("Output")
+ hud_console.status_item("Directory", str(output_dir.resolve()))
+ hud_console.status_item("Taskset", str(taskset_path))
+ hud_console.info("")
+
+ # Show next steps with numbered commands
+ hud_console.section_title("Next Steps")
+ hud_console.info("")
+
+ hud_console.info("1. Deploy environment(s):")
+ if total_envs > 1:
+ hud_console.command_example(
+ f"hud deploy {output_dir.resolve()} --all",
+ f"Deploy all {total_envs} environments",
+ )
+ else:
+ first_env = result.environments[0].name if result.environments else ""
+ hud_console.command_example(
+ f"hud deploy {output_dir.resolve() / first_env}",
+ "Build & deploy to HUD platform",
+ )
+ hud_console.info("")
+
+ hud_console.info("2. Run evaluation:")
+ hud_console.command_example(f"hud eval {taskset_path}", "Run agent against tasks")
+ hud_console.info("")
+
@app.command()
def cancel(
diff --git a/hud/cli/convert/__init__.py b/hud/cli/convert/__init__.py
new file mode 100644
index 00000000..c30ef455
--- /dev/null
+++ b/hud/cli/convert/__init__.py
@@ -0,0 +1,177 @@
+"""Pluggable format conversion system for HUD.
+
+Converts external benchmark formats (Harbor, Inspect AI, etc.) into
+HUD environments + tasksets.
+
+Usage:
+ hud convert ./dataset/ # Auto-detect format
+ hud convert ./dataset/ --from harbor # Explicit format
+ hud convert ./dataset/ --output ./out # Custom output directory
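+
+Programmatic use (a minimal sketch; paths are illustrative):
+
+ from pathlib import Path
+ from hud.cli.convert import detect_format, write_result
+
+ converter = detect_format(Path("./my-dataset"))
+ if converter is not None:
+     result = converter.convert(Path("./my-dataset").resolve())
+     write_result(result, Path("./out").resolve())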
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import shutil
+from pathlib import Path # noqa: TC003 - used at runtime
+
+from .base import BaseConverter, ConvertResult, GeneratedEnvironment
+
+__all__ = [
+ "BaseConverter",
+ "ConvertResult",
+ "GeneratedEnvironment",
+ "detect_format",
+ "get_converter",
+ "list_formats",
+ "write_result",
+]
+
+LOGGER = logging.getLogger(__name__)
+
+# Shell script extensions that need CRLF -> LF normalization
+_SHELL_EXTENSIONS = frozenset({".sh", ".bash", ".zsh", ".ksh"})
+
+
+def _normalize_line_endings(directory: Path) -> None:
+ """Convert CRLF to LF in all shell scripts under a directory.
+
+ Git on Windows with autocrlf=true converts LF to CRLF on checkout.
+ Shell scripts with CRLF break on Linux (e.g., shebang errors,
+ 'set: pipefail\\r: invalid option name').
+ """
+ for path in directory.rglob("*"):
+ if path.is_file() and path.suffix in _SHELL_EXTENSIONS:
+ raw = path.read_bytes()
+ if b"\r" in raw:
+ path.write_bytes(raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n"))
+ LOGGER.debug("Normalized line endings: %s", path)
+
+
+# ---------------------------------------------------------------------------
+# Converter registry
+# ---------------------------------------------------------------------------
+
+# Lazy-loaded to avoid import cost on unrelated CLI commands
+_converters: list[BaseConverter] | None = None
+
+
+def _load_converters() -> list[BaseConverter]:
+ global _converters
+ if _converters is None:
+ from .harbor import HarborConverter
+
+ _converters = [
+ HarborConverter(),
+ # Future: InspectConverter(), METRConverter(), ...
+ ]
+ return _converters
+
+
+def get_converter(name: str) -> BaseConverter | None:
+ """Get a converter by its short name (e.g., 'harbor')."""
+ for c in _load_converters():
+ if c.name == name:
+ return c
+ return None
+
+
+def detect_format(path: Path) -> BaseConverter | None:
+ """Auto-detect which converter can handle the given path."""
+ for c in _load_converters():
+ if c.detect(path):
+ return c
+ return None
+
+
+def list_formats() -> list[tuple[str, str]]:
+ """Return (name, description) pairs for all registered converters."""
+ return [(c.name, c.description) for c in _load_converters()]
+
+
+# ---------------------------------------------------------------------------
+# Output writer
+# ---------------------------------------------------------------------------
+
+
+def write_result(result: ConvertResult, output_dir: Path) -> Path:
+ """Write conversion results to disk.
+
+ Creates the output directory structure:
+ output_dir/
+ ├── env-name-a/
+ │ ├── env.py
+ │ ├── Dockerfile.hud
+ │ ├── pyproject.toml
+ │ └── tasks/
+ │ └── <task_id>/ (copied from source, minus environment/ & solution/)
+ └── taskset.json
+
+ Returns the path to the generated taskset.json.
+ """
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ for env_gen in result.environments:
+ env_dir = output_dir / env_gen.name
+ env_dir.mkdir(parents=True, exist_ok=True)
+
+ # Write generated files
+ (env_dir / "env.py").write_text(env_gen.env_py, encoding="utf-8")
+ (env_dir / "Dockerfile.hud").write_text(env_gen.dockerfile, encoding="utf-8")
+ (env_dir / "pyproject.toml").write_text(env_gen.pyproject_toml, encoding="utf-8")
+
+ # Copy build context files from source environment/ directory
+ # (e.g., warriors/*.red that Harbor Dockerfiles reference via COPY)
+ if env_gen.build_context_source and env_gen.build_context_source.is_dir():
+ for item in env_gen.build_context_source.iterdir():
+ # Skip the Dockerfile itself (we already generated Dockerfile.hud)
+ if item.name.lower() in ("dockerfile", "dockerfile.hud"):
+ continue
+ dest_item = env_dir / item.name
+ if dest_item.exists():
+ if dest_item.is_dir():
+ shutil.rmtree(dest_item)
+ else:
+ dest_item.unlink()
+ if item.is_dir():
+ shutil.copytree(item, dest_item)
+ else:
+ shutil.copy2(item, dest_item)
+
+ # Copy task data directories (skip environment/ and solution/)
+ tasks_dir = env_dir / "tasks"
+ tasks_dir.mkdir(parents=True, exist_ok=True)
+
+ for task_id, source_dir in env_gen.task_dirs.items():
+ dest = tasks_dir / task_id
+ if dest.exists():
+ shutil.rmtree(dest)
+ dest.mkdir(parents=True, exist_ok=True)
+
+ for item in source_dir.iterdir():
+ # Skip dirs that are handled by the Dockerfile or ignored
+ if item.name in ("environment", "solution"):
+ continue
+ if item.is_dir():
+ shutil.copytree(item, dest / item.name)
+ else:
+ shutil.copy2(item, dest / item.name)
+
+ # Normalize CRLF -> LF in all shell scripts (fixes Windows git checkout)
+ _normalize_line_endings(env_dir)
+
+ LOGGER.info(
+ "Wrote environment '%s' with %d task(s)",
+ env_gen.name,
+ len(env_gen.task_dirs),
+ )
+
+ # Write taskset
+ taskset_path = output_dir / "taskset.json"
+ with open(taskset_path, "w", encoding="utf-8") as f:
+ json.dump(result.taskset, f, ensure_ascii=False, indent=2)
+ f.write("\n")
+
+ LOGGER.info("Wrote taskset with %d task(s) to %s", len(result.taskset), taskset_path)
+ return taskset_path
diff --git a/hud/cli/convert/base.py b/hud/cli/convert/base.py
new file mode 100644
index 00000000..4fa86f09
--- /dev/null
+++ b/hud/cli/convert/base.py
@@ -0,0 +1,78 @@
+"""Abstract base classes for format converters.
+
+The converter system is pluggable: each format (Harbor, Inspect AI, etc.)
+implements BaseConverter with detect() and convert() methods. The CLI
+auto-detects format or lets the user specify explicitly.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, Field
+
+__all__ = ["BaseConverter", "ConvertResult", "GeneratedEnvironment"]
+
+
+class GeneratedEnvironment(BaseModel):
+ """A generated HUD environment ready to be written to disk.
+
+ Attributes:
+ name: Environment name (e.g., "hud-harbor-algotune")
+ env_py: Generated env.py file content
+ dockerfile: Generated Dockerfile.hud content
+ pyproject_toml: Generated pyproject.toml content
+ task_dirs: Mapping of task_id -> source directory path.
+ Files from these directories (minus environment/ and solution/)
+ are copied into the output's tasks/ subdirectory.
+ build_context_source: Optional path to a source directory whose
+ non-Dockerfile contents should be copied into the environment
+ root as Docker build context (e.g., Harbor's environment/ dir).
+ """
+
+ model_config = ConfigDict(arbitrary_types_allowed=True)
+
+ name: str
+ env_py: str
+ dockerfile: str
+ pyproject_toml: str
+ task_dirs: dict[str, Path]
+ build_context_source: Path | None = None
+
+
+class ConvertResult(BaseModel):
+ """Result of converting a source format to HUD.
+
+ Attributes:
+ environments: Generated environment definitions (one per unique env group)
+ taskset: List of v5 Task dicts ready for taskset.json
+ summary: Human-readable summary lines for CLI output
+ """
+
+ environments: list[GeneratedEnvironment]
+ taskset: list[dict[str, Any]]
+ summary: list[str] = Field(default_factory=list)
+
+
+class BaseConverter(ABC):
+ """Abstract base for format converters.
+
+ Subclasses must define:
+ name: Short identifier (used with --from flag)
+ description: Human-readable description (shown in CLI help)
+ detect(): Check if a path matches this format
+ convert(): Perform the conversion
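+
+ Minimal sketch of a hypothetical converter (names illustrative):
+
+ class MyFormatConverter(BaseConverter):
+     name = "myformat"
+     description = "My format (illustrative)"
+
+     def detect(self, path: Path) -> bool:
+         return (path / "my_format.toml").exists()
+
+     def convert(self, path: Path) -> ConvertResult:
+         return ConvertResult(environments=[], taskset=[])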
+ """
+
+ name: str
+ description: str
+
+ @abstractmethod
+ def detect(self, path: Path) -> bool:
+ """Return True if this converter can handle the given path."""
+
+ @abstractmethod
+ def convert(self, path: Path) -> ConvertResult:
+ """Convert the source at path to HUD format."""
diff --git a/hud/cli/convert/harbor.py b/hud/cli/convert/harbor.py
new file mode 100644
index 00000000..dc745bc9
--- /dev/null
+++ b/hud/cli/convert/harbor.py
@@ -0,0 +1,565 @@
+"""Harbor → HUD converter.
+
+Converts Harbor framework tasks (task.toml + instruction.md + environment/ + tests/)
+into HUD environments with scenarios and tasksets.
+
+Harbor task structure:
+ task_name/
+ ├── instruction.md # Agent prompt
+ ├── task.toml # Config: timeouts, metadata, resources
+ ├── environment/
+ │ └── Dockerfile # Container the agent runs in
+ ├── tests/
+ │ └── test.sh # Verification → writes reward.txt
+ └── solution/ # Optional (ignored)
+
+HUD output:
+ hud-harbor-{dataset}/
+ ├── env.py # Environment with run-task scenario
+ ├── Dockerfile.hud # Harbor Dockerfile + HUD MCP layer
+ ├── pyproject.toml
+ └── tasks/ # All task data baked into image
+ ├── task-a/
+ │ ├── instruction.md
+ │ └── tests/test.sh
+ └── task-b/
+ ├── instruction.md
+ └── tests/test.sh
+ taskset.json # v5 taskset referencing the env
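+
+Each generated taskset entry has the shape (values illustrative):
+ {
+     "env": {"name": "hud-harbor-{dataset}"},
+     "scenario": "hud-harbor-{dataset}:run-task",
+     "args": {"task_id": "task-a"},
+     "metadata": {"harbor_source": "..."}
+ }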
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+import tomllib
+from dataclasses import dataclass
+from pathlib import Path # noqa: TC003 - used at runtime
+from typing import Any
+
+from .base import BaseConverter, ConvertResult, GeneratedEnvironment
+
+__all__ = ["HarborConverter"]
+
+LOGGER = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Helpers
+# =============================================================================
+
+
+def _is_harbor_task(path: Path) -> bool:
+ """Check if a directory looks like a valid Harbor task."""
+ return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists()
+
+
+def _hash_directory(path: Path) -> str:
+ """Content-hash a directory for grouping tasks by identical environments."""
+ hasher = hashlib.sha256()
+ if not path.exists():
+ return "empty"
+ for file_path in sorted(path.rglob("*")):
+ if file_path.is_file():
+ hasher.update(str(file_path.relative_to(path)).encode())
+ hasher.update(file_path.read_bytes())
+ return hasher.hexdigest()[:16]
+
+
+def _normalize_name(name: str) -> str:
+ """Normalize a dataset name to a valid HUD environment name."""
+ normalized = name.strip().lower()
+ normalized = normalized.replace(" ", "-").replace("_", "-")
+ normalized = re.sub(r"[^a-z0-9-]", "", normalized)
+ normalized = re.sub(r"-+", "-", normalized)
+ return normalized.strip("-") or "converted"
+
+
+def _find_dockerfile(env_dir: Path) -> str | None:
+ """Read the Dockerfile from a Harbor environment directory."""
+ for name in ("Dockerfile", "dockerfile"):
+ path = env_dir / name
+ if path.exists():
+ return path.read_text(encoding="utf-8")
+ return None
+
+
+def _adapt_harbor_dockerfile(content: str) -> str:
+ """Comment out CMD/ENTRYPOINT lines from a Harbor Dockerfile.
+
+ These are replaced by the HUD MCP server entrypoint.
+ """
+ lines = content.splitlines()
+ adapted: list[str] = []
+ for line in lines:
+ stripped = line.strip().upper()
+ if stripped.startswith(("CMD ", "CMD[", "ENTRYPOINT ", "ENTRYPOINT[")):
+ adapted.append(f"# [harbor original] {line}")
+ else:
+ adapted.append(line)
+ return "\n".join(adapted)
+
+
+# =============================================================================
+# Data classes
+# =============================================================================
+
+
+@dataclass
+class HarborTask:
+ """Parsed Harbor task."""
+
+ task_id: str
+ directory: Path
+ instruction: str
+ config: dict[str, Any]
+ env_hash: str
+
+
+def _parse_task(task_dir: Path) -> HarborTask | None:
+ """Parse a Harbor task directory into a HarborTask."""
+ try:
+ instruction = (task_dir / "instruction.md").read_text(encoding="utf-8")
+ except Exception:
+ LOGGER.warning("Failed to read instruction.md in %s", task_dir)
+ return None
+
+ try:
+ raw = (task_dir / "task.toml").read_text(encoding="utf-8")
+ config: dict[str, Any] = tomllib.loads(raw)
+ except Exception:
+ LOGGER.warning("Failed to parse task.toml in %s", task_dir)
+ config = {}
+
+ env_dir = task_dir / "environment"
+ env_hash = _hash_directory(env_dir) if env_dir.exists() else "no-env"
+
+ return HarborTask(
+ task_id=task_dir.name,
+ directory=task_dir,
+ instruction=instruction,
+ config=config,
+ env_hash=env_hash,
+ )
+
+
+# =============================================================================
+# Templates
+# =============================================================================
+
+# fmt: off
+
+# Header + shared body split so the scenario signature can vary.
+_ENV_PY_HEADER = '''\
+"""{env_name} - HUD environment converted from Harbor.
+
+Source: {source_path}
+Tasks: {task_count}
+
+This environment runs Harbor-format tasks. Each task has:
+- instruction.md: the agent prompt
+- tests/test.sh: verification script that writes reward to /logs/verifier/
+
+The run-task scenario reads the instruction, lets the agent work,
+then executes the test script and parses the reward.
+"""
+
+import json
+import logging
+import subprocess
+from pathlib import Path
+{extra_imports}
+from hud import Environment
+from hud.tools import BashTool, EditTool
+from hud.tools.filesystem import GlobTool, GrepTool, ListTool, ReadTool
+
+LOGGER = logging.getLogger(__name__)
+
+TASKS_DIR = Path("/harbor/tasks")
+
+env = Environment("{env_name}")
+
+# Standard coding tools - agents interact via bash (matching Harbor's model)
+env.add_tool(BashTool())
+env.add_tool(EditTool())
+env.add_tool(ReadTool())
+env.add_tool(GrepTool())
+env.add_tool(GlobTool())
+env.add_tool(ListTool())
+
+'''
+
+# Single task: task_id is optional, defaults to the only task.
+_SCENARIO_SINGLE = """\
+@env.scenario("run-task")
+async def run_task(task_id: str = "{default_task_id}"):
+"""
+
+# Multiple tasks: task_id is required, typed as a Literal.
+_SCENARIO_MULTI = """\
+TaskId = Literal[{task_id_literal}]
+
+
+@env.scenario("run-task")
+async def run_task(task_id: TaskId):
+"""
+
+_SCENARIO_BODY = '''\
+ """Run a Harbor task by ID.
+
+ Reads /harbor/tasks/<task_id>/instruction.md as the prompt.
+ After the agent works, runs tests/test.sh and parses
+ /logs/verifier/reward.txt or reward.json for the reward.
+ """
+ task_dir = TASKS_DIR / str(task_id)
+ if not task_dir.exists():
+ available = [d.name for d in TASKS_DIR.iterdir() if d.is_dir()]
+ raise ValueError(
+ f"Task '{{task_id}}' not found. Available: {{available}}"
+ )
+
+ # Read the task instruction
+ instruction = (task_dir / "instruction.md").read_text(encoding="utf-8")
+
+ # Setup: yield prompt to the agent
+ answer = yield instruction
+
+ # Ensure log output directory exists
+ logs_dir = Path("/logs/verifier")
+ logs_dir.mkdir(parents=True, exist_ok=True)
+
+ # Harbor mounts the task's tests/ directory at /tests/ — replicate that
+ tests_link = Path("/tests")
+ task_tests = task_dir / "tests"
+ if task_tests.is_dir():
+ if tests_link.is_symlink() or tests_link.exists():
+ tests_link.unlink()
+ tests_link.symlink_to(task_tests)
+
+ # Evaluate: run the test script
+ test_script = task_dir / "tests" / "test.sh"
+ if test_script.exists():
+ try:
+ result = subprocess.run(
+ ["bash", str(test_script)],
+ cwd="/app",
+ capture_output=True,
+ text=True,
+ timeout={verifier_timeout},
+ check=False,
+ )
+ if result.stdout:
+ LOGGER.info("test.sh stdout for %s:\\n%s", task_id, result.stdout[-2000:])
+ if result.stderr:
+ LOGGER.info("test.sh stderr for %s:\\n%s", task_id, result.stderr[-2000:])
+ if result.returncode != 0:
+ LOGGER.warning(
+ "test.sh exited with code %d for task %s",
+ result.returncode, task_id,
+ )
+ except subprocess.TimeoutExpired:
+ LOGGER.warning("Test script timed out for task %s", task_id)
+ except Exception as exc:
+ LOGGER.warning("Test script failed for task %s: %s", task_id, exc)
+ else:
+ LOGGER.warning("No test script found at %s", test_script)
+
+ # Parse and yield reward
+ yield _parse_harbor_reward()
+
+
+def _parse_harbor_reward() -> float:
+ """Parse reward from Harbor standard output locations.
+
+ Harbor test scripts write results to /logs/verifier/ as either:
+ - reward.txt: a single float value
+ - reward.json: {{"reward": float}} or just a float
+ """
+ reward_txt = Path("/logs/verifier/reward.txt")
+ reward_json = Path("/logs/verifier/reward.json")
+
+ if reward_txt.exists():
+ try:
+ return float(reward_txt.read_text(encoding="utf-8").strip())
+ except ValueError:
+ pass
+
+ if reward_json.exists():
+ try:
+ data = json.loads(reward_json.read_text(encoding="utf-8"))
+ if isinstance(data, dict):
+ return float(data.get("reward", 0.0))
+ return float(data)
+ except (ValueError, json.JSONDecodeError):
+ pass
+
+ return 0.0
+'''
+
+
+def _build_env_py(
+ env_name: str,
+ source_path: str,
+ task_ids: list[str],
+ verifier_timeout: int,
+) -> str:
+ """Build the env.py content, adapting the scenario signature to task count."""
+ if len(task_ids) == 1:
+ extra_imports = ""
+ scenario = _SCENARIO_SINGLE.format(default_task_id=task_ids[0])
+ else:
+ extra_imports = "\nfrom typing import Literal\n"
+ literal_values = ", ".join(f'"{tid}"' for tid in sorted(task_ids))
+ scenario = _SCENARIO_MULTI.format(task_id_literal=literal_values)
+
+ header = _ENV_PY_HEADER.format(
+ env_name=env_name,
+ source_path=source_path,
+ task_count=len(task_ids),
+ extra_imports=extra_imports,
+ )
+ body = _SCENARIO_BODY.format(verifier_timeout=verifier_timeout)
+ return header + scenario + body
+
+# fmt: on
+
+# Shared snippet: install uv standalone (works on any base image with curl or
+# apt), then use uv to bootstrap Python and sync dependencies.
+_HUD_LAYER = """\
+# ============================================================
+# HUD MCP server layer
+# ============================================================
+WORKDIR /hud
+
+# Install uv standalone (no pip/python required on the base image)
+RUN command -v curl >/dev/null 2>&1 || \\
+ (apt-get update -qq && \\
+ apt-get install -y -qq --no-install-recommends curl ca-certificates && \\
+ rm -rf /var/lib/apt/lists/*) && \\
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:$PATH"
+
+COPY pyproject.toml uv.lock* ./
+RUN uv sync --frozen --no-dev --no-install-project 2>/dev/null || \\
+ uv sync --no-dev --no-install-project
+
+# Harbor task data (instructions + test scripts baked into image)
+COPY tasks/ /harbor/tasks/
+
+# Ensure standard directories exist and are writable at runtime
+# (MCP server may run as non-root; Harbor tasks expect /app writable)
+RUN mkdir -p /logs/verifier /workspace /app && chmod 777 /logs/verifier /workspace /app
+
+COPY env.py ./
+
+CMD ["uv", "run", "--no-project", "python", "-m", "hud", "dev", "env:env", "--stdio"]
+"""
+
+DOCKERFILE_WITH_BASE_TEMPLATE = (
+ """\
+# ============================================================
+# Harbor environment base
+# Source: {source}
+# ============================================================
+{base_dockerfile}
+"""
+ + _HUD_LAYER
+)
+
+DOCKERFILE_FALLBACK_TEMPLATE = (
+ """\
+FROM python:3.11-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends \\
+ curl git build-essential && rm -rf /var/lib/apt/lists/*
+"""
+ + _HUD_LAYER
+)
+
+PYPROJECT_TEMPLATE = """\
+[project]
+name = "{name}"
+version = "0.1.0"
+requires-python = ">=3.10"
+dependencies = ["hud-python", "openai"]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+"""
+
+
+# =============================================================================
+# Converter
+# =============================================================================
+
+
+class HarborConverter(BaseConverter):
+ """Convert Harbor tasks/datasets to HUD format.
+
+ Handles:
+ - Single task directory (has task.toml directly)
+ - Dataset directory (subdirectories are Harbor tasks)
+ - Multi-environment datasets (tasks grouped by Dockerfile hash)
+ """
+
+ name = "harbor"
+ description = "Harbor framework (task.toml + instruction.md + environment/ + tests/)"
+
+ def detect(self, path: Path) -> bool:
+ if _is_harbor_task(path):
+ return True
+ # Check for dataset (directory containing task subdirectories)
+ if path.is_dir():
+ return any(_is_harbor_task(d) for d in path.iterdir() if d.is_dir())
+ return False
+
+ def convert(self, path: Path) -> ConvertResult:
+ path = path.resolve()
+
+ # Discover tasks
+ if _is_harbor_task(path):
+ task_dirs = [path]
+ dataset_name = path.parent.name
+ else:
+ task_dirs = sorted(d for d in path.iterdir() if d.is_dir() and _is_harbor_task(d))
+ dataset_name = path.name
+
+ if not task_dirs:
+ raise ValueError(f"No Harbor tasks found in {path}")
+
+ # Parse all tasks
+ tasks: list[HarborTask] = []
+ skipped = 0
+ for td in task_dirs:
+ parsed = _parse_task(td)
+ if parsed:
+ tasks.append(parsed)
+ else:
+ skipped += 1
+
+ if not tasks:
+ raise ValueError("All Harbor tasks failed to parse")
+
+ if skipped:
+ LOGGER.warning("Skipped %d task(s) that failed to parse", skipped)
+
+ LOGGER.info("Parsed %d Harbor task(s) from %s", len(tasks), path)
+
+ # Group by environment Dockerfile hash
+ groups: dict[str, list[HarborTask]] = {}
+ for task in tasks:
+ groups.setdefault(task.env_hash, []).append(task)
+
+ LOGGER.info("Found %d unique environment group(s)", len(groups))
+
+ # Generate environments and taskset
+ environments: list[GeneratedEnvironment] = []
+ taskset: list[dict[str, Any]] = []
+ base_name = f"hud-harbor-{_normalize_name(dataset_name)}"
+
+ # Sort groups by size (largest first) for consistent naming
+ sorted_groups = sorted(groups.items(), key=lambda x: -len(x[1]))
+
+ for idx, (_env_hash, group_tasks) in enumerate(sorted_groups, start=1):
+ # Naming: single group gets base_name, multiple get suffix
+ env_name = base_name if len(sorted_groups) == 1 else f"{base_name}-g{idx}"
+
+ # Use representative task for shared config
+ rep_task = group_tasks[0]
+ env_dir = rep_task.directory / "environment"
+ dockerfile_content = _find_dockerfile(env_dir) if env_dir.exists() else None
+
+ # Extract verifier timeout from config
+ verifier_timeout = 600
+ verifier_cfg = rep_task.config.get("verifier", {})
+ if isinstance(verifier_cfg, dict):
+ timeout_val = verifier_cfg.get("timeout_sec")
+ if timeout_val is not None:
+ verifier_timeout = int(timeout_val)
+
+ # --- Generate env.py ---
+ # Use forward slashes in source_path to avoid unicode escape issues on Windows
+ task_ids = [t.task_id for t in group_tasks]
+ env_py = _build_env_py(
+ env_name=env_name,
+ source_path=path.as_posix(),
+ task_ids=task_ids,
+ verifier_timeout=verifier_timeout,
+ )
+
+ # --- Generate Dockerfile.hud ---
+ if dockerfile_content:
+ adapted = _adapt_harbor_dockerfile(dockerfile_content)
+ dockerfile = DOCKERFILE_WITH_BASE_TEMPLATE.format(
+ source=env_dir.as_posix(),
+ base_dockerfile=adapted,
+ )
+ else:
+ dockerfile = DOCKERFILE_FALLBACK_TEMPLATE
+
+ # --- Generate pyproject.toml ---
+ pyproject = PYPROJECT_TEMPLATE.format(name=env_name)
+
+ # --- Map task IDs to source directories ---
+ task_dir_map = {t.task_id: t.directory for t in group_tasks}
+
+ # Build context: non-Dockerfile files from environment/ dir
+ # (e.g., warriors/*.red that the Dockerfile COPYs)
+ build_ctx = env_dir if env_dir.exists() else None
+
+ environments.append(
+ GeneratedEnvironment(
+ name=env_name,
+ env_py=env_py,
+ dockerfile=dockerfile,
+ pyproject_toml=pyproject,
+ task_dirs=task_dir_map,
+ build_context_source=build_ctx,
+ )
+ )
+
+ # --- Generate v5 taskset entries ---
+ for task in group_tasks:
+ metadata: dict[str, Any] = {
+ "harbor_source": task.directory.relative_to(path.parent).as_posix(),
+ }
+ # Pull metadata from task.toml [metadata] section
+ toml_meta = task.config.get("metadata", {})
+ if isinstance(toml_meta, dict):
+ metadata.update(toml_meta)
+
+ taskset.append(
+ {
+ "env": {"name": env_name},
+ "scenario": f"{env_name}:run-task",
+ "args": {"task_id": task.task_id},
+ "metadata": metadata,
+ }
+ )
+
+ # Build summary lines
+ summary = [
+ f"Converted {len(tasks)} Harbor task(s) into {len(environments)} environment(s).",
+ ]
+ if skipped:
+ summary.append(f"Skipped {skipped} task(s) that failed to parse.")
+ summary.append("")
+ for env_gen in environments:
+ task_count = len(env_gen.task_dirs)
+ summary.append(f" {env_gen.name}/ ({task_count} tasks)")
+ summary.extend(
+ [
+ "",
+ "Next steps:",
+ " 1. hud deploy /",
+ " 2. hud eval taskset.json",
+ ]
+ )
+
+ return ConvertResult(
+ environments=environments,
+ taskset=taskset,
+ summary=summary,
+ )
diff --git a/hud/cli/convert/tests/__init__.py b/hud/cli/convert/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/hud/cli/convert/tests/conftest.py b/hud/cli/convert/tests/conftest.py
new file mode 100644
index 00000000..e6f7b683
--- /dev/null
+++ b/hud/cli/convert/tests/conftest.py
@@ -0,0 +1,258 @@
+"""Shared fixtures for Harbor converter tests.
+
+Provides builders that create synthetic Harbor-format task directories
+matching the terminal-bench-2 layout:
+
+ task_name/
+ ├── task.toml
+ ├── instruction.md
+ ├── environment/
+ │ └── Dockerfile
+ ├── tests/
+ │ └── test.sh
+ └── solution/ # optional, should be ignored by converter
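+
+Example (sketch):
+
+ task_dir = make_harbor_task(tmp_path, "my-task", instruction="# Do the thing")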
+"""
+
+from __future__ import annotations
+
+import textwrap
+from pathlib import Path # noqa: TC003 - used at runtime
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# task.toml templates (matching real terminal-bench style)
+# ---------------------------------------------------------------------------
+
+_DEFAULT_TASK_TOML = textwrap.dedent("""\
+ [metadata]
+ category = "systems"
+ difficulty = "medium"
+ tags = ["bash", "linux"]
+
+ [verifier]
+ timeout_sec = 120
+""")
+
+_TASK_TOML_WITH_IMAGE = textwrap.dedent("""\
+ [metadata]
+ category = "machine-learning"
+ difficulty = "hard"
+ tags = ["python", "ml"]
+
+ [docker]
+ image = "alexgshaw/caffe-cifar-10:20251031"
+
+ [verifier]
+ timeout_sec = 300
+""")
+
+
+# ---------------------------------------------------------------------------
+# Dockerfile templates
+# ---------------------------------------------------------------------------
+
+_SIMPLE_DOCKERFILE = textwrap.dedent("""\
+ FROM python:3.11-slim
+ RUN apt-get update && apt-get install -y curl git
+ WORKDIR /workspace
+ CMD ["bash"]
+""")
+
+_ML_DOCKERFILE = textwrap.dedent("""\
+ FROM nvidia/cuda:12.0-runtime-ubuntu22.04
+ RUN apt-get update && apt-get install -y python3 python3-pip
+ RUN pip3 install torch numpy
+ WORKDIR /workspace
+ ENTRYPOINT ["/bin/bash"]
+""")
+
+
+# ---------------------------------------------------------------------------
+# Helper to build a single task directory
+# ---------------------------------------------------------------------------
+
+
+def make_harbor_task(
+ parent: Path,
+ name: str,
+ instruction: str = "Solve the task.",
+ task_toml: str = _DEFAULT_TASK_TOML,
+ dockerfile: str | None = _SIMPLE_DOCKERFILE,
+ test_script: str = '#!/bin/bash\necho "1.0" > /logs/verifier/reward.txt\n',
+ include_solution: bool = False,
+) -> Path:
+ """Create a synthetic Harbor task directory under *parent*.
+
+ Returns the task directory path.
+ """
+ task_dir = parent / name
+ task_dir.mkdir(parents=True, exist_ok=True)
+
+ (task_dir / "instruction.md").write_text(instruction, encoding="utf-8")
+ (task_dir / "task.toml").write_text(task_toml, encoding="utf-8")
+
+ if dockerfile is not None:
+ env_dir = task_dir / "environment"
+ env_dir.mkdir(exist_ok=True)
+ (env_dir / "Dockerfile").write_text(dockerfile, encoding="utf-8")
+
+ tests_dir = task_dir / "tests"
+ tests_dir.mkdir(exist_ok=True)
+ (tests_dir / "test.sh").write_text(test_script, encoding="utf-8")
+
+ if include_solution:
+ sol_dir = task_dir / "solution"
+ sol_dir.mkdir(exist_ok=True)
+ (sol_dir / "solve.sh").write_text("#!/bin/bash\necho done\n", encoding="utf-8")
+
+ return task_dir
+
+
+# ---------------------------------------------------------------------------
+# Pytest fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture()
+def single_task(tmp_path: Path) -> Path:
+ """A single Harbor task directory (like a standalone task)."""
+ return make_harbor_task(
+ tmp_path,
+ "cancel-async-tasks",
+ instruction=(
+ "# Cancel Async Tasks\n\n"
+ "Write a Python script that launches 5 asyncio tasks and cancels "
+ "all of them within 2 seconds.\n"
+ ),
+ )
+
+
+@pytest.fixture()
+def dataset_same_env(tmp_path: Path) -> Path:
+ """A dataset directory with 3 tasks sharing the same Dockerfile."""
+ dataset = tmp_path / "terminal-bench-sample"
+ dataset.mkdir()
+
+ for name in ("cancel-async-tasks", "build-pmars", "chess-best-move"):
+ make_harbor_task(
+ dataset,
+ name,
+ instruction=f"# {name}\n\nSolve the {name} task.\n",
+ )
+
+ return dataset
+
+
+@pytest.fixture()
+def dataset_multi_env(tmp_path: Path) -> Path:
+ """A dataset directory with tasks split across 2 different Dockerfiles."""
+ dataset = tmp_path / "mixed-bench"
+ dataset.mkdir()
+
+ # Group 1: simple python tasks (same Dockerfile)
+ for name in ("cancel-async-tasks", "build-pmars"):
+ make_harbor_task(
+ dataset,
+ name,
+ instruction=f"# {name}\n\nDo the thing.\n",
+ dockerfile=_SIMPLE_DOCKERFILE,
+ )
+
+ # Group 2: ML tasks (different Dockerfile)
+ for name in ("caffe-cifar-10", "sam-cell-seg"):
+ make_harbor_task(
+ dataset,
+ name,
+ instruction=f"# {name}\n\nTrain the model.\n",
+ task_toml=_TASK_TOML_WITH_IMAGE,
+ dockerfile=_ML_DOCKERFILE,
+ )
+
+ return dataset
+
+
+@pytest.fixture()
+def dataset_no_dockerfile(tmp_path: Path) -> Path:
+ """A dataset where tasks have no environment/Dockerfile."""
+ dataset = tmp_path / "no-docker-bench"
+ dataset.mkdir()
+
+ for name in ("task-a", "task-b"):
+ make_harbor_task(
+ dataset,
+ name,
+ instruction=f"# {name}\n\nSimple task.\n",
+ dockerfile=None, # No Dockerfile
+ )
+
+ return dataset
+
+
+@pytest.fixture()
+def dataset_with_solutions(tmp_path: Path) -> Path:
+ """A dataset where tasks include solution/ directories."""
+ dataset = tmp_path / "solved-bench"
+ dataset.mkdir()
+
+ for name in ("task-x", "task-y"):
+ make_harbor_task(
+ dataset,
+ name,
+ instruction=f"# {name}\n\nSolve it.\n",
+ include_solution=True,
+ )
+
+ return dataset
+
+
+@pytest.fixture()
+def task_with_build_context(tmp_path: Path) -> Path:
+ """A single task whose environment/ dir has extra build context files.
+
+ Mimics build-pmars which has warriors/*.red files that the
+ Dockerfile COPYs into the image.
+ """
+ task_dir = tmp_path / "build-pmars"
+ task_dir.mkdir()
+
+ (task_dir / "instruction.md").write_text(
+ "# Build pMARS\n\nBuild the pMARS simulator.\n", encoding="utf-8"
+ )
+ (task_dir / "task.toml").write_text(
+ textwrap.dedent("""\
+ [metadata]
+ category = "software-engineering"
+ difficulty = "medium"
+
+ [verifier]
+ timeout_sec = 900
+ """),
+ encoding="utf-8",
+ )
+
+ # environment/ with Dockerfile AND extra build context files
+ env_dir = task_dir / "environment"
+ env_dir.mkdir()
+ (env_dir / "Dockerfile").write_text(
+ textwrap.dedent("""\
+ FROM debian:13.0-slim
+ RUN apt-get update && apt-get install -y tmux
+ WORKDIR /app
+ COPY warriors/flashpaper.red warriors/rave.red /app/
+ """),
+ encoding="utf-8",
+ )
+ warriors = env_dir / "warriors"
+ warriors.mkdir()
+ (warriors / "flashpaper.red").write_text(";redcode\nMOV 0, 1\n", encoding="utf-8")
+ (warriors / "rave.red").write_text(";redcode\nSPL 0, 0\n", encoding="utf-8")
+
+ # tests/
+ tests_dir = task_dir / "tests"
+ tests_dir.mkdir()
+ (tests_dir / "test.sh").write_text(
+ '#!/bin/bash\necho "1.0" > /logs/verifier/reward.txt\n', encoding="utf-8"
+ )
+
+ return task_dir
diff --git a/hud/cli/convert/tests/test_harbor.py b/hud/cli/convert/tests/test_harbor.py
new file mode 100644
index 00000000..64c6c6b2
--- /dev/null
+++ b/hud/cli/convert/tests/test_harbor.py
@@ -0,0 +1,751 @@
+"""Tests for the Harbor → HUD converter.
+
+Exercises HarborConverter.detect(), HarborConverter.convert(), and the
+write_result() writer using synthetic terminal-bench-style fixtures
+defined in conftest.py.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING
+
+import pytest
+
+if TYPE_CHECKING:
+ from pathlib import Path
+
+from hud.cli.convert import detect_format, get_converter, list_formats, write_result
+from hud.cli.convert.harbor import (
+ HarborConverter,
+ _adapt_harbor_dockerfile,
+ _find_dockerfile,
+ _hash_directory,
+ _is_harbor_task,
+ _normalize_name,
+ _parse_task,
+)
+
+from .conftest import make_harbor_task
+
+# ============================================================================
+# Helper unit tests
+# ============================================================================
+
+
+class TestNormalizeName:
+ def test_simple(self) -> None:
+ assert _normalize_name("terminal-bench") == "terminal-bench"
+
+ def test_underscores(self) -> None:
+ assert _normalize_name("my_cool_bench") == "my-cool-bench"
+
+ def test_spaces(self) -> None:
+ assert _normalize_name("My Cool Bench") == "my-cool-bench"
+
+ def test_special_chars(self) -> None:
+ assert _normalize_name("bench@2.0!") == "bench20"
+
+ def test_empty(self) -> None:
+ assert _normalize_name("") == "converted"
+
+ def test_only_special_chars(self) -> None:
+ assert _normalize_name("@#$") == "converted"
+
+ def test_leading_trailing_dashes(self) -> None:
+ assert _normalize_name("--hello--") == "hello"
+
+ def test_consecutive_dashes(self) -> None:
+ assert _normalize_name("a---b") == "a-b"
+
+
+class TestAdaptDockerfile:
+ def test_comments_cmd(self) -> None:
+ result = _adapt_harbor_dockerfile('CMD ["bash"]')
+ assert result == '# [harbor original] CMD ["bash"]'
+
+ def test_comments_entrypoint(self) -> None:
+ result = _adapt_harbor_dockerfile('ENTRYPOINT ["/bin/bash"]')
+ assert result == '# [harbor original] ENTRYPOINT ["/bin/bash"]'
+
+ def test_preserves_other_lines(self) -> None:
+ dockerfile = "FROM python:3.11\nRUN echo hi\nCMD bash"
+ result = _adapt_harbor_dockerfile(dockerfile)
+ lines = result.splitlines()
+ assert lines[0] == "FROM python:3.11"
+ assert lines[1] == "RUN echo hi"
+ assert lines[2] == "# [harbor original] CMD bash"
+
+ def test_case_insensitive_match(self) -> None:
+ # The implementation uses .upper() so indented CMD should match
+ result = _adapt_harbor_dockerfile(" CMD bash")
+ assert result.startswith("# [harbor original]")
+ assert "CMD bash" in result
+
+ def test_no_cmd_or_entrypoint(self) -> None:
+ dockerfile = "FROM python:3.11\nRUN apt-get update"
+ assert _adapt_harbor_dockerfile(dockerfile) == dockerfile
+
+
+class TestHashDirectory:
+ def test_same_content_same_hash(self, tmp_path: Path) -> None:
+ dir_a = tmp_path / "a"
+ dir_a.mkdir()
+ (dir_a / "file.txt").write_text("hello")
+
+ dir_b = tmp_path / "b"
+ dir_b.mkdir()
+ (dir_b / "file.txt").write_text("hello")
+
+ assert _hash_directory(dir_a) == _hash_directory(dir_b)
+
+ def test_different_content_different_hash(self, tmp_path: Path) -> None:
+ dir_a = tmp_path / "a"
+ dir_a.mkdir()
+ (dir_a / "file.txt").write_text("hello")
+
+ dir_b = tmp_path / "b"
+ dir_b.mkdir()
+ (dir_b / "file.txt").write_text("world")
+
+ assert _hash_directory(dir_a) != _hash_directory(dir_b)
+
+ def test_nonexistent_returns_empty(self, tmp_path: Path) -> None:
+ assert _hash_directory(tmp_path / "nonexistent") == "empty"
+
+ def test_empty_directory(self, tmp_path: Path) -> None:
+ empty = tmp_path / "empty"
+ empty.mkdir()
+ # Empty dir has a deterministic hash (sha256 of nothing)
+ result = _hash_directory(empty)
+ assert isinstance(result, str)
+ assert len(result) == 16
+
+
+class TestFindDockerfile:
+ def test_finds_dockerfile(self, tmp_path: Path) -> None:
+ (tmp_path / "Dockerfile").write_text("FROM python:3.11")
+ assert _find_dockerfile(tmp_path) == "FROM python:3.11"
+
+ def test_finds_lowercase(self, tmp_path: Path) -> None:
+ (tmp_path / "dockerfile").write_text("FROM alpine")
+ assert _find_dockerfile(tmp_path) == "FROM alpine"
+
+ def test_returns_none_when_missing(self, tmp_path: Path) -> None:
+ assert _find_dockerfile(tmp_path) is None
+
+
+class TestIsHarborTask:
+ def test_valid_task(self, single_task: Path) -> None:
+ assert _is_harbor_task(single_task) is True
+
+ def test_missing_instruction(self, tmp_path: Path) -> None:
+ task = tmp_path / "bad-task"
+ task.mkdir()
+ (task / "task.toml").write_text("[metadata]\n")
+ assert _is_harbor_task(task) is False
+
+ def test_missing_task_toml(self, tmp_path: Path) -> None:
+ task = tmp_path / "bad-task"
+ task.mkdir()
+ (task / "instruction.md").write_text("# Do something")
+ assert _is_harbor_task(task) is False
+
+ def test_not_a_directory(self, tmp_path: Path) -> None:
+ f = tmp_path / "file.txt"
+ f.write_text("not a dir")
+ assert _is_harbor_task(f) is False
+
+
+class TestParseTask:
+ def test_parses_valid_task(self, single_task: Path) -> None:
+ result = _parse_task(single_task)
+ assert result is not None
+ assert result.task_id == "cancel-async-tasks"
+ assert "Cancel Async Tasks" in result.instruction
+ assert result.config.get("metadata", {}).get("category") == "systems"
+
+ def test_parses_verifier_timeout(self, single_task: Path) -> None:
+ result = _parse_task(single_task)
+ assert result is not None
+ assert result.config["verifier"]["timeout_sec"] == 120
+
+ def test_returns_none_for_bad_instruction(self, tmp_path: Path) -> None:
+ task_dir = tmp_path / "bad"
+ task_dir.mkdir()
+ (task_dir / "task.toml").write_text("[metadata]\n")
+ # instruction.md missing
+ assert _parse_task(task_dir) is None
+
+ def test_handles_bad_toml_gracefully(self, tmp_path: Path) -> None:
+ task_dir = tmp_path / "broken-toml"
+ task_dir.mkdir()
+ (task_dir / "instruction.md").write_text("# Hello")
+ (task_dir / "task.toml").write_text("this is not valid toml {{{")
+ result = _parse_task(task_dir)
+ assert result is not None
+ # Config should be empty dict when toml fails
+ assert result.config == {}
+
+
+# ============================================================================
+# HarborConverter.detect()
+# ============================================================================
+
+
+class TestHarborConverterDetect:
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_detects_single_task(self, single_task: Path) -> None:
+ assert self.converter.detect(single_task) is True
+
+ def test_detects_dataset(self, dataset_same_env: Path) -> None:
+ assert self.converter.detect(dataset_same_env) is True
+
+ def test_rejects_empty_dir(self, tmp_path: Path) -> None:
+ assert self.converter.detect(tmp_path) is False
+
+ def test_rejects_non_harbor_dir(self, tmp_path: Path) -> None:
+ (tmp_path / "random.txt").write_text("nope")
+ assert self.converter.detect(tmp_path) is False
+
+
+# ============================================================================
+# HarborConverter.convert()
+# ============================================================================
+
+
+class TestHarborConverterConvertSingleTask:
+ """Convert a single Harbor task directory."""
+
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_single_task_produces_one_env(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ assert len(result.environments) == 1
+ assert len(result.taskset) == 1
+
+ def test_env_name_uses_parent_dir(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ env = result.environments[0]
+ # Parent dir name is the tmp_path random name, but it gets normalized
+ assert env.name.startswith("hud-harbor-")
+
+ def test_env_py_contains_scenario(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ env_py = result.environments[0].env_py
+ assert "@env.scenario" in env_py
+ assert "run-task" in env_py
+
+ def test_env_py_has_correct_timeout(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ env_py = result.environments[0].env_py
+ assert "timeout=120" in env_py
+
+ def test_taskset_references_env(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ entry = result.taskset[0]
+ env_name = result.environments[0].name
+ assert entry["scenario"] == f"{env_name}:run-task"
+ assert entry["args"]["task_id"] == "cancel-async-tasks"
+
+ def test_task_dirs_map(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ env = result.environments[0]
+ assert "cancel-async-tasks" in env.task_dirs
+ assert env.task_dirs["cancel-async-tasks"] == single_task
+
+ def test_summary_not_empty(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ assert len(result.summary) > 0
+ assert any("1" in line for line in result.summary)
+
+
+class TestHarborConverterConvertDataset:
+ """Convert a dataset directory with multiple tasks sharing the same env."""
+
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_same_env_groups_into_one(self, dataset_same_env: Path) -> None:
+ result = self.converter.convert(dataset_same_env)
+ assert len(result.environments) == 1
+ assert len(result.taskset) == 3
+
+ def test_all_task_ids_present(self, dataset_same_env: Path) -> None:
+ result = self.converter.convert(dataset_same_env)
+ task_ids = {e["args"]["task_id"] for e in result.taskset}
+ assert task_ids == {"cancel-async-tasks", "build-pmars", "chess-best-move"}
+
+ def test_env_name_from_dataset(self, dataset_same_env: Path) -> None:
+ result = self.converter.convert(dataset_same_env)
+ env = result.environments[0]
+ assert env.name == "hud-harbor-terminal-bench-sample"
+
+
+class TestHarborConverterConvertMultiEnv:
+ """Convert a dataset with tasks split across different Dockerfiles."""
+
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_creates_two_envs(self, dataset_multi_env: Path) -> None:
+ result = self.converter.convert(dataset_multi_env)
+ assert len(result.environments) == 2
+ assert len(result.taskset) == 4
+
+ def test_env_names_have_group_suffix(self, dataset_multi_env: Path) -> None:
+ result = self.converter.convert(dataset_multi_env)
+ names = {e.name for e in result.environments}
+ assert all(n.startswith("hud-harbor-mixed-bench") for n in names)
+ # With multiple groups, names should have -g1, -g2 suffixes
+ assert any("-g1" in n for n in names)
+ assert any("-g2" in n for n in names)
+
+ def test_each_env_has_correct_tasks(self, dataset_multi_env: Path) -> None:
+ result = self.converter.convert(dataset_multi_env)
+ for env in result.environments:
+ task_ids = set(env.task_dirs.keys())
+ # Each group should have exactly 2 tasks
+ assert len(task_ids) == 2
+
+ def test_ml_env_has_nvidia_dockerfile(self, dataset_multi_env: Path) -> None:
+ result = self.converter.convert(dataset_multi_env)
+ # One of the environments should reference nvidia in its dockerfile
+ dockerfiles = [e.dockerfile for e in result.environments]
+ assert any("nvidia" in d for d in dockerfiles)
+
+ def test_simple_env_has_python_dockerfile(self, dataset_multi_env: Path) -> None:
+ result = self.converter.convert(dataset_multi_env)
+ dockerfiles = [e.dockerfile for e in result.environments]
+ assert any("python:3.11-slim" in d for d in dockerfiles)
+
+
+class TestBuildContextSource:
+ """Verify build_context_source is set for tasks with environment/ dirs."""
+
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_build_context_source_set(self, task_with_build_context: Path) -> None:
+ result = self.converter.convert(task_with_build_context)
+ env = result.environments[0]
+ assert env.build_context_source is not None
+ assert env.build_context_source.is_dir()
+
+ def test_build_context_source_none_when_no_env_dir(self, dataset_no_dockerfile: Path) -> None:
+ result = self.converter.convert(dataset_no_dockerfile)
+ env = result.environments[0]
+ assert env.build_context_source is None
+
+
+class TestWriteBuildContext:
+ """Verify that build context files from environment/ are copied to env root."""
+
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_warriors_copied_to_env_root(
+ self, task_with_build_context: Path, tmp_path: Path
+ ) -> None:
+ result = self.converter.convert(task_with_build_context)
+ out = tmp_path / "output"
+ write_result(result, out)
+
+ env = result.environments[0]
+ env_dir = out / env.name
+
+ # warriors/ dir should exist at env root (Docker build context)
+ assert (env_dir / "warriors").is_dir()
+ assert (env_dir / "warriors" / "flashpaper.red").is_file()
+ assert (env_dir / "warriors" / "rave.red").is_file()
+
+ def test_dockerfile_not_duplicated(self, task_with_build_context: Path, tmp_path: Path) -> None:
+ result = self.converter.convert(task_with_build_context)
+ out = tmp_path / "output"
+ write_result(result, out)
+
+ env = result.environments[0]
+ env_dir = out / env.name
+
+ # Should have Dockerfile.hud (generated), NOT a raw Dockerfile copy
+ assert (env_dir / "Dockerfile.hud").is_file()
+ assert not (env_dir / "Dockerfile").exists()
+
+ def test_build_context_content_correct(
+ self, task_with_build_context: Path, tmp_path: Path
+ ) -> None:
+ result = self.converter.convert(task_with_build_context)
+ out = tmp_path / "output"
+ write_result(result, out)
+
+ env = result.environments[0]
+ content = (out / env.name / "warriors" / "flashpaper.red").read_text(encoding="utf-8")
+ assert "MOV 0, 1" in content
+
+
+class TestHarborConverterConvertNoDockerfile:
+ """Tasks without environment/Dockerfile should use fallback."""
+
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_fallback_dockerfile(self, dataset_no_dockerfile: Path) -> None:
+ result = self.converter.convert(dataset_no_dockerfile)
+ assert len(result.environments) == 1
+ # Fallback dockerfile starts with FROM python:3.11-slim
+ assert "FROM python:3.11-slim" in result.environments[0].dockerfile
+
+ def test_no_harbor_original_comments(self, dataset_no_dockerfile: Path) -> None:
+ result = self.converter.convert(dataset_no_dockerfile)
+ # Fallback dockerfile should NOT have commented-out lines
+ assert "# [harbor original]" not in result.environments[0].dockerfile
+
+
+class TestHarborConverterConvertWithSolutions:
+ """Verify that solution/ dirs show up in task_dirs but write_result skips them."""
+
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_solutions_present_in_source(self, dataset_with_solutions: Path) -> None:
+ # Verify the fixture has solution dirs
+ for name in ("task-x", "task-y"):
+ assert (dataset_with_solutions / name / "solution").is_dir()
+
+ def test_convert_succeeds(self, dataset_with_solutions: Path) -> None:
+ result = self.converter.convert(dataset_with_solutions)
+ assert len(result.environments) == 1
+ assert len(result.taskset) == 2
+
+
+class TestHarborConverterEdgeCases:
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_no_tasks_raises(self, tmp_path: Path) -> None:
+ empty = tmp_path / "empty-dataset"
+ empty.mkdir()
+ with pytest.raises(ValueError, match="No Harbor tasks found"):
+ self.converter.convert(empty)
+
+ def test_all_tasks_fail_raises(self, tmp_path: Path) -> None:
+ dataset = tmp_path / "bad-dataset"
+ dataset.mkdir()
+ # Create subdirs that look like tasks but have no instruction.md
+ for name in ("a", "b"):
+ d = dataset / name
+ d.mkdir()
+ (d / "task.toml").write_text("[metadata]\n")
+ # Missing instruction.md -> will fail detect, so not even found as task
+ with pytest.raises(ValueError, match="No Harbor tasks found"):
+ self.converter.convert(dataset)
+
+ def test_partial_failure_skips_bad_tasks(self, tmp_path: Path) -> None:
+ dataset = tmp_path / "partial"
+ dataset.mkdir()
+
+ # One good task
+ make_harbor_task(dataset, "good-task")
+
+ # One bad task (has task.toml + instruction.md but instruction unreadable)
+ bad = dataset / "bad-task"
+ bad.mkdir()
+ (bad / "task.toml").write_text("[metadata]\n")
+ (bad / "instruction.md").write_text("# OK") # actually valid
+
+ result = self.converter.convert(dataset)
+ # Both should parse, so 2 tasks
+ assert len(result.taskset) == 2
+
+
+# ============================================================================
+# Taskset metadata
+# ============================================================================
+
+
+class TestTasksetMetadata:
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_metadata_includes_harbor_source(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ entry = result.taskset[0]
+ assert "harbor_source" in entry["metadata"]
+
+ def test_metadata_includes_toml_metadata(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ entry = result.taskset[0]
+ meta = entry["metadata"]
+ assert meta.get("category") == "systems"
+ assert meta.get("difficulty") == "medium"
+
+
+# ============================================================================
+# Dockerfile generation
+# ============================================================================
+
+
+class TestDockerfileGeneration:
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_cmd_commented_out(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ dockerfile = result.environments[0].dockerfile
+ # Original CMD ["bash"] should be commented out
+ assert "# [harbor original]" in dockerfile
+
+ def test_hud_layer_present(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ dockerfile = result.environments[0].dockerfile
+ assert "COPY env.py" in dockerfile
+ assert "uv" in dockerfile
+ assert "hud" in dockerfile
+
+ def test_tasks_copied_into_image(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ dockerfile = result.environments[0].dockerfile
+ assert "COPY tasks/ /harbor/tasks/" in dockerfile
+
+ def test_logs_dir_created(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ dockerfile = result.environments[0].dockerfile
+ assert "/logs/verifier" in dockerfile
+
+
+# ============================================================================
+# env.py generation
+# ============================================================================
+
+
+class TestEnvPyGeneration:
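+ # Illustrative fragments the generated env.py is expected to contain
+ # (assumed shape, pieced together from the assertions below):
+ # from hud import Environment
+ # from hud.tools import BashTool
+ # env.add_tool(BashTool()); env.add_tool(EditTool())
+ # _parse_harbor_reward() reading reward.txt / reward.json
+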
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_imports_present(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ env_py = result.environments[0].env_py
+ assert "from hud import Environment" in env_py
+ assert "from hud.tools import BashTool" in env_py
+
+ def test_tools_added(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ env_py = result.environments[0].env_py
+ assert "env.add_tool(BashTool())" in env_py
+ assert "env.add_tool(EditTool())" in env_py
+
+ def test_reward_parsing_logic(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ env_py = result.environments[0].env_py
+ assert "_parse_harbor_reward" in env_py
+ assert "reward.txt" in env_py
+ assert "reward.json" in env_py
+
+
+# ============================================================================
+# Scenario signature: single-task default vs multi-task Literal
+# ============================================================================
+
+
+class TestScenarioSignature:
+ """Verify that single-task envs get a default and multi-task envs get a Literal."""
+
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ # --- single task: optional with default ---
+
+ def test_single_task_has_default(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ env_py = result.environments[0].env_py
+ assert 'task_id: str = "cancel-async-tasks"' in env_py
+
+ def test_single_task_no_literal_import(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ env_py = result.environments[0].env_py
+ assert "from typing import Literal" not in env_py
+ assert "TaskId" not in env_py
+
+ # --- multi-task (same env): Literal type ---
+
+ def test_multi_task_has_literal(self, dataset_same_env: Path) -> None:
+ result = self.converter.convert(dataset_same_env)
+ env_py = result.environments[0].env_py
+ assert "from typing import Literal" in env_py
+ assert "TaskId = Literal[" in env_py
+
+ def test_multi_task_literal_lists_all_ids(self, dataset_same_env: Path) -> None:
+ result = self.converter.convert(dataset_same_env)
+ env_py = result.environments[0].env_py
+ for name in ("cancel-async-tasks", "build-pmars", "chess-best-move"):
+ assert f'"{name}"' in env_py
+
+ def test_multi_task_signature_uses_literal(self, dataset_same_env: Path) -> None:
+ result = self.converter.convert(dataset_same_env)
+ env_py = result.environments[0].env_py
+ assert "def run_task(task_id: TaskId):" in env_py
+
+ def test_multi_task_no_default(self, dataset_same_env: Path) -> None:
+ result = self.converter.convert(dataset_same_env)
+ env_py = result.environments[0].env_py
+ # Should NOT have a default value
+ assert "task_id: TaskId):" in env_py
+ assert "= " not in env_py.split("def run_task(")[1].split("):")[0]
+
+ # --- multi-env dataset: each env gets the right variant ---
+
+ def test_multi_env_each_uses_literal(self, dataset_multi_env: Path) -> None:
+ result = self.converter.convert(dataset_multi_env)
+ # Each env has 2 tasks, so every env.py should use Literal
+ for env in result.environments:
+ assert "TaskId = Literal[" in env.env_py
+
+ # --- single task with build context: still gets a default ---
+
+ def test_single_task_build_context_fixture(self, task_with_build_context: Path) -> None:
+ result = self.converter.convert(task_with_build_context)
+ env_py = result.environments[0].env_py
+ assert 'task_id: str = "build-pmars"' in env_py
+
+
+# ============================================================================
+# pyproject.toml generation
+# ============================================================================
+
+
+class TestPyprojectGeneration:
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_has_hud_dependency(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ pyproject = result.environments[0].pyproject_toml
+ assert "hud-python" in pyproject
+
+ def test_name_matches_env(self, single_task: Path) -> None:
+ result = self.converter.convert(single_task)
+ env = result.environments[0]
+ assert env.name in env.pyproject_toml
+
+
+# ============================================================================
+# write_result()
+# ============================================================================
+
+
+class TestWriteResult:
+ def setup_method(self) -> None:
+ self.converter = HarborConverter()
+
+ def test_creates_directory_structure(self, single_task: Path, tmp_path: Path) -> None:
+ result = self.converter.convert(single_task)
+ out = tmp_path / "output"
+ write_result(result, out)
+
+ env = result.environments[0]
+ env_dir = out / env.name
+
+ assert env_dir.is_dir()
+ assert (env_dir / "env.py").is_file()
+ assert (env_dir / "Dockerfile.hud").is_file()
+ assert (env_dir / "pyproject.toml").is_file()
+ assert (env_dir / "tasks").is_dir()
+ assert (out / "taskset.json").is_file()
+
+ def test_taskset_json_valid(self, single_task: Path, tmp_path: Path) -> None:
+ result = self.converter.convert(single_task)
+ out = tmp_path / "output"
+ taskset_path = write_result(result, out)
+
+ with open(taskset_path, encoding="utf-8") as f:
+ data = json.load(f)
+
+ assert isinstance(data, list)
+ assert len(data) == 1
+ assert data[0]["args"]["task_id"] == "cancel-async-tasks"
+
+ def test_task_files_copied(self, single_task: Path, tmp_path: Path) -> None:
+ result = self.converter.convert(single_task)
+ out = tmp_path / "output"
+ write_result(result, out)
+
+ env = result.environments[0]
+ task_out = out / env.name / "tasks" / "cancel-async-tasks"
+
+ assert (task_out / "instruction.md").is_file()
+ assert (task_out / "task.toml").is_file()
+ assert (task_out / "tests" / "test.sh").is_file()
+
+ def test_environment_dir_not_copied(self, single_task: Path, tmp_path: Path) -> None:
+ result = self.converter.convert(single_task)
+ out = tmp_path / "output"
+ write_result(result, out)
+
+ env = result.environments[0]
+ task_out = out / env.name / "tasks" / "cancel-async-tasks"
+
+ # environment/ should be excluded from the copy
+ assert not (task_out / "environment").exists()
+
+ def test_solution_dir_not_copied(self, dataset_with_solutions: Path, tmp_path: Path) -> None:
+ result = self.converter.convert(dataset_with_solutions)
+ out = tmp_path / "output"
+ write_result(result, out)
+
+ env = result.environments[0]
+ for task_id in env.task_dirs:
+ task_out = out / env.name / "tasks" / task_id
+ assert not (task_out / "solution").exists()
+
+ def test_multi_env_write(self, dataset_multi_env: Path, tmp_path: Path) -> None:
+ result = self.converter.convert(dataset_multi_env)
+ out = tmp_path / "output"
+ write_result(result, out)
+
+ # Both environments should be written
+ for env in result.environments:
+ assert (out / env.name).is_dir()
+ assert (out / env.name / "env.py").is_file()
+
+ # Single taskset.json with all tasks
+ with open(out / "taskset.json", encoding="utf-8") as f:
+ data = json.load(f)
+ assert len(data) == 4
+
+ def test_overwrites_existing(self, single_task: Path, tmp_path: Path) -> None:
+ result = self.converter.convert(single_task)
+ out = tmp_path / "output"
+
+ # Write twice -- should not error
+ write_result(result, out)
+ write_result(result, out)
+
+ assert (out / "taskset.json").is_file()
+
+
+# ============================================================================
+# Registry integration (detect_format, get_converter, list_formats)
+# ============================================================================
+
+
+class TestConverterRegistry:
+ def test_get_converter_by_name(self) -> None:
+ converter = get_converter("harbor")
+ assert converter is not None
+ assert isinstance(converter, HarborConverter)
+
+ def test_get_converter_unknown(self) -> None:
+ assert get_converter("nonexistent") is None
+
+ def test_detect_format_harbor(self, single_task: Path) -> None:
+ converter = detect_format(single_task)
+ assert converter is not None
+ assert converter.name == "harbor"
+
+ def test_detect_format_unknown(self, tmp_path: Path) -> None:
+ assert detect_format(tmp_path) is None
+
+ def test_list_formats_includes_harbor(self) -> None:
+ formats = list_formats()
+ names = [name for name, _desc in formats]
+ assert "harbor" in names
diff --git a/hud/cli/deploy.py b/hud/cli/deploy.py
index 1cb77afa..3e354c34 100644
--- a/hud/cli/deploy.py
+++ b/hud/cli/deploy.py
@@ -3,6 +3,7 @@
from __future__ import annotations
import asyncio
+import logging
import os
import time
from pathlib import Path
@@ -14,10 +15,16 @@
from hud.cli.utils.build_logs import poll_build_status, stream_build_logs
from hud.cli.utils.config import parse_env_file
from hud.cli.utils.context import create_build_context_tarball, format_size
-from hud.cli.utils.environment import find_dockerfile, get_environment_name
+from hud.cli.utils.environment import (
+ find_dockerfile,
+ get_environment_name,
+ is_environment_directory,
+)
from hud.cli.utils.validation import validate_environment
from hud.utils.hud_console import HUDConsole
+LOGGER = logging.getLogger(__name__)
+
def collect_environment_variables(
directory: Path,
@@ -505,6 +512,96 @@ def _save_deploy_link(
console.warning(f"Failed to save deploy link: {e}")
+def discover_environments(directory: Path) -> list[Path]:
+ """Find all HUD environment subdirectories within a parent directory.
+
+ Scans immediate children for directories containing a Dockerfile
+ (Dockerfile.hud or Dockerfile) and pyproject.toml.
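+
+ Example layout (illustrative):
+
+ converted/
+ env-a/  <- Dockerfile.hud + pyproject.toml (discovered)
+ env-b/  <- Dockerfile + pyproject.toml (discovered)
+ notes/  <- no Dockerfile (skipped)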
+
+ Returns sorted list of environment directory paths.
+ """
+ if not directory.is_dir():
+ return []
+ return [
+ child
+ for child in sorted(directory.iterdir())
+ if child.is_dir() and is_environment_directory(child)
+ ]
+
+
+def deploy_all(
+ directory: str,
+ env: list[str] | None = None,
+ env_file: str | None = None,
+ no_cache: bool = False,
+ verbose: bool = False,
+ build_args: list[str] | None = None,
+ build_secrets: list[str] | None = None,
+) -> None:
+ """Deploy all HUD environments found in a directory.
+
+ Discovers subdirectories that are valid HUD environments and deploys
+ each one sequentially.
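+
+ A failure in one environment does not abort the run: remaining
+ environments still deploy, a summary is printed at the end, and the
+ command exits non-zero if any environment failed.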
+ """
+ hud_console = HUDConsole()
+ parent = Path(directory).resolve()
+
+ if not parent.is_dir():
+ hud_console.error(f"Directory does not exist: {directory}")
+ raise typer.Exit(1)
+
+ envs = discover_environments(parent)
+ if not envs:
+ hud_console.error(f"No HUD environments found in {parent}")
+ hud_console.info("Expected subdirectories containing Dockerfile.hud + pyproject.toml")
+ raise typer.Exit(1)
+
+ hud_console.header("Deploy All Environments")
+ hud_console.info(f"Found {len(envs)} environment(s) in {parent}:")
+ for env_dir in envs:
+ hud_console.info(f" {env_dir.name}/")
+ hud_console.info("")
+
+ succeeded: list[str] = []
+ failed: list[str] = []
+
+ for i, env_dir in enumerate(envs, start=1):
+ hud_console.section_title(f"[{i}/{len(envs)}] Deploying {env_dir.name}")
+
+ try:
+ deploy_environment(
+ directory=str(env_dir),
+ name=None,
+ env=env,
+ env_file=env_file,
+ no_cache=no_cache,
+ verbose=verbose,
+ registry_id=None,
+ build_args=build_args,
+ build_secrets=build_secrets,
+ )
+ succeeded.append(env_dir.name)
+ except (typer.Exit, SystemExit):
+ LOGGER.warning("Deploy failed for environment %s", env_dir.name)
+ failed.append(env_dir.name)
+ except Exception:
+ LOGGER.exception("Unexpected error deploying %s", env_dir.name)
+ failed.append(env_dir.name)
+
+ # Summary
+ hud_console.info("")
+ hud_console.header("Deploy All Summary")
+ if succeeded:
+ hud_console.success(f"{len(succeeded)} environment(s) deployed successfully:")
+ for name in succeeded:
+ hud_console.info(f" {name}")
+ if failed:
+ hud_console.error(f"{len(failed)} environment(s) failed:")
+ for name in failed:
+ hud_console.info(f" {name}")
+ raise typer.Exit(1)
+
+
def deploy_command(
directory: str = typer.Argument(".", help="Environment directory"),
name: str | None = typer.Option(
@@ -513,6 +610,12 @@ def deploy_command(
"-n",
help="Environment display name (defaults to directory name)",
),
+ all_envs: bool = typer.Option(
+ False,
+ "--all",
+ "-a",
+ help="Deploy all HUD environments found in directory",
+ ),
env: list[str] | None = typer.Option( # noqa: B008
None,
"--env",
@@ -568,11 +671,24 @@ def deploy_command(
hud deploy environments/browser
hud deploy . --name my-env # Custom name
hud deploy . -e API_KEY=xxx # With env vars
+ hud deploy ./converted --all # Deploy all envs in directory
hud deploy . --build-arg NODE_ENV=production # With build args
hud deploy . --secret id=MY_KEY,env=MY_KEY # With build secrets (will be encrypted at rest)
hud deploy . --secret id=MY_KEY,src=./my_key.txt # Secret from file
hud deploy . --no-cache # Force rebuild[/not dim]
"""
+ if all_envs:
+ deploy_all(
+ directory=directory,
+ env=env,
+ env_file=env_file,
+ no_cache=no_cache,
+ verbose=verbose,
+ build_args=build_args,
+ build_secrets=secrets,
+ )
+ return
+
deploy_environment(
directory=directory,
name=name,
diff --git a/hud/cli/tests/test_build.py b/hud/cli/tests/test_build.py
index 1c7be8eb..f1efbbf8 100644
--- a/hud/cli/tests/test_build.py
+++ b/hud/cli/tests/test_build.py
@@ -60,12 +60,12 @@ def test_increment_patch(self):
def test_increment_minor(self):
"""Test incrementing minor version."""
assert increment_version("1.2.3", "minor") == "1.3.0"
- assert increment_version("0.5.20", "minor") == "0.6.0"
+ assert increment_version("0.5.21", "minor") == "0.6.0"
def test_increment_major(self):
"""Test incrementing major version."""
assert increment_version("1.2.3", "major") == "2.0.0"
- assert increment_version("0.5.20", "major") == "1.0.0"
+ assert increment_version("0.5.21", "major") == "1.0.0"
def test_increment_with_v_prefix(self):
"""Test incrementing version with v prefix."""
diff --git a/hud/environment/scenarios.py b/hud/environment/scenarios.py
index bec337b2..e33627b9 100644
--- a/hud/environment/scenarios.py
+++ b/hud/environment/scenarios.py
@@ -628,12 +628,35 @@ async def prompt_handler(**handler_args: Any) -> list[str]:
if annotation is not None:
try:
adapter = TypeAdapter(annotation)
- deserialized_args[arg_name] = adapter.validate_json(arg_value)
+ except Exception:
+ # Unresolvable annotation (e.g. raw string from
+ # PEP 563 fallback) -- treat as untyped
+ adapter = None
+
+ if adapter is not None:
+ # Try validate_json first (handles Pydantic models,
+ # lists, enums, datetimes from JSON-encoded strings)
+ try:
+ deserialized_args[arg_name] = adapter.validate_json(arg_value)
+ continue
+ except Exception: # noqa: S110
+ pass
+
+ # Fall back to validate_python (handles Literal[str]
+ # where validate_json("0") would parse as int 0,
+ # losing the string type)
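+ # e.g. with task_id: Literal["0", "1"] (illustrative):
+ #   adapter.validate_json("0")   -> int 0 -> ValidationError
+ #   adapter.validate_python("0") -> "0"   -> accepted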
+ try:
+ deserialized_args[arg_name] = adapter.validate_python(arg_value)
+ continue
+ except Exception: # noqa: S110
+ pass
+
+ # TypeAdapter couldn't handle it -- skip generic
+ # heuristics that would lose type information
+ deserialized_args[arg_name] = arg_value
continue
- except Exception: # noqa: S110
- pass # Fall through to generic JSON decode
- # Try JSON decode for strings that look like JSON
+ # No annotation (or unresolvable): try generic JSON decode heuristics
stripped = arg_value.strip()
if (stripped and stripped[0] in "[{") or stripped in ("true", "false", "null"):
try:
diff --git a/hud/environment/tests/test_scenarios.py b/hud/environment/tests/test_scenarios.py
index 048b893e..74ac9355 100644
--- a/hud/environment/tests/test_scenarios.py
+++ b/hud/environment/tests/test_scenarios.py
@@ -4,7 +4,7 @@
from datetime import datetime
from enum import Enum
-from typing import Any
+from typing import Any, Literal
import pytest
from pydantic import BaseModel
@@ -792,6 +792,239 @@ async def list_pydantic_scenario(items: list[_Item]):
assert received_items[1].name == "Banana"
+class TestLiteralDeserialization:
+ """Tests for Literal type deserialization edge cases.
+
+ The MCP protocol sends all arguments as strings. When the scenario
+ function uses Literal types, the deserializer must correctly match
+ string values -- especially numeric-looking strings like "0", "1".
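+
+ The cases below exercise all three deserialization paths: a direct
+ validate_json hit, the validate_python fallback, and the generic
+ JSON-decode heuristic for untyped arguments.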
+ """
+
+ @pytest.mark.asyncio
+ async def test_literal_string_kept_as_string(self) -> None:
+ """Literal["a", "b"] receives string values correctly."""
+ env = Environment("test-env")
+ received: str | None = None
+
+ @env.scenario("literal_str")
+ async def literal_str_scenario(choice: Literal["a", "b"]):
+ nonlocal received
+ received = choice
+ yield f"Got {choice}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:literal_str")
+ assert prompt is not None
+
+ await prompt.render({"choice": "a"})
+ assert received == "a"
+ assert isinstance(received, str)
+
+ @pytest.mark.asyncio
+ async def test_literal_numeric_string_not_coerced_to_int(self) -> None:
+ """Literal["0", "1", "2"] keeps "0" as string, not int 0.
+
+ This is the GPQA Diamond bug: task IDs are "0", "1", etc.
+ and must stay as strings for Path operations.
+ """
+ env = Environment("test-env")
+ received: Any = None
+
+ @env.scenario("literal_numeric")
+ async def literal_numeric_scenario(task_id: Literal["0", "1", "2"]):
+ nonlocal received
+ received = task_id
+ yield f"Task {task_id}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:literal_numeric")
+ assert prompt is not None
+
+ await prompt.render({"task_id": "0"})
+ assert received == "0"
+ assert isinstance(received, str)
+
+ @pytest.mark.asyncio
+ async def test_literal_numeric_string_various_values(self) -> None:
+ """All numeric-looking Literal string values stay as strings."""
+ env = Environment("test-env")
+ received: Any = None
+
+ @env.scenario("literal_nums")
+ async def literal_nums_scenario(idx: Literal["0", "42", "197"]):
+ nonlocal received
+ received = idx
+ yield f"Index {idx}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:literal_nums")
+ assert prompt is not None
+
+ for val in ("0", "42", "197"):
+ await prompt.render({"idx": val})
+ assert received == val, f"Expected {val!r}, got {received!r}"
+ assert isinstance(received, str), f"Expected str, got {type(received)}"
+
+ @pytest.mark.asyncio
+ async def test_literal_int_coerces_correctly(self) -> None:
+ """Literal[1, 2, 3] with int values coerces string "1" to int 1."""
+ env = Environment("test-env")
+ received: Any = None
+
+ @env.scenario("literal_int")
+ async def literal_int_scenario(level: Literal[1, 2, 3]):
+ nonlocal received
+ received = level
+ yield f"Level {level}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:literal_int")
+ assert prompt is not None
+
+ await prompt.render({"level": "2"})
+ assert received == 2
+ assert isinstance(received, int)
+
+ @pytest.mark.asyncio
+ async def test_literal_mixed_types(self) -> None:
+ """Literal["auto", 0, 1] handles mixed string/int literal values."""
+ env = Environment("test-env")
+ received: Any = None
+
+ @env.scenario("literal_mixed")
+ async def literal_mixed_scenario(mode: Literal["auto", 0, 1]):
+ nonlocal received
+ received = mode
+ yield f"Mode {mode}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:literal_mixed")
+ assert prompt is not None
+
+ await prompt.render({"mode": "auto"})
+ assert received == "auto"
+
+ @pytest.mark.asyncio
+ async def test_literal_with_default(self) -> None:
+ """Literal with default value works when arg is provided."""
+ env = Environment("test-env")
+ received: Any = None
+
+ @env.scenario("literal_default")
+ async def literal_default_scenario(
+ task_id: Literal["build-pmars"] = "build-pmars",
+ ):
+ nonlocal received
+ received = task_id
+ yield f"Task {task_id}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:literal_default")
+ assert prompt is not None
+
+ await prompt.render({"task_id": "build-pmars"})
+ assert received == "build-pmars"
+
+ @pytest.mark.asyncio
+ async def test_int_annotation_coerces_numeric_string(self) -> None:
+ """Plain int annotation coerces "42" to 42."""
+ env = Environment("test-env")
+ received: Any = None
+
+ @env.scenario("int_arg")
+ async def int_arg_scenario(count: int):
+ nonlocal received
+ received = count
+ yield f"Count {count}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:int_arg")
+ assert prompt is not None
+
+ await prompt.render({"count": "42"})
+ assert received == 42
+ assert isinstance(received, int)
+
+ @pytest.mark.asyncio
+ async def test_float_annotation_coerces_numeric_string(self) -> None:
+ """Plain float annotation coerces "3.14" to 3.14."""
+ env = Environment("test-env")
+ received: Any = None
+
+ @env.scenario("float_arg")
+ async def float_arg_scenario(rate: float):
+ nonlocal received
+ received = rate
+ yield f"Rate {rate}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:float_arg")
+ assert prompt is not None
+
+ await prompt.render({"rate": "3.14"})
+ assert received == pytest.approx(3.14)
+ assert isinstance(received, float)
+
+ @pytest.mark.asyncio
+ async def test_bool_annotation_coerces_string(self) -> None:
+ """Bool annotation coerces "true"/"false" correctly."""
+ env = Environment("test-env")
+ received: Any = None
+
+ @env.scenario("bool_arg")
+ async def bool_arg_scenario(verbose: bool):
+ nonlocal received
+ received = verbose
+ yield f"Verbose {verbose}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:bool_arg")
+ assert prompt is not None
+
+ await prompt.render({"verbose": "true"})
+ assert received is True
+
+ @pytest.mark.asyncio
+ async def test_str_annotation_preserves_numeric_string(self) -> None:
+ """Plain str annotation keeps "42" as string "42"."""
+ env = Environment("test-env")
+ received: Any = None
+
+ @env.scenario("str_numeric")
+ async def str_numeric_scenario(name: str):
+ nonlocal received
+ received = name
+ yield f"Name {name}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:str_numeric")
+ assert prompt is not None
+
+ await prompt.render({"name": "42"})
+ assert received == "42"
+ assert isinstance(received, str)
+
+ @pytest.mark.asyncio
+ async def test_no_annotation_numeric_becomes_int(self) -> None:
+ """Untyped arg with numeric-looking string falls through to json.loads."""
+ env = Environment("test-env")
+ received: Any = None
+
+ @env.scenario("untyped_num")
+ async def untyped_num_scenario(val):
+ nonlocal received
+ received = val
+ yield f"Val {val}"
+ yield 1.0
+
+ prompt = env._prompt_manager._prompts.get("test-env:untyped_num")
+ assert prompt is not None
+
+ await prompt.render({"val": "42"})
+ # Without annotation, generic heuristic converts to int
+ assert received == 42
+
+
class TestScenarioNameNormalization:
"""Test edge cases for environment and scenario name handling."""
diff --git a/hud/patches/mcp_patches.py b/hud/patches/mcp_patches.py
index fbac6bd7..d8d73fa7 100644
--- a/hud/patches/mcp_patches.py
+++ b/hud/patches/mcp_patches.py
@@ -8,11 +8,60 @@
from __future__ import annotations
import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+ import httpx
+ from mcp.client.streamable_http import StreamWriter
logger = logging.getLogger(__name__)
+def patch_json_response_error_propagation() -> None:
+ """
+ Patch _handle_json_response to re-raise exceptions instead of swallowing them.
+
+ The original implementation catches all exceptions (e.g. ReadError during
+ response.aread(), ValidationError during JSON parsing) and sends them as raw
+ Exception objects to the read stream — where BaseSession._handle_incoming
+ silently drops them. This causes the caller (call_tool / send_request) to
+ hang forever waiting for a response that will never arrive.
+
+ By re-raising, exceptions propagate to the retry loop in our patched
+ post_writer, which already distinguishes retryable errors (ReadError →
+ retry with backoff) from non-retryable ones (ValidationError → send
+ proper JSONRPCError to resolve the pending request).
+ """
+ try:
+ from mcp.client.streamable_http import StreamableHTTPTransport
+ from mcp.shared.message import SessionMessage
+ from mcp.types import JSONRPCMessage
+
+ async def patched_handle_json_response(
+ self: Any,
+ response: httpx.Response,
+ read_stream_writer: StreamWriter,
+ is_initialization: bool = False,
+ ) -> None:
+ try:
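+ # Any failure below (e.g. httpx.ReadError from aread(), a pydantic
+ # ValidationError while parsing) must propagate instead of being
+ # swallowed; see the function docstring.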
+ content = await response.aread()
+ message = JSONRPCMessage.model_validate_json(content)
+ if is_initialization:
+ self._maybe_extract_protocol_version_from_message(message)
+ await read_stream_writer.send(SessionMessage(message))
+ except Exception:
+ logger.exception("Error in _handle_json_response")
+ raise
+
+ StreamableHTTPTransport._handle_json_response = patched_handle_json_response
+ logger.debug("Patched StreamableHTTPTransport._handle_json_response to re-raise errors")
+
+ except ImportError:
+ logger.debug("mcp.client.streamable_http not available, skipping patch")
+ except Exception as e:
+ logger.warning("Failed to patch _handle_json_response: %s", e)
+
+
def patch_streamable_http_error_handling() -> None:
"""
Patch StreamableHTTPTransport.post_writer to handle request errors properly.
@@ -313,6 +362,7 @@ def suppress_fastmcp_logging(level: int = logging.WARNING) -> None:
def apply_all_patches() -> None:
"""Apply all MCP patches."""
+ patch_json_response_error_propagation()
patch_streamable_http_error_handling()
patch_client_session_validation()
patch_server_output_validation()
diff --git a/hud/utils/tests/test_version.py b/hud/utils/tests/test_version.py
index 4a70e3b6..014478d7 100644
--- a/hud/utils/tests/test_version.py
+++ b/hud/utils/tests/test_version.py
@@ -5,4 +5,4 @@ def test_import():
"""Test that the package can be imported."""
import hud
- assert hud.__version__ == "0.5.20"
+ assert hud.__version__ == "0.5.21"
diff --git a/hud/version.py b/hud/version.py
index de07e468..65160cef 100644
--- a/hud/version.py
+++ b/hud/version.py
@@ -4,4 +4,4 @@
from __future__ import annotations
-__version__ = "0.5.20"
+__version__ = "0.5.21"
diff --git a/pyproject.toml b/pyproject.toml
index fb785572..3b059cf8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "hud-python"
-version = "0.5.20"
+version = "0.5.21"
description = "SDK for the HUD platform."
readme = "README.md"
requires-python = ">=3.11, <3.13"