diff --git a/docs/advanced/harbor-convert.mdx b/docs/advanced/harbor-convert.mdx
index 358335df..bfdd9ef9 100644
--- a/docs/advanced/harbor-convert.mdx
+++ b/docs/advanced/harbor-convert.mdx
@@ -15,7 +15,7 @@ git clone https://github.com/laude-institute/terminal-bench-2.git
 # 2. Convert to HUD format
 hud convert ./terminal-bench-2/ --output ./tb2-hud
 
-# 3. Deploy all environments
+# 3. Deploy all environments (~3 min per environment, leave it running)
 hud deploy ./tb2-hud --all
 
 # 4. Run evaluation
@@ -24,6 +24,11 @@ hud eval ./tb2-hud/taskset.json
 
 That's it. The converter handles Dockerfile adaptation, build context, test scripts, and reward parsing automatically.
 
+Each environment takes roughly 3 minutes to build and deploy. For datasets with many environments,
+`hud deploy --all` runs them sequentially -- just leave it running and check back when it's done.
+
+
 ## What Gets Converted
 
 A Harbor task directory:
@@ -81,9 +86,29 @@ Harbor test scripts write results to `/logs/verifier/`. The converter supports b
 - `reward.txt` -- a single float (`1.0` for pass, `0.0` for fail)
 - `reward.json` -- `{"reward": 1.0}` or just a float
 
-## Running Programmatically
+## Running Tasks
+
+### Option 1: Upload as a Taskset (recommended)
+
+The generated `taskset.json` can be uploaded directly to the HUD platform for managed evaluation, leaderboards, and comparison across models:
+
+1. Go to [hud.ai/evalsets](https://hud.ai/evalsets) and create a new taskset
+2. Click **Upload Tasks** and paste the contents of `taskset.json`
+3. Run evaluations from the platform UI or via `hud eval`
+
+See the [Tasksets guide](/platform/tasksets) for full details on creating and managing tasksets.
+
+### Option 2: CLI eval
+
+Run the taskset directly from the command line:
+
+```bash
+hud eval ./tb2-hud/taskset.json
+```
+
+### Option 3: Python SDK
 
-You can also run converted tasks from Python using the SDK:
+Run tasks programmatically with any agent:
 
 ```python
 import asyncio
@@ -108,7 +133,7 @@ async def main():
 
 asyncio.run(main())
 ```
 
-Or load the full taskset:
+Or load the full taskset as Task objects:
 
 ```python
 import json
diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py
index daf9ff8c..ab22eea5 100644
--- a/hud/cli/__init__.py
+++ b/hud/cli/__init__.py
@@ -1026,46 +1026,153 @@ def get(
 @app.command()
 def convert(
-    tasks_file: str = typer.Argument(
-        ..., help="Path to tasks file (JSON/JSONL) to convert to remote MCP configuration"
+    path: str = typer.Argument(
+        ..., help="Path to source tasks/dataset directory to convert to HUD format"
+    ),
+    from_format: str = typer.Option(
+        "auto",
+        "--from",
+        "-f",
+        help="Source format (auto, harbor). Use 'auto' to detect automatically.",
+    ),
+    output: str | None = typer.Option(
+        None,
+        "--output",
+        "-o",
+        help="Output directory (default: ./hud_converted)",
     ),
 ) -> None:
-    """Convert local MCP task configs to remote (mcp.hud.ai) format.
+    """Convert external benchmark formats to HUD environments + tasksets.
 
-    This mirrors the implicit conversion flow used by 'hud rl' and writes a new
-    remote_.json next to the source file when needed.
+    [not dim]Converts tasks from frameworks like Harbor into HUD-compatible
+    environments (env.py + Dockerfile.hud) and v5 taskset files.
+
+    Supports pluggable formats. Currently: harbor.
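+
+    Output: one directory per generated environment (env.py, Dockerfile.hud,
+    pyproject.toml, tasks/) plus a single taskset.json at the output root.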
+
+    Examples:
+        hud convert ./algotune/               # Auto-detect, convert dataset
+        hud convert ./my-task/ --from harbor  # Explicit format
+        hud convert ./dataset/ --output ./out # Custom output directory[/not dim]
     """
     from pathlib import Path
 
+    from .convert import detect_format, get_converter, list_formats, write_result
+
     hud_console = HUDConsole()
+    source_path = Path(path).resolve()
 
-    try:
-        from .flows.tasks import convert_tasks_to_remote
+    if not source_path.exists():
+        hud_console.error(f"Path does not exist: {path}")
+        raise typer.Exit(1)
 
-        result_path = convert_tasks_to_remote(tasks_file)
+    # Resolve converter
+    if from_format == "auto":
+        converter = detect_format(source_path)
+        if converter is None:
+            # Auto-detect failed — prompt user to pick a format
+            available = list_formats()
+            if not available:
+                hud_console.error("No converters registered.")
+                raise typer.Exit(1)
+
+            if len(available) == 1:
+                # Only one format exists, just use it
+                converter = get_converter(available[0][0])
+                if converter:
+                    hud_console.info(f"Using format: {converter.name}")
+            else:
+                import questionary
+
+                choices = [
+                    questionary.Choice(title=f"{name} — {desc}", value=name)
+                    for name, desc in available
+                ]
+                picked = questionary.select(
+                    "Could not auto-detect format. Which format is this?",
+                    choices=choices,
+                ).ask()
+                if not picked:
+                    raise typer.Exit(1)
+                converter = get_converter(picked)
 
-        # If nothing changed, inform the user
-        try:
-            if Path(result_path).resolve() == Path(tasks_file).resolve():
-                hud_console.success(
-                    "Tasks already reference remote MCP URLs. No conversion needed."
-                )
-                hud_console.hint("You can run them directly with: hud eval --full")
-                return
-        except Exception as e:
-            # Best effort; continue with success message
-            hud_console.debug(f"Path comparison failed, continuing: {e}")
-
-        hud_console.success(f"Converted tasks written to: {result_path}")
-        hud_console.hint(
-            "You can now run remote flows: hud rl or hud eval "
-        )
-    except typer.Exit:
-        raise
+            if converter is None:
+                hud_console.error("No converter selected.")
+                raise typer.Exit(1)
+        else:
+            hud_console.info(f"Detected format: {converter.name}")
+    else:
+        converter = get_converter(from_format)
+        if converter is None:
+            hud_console.error(f"Unknown format: {from_format}")
+            available = list_formats()
+            if available:
+                hud_console.info("Available formats:")
+                for name, desc in available:
+                    hud_console.info(f"  {name}: {desc}")
+            raise typer.Exit(1)
+
+    # Run conversion
+    try:
+        result = converter.convert(source_path)
+    except ValueError as e:
+        hud_console.error(str(e))
+        raise typer.Exit(1) from e
+    except Exception as e:
+        hud_console.error(f"Conversion failed: {e}")
+        raise typer.Exit(1) from e
+
+    # Write output
+    output_dir = Path(output) if output else Path("./hud_converted")
+    try:
+        taskset_path = write_result(result, output_dir.resolve())
     except Exception as e:
-        hud_console.error(f"Failed to convert tasks: {e}")
+        hud_console.error(f"Failed to write output: {e}")
         raise typer.Exit(1) from e
 
+    # Display results
+    hud_console.header("Convert Complete")
+    hud_console.info("")
+
+    total_tasks = len(result.taskset)
+    total_envs = len(result.environments)
+    hud_console.success(f"Converted {total_tasks} task(s) into {total_envs} environment(s).")
+    hud_console.info("")
+
+    # Show each environment
+    hud_console.section_title("Environments")
+    for env_gen in result.environments:
+        task_count = len(env_gen.task_dirs)
+        hud_console.status_item(env_gen.name, f"{task_count} tasks")
+    hud_console.info("")
+
+    # Show output paths
+    hud_console.section_title("Output")
+    hud_console.status_item("Directory", str(output_dir.resolve()))
+    hud_console.status_item("Taskset", str(taskset_path))
+    hud_console.info("")
+
+    # Show next steps with numbered commands
+    hud_console.section_title("Next Steps")
+    hud_console.info("")
+
+    hud_console.info("1. Deploy environment(s):")
+    if total_envs > 1:
+        hud_console.command_example(
+            f"hud deploy {output_dir.resolve()} --all",
+            f"Deploy all {total_envs} environments",
+        )
+    else:
+        first_env = result.environments[0].name if result.environments else ""
+        hud_console.command_example(
+            f"hud deploy {output_dir.resolve() / first_env}",
+            "Build & deploy to HUD platform",
+        )
+    hud_console.info("")
+
+    hud_console.info("2. Run evaluation:")
+    hud_console.command_example(f"hud eval {taskset_path}", "Run agent against tasks")
+    hud_console.info("")
+
 
 @app.command()
 def cancel(
diff --git a/hud/cli/convert/__init__.py b/hud/cli/convert/__init__.py
new file mode 100644
index 00000000..c30ef455
--- /dev/null
+++ b/hud/cli/convert/__init__.py
@@ -0,0 +1,177 @@
+"""Pluggable format conversion system for HUD.
+
+Converts external benchmark formats (Harbor, Inspect AI, etc.) into
+HUD environments + tasksets.
+
+Usage:
+    hud convert <path>                  # Auto-detect format
+    hud convert <path> --from harbor    # Explicit format
+    hud convert <path> --output ./out   # Custom output directory
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import shutil
+from pathlib import Path  # noqa: TC003 - used at runtime
+
+from .base import BaseConverter, ConvertResult, GeneratedEnvironment
+
+__all__ = [
+    "BaseConverter",
+    "ConvertResult",
+    "GeneratedEnvironment",
+    "detect_format",
+    "get_converter",
+    "list_formats",
+    "write_result",
+]
+
+LOGGER = logging.getLogger(__name__)
+
+# Shell script extensions that need CRLF -> LF normalization
+_SHELL_EXTENSIONS = frozenset({".sh", ".bash", ".zsh", ".ksh"})
+
+
+def _normalize_line_endings(directory: Path) -> None:
+    """Convert CRLF to LF in all shell scripts under a directory.
+
+    Git on Windows with autocrlf=true converts LF to CRLF on checkout.
+    Shell scripts with CRLF break on Linux (e.g., shebang errors,
+    'set: pipefail\\r: invalid option name').
+    """
+    for path in directory.rglob("*"):
+        if path.is_file() and path.suffix in _SHELL_EXTENSIONS:
+            raw = path.read_bytes()
+            if b"\r" in raw:
+                path.write_bytes(raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n"))
+                LOGGER.debug("Normalized line endings: %s", path)
+
+
+# ---------------------------------------------------------------------------
+# Converter registry
+# ---------------------------------------------------------------------------
+
+# Lazy-loaded to avoid import cost on unrelated CLI commands
+_converters: list[BaseConverter] | None = None
+
+
+def _load_converters() -> list[BaseConverter]:
+    global _converters
+    if _converters is None:
+        from .harbor import HarborConverter
+
+        _converters = [
+            HarborConverter(),
+            # Future: InspectConverter(), METRConverter(), ...
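+            # Order matters: detect_format() returns the first converter whose detect() passes.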
+ ] + return _converters + + +def get_converter(name: str) -> BaseConverter | None: + """Get a converter by its short name (e.g., 'harbor').""" + for c in _load_converters(): + if c.name == name: + return c + return None + + +def detect_format(path: Path) -> BaseConverter | None: + """Auto-detect which converter can handle the given path.""" + for c in _load_converters(): + if c.detect(path): + return c + return None + + +def list_formats() -> list[tuple[str, str]]: + """Return (name, description) pairs for all registered converters.""" + return [(c.name, c.description) for c in _load_converters()] + + +# --------------------------------------------------------------------------- +# Output writer +# --------------------------------------------------------------------------- + + +def write_result(result: ConvertResult, output_dir: Path) -> Path: + """Write conversion results to disk. + + Creates the output directory structure: + output_dir/ + ├── env-name-a/ + │ ├── env.py + │ ├── Dockerfile.hud + │ ├── pyproject.toml + │ └── tasks/ + │ └── / (copied from source, minus environment/ & solution/) + └── taskset.json + + Returns the path to the generated taskset.json. + """ + output_dir.mkdir(parents=True, exist_ok=True) + + for env_gen in result.environments: + env_dir = output_dir / env_gen.name + env_dir.mkdir(parents=True, exist_ok=True) + + # Write generated files + (env_dir / "env.py").write_text(env_gen.env_py, encoding="utf-8") + (env_dir / "Dockerfile.hud").write_text(env_gen.dockerfile, encoding="utf-8") + (env_dir / "pyproject.toml").write_text(env_gen.pyproject_toml, encoding="utf-8") + + # Copy build context files from source environment/ directory + # (e.g., warriors/*.red that Harbor Dockerfiles reference via COPY) + if env_gen.build_context_source and env_gen.build_context_source.is_dir(): + for item in env_gen.build_context_source.iterdir(): + # Skip the Dockerfile itself (we already generated Dockerfile.hud) + if item.name.lower() in ("dockerfile", "dockerfile.hud"): + continue + dest_item = env_dir / item.name + if dest_item.exists(): + if dest_item.is_dir(): + shutil.rmtree(dest_item) + else: + dest_item.unlink() + if item.is_dir(): + shutil.copytree(item, dest_item) + else: + shutil.copy2(item, dest_item) + + # Copy task data directories (skip environment/ and solution/) + tasks_dir = env_dir / "tasks" + tasks_dir.mkdir(parents=True, exist_ok=True) + + for task_id, source_dir in env_gen.task_dirs.items(): + dest = tasks_dir / task_id + if dest.exists(): + shutil.rmtree(dest) + dest.mkdir(parents=True, exist_ok=True) + + for item in source_dir.iterdir(): + # Skip dirs that are handled by the Dockerfile or ignored + if item.name in ("environment", "solution"): + continue + if item.is_dir(): + shutil.copytree(item, dest / item.name) + else: + shutil.copy2(item, dest / item.name) + + # Normalize CRLF -> LF in all shell scripts (fixes Windows git checkout) + _normalize_line_endings(env_dir) + + LOGGER.info( + "Wrote environment '%s' with %d task(s)", + env_gen.name, + len(env_gen.task_dirs), + ) + + # Write taskset + taskset_path = output_dir / "taskset.json" + with open(taskset_path, "w", encoding="utf-8") as f: + json.dump(result.taskset, f, ensure_ascii=False, indent=2) + f.write("\n") + + LOGGER.info("Wrote taskset with %d task(s) to %s", len(result.taskset), taskset_path) + return taskset_path diff --git a/hud/cli/convert/base.py b/hud/cli/convert/base.py new file mode 100644 index 00000000..4fa86f09 --- /dev/null +++ b/hud/cli/convert/base.py @@ -0,0 +1,78 @@ +"""Abstract 
+
+The converter system is pluggable: each format (Harbor, Inspect AI, etc.)
+implements BaseConverter with detect() and convert() methods. The CLI
+auto-detects format or lets the user specify explicitly.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, Field
+
+__all__ = ["BaseConverter", "ConvertResult", "GeneratedEnvironment"]
+
+
+class GeneratedEnvironment(BaseModel):
+    """A generated HUD environment ready to be written to disk.
+
+    Attributes:
+        name: Environment name (e.g., "hud-harbor-algotune")
+        env_py: Generated env.py file content
+        dockerfile: Generated Dockerfile.hud content
+        pyproject_toml: Generated pyproject.toml content
+        task_dirs: Mapping of task_id -> source directory path.
+            Files from these directories (minus environment/ and solution/)
+            are copied into the output's tasks/ subdirectory.
+        build_context_source: Optional path to a source directory whose
+            non-Dockerfile contents should be copied into the environment
+            root as Docker build context (e.g., Harbor's environment/ dir).
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    name: str
+    env_py: str
+    dockerfile: str
+    pyproject_toml: str
+    task_dirs: dict[str, Path]
+    build_context_source: Path | None = None
+
+
+class ConvertResult(BaseModel):
+    """Result of converting a source format to HUD.
+
+    Attributes:
+        environments: Generated environment definitions (one per unique env group)
+        taskset: List of v5 Task dicts ready for taskset.json
+        summary: Human-readable summary lines for CLI output
+    """
+
+    environments: list[GeneratedEnvironment]
+    taskset: list[dict[str, Any]]
+    summary: list[str] = Field(default_factory=list)
+
+
+class BaseConverter(ABC):
+    """Abstract base for format converters.
+
+    Subclasses must define:
+        name: Short identifier (used with --from flag)
+        description: Human-readable description (shown in CLI help)
+        detect(): Check if a path matches this format
+        convert(): Perform the conversion
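+
+    Example (sketch of a hypothetical converter):
+        class MyFormatConverter(BaseConverter):
+            name = "myformat"
+            description = "My benchmark format"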
+    """
+
+    name: str
+    description: str
+
+    @abstractmethod
+    def detect(self, path: Path) -> bool:
+        """Return True if this converter can handle the given path."""
+
+    @abstractmethod
+    def convert(self, path: Path) -> ConvertResult:
+        """Convert the source at path to HUD format."""
diff --git a/hud/cli/convert/harbor.py b/hud/cli/convert/harbor.py
new file mode 100644
index 00000000..dc745bc9
--- /dev/null
+++ b/hud/cli/convert/harbor.py
@@ -0,0 +1,565 @@
+"""Harbor → HUD converter.
+
+Converts Harbor framework tasks (task.toml + instruction.md + environment/ + tests/)
+into HUD environments with scenarios and tasksets.
+
+Harbor task structure:
+    task_name/
+    ├── instruction.md      # Agent prompt
+    ├── task.toml           # Config: timeouts, metadata, resources
+    ├── environment/
+    │   └── Dockerfile      # Container the agent runs in
+    ├── tests/
+    │   └── test.sh         # Verification → writes reward.txt
+    └── solution/           # Optional (ignored)
+
+HUD output:
+    hud-harbor-{dataset}/
+    ├── env.py              # Environment with run-task scenario
+    ├── Dockerfile.hud      # Harbor Dockerfile + HUD MCP layer
+    ├── pyproject.toml
+    └── tasks/              # All task data baked into image
+        ├── task-a/
+        │   ├── instruction.md
+        │   └── tests/test.sh
+        └── task-b/
+            ├── instruction.md
+            └── tests/test.sh
+    taskset.json            # v5 taskset referencing the env
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+import tomllib
+from dataclasses import dataclass
+from pathlib import Path  # noqa: TC003 - used at runtime
+from typing import Any
+
+from .base import BaseConverter, ConvertResult, GeneratedEnvironment
+
+__all__ = ["HarborConverter"]
+
+LOGGER = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Helpers
+# =============================================================================
+
+
+def _is_harbor_task(path: Path) -> bool:
+    """Check if a directory looks like a valid Harbor task."""
+    return path.is_dir() and (path / "task.toml").exists() and (path / "instruction.md").exists()
+
+
+def _hash_directory(path: Path) -> str:
+    """Content-hash a directory for grouping tasks by identical environments."""
+    hasher = hashlib.sha256()
+    if not path.exists():
+        return "empty"
+    for file_path in sorted(path.rglob("*")):
+        if file_path.is_file():
+            hasher.update(str(file_path.relative_to(path)).encode())
+            hasher.update(file_path.read_bytes())
+    return hasher.hexdigest()[:16]
+
+
+def _normalize_name(name: str) -> str:
+    """Normalize a dataset name to a valid HUD environment name."""
+    normalized = name.strip().lower()
+    normalized = normalized.replace(" ", "-").replace("_", "-")
+    normalized = re.sub(r"[^a-z0-9-]", "", normalized)
+    normalized = re.sub(r"-+", "-", normalized)
+    return normalized.strip("-") or "converted"
+
+
+def _find_dockerfile(env_dir: Path) -> str | None:
+    """Read the Dockerfile from a Harbor environment directory."""
+    for name in ("Dockerfile", "dockerfile"):
+        path = env_dir / name
+        if path.exists():
+            return path.read_text(encoding="utf-8")
+    return None
+
+
+def _adapt_harbor_dockerfile(content: str) -> str:
+    """Comment out CMD/ENTRYPOINT lines from a Harbor Dockerfile.
+
+    These are replaced by the HUD MCP server entrypoint.
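+
+    Example:
+        'CMD ["bash"]'  ->  '# [harbor original] CMD ["bash"]'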
+ """ + lines = content.splitlines() + adapted: list[str] = [] + for line in lines: + stripped = line.strip().upper() + if stripped.startswith(("CMD ", "CMD[", "ENTRYPOINT ", "ENTRYPOINT[")): + adapted.append(f"# [harbor original] {line}") + else: + adapted.append(line) + return "\n".join(adapted) + + +# ============================================================================= +# Data classes +# ============================================================================= + + +@dataclass +class HarborTask: + """Parsed Harbor task.""" + + task_id: str + directory: Path + instruction: str + config: dict[str, Any] + env_hash: str + + +def _parse_task(task_dir: Path) -> HarborTask | None: + """Parse a Harbor task directory into a HarborTask.""" + try: + instruction = (task_dir / "instruction.md").read_text(encoding="utf-8") + except Exception: + LOGGER.warning("Failed to read instruction.md in %s", task_dir) + return None + + try: + raw = (task_dir / "task.toml").read_text(encoding="utf-8") + config: dict[str, Any] = tomllib.loads(raw) + except Exception: + LOGGER.warning("Failed to parse task.toml in %s", task_dir) + config = {} + + env_dir = task_dir / "environment" + env_hash = _hash_directory(env_dir) if env_dir.exists() else "no-env" + + return HarborTask( + task_id=task_dir.name, + directory=task_dir, + instruction=instruction, + config=config, + env_hash=env_hash, + ) + + +# ============================================================================= +# Templates +# ============================================================================= + +# fmt: off + +# Header + shared body split so the scenario signature can vary. +_ENV_PY_HEADER = '''\ +"""{env_name} - HUD environment converted from Harbor. + +Source: {source_path} +Tasks: {task_count} + +This environment runs Harbor-format tasks. Each task has: +- instruction.md: the agent prompt +- tests/test.sh: verification script that writes reward to /logs/verifier/ + +The run-task scenario reads the instruction, lets the agent work, +then executes the test script and parses the reward. +""" + +import json +import logging +import subprocess +from pathlib import Path +{extra_imports} +from hud import Environment +from hud.tools import BashTool, EditTool +from hud.tools.filesystem import GlobTool, GrepTool, ListTool, ReadTool + +LOGGER = logging.getLogger(__name__) + +TASKS_DIR = Path("/harbor/tasks") + +env = Environment("{env_name}") + +# Standard coding tools - agents interact via bash (matching Harbor's model) +env.add_tool(BashTool()) +env.add_tool(EditTool()) +env.add_tool(ReadTool()) +env.add_tool(GrepTool()) +env.add_tool(GlobTool()) +env.add_tool(ListTool()) + +''' + +# Single task: task_id is optional, defaults to the only task. +_SCENARIO_SINGLE = """\ +@env.scenario("run-task") +async def run_task(task_id: str = "{default_task_id}"): +""" + +# Multiple tasks: task_id is required, typed as a Literal. +_SCENARIO_MULTI = """\ +TaskId = Literal[{task_id_literal}] + + +@env.scenario("run-task") +async def run_task(task_id: TaskId): +""" + +_SCENARIO_BODY = '''\ + """Run a Harbor task by ID. + + Reads /harbor/tasks//instruction.md as the prompt. + After the agent works, runs tests/test.sh and parses + /logs/verifier/reward.txt or reward.json for the reward. + """ + task_dir = TASKS_DIR / str(task_id) + if not task_dir.exists(): + available = [d.name for d in TASKS_DIR.iterdir() if d.is_dir()] + raise ValueError( + f"Task '{{task_id}}' not found. 
+
+    # Read the task instruction
+    instruction = (task_dir / "instruction.md").read_text(encoding="utf-8")
+
+    # Setup: yield prompt to the agent
+    answer = yield instruction
+
+    # Ensure log output directory exists
+    logs_dir = Path("/logs/verifier")
+    logs_dir.mkdir(parents=True, exist_ok=True)
+
+    # Harbor mounts the task's tests/ directory at /tests/ — replicate that
+    tests_link = Path("/tests")
+    task_tests = task_dir / "tests"
+    if task_tests.is_dir():
+        if tests_link.is_symlink() or tests_link.exists():
+            tests_link.unlink()
+        tests_link.symlink_to(task_tests)
+
+    # Evaluate: run the test script
+    test_script = task_dir / "tests" / "test.sh"
+    if test_script.exists():
+        try:
+            result = subprocess.run(
+                ["bash", str(test_script)],
+                cwd="/app",
+                capture_output=True,
+                text=True,
+                timeout={verifier_timeout},
+                check=False,
+            )
+            if result.stdout:
+                LOGGER.info("test.sh stdout for %s:\\n%s", task_id, result.stdout[-2000:])
+            if result.stderr:
+                LOGGER.info("test.sh stderr for %s:\\n%s", task_id, result.stderr[-2000:])
+            if result.returncode != 0:
+                LOGGER.warning(
+                    "test.sh exited with code %d for task %s",
+                    result.returncode, task_id,
+                )
+        except subprocess.TimeoutExpired:
+            LOGGER.warning("Test script timed out for task %s", task_id)
+        except Exception as exc:
+            LOGGER.warning("Test script failed for task %s: %s", task_id, exc)
+    else:
+        LOGGER.warning("No test script found at %s", test_script)
+
+    # Parse and yield reward
+    yield _parse_harbor_reward()
+
+
+def _parse_harbor_reward() -> float:
+    """Parse reward from Harbor standard output locations.
+
+    Harbor test scripts write results to /logs/verifier/ as either:
+    - reward.txt: a single float value
+    - reward.json: {{"reward": float}} or just a float
+    """
+    reward_txt = Path("/logs/verifier/reward.txt")
+    reward_json = Path("/logs/verifier/reward.json")
+
+    if reward_txt.exists():
+        try:
+            return float(reward_txt.read_text(encoding="utf-8").strip())
+        except ValueError:
+            pass
+
+    if reward_json.exists():
+        try:
+            data = json.loads(reward_json.read_text(encoding="utf-8"))
+            if isinstance(data, dict):
+                return float(data.get("reward", 0.0))
+            return float(data)
+        except (ValueError, json.JSONDecodeError):
+            pass
+
+    return 0.0
+'''
+
+
+def _build_env_py(
+    env_name: str,
+    source_path: str,
+    task_ids: list[str],
+    verifier_timeout: int,
+) -> str:
+    """Build the env.py content, adapting the scenario signature to task count."""
+    if len(task_ids) == 1:
+        extra_imports = ""
+        scenario = _SCENARIO_SINGLE.format(default_task_id=task_ids[0])
+    else:
+        extra_imports = "\nfrom typing import Literal\n"
+        literal_values = ", ".join(f'"{tid}"' for tid in sorted(task_ids))
+        scenario = _SCENARIO_MULTI.format(task_id_literal=literal_values)
+
+    header = _ENV_PY_HEADER.format(
+        env_name=env_name,
+        source_path=source_path,
+        task_count=len(task_ids),
+        extra_imports=extra_imports,
+    )
+    body = _SCENARIO_BODY.format(verifier_timeout=verifier_timeout)
+    return header + scenario + body
+
+# fmt: on
+
+# Shared snippet: install uv standalone (works on any base image with curl or
+# apt), then use uv to bootstrap Python and sync dependencies.
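+# The apt-get fallback assumes a Debian-based image; bases that already ship
+# curl skip it entirely.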
+_HUD_LAYER = """\
+# ============================================================
+# HUD MCP server layer
+# ============================================================
+WORKDIR /hud
+
+# Install uv standalone (no pip/python required on the base image)
+RUN command -v curl >/dev/null 2>&1 || \\
+    (apt-get update -qq && \\
+     apt-get install -y -qq --no-install-recommends curl ca-certificates && \\
+     rm -rf /var/lib/apt/lists/*) && \\
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.local/bin:$PATH"
+
+COPY pyproject.toml uv.lock* ./
+RUN uv sync --frozen --no-dev --no-install-project 2>/dev/null || \\
+    uv sync --no-dev --no-install-project
+
+# Harbor task data (instructions + test scripts baked into image)
+COPY tasks/ /harbor/tasks/
+
+# Ensure standard directories exist and are writable at runtime
+# (MCP server may run as non-root; Harbor tasks expect /app writable)
+RUN mkdir -p /logs/verifier /workspace /app && chmod 777 /logs/verifier /workspace /app
+
+COPY env.py ./
+
+CMD ["uv", "run", "--no-project", "python", "-m", "hud", "dev", "env:env", "--stdio"]
+"""
+
+DOCKERFILE_WITH_BASE_TEMPLATE = (
+    """\
+# ============================================================
+# Harbor environment base
+# Source: {source}
+# ============================================================
+{base_dockerfile}
+"""
+    + _HUD_LAYER
+)
+
+DOCKERFILE_FALLBACK_TEMPLATE = (
+    """\
+FROM python:3.11-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends \\
+    curl git build-essential && rm -rf /var/lib/apt/lists/*
+"""
+    + _HUD_LAYER
+)
+
+PYPROJECT_TEMPLATE = """\
+[project]
+name = "{name}"
+version = "0.1.0"
+requires-python = ">=3.10"
+dependencies = ["hud-python", "openai"]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+"""
+
+
+# =============================================================================
+# Converter
+# =============================================================================
+
+
+class HarborConverter(BaseConverter):
+    """Convert Harbor tasks/datasets to HUD format.
+
+    Handles:
+    - Single task directory (has task.toml directly)
+    - Dataset directory (subdirectories are Harbor tasks)
+    - Multi-environment datasets (tasks grouped by Dockerfile hash)
+    """
+
+    name = "harbor"
+    description = "Harbor framework (task.toml + instruction.md + environment/ + tests/)"
+
+    def detect(self, path: Path) -> bool:
+        if _is_harbor_task(path):
+            return True
+        # Check for dataset (directory containing task subdirectories)
+        if path.is_dir():
+            return any(_is_harbor_task(d) for d in path.iterdir() if d.is_dir())
+        return False
+
+    def convert(self, path: Path) -> ConvertResult:
+        path = path.resolve()
+
+        # Discover tasks
+        if _is_harbor_task(path):
+            task_dirs = [path]
+            dataset_name = path.parent.name
+        else:
+            task_dirs = sorted(d for d in path.iterdir() if d.is_dir() and _is_harbor_task(d))
+            dataset_name = path.name
+
+        if not task_dirs:
+            raise ValueError(f"No Harbor tasks found in {path}")
+
+        # Parse all tasks
+        tasks: list[HarborTask] = []
+        skipped = 0
+        for td in task_dirs:
+            parsed = _parse_task(td)
+            if parsed:
+                tasks.append(parsed)
+            else:
+                skipped += 1
+
+        if not tasks:
+            raise ValueError("All Harbor tasks failed to parse")
+
+        if skipped:
+            LOGGER.warning("Skipped %d task(s) that failed to parse", skipped)
+
+        LOGGER.info("Parsed %d Harbor task(s) from %s", len(tasks), path)
+
+        # Group by environment Dockerfile hash
+        groups: dict[str, list[HarborTask]] = {}
+        for task in tasks:
+            groups.setdefault(task.env_hash, []).append(task)
+
+        LOGGER.info("Found %d unique environment group(s)", len(groups))
+
+        # Generate environments and taskset
+        environments: list[GeneratedEnvironment] = []
+        taskset: list[dict[str, Any]] = []
+        base_name = f"hud-harbor-{_normalize_name(dataset_name)}"
+
+        # Sort groups by size (largest first) for consistent naming
+        sorted_groups = sorted(groups.items(), key=lambda x: -len(x[1]))
+
+        for idx, (_env_hash, group_tasks) in enumerate(sorted_groups, start=1):
+            # Naming: single group gets base_name, multiple get suffix
+            env_name = base_name if len(sorted_groups) == 1 else f"{base_name}-g{idx}"
+
+            # Use representative task for shared config
+            rep_task = group_tasks[0]
+            env_dir = rep_task.directory / "environment"
+            dockerfile_content = _find_dockerfile(env_dir) if env_dir.exists() else None
+
+            # Extract verifier timeout from config
+            verifier_timeout = 600
+            verifier_cfg = rep_task.config.get("verifier", {})
+            if isinstance(verifier_cfg, dict):
+                timeout_val = verifier_cfg.get("timeout_sec")
+                if timeout_val is not None:
+                    verifier_timeout = int(timeout_val)
+
+            # --- Generate env.py ---
+            # Use forward slashes in source_path to avoid unicode escape issues on Windows
+            task_ids = [t.task_id for t in group_tasks]
+            env_py = _build_env_py(
+                env_name=env_name,
+                source_path=path.as_posix(),
+                task_ids=task_ids,
+                verifier_timeout=verifier_timeout,
+            )
+
+            # --- Generate Dockerfile.hud ---
+            if dockerfile_content:
+                adapted = _adapt_harbor_dockerfile(dockerfile_content)
+                dockerfile = DOCKERFILE_WITH_BASE_TEMPLATE.format(
+                    source=env_dir.as_posix(),
+                    base_dockerfile=adapted,
+                )
+            else:
+                dockerfile = DOCKERFILE_FALLBACK_TEMPLATE
+
+            # --- Generate pyproject.toml ---
+            pyproject = PYPROJECT_TEMPLATE.format(name=env_name)
+
+            # --- Map task IDs to source directories ---
+            task_dir_map = {t.task_id: t.directory for t in group_tasks}
+
+            # Build context: non-Dockerfile files from environment/ dir
+            # (e.g., warriors/*.red that the Dockerfile COPYs)
+            build_ctx = env_dir if env_dir.exists() else None
+
+            environments.append(
+                GeneratedEnvironment(
+                    name=env_name,
+                    env_py=env_py,
+                    dockerfile=dockerfile,
+                    pyproject_toml=pyproject,
+                    task_dirs=task_dir_map,
+                    build_context_source=build_ctx,
+                )
+            )
+
+            # --- Generate v5 taskset entries ---
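+            # Each entry is a v5 task dict, e.g. (illustrative values):
+            #   {"env": {"name": "hud-harbor-tb2"},
+            #    "scenario": "hud-harbor-tb2:run-task",
+            #    "args": {"task_id": "build-pmars"},
+            #    "metadata": {"category": "systems", ...}}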
task.", + task_toml: str = _DEFAULT_TASK_TOML, + dockerfile: str | None = _SIMPLE_DOCKERFILE, + test_script: str = '#!/bin/bash\necho "1.0" > /logs/verifier/reward.txt\n', + include_solution: bool = False, +) -> Path: + """Create a synthetic Harbor task directory under *parent*. + + Returns the task directory path. + """ + task_dir = parent / name + task_dir.mkdir(parents=True, exist_ok=True) + + (task_dir / "instruction.md").write_text(instruction, encoding="utf-8") + (task_dir / "task.toml").write_text(task_toml, encoding="utf-8") + + if dockerfile is not None: + env_dir = task_dir / "environment" + env_dir.mkdir(exist_ok=True) + (env_dir / "Dockerfile").write_text(dockerfile, encoding="utf-8") + + tests_dir = task_dir / "tests" + tests_dir.mkdir(exist_ok=True) + (tests_dir / "test.sh").write_text(test_script, encoding="utf-8") + + if include_solution: + sol_dir = task_dir / "solution" + sol_dir.mkdir(exist_ok=True) + (sol_dir / "solve.sh").write_text("#!/bin/bash\necho done\n", encoding="utf-8") + + return task_dir + + +# --------------------------------------------------------------------------- +# Pytest fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture() +def single_task(tmp_path: Path) -> Path: + """A single Harbor task directory (like a standalone task).""" + return make_harbor_task( + tmp_path, + "cancel-async-tasks", + instruction=( + "# Cancel Async Tasks\n\n" + "Write a Python script that launches 5 asyncio tasks and cancels " + "all of them within 2 seconds.\n" + ), + ) + + +@pytest.fixture() +def dataset_same_env(tmp_path: Path) -> Path: + """A dataset directory with 3 tasks sharing the same Dockerfile.""" + dataset = tmp_path / "terminal-bench-sample" + dataset.mkdir() + + for name in ("cancel-async-tasks", "build-pmars", "chess-best-move"): + make_harbor_task( + dataset, + name, + instruction=f"# {name}\n\nSolve the {name} task.\n", + ) + + return dataset + + +@pytest.fixture() +def dataset_multi_env(tmp_path: Path) -> Path: + """A dataset directory with tasks split across 2 different Dockerfiles.""" + dataset = tmp_path / "mixed-bench" + dataset.mkdir() + + # Group 1: simple python tasks (same Dockerfile) + for name in ("cancel-async-tasks", "build-pmars"): + make_harbor_task( + dataset, + name, + instruction=f"# {name}\n\nDo the thing.\n", + dockerfile=_SIMPLE_DOCKERFILE, + ) + + # Group 2: ML tasks (different Dockerfile) + for name in ("caffe-cifar-10", "sam-cell-seg"): + make_harbor_task( + dataset, + name, + instruction=f"# {name}\n\nTrain the model.\n", + task_toml=_TASK_TOML_WITH_IMAGE, + dockerfile=_ML_DOCKERFILE, + ) + + return dataset + + +@pytest.fixture() +def dataset_no_dockerfile(tmp_path: Path) -> Path: + """A dataset where tasks have no environment/Dockerfile.""" + dataset = tmp_path / "no-docker-bench" + dataset.mkdir() + + for name in ("task-a", "task-b"): + make_harbor_task( + dataset, + name, + instruction=f"# {name}\n\nSimple task.\n", + dockerfile=None, # No Dockerfile + ) + + return dataset + + +@pytest.fixture() +def dataset_with_solutions(tmp_path: Path) -> Path: + """A dataset where tasks include solution/ directories.""" + dataset = tmp_path / "solved-bench" + dataset.mkdir() + + for name in ("task-x", "task-y"): + make_harbor_task( + dataset, + name, + instruction=f"# {name}\n\nSolve it.\n", + include_solution=True, + ) + + return dataset + + +@pytest.fixture() +def task_with_build_context(tmp_path: Path) -> Path: + """A single task whose environment/ dir has extra build context files. 
+    task_toml: str = _DEFAULT_TASK_TOML,
+    dockerfile: str | None = _SIMPLE_DOCKERFILE,
+    test_script: str = '#!/bin/bash\necho "1.0" > /logs/verifier/reward.txt\n',
+    include_solution: bool = False,
+) -> Path:
+    """Create a synthetic Harbor task directory under *parent*.
+
+    Returns the task directory path.
+    """
+    task_dir = parent / name
+    task_dir.mkdir(parents=True, exist_ok=True)
+
+    (task_dir / "instruction.md").write_text(instruction, encoding="utf-8")
+    (task_dir / "task.toml").write_text(task_toml, encoding="utf-8")
+
+    if dockerfile is not None:
+        env_dir = task_dir / "environment"
+        env_dir.mkdir(exist_ok=True)
+        (env_dir / "Dockerfile").write_text(dockerfile, encoding="utf-8")
+
+    tests_dir = task_dir / "tests"
+    tests_dir.mkdir(exist_ok=True)
+    (tests_dir / "test.sh").write_text(test_script, encoding="utf-8")
+
+    if include_solution:
+        sol_dir = task_dir / "solution"
+        sol_dir.mkdir(exist_ok=True)
+        (sol_dir / "solve.sh").write_text("#!/bin/bash\necho done\n", encoding="utf-8")
+
+    return task_dir
+
+
+# ---------------------------------------------------------------------------
+# Pytest fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture()
+def single_task(tmp_path: Path) -> Path:
+    """A single Harbor task directory (like a standalone task)."""
+    return make_harbor_task(
+        tmp_path,
+        "cancel-async-tasks",
+        instruction=(
+            "# Cancel Async Tasks\n\n"
+            "Write a Python script that launches 5 asyncio tasks and cancels "
+            "all of them within 2 seconds.\n"
+        ),
+    )
+
+
+@pytest.fixture()
+def dataset_same_env(tmp_path: Path) -> Path:
+    """A dataset directory with 3 tasks sharing the same Dockerfile."""
+    dataset = tmp_path / "terminal-bench-sample"
+    dataset.mkdir()
+
+    for name in ("cancel-async-tasks", "build-pmars", "chess-best-move"):
+        make_harbor_task(
+            dataset,
+            name,
+            instruction=f"# {name}\n\nSolve the {name} task.\n",
+        )
+
+    return dataset
+
+
+@pytest.fixture()
+def dataset_multi_env(tmp_path: Path) -> Path:
+    """A dataset directory with tasks split across 2 different Dockerfiles."""
+    dataset = tmp_path / "mixed-bench"
+    dataset.mkdir()
+
+    # Group 1: simple python tasks (same Dockerfile)
+    for name in ("cancel-async-tasks", "build-pmars"):
+        make_harbor_task(
+            dataset,
+            name,
+            instruction=f"# {name}\n\nDo the thing.\n",
+            dockerfile=_SIMPLE_DOCKERFILE,
+        )
+
+    # Group 2: ML tasks (different Dockerfile)
+    for name in ("caffe-cifar-10", "sam-cell-seg"):
+        make_harbor_task(
+            dataset,
+            name,
+            instruction=f"# {name}\n\nTrain the model.\n",
+            task_toml=_TASK_TOML_WITH_IMAGE,
+            dockerfile=_ML_DOCKERFILE,
+        )
+
+    return dataset
+
+
+@pytest.fixture()
+def dataset_no_dockerfile(tmp_path: Path) -> Path:
+    """A dataset where tasks have no environment/Dockerfile."""
+    dataset = tmp_path / "no-docker-bench"
+    dataset.mkdir()
+
+    for name in ("task-a", "task-b"):
+        make_harbor_task(
+            dataset,
+            name,
+            instruction=f"# {name}\n\nSimple task.\n",
+            dockerfile=None,  # No Dockerfile
+        )
+
+    return dataset
+
+
+@pytest.fixture()
+def dataset_with_solutions(tmp_path: Path) -> Path:
+    """A dataset where tasks include solution/ directories."""
+    dataset = tmp_path / "solved-bench"
+    dataset.mkdir()
+
+    for name in ("task-x", "task-y"):
+        make_harbor_task(
+            dataset,
+            name,
+            instruction=f"# {name}\n\nSolve it.\n",
+            include_solution=True,
+        )
+
+    return dataset
+
+
+@pytest.fixture()
+def task_with_build_context(tmp_path: Path) -> Path:
+    """A single task whose environment/ dir has extra build context files.
+
+    Mimics build-pmars which has warriors/*.red files that the
+    Dockerfile COPYs into the image.
+    """
+    task_dir = tmp_path / "build-pmars"
+    task_dir.mkdir()
+
+    (task_dir / "instruction.md").write_text(
+        "# Build pMARS\n\nBuild the pMARS simulator.\n", encoding="utf-8"
+    )
+    (task_dir / "task.toml").write_text(
+        textwrap.dedent("""\
+            [metadata]
+            category = "software-engineering"
+            difficulty = "medium"
+
+            [verifier]
+            timeout_sec = 900
+        """),
+        encoding="utf-8",
+    )
+
+    # environment/ with Dockerfile AND extra build context files
+    env_dir = task_dir / "environment"
+    env_dir.mkdir()
+    (env_dir / "Dockerfile").write_text(
+        textwrap.dedent("""\
+            FROM debian:13.0-slim
+            RUN apt-get update && apt-get install -y tmux
+            WORKDIR /app
+            COPY warriors/flashpaper.red warriors/rave.red /app/
+        """),
+        encoding="utf-8",
+    )
+    warriors = env_dir / "warriors"
+    warriors.mkdir()
+    (warriors / "flashpaper.red").write_text(";redcode\nMOV 0, 1\n", encoding="utf-8")
+    (warriors / "rave.red").write_text(";redcode\nSPL 0, 0\n", encoding="utf-8")
+
+    # tests/
+    tests_dir = task_dir / "tests"
+    tests_dir.mkdir()
+    (tests_dir / "test.sh").write_text(
+        '#!/bin/bash\necho "1.0" > /logs/verifier/reward.txt\n', encoding="utf-8"
+    )
+
+    return task_dir
diff --git a/hud/cli/convert/tests/test_harbor.py b/hud/cli/convert/tests/test_harbor.py
new file mode 100644
index 00000000..64c6c6b2
--- /dev/null
+++ b/hud/cli/convert/tests/test_harbor.py
@@ -0,0 +1,751 @@
+"""Tests for the Harbor → HUD converter.
+
+Exercises HarborConverter.detect(), HarborConverter.convert(), and the
+write_result() writer using synthetic terminal-bench-style fixtures
+defined in conftest.py.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING
+
+import pytest
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+from hud.cli.convert import detect_format, get_converter, list_formats, write_result
+from hud.cli.convert.harbor import (
+    HarborConverter,
+    _adapt_harbor_dockerfile,
+    _find_dockerfile,
+    _hash_directory,
+    _is_harbor_task,
+    _normalize_name,
+    _parse_task,
+)
+
+from .conftest import make_harbor_task
+
+# ============================================================================
+# Helper unit tests
+# ============================================================================
+
+
+class TestNormalizeName:
+    def test_simple(self) -> None:
+        assert _normalize_name("terminal-bench") == "terminal-bench"
+
+    def test_underscores(self) -> None:
+        assert _normalize_name("my_cool_bench") == "my-cool-bench"
+
+    def test_spaces(self) -> None:
+        assert _normalize_name("My Cool Bench") == "my-cool-bench"
+
+    def test_special_chars(self) -> None:
+        assert _normalize_name("bench@2.0!") == "bench20"
+
+    def test_empty(self) -> None:
+        assert _normalize_name("") == "converted"
+
+    def test_only_special_chars(self) -> None:
+        assert _normalize_name("@#$") == "converted"
+
+    def test_leading_trailing_dashes(self) -> None:
+        assert _normalize_name("--hello--") == "hello"
+
+    def test_consecutive_dashes(self) -> None:
+        assert _normalize_name("a---b") == "a-b"
+
+
+class TestAdaptDockerfile:
+    def test_comments_cmd(self) -> None:
+        result = _adapt_harbor_dockerfile('CMD ["bash"]')
+        assert result == '# [harbor original] CMD ["bash"]'
+
+    def test_comments_entrypoint(self) -> None:
+        result = _adapt_harbor_dockerfile('ENTRYPOINT ["/bin/bash"]')
+        assert result == '# [harbor original] ENTRYPOINT ["/bin/bash"]'
+
+    def test_preserves_other_lines(self) -> None:
+        dockerfile = "FROM python:3.11\nRUN echo hi\nCMD bash"
+        result = _adapt_harbor_dockerfile(dockerfile)
+        lines = result.splitlines()
+        assert lines[0] == "FROM python:3.11"
+        assert lines[1] == "RUN echo hi"
+        assert lines[2] == "# [harbor original] CMD bash"
+
+    def test_case_insensitive_match(self) -> None:
+        # The implementation uses .upper() so indented CMD should match
+        result = _adapt_harbor_dockerfile(" CMD bash")
+        assert result == "# [harbor original]  CMD bash"
+
+    def test_no_cmd_or_entrypoint(self) -> None:
+        dockerfile = "FROM python:3.11\nRUN apt-get update"
+        assert _adapt_harbor_dockerfile(dockerfile) == dockerfile
+
+
+class TestHashDirectory:
+    def test_same_content_same_hash(self, tmp_path: Path) -> None:
+        dir_a = tmp_path / "a"
+        dir_a.mkdir()
+        (dir_a / "file.txt").write_text("hello")
+
+        dir_b = tmp_path / "b"
+        dir_b.mkdir()
+        (dir_b / "file.txt").write_text("hello")
+
+        assert _hash_directory(dir_a) == _hash_directory(dir_b)
+
+    def test_different_content_different_hash(self, tmp_path: Path) -> None:
+        dir_a = tmp_path / "a"
+        dir_a.mkdir()
+        (dir_a / "file.txt").write_text("hello")
+
+        dir_b = tmp_path / "b"
+        dir_b.mkdir()
+        (dir_b / "file.txt").write_text("world")
+
+        assert _hash_directory(dir_a) != _hash_directory(dir_b)
+
+    def test_nonexistent_returns_empty(self, tmp_path: Path) -> None:
+        assert _hash_directory(tmp_path / "nonexistent") == "empty"
+
+    def test_empty_directory(self, tmp_path: Path) -> None:
+        empty = tmp_path / "empty"
+        empty.mkdir()
+        # Empty dir has a deterministic hash (sha256 of nothing)
+        result = _hash_directory(empty)
+        assert isinstance(result, str)
+        assert len(result) == 16
+
+
+class TestFindDockerfile:
+    def test_finds_dockerfile(self, tmp_path: Path) -> None:
+        (tmp_path / "Dockerfile").write_text("FROM python:3.11")
+        assert _find_dockerfile(tmp_path) == "FROM python:3.11"
+
+    def test_finds_lowercase(self, tmp_path: Path) -> None:
+        (tmp_path / "dockerfile").write_text("FROM alpine")
+        assert _find_dockerfile(tmp_path) == "FROM alpine"
+
+    def test_returns_none_when_missing(self, tmp_path: Path) -> None:
+        assert _find_dockerfile(tmp_path) is None
+
+
+class TestIsHarborTask:
+    def test_valid_task(self, single_task: Path) -> None:
+        assert _is_harbor_task(single_task) is True
+
+    def test_missing_instruction(self, tmp_path: Path) -> None:
+        task = tmp_path / "bad-task"
+        task.mkdir()
+        (task / "task.toml").write_text("[metadata]\n")
+        assert _is_harbor_task(task) is False
+
+    def test_missing_task_toml(self, tmp_path: Path) -> None:
+        task = tmp_path / "bad-task"
+        task.mkdir()
+        (task / "instruction.md").write_text("# Do something")
+        assert _is_harbor_task(task) is False
+
+    def test_not_a_directory(self, tmp_path: Path) -> None:
+        f = tmp_path / "file.txt"
+        f.write_text("not a dir")
+        assert _is_harbor_task(f) is False
+
+
+class TestParseTask:
+    def test_parses_valid_task(self, single_task: Path) -> None:
+        result = _parse_task(single_task)
+        assert result is not None
+        assert result.task_id == "cancel-async-tasks"
+        assert "Cancel Async Tasks" in result.instruction
+        assert result.config.get("metadata", {}).get("category") == "systems"
+
+    def test_parses_verifier_timeout(self, single_task: Path) -> None:
+        result = _parse_task(single_task)
+        assert result is not None
+        assert result.config["verifier"]["timeout_sec"] == 120
+
+    def test_returns_none_for_bad_instruction(self, tmp_path: Path) -> None:
+        task_dir = tmp_path / "bad"
+        task_dir.mkdir()
+        (task_dir / "task.toml").write_text("[metadata]\n")
+        # instruction.md missing
+        assert _parse_task(task_dir) is None
+
+    def test_handles_bad_toml_gracefully(self, tmp_path: Path) -> None:
+        task_dir = tmp_path / "broken-toml"
+        task_dir.mkdir()
+        (task_dir / "instruction.md").write_text("# Hello")
+        (task_dir / "task.toml").write_text("this is not valid toml {{{")
+        result = _parse_task(task_dir)
+        assert result is not None
+        # Config should be empty dict when toml fails
+        assert result.config == {}
+
+
+# ============================================================================
+# HarborConverter.detect()
+# ============================================================================
+
+
+class TestHarborConverterDetect:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_detects_single_task(self, single_task: Path) -> None:
+        assert self.converter.detect(single_task) is True
+
+    def test_detects_dataset(self, dataset_same_env: Path) -> None:
+        assert self.converter.detect(dataset_same_env) is True
+
+    def test_rejects_empty_dir(self, tmp_path: Path) -> None:
+        assert self.converter.detect(tmp_path) is False
+
+    def test_rejects_non_harbor_dir(self, tmp_path: Path) -> None:
+        (tmp_path / "random.txt").write_text("nope")
+        assert self.converter.detect(tmp_path) is False
+
+
+# ============================================================================
+# HarborConverter.convert()
+# ============================================================================
+
+
+class TestHarborConverterConvertSingleTask:
+    """Convert a single Harbor task directory."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_single_task_produces_one_env(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        assert len(result.environments) == 1
+        assert len(result.taskset) == 1
+
+    def test_env_name_uses_parent_dir(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env = result.environments[0]
+        # Parent dir name is the tmp_path random name, but it gets normalized
+        assert env.name.startswith("hud-harbor-")
+
+    def test_env_py_contains_scenario(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "@env.scenario" in env_py
+        assert "run-task" in env_py
+
+    def test_env_py_has_correct_timeout(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "timeout=120" in env_py
+
+    def test_taskset_references_env(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        entry = result.taskset[0]
+        env_name = result.environments[0].name
+        assert entry["scenario"] == f"{env_name}:run-task"
+        assert entry["args"]["task_id"] == "cancel-async-tasks"
+
+    def test_task_dirs_map(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env = result.environments[0]
+        assert "cancel-async-tasks" in env.task_dirs
+        assert env.task_dirs["cancel-async-tasks"] == single_task
+
+    def test_summary_not_empty(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        assert len(result.summary) > 0
+        assert any("1" in line for line in result.summary)
+
+
+class TestHarborConverterConvertDataset:
+    """Convert a dataset directory with multiple tasks sharing the same env."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_same_env_groups_into_one(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        assert len(result.environments) == 1
+        assert len(result.taskset) == 3
+
+    def test_all_task_ids_present(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        task_ids = {e["args"]["task_id"] for e in result.taskset}
+        assert task_ids == {"cancel-async-tasks", "build-pmars", "chess-best-move"}
+
+    def test_env_name_from_dataset(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        env = result.environments[0]
+        assert env.name == "hud-harbor-terminal-bench-sample"
+
+
+class TestHarborConverterConvertMultiEnv:
+    """Convert a dataset with tasks split across different Dockerfiles."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_creates_two_envs(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        assert len(result.environments) == 2
+        assert len(result.taskset) == 4
+
+    def test_env_names_have_group_suffix(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        names = {e.name for e in result.environments}
+        assert all(n.startswith("hud-harbor-mixed-bench") for n in names)
+        # With multiple groups, names should have -g1, -g2 suffixes
+        assert any("-g1" in n for n in names)
+        assert any("-g2" in n for n in names)
+
+    def test_each_env_has_correct_tasks(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        for env in result.environments:
+            task_ids = set(env.task_dirs.keys())
+            # Each group should have exactly 2 tasks
+            assert len(task_ids) == 2
+
+    def test_ml_env_has_nvidia_dockerfile(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        # One of the environments should reference nvidia in its dockerfile
+        dockerfiles = [e.dockerfile for e in result.environments]
+        assert any("nvidia" in d for d in dockerfiles)
+
+    def test_simple_env_has_python_dockerfile(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        dockerfiles = [e.dockerfile for e in result.environments]
+        assert any("python:3.11-slim" in d for d in dockerfiles)
+
+
+class TestBuildContextSource:
+    """Verify build_context_source is set for tasks with environment/ dirs."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_build_context_source_set(self, task_with_build_context: Path) -> None:
+        result = self.converter.convert(task_with_build_context)
+        env = result.environments[0]
+        assert env.build_context_source is not None
+        assert env.build_context_source.is_dir()
+
+    def test_build_context_source_none_when_no_env_dir(self, dataset_no_dockerfile: Path) -> None:
+        result = self.converter.convert(dataset_no_dockerfile)
+        env = result.environments[0]
+        assert env.build_context_source is None
+
+
+class TestWriteBuildContext:
+    """Verify that build context files from environment/ are copied to env root."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_warriors_copied_to_env_root(
+        self, task_with_build_context: Path, tmp_path: Path
+    ) -> None:
+        result = self.converter.convert(task_with_build_context)
+        out = tmp_path / "output"
+        write_result(result, out)
+
+        env = result.environments[0]
+        env_dir = out / env.name
+
+        # warriors/ dir should exist at env root (Docker build context)
+        assert (env_dir / "warriors").is_dir()
+        assert (env_dir / "warriors" / "flashpaper.red").is_file()
+        assert (env_dir / "warriors" / "rave.red").is_file()
+
+    def test_dockerfile_not_duplicated(self, task_with_build_context: Path, tmp_path: Path) -> None:
+        result = self.converter.convert(task_with_build_context)
+        out = tmp_path / "output"
+        write_result(result, out)
+
+        env = result.environments[0]
+        env_dir = out / env.name
+
+        # Should have Dockerfile.hud (generated), NOT a raw Dockerfile copy
+        assert (env_dir / "Dockerfile.hud").is_file()
+        assert not (env_dir / "Dockerfile").exists()
+
+    def test_build_context_content_correct(
+        self, task_with_build_context: Path, tmp_path: Path
+    ) -> None:
+        result = self.converter.convert(task_with_build_context)
+        out = tmp_path / "output"
+        write_result(result, out)
+
+        env = result.environments[0]
+        content = (out / env.name / "warriors" / "flashpaper.red").read_text(encoding="utf-8")
+        assert "MOV 0, 1" in content
+
+
+class TestHarborConverterConvertNoDockerfile:
+    """Tasks without environment/Dockerfile should use fallback."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_fallback_dockerfile(self, dataset_no_dockerfile: Path) -> None:
+        result = self.converter.convert(dataset_no_dockerfile)
+        assert len(result.environments) == 1
+        # Fallback dockerfile starts with FROM python:3.11-slim
+        assert "FROM python:3.11-slim" in result.environments[0].dockerfile
+
+    def test_no_harbor_original_comments(self, dataset_no_dockerfile: Path) -> None:
+        result = self.converter.convert(dataset_no_dockerfile)
+        # Fallback dockerfile should NOT have commented-out lines
+        assert "# [harbor original]" not in result.environments[0].dockerfile
+
+
+class TestHarborConverterConvertWithSolutions:
+    """Verify that solution/ dirs show up in task_dirs but write_result skips them."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_solutions_present_in_source(self, dataset_with_solutions: Path) -> None:
+        # Verify the fixture has solution dirs
+        for name in ("task-x", "task-y"):
+            assert (dataset_with_solutions / name / "solution").is_dir()
+
+    def test_convert_succeeds(self, dataset_with_solutions: Path) -> None:
+        result = self.converter.convert(dataset_with_solutions)
+        assert len(result.environments) == 1
+        assert len(result.taskset) == 2
+
+
+class TestHarborConverterEdgeCases:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_no_tasks_raises(self, tmp_path: Path) -> None:
+        empty = tmp_path / "empty-dataset"
+        empty.mkdir()
+        with pytest.raises(ValueError, match="No Harbor tasks found"):
+            self.converter.convert(empty)
+
+    def test_all_tasks_fail_raises(self, tmp_path: Path) -> None:
+        dataset = tmp_path / "bad-dataset"
+        dataset.mkdir()
+        # Create subdirs that look like tasks but have no instruction.md
+        for name in ("a", "b"):
+            d = dataset / name
+            d.mkdir()
+            (d / "task.toml").write_text("[metadata]\n")
+        # Missing instruction.md -> will fail detect, so not even found as task
+        with pytest.raises(ValueError, match="No Harbor tasks found"):
+            self.converter.convert(dataset)
+
+    def test_partial_failure_skips_bad_tasks(self, tmp_path: Path) -> None:
+        dataset = tmp_path / "partial"
+        dataset.mkdir()
+
+        # One good task
+        make_harbor_task(dataset, "good-task")
+
+        # A second, minimal task (task.toml + instruction.md -- still parses as valid)
+        bad = dataset / "bad-task"
+        bad.mkdir()
+        (bad / "task.toml").write_text("[metadata]\n")
+        (bad / "instruction.md").write_text("# OK")  # actually valid
+
+        result = self.converter.convert(dataset)
+        # Both should parse, so 2 tasks
+        assert len(result.taskset) == 2
+
+
+# ============================================================================
+# Taskset metadata
+# ============================================================================
+
+
+class TestTasksetMetadata:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_metadata_includes_harbor_source(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        entry = result.taskset[0]
+        assert "harbor_source" in entry["metadata"]
+
+    def test_metadata_includes_toml_metadata(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        entry = result.taskset[0]
+        meta = entry["metadata"]
+        assert meta.get("category") == "systems"
+        assert meta.get("difficulty") == "medium"
+
+
+# ============================================================================
+# Dockerfile generation
+# ============================================================================
+
+
+class TestDockerfileGeneration:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_cmd_commented_out(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        dockerfile = result.environments[0].dockerfile
+        # Original CMD ["bash"] should be commented out
+        assert "# [harbor original]" in dockerfile
+
+    def test_hud_layer_present(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        dockerfile = result.environments[0].dockerfile
+        assert "COPY env.py" in dockerfile
+        assert "uv" in dockerfile
+        assert "hud" in dockerfile
+
+    def test_tasks_copied_into_image(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        dockerfile = result.environments[0].dockerfile
+        assert "COPY tasks/ /harbor/tasks/" in dockerfile
+
+    def test_logs_dir_created(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        dockerfile = result.environments[0].dockerfile
+        assert "/logs/verifier" in dockerfile
+
+
+# ============================================================================
+# env.py generation
+# ============================================================================
+
+
+class TestEnvPyGeneration:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_imports_present(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "from hud import Environment" in env_py
+        assert "from hud.tools import BashTool" in env_py
+
+    def test_tools_added(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "env.add_tool(BashTool())" in env_py
+        assert "env.add_tool(EditTool())" in env_py
+
+    def test_reward_parsing_logic(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "_parse_harbor_reward" in env_py
+        assert "reward.txt" in env_py
+        assert "reward.json" in env_py
+
+
+# ============================================================================
+# Scenario signature: single-task default vs multi-task Literal
+# ============================================================================
+
+
+class TestScenarioSignature:
+    """Verify that single-task envs get a default and multi-task envs get a Literal."""
+
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    # --- single task: optional with default ---
+
+    def test_single_task_has_default(self, single_task: Path) -> None:
-> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert 'task_id: str = "cancel-async-tasks"' in env_py
+
+    def test_single_task_no_literal_import(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env_py = result.environments[0].env_py
+        assert "from typing import Literal" not in env_py
+        assert "TaskId" not in env_py
+
+    # --- multi-task (same env): Literal type ---
+
+    def test_multi_task_has_literal(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        env_py = result.environments[0].env_py
+        assert "from typing import Literal" in env_py
+        assert "TaskId = Literal[" in env_py
+
+    def test_multi_task_literal_lists_all_ids(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        env_py = result.environments[0].env_py
+        for name in ("cancel-async-tasks", "build-pmars", "chess-best-move"):
+            assert f'"{name}"' in env_py
+
+    def test_multi_task_signature_uses_literal(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        env_py = result.environments[0].env_py
+        assert "def run_task(task_id: TaskId):" in env_py
+
+    def test_multi_task_no_default(self, dataset_same_env: Path) -> None:
+        result = self.converter.convert(dataset_same_env)
+        env_py = result.environments[0].env_py
+        # Should NOT have a default value
+        assert "task_id: TaskId):" in env_py
+        assert "= " not in env_py.split("def run_task(")[1].split("):")[0]
+
+    # --- multi-env dataset: each env gets the right variant ---
+
+    def test_multi_env_all_envs_use_literal(self, dataset_multi_env: Path) -> None:
+        result = self.converter.convert(dataset_multi_env)
+        # Each env has 2 tasks, so every env should use the Literal variant
+        for env in result.environments:
+            assert "TaskId = Literal[" in env.env_py
+
+    def test_single_task_build_context_fixture(self, task_with_build_context: Path) -> None:
+        result = self.converter.convert(task_with_build_context)
+        env_py = result.environments[0].env_py
+        assert 'task_id: str = "build-pmars"' in env_py
+
+
+# ============================================================================
+# pyproject.toml generation
+# ============================================================================
+
+
+class TestPyprojectGeneration:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_has_hud_dependency(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        pyproject = result.environments[0].pyproject_toml
+        assert "hud-python" in pyproject
+
+    def test_name_matches_env(self, single_task: Path) -> None:
+        result = self.converter.convert(single_task)
+        env = result.environments[0]
+        assert env.name in env.pyproject_toml
+
+
+# ============================================================================
+# write_result()
+# ============================================================================
+
+
+class TestWriteResult:
+    def setup_method(self) -> None:
+        self.converter = HarborConverter()
+
+    def test_creates_directory_structure(self, single_task: Path, tmp_path: Path) -> None:
+        result = self.converter.convert(single_task)
+        out = tmp_path / "output"
+        write_result(result, out)
+
+        env = result.environments[0]
+        env_dir = out / env.name
+
+        assert env_dir.is_dir()
+        assert (env_dir / "env.py").is_file()
+        assert (env_dir / "Dockerfile.hud").is_file()
+        assert (env_dir / "pyproject.toml").is_file()
+        assert (env_dir / "tasks").is_dir()
+        assert (out /
"taskset.json").is_file() + + def test_taskset_json_valid(self, single_task: Path, tmp_path: Path) -> None: + result = self.converter.convert(single_task) + out = tmp_path / "output" + taskset_path = write_result(result, out) + + with open(taskset_path, encoding="utf-8") as f: + data = json.load(f) + + assert isinstance(data, list) + assert len(data) == 1 + assert data[0]["args"]["task_id"] == "cancel-async-tasks" + + def test_task_files_copied(self, single_task: Path, tmp_path: Path) -> None: + result = self.converter.convert(single_task) + out = tmp_path / "output" + write_result(result, out) + + env = result.environments[0] + task_out = out / env.name / "tasks" / "cancel-async-tasks" + + assert (task_out / "instruction.md").is_file() + assert (task_out / "task.toml").is_file() + assert (task_out / "tests" / "test.sh").is_file() + + def test_environment_dir_not_copied(self, single_task: Path, tmp_path: Path) -> None: + result = self.converter.convert(single_task) + out = tmp_path / "output" + write_result(result, out) + + env = result.environments[0] + task_out = out / env.name / "tasks" / "cancel-async-tasks" + + # environment/ should be excluded from the copy + assert not (task_out / "environment").exists() + + def test_solution_dir_not_copied(self, dataset_with_solutions: Path, tmp_path: Path) -> None: + result = self.converter.convert(dataset_with_solutions) + out = tmp_path / "output" + write_result(result, out) + + env = result.environments[0] + for task_id in env.task_dirs: + task_out = out / env.name / "tasks" / task_id + assert not (task_out / "solution").exists() + + def test_multi_env_write(self, dataset_multi_env: Path, tmp_path: Path) -> None: + result = self.converter.convert(dataset_multi_env) + out = tmp_path / "output" + write_result(result, out) + + # Both environments should be written + for env in result.environments: + assert (out / env.name).is_dir() + assert (out / env.name / "env.py").is_file() + + # Single taskset.json with all tasks + with open(out / "taskset.json", encoding="utf-8") as f: + data = json.load(f) + assert len(data) == 4 + + def test_overwrites_existing(self, single_task: Path, tmp_path: Path) -> None: + result = self.converter.convert(single_task) + out = tmp_path / "output" + + # Write twice — should not error + write_result(result, out) + write_result(result, out) + + assert (out / "taskset.json").is_file() + + +# ============================================================================ +# Registry integration (detect_format, get_converter, list_formats) +# ============================================================================ + + +class TestConverterRegistry: + def test_get_converter_by_name(self) -> None: + converter = get_converter("harbor") + assert converter is not None + assert isinstance(converter, HarborConverter) + + def test_get_converter_unknown(self) -> None: + assert get_converter("nonexistent") is None + + def test_detect_format_harbor(self, single_task: Path) -> None: + converter = detect_format(single_task) + assert converter is not None + assert converter.name == "harbor" + + def test_detect_format_unknown(self, tmp_path: Path) -> None: + assert detect_format(tmp_path) is None + + def test_list_formats_includes_harbor(self) -> None: + formats = list_formats() + names = [name for name, _desc in formats] + assert "harbor" in names diff --git a/hud/cli/deploy.py b/hud/cli/deploy.py index 1cb77afa..3e354c34 100644 --- a/hud/cli/deploy.py +++ b/hud/cli/deploy.py @@ -3,6 +3,7 @@ from __future__ import annotations import 
asyncio +import logging import os import time from pathlib import Path @@ -14,10 +15,16 @@ from hud.cli.utils.build_logs import poll_build_status, stream_build_logs from hud.cli.utils.config import parse_env_file from hud.cli.utils.context import create_build_context_tarball, format_size -from hud.cli.utils.environment import find_dockerfile, get_environment_name +from hud.cli.utils.environment import ( + find_dockerfile, + get_environment_name, + is_environment_directory, +) from hud.cli.utils.validation import validate_environment from hud.utils.hud_console import HUDConsole +LOGGER = logging.getLogger(__name__) + def collect_environment_variables( directory: Path, @@ -505,6 +512,96 @@ def _save_deploy_link( console.warning(f"Failed to save deploy link: {e}") +def discover_environments(directory: Path) -> list[Path]: + """Find all HUD environment subdirectories within a parent directory. + + Scans immediate children for directories containing a Dockerfile + (Dockerfile.hud or Dockerfile) and pyproject.toml. + + Returns sorted list of environment directory paths. + """ + if not directory.is_dir(): + return [] + return [ + child + for child in sorted(directory.iterdir()) + if child.is_dir() and is_environment_directory(child) + ] + + +def deploy_all( + directory: str, + env: list[str] | None = None, + env_file: str | None = None, + no_cache: bool = False, + verbose: bool = False, + build_args: list[str] | None = None, + build_secrets: list[str] | None = None, +) -> None: + """Deploy all HUD environments found in a directory. + + Discovers subdirectories that are valid HUD environments and deploys + each one sequentially. + """ + hud_console = HUDConsole() + parent = Path(directory).resolve() + + if not parent.is_dir(): + hud_console.error(f"Directory does not exist: {directory}") + raise typer.Exit(1) + + envs = discover_environments(parent) + if not envs: + hud_console.error(f"No HUD environments found in {parent}") + hud_console.info("Expected subdirectories containing Dockerfile.hud + pyproject.toml") + raise typer.Exit(1) + + hud_console.header("Deploy All Environments") + hud_console.info(f"Found {len(envs)} environment(s) in {parent}:") + for env_dir in envs: + hud_console.info(f" {env_dir.name}/") + hud_console.info("") + + succeeded: list[str] = [] + failed: list[str] = [] + + for i, env_dir in enumerate(envs, start=1): + hud_console.section_title(f"[{i}/{len(envs)}] Deploying {env_dir.name}") + + try: + deploy_environment( + directory=str(env_dir), + name=None, + env=env, + env_file=env_file, + no_cache=no_cache, + verbose=verbose, + registry_id=None, + build_args=build_args, + build_secrets=build_secrets, + ) + succeeded.append(env_dir.name) + except (typer.Exit, SystemExit): + LOGGER.warning("Deploy failed for environment %s", env_dir.name) + failed.append(env_dir.name) + except Exception: + LOGGER.exception("Unexpected error deploying %s", env_dir.name) + failed.append(env_dir.name) + + # Summary + hud_console.info("") + hud_console.header("Deploy All Summary") + if succeeded: + hud_console.success(f"{len(succeeded)} environment(s) deployed successfully:") + for name in succeeded: + hud_console.info(f" {name}") + if failed: + hud_console.error(f"{len(failed)} environment(s) failed:") + for name in failed: + hud_console.info(f" {name}") + raise typer.Exit(1) + + def deploy_command( directory: str = typer.Argument(".", help="Environment directory"), name: str | None = typer.Option( @@ -513,6 +610,12 @@ def deploy_command( "-n", help="Environment display name (defaults to 
directory name)", ), + all_envs: bool = typer.Option( + False, + "--all", + "-a", + help="Deploy all HUD environments found in directory", + ), env: list[str] | None = typer.Option( # noqa: B008 None, "--env", @@ -568,11 +671,24 @@ def deploy_command( hud deploy environments/browser hud deploy . --name my-env # Custom name hud deploy . -e API_KEY=xxx # With env vars + hud deploy ./converted --all # Deploy all envs in directory hud deploy . --build-arg NODE_ENV=production # With build args hud deploy . --secret id=MY_KEY,env=MY_KEY # With build secrets (will be encrypted at rest) hud deploy . --secret id=MY_KEY,src=./my_key.txt # Secret from file hud deploy . --no-cache # Force rebuild[/not dim] """ + if all_envs: + deploy_all( + directory=directory, + env=env, + env_file=env_file, + no_cache=no_cache, + verbose=verbose, + build_args=build_args, + build_secrets=secrets, + ) + return + deploy_environment( directory=directory, name=name, diff --git a/hud/cli/tests/test_build.py b/hud/cli/tests/test_build.py index 1c7be8eb..f1efbbf8 100644 --- a/hud/cli/tests/test_build.py +++ b/hud/cli/tests/test_build.py @@ -60,12 +60,12 @@ def test_increment_patch(self): def test_increment_minor(self): """Test incrementing minor version.""" assert increment_version("1.2.3", "minor") == "1.3.0" - assert increment_version("0.5.20", "minor") == "0.6.0" + assert increment_version("0.5.21", "minor") == "0.6.0" def test_increment_major(self): """Test incrementing major version.""" assert increment_version("1.2.3", "major") == "2.0.0" - assert increment_version("0.5.20", "major") == "1.0.0" + assert increment_version("0.5.21", "major") == "1.0.0" def test_increment_with_v_prefix(self): """Test incrementing version with v prefix.""" diff --git a/hud/environment/scenarios.py b/hud/environment/scenarios.py index bec337b2..e33627b9 100644 --- a/hud/environment/scenarios.py +++ b/hud/environment/scenarios.py @@ -628,12 +628,35 @@ async def prompt_handler(**handler_args: Any) -> list[str]: if annotation is not None: try: adapter = TypeAdapter(annotation) - deserialized_args[arg_name] = adapter.validate_json(arg_value) + except Exception: + # Unresolvable annotation (e.g. 
raw string from + # PEP 563 fallback) -- treat as untyped + adapter = None + + if adapter is not None: + # Try validate_json first (handles Pydantic models, + # lists, enums, datetimes from JSON-encoded strings) + try: + deserialized_args[arg_name] = adapter.validate_json(arg_value) + continue + except Exception: # noqa: S110 + pass + + # Fall back to validate_python (handles Literal[str] + # where validate_json("0") would parse as int 0, + # losing the string type) + try: + deserialized_args[arg_name] = adapter.validate_python(arg_value) + continue + except Exception: # noqa: S110 + pass + + # TypeAdapter couldn't handle it -- skip generic + # heuristics that would lose type information + deserialized_args[arg_name] = arg_value continue - except Exception: # noqa: S110 - pass # Fall through to generic JSON decode - # Try JSON decode for strings that look like JSON + # No annotation (or unresolvable): try generic JSON decode heuristics stripped = arg_value.strip() if (stripped and stripped[0] in "[{") or stripped in ("true", "false", "null"): try: diff --git a/hud/environment/tests/test_scenarios.py b/hud/environment/tests/test_scenarios.py index 048b893e..74ac9355 100644 --- a/hud/environment/tests/test_scenarios.py +++ b/hud/environment/tests/test_scenarios.py @@ -4,7 +4,7 @@ from datetime import datetime from enum import Enum -from typing import Any +from typing import Any, Literal import pytest from pydantic import BaseModel @@ -792,6 +792,239 @@ async def list_pydantic_scenario(items: list[_Item]): assert received_items[1].name == "Banana" +class TestLiteralDeserialization: + """Tests for Literal type deserialization edge cases. + + The MCP protocol sends all arguments as strings. When the scenario + function uses Literal types, the deserializer must correctly match + string values -- especially numeric-looking strings like "0", "1". + """ + + @pytest.mark.asyncio + async def test_literal_string_kept_as_string(self) -> None: + """Literal["a", "b"] receives string values correctly.""" + env = Environment("test-env") + received: str | None = None + + @env.scenario("literal_str") + async def literal_str_scenario(choice: Literal["a", "b"]): + nonlocal received + received = choice + yield f"Got {choice}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_str") + assert prompt is not None + + await prompt.render({"choice": "a"}) + assert received == "a" + assert isinstance(received, str) + + @pytest.mark.asyncio + async def test_literal_numeric_string_not_coerced_to_int(self) -> None: + """Literal["0", "1", "2"] keeps "0" as string, not int 0. + + This is the GPQA Diamond bug: task IDs are "0", "1", etc. + and must stay as strings for Path operations. 
+ """ + env = Environment("test-env") + received: Any = None + + @env.scenario("literal_numeric") + async def literal_numeric_scenario(task_id: Literal["0", "1", "2"]): + nonlocal received + received = task_id + yield f"Task {task_id}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_numeric") + assert prompt is not None + + await prompt.render({"task_id": "0"}) + assert received == "0" + assert isinstance(received, str) + + @pytest.mark.asyncio + async def test_literal_numeric_string_various_values(self) -> None: + """All numeric-looking Literal string values stay as strings.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("literal_nums") + async def literal_nums_scenario(idx: Literal["0", "42", "197"]): + nonlocal received + received = idx + yield f"Index {idx}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_nums") + assert prompt is not None + + for val in ("0", "42", "197"): + await prompt.render({"idx": val}) + assert received == val, f"Expected {val!r}, got {received!r}" + assert isinstance(received, str), f"Expected str, got {type(received)}" + + @pytest.mark.asyncio + async def test_literal_int_coerces_correctly(self) -> None: + """Literal[1, 2, 3] with int values coerces string "1" to int 1.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("literal_int") + async def literal_int_scenario(level: Literal[1, 2, 3]): + nonlocal received + received = level + yield f"Level {level}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_int") + assert prompt is not None + + await prompt.render({"level": "2"}) + assert received == 2 + assert isinstance(received, int) + + @pytest.mark.asyncio + async def test_literal_mixed_types(self) -> None: + """Literal["auto", 0, 1] handles mixed string/int literal values.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("literal_mixed") + async def literal_mixed_scenario(mode: Literal["auto", 0, 1]): + nonlocal received + received = mode + yield f"Mode {mode}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_mixed") + assert prompt is not None + + await prompt.render({"mode": "auto"}) + assert received == "auto" + + @pytest.mark.asyncio + async def test_literal_with_default(self) -> None: + """Literal with default value works when arg is provided.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("literal_default") + async def literal_default_scenario( + task_id: Literal["build-pmars"] = "build-pmars", + ): + nonlocal received + received = task_id + yield f"Task {task_id}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:literal_default") + assert prompt is not None + + await prompt.render({"task_id": "build-pmars"}) + assert received == "build-pmars" + + @pytest.mark.asyncio + async def test_int_annotation_coerces_numeric_string(self) -> None: + """Plain int annotation coerces "42" to 42.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("int_arg") + async def int_arg_scenario(count: int): + nonlocal received + received = count + yield f"Count {count}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:int_arg") + assert prompt is not None + + await prompt.render({"count": "42"}) + assert received == 42 + assert isinstance(received, int) + + @pytest.mark.asyncio + async def test_float_annotation_coerces_numeric_string(self) -> None: + """Plain float annotation coerces "3.14" to 
3.14.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("float_arg") + async def float_arg_scenario(rate: float): + nonlocal received + received = rate + yield f"Rate {rate}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:float_arg") + assert prompt is not None + + await prompt.render({"rate": "3.14"}) + assert received == pytest.approx(3.14) + assert isinstance(received, float) + + @pytest.mark.asyncio + async def test_bool_annotation_coerces_string(self) -> None: + """Bool annotation coerces "true"/"false" correctly.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("bool_arg") + async def bool_arg_scenario(verbose: bool): + nonlocal received + received = verbose + yield f"Verbose {verbose}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:bool_arg") + assert prompt is not None + + await prompt.render({"verbose": "true"}) + assert received is True + + @pytest.mark.asyncio + async def test_str_annotation_preserves_numeric_string(self) -> None: + """Plain str annotation keeps "42" as string "42".""" + env = Environment("test-env") + received: Any = None + + @env.scenario("str_numeric") + async def str_numeric_scenario(name: str): + nonlocal received + received = name + yield f"Name {name}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:str_numeric") + assert prompt is not None + + await prompt.render({"name": "42"}) + assert received == "42" + assert isinstance(received, str) + + @pytest.mark.asyncio + async def test_no_annotation_numeric_becomes_int(self) -> None: + """Untyped arg with numeric-looking string falls through to json.loads.""" + env = Environment("test-env") + received: Any = None + + @env.scenario("untyped_num") + async def untyped_num_scenario(val): + nonlocal received + received = val + yield f"Val {val}" + yield 1.0 + + prompt = env._prompt_manager._prompts.get("test-env:untyped_num") + assert prompt is not None + + await prompt.render({"val": "42"}) + # Without annotation, generic heuristic converts to int + assert received == 42 + + class TestScenarioNameNormalization: """Test edge cases for environment and scenario name handling.""" diff --git a/hud/patches/mcp_patches.py b/hud/patches/mcp_patches.py index fbac6bd7..d8d73fa7 100644 --- a/hud/patches/mcp_patches.py +++ b/hud/patches/mcp_patches.py @@ -8,11 +8,60 @@ from __future__ import annotations import logging -from typing import Any +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + import httpx + from mcp.client.streamable_http import StreamWriter logger = logging.getLogger(__name__) +def patch_json_response_error_propagation() -> None: + """ + Patch _handle_json_response to re-raise exceptions instead of swallowing them. + + The original implementation catches all exceptions (e.g. ReadError during + response.aread(), ValidationError during JSON parsing) and sends them as raw + Exception objects to the read stream — where BaseSession._handle_incoming + silently drops them. This causes the caller (call_tool / send_request) to + hang forever waiting for a response that will never arrive. + + By re-raising, exceptions propagate to the retry loop in our patched + post_writer, which already distinguishes retryable errors (ReadError → + retry with backoff) from non-retryable ones (ValidationError → send + proper JSONRPCError to resolve the pending request). 
+ """ + try: + from mcp.client.streamable_http import StreamableHTTPTransport + from mcp.shared.message import SessionMessage + from mcp.types import JSONRPCMessage + + async def patched_handle_json_response( + self: Any, + response: httpx.Response, + read_stream_writer: StreamWriter, + is_initialization: bool = False, + ) -> None: + try: + content = await response.aread() + message = JSONRPCMessage.model_validate_json(content) + if is_initialization: + self._maybe_extract_protocol_version_from_message(message) + await read_stream_writer.send(SessionMessage(message)) + except Exception: + logger.exception("Error in _handle_json_response") + raise + + StreamableHTTPTransport._handle_json_response = patched_handle_json_response + logger.debug("Patched StreamableHTTPTransport._handle_json_response to re-raise errors") + + except ImportError: + logger.debug("mcp.client.streamable_http not available, skipping patch") + except Exception as e: + logger.warning("Failed to patch _handle_json_response: %s", e) + + def patch_streamable_http_error_handling() -> None: """ Patch StreamableHTTPTransport.post_writer to handle request errors properly. @@ -313,6 +362,7 @@ def suppress_fastmcp_logging(level: int = logging.WARNING) -> None: def apply_all_patches() -> None: """Apply all MCP patches.""" + patch_json_response_error_propagation() patch_streamable_http_error_handling() patch_client_session_validation() patch_server_output_validation() diff --git a/hud/utils/tests/test_version.py b/hud/utils/tests/test_version.py index 4a70e3b6..014478d7 100644 --- a/hud/utils/tests/test_version.py +++ b/hud/utils/tests/test_version.py @@ -5,4 +5,4 @@ def test_import(): """Test that the package can be imported.""" import hud - assert hud.__version__ == "0.5.20" + assert hud.__version__ == "0.5.21" diff --git a/hud/version.py b/hud/version.py index de07e468..65160cef 100644 --- a/hud/version.py +++ b/hud/version.py @@ -4,4 +4,4 @@ from __future__ import annotations -__version__ = "0.5.20" +__version__ = "0.5.21" diff --git a/pyproject.toml b/pyproject.toml index fb785572..3b059cf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "hud-python" -version = "0.5.20" +version = "0.5.21" description = "SDK for the HUD platform." readme = "README.md" requires-python = ">=3.11, <3.13"