docs/advanced/harbor-convert.mdx (33 changes: 29 additions & 4 deletions)
@@ -15,7 +15,7 @@ git clone https://github.com/laude-institute/terminal-bench-2.git
# 2. Convert to HUD format
hud convert ./terminal-bench-2/ --output ./tb2-hud

# 3. Deploy all environments
# 3. Deploy all environments (~3 min per environment, leave it running)
hud deploy ./tb2-hud --all

# 4. Run evaluation
@@ -24,6 +24,11 @@ hud eval ./tb2-hud/taskset.json

That's it. The converter handles Dockerfile adaptation, build context, test scripts, and reward parsing automatically.

<Tip>
Each environment takes roughly 3 minutes to build and deploy. For datasets with many environments,
`hud deploy --all` runs them sequentially -- just leave it running and check back when it's done.
</Tip>

## What Gets Converted

A Harbor task directory:
@@ -81,9 +86,29 @@ Harbor test scripts write results to `/logs/verifier/`. The converter supports both:
- `reward.txt` -- a single float (`1.0` for pass, `0.0` for fail)
- `reward.json` -- `{"reward": 1.0}` or just a float
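
For reference, that fallback amounts to something like the sketch below (illustrative only: `parse_reward` is a hypothetical helper, not the converter's actual implementation):

```python
import json
from pathlib import Path


def parse_reward(verifier_dir: str = "/logs/verifier") -> float:
    """Read a Harbor-style reward from reward.txt or reward.json."""
    base = Path(verifier_dir)

    txt = base / "reward.txt"
    if txt.exists():
        # reward.txt holds a single float: 1.0 for pass, 0.0 for fail
        return float(txt.read_text().strip())

    js = base / "reward.json"
    if js.exists():
        data = json.loads(js.read_text())
        # reward.json may be {"reward": 1.0} or just a bare float
        if isinstance(data, dict):
            return float(data["reward"])
        return float(data)

    raise FileNotFoundError(f"no reward.txt or reward.json under {base}")
```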

## Running Programmatically
## Running Tasks

### Option 1: Upload as a Taskset (recommended)

The generated `taskset.json` can be uploaded directly to the HUD platform for managed evaluation, leaderboards, and comparison across models:

1. Go to [hud.ai/evalsets](https://hud.ai/evalsets) and create a new taskset
2. Click **Upload Tasks** and paste the contents of `taskset.json`
3. Run evaluations from the platform UI or via `hud eval`

See the [Tasksets guide](/platform/tasksets) for full details on creating and managing tasksets.

### Option 2: CLI eval

Run the taskset directly from the command line:

```bash
hud eval ./tb2-hud/taskset.json
```

### Option 3: Python SDK

You can also run converted tasks from Python using the SDK:
Run tasks programmatically with any agent:

```python
import asyncio
@@ -108,7 +133,7 @@ async def main():
asyncio.run(main())
```

Or load the full taskset:
Or load the full taskset as Task objects:

```python
import json
# ...
```
hud/cli/__init__.py (161 changes: 134 additions & 27 deletions)
@@ -1026,46 +1026,153 @@ def get(

@app.command()
def convert(
tasks_file: str = typer.Argument(
..., help="Path to tasks file (JSON/JSONL) to convert to remote MCP configuration"
path: str = typer.Argument(
..., help="Path to source tasks/dataset directory to convert to HUD format"
),
from_format: str = typer.Option(
"auto",
"--from",
"-f",
help="Source format (auto, harbor). Use 'auto' to detect automatically.",
),
output: str | None = typer.Option(
None,
"--output",
"-o",
help="Output directory (default: ./hud_converted)",
),
) -> None:
"""Convert local MCP task configs to remote (mcp.hud.ai) format.
"""Convert external benchmark formats to HUD environments + tasksets.

This mirrors the implicit conversion flow used by 'hud rl' and writes a new
remote_<name>.json next to the source file when needed.
[not dim]Converts tasks from frameworks like Harbor into HUD-compatible
environments (env.py + Dockerfile.hud) and v5 taskset files.

Supports pluggable formats. Currently: harbor.

Examples:
hud convert ./algotune/ # Auto-detect, convert dataset
hud convert ./my-task/ --from harbor # Explicit format
hud convert ./dataset/ --output ./out # Custom output directory[/not dim]
"""
from pathlib import Path

from .convert import detect_format, get_converter, list_formats, write_result

hud_console = HUDConsole()
source_path = Path(path).resolve()

try:
from .flows.tasks import convert_tasks_to_remote
if not source_path.exists():
hud_console.error(f"Path does not exist: {path}")
raise typer.Exit(1)

result_path = convert_tasks_to_remote(tasks_file)
# Resolve converter
if from_format == "auto":
converter = detect_format(source_path)
if converter is None:
# Auto-detect failed — prompt user to pick a format
available = list_formats()
if not available:
hud_console.error("No converters registered.")
raise typer.Exit(1)

if len(available) == 1:
# Only one format exists, just use it
converter = get_converter(available[0][0])
if converter:
hud_console.info(f"Using format: {converter.name}")
else:
import questionary

choices = [
questionary.Choice(title=f"{name} — {desc}", value=name)
for name, desc in available
]
picked = questionary.select(
"Could not auto-detect format. Which format is this?",
choices=choices,
).ask()
if not picked:
raise typer.Exit(1)
converter = get_converter(picked)

# If nothing changed, inform the user
try:
if Path(result_path).resolve() == Path(tasks_file).resolve():
hud_console.success(
"Tasks already reference remote MCP URLs. No conversion needed."
)
hud_console.hint("You can run them directly with: hud eval <tasks_file> --full")
return
except Exception as e:
# Best effort; continue with success message
hud_console.debug(f"Path comparison failed, continuing: {e}")

hud_console.success(f"Converted tasks written to: {result_path}")
hud_console.hint(
"You can now run remote flows: hud rl <converted_file> or hud eval <converted_file>"
)
except typer.Exit:
raise
if converter is None:
hud_console.error("No converter selected.")
raise typer.Exit(1)
else:
hud_console.info(f"Detected format: {converter.name}")
else:
converter = get_converter(from_format)
if converter is None:
hud_console.error(f"Unknown format: {from_format}")
available = list_formats()
if available:
hud_console.info("Available formats:")
for name, desc in available:
hud_console.info(f" {name}: {desc}")
raise typer.Exit(1)

# Run conversion
try:
result = converter.convert(source_path)
except ValueError as e:
hud_console.error(str(e))
raise typer.Exit(1) from e
except Exception as e:
hud_console.error(f"Conversion failed: {e}")
raise typer.Exit(1) from e

# Write output
output_dir = Path(output) if output else Path("./hud_converted")
try:
taskset_path = write_result(result, output_dir.resolve())
except Exception as e:
hud_console.error(f"Failed to convert tasks: {e}")
hud_console.error(f"Failed to write output: {e}")
raise typer.Exit(1) from e

# Display results
hud_console.header("Convert Complete")
hud_console.info("")

total_tasks = len(result.taskset)
total_envs = len(result.environments)
hud_console.success(f"Converted {total_tasks} task(s) into {total_envs} environment(s).")
hud_console.info("")

# Show each environment
hud_console.section_title("Environments")
for env_gen in result.environments:
task_count = len(env_gen.task_dirs)
hud_console.status_item(env_gen.name, f"{task_count} tasks")
hud_console.info("")

# Show output paths
hud_console.section_title("Output")
hud_console.status_item("Directory", str(output_dir.resolve()))
hud_console.status_item("Taskset", str(taskset_path))
hud_console.info("")

# Show next steps with numbered commands
hud_console.section_title("Next Steps")
hud_console.info("")

hud_console.info("1. Deploy environment(s):")
if total_envs > 1:
hud_console.command_example(
f"hud deploy {output_dir.resolve()} --all",
f"Deploy all {total_envs} environments",
)
else:
first_env = result.environments[0].name if result.environments else "<env>"
hud_console.command_example(
f"hud deploy {output_dir.resolve() / first_env}",
"Build & deploy to HUD platform",
)
hud_console.info("")

hud_console.info("2. Run evaluation:")
hud_console.command_example(f"hud eval {taskset_path}", "Run agent against tasks")
hud_console.info("")


@app.command()
def cancel(
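
The registry calls in this diff (`detect_format`, `get_converter`, `list_formats`, `write_result`) imply a small pluggable-converter interface. Below is a minimal sketch of what a new format plugin might look like; every name in it is inferred only from the attributes the CLI code above actually touches (`converter.name`, `converter.convert(source_path)`, `result.taskset`, `result.environments`, `env_gen.name`, `env_gen.task_dirs`), so the real types and registration mechanism in `hud/cli/convert` may well differ:

```python
from dataclasses import dataclass, field
from pathlib import Path


@dataclass
class EnvironmentGen:
    """Mirrors the attributes the CLI reads: .name and .task_dirs."""
    name: str
    task_dirs: list[Path] = field(default_factory=list)


@dataclass
class ConvertResult:
    """Mirrors the attributes the CLI reads: .taskset and .environments."""
    taskset: list[dict] = field(default_factory=list)
    environments: list[EnvironmentGen] = field(default_factory=list)


class MyFormatConverter:
    """Hypothetical plugin for a dataset laid out as <dataset>/<task>/task.yaml."""

    name = "myformat"
    description = "Tasks laid out as <dataset>/<task>/task.yaml"

    def detect(self, path: Path) -> bool:
        # Cheap structural probe; how auto-detection actually hooks in
        # (method vs. module-level detect_format) is not shown in the diff.
        return any(path.glob("*/task.yaml"))

    def convert(self, path: Path) -> ConvertResult:
        result = ConvertResult()
        env = EnvironmentGen(name=path.name)
        for task_yaml in sorted(path.glob("*/task.yaml")):
            task_dir = task_yaml.parent
            env.task_dirs.append(task_dir)
            result.taskset.append({"id": task_dir.name})  # placeholder v5 entry
        result.environments.append(env)
        return result
```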