From 971f613ee4cb1ed4053e6d37df3f543857b7de01 Mon Sep 17 00:00:00 2001 From: ryantzr1 Date: Mon, 12 Jan 2026 16:33:29 +0800 Subject: [PATCH] feat: add multi-agent orchestration pattern - Add multi-agent example with conductor and sub-agents - Add skills documentation (overview, authoring guide, orchestration) - Fix codex model routing to check only matched ID - Remove unsafe eval() from docs example --- docs/cookbooks/multi-agent.mdx | 398 ++++++++++++++++++++++++++++++ docs/docs.json | 1 + examples/07_multi_agent.py | 267 ++++++++++++++++++++ hud/agents/resolver.py | 13 +- hud/agents/tests/test_resolver.py | 38 +++ 5 files changed, 716 insertions(+), 1 deletion(-) create mode 100644 docs/cookbooks/multi-agent.mdx create mode 100644 examples/07_multi_agent.py diff --git a/docs/cookbooks/multi-agent.mdx b/docs/cookbooks/multi-agent.mdx new file mode 100644 index 00000000..1887cc4f --- /dev/null +++ b/docs/cookbooks/multi-agent.mdx @@ -0,0 +1,398 @@ +--- +title: "Multi-Agent Orchestration" +description: "Compose specialized agents into coordinated multi-agent systems" +icon: "diagram-project" +--- + +Multi-agent systems let you combine specialized agents—each with their own environment, tools, and model—into a coordinated workflow. A "conductor" agent orchestrates the specialists, dispatching tasks and synthesizing results. + + + Follow along with the full working example on GitHub. + + +## Overview + +The multi-agent pattern solves a common problem: as agent capabilities grow, a single agent with 50+ tools becomes unwieldy. By splitting responsibilities across specialized agents, each one stays focused and effective. 
+ +```mermaid +flowchart TD + subgraph orch["Coordinator (Conductor)"] + O["2 sub-agent tools"] + end + + subgraph browser["Browser Agent"] + B1["navigate"] + B2["click"] + B3["extract_text"] + end + + subgraph coding["Coding Agent"] + C1["shell"] + C2["apply_patch"] + C3["read_file"] + end + + O --> browser + O --> coding +``` + +The conductor sees only 2 tools—one per specialist. Each specialist has a focused toolset for its domain. + +## Key Concepts + +| Concept | Description | +|---------|-------------| +| **Coordinator** | An Environment with sub-agents registered as tools | +| **AgentTool** | Wraps an environment + scenario as a callable tool | +| **Conductor** | The agent that runs the coordinator (makes decisions) | +| **Sub-agent** | A specialized agent wrapped as a tool | +| **Eval-only params** | Parameters hidden from conductor but available for evaluation | + +## Quick Start + +### Prerequisites + +```bash +export HUD_API_KEY="sk-hud-..." +``` + +Get your API key at [hud.ai/project/api-keys](https://hud.ai/project/api-keys). + + + **Prerequisites**: You must deploy two hub environments before running this example: + + 1. **Remote Browser**: Go to [hud-evals/hud-remote-browser](https://github.com/hud-evals/hud-remote-browser) → Fork to your GitHub → [hud.ai](https://hud.ai) → **New** → **Environment** → Import from your repo. Set required browser provider API keys (e.g., `ANCHOR_API_KEY`). + + 2. **Codex Sandbox**: Go to [hud.ai](https://hud.ai) → **New** → **Environment** → Import from [hud-evals/codex_environment_sandbox](https://github.com/hud-evals/codex_environment_sandbox). + + Once deployed, update the `connect_hub()` calls to use your environment slugs (e.g., `my-org/remote-browser`). 
+ + +### Running the Example + +```bash +# Default task: research and save to markdown +uv run python examples/07_multi_agent.py + +# Custom research task +uv run python examples/07_multi_agent.py \ + --task "Find current prices of Bitcoin and Ethereum and save to crypto.md" + +# Verbose mode +uv run python examples/07_multi_agent.py --verbose +``` + +## Building a Multi-Agent System + +The pattern is simple: +1. Create `AgentTool`s that wrap environments + models +2. Register them on a coordinator `Environment` +3. Run a "conductor" agent that dispatches work to sub-agents + +### Step 1: Create Sub-Agent Environments + +Each sub-agent is an `Environment` with its own tools and scenario. Connect to HUD Hub environments or define local tools: + +```python +from hud import Environment +from hud.tools.agent import AgentTool + + +def create_browser_agent() -> AgentTool: + """Create a browser sub-agent for web research.""" + env = Environment("browser") + env.connect_hub("hud-remote-browser-2") + + @env.scenario() + async def web_research( + task: str, + start_url: str | None = None, + expected_outcome: str | None = None, # Eval-only (hidden from conductor) + ): + """Research information on the web.""" + prompt = f"""You are a web research agent with browser access. + +Research Task: {task} +""" + if start_url: + prompt += f"\nStart URL: {start_url}" + + prompt += """ + +Find relevant information, extract key data, and return structured findings.""" + + yield prompt + yield 1.0 + + return AgentTool( + env("web_research"), + model="claude-sonnet-4-5", # Good at browser navigation + name="web_research", + description="Research information on the web. 
Use for finding articles, " + "scraping data, comparing prices, and extracting structured information.", + ) +``` + +### Step 2: Define the Coding Agent + +```python +def create_coding_agent() -> AgentTool: + """Create a coding sub-agent for file operations.""" + env = Environment("coding") + env.connect_hub("codex_environment_sandbox") + + @env.scenario() + async def create_markdown( + filename: str, + content: str, + expected_result: str | None = None, # Eval-only + ): + """Create a markdown file with the given content.""" + prompt = f"""You are a file creation assistant. + +Task: Create a markdown file named '{filename}' with the following content: + +{content} + +IMPORTANT: Use the `apply_patch` tool to create the file. + +Steps: +1. Use apply_patch to create '{filename}' with the content above +2. Confirm it was created successfully + +Return a confirmation message.""" + + yield prompt + yield 1.0 + + return AgentTool( + env("create_markdown"), + model="gpt-5.1", # Codex-capable for native shell/apply_patch + name="create_markdown", + description="Create a markdown file with specified content. Use for " + "saving research findings, creating reports, and documenting results.", + ) +``` + +### Step 3: Create the Coordinator + +Create an `Environment` with sub-agents as tools, then run a conductor agent: + +```python +import hud +from hud import Environment +from hud.agents import create_agent + + +async def run_research(task: str): + # Create sub-agents as tools + browser_agent = create_browser_agent() + coding_agent = create_coding_agent() + + # Create coordinator environment with sub-agents as tools + coordinator = Environment("coordinator") + coordinator.add_tool(browser_agent) + coordinator.add_tool(coding_agent) + + # Define the coordination scenario + @coordinator.scenario() + async def coordinate(prompt: str): + yield prompt + yield 1.0 + + # System prompt for the conductor + system_prompt = """You are a research assistant coordinating specialized agents. 
+ +Available sub-agents (call as tools): +- web_research: Find information on the web +- create_markdown: Create markdown files + +CRITICAL: Sub-agents don't share context. When calling create_markdown, +you MUST pass the content you want to save. + +Workflow: +1. web_research: Gather data +2. Format the data into markdown content +3. create_markdown: Save the formatted content +""" + + # Run with eval context + async with hud.eval( + coordinator("coordinate", prompt=task), + name="multi-agent-research", + ) as ctx: + conductor = create_agent("gpt-4o", system_prompt=system_prompt) + result = await conductor.run(ctx, max_steps=10) + + print(f"Reward: {ctx.reward}") + print(f"Result: {result.content}") +``` + +## AgentTool API + +`AgentTool` wraps an environment's scenario as a callable tool: + +```python +from hud.tools.agent import AgentTool + +tool = AgentTool( + env("scenario_name"), # Task from environment + model="claude-sonnet-4-5", # Model for this sub-agent + name="tool_name", # Name shown to conductor + description="...", # Description for conductor + agent=None, # Or provide custom agent class + agent_params={}, # Params passed to agent + trace=False, # Enable separate tracing +) +``` + +### Eval-Only Parameters + +Parameters declared as `T | None = None` (an optional type with a `None` default) are automatically hidden from the conductor's tool schema. The rule applies to every parameter written this way (including `start_url` in the `web_research` scenario above), so reserve it for values the conductor should not supply: + +```python +@env.scenario() +async def investigate( + query: str, # Visible to conductor + expected_finding: str | None = None, # Hidden (eval-only) +): + response = yield f"Investigate: {query}" + + # Use expected_finding for scoring + if expected_finding and response: + yield 1.0 if expected_finding.lower() in response.lower() else 0.0 + else: + yield 1.0 +``` + +This lets you include ground truth for evaluations without exposing it to the conductor. + +## Context Isolation + + +**Sub-agents don't share context.** Each sub-agent runs in its own isolated environment. The conductor must explicitly pass all necessary data when calling a sub-agent. 
+ + +```python +# ❌ Wrong: Assuming sub-agent knows about previous results +result = await ctx.call_tool(name="web_research", arguments={"task": "Find stock prices"}) +# The create_markdown agent won't know what web_research found! +await ctx.call_tool(name="create_markdown", arguments={"filename": "report.md"}) + +# ✅ Correct: Pass data explicitly +result = await ctx.call_tool(name="web_research", arguments={"task": "Find stock prices"}) +await ctx.call_tool(name="create_markdown", arguments={ + "filename": "report.md", + "content": result.content # Pass the data! +}) +``` + +Your system prompt should remind the conductor about this: + +```python +system_prompt="""... +CRITICAL: Sub-agents don't share context. When calling create_markdown, +you MUST pass the content you want to save. +...""" +``` + +## Trace Continuity + +All sub-agent activity appears in a single trace on the HUD platform. When the conductor calls a sub-agent tool, the inference and tool calls are recorded under the parent trace—no separate URLs to track. + +``` +🎭 Coordinator Trace +├── 🤖 Conductor: "I'll research GOOGL prices first..." +│ └── 🔧 web_research(task="Find GOOGL price") +│ ├── 🤖 Browser Agent: "Navigating to finance site..." +│ │ ├── 🔧 navigate(url="https://finance.google.com") +│ │ └── 🔧 extract_text(selector=".price") +│ └── ✅ "GOOGL: $185.42" +├── 🤖 Conductor: "Now I'll save to markdown..." +│ └── 🔧 create_markdown(filename="googl.md", content="# GOOGL Price\n...") +│ ├── 🤖 Coding Agent: "Creating file..." +│ │ └── 🔧 apply_patch(type="create_file", path="googl.md", ...) +│ └── ✅ "Created googl.md" +└── ✅ "Research complete!" 
+``` + +## Advanced Patterns + +### Custom Conductor Agent + +Use a custom agent class for the conductor: + +```python +from hud.agents.claude import ClaudeAgent + +# Create and run with a custom agent +async with hud.eval(coordinator("coordinate", prompt=task)) as ctx: + conductor = ClaudeAgent.create( + checkpoint_name="claude-sonnet-4-5", + system_prompt=system_prompt, + max_tokens=8192, + ) + result = await conductor.run(ctx, max_steps=10) +``` + +### Multiple Scenarios + +Define multiple scenarios on the coordinator: + +```python +@coordinator.scenario() +async def research(prompt: str): + yield prompt + yield 1.0 + +@coordinator.scenario() +async def summarize(topic: str, length: str = "short"): + yield f"Summarize {topic} in a {length} format" + yield 1.0 + +# Use different scenarios +async with hud.eval(coordinator("research", prompt="Find Python frameworks")) as ctx: + ... + +async with hud.eval(coordinator("summarize", topic="ML", length="detailed")) as ctx: + ... +``` + +### Mixing AgentTools with Regular Tools + +You can add both AgentTools (sub-agents) and regular tools: + +```python +from hud.tools.base import BaseTool + +class CalculatorTool(BaseTool): + def __init__(self): + super().__init__(name="calculator", description="Add two numbers") + + async def __call__(self, a: float, b: float) -> str: + return str(a + b) + +coordinator = Environment("hybrid") +coordinator.add_tool(browser_agent) # AgentTool (spawns sub-agent) +coordinator.add_tool(CalculatorTool()) # Regular tool (runs directly) +``` + +## CLI Options + +| Flag | Default | Description | +|------|---------|-------------| +| `--task` | Stock research | The task for the coordinator | +| `--conductor` | `gpt-4o` | Model for the conductor agent | +| `--max-steps` | `10` | Maximum conductor steps | +| `--verbose` | Off | Enable verbose output | + +## See Also + +- [Ops Diagnostics](/cookbooks/ops-diagnostics) - A more complex multi-agent example +- [AgentTool 
Reference](/reference/tools#agenttool) - Detailed AgentTool API +- [Building Environments](/build-environments) - Creating custom environments +- [Scenarios](/reference/environments#scenarios) - Scenario patterns and best practices diff --git a/docs/docs.json b/docs/docs.json index d3f7332c..1ca08a0e 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -64,6 +64,7 @@ "group": "Cookbooks", "pages": [ "cookbooks/codex-coding", + "cookbooks/multi-agent", "cookbooks/ops-diagnostics" ] }, diff --git a/examples/07_multi_agent.py b/examples/07_multi_agent.py new file mode 100644 index 00000000..4a4a356c --- /dev/null +++ b/examples/07_multi_agent.py @@ -0,0 +1,267 @@ +""" +Multi-Agent Example - Smart Research Assistant + +This example demonstrates how to compose multiple specialized agents +into a multi-agent system using AgentTools. + +The pattern is simple: +1. Create AgentTools that wrap environments + models +2. Register them on a coordinator environment +3. Run a "conductor" agent that dispatches work to sub-agents + +The Smart Research Assistant combines: +- Browser agent: Finds information, scrapes data, navigates websites +- Coding agent: Creates markdown files with research findings + +Uses real HUD Hub environments: +- codex_environment_sandbox: Coding environment with shell and file editing tools +- hud-remote-browser-2: Browser automation for web tasks + +Usage: + export HUD_API_KEY="sk-hud-..." 
+ uv run python examples/07_multi_agent.py + + # Custom task + uv run python examples/07_multi_agent.py \\ + --task "Find current prices of Bitcoin and Ethereum and save to crypto.md" +""" + +import argparse +import asyncio + +from dotenv import load_dotenv + +load_dotenv() + +import hud +from hud import Environment +from hud.agents import create_agent +from hud.settings import settings +from hud.tools.agent import AgentTool + + +# ============================================================================= +# Create Sub-Agents from Hub Environments +# ============================================================================= + + +def create_coding_agent() -> AgentTool: + """Create a coding sub-agent for markdown file creation.""" + env = Environment("coding") + env.connect_hub("codex_environment_sandbox") + + @env.scenario() + async def create_markdown( + filename: str, + content: str, + expected_result: str | None = None, # Eval-only param (hidden from tool schema) + ): + """Create a markdown file with the given content.""" + prompt = f"""You are a file creation assistant with access to a coding environment. + +Task: Create a markdown file named '{filename}' with the following content: + +{content} + +IMPORTANT: Use the `apply_patch` tool to create the file. Do NOT use shell commands like cat or echo. + +Steps: +1. Use apply_patch to create '{filename}' with the content above +2. Use list_files or read_file to confirm it was created + +Return a confirmation message with the filename and location.""" + + yield prompt + yield 1.0 + + return AgentTool( + env("create_markdown"), + model="gpt-5.1", + name="create_markdown", + description="Create a markdown file with specified content. 
Use for: " + "saving research findings, creating reports, documenting results.", + ) + + +def create_browser_agent() -> AgentTool: + """Create a browser automation sub-agent for web research.""" + env = Environment("browser") + env.connect_hub("hud-remote-browser-2") + + @env.scenario() + async def web_research( + task: str, + start_url: str | None = None, + expected_outcome: str | None = None, # Eval-only param + ): + """Research information on the web using browser automation.""" + prompt = f"""You are a web research agent with access to a browser. + +Research Task: {task} +""" + if start_url: + prompt += f"\nStart URL: {start_url}" + + prompt += """ + +Your job is to: +1. Navigate to relevant websites +2. Search for information related to the task +3. Extract key data, facts, and information +4. Return a clear, structured summary + +Include: key findings, data points (prices, numbers, dates), and sources visited.""" + + yield prompt + yield 1.0 + + return AgentTool( + env("web_research"), + model="claude-sonnet-4-5", + name="web_research", + description="Research information on the web. Use for finding articles, " + "scraping data, comparing prices, and extracting structured information.", + ) + + +# ============================================================================= +# Multi-Agent Orchestration Pattern +# ============================================================================= + + +async def run_multi_agent( + task: str, + conductor_model: str = "gpt-4o", + max_steps: int = 10, + verbose: bool = False, +) -> None: + """ + Run a multi-agent system with a conductor dispatching to sub-agents. + + This shows the core pattern for multi-agent orchestration: + 1. Create an Environment for the coordinator + 2. Add AgentTools as callable tools + 3. 
Run a conductor agent that dispatches work + """ + + if not settings.api_key: + raise ValueError( + "HUD_API_KEY is required for hub environments.\n" + "Get yours at: https://hud.ai/project/api-keys\n" + "Then: export HUD_API_KEY='sk-hud-...'" + ) + + # Create sub-agents as tools + coding_agent = create_coding_agent() + browser_agent = create_browser_agent() + + # Create coordinator environment with sub-agents as tools + coordinator = Environment("coordinator") + coordinator.add_tool(browser_agent) + coordinator.add_tool(coding_agent) + + # Define the coordination scenario + @coordinator.scenario() + async def coordinate(prompt: str): + yield prompt + yield 1.0 + + # System prompt for the conductor + system_prompt = """You are a Smart Research Assistant coordinating specialized agents. + +Available sub-agents (call as tools): +- web_research: Find information, scrape data, compare prices +- create_markdown: Create markdown files with specified content + +CRITICAL: Sub-agents don't share context. When calling create_markdown, +you MUST pass the full content you want to save. + +Workflow: +1. Use web_research to gather data (prices, facts, numbers) +2. Format the data into markdown content +3. Use create_markdown to save the formatted content +4. 
Iterate if needed""" + + print("🎭 Smart Research Assistant") + print(f"🤖 Conductor: {conductor_model}") + print(f"🔧 Sub-agents: {browser_agent.name}, {coding_agent.name}") + print(f"📋 Task: {task}") + print("=" * 70) + + # Run with eval context + async with hud.eval( + coordinator("coordinate", prompt=task), + name="multi-agent-research", + ) as ctx: + # Create conductor agent and run + conductor = create_agent( + conductor_model, + system_prompt=system_prompt, + verbose=verbose, + ) + result = await conductor.run(ctx, max_steps=max_steps) + + print("=" * 70) + print("✅ Research Complete!") + print(f"📊 Reward: {ctx.reward}") + if result.content: + print(f"\n📝 Summary:\n{result.content}") + + +# ============================================================================= +# CLI +# ============================================================================= + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Multi-agent research assistant", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + uv run python examples/07_multi_agent.py \\ + --task "Research AAPL stock price and save to stock_prices.md" + + uv run python examples/07_multi_agent.py \\ + --task "Find 3 laptops under $2000 and save specs to laptops.md" +""", + ) + parser.add_argument( + "--task", + type=str, + default="Research current price of GOOGL and save to googl_price.md", + help="Research task to complete", + ) + parser.add_argument( + "--conductor", + type=str, + default="gpt-4o", + help="Model for the conductor agent (default: gpt-4o)", + ) + parser.add_argument( + "--max-steps", + type=int, + default=10, + help="Maximum steps for conductor (default: 10)", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose output", + ) + return parser.parse_args() + + +async def main() -> None: + args = _parse_args() + + await run_multi_agent( + task=args.task, + conductor_model=args.conductor, + 
max_steps=args.max_steps, + verbose=args.verbose, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/hud/agents/resolver.py b/hud/agents/resolver.py index 80351800..fb1b1bf3 100644 --- a/hud/agents/resolver.py +++ b/hud/agents/resolver.py @@ -15,6 +15,12 @@ _PROVIDER_TO_AGENT = {"anthropic": "claude"} +def _is_codex_capable_model(model_id: str) -> bool: + """Check if model is Codex-capable (needs OpenAIAgent for native tools).""" + m = (model_id or "").lower() + return m in {"gpt-5.1", "gpt-5.1-codex"} or "codex" in m + + def _fetch_gateway_models() -> list[dict[str, Any]]: """Fetch available models from HUD gateway (cached).""" global _models_cache @@ -59,7 +65,12 @@ def resolve_cls(model: str) -> tuple[type[MCPAgent], dict[str, Any] | None]: # Gateway lookup for m in _fetch_gateway_models(): - if model in (m.get("id"), m.get("name"), m.get("model")): + candidate_ids = (m.get("id"), m.get("name"), m.get("model")) + if model in candidate_ids: + # Only check if the matched model string is codex-capable + if _is_codex_capable_model(model): + return AgentType.OPENAI.cls, m + provider = (m.get("provider") or "openai_compatible").lower() agent_str = _PROVIDER_TO_AGENT.get(provider, provider) try: diff --git a/hud/agents/tests/test_resolver.py b/hud/agents/tests/test_resolver.py index 04e6f51e..c826805b 100644 --- a/hud/agents/tests/test_resolver.py +++ b/hud/agents/tests/test_resolver.py @@ -57,6 +57,44 @@ def test_resolves_gateway_model(self) -> None: assert info is not None assert info["id"] == "gpt-4o" + def test_resolves_codex_model_to_openai_agent_even_if_provider_is_openai_compatible( + self, + ) -> None: + """Codex-capable models use OpenAIAgent for native tools.""" + from hud.agents import OpenAIAgent + + mock_models = [ + {"id": "gpt-5.1-codex", "model": "gpt-5.1-codex", "provider": "openai_compatible"}, + ] + + with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models): + cls, info = resolve_cls("gpt-5.1-codex") + 
assert cls == OpenAIAgent + assert info is not None + assert info["id"] == "gpt-5.1-codex" + + def test_does_not_misroute_claude_when_alias_is_codex_capable(self) -> None: + """Only the matched ID should be checked for codex capability, not aliases.""" + from hud.agents.claude import ClaudeAgent + + # Contrived example: a model entry where one alias is codex-capable + # but the requested ID is not + mock_models = [ + { + "id": "claude-via-gateway", + "name": "gpt-5.1-codex", # Alias happens to be codex-capable + "model": "claude-3-sonnet", + "provider": "anthropic", + }, + ] + + with patch("hud.agents.resolver._fetch_gateway_models", return_value=mock_models): + # Request by the non-codex ID - should route to Claude, not OpenAI + cls, info = resolve_cls("claude-via-gateway") + assert cls == ClaudeAgent + assert info is not None + assert info["id"] == "claude-via-gateway" + def test_resolves_anthropic_provider_to_claude(self) -> None: """Provider 'anthropic' maps to ClaudeAgent.""" from hud.agents.claude import ClaudeAgent