Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/byo-harness.md
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,8 @@ prompt.
Tasksets can expose package-owned upload directories with `get_upload_dirs()`.
The base `Taskset` discovers a sibling `skills/` directory by default, and
`RLM` uploads that directory to `/rlm/skills` unless `skills=` is passed
explicitly to the harness.
explicitly to the harness. When RLM uses taskset-discovered skills, it also
stages generated endpoint-backed skills for resolved V1 tools.
Use `RLMConfig` in `env.harness` for RLM-specific settings such as
`rlm_repo_ref`, `rlm_tools`, `rlm_max_turns`, and `summarize_at_tokens`.

Expand Down
191 changes: 177 additions & 14 deletions tests/test_v1_rlm_swe.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
import asyncio
import importlib.util
import json
import sys
import threading
import types
from collections.abc import Mapping
from collections.abc import Callable, Mapping
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
from typing import cast

import pytest
from datasets import Dataset

import verifiers.v1 as vf
from environments.rlm_swe_v1 import rlm_swe_v1
from verifiers.types import Tool
from verifiers.v1.utils.program_utils import merge_task_program, merge_task_sandbox


def as_mapping(value: object) -> Mapping[str, object]:
    """Return ``value`` typed as a string-keyed mapping.

    Raises ``AssertionError`` when ``value`` is not a ``Mapping``. The
    ``cast`` only narrows the static type; at runtime the same object is
    returned unchanged.
    """
    assert isinstance(value, Mapping)
    return cast(Mapping[str, object], value)


def test_rlm_harness_builds_sandbox_program_without_eager_checkout():
Expand All @@ -23,14 +30,15 @@ def test_rlm_harness_builds_sandbox_program_without_eager_checkout():
program = as_mapping(harness.program)
program_env = as_mapping(program["env"])
artifacts = as_mapping(program["artifacts"])
setup = program["setup"]
setup = cast(list[str], program["setup"])

assert isinstance(harness, vf.Harness)
assert program["sandbox"] is not False
assert isinstance(setup, list)
assert "apt-get -o Acquire::Retries=3 update" in setup[0]
assert "apt-get -o Acquire::Retries=3 install" in setup[0]
assert "RLM_MODEL" in program_env
assert program_env["VF_ENDPOINT_ROOT_URL"] == "state.endpoint_root_url"
assert "rlm_metrics" in artifacts


Expand Down Expand Up @@ -68,23 +76,170 @@ def test_rlm_harness_can_upload_skills(tmp_path: Path):
assert dirs["/rlm/skills"] == skills


def test_rlm_harness_stages_taskset_and_endpoint_tool_skills(tmp_path: Path):
    """The ``/rlm/skills`` program dir is a loader that stages both skill kinds.

    Builds a taskset that exposes a ``static`` skill directory and a ``greet``
    tool, then calls the loader registered under ``/rlm/skills`` and checks
    that the staged directory contains the static skill plus a generated
    ``greet`` skill package whose source compiles.
    """
    skills = tmp_path / "taskset-skills"
    (skills / "static").mkdir(parents=True)
    (skills / "static" / "SKILL.md").write_text("---\nname: static\n---\n")

    async def greet(name: str) -> str:
        """Greet someone by name.

        Args:
            name: The person to greet.
        """
        return f"Hello, {name}!"

    class SkillTaskset(vf.Taskset):
        def get_upload_dirs(self):
            return {"skills": skills}

    taskset = SkillTaskset(
        config=vf.TasksetConfig(
            source=[{"task_id": "hello", "question": "Say hi.", "answer": ""}]
        )
    )
    taskset.add_toolset(vf.Toolset(tools=[greet]))
    env = vf.Env(
        taskset=taskset,
        harness=vf.RLM(config=vf.RLMConfig(local_checkout="/tmp/checkout")),
    )
    task = next(iter(env.taskset))
    state = vf.State.for_task(task)
    asyncio.run(env.harness.setup_state(task, state))
    program = as_mapping(env.harness.program)
    dirs = as_mapping(program["dirs"])
    # The program dir entry is a callable, not a static path: it is invoked
    # with the task and state to produce the staged directory.
    loader = cast(Callable[[vf.Task, vf.State], Path], dirs["/rlm/skills"])

    staged = loader(task, state)

    assert (staged / "static" / "SKILL.md").exists()
    assert (
        (staged / "greet" / "pyproject.toml")
        .read_text()
        .startswith('[project]\nname = "rlm-skill-greet"')
    )
    skill_source = (staged / "greet" / "src" / "greet" / "greet.py").read_text()
    # Accept either quoting style for the generated constant.
    assert (
        'TOOL_NAME = "greet"' in skill_source or "TOOL_NAME = 'greet'" in skill_source
    )
    assert '"User-Agent": "OpenAI/Python"' in skill_source
    compile(skill_source, str(staged / "greet" / "src" / "greet" / "greet.py"), "exec")


def test_rlm_skill_staging_sanitizes_cache_paths(tmp_path: Path):
    """A task id containing ``..`` must not stage skills outside the cache root.

    Uses the task id ``../outside/group`` and checks that the staged
    directory stays under ``cache_root``, that no ``outside`` directory
    appears next to it, and that the ``greet`` skill was still generated.
    """

    async def greet(name: str) -> str:
        return f"Hello, {name}!"

    taskset = vf.Taskset(
        config=vf.TasksetConfig(
            source=[{"task_id": "../outside/group", "question": "Say hi."}]
        )
    )
    taskset.add_toolset(vf.Toolset(tools=[greet]))
    env = vf.Env(
        taskset=taskset,
        harness=vf.RLM(config=vf.RLMConfig(local_checkout="/tmp/checkout")),
    )
    task = next(iter(env.taskset))
    state = vf.State.for_task(task)
    asyncio.run(env.harness.setup_state(task, state))

    from verifiers.v1.packages.harnesses.rlm_skills import stage_rlm_tool_skills

    cache_root = tmp_path / "cache"
    staged = stage_rlm_tool_skills(
        task, state, env.harness.runtime, cache_root=cache_root
    )

    # ``Path.is_relative_to`` compares path strings and does not collapse
    # ``..`` segments, so ``cache/../outside`` would pass an unresolved check.
    # Resolve both sides so the assertion reflects the real location.
    assert staged.resolve().is_relative_to(cache_root.resolve())
    assert not (tmp_path / "outside").exists()
    assert (staged / "greet" / "SKILL.md").exists()


def test_generated_rlm_skill_calls_v1_tool_endpoint(tmp_path: Path, monkeypatch):
    """The generated skill module POSTs its arguments to the V1 tool endpoint.

    Writes a skill for a ``greet`` tool, serves a stub HTTP endpoint on a
    loopback port, imports the generated module from disk, and checks the
    request path, headers, and JSON body the skill sends as well as the value
    it returns.
    """
    greet_tool = Tool(
        name="greet",
        description="Greet someone by name.",
        parameters={
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "The person to greet."}
            },
            "required": ["name"],
        },
    )
    from verifiers.v1.packages.harnesses.rlm_skills import write_tool_skill

    skill_root = tmp_path / "greet"
    write_tool_skill(skill_root, greet_tool, "greet")
    seen = {}

    class RecordingHandler(BaseHTTPRequestHandler):
        """Record the incoming POST and reply with a fixed JSON result."""

        def do_POST(self):
            headers = self.headers
            body_size = int(headers.get("Content-Length", "0"))
            seen["path"] = self.path
            seen["authorization"] = headers.get("Authorization")
            seen["user_agent"] = headers.get("User-Agent")
            seen["body"] = json.loads(self.rfile.read(body_size).decode())
            payload = json.dumps({"result": "Hello, Alice!"}).encode()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)

        def log_message(self, format, *args):
            # Suppress the default per-request stderr logging.
            return None

    # Port 0 lets the OS pick a free port; it is read back via server_port.
    server = HTTPServer(("127.0.0.1", 0), RecordingHandler)
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    try:
        endpoint_root = f"http://127.0.0.1:{server.server_port}/rollout/test"
        monkeypatch.setenv("VF_ENDPOINT_ROOT_URL", endpoint_root)
        monkeypatch.setenv("OPENAI_API_KEY", "secret")
        module_path = skill_root / "src" / "greet" / "greet.py"
        spec = importlib.util.spec_from_file_location("greet_skill", module_path)
        assert spec is not None
        skill_module = importlib.util.module_from_spec(spec)
        assert spec.loader is not None
        spec.loader.exec_module(skill_module)

        greeting = asyncio.run(skill_module.greet(name="Alice"))

        assert greeting == "Hello, Alice!"
        assert skill_module.run is skill_module.greet
        assert seen["path"] == "/rollout/test/vf/tools/greet"
        assert seen["authorization"] == "Bearer secret"
        assert seen["user_agent"] == "OpenAI/Python"
        assert seen["body"] == {"arguments": {"name": "Alice"}}
    finally:
        # Always stop the server so the background thread can exit.
        server.shutdown()
        server.server_close()
        worker.join(timeout=5)


def test_generated_rlm_skill_orders_required_params_first(tmp_path: Path):
    """The generated signature lists required parameters before optional ones.

    The tool schema declares ``optional`` ahead of ``required``; the generated
    ``mix`` function is expected to take ``required`` first, and the emitted
    source must still compile.
    """
    schema = {
        "type": "object",
        "properties": {
            "optional": {"type": "string"},
            "required": {"type": "string"},
        },
        "required": ["required"],
    }
    mix_tool = Tool(
        name="mix",
        description="Mix required and optional args.",
        parameters=schema,
    )
    from verifiers.v1.packages.harnesses.rlm_skills import write_tool_skill

    package_dir = tmp_path / "mix"
    write_tool_skill(package_dir, mix_tool, "mix")
    generated = (package_dir / "src" / "mix" / "mix.py").read_text()

    assert "async def mix(required: str, optional: str | None = None):" in generated
    compile(generated, str(package_dir / "src" / "mix" / "mix.py"), "exec")


def test_taskset_discovers_sibling_skills_dir_by_default(
Expand Down Expand Up @@ -178,15 +333,15 @@ def fake_load_dataset(dataset_name: str, **kwargs: object) -> Dataset:
assert task["sandbox"]["timeout_minutes"] == 30
task_program_env = as_mapping(as_mapping(task["program"])["env"])
assert task_program_env["AGENT_WORKDIR"] == "/workspace/repo"
assert "/workspace/repo/.venv/bin" in task_program_env["AGENT_PATH"]
assert "/workspace/repo/.venv/bin" in cast(str, task_program_env["AGENT_PATH"])
assert task_program_env["PAGER"] == "cat"
assert task_program_env["CUSTOM"] == "1"
assert "CUSTOM" not in program_env
assert program_env["CALLER"] == "1"
assert program_env["RLM_TOOLS"] == "bash,edit"
assert merged_sandbox["workdir"] == "/workspace/repo"
assert merged_env["AGENT_WORKDIR"] == "/workspace/repo"
assert "/workspace/repo/.venv/bin" in merged_env["AGENT_PATH"]
assert "/workspace/repo/.venv/bin" in cast(str, merged_env["AGENT_PATH"])
assert merged_env["PAGER"] == "cat"
assert merged_env["CUSTOM"] == "1"
assert merged_env["CALLER"] == "1"
Expand All @@ -196,9 +351,13 @@ def test_rlm_swe_taskset_hooks_are_registered_with_runtime():
taskset = rlm_swe_v1.load_taskset(config=rlm_swe_v1.RlmSweTasksetConfig())
env = vf.Env(taskset=taskset)

setup_names = [handler.__name__ for handler in env.harness.runtime.rollout_setup]
setup_names = [
getattr(handler, "__name__", "")
for handler in env.harness.runtime.rollout_setup
]
cleanup_names = [
handler.__name__ for handler in env.harness.runtime.rollout_cleanup
getattr(handler, "__name__", "")
for handler in env.harness.runtime.rollout_cleanup
]
signal_names = {signal["name"] for signal in env.harness.runtime.rollout_signals}

Expand Down Expand Up @@ -262,11 +421,15 @@ async def test_rlm_swe_run_tests_quotes_env_values():
)
sandbox = RecordingSandbox()

output = await taskset.run_tests(sandbox, {}, 123)
output = await taskset.run_tests(
cast(rlm_swe_v1.R2ESandbox, sandbox),
cast(vf.MutableConfigMap, {}),
123,
)

assert output == "test output"
assert len(sandbox.background_jobs) == 1
command = sandbox.background_jobs[0]["command"]
command = cast(str, sandbox.background_jobs[0]["command"])
assert "SAFE='two words; $(echo nope)'" in command
assert "QUOTE='it'\"'\"'s ok'" in command
assert command.endswith("/bin/bash run_tests.sh > test_output.txt 2>&1")
Expand Down
9 changes: 6 additions & 3 deletions verifiers/v1/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -605,13 +605,16 @@ and log/trajectory artifacts.
artifacts.
`RLM` follows the same boundary for recursive LLM runs: `HarborTaskset` owns
the task directory and tests, while `RLM` owns RLM installation, optional skill
upload to `/rlm/skills`, endpoint wiring, and trajectory filtering.
upload to `/rlm/skills`, endpoint-backed tool skills, endpoint wiring, and
trajectory filtering.
Use `RLMConfig` in `env.harness` for RLM-specific settings such as
`rlm_repo_ref`, `rlm_tools`, `rlm_max_turns`, and `summarize_at_tokens`.
Tasksets can expose package-owned upload directories with `get_upload_dirs()`.
The base `Taskset` discovers a sibling `skills/` directory by default, and
`RLM` uploads that directory to `/rlm/skills` unless `skills=` is passed
explicitly to the harness.
explicitly to the harness. When the harness uses taskset-discovered skills, it
also stages a generated RLM skill for each resolved V1 tool so RLM can call the
same interception endpoint as other V1 harnesses.

## State Helpers

Expand Down Expand Up @@ -1467,7 +1470,7 @@ mcp = true
`program.channels` is deliberately limited to `callable` and `mcp`.
Harness-specific tool carriers belong on the harness or taskset contract; for
example, RLM reads `Taskset.get_upload_dirs()["skills"]` and uploads it to
`/rlm/skills`.
`/rlm/skills` alongside generated endpoint-backed skills for resolved V1 tools.

`program.setup` prepares the process. `program.channels.mcp` registers resolved
tool or endpoint config after the interception endpoint is live and before the
Expand Down
20 changes: 17 additions & 3 deletions verifiers/v1/packages/harnesses/rlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
RLM_DEFAULT_APPEND_TO_SYSTEM_PROMPT_PATH,
RLMConfig,
)
from .rlm_skills import stage_rlm_tool_skills
from ...types import ConfigMap, ProgramCommand, ProgramValue

DEFAULT_RLM_CHECKOUT_PATH = "/tmp/rlm-checkout"
Expand All @@ -32,7 +33,8 @@
Path.home() / ".cache" / "verifiers" / "rlm-checkouts"
)
REQUIRED_RLM_CHECKOUT_FILES = ("install.sh", "pyproject.toml")
ProgramDir = str | Path | Traversable
SkillSource = str | Path | Traversable
ProgramDir = SkillSource | Callable[..., Path]


class RLM(Harness[RLMConfig]):
Expand All @@ -51,6 +53,7 @@ def __init__(self, config: RLMConfig | None = None):
env: dict[str, ProgramValue] = {
"PATH": "/root/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"OPENAI_MODEL": "runtime.model",
"VF_ENDPOINT_ROOT_URL": "state.endpoint_root_url",
"RLM_MODEL": "runtime.model",
"RLM_TOOLS": ",".join(harness_config.rlm_tools),
"RLM_MAX_TURNS": str(harness_config.rlm_max_turns),
Expand Down Expand Up @@ -136,14 +139,25 @@ def attach_taskset(self, taskset: Taskset) -> None:
upload_dirs = taskset.get_upload_dirs()
if not isinstance(upload_dirs, Mapping):
raise TypeError("Taskset.get_upload_dirs() must return a mapping.")
skills = upload_dirs.get("skills")
source = upload_dirs.get("skills")
self.set_program_dir(
DEFAULT_RLM_SKILLS_PATH,
cast(ProgramDir | None, skills),
self.skill_loader(cast(SkillSource | None, source)),
)
super().attach_taskset(taskset)
self._program = self.compile_program(self.program)

def skill_loader(self, source: SkillSource | None) -> Callable[..., Path]:
    """Build the per-rollout loader for the RLM skills program directory.

    The returned callable takes a task and its state and delegates to
    ``stage_rlm_tool_skills``, passing this harness's ``runtime`` and the
    given ``source``. ``self.runtime`` is read when the loader is called,
    not when it is created.
    """

    def load(task: Task, state: State) -> Path:
        runtime = self.runtime
        return stage_rlm_tool_skills(task, state, runtime, source=source)

    return load

def set_program_dir(
self, remote_path: str, local_source: ProgramDir | None
) -> None:
Expand Down
Loading
Loading