From 38612491af550117270b72ce1d9424de0f03b58d Mon Sep 17 00:00:00 2001 From: Xeophon <46377542+xeophon@users.noreply.github.com> Date: Fri, 22 May 2026 23:59:51 +0200 Subject: [PATCH 1/2] Support RLM skills for v1 tools --- docs/byo-harness.md | 3 +- tests/test_v1_rlm_swe.py | 139 +++++++- verifiers/v1/README.md | 9 +- verifiers/v1/packages/harnesses/rlm.py | 20 +- verifiers/v1/packages/harnesses/rlm_skills.py | 302 ++++++++++++++++++ 5 files changed, 452 insertions(+), 21 deletions(-) create mode 100644 verifiers/v1/packages/harnesses/rlm_skills.py diff --git a/docs/byo-harness.md b/docs/byo-harness.md index c534229a2..89f72866d 100644 --- a/docs/byo-harness.md +++ b/docs/byo-harness.md @@ -485,7 +485,8 @@ prompt. Tasksets can expose package-owned upload directories with `get_upload_dirs()`. The base `Taskset` discovers a sibling `skills/` directory by default, and `RLM` uploads that directory to `/rlm/skills` unless `skills=` is passed -explicitly to the harness. +explicitly to the harness. When RLM uses taskset-discovered skills, it also +stages generated endpoint-backed skills for resolved V1 tools. Use `RLMConfig` in `env.harness` for RLM-specific settings such as `rlm_repo_ref`, `rlm_tools`, `rlm_max_turns`, and `summarize_at_tokens`. diff --git a/tests/test_v1_rlm_swe.py b/tests/test_v1_rlm_swe.py index aff1317a8..c7eda0a6b 100644 --- a/tests/test_v1_rlm_swe.py +++ b/tests/test_v1_rlm_swe.py @@ -1,19 +1,26 @@ +import asyncio +import importlib.util +import json import sys +import threading import types -from collections.abc import Mapping +from collections.abc import Callable, Mapping +from http.server import BaseHTTPRequestHandler, HTTPServer from pathlib import Path +from typing import cast import pytest from datasets import Dataset import verifiers.v1 as vf from environments.rlm_swe_v1 import rlm_swe_v1 +from verifiers.types import Tool from verifiers.v1.utils.program_utils import merge_task_program, merge_task_sandbox def as_mapping(value: object) -> Mapping[str, object]: assert isinstance(value, Mapping) - return value + return cast(Mapping[str, object], value) def test_rlm_harness_builds_sandbox_program_without_eager_checkout(): @@ -23,7 +30,7 @@ def test_rlm_harness_builds_sandbox_program_without_eager_checkout(): program = as_mapping(harness.program) program_env = as_mapping(program["env"]) artifacts = as_mapping(program["artifacts"]) - setup = program["setup"] + setup = cast(list[str], program["setup"]) assert isinstance(harness, vf.Harness) assert program["sandbox"] is not False @@ -31,6 +38,7 @@ def test_rlm_harness_builds_sandbox_program_without_eager_checkout(): assert "apt-get -o Acquire::Retries=3 update" in setup[0] assert "apt-get -o Acquire::Retries=3 install" in setup[0] assert "RLM_MODEL" in program_env + assert program_env["VF_ENDPOINT_ROOT_URL"] == "state.endpoint_root_url" assert "rlm_metrics" in artifacts @@ -68,23 +76,118 @@ def test_rlm_harness_can_upload_skills(tmp_path: Path): assert dirs["/rlm/skills"] == skills -def test_rlm_harness_uploads_taskset_skills_by_default(tmp_path: Path): +def test_rlm_harness_stages_taskset_and_endpoint_tool_skills(tmp_path: Path): skills = tmp_path / "taskset-skills" - skills.mkdir() - (skills / "SKILL.md").write_text("---\nname: taskset\n---\n") + (skills / "static").mkdir(parents=True) + (skills / "static" / "SKILL.md").write_text("---\nname: static\n---\n") + + async def greet(name: str) -> str: + """Greet someone by name. + + Args: + name: The person to greet. + """ + return f"Hello, {name}!" class SkillTaskset(vf.Taskset): def get_upload_dirs(self): return {"skills": skills} + taskset = SkillTaskset( + config=vf.TasksetConfig( + source=[{"task_id": "hello", "question": "Say hi.", "answer": ""}] + ) + ) + taskset.add_toolset(vf.Toolset(tools=[greet])) env = vf.Env( - taskset=SkillTaskset(config=vf.TasksetConfig(source=[])), + taskset=taskset, harness=vf.RLM(config=vf.RLMConfig(local_checkout="/tmp/checkout")), ) + task = next(iter(env.taskset)) + state = vf.State.for_task(task) + asyncio.run(env.harness.setup_state(task, state)) program = as_mapping(env.harness.program) dirs = as_mapping(program["dirs"]) + loader = cast(Callable[[vf.Task, vf.State], Path], dirs["/rlm/skills"]) - assert dirs["/rlm/skills"] == skills + staged = loader(task, state) + + assert (staged / "static" / "SKILL.md").exists() + assert ( + (staged / "greet" / "pyproject.toml") + .read_text() + .startswith('[project]\nname = "rlm-skill-greet"') + ) + skill_source = (staged / "greet" / "src" / "greet" / "greet.py").read_text() + assert ( + 'TOOL_NAME = "greet"' in skill_source or "TOOL_NAME = 'greet'" in skill_source + ) + assert '"User-Agent": "OpenAI/Python"' in skill_source + compile(skill_source, str(staged / "greet" / "src" / "greet" / "greet.py"), "exec") + + +def test_generated_rlm_skill_calls_v1_tool_endpoint(tmp_path: Path, monkeypatch): + tool = Tool( + name="greet", + description="Greet someone by name.", + parameters={ + "type": "object", + "properties": { + "name": {"type": "string", "description": "The person to greet."} + }, + "required": ["name"], + }, + ) + from verifiers.v1.packages.harnesses.rlm_skills import write_tool_skill + + write_tool_skill(tmp_path / "greet", tool, "greet") + received = {} + + class Handler(BaseHTTPRequestHandler): + def do_POST(self): + length = int(self.headers.get("Content-Length", "0")) + received["path"] = self.path + received["authorization"] = self.headers.get("Authorization") + received["user_agent"] = self.headers.get("User-Agent") + received["body"] = json.loads(self.rfile.read(length).decode()) + response = json.dumps({"result": "Hello, Alice!"}).encode() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(response))) + self.end_headers() + self.wfile.write(response) + + def log_message(self, format, *args): + return + + server = HTTPServer(("127.0.0.1", 0), Handler) + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + try: + monkeypatch.setenv( + "VF_ENDPOINT_ROOT_URL", + f"http://127.0.0.1:{server.server_port}/rollout/test", + ) + monkeypatch.setenv("OPENAI_API_KEY", "secret") + skill_path = tmp_path / "greet" / "src" / "greet" / "greet.py" + spec = importlib.util.spec_from_file_location("greet_skill", skill_path) + assert spec is not None + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + + result = asyncio.run(module.greet(name="Alice")) + + assert result == "Hello, Alice!" + assert module.run is module.greet + assert received["path"] == "/rollout/test/vf/tools/greet" + assert received["authorization"] == "Bearer secret" + assert received["user_agent"] == "OpenAI/Python" + assert received["body"] == {"arguments": {"name": "Alice"}} + finally: + server.shutdown() + server.server_close() + thread.join(timeout=5) def test_taskset_discovers_sibling_skills_dir_by_default( @@ -178,7 +281,7 @@ def fake_load_dataset(dataset_name: str, **kwargs: object) -> Dataset: assert task["sandbox"]["timeout_minutes"] == 30 task_program_env = as_mapping(as_mapping(task["program"])["env"]) assert task_program_env["AGENT_WORKDIR"] == "/workspace/repo" - assert "/workspace/repo/.venv/bin" in task_program_env["AGENT_PATH"] + assert "/workspace/repo/.venv/bin" in cast(str, task_program_env["AGENT_PATH"]) assert task_program_env["PAGER"] == "cat" assert task_program_env["CUSTOM"] == "1" assert "CUSTOM" not in program_env @@ -186,7 +289,7 @@ def fake_load_dataset(dataset_name: str, **kwargs: object) -> Dataset: assert program_env["RLM_TOOLS"] == "bash,edit" assert merged_sandbox["workdir"] == "/workspace/repo" assert merged_env["AGENT_WORKDIR"] == "/workspace/repo" - assert "/workspace/repo/.venv/bin" in merged_env["AGENT_PATH"] + assert "/workspace/repo/.venv/bin" in cast(str, merged_env["AGENT_PATH"]) assert merged_env["PAGER"] == "cat" assert merged_env["CUSTOM"] == "1" assert merged_env["CALLER"] == "1" @@ -196,9 +299,13 @@ def test_rlm_swe_taskset_hooks_are_registered_with_runtime(): taskset = rlm_swe_v1.load_taskset(config=rlm_swe_v1.RlmSweTasksetConfig()) env = vf.Env(taskset=taskset) - setup_names = [handler.__name__ for handler in env.harness.runtime.rollout_setup] + setup_names = [ + getattr(handler, "__name__", "") + for handler in env.harness.runtime.rollout_setup + ] cleanup_names = [ - handler.__name__ for handler in env.harness.runtime.rollout_cleanup + getattr(handler, "__name__", "") + for handler in env.harness.runtime.rollout_cleanup ] signal_names = {signal["name"] for signal in env.harness.runtime.rollout_signals} @@ -262,11 +369,15 @@ async def test_rlm_swe_run_tests_quotes_env_values(): ) sandbox = RecordingSandbox() - output = await taskset.run_tests(sandbox, {}, 123) + output = await taskset.run_tests( + cast(rlm_swe_v1.R2ESandbox, sandbox), + cast(vf.MutableConfigMap, {}), + 123, + ) assert output == "test output" assert len(sandbox.background_jobs) == 1 - command = sandbox.background_jobs[0]["command"] + command = cast(str, sandbox.background_jobs[0]["command"]) assert "SAFE='two words; $(echo nope)'" in command assert "QUOTE='it'\"'\"'s ok'" in command assert command.endswith("/bin/bash run_tests.sh > test_output.txt 2>&1") diff --git a/verifiers/v1/README.md b/verifiers/v1/README.md index 64a92d81f..05fb878ad 100644 --- a/verifiers/v1/README.md +++ b/verifiers/v1/README.md @@ -605,13 +605,16 @@ and log/trajectory artifacts. artifacts. `RLM` follows the same boundary for recursive LLM runs: `HarborTaskset` owns the task directory and tests, while `RLM` owns RLM installation, optional skill -upload to `/rlm/skills`, endpoint wiring, and trajectory filtering. +upload to `/rlm/skills`, endpoint-backed tool skills, endpoint wiring, and +trajectory filtering. Use `RLMConfig` in `env.harness` for RLM-specific settings such as `rlm_repo_ref`, `rlm_tools`, `rlm_max_turns`, and `summarize_at_tokens`. Tasksets can expose package-owned upload directories with `get_upload_dirs()`. The base `Taskset` discovers a sibling `skills/` directory by default, and `RLM` uploads that directory to `/rlm/skills` unless `skills=` is passed -explicitly to the harness. +explicitly to the harness. When the harness uses taskset-discovered skills, it +also stages a generated RLM skill for each resolved V1 tool so RLM can call the +same interception endpoint as other V1 harnesses. ## State Helpers @@ -1467,7 +1470,7 @@ mcp = true `program.channels` is deliberately limited to `callable` and `mcp`. Harness-specific tool carriers belong on the harness or taskset contract; for example, RLM reads `Taskset.get_upload_dirs()["skills"]` and uploads it to -`/rlm/skills`. +`/rlm/skills` alongside generated endpoint-backed skills for resolved V1 tools. `program.setup` prepares the process. `program.channels.mcp` registers resolved tool or endpoint config after the interception endpoint is live and before the diff --git a/verifiers/v1/packages/harnesses/rlm.py b/verifiers/v1/packages/harnesses/rlm.py index a7878bdd5..4ce26608c 100644 --- a/verifiers/v1/packages/harnesses/rlm.py +++ b/verifiers/v1/packages/harnesses/rlm.py @@ -24,6 +24,7 @@ RLM_DEFAULT_APPEND_TO_SYSTEM_PROMPT_PATH, RLMConfig, ) +from .rlm_skills import stage_rlm_tool_skills from ...types import ConfigMap, ProgramCommand, ProgramValue DEFAULT_RLM_CHECKOUT_PATH = "/tmp/rlm-checkout" @@ -32,7 +33,8 @@ Path.home() / ".cache" / "verifiers" / "rlm-checkouts" ) REQUIRED_RLM_CHECKOUT_FILES = ("install.sh", "pyproject.toml") -ProgramDir = str | Path | Traversable +SkillSource = str | Path | Traversable +ProgramDir = SkillSource | Callable[..., Path] class RLM(Harness[RLMConfig]): @@ -51,6 +53,7 @@ def __init__(self, config: RLMConfig | None = None): env: dict[str, ProgramValue] = { "PATH": "/root/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "OPENAI_MODEL": "runtime.model", + "VF_ENDPOINT_ROOT_URL": "state.endpoint_root_url", "RLM_MODEL": "runtime.model", "RLM_TOOLS": ",".join(harness_config.rlm_tools), "RLM_MAX_TURNS": str(harness_config.rlm_max_turns), @@ -136,14 +139,25 @@ def attach_taskset(self, taskset: Taskset) -> None: upload_dirs = taskset.get_upload_dirs() if not isinstance(upload_dirs, Mapping): raise TypeError("Taskset.get_upload_dirs() must return a mapping.") - skills = upload_dirs.get("skills") + source = upload_dirs.get("skills") self.set_program_dir( DEFAULT_RLM_SKILLS_PATH, - cast(ProgramDir | None, skills), + self.skill_loader(cast(SkillSource | None, source)), ) super().attach_taskset(taskset) self._program = self.compile_program(self.program) + def skill_loader(self, source: SkillSource | None) -> Callable[..., Path]: + def load(task: Task, state: State) -> Path: + return stage_rlm_tool_skills( + task, + state, + self.runtime, + source=source, + ) + + return load + def set_program_dir( self, remote_path: str, local_source: ProgramDir | None ) -> None: diff --git a/verifiers/v1/packages/harnesses/rlm_skills.py b/verifiers/v1/packages/harnesses/rlm_skills.py new file mode 100644 index 000000000..09dc6c863 --- /dev/null +++ b/verifiers/v1/packages/harnesses/rlm_skills.py @@ -0,0 +1,302 @@ +"""Endpoint-backed RLM skill generation for V1 tools.""" + +import hashlib +import keyword +import re +import shutil +from collections.abc import Mapping +from importlib import resources +from importlib.abc import Traversable +from pathlib import Path +from typing import TypedDict, cast + +from verifiers.types import Tool + +from ...runtime import Runtime +from ...state import State +from ...task import Task +from ...types import ConfigMap + +RLM_SKILLS_CACHE_ROOT = Path.home() / ".cache" / "verifiers" / "rlm-skills" + + +class SkillParam(TypedDict): + name: str + annotation: str + description: str + required: bool + default_literal: str | None + + +def stage_rlm_tool_skills( + task: Task, + state: State, + runtime: Runtime, + *, + source: Path | Traversable | str | None = None, + cache_root: Path = RLM_SKILLS_CACHE_ROOT, +) -> Path: + task_id = str(task.get("task_id") or task.get("task_name") or "task") + key = str(state.get("trajectory_id") or id(state)).replace("/", "_") + target = cache_root / task_id / key + if target.exists(): + shutil.rmtree(target) + target.mkdir(parents=True) + if source is not None: + copy_skill_source(source, target) + existing = {path.name for path in target.iterdir() if path.is_dir()} + for tool in runtime.tool_defs(state) or []: + skill_name = rlm_skill_name(tool.name) + if skill_name not in existing: + write_tool_skill(target / skill_name, tool, skill_name) + return target + + +def copy_skill_source(source: Path | Traversable | str, target: Path) -> None: + if isinstance(source, str): + source = Path(source) + with resources.as_file(source) as path: + shutil.copytree(path, target, dirs_exist_ok=True) + + +def rlm_skill_name(tool_name: str) -> str: + name = re.sub(r"\W", "_", tool_name) + if not name or name[0].isdigit(): + name = f"tool_{name}" + if keyword.iskeyword(name): + name = f"{name}_tool" + if name == tool_name: + return name + digest = hashlib.sha1(tool_name.encode()).hexdigest()[:8] + return f"{name}_{digest}" + + +def write_tool_skill(skill_dir: Path, tool: Tool, skill_name: str) -> None: + src_dir = skill_dir / "src" / skill_name + src_dir.mkdir(parents=True, exist_ok=True) + params = params_from_schema(tool.parameters) + summary = tool.description + arg_descs = {param["name"]: param["description"] for param in params} + (src_dir / "__init__.py").write_text(_INIT_MODULE.format(skill_name=skill_name)) + (src_dir / f"{skill_name}.py").write_text( + build_skill_module(tool.name, skill_name, params, summary, arg_descs) + ) + (skill_dir / "pyproject.toml").write_text( + _PYPROJECT.format( + skill_name=skill_name, skill_dash=skill_name.replace("_", "-") + ) + ) + (skill_dir / "SKILL.md").write_text( + skill_markdown(tool.name, skill_name, params, summary, arg_descs) + ) + + +def params_from_schema(schema: ConfigMap) -> list[SkillParam]: + properties = schema.get("properties") if isinstance(schema, Mapping) else {} + properties = properties if isinstance(properties, Mapping) else {} + raw_required = schema.get("required") if isinstance(schema, Mapping) else [] + required = ( + {name for name in raw_required if isinstance(name, str)} + if isinstance(raw_required, list) + else set() + ) + params: list[SkillParam] = [] + for name, value in properties.items(): + if ( + not isinstance(name, str) + or not name.isidentifier() + or keyword.iskeyword(name) + ): + continue + field_schema = cast(ConfigMap, value) if isinstance(value, Mapping) else {} + annotation = annotation_from_schema(field_schema) + is_required = name in required + params.append( + { + "name": name, + "annotation": annotation if is_required else f"{annotation} | None", + "description": str(field_schema.get("description") or ""), + "required": is_required, + "default_literal": None if is_required else "None", + } + ) + return params + + +def annotation_from_schema(schema: ConfigMap) -> str: + value = schema.get("type") + if isinstance(value, list): + value = next((item for item in value if item != "null"), None) + if value == "integer": + return "int" + if value == "number": + return "float" + if value == "boolean": + return "bool" + if value == "array": + items = schema.get("items") + if isinstance(items, Mapping): + return f"list[{annotation_from_schema(cast(ConfigMap, items))}]" + return "list[str]" + if value == "object": + return "dict" + return "str" + + +def build_skill_module( + tool_name: str, + skill_name: str, + params: list[SkillParam], + summary: str, + arg_descs: dict[str, str], +) -> str: + signature_parts = [] + for param in params: + part = f"{param['name']}: {param['annotation']}" + if not param["required"]: + part += f" = {param['default_literal']}" + signature_parts.append(part) + arguments = ( + "{" + ", ".join(f"{param['name']!r}: {param['name']}" for param in params) + "}" + ) + docstring = skill_docstring(tool_name, params, summary, arg_descs) + return _SKILL_MODULE.format( + tool_name=tool_name, + skill_name=skill_name, + signature=", ".join(signature_parts), + arguments=arguments, + docstring=repr(docstring), + ) + + +def skill_docstring( + tool_name: str, + params: list[SkillParam], + summary: str, + arg_descs: dict[str, str], +) -> str: + lines = [summary or f"Call the {tool_name} V1 tool."] + if arg_descs: + lines.extend(["", "Args:"]) + for param in params: + desc = arg_descs.get(param["name"], "") + lines.append(f" {param['name']}: {desc}".rstrip()) + return "\n".join(lines).strip() + + +def skill_markdown( + tool_name: str, + skill_name: str, + params: list[SkillParam], + summary: str, + arg_descs: dict[str, str], +) -> str: + lines = [f"# {skill_name}", ""] + if summary: + lines.extend([summary, ""]) + if skill_name != tool_name: + lines.extend([f"Calls the V1 tool `{tool_name}`.", ""]) + lines.append("Parameters:") + for param in params: + optional = "" if param["required"] else " (optional)" + desc = arg_descs.get(param["name"], "") + lines.append( + f"- `{param['name']}` ({param['annotation']}){optional}: {desc}".rstrip( + ": " + ) + ) + lines.extend( + ["", "From IPython:", "```python", f"await {skill_name}(...)", "```", ""] + ) + return "\n".join(lines) + + +_PYPROJECT = """\ +[project] +name = "rlm-skill-{skill_dash}" +version = "0.1.0" +requires-python = ">=3.10" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/{skill_name}"] +""" + + +_INIT_MODULE = '''\ +"""Auto-generated RLM skill: {skill_name}.""" + +from .{skill_name} import {skill_name}, run + +__all__ = ["{skill_name}", "run"] +''' + + +_SKILL_MODULE = '''\ +"""Auto-generated RLM skill: {skill_name}.""" + +import json +import os +import urllib.error +import urllib.request + +TOOL_NAME = {tool_name!r} + + +def _endpoint_root() -> str: + root = ( + os.environ.get("VF_ENDPOINT_ROOT_URL") + or os.environ.get("ANTHROPIC_BASE_URL") + ) + if root: + return root.rstrip("/") + base = os.environ.get("OPENAI_BASE_URL", "").rstrip("/") + if base.endswith("/v1"): + return base[:-3] + return base + + +def _call_endpoint(arguments): + root = _endpoint_root() + if not root: + raise RuntimeError("VF endpoint URL is not configured") + payload = json.dumps({{"arguments": arguments}}).encode() + token = ( + os.environ.get("VF_ENDPOINT_API_KEY") + or os.environ.get("OPENAI_API_KEY") + or os.environ.get("ANTHROPIC_API_KEY") + or "intercepted" + ) + request = urllib.request.Request( + f"{{root}}/vf/tools/{{TOOL_NAME}}", + data=payload, + headers={{ + "Authorization": f"Bearer {{token}}", + "Content-Type": "application/json", + "User-Agent": "OpenAI/Python", + }}, + method="POST", + ) + try: + with urllib.request.urlopen(request, timeout=300) as response: + result = json.loads(response.read().decode()) + except urllib.error.HTTPError as error: + body = error.read().decode(errors="replace") + raise RuntimeError(body or str(error)) from error + if "error" in result: + raise RuntimeError(str(result["error"])) + return result.get("result") + + +async def {skill_name}({signature}): + {docstring} + return _call_endpoint( + {{key: value for key, value in {arguments}.items() if value is not None}} + ) + + +run = {skill_name} +''' From e69611ec7d5027f3616f67789c480a9c2e00c9a8 Mon Sep 17 00:00:00 2001 From: Xeophon <46377542+xeophon@users.noreply.github.com> Date: Sat, 23 May 2026 00:17:07 +0200 Subject: [PATCH 2/2] Fix RLM skill staging edge cases --- tests/test_v1_rlm_swe.py | 52 +++++++++++++++++++ verifiers/v1/packages/harnesses/rlm_skills.py | 24 +++++++-- 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/tests/test_v1_rlm_swe.py b/tests/test_v1_rlm_swe.py index c7eda0a6b..e92c0d5c2 100644 --- a/tests/test_v1_rlm_swe.py +++ b/tests/test_v1_rlm_swe.py @@ -126,6 +126,36 @@ def get_upload_dirs(self): compile(skill_source, str(staged / "greet" / "src" / "greet" / "greet.py"), "exec") +def test_rlm_skill_staging_sanitizes_cache_paths(tmp_path: Path): + async def greet(name: str) -> str: + return f"Hello, {name}!" + + taskset = vf.Taskset( + config=vf.TasksetConfig( + source=[{"task_id": "../outside/group", "question": "Say hi."}] + ) + ) + taskset.add_toolset(vf.Toolset(tools=[greet])) + env = vf.Env( + taskset=taskset, + harness=vf.RLM(config=vf.RLMConfig(local_checkout="/tmp/checkout")), + ) + task = next(iter(env.taskset)) + state = vf.State.for_task(task) + asyncio.run(env.harness.setup_state(task, state)) + + from verifiers.v1.packages.harnesses.rlm_skills import stage_rlm_tool_skills + + cache_root = tmp_path / "cache" + staged = stage_rlm_tool_skills( + task, state, env.harness.runtime, cache_root=cache_root + ) + + assert staged.is_relative_to(cache_root) + assert not (tmp_path / "outside").exists() + assert (staged / "greet" / "SKILL.md").exists() + + def test_generated_rlm_skill_calls_v1_tool_endpoint(tmp_path: Path, monkeypatch): tool = Tool( name="greet", @@ -190,6 +220,28 @@ def log_message(self, format, *args): thread.join(timeout=5) +def test_generated_rlm_skill_orders_required_params_first(tmp_path: Path): + tool = Tool( + name="mix", + description="Mix required and optional args.", + parameters={ + "type": "object", + "properties": { + "optional": {"type": "string"}, + "required": {"type": "string"}, + }, + "required": ["required"], + }, + ) + from verifiers.v1.packages.harnesses.rlm_skills import write_tool_skill + + write_tool_skill(tmp_path / "mix", tool, "mix") + source = (tmp_path / "mix" / "src" / "mix" / "mix.py").read_text() + + assert "async def mix(required: str, optional: str | None = None):" in source + compile(source, str(tmp_path / "mix" / "src" / "mix" / "mix.py"), "exec") + + def test_taskset_discovers_sibling_skills_dir_by_default( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: diff --git a/verifiers/v1/packages/harnesses/rlm_skills.py b/verifiers/v1/packages/harnesses/rlm_skills.py index 09dc6c863..774d6b07c 100644 --- a/verifiers/v1/packages/harnesses/rlm_skills.py +++ b/verifiers/v1/packages/harnesses/rlm_skills.py @@ -17,7 +17,7 @@ from ...task import Task from ...types import ConfigMap -RLM_SKILLS_CACHE_ROOT = Path.home() / ".cache" / "verifiers" / "rlm-skills" +RLM_SKILLS_CACHE_ROOT: Path | None = None class SkillParam(TypedDict): @@ -34,10 +34,14 @@ def stage_rlm_tool_skills( runtime: Runtime, *, source: Path | Traversable | str | None = None, - cache_root: Path = RLM_SKILLS_CACHE_ROOT, + cache_root: Path | None = None, ) -> Path: - task_id = str(task.get("task_id") or task.get("task_name") or "task") - key = str(state.get("trajectory_id") or id(state)).replace("/", "_") + if cache_root is None: + cache_root = ( + RLM_SKILLS_CACHE_ROOT or Path.home() / ".cache" / "verifiers" / "rlm-skills" + ) + task_id = cache_path_segment(task.get("task_id") or task.get("task_name") or "task") + key = cache_path_segment(state.get("trajectory_id") or id(state)) target = cache_root / task_id / key if target.exists(): shutil.rmtree(target) @@ -52,6 +56,16 @@ def stage_rlm_tool_skills( return target +def cache_path_segment(value: object) -> str: + raw = str(value) + name = re.sub(r"[^A-Za-z0-9_.-]+", "_", raw).strip("._-") + name = name or "item" + if name == raw: + return name + digest = hashlib.sha1(raw.encode()).hexdigest()[:8] + return f"{name}_{digest}" + + def copy_skill_source(source: Path | Traversable | str, target: Path) -> None: if isinstance(source, str): source = Path(source) @@ -120,7 +134,7 @@ def params_from_schema(schema: ConfigMap) -> list[SkillParam]: "default_literal": None if is_required else "None", } ) - return params + return sorted(params, key=lambda param: not param["required"]) def annotation_from_schema(schema: ConfigMap) -> str: