Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions benchmarks/commit0/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from openhands.sdk import Agent, Conversation, Tool, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
Expand Down Expand Up @@ -190,7 +190,7 @@ def prepare_workspace(
custom_tag = extract_custom_tag(base_docker_image)
suffix = f"-{build_target}" if build_target != "binary" else ""
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
workspace = create_docker_workspace(
agent_server_image=agent_server_image,
Expand All @@ -205,11 +205,10 @@ def prepare_workspace(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
custom_tag = extract_custom_tag(base_docker_image)
suffix = f"-{build_target}" if build_target != "binary" else ""
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)

if not remote_image_exists(agent_server_image):
Expand All @@ -220,7 +219,7 @@ def prepare_workspace(

logger.info(
f"Using remote workspace with image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
workspace = APIRemoteWorkspace(
Expand Down
9 changes: 4 additions & 5 deletions benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
from benchmarks.utils.version import SDK_SHORT_SHA
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from openhands.sdk import (
Agent,
Conversation,
Expand Down Expand Up @@ -157,7 +157,7 @@ def prepare_workspace(

if self.metadata.workspace_type == "docker":
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-gaia-binary"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
)
workspace = create_docker_workspace(
agent_server_image=agent_server_image,
Expand All @@ -177,9 +177,8 @@ def prepare_workspace(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
)

if not remote_image_exists(agent_server_image):
Expand All @@ -190,7 +189,7 @@ def prepare_workspace(

logger.info(
f"Using remote workspace with GAIA image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
workspace = APIRemoteWorkspace(
Expand Down
9 changes: 4 additions & 5 deletions benchmarks/multiswebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from openhands.sdk import Agent, Conversation, Tool, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
Expand Down Expand Up @@ -209,7 +209,7 @@ def prepare_workspace(

if self.metadata.workspace_type == "docker":
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
ensure_local_image(
agent_server_image=agent_server_image,
Expand All @@ -224,14 +224,13 @@ def prepare_workspace(
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
if not runtime_api_key:
raise ValueError(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
if not remote_image_exists(agent_server_image):
raise RuntimeError(
Expand All @@ -240,7 +239,7 @@ def prepare_workspace(
)
logger.info(
f"Using remote workspace with image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
workspace = APIRemoteWorkspace(
Expand Down
9 changes: 4 additions & 5 deletions benchmarks/swebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
EvalOutput,
ToolPresetType,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from openhands.sdk import Agent, Conversation, Tool, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
Expand Down Expand Up @@ -153,7 +153,7 @@ def prepare_workspace(
f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else ""
)
base_agent_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
wrap_needed = should_wrap_instance_id(instance.id)
agent_server_image = base_agent_image
Expand Down Expand Up @@ -185,14 +185,13 @@ def prepare_workspace(
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
if not runtime_api_key:
raise ValueError(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
if not remote_image_exists(agent_server_image):
raise RuntimeError(
Expand All @@ -201,7 +200,7 @@ def prepare_workspace(
)
logger.info(
f"Using remote workspace with image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
)
startup_timeout = float(
os.getenv(
Expand Down
9 changes: 4 additions & 5 deletions benchmarks/swebenchmultimodal/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from openhands.sdk import (
Agent,
Conversation,
Expand Down Expand Up @@ -162,7 +162,7 @@ def prepare_workspace(

if self.metadata.workspace_type == "docker":
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
ensure_local_image(
agent_server_image=agent_server_image,
Expand All @@ -177,14 +177,13 @@ def prepare_workspace(
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
if not runtime_api_key:
raise ValueError(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
if not remote_image_exists(agent_server_image):
raise RuntimeError(
Expand All @@ -193,7 +192,7 @@ def prepare_workspace(
)
logger.info(
f"Using remote workspace with image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
workspace = APIRemoteWorkspace(
Expand Down
19 changes: 7 additions & 12 deletions benchmarks/swefficiency/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from openhands.sdk import LLM, Agent, Conversation, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.preset.default import get_default_tools
Expand Down Expand Up @@ -200,7 +200,7 @@ def prepare_workspace(
# Build agent server image tag
suffix = f"-{build_target}" if build_target != "binary" else ""
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)

logger.info(f"Base image: {base_docker_image}")
Expand Down Expand Up @@ -237,33 +237,28 @@ def prepare_workspace(

elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
if not runtime_api_key:
raise ValueError(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

# For remote, use SDK_SHORT_SHA from env if available
remote_agent_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
)
if not remote_image_exists(remote_agent_image):
if not remote_image_exists(agent_server_image):
raise RuntimeError(
f"Agent server image {remote_agent_image} does not exist in container registry, "
f"Agent server image {agent_server_image} does not exist in container registry, "
"make sure to build, push it, and make it public accessible before using remote workspace."
)

logger.info(
f"Using remote workspace with image {remote_agent_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
f"Using remote workspace with image {agent_server_image} "
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
)

workspace = APIRemoteWorkspace(
runtime_api_url=os.getenv(
"RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
),
runtime_api_key=runtime_api_key,
server_image=remote_agent_image,
server_image=agent_server_image,
target_type="source",
forward_env=forward_env or [],
resource_factor=resource_factor,
Expand Down
9 changes: 4 additions & 5 deletions benchmarks/swtbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from openhands.agent_server.docker.build import _base_slug
from openhands.sdk import Agent, Conversation, Tool, __version__, get_logger
from openhands.sdk.workspace import RemoteWorkspace
Expand Down Expand Up @@ -168,7 +168,7 @@ def prepare_workspace(

if self.metadata.workspace_type == "docker":
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
workspace = create_docker_workspace(
agent_server_image=agent_server_image,
Expand All @@ -178,14 +178,13 @@ def prepare_workspace(
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
if not runtime_api_key:
raise ValueError(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
if not remote_image_exists(agent_server_image):
raise RuntimeError(
Expand All @@ -194,7 +193,7 @@ def prepare_workspace(
)
logger.info(
f"Using remote workspace with image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
workspace = APIRemoteWorkspace(
Expand Down
9 changes: 8 additions & 1 deletion benchmarks/utils/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import os


OUTPUT_FILENAME = "output.jsonl"
EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server"

# Image name for agent server (can be overridden via env var)
EVAL_AGENT_SERVER_IMAGE = os.getenv(
"OPENHANDS_EVAL_AGENT_SERVER_IMAGE", "ghcr.io/openhands/eval-agent-server"
)

# Model identifier used in swebench-style prediction entries.
# The swebench harness uses this value to create log directory structures
Expand Down
33 changes: 13 additions & 20 deletions benchmarks/utils/modal_patches.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,17 @@ def emit(message: str) -> None:
return emit


def _get_sdk_short_sha() -> str:
def _get_image_tag_prefix() -> str:
"""
Resolve SDK short SHA from the benchmarks repo when available, otherwise
fall back to environment variables for the Modal function image.
Resolve the image tag prefix from the benchmarks repo when available,
otherwise fall back to environment variables for the Modal function image.
"""
try:
from benchmarks.utils.version import SDK_SHORT_SHA as version_sdk_short_sha
from benchmarks.utils.version import IMAGE_TAG_PREFIX

return version_sdk_short_sha
return IMAGE_TAG_PREFIX
except Exception:
return os.getenv("SDK_SHORT_SHA", "").strip() or "unknown"
return os.getenv("IMAGE_TAG_PREFIX", "").strip() or "unknown"


def _get_agent_server_image_repo() -> str:
Expand Down Expand Up @@ -77,18 +77,18 @@ def _build_prebuilt_image_tag(test_spec) -> str:
if not instance_id:
raise RuntimeError("TestSpec missing instance_id; cannot select Modal image")

sdk_short_sha = _get_sdk_short_sha()
if sdk_short_sha in ("", "unknown", None):
image_tag_prefix = _get_image_tag_prefix()
if image_tag_prefix in ("", "unknown", None):
raise RuntimeError(
"SDK short SHA is unavailable. Set SDK_SHORT_SHA or ensure the "
"Image tag prefix is unavailable. Set IMAGE_TAG_PREFIX or ensure the "
"benchmarks repository has an initialized SDK submodule."
)

target = _get_build_target()
suffix = f"-{target}" if target and target != "binary" else ""
custom_tag = _get_custom_tag_from_instance_id(instance_id)
agent_repo = _get_agent_server_image_repo()
return f"{agent_repo}:{sdk_short_sha}-{custom_tag}{suffix}"
return f"{agent_repo}:{image_tag_prefix}-{custom_tag}{suffix}"


def _patch_modal_sklearn_install_flag() -> None:
Expand Down Expand Up @@ -497,16 +497,9 @@ def _inject_modal_sitecustomize() -> None:
)

env_vars = {"PYTHONPATH": "/root"}
try:
from benchmarks.utils.version import SDK_SHA, SDK_SHORT_SHA

env_vars["SDK_SHA"] = SDK_SHA
env_vars["SDK_SHORT_SHA"] = SDK_SHORT_SHA
except Exception:
sdk_sha_env = os.getenv("SDK_SHA")
if sdk_sha_env:
env_vars["SDK_SHA"] = sdk_sha_env
env_vars["SDK_SHORT_SHA"] = _get_sdk_short_sha()
env_vars["IMAGE_TAG_PREFIX"] = _get_image_tag_prefix()
# Backward compatibility - remove in next major version
env_vars["SDK_SHORT_SHA"] = env_vars["IMAGE_TAG_PREFIX"]

env_vars["EVAL_AGENT_SERVER_IMAGE"] = _get_agent_server_image_repo()
env_vars["SWEBENCH_IMAGE_TARGET"] = _get_build_target()
Expand Down
Loading
Loading