diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c130cfd2e..cda47fc59 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -36,4 +36,4 @@ repos: types: [python] pass_filenames: true always_run: false - exclude: ^legacy/ + exclude: ^(legacy|nemo_evaluator)/ diff --git a/nemo_evaluator/openhands_benchmarks/__init__.py b/nemo_evaluator/openhands_benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nemo_evaluator/openhands_benchmarks/framework.yml b/nemo_evaluator/openhands_benchmarks/framework.yml new file mode 100644 index 000000000..2d499c157 --- /dev/null +++ b/nemo_evaluator/openhands_benchmarks/framework.yml @@ -0,0 +1,261 @@ +framework: + name: openhands_benchmarks + pkg_name: openhands_benchmarks + full_name: OpenHands Benchmarks + description: Multi-benchmark evaluation harness using the OpenHands agent framework. + url: https://github.com/All-Hands-AI/openhands-agent-benchmarks + +defaults: + command: >- + python3 -m nemo_evaluator.openhands_benchmarks.run_benchmark + --model openai/{{target.api_endpoint.model_id}} + --api-base-url {{target.api_endpoint.url}} + {% if target.api_endpoint.api_key_name is not none %}--api-key-env {{target.api_endpoint.api_key_name}}{% endif %} + --temperature {{config.params.temperature}} + --top-p {{config.params.top_p}} + --max-completion-tokens {{config.params.max_new_tokens}} + --timeout {{config.params.request_timeout}} + --max-retries {{config.params.max_retries}} + --benchmark {{config.params.extra.benchmark}} + {% if config.params.extra.dataset is defined and config.params.extra.dataset is not none %}--dataset {{config.params.extra.dataset}}{% endif %} + {% if config.params.extra.split is defined and config.params.extra.split is not none %}--split {{config.params.extra.split}}{% endif %} + --workspace {{config.params.extra.workspace}} + --max-iterations {{config.params.extra.max_steps}} + --num-workers {{config.params.parallelism}} + --note {{config.type}} + --output-dir {{config.output_dir}} + --max-attempts {{config.params.extra.max_attempts}} + --instance-max-retries {{config.params.extra.instance_max_retries}} + {% if config.params.limit_samples is not none %}--n-limit {{config.params.limit_samples}}{% endif %} + {% if config.params.extra.level is defined and config.params.extra.level is not none %}--level {{config.params.extra.level}}{% endif %} + {% if config.params.extra.repo_split is defined and config.params.extra.repo_split is not none %}--repo-split {{config.params.extra.repo_split}}{% endif %} + {% if config.params.extra.language is defined and config.params.extra.language is not none %}--language {{config.params.extra.language}}{% endif %} + {% if config.params.extra.modal is defined and config.params.extra.modal is not none %}{% if config.params.extra.modal %}--modal{% else %}--no-modal{% endif %}{% endif %} + + config: + params: + limit_samples: null + temperature: 0.6 + top_p: 1.0 + max_new_tokens: 64000 + request_timeout: 84000 + max_retries: 5 + parallelism: 1 + extra: + workspace: docker + max_steps: 100 + max_attempts: 3 + instance_max_retries: 3 + target: + api_endpoint: + adapter_config: + mode: client # disable adapters by default + +evaluations: + # SWE-bench variants + - name: swebench-verified + description: SWE-bench Verified - 500 human-validated GitHub issues + defaults: + config: + type: swebench-verified + supported_endpoint_types: [chat] + params: + extra: + benchmark: swebench + dataset: princeton-nlp/SWE-bench_Verified + split: test + + - name: 
swebench-lite + description: SWE-bench Lite - 300 curated GitHub issues + defaults: + config: + type: swebench-lite + supported_endpoint_types: [chat] + params: + extra: + benchmark: swebench + dataset: princeton-nlp/SWE-bench_Lite + split: test + + - name: swebench-full + description: SWE-bench Full - Complete dataset of GitHub issues + defaults: + config: + type: swebench-full + supported_endpoint_types: [chat] + params: + extra: + benchmark: swebench + dataset: princeton-nlp/SWE-bench + split: test + + # GAIA benchmark + - name: gaia + description: GAIA - General AI Assistant benchmark for real-world tasks requiring reasoning, tool use, and web browsing + defaults: + config: + type: gaia + supported_endpoint_types: [chat] + params: + extra: + benchmark: gaia + dataset: gaia-benchmark/GAIA + split: test + level: "2023_all" + + # Commit0 benchmark + - name: commit0 + description: Commit0 - Repository-level code generation benchmark + defaults: + config: + type: commit0 + supported_endpoint_types: [chat] + params: + extra: + benchmark: commit0 + dataset: wentingzhao/commit0_combined + split: test + repo_split: lite + max_attempts: 1 + + # Multi-SWE-bench (multilingual) + - name: multiswebench-java + description: Multi-SWE-bench Java - Multilingual SWE-bench for Java repositories + defaults: + config: + type: multiswebench-java + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: java_verified + language: java + + - name: multiswebench-python # empty subset + description: Multi-SWE-bench Python - Multilingual SWE-bench for Python repositories + defaults: + config: + type: multiswebench-python + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: python_verified + language: python + + - name: multiswebench-go + description: Multi-SWE-bench Go - Multilingual SWE-bench for Go repositories + defaults: + config: + type: multiswebench-go + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: go_verified + language: go + + - name: multiswebench-c + description: Multi-SWE-bench C - Multilingual SWE-bench for C repositories + defaults: + config: + type: multiswebench-c + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: c_verified + language: c + + - name: multiswebench-cpp + description: Multi-SWE-bench C++ - Multilingual SWE-bench for C++ repositories + defaults: + config: + type: multiswebench-cpp + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: cpp_verified + language: cpp + + - name: multiswebench-js + description: Multi-SWE-bench JavaScript - Multilingual SWE-bench for JavaScript repositories + defaults: + config: + type: multiswebench-js + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: js_verified + language: js + + - name: multiswebench-rust + description: Multi-SWE-bench Rust - Multilingual SWE-bench for Rust repositories + defaults: + config: + type: multiswebench-rust + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: rust_verified + language: rust + + - name: multiswebench-ts + description: Multi-SWE-bench 
TypeScript - Multilingual SWE-bench for TypeScript repositories + defaults: + config: + type: multiswebench-ts + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: ts_verified + language: ts + + # SWT-bench + - name: swtbench + description: SWT-bench - Software testing benchmark for test generation + defaults: + config: + type: swtbench + supported_endpoint_types: [chat] + params: + extra: + benchmark: swtbench + + # SWE-bench Multimodal + - name: swebench-multimodal + description: SWE-bench Multimodal - GitHub issues with visual context + defaults: + config: + type: swebench-multimodal + supported_endpoint_types: [chat] + params: + extra: + benchmark: swebenchmultimodal + dataset: princeton-nlp/SWE-bench_Multimodal + split: dev # test split did not work + + # OpenAgentSafety benchmark + - name: openagentsafety + description: OpenAgentSafety - Safety evaluation benchmark for AI agents + defaults: + config: + type: openagentsafety + supported_endpoint_types: [chat] + params: + extra: + benchmark: openagentsafety + dataset: mgulavani/openagentsafety_full_updated_v3 + split: train diff --git a/nemo_evaluator/openhands_benchmarks/generate_llm_config.py b/nemo_evaluator/openhands_benchmarks/generate_llm_config.py new file mode 100644 index 000000000..651720772 --- /dev/null +++ b/nemo_evaluator/openhands_benchmarks/generate_llm_config.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path + + +def generate_config( + model: str, + output_path: str, + api_base_url: str | None = None, + api_key_env: str | None = None, + temperature: float | None = None, + top_p: float | None = None, + max_completion_tokens: int | None = None, + timeout: int | None = None, + max_retries: int | None = None, +) -> None: + llm_config: dict[str, object] = {"model": model} + + if api_base_url: + # Strip /chat/completions suffix for LiteLLM compatibility + base_url = api_base_url.rstrip("/") + if base_url.endswith("/chat/completions"): + base_url = base_url.removesuffix("/chat/completions") + llm_config["base_url"] = base_url + if api_key_env: + # Resolve env var name to actual API key + api_key = os.environ.get(api_key_env, "") + if not api_key: + raise ValueError( + f"Environment variable {api_key_env} is not set or empty. " + f"Please set it with your API key." 
+ ) + llm_config["api_key"] = api_key + if temperature is not None: + llm_config["temperature"] = temperature + if top_p is not None: + llm_config["top_p"] = top_p + if max_completion_tokens is not None: + llm_config["max_output_tokens"] = max_completion_tokens + if timeout is not None: + llm_config["timeout"] = timeout + if max_retries is not None: + llm_config["num_retries"] = max_retries + + out_path = Path(output_path) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(llm_config, indent=2) + "\n", encoding="utf-8") + + print(f"Wrote LLM config to {str(out_path)}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Generate LLM config from CLI args", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument("--model", type=str, required=True, help="Model name/id") + parser.add_argument("--api-base-url", type=str, help="API base URL") + parser.add_argument( + "--api-key-env", + type=str, + help="Environment variable name containing the API key", + ) + parser.add_argument("--temperature", type=float, help="Sampling temperature") + parser.add_argument("--top-p", type=float, help="Nucleus sampling (top-p)") + parser.add_argument( + "--max-completion-tokens", type=int, help="Max completion tokens" + ) + parser.add_argument("--timeout", type=int, help="API timeout in seconds") + parser.add_argument("--max-retries", type=int, help="Max API call retries") + parser.add_argument( + "--output-path", + type=str, + required=True, + help="Where to write the generated JSON config", + ) + + args = parser.parse_args() + + generate_config( + model=args.model, + output_path=args.output_path, + api_base_url=args.api_base_url, + api_key_env=args.api_key_env, + temperature=args.temperature, + top_p=args.top_p, + max_completion_tokens=args.max_completion_tokens, + timeout=args.timeout, + max_retries=args.max_retries, + ) + + +if __name__ == "__main__": + main() diff --git a/nemo_evaluator/openhands_benchmarks/output.py b/nemo_evaluator/openhands_benchmarks/output.py new file mode 100644 index 000000000..7e96e9fc0 --- /dev/null +++ b/nemo_evaluator/openhands_benchmarks/output.py @@ -0,0 +1,61 @@ +import json +import pathlib + +from nemo_evaluator.api.api_dataclasses import EvaluationResult + + +def parse_output(output_dir: str) -> EvaluationResult: + output_path = pathlib.Path(output_dir) + + # Find any .report.json file (all benchmarks use this naming convention) + report_files = sorted(output_path.rglob("*.report.json")) + + if not report_files: + raise FileNotFoundError( + f"No .report.json file found under {output_dir}. " + "Make sure the evaluation completed successfully." + ) + + if len(report_files) > 1: + raise ValueError( + f"Multiple .report.json files found: {report_files}. " + "`output_dir` must contain a single evaluation run." + ) + + report = json.loads(report_files[0].read_text(encoding="utf-8")) + + # Get benchmark name from metadata written by run_benchmark.py + metadata_file = output_path / "nemo_metadata.json" + if not metadata_file.exists(): + raise FileNotFoundError( + f"nemo_metadata.json not found in {output_dir}. " + "Make sure the benchmark was run via run_benchmark.py." 
+ ) + metadata = json.loads(metadata_file.read_text(encoding="utf-8")) + task_name = metadata["benchmark"] + + # All benchmarks have these common fields in their report + resolved = report.get("resolved_instances", 0) + submitted = report.get("submitted_instances", 0) + + # Calculate accuracy (handle division by zero) + accuracy = resolved / submitted if submitted > 0 else 0.0 + + metrics = { + "accuracy": { + "scores": { + "accuracy": { + "value": accuracy, + "stats": { + "resolved": resolved, + "total": submitted, + }, + } + } + } + } + + tasks = {task_name: {"metrics": metrics}} + groups = {task_name: {"metrics": metrics}} + + return EvaluationResult(tasks=tasks, groups=groups) diff --git a/nemo_evaluator/openhands_benchmarks/run_benchmark.py b/nemo_evaluator/openhands_benchmarks/run_benchmark.py new file mode 100644 index 000000000..31317797b --- /dev/null +++ b/nemo_evaluator/openhands_benchmarks/run_benchmark.py @@ -0,0 +1,255 @@ +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +from collections.abc import Callable +from pathlib import Path + +from nemo_evaluator.openhands_benchmarks.generate_llm_config import generate_config + + +INFER_ENTRYPOINTS = { + "swebench": "swebench-infer", + "gaia": "gaia-infer", + "commit0": "commit0-infer", + "multiswebench": "multiswebench-infer", + "swtbench": "swtbench-infer", + "swebenchmultimodal": "swebenchmultimodal-infer", + "openagentsafety": "openagentsafety-infer", +} + +EVAL_ENTRYPOINTS = { + "swebench": "swebench-eval", + "gaia": "gaia-eval", + "commit0": "commit0-eval", + "multiswebench": "multiswebench-eval", + "swtbench": "swtbench-eval", + "swebenchmultimodal": "swebenchmultimodal-eval", + # openagentsafety doesn't have a separate eval entrypoint +} + +# Benchmark-specific inference parameters. +# Each entry maps a benchmark name to a function that returns a list of +# (flag, value) tuples to append to the inference command. +BENCHMARK_INFER_PARAMS: dict[ + str, Callable[[argparse.Namespace], list[tuple[str, str]]] +] = { + "gaia": lambda args: [("--level", args.level)] if args.level else [], + "commit0": lambda args: [("--repo-split", args.repo_split)] + if args.repo_split + else [], + "multiswebench": lambda args: [("--lang", args.language)] if args.language else [], +} + +# Benchmark-specific evaluation parameters. +# Each entry returns a list of (flag, value) tuples. Empty value = bare flag. +BENCHMARK_EVAL_PARAMS: dict[ + str, Callable[[argparse.Namespace, Path], list[tuple[str, str]]] +] = { + "swebench": lambda args, out: [ + *([("--dataset", args.dataset)] if args.dataset else []), + ("--run-id", out.stem), + *([("--modal", "")] if args.modal is True else []), + *([("--no-modal", "")] if args.modal is False else []), + ], + "swebenchmultimodal": lambda args, _: [ + *([("--dataset", args.dataset)] if args.dataset else []), + ], + "multiswebench": lambda args, _: [ + *([("--dataset", args.dataset)] if args.dataset else []), + *([("--lang", args.language)] if args.language else []), + ], +} + +# Benchmark-specific environment variables to set before inference. +BENCHMARK_ENV_VARS: dict[str, Callable[[argparse.Namespace], dict[str, str]]] = { + "multiswebench": lambda args: ( + {"LANGUAGE": args.language} if args.language else {} + ), +} + +# Patch-based benchmarks use "finish_with_patch" (requires git patch). +# gaia and openagentsafety use "pass" (accept any completed output). 
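+# Benchmarks missing from this table fall back to "finish_with_patch" via
+# BENCHMARK_CRITIC.get() in _build_infer_cmd below. As an illustration,
+# --benchmark multiswebench --language java yields roughly
+#   multiswebench-infer llm_config.json ... --critic finish_with_patch --lang java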
+BENCHMARK_CRITIC = { + "swebench": "finish_with_patch", + "swtbench": "finish_with_patch", + "swebenchmultimodal": "finish_with_patch", + "multiswebench": "finish_with_patch", + "commit0": "finish_with_patch", + "gaia": "pass", + "openagentsafety": "pass", +} + + +def _build_infer_cmd(args: argparse.Namespace, llm_config_path: Path) -> list[str]: + """Build the inference command with benchmark-specific args.""" + cmd = [ + INFER_ENTRYPOINTS[args.benchmark], + str(llm_config_path), + "--workspace", + args.workspace, + "--max-iterations", + str(args.max_iterations), + "--num-workers", + str(args.num_workers), + "--output-dir", + str(args.output_dir), + "--max-attempts", + str(args.max_attempts), + "--max-retries", + str(args.instance_max_retries), + "--critic", + BENCHMARK_CRITIC.get(args.benchmark, "finish_with_patch"), + ] + if args.dataset: + cmd.extend(["--dataset", args.dataset]) + if args.split: + cmd.extend(["--split", args.split]) + + if args.note: + cmd.extend(["--note", args.note]) + if args.n_limit is not None: + cmd.extend(["--n-limit", str(args.n_limit)]) + # ----- Benchmark-specific inference args ----- + if args.benchmark in BENCHMARK_INFER_PARAMS: + for flag, value in BENCHMARK_INFER_PARAMS[args.benchmark](args): + cmd.extend([flag, value]) + + return cmd + + +def _build_eval_cmd(args: argparse.Namespace, output_jsonl: Path) -> list[str]: + """Build the evaluation command with benchmark-specific args.""" + benchmark = args.benchmark + if benchmark not in EVAL_ENTRYPOINTS: + return [] + + cmd = [EVAL_ENTRYPOINTS[benchmark], str(output_jsonl)] + + if benchmark in BENCHMARK_EVAL_PARAMS: + for flag, value in BENCHMARK_EVAL_PARAMS[benchmark](args, output_jsonl): + if value: + cmd.extend([flag, value]) + else: + cmd.append(flag) + + return cmd + + +def main() -> None: + parser = argparse.ArgumentParser() + + # LLM config generation args + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--api-base-url", type=str, required=True) + parser.add_argument( + "--api-key-env", type=str, default=None, help="Env var name for API key" + ) + parser.add_argument("--temperature", type=float, default=0.0) + parser.add_argument("--top-p", type=float, default=1.0) + parser.add_argument("--max-completion-tokens", type=int, default=4096) + parser.add_argument("--timeout", type=int, default=600) + parser.add_argument("--max-retries", type=int, default=3) + + # Benchmark selection + parser.add_argument("--benchmark", required=True, choices=INFER_ENTRYPOINTS.keys()) + + # Common inference args + parser.add_argument("--dataset", type=str, default=None) + parser.add_argument("--split", type=str, default=None) + parser.add_argument("--workspace", type=str, default="docker") + parser.add_argument("--max-iterations", type=int, default=100) + parser.add_argument("--num-workers", type=int, default=1) + parser.add_argument("--note", type=str, default="") + parser.add_argument("--output-dir", type=str, required=True) + parser.add_argument("--max-attempts", type=int, default=3) + parser.add_argument("--instance-max-retries", type=int, default=3) + parser.add_argument("--n-limit", type=int, default=None) + + # GAIA + parser.add_argument( + "--level", + type=str, + default="2023_all", + help="GAIA level (e.g. 
2023_level1, 2023_all)",
+    )
+    # commit0
+    parser.add_argument(
+        "--repo-split",
+        type=str,
+        default="lite",
+        help="commit0 repo split (lite, all, or repo name)",
+    )
+    # multiswebench
+    parser.add_argument(
+        "--language",
+        type=str,
+        default=None,
+        help="multiswebench language (java, python, go, c, cpp, js, rust, ts)",
+    )
+    # swebench/swebenchmultimodal
+    parser.add_argument(
+        "--modal",
+        dest="modal",
+        action=argparse.BooleanOptionalAction,
+        default=None,
+        help=(
+            "Enable/disable Modal for swebench and swebenchmultimodal evaluation. "
+            "If omitted, each benchmark uses its default."
+        ),
+    )
+
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    llm_config_path = output_dir / "llm_config.json"
+
+    # 1) Generate LLM config
+    generate_config(
+        model=args.model,
+        api_base_url=args.api_base_url,
+        api_key_env=args.api_key_env,
+        temperature=args.temperature,
+        top_p=args.top_p,
+        max_completion_tokens=args.max_completion_tokens,
+        timeout=args.timeout,
+        max_retries=args.max_retries,
+        output_path=str(llm_config_path),
+    )
+
+    # 2) Write NeMo metadata (so output.py can identify the benchmark)
+    (output_dir / "nemo_metadata.json").write_text(
+        json.dumps({"benchmark": args.benchmark}) + "\n"
+    )
+
+    # 3) Run inference
+    if args.benchmark in BENCHMARK_ENV_VARS:
+        os.environ.update(BENCHMARK_ENV_VARS[args.benchmark](args))
+
+    infer_cmd = _build_infer_cmd(args, llm_config_path)
+    ret = subprocess.call(infer_cmd)
+    if ret != 0:
+        sys.exit(ret)
+
+    # 4) Find output.jsonl and run evaluation
+    output_files = sorted(output_dir.rglob("output.jsonl"))
+    if not output_files:
+        print(
+            f"ERROR: Inference did not produce output.jsonl under {output_dir}",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    output_jsonl = output_files[-1]  # Use the latest one
+
+    eval_cmd = _build_eval_cmd(args, output_jsonl)
+    if eval_cmd:
+        sys.exit(subprocess.call(eval_cmd))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 2cb142648..65fc36436 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,8 +47,13 @@ dependencies = [
     "hf-xet>=1.1.10,<2.0",
 ]
 
+[project.optional-dependencies]
+nemo = ["nemo_evaluator>=0.1.92,<0.2"]
+
 [project.scripts]
 validate-cfg = "benchmarks.scripts.validate_cfg:main"
+generate-llm-config = "nemo_evaluator.openhands_benchmarks.generate_llm_config:main"
+run-benchmark = "nemo_evaluator.openhands_benchmarks.run_benchmark:main"
 swebench-infer = "benchmarks.swebench.run_infer:main"
 swtbench-infer = "benchmarks.swtbench.run_infer:main"
 swebench-eval = "benchmarks.swebench.eval_infer:main"
@@ -73,12 +78,16 @@ build-backend = "setuptools.build_meta"
 
 [tool.setuptools.packages.find]
 where = ["."]
-include = ["benchmarks"]
+include = ["benchmarks*", "nemo_evaluator*"]
 
 [tool.setuptools]
 # Install the top-level sitecustomize module so Python auto-loads our Modal logging patch.
py-modules = ["sitecustomize"] +[tool.setuptools.package-data] +nemo_evaluator = ["**/*.yml"] +benchmarks = ["**/*.j2", "**/Dockerfile*", "**/*.json"] + [dependency-groups] dev = [ "pre-commit>=4.3.0", @@ -95,7 +104,6 @@ dev = [ [tool.ruff] target-version = "py312" line-length = 88 -exclude = ["legacy"] [tool.ruff.format] quote-style = "double" diff --git a/uv.lock b/uv.lock index db834cb47..288d17f8c 100644 --- a/uv.lock +++ b/uv.lock @@ -178,6 +178,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6f/12/e5e0282d673bb9746bacfb6e2dba8719989d3660cdb2ea79aee9a9651afb/anyio-4.10.0-py3-none-any.whl", hash = "sha256:60e474ac86736bbfd6f210f7a61218939c318f43f9972497381f1c5e930ed3d1", size = 107213, upload-time = "2025-08-04T08:54:24.882Z" }, ] +[[package]] +name = "argcomplete" +version = "3.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/38/61/0b9ae6399dd4a58d8c1b1dc5a27d6f2808023d0b5dd3104bb99f45a33ff6/argcomplete-3.6.3.tar.gz", hash = "sha256:62e8ed4fd6a45864acc8235409461b72c9a28ee785a2011cc5eb78318786c89c", size = 73754, upload-time = "2025-10-20T03:33:34.741Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/74/f5/9373290775639cb67a2fce7f629a1c240dce9f12fe927bc32b2736e16dfc/argcomplete-3.6.3-py3-none-any.whl", hash = "sha256:f5007b3a600ccac5d25bbce33089211dfd49eab4a7718da3f10e3082525a92ce", size = 43846, upload-time = "2025-10-20T03:33:33.021Z" }, +] + [[package]] name = "attrs" version = "25.3.0" @@ -242,6 +251,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/24/7e/f7b6f453e6481d1e233540262ccbfcf89adcd43606f44a028d7f5fae5eb2/binaryornot-0.4.4-py2.py3-none-any.whl", hash = "sha256:b8b71173c917bddcd2c16070412e369c3ed7f0528926f70cac18a6c97fd563e4", size = 9006, upload-time = "2017-08-03T15:55:31.23Z" }, ] +[[package]] +name = "blinker" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, +] + [[package]] name = "boto3" version = "1.40.49" @@ -978,6 +996,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" }, ] +[[package]] +name = "flask" +version = "3.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "blinker" }, + { name = "click" }, + { name = "itsdangerous" }, + { name = "jinja2" }, + { name = "markupsafe" }, + { name = "werkzeug" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004, upload-time = "2026-02-19T05:00:57.678Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424, upload-time = "2026-02-19T05:00:56.027Z" }, +] + [[package]] name = "frozenlist" version = "1.7.0" @@ -1547,6 +1582,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" }, ] +[[package]] +name = "itsdangerous" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -2106,6 +2150,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, ] +[[package]] +name = "nemo-evaluator" +version = "0.1.92" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "flask" }, + { name = "httpx" }, + { name = "jinja2" }, + { name = "psutil" }, + { name = "pydantic" }, + { name = "pydantic-core" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "structlog" }, + { name = "typing-extensions" }, + { name = "werkzeug" }, + { name = "yq" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d5/ba/f4c275c7a7ac14a413df588ed9dc6b986b7d4dad8df6872170b04ee319f7/nemo_evaluator-0.1.92.tar.gz", hash = "sha256:b274e0f28f37389647d70c890f12ec081ba449a5ec251cab39f320ca349dbc30", size = 150204, upload-time = "2026-03-02T11:54:31.486Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c4/4f/a57f61da43fc12e5e934e4a1eb658c0411a609bea888beb6112b8f418fbf/nemo_evaluator-0.1.92-py3-none-any.whl", hash = "sha256:f075345f0c02aaa8f09edb4dcaaabff87283c78580b61f197a173f99bf98ea83", size = 197442, upload-time = "2026-03-02T11:54:30.436Z" }, +] + [[package]] name = "nodeenv" version = "1.9.1" @@ -2371,6 +2438,11 @@ dependencies = [ { name = "websockets" }, ] +[package.optional-dependencies] +nemo = [ + { name = "nemo-evaluator" }, +] + [package.dev-dependencies] dev = [ { name = "pre-commit" }, @@ -2402,6 +2474,7 @@ requires-dist = [ { name = "lmnr", specifier = ">=0.7.24" }, { name = "modal", specifier = ">=1.1.4" }, { name = "multi-swe-bench", marker = "sys_platform != 'darwin'", specifier = ">=1.1.1" }, + { name = "nemo-evaluator", marker = "extra == 'nemo'", specifier = ">=0.1.92,<0.2" }, { name = "openhands-agent-server", editable = "vendor/software-agent-sdk/openhands-agent-server" }, { name = "openhands-sdk", editable = "vendor/software-agent-sdk/openhands-sdk" }, { name = 
"openhands-tools", editable = "vendor/software-agent-sdk/openhands-tools" }, @@ -2423,6 +2496,7 @@ requires-dist = [ { name = "unidiff", specifier = ">=0.7.5,<0.8.0" }, { name = "websockets", specifier = ">=12" }, ] +provides-extras = ["nemo"] [package.metadata.requires-dev] dev = [ @@ -6623,6 +6697,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/69/297302c5f5f59c862faa31e6cb9a4cd74721cd1e052b38e464c5b402df8b/StrEnum-0.4.15-py3-none-any.whl", hash = "sha256:a30cda4af7cc6b5bf52c8055bc4bf4b2b6b14a93b574626da33df53cf7740659", size = 8851, upload-time = "2023-06-29T22:02:56.947Z" }, ] +[[package]] +name = "structlog" +version = "25.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/52/9ba0f43b686e7f3ddfeaa78ac3af750292662284b3661e91ad5494f21dbc/structlog-25.5.0.tar.gz", hash = "sha256:098522a3bebed9153d4570c6d0288abf80a031dfdb2048d59a49e9dc2190fc98", size = 1460830, upload-time = "2025-10-27T08:28:23.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/45/a132b9074aa18e799b891b91ad72133c98d8042c70f6240e4c5f9dabee2f/structlog-25.5.0-py3-none-any.whl", hash = "sha256:a8453e9b9e636ec59bd9e79bbd4a72f025981b3ba0f5837aebf48f02f37a7f9f", size = 72510, upload-time = "2025-10-27T08:28:21.535Z" }, +] + [[package]] name = "swe-rex" version = "1.4.0" @@ -6813,6 +6896,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/6f/7120676b6d73228c96e17f1f794d8ab046fc910d781c8d151120c3f1569e/toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", size = 16588, upload-time = "2020-11-01T01:40:20.672Z" }, ] +[[package]] +name = "tomlkit" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/af/14b24e41977adb296d6bd1fb59402cf7d60ce364f90c890bd2ec65c43b5a/tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064", size = 187167, upload-time = "2026-01-13T01:14:53.304Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310, upload-time = "2026-01-13T01:14:51.965Z" }, +] + [[package]] name = "tqdm" version = "4.67.1" @@ -7144,6 +7236,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/58/e860788190eba3bcce367f74d29c4675466ce8dddfba85f7827588416f01/wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736", size = 24226, upload-time = "2022-08-23T19:58:19.96Z" }, ] +[[package]] +name = "xmltodict" +version = "1.0.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/70/80f3b7c10d2630aa66414bf23d210386700aa390547278c789afa994fd7e/xmltodict-1.0.4.tar.gz", hash = "sha256:6d94c9f834dd9e44514162799d344d815a3a4faec913717a9ecbfa5be1bb8e61", size = 26124, upload-time = "2026-02-22T02:21:22.074Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/34/98a2f52245f4d47be93b580dae5f9861ef58977d73a79eb47c58f1ad1f3a/xmltodict-1.0.4-py3-none-any.whl", hash = "sha256:a4a00d300b0e1c59fc2bfccb53d7b2e88c32f200df138a0dd2229f842497026a", size = 13580, upload-time = "2026-02-22T02:21:21.039Z" }, +] + [[package]] name = "xxhash" version = "3.5.0" @@ -7247,6 +7348,21 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" }, ] +[[package]] +name = "yq" +version = "3.4.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "argcomplete" }, + { name = "pyyaml" }, + { name = "tomlkit" }, + { name = "xmltodict" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/6a/eb9721ed0929d0f55d167c2222d288b529723afbef0a07ed7aa6cca72380/yq-3.4.3.tar.gz", hash = "sha256:ba586a1a6f30cf705b2f92206712df2281cd320280210e7b7b80adcb8f256e3b", size = 33214, upload-time = "2024-04-27T15:39:43.29Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/ba/d1b21f3e57469030bd6536b91bb28fedd2511d4e68b5a575f2bdb3a3dbb6/yq-3.4.3-py3-none-any.whl", hash = "sha256:547e34bc3caacce83665fd3429bf7c85f8e8b6b9aaee3f953db1ad716ff3434d", size = 18812, upload-time = "2024-04-27T15:39:41.652Z" }, +] + [[package]] name = "zipp" version = "3.23.0"