-
Notifications
You must be signed in to change notification settings - Fork 1
feat: Martian benchmark harness — 5-phase evaluation pipeline #51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
bd32e1c
52a6f97
52196b9
7b42f80
8266c94
eade034
a8e1e37
e85d186
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| # Grippy Martian Benchmark Harness | ||
|
|
||
| Runs Grippy against the [withmartian/code-review-benchmark](https://github.com/withmartian/code-review-benchmark) | ||
| 50-PR golden dataset. | ||
|
|
||
| **This is a dev-path preflight harness.** For public leaderboard claims, | ||
| use the Martian-native public path (see design doc). | ||
|
|
||
| ## Quick Start | ||
|
|
||
| See design doc: `docs/plans/2026-03-12-martian-benchmark-harness-design.md` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| # SPDX-License-Identifier: MIT | ||
| """Grippy benchmark harness for the Martian code-review-benchmark.""" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| # SPDX-License-Identifier: MIT | ||
| """Benchmark run configuration — frozen for headline score.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import os | ||
| from dataclasses import dataclass, field | ||
| from pathlib import Path | ||
|
|
||
| MARTIAN_COMMIT_SHA = "012d68202e56280855b85abb956410086008a7b2" | ||
|
|
||
| _HERE = Path(__file__).resolve().parent | ||
|
|
||
|
|
||
@dataclass(frozen=True)
class BenchConfig:
    """Immutable benchmark run configuration.

    Frozen so a run cannot mutate its own settings mid-flight; every
    value here feeds into :meth:`stamp` so results are reproducible.
    """

    # Model under test and how the harness reaches it.
    model_id: str = "nemotron-3-nano-30b-a3b"
    transport: str = "local"
    base_url: str = "http://localhost:1234/v1"
    # NOTE(review): "lm-studio" looks like LM Studio's local-server
    # placeholder key rather than a real credential — confirm, and keep
    # real keys out of source (override via GRIPPY_API_KEY).
    api_key: str = "lm-studio"
    profile: str = "security"
    mode: str = "pr_review"
    # Pinned models for the extraction and judging phases (distinct from
    # the model under test); part of the frozen headline-score setup.
    extract_model: str = "claude-sonnet-4-5-20250929"
    judge_model: str = "claude-sonnet-4-5-20250929"
    # Golden comments and output artifacts live next to this file.
    # default_factory keeps the mutable-ish Path defaults per-instance.
    golden_dir: Path = field(default_factory=lambda: _HERE / "golden_comments")
    output_dir: Path = field(default_factory=lambda: _HERE / "output")

    @classmethod
    def from_env(cls) -> BenchConfig:
        """Build config from GRIPPY_* environment variables.

        Only the model/transport knobs are overridable from the
        environment; extract/judge models and the directories always
        take the class defaults.
        """
        return cls(
            model_id=os.environ.get("GRIPPY_MODEL_ID", cls.model_id),
            transport=os.environ.get("GRIPPY_TRANSPORT", cls.transport),
            base_url=os.environ.get("GRIPPY_BASE_URL", cls.base_url),
            api_key=os.environ.get("GRIPPY_API_KEY", cls.api_key),
            profile=os.environ.get("GRIPPY_PROFILE", cls.profile),
            mode=os.environ.get("GRIPPY_MODE", cls.mode),
        )

    def stamp(self) -> dict:
        """Return full provenance dict for embedding in output files.

        Includes everything needed to reproduce this run: Grippy version,
        harness git SHA, vendored prompt checksums, judge/extract model IDs,
        and all config values. Two runs with identical stamps produce
        identical results (modulo LLM non-determinism).
        """
        # Deferred imports: stamp() is called once per run, and keeping
        # these out of module scope keeps config import-time cheap.
        import hashlib
        import subprocess

        # Grippy package version; "unknown" when not installed as a package.
        try:
            from importlib.metadata import version as pkg_version

            grippy_ver = pkg_version("grippy-mcp")
        except Exception:
            grippy_ver = "unknown"

        # Harness git SHA (full, not truncated); "unknown" outside a repo
        # or when git is unavailable/times out.
        try:
            harness_sha = subprocess.check_output(
                ["git", "rev-parse", "HEAD"],
                cwd=_HERE,
                text=True,
                timeout=5,
            ).strip()
        except Exception:
            harness_sha = "unknown"

        # Vendored prompt checksums (full SHA-256, not truncated), sorted
        # by filename so the stamp is deterministic.
        prompt_checksums = {}
        prompts_dir = _HERE / "prompts"
        for p in sorted(prompts_dir.glob("*.txt")):
            h = hashlib.sha256(p.read_bytes()).hexdigest()
            prompt_checksums[p.name] = h

        return {
            "grippy_version": grippy_ver,
            "harness_git_sha": harness_sha,
            "model_id": self.model_id,
            "transport": self.transport,
            "profile": self.profile,
            "mode": self.mode,
            "martian_commit": MARTIAN_COMMIT_SHA,
            "extract_model": self.extract_model,
            "judge_model": self.judge_model,
            "prompt_checksums": prompt_checksums,
        }
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| # SPDX-License-Identifier: MIT | ||
| """Step 3: Extract discrete candidates from Grippy review comments. | ||
|
|
||
| Mirrors Martian's step2_extract_comments.py: | ||
| - Inline comments (file + line) become candidates directly. | ||
| - General comments are LLM-extracted into discrete issues. | ||
|
|
||
| Prompts vendored verbatim from Martian @ 012d682. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import json | ||
| import sys | ||
| from collections.abc import Callable | ||
| from pathlib import Path | ||
|
|
||
| from benchmarks.martian.config import BenchConfig | ||
|
|
||
| # --- Vendored Martian prompts (loaded from files, not inline) --- | ||
| _PROMPTS_DIR = Path(__file__).resolve().parent / "prompts" | ||
|
|
||
| EXTRACT_SYSTEM_PROMPT = (_PROMPTS_DIR / "extract_system.txt").read_text().strip() | ||
| EXTRACT_USER_PROMPT = (_PROMPTS_DIR / "extract.txt").read_text().strip() | ||
|
|
||
| # --- End vendored prompts --- | ||
|
|
||
|
|
||
def inline_to_candidate(inline: dict) -> dict:
    """Map one inline review comment directly onto a candidate dict.

    Inline comments already carry their file/line anchors, so no LLM
    extraction is needed; the comment body is used verbatim as the
    candidate text.
    """
    return dict(
        text=inline["body"],
        path=inline.get("path"),
        line=inline.get("line"),
        source="inline",
    )
|
|
||
|
|
||
def _llm_extract_default(
    text: str,
    *,
    model: str = "claude-sonnet-4-5-20250929",
) -> list[str]:
    """Extract discrete issues from general comment text using Claude.

    Args:
        text: Combined general-comment text to split into discrete issues.
        model: Anthropic model ID to use. Defaults to the same pinned model
            as ``BenchConfig.extract_model`` so headline runs stay
            reproducible; parameterized so callers can align it with a
            non-default config instead of relying on this inline constant.

    Returns:
        List of issue strings parsed from the model's JSON response
        (the ``"issues"`` key; empty list when absent).
    """
    # Deferred import: anthropic is only needed when general comments exist.
    import anthropic

    client = anthropic.Anthropic()
    resp = client.messages.create(
        model=model,
        max_tokens=2048,
        temperature=0.0,
        system=EXTRACT_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": EXTRACT_USER_PROMPT.format(comment=text)}],
    )
    content = resp.content[0].text.strip()
    # Strip markdown code fences if present (Claude sometimes wraps JSON).
    if content.startswith("```"):
        content = content.split("```")[1]
        if content.startswith("json"):
            content = content[4:]
        content = content.strip()
    parsed = json.loads(content)
    return parsed.get("issues", [])
|
|
||
|
|
||
def extract_candidates_for_pr(
    comments: dict,
    llm_extract_fn: Callable[[str], list[str]] | None = _llm_extract_default,
) -> list[dict]:
    """Extract candidates from a PR's comments.

    Args:
        comments: dict with "inline" and "general" keys from run_grippy.py
        llm_extract_fn: function to extract issues from general text.
            Pass None to skip general extraction (when no general comments).

    Returns:
        Candidate dicts with "text", "path", "line", and "source" keys.
    """
    # Inline findings map one-to-one onto candidates.
    candidates = [inline_to_candidate(item) for item in comments.get("inline", [])]

    # General findings are merged into one document and split into
    # discrete issues by the extraction LLM.
    general_texts = comments.get("general", [])
    if general_texts and llm_extract_fn is not None:
        merged = "\n\n---\n\n".join(general_texts)
        for issue_text in llm_extract_fn(merged):
            candidates.append(
                {
                    "text": issue_text,
                    "path": None,
                    "line": None,
                    "source": "extracted",
                }
            )

    return candidates
|
|
||
|
|
||
def extract_all(config: BenchConfig | None = None) -> None:
    """Extract candidates for all reviewed PRs.

    Reads per-PR comment JSON from <output_dir>/comments, writes candidate
    JSON to <output_dir>/candidates, and writes per-PR status to
    extract_manifest.json for unified failure accounting.
    """
    config = config or BenchConfig()
    comment_dir = config.output_dir / "comments"
    cand_dir = config.output_dir / "candidates"
    cand_dir.mkdir(parents=True, exist_ok=True)

    manifest = []
    for comment_path in sorted(comment_dir.glob("*.json")):
        # Slug (filename stem) identifies the PR across pipeline stages.
        slug = comment_path.stem
        cand_path = cand_dir / f"{slug}.json"

        try:
            comments = json.loads(comment_path.read_text())
            n_general = len(comments.get("general", []))

            print(f"{slug}: {len(comments.get('inline', []))} inline, {n_general} general")

            # Skip the LLM entirely when there is nothing to extract.
            extract_fn = _llm_extract_default if n_general > 0 else None
            candidates = extract_candidates_for_pr(comments, llm_extract_fn=extract_fn)

            cand_path.write_text(json.dumps(candidates, indent=2))
            print(f" → {len(candidates)} candidates")
            manifest.append({"pr": slug, "status": "ok", "candidates": len(candidates)})
        except Exception as e:
            # Best-effort by design: record the failure in the manifest and
            # keep going so one bad PR does not abort the whole pass.
            print(f" ERROR: {e}", file=sys.stderr)
            manifest.append({"pr": slug, "status": "failed", "reason": str(e)})

    # Write extract manifest for unified failure accounting
    manifest_path = config.output_dir / "extract_manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2))
    print(f"Done. Candidates in {cand_dir}")


if __name__ == "__main__":
    extract_all()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,99 @@ | ||
| # SPDX-License-Identifier: MIT | ||
| """Step 1: Fetch PR diffs from GitHub for benchmark corpus.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import json | ||
| import os | ||
| import sys | ||
| import time | ||
| from pathlib import Path | ||
| from urllib.parse import urlparse | ||
|
|
||
| import httpx | ||
|
|
||
| from benchmarks.martian.config import BenchConfig | ||
|
|
||
|
|
||
def parse_golden_prs(golden_dir: Path) -> list[dict]:
    """Parse all golden comment files and extract PR metadata.

    Each ``*.json`` file holds a list of entries whose ``"url"`` field
    points at a GitHub PR; owner, repo, and PR number are recovered from
    the URL path.
    """
    records: list[dict] = []
    for golden_path in sorted(golden_dir.glob("*.json")):
        for entry in json.loads(golden_path.read_text()):
            pr_url = entry["url"]
            # URL path segments: [owner, repo, "pull", number]
            segments = urlparse(pr_url).path.strip("/").split("/")
            records.append(
                {
                    "owner": segments[0],
                    "repo": segments[1],
                    "pr_number": int(segments[3]),
                    "pr_title": entry["pr_title"],
                    "golden_url": pr_url,
                    "golden_file": golden_path.name,
                    "golden_comments": entry.get("comments", []),
                }
            )
    return records
|
|
||
|
|
||
def fetch_diff(
    *,
    owner: str,
    repo: str,
    pr_number: int,
    output_dir: Path,
    token: str,
) -> str:
    """Fetch a PR diff from GitHub API. Returns diff text. Caches to disk."""
    cache_path = output_dir / f"{repo}_PR{pr_number}.diff"
    # Serve from the on-disk cache when this PR was fetched before.
    if cache_path.exists():
        return cache_path.read_text()

    api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}"
    # The diff media type makes GitHub return the raw patch body.
    response = httpx.get(
        api_url,
        headers={
            "Authorization": f"Bearer {token}",
            "Accept": "application/vnd.github.v3.diff",
        },
        timeout=30,
        follow_redirects=True,
    )
    response.raise_for_status()

    diff_text = response.text
    output_dir.mkdir(parents=True, exist_ok=True)
    cache_path.write_text(diff_text)
    return diff_text
|
|
||
|
|
||
def fetch_all(config: BenchConfig | None = None) -> None:
    """Fetch diffs for all golden PRs.

    Requires GITHUB_TOKEN in the environment (exits with status 1 when
    missing). Diffs are cached under <output_dir>/diffs; a 1-second pause
    follows each uncached fetch to respect GitHub rate limits.
    """
    config = config or BenchConfig()
    # Token comes from the environment only — never hardcoded in source.
    token = os.environ.get("GITHUB_TOKEN", "")

    if not token:
        print("ERROR: GITHUB_TOKEN env var required", file=sys.stderr)
        sys.exit(1)

    prs = parse_golden_prs(config.golden_dir)
    diff_dir = config.output_dir / "diffs"
    diff_dir.mkdir(parents=True, exist_ok=True)

    for i, pr in enumerate(prs, 1):
        # Status is decided before calling fetch_diff so the log reflects
        # whether the cache will be hit (fetch_diff checks the same path).
        cache_path = diff_dir / f"{pr['repo']}_PR{pr['pr_number']}.diff"
        status = "cached" if cache_path.exists() else "fetching"
        print(f"[{i}/{len(prs)}] {pr['repo']}#{pr['pr_number']} — {status}")

        fetch_diff(
            owner=pr["owner"],
            repo=pr["repo"],
            pr_number=pr["pr_number"],
            output_dir=diff_dir,
            token=token,
        )
        if status == "fetching":
            time.sleep(1)  # respect rate limits

    print(f"Done. {len(prs)} diffs in {diff_dir}")


if __name__ == "__main__":
    fetch_all()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| Vendored from withmartian/code-review-benchmark (MIT license). | ||
| Commit: 012d68202e56280855b85abb956410086008a7b2 | ||
| Date: 2026-03-05 | ||
| Files: sentry.json, grafana.json, cal_dot_com.json, discourse.json, keycloak.json |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🔴 CRITICAL: Hardcoded secret detected in API key assignment
Confidence: 100%
A generic secret (likely an API key) is assigned directly in code at line 37: 'api_key: str = "lm-studio"'. Even though this appears to be a placeholder or local default, committing API keys or secrets — even development ones — into tracked source is a critical security risk. If this value is ever used in production or cloud environments, it could be exfiltrated, leading to unauthorized access. This pattern is caught by secret-scanning tools and may cause repository blocks on some platforms.
Suggestion: Remove hardcoded secrets. Use environment variables or a secure configuration system and ensure 'api_key' is never checked into version control. If this is a fake/dev key, document that and enforce in code that it can never be used outside test/dev.
— Committed secrets — even fake ones — trip repository scanners instantly. Don't do this.