Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ docs/plans/
nboot-spec.json
.hypothesis/
.worktrees/
.claude-flow/
benchmarks/output/
!benchmarks/output/.gitkeep
benchmarks/martian/output/
benchmarks/martian/output-*/
!benchmarks/martian/output/.gitkeep

# Zensical build output
site/
20 changes: 19 additions & 1 deletion .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,15 @@
"line_number": 58
}
],
"benchmarks/martian/config.py": [
{
"type": "Hex High Entropy String",
"filename": "benchmarks/martian/config.py",
"hashed_secret": "e6a37dac608a721d5d88c51c6af2535e403d405c",
"is_verified": false,
"line_number": 10
}
],
"scripts/check_test_parity.py": [
{
"type": "Secret Keyword",
Expand All @@ -182,6 +191,15 @@
"line_number": 38
}
],
"tests/test_bench_martian_config.py": [
{
"type": "Hex High Entropy String",
"filename": "tests/test_bench_martian_config.py",
"hashed_secret": "e6a37dac608a721d5d88c51c6af2535e403d405c",
"is_verified": false,
"line_number": 9
}
],
"tests/test_grippy_agent_evolution.py": [
{
"type": "Secret Keyword",
Expand Down Expand Up @@ -281,5 +299,5 @@
}
]
},
"generated_at": "2026-03-12T03:37:04Z"
"generated_at": "2026-03-13T18:52:34Z"
}
11 changes: 11 additions & 0 deletions benchmarks/martian/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Grippy Martian Benchmark Harness

Runs Grippy against the [withmartian/code-review-benchmark](https://github.com/withmartian/code-review-benchmark)
50-PR golden dataset.

**This is a dev-path preflight harness.** For public leaderboard claims,
use the Martian-native public path (see design doc).

## Quick Start

See design doc: `docs/plans/2026-03-12-martian-benchmark-harness-design.md`
2 changes: 2 additions & 0 deletions benchmarks/martian/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: MIT
"""Grippy benchmark harness for the Martian code-review-benchmark."""
90 changes: 90 additions & 0 deletions benchmarks/martian/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# SPDX-License-Identifier: MIT
"""Benchmark run configuration — frozen for headline score."""

from __future__ import annotations

import os
from dataclasses import dataclass, field
from pathlib import Path

# Pinned upstream withmartian/code-review-benchmark commit that the golden
# dataset and vendored prompts were taken from (full SHA, for provenance —
# see golden_comments/SOURCE.md).
MARTIAN_COMMIT_SHA = "012d68202e56280855b85abb956410086008a7b2"

# Directory containing this file; anchors the golden/output default paths.
_HERE = Path(__file__).resolve().parent


@dataclass(frozen=True)
class BenchConfig:
    """Immutable benchmark run configuration.

    The defaults are the frozen headline-run settings. Only the fields
    read by :meth:`from_env` can be overridden via ``GRIPPY_*`` environment
    variables; extract/judge models and directories stay pinned.
    """

    # Local model under test, served via an OpenAI-compatible endpoint.
    model_id: str = "nemotron-3-nano-30b-a3b"
    transport: str = "local"
    base_url: str = "http://localhost:1234/v1"
    # NOTE(review): "lm-studio" looks like LM Studio's conventional dummy key
    # for a local server, not a real credential — confirm it is never used
    # against a remote endpoint.
    api_key: str = "lm-studio"
    profile: str = "security"
    mode: str = "pr_review"
    # Claude models used for candidate extraction and judging (pinned IDs).
    extract_model: str = "claude-sonnet-4-5-20250929"
    judge_model: str = "claude-sonnet-4-5-20250929"
    # default_factory so each instance gets its own Path (dataclass rule for
    # non-hashable/mutable-ish defaults) anchored next to this file.
    golden_dir: Path = field(default_factory=lambda: _HERE / "golden_comments")
    output_dir: Path = field(default_factory=lambda: _HERE / "output")

    @classmethod
    def from_env(cls) -> BenchConfig:
        """Build config from GRIPPY_* environment variables.

        Only model/transport/endpoint/key/profile/mode are overridable;
        anything not set in the environment falls back to the frozen
        class defaults.
        """
        return cls(
            model_id=os.environ.get("GRIPPY_MODEL_ID", cls.model_id),
            transport=os.environ.get("GRIPPY_TRANSPORT", cls.transport),
            base_url=os.environ.get("GRIPPY_BASE_URL", cls.base_url),
            api_key=os.environ.get("GRIPPY_API_KEY", cls.api_key),
            profile=os.environ.get("GRIPPY_PROFILE", cls.profile),
            mode=os.environ.get("GRIPPY_MODE", cls.mode),
        )

    def stamp(self) -> dict:
        """Return full provenance dict for embedding in output files.

        Includes everything needed to reproduce this run: Grippy version,
        harness git SHA, vendored prompt checksums, judge/extract model IDs,
        and all config values. Two runs with identical stamps produce
        identical results (modulo LLM non-determinism).

        Note: ``api_key`` and ``base_url`` are deliberately not part of the
        stamp as written here.
        """
        # Imported lazily: only needed when a stamp is actually produced.
        import hashlib
        import subprocess

        # Grippy package version
        try:
            from importlib.metadata import version as pkg_version

            grippy_ver = pkg_version("grippy-mcp")
        except Exception:
            # Package not installed (e.g. running from a source checkout).
            grippy_ver = "unknown"

        # Harness git SHA (full, not truncated)
        try:
            harness_sha = subprocess.check_output(
                ["git", "rev-parse", "HEAD"],
                cwd=_HERE,
                text=True,
                timeout=5,
            ).strip()
        except Exception:
            # Not a git checkout, git missing, or the 5s timeout fired.
            harness_sha = "unknown"

        # Vendored prompt checksums (full SHA-256, not truncated)
        # NOTE(review): if prompts/ is missing, glob yields nothing and this
        # stays an empty dict rather than raising — confirm that's intended.
        prompt_checksums = {}
        prompts_dir = _HERE / "prompts"
        for p in sorted(prompts_dir.glob("*.txt")):
            h = hashlib.sha256(p.read_bytes()).hexdigest()
            prompt_checksums[p.name] = h

        return {
            "grippy_version": grippy_ver,
            "harness_git_sha": harness_sha,
            "model_id": self.model_id,
            "transport": self.transport,
            "profile": self.profile,
            "mode": self.mode,
            "martian_commit": MARTIAN_COMMIT_SHA,
            "extract_model": self.extract_model,
            "judge_model": self.judge_model,
            "prompt_checksums": prompt_checksums,
        }
135 changes: 135 additions & 0 deletions benchmarks/martian/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# SPDX-License-Identifier: MIT
"""Step 3: Extract discrete candidates from Grippy review comments.

Mirrors Martian's step2_extract_comments.py:
- Inline comments (file + line) become candidates directly.
- General comments are LLM-extracted into discrete issues.

Prompts vendored verbatim from Martian @ 012d682.
"""

from __future__ import annotations

import json
import sys
from collections.abc import Callable
from pathlib import Path

from benchmarks.martian.config import BenchConfig

# --- Vendored Martian prompts (loaded from files, not inline) ---
_PROMPTS_DIR = Path(__file__).resolve().parent / "prompts"

# Loaded eagerly at import time so a missing/renamed prompt file fails fast
# (FileNotFoundError on import) rather than partway through a benchmark run.
EXTRACT_SYSTEM_PROMPT = (_PROMPTS_DIR / "extract_system.txt").read_text().strip()
EXTRACT_USER_PROMPT = (_PROMPTS_DIR / "extract.txt").read_text().strip()

# --- End vendored prompts ---


def inline_to_candidate(inline: dict) -> dict:
    """Map one inline review comment directly onto a candidate dict.

    Inline comments already carry file/line context, so no LLM extraction
    is needed: ``body`` is required; ``path`` and ``line`` default to None
    when absent. Key order is preserved for stable JSON serialization.
    """
    file_path = inline.get("path")
    file_line = inline.get("line")
    return {
        "text": inline["body"],
        "path": file_path,
        "line": file_line,
        "source": "inline",
    }


def _llm_extract_default(
    text: str,
    model: str = "claude-sonnet-4-5-20250929",
) -> list[str]:
    """Extract discrete issues from general comment text using Claude.

    Args:
        text: concatenated general review-comment text.
        model: Anthropic model ID. The default is the frozen extract model;
            keep it in sync with ``BenchConfig.extract_model`` (previously
            this was hardcoded twice — here and in config.py).

    Returns:
        List of issue strings from the ``"issues"`` key of the model's
        JSON reply (empty list if the key is missing).

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON after fence
            stripping.
    """
    import anthropic

    client = anthropic.Anthropic()
    resp = client.messages.create(
        model=model,
        max_tokens=2048,
        temperature=0.0,  # as deterministic as the API allows, for reproducibility
        system=EXTRACT_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": EXTRACT_USER_PROMPT.format(comment=text)}],
    )
    content = resp.content[0].text.strip()
    # Strip markdown code fences if present (Claude sometimes wraps JSON).
    # The "json" language tag is only removed when a fence was actually
    # present, so a plain reply that happens to start with "json" survives.
    if content.startswith("```"):
        content = content.split("```")[1]
        if content.startswith("json"):
            content = content[4:]
    content = content.strip()
    parsed = json.loads(content)
    return parsed.get("issues", [])


def extract_candidates_for_pr(
    comments: dict,
    llm_extract_fn: Callable[[str], list[str]] | None = _llm_extract_default,
) -> list[dict]:
    """Extract candidates from a PR's comments.

    Args:
        comments: dict with "inline" and "general" keys from run_grippy.py
        llm_extract_fn: function to extract issues from general text.
            Pass None to skip general extraction (when no general comments).
    """
    # Inline findings map 1:1 onto candidates — no extraction step.
    candidates = [inline_to_candidate(c) for c in comments.get("inline", [])]

    # General findings are joined into one document and split back into
    # discrete issues by the LLM.
    general_texts = comments.get("general", [])
    if general_texts and llm_extract_fn is not None:
        merged = "\n\n---\n\n".join(general_texts)
        candidates.extend(
            {
                "text": issue_text,
                "path": None,
                "line": None,
                "source": "extracted",
            }
            for issue_text in llm_extract_fn(merged)
        )

    return candidates


def extract_all(config: BenchConfig | None = None) -> None:
    """Extract candidates for all reviewed PRs.

    Writes per-PR status to extract_manifest.json for unified failure accounting.
    """
    cfg = config or BenchConfig()
    comments_dir = cfg.output_dir / "comments"
    candidates_dir = cfg.output_dir / "candidates"
    candidates_dir.mkdir(parents=True, exist_ok=True)

    manifest: list[dict] = []
    for src_path in sorted(comments_dir.glob("*.json")):
        slug = src_path.stem
        dst_path = candidates_dir / f"{slug}.json"

        try:
            comments = json.loads(src_path.read_text())
            n_general = len(comments.get("general", []))

            print(f"{slug}: {len(comments.get('inline', []))} inline, {n_general} general")

            # Skip the LLM round-trip entirely when there is nothing general.
            fn = _llm_extract_default if n_general > 0 else None
            cands = extract_candidates_for_pr(comments, llm_extract_fn=fn)

            dst_path.write_text(json.dumps(cands, indent=2))
            print(f" → {len(cands)} candidates")
            manifest.append({"pr": slug, "status": "ok", "candidates": len(cands)})
        except Exception as e:
            # Record the failure but keep processing the remaining PRs.
            print(f" ERROR: {e}", file=sys.stderr)
            manifest.append({"pr": slug, "status": "failed", "reason": str(e)})

    # Write extract manifest for unified failure accounting
    (cfg.output_dir / "extract_manifest.json").write_text(json.dumps(manifest, indent=2))
    print(f"Done. Candidates in {candidates_dir}")


if __name__ == "__main__":
    # Script entry point: run extraction with the default (frozen) config.
    extract_all()
99 changes: 99 additions & 0 deletions benchmarks/martian/fetch_diffs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# SPDX-License-Identifier: MIT
"""Step 1: Fetch PR diffs from GitHub for benchmark corpus."""

from __future__ import annotations

import json
import os
import sys
import time
from pathlib import Path
from urllib.parse import urlparse

import httpx

from benchmarks.martian.config import BenchConfig


def parse_golden_prs(golden_dir: Path) -> list[dict]:
    """Parse all golden comment files and extract PR metadata.

    Args:
        golden_dir: directory of vendored golden-comment ``*.json`` files,
            each containing a list of entries with ``url``, ``pr_title``
            and (optionally) ``comments`` keys.

    Returns:
        One dict per PR with owner/repo/number plus golden-file provenance.

    Raises:
        ValueError: if an entry's ``url`` is not a GitHub pull-request URL
            of the form ``https://github.com/<owner>/<repo>/pull/<number>``
            (previously a malformed URL died with a bare IndexError or
            silently mis-parsed).
    """
    prs: list[dict] = []
    for path in sorted(golden_dir.glob("*.json")):
        data = json.loads(path.read_text())
        for entry in data:
            url = entry["url"]
            parts = urlparse(url).path.strip("/").split("/")
            # Expect [owner, repo, "pull", number, ...]; fail loudly on
            # anything else instead of attributing data to the wrong PR.
            if len(parts) < 4 or parts[2] != "pull" or not parts[3].isdigit():
                raise ValueError(f"Unexpected golden PR URL: {url!r} in {path.name}")
            prs.append(
                {
                    "owner": parts[0],
                    "repo": parts[1],
                    "pr_number": int(parts[3]),
                    "pr_title": entry["pr_title"],
                    "golden_url": url,
                    "golden_file": path.name,
                    "golden_comments": entry.get("comments", []),
                }
            )
    return prs


def fetch_diff(
    *,
    owner: str,
    repo: str,
    pr_number: int,
    output_dir: Path,
    token: str,
) -> str:
    """Fetch a PR diff from GitHub API. Returns diff text. Caches to disk.

    A cached copy at ``<output_dir>/<repo>_PR<n>.diff`` is returned without
    any network call; otherwise the diff is fetched and written back.
    """
    cached = output_dir / f"{repo}_PR{pr_number}.diff"
    if cached.exists():
        return cached.read_text()

    api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}"
    response = httpx.get(
        api_url,
        headers={
            "Authorization": f"Bearer {token}",
            # The diff media type makes GitHub return raw unified-diff text.
            "Accept": "application/vnd.github.v3.diff",
        },
        timeout=30,
        follow_redirects=True,
    )
    response.raise_for_status()

    diff_text = response.text
    output_dir.mkdir(parents=True, exist_ok=True)
    cached.write_text(diff_text)
    return diff_text


def fetch_all(config: BenchConfig | None = None) -> None:
    """Fetch diffs for all golden PRs.

    Requires the GITHUB_TOKEN environment variable; exits with status 1 if
    it is unset. Cached diffs are reused (no request, no rate-limit sleep).
    """
    cfg = config or BenchConfig()
    # Token comes only from the environment — never hardcoded.
    token = os.environ.get("GITHUB_TOKEN", "")
    if not token:
        print("ERROR: GITHUB_TOKEN env var required", file=sys.stderr)
        sys.exit(1)

    prs = parse_golden_prs(cfg.golden_dir)
    diff_dir = cfg.output_dir / "diffs"
    diff_dir.mkdir(parents=True, exist_ok=True)

    total = len(prs)
    for idx, pr in enumerate(prs, 1):
        cached = diff_dir / f"{pr['repo']}_PR{pr['pr_number']}.diff"
        status = "cached" if cached.exists() else "fetching"
        print(f"[{idx}/{total}] {pr['repo']}#{pr['pr_number']} — {status}")

        fetch_diff(
            owner=pr["owner"],
            repo=pr["repo"],
            pr_number=pr["pr_number"],
            output_dir=diff_dir,
            token=token,
        )
        if status == "fetching":
            time.sleep(1)  # respect rate limits

    print(f"Done. {total} diffs in {diff_dir}")


if __name__ == "__main__":
    # Script entry point: fetch all golden-PR diffs with the default config.
    fetch_all()
4 changes: 4 additions & 0 deletions benchmarks/martian/golden_comments/SOURCE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Vendored from withmartian/code-review-benchmark (MIT license).
Commit: 012d68202e56280855b85abb956410086008a7b2
Date: 2026-03-05
Files: sentry.json, grafana.json, cal_dot_com.json, discourse.json, keycloak.json
Loading
Loading