-
Notifications
You must be signed in to change notification settings - Fork 1
feat: Martian benchmark harness — 5-phase evaluation pipeline #51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
bd32e1c
52a6f97
52196b9
7b42f80
8266c94
eade034
a8e1e37
e85d186
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| # Grippy Martian Benchmark Harness | ||
|
|
||
| Runs Grippy against the [withmartian/code-review-benchmark](https://github.com/withmartian/code-review-benchmark) | ||
| 50-PR golden dataset. | ||
|
|
||
| **This is a dev-path preflight harness.** For public leaderboard claims, | ||
| use the Martian-native public path (see design doc). | ||
|
|
||
| ## Quick Start | ||
|
|
||
| See design doc: `docs/plans/2026-03-12-martian-benchmark-harness-design.md` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| # SPDX-License-Identifier: MIT | ||
| """Grippy benchmark harness for the Martian code-review-benchmark.""" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| # SPDX-License-Identifier: MIT | ||
| """Benchmark run configuration — frozen for headline score.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import os | ||
| from dataclasses import dataclass, field | ||
| from pathlib import Path | ||
|
|
||
| MARTIAN_COMMIT_SHA = "012d68202e56280855b85abb956410086008a7b2" | ||
|
|
||
| _HERE = Path(__file__).resolve().parent | ||
|
|
||
|
|
||
@dataclass(frozen=True)
class BenchConfig:
    """Immutable benchmark run configuration.

    Frozen so a run cannot mutate its own settings mid-flight; every
    value here feeds into :meth:`stamp` so results are reproducible.
    """

    # Model under test and how the harness reaches it.
    model_id: str = "nemotron-3-nano-30b-a3b"
    transport: str = "local"
    base_url: str = "http://localhost:1234/v1"
    # NOTE(review): "lm-studio" looks like LM Studio's local-server
    # placeholder key rather than a real credential — confirm, and keep
    # real keys out of source (override via GRIPPY_API_KEY).
    api_key: str = "lm-studio"
    profile: str = "security"
    mode: str = "pr_review"
    # Pinned models for the extraction and judging phases (distinct from
    # the model under test); part of the frozen headline-score setup.
    extract_model: str = "claude-sonnet-4-5-20250929"
    judge_model: str = "claude-sonnet-4-5-20250929"
    # Golden comments and output artifacts live next to this file.
    # default_factory keeps the mutable-ish Path defaults per-instance.
    golden_dir: Path = field(default_factory=lambda: _HERE / "golden_comments")
    output_dir: Path = field(default_factory=lambda: _HERE / "output")

    @classmethod
    def from_env(cls) -> BenchConfig:
        """Build config from GRIPPY_* environment variables.

        Only the model/transport knobs are overridable from the
        environment; extract/judge models and the directories always
        take the class defaults.
        """
        return cls(
            model_id=os.environ.get("GRIPPY_MODEL_ID", cls.model_id),
            transport=os.environ.get("GRIPPY_TRANSPORT", cls.transport),
            base_url=os.environ.get("GRIPPY_BASE_URL", cls.base_url),
            api_key=os.environ.get("GRIPPY_API_KEY", cls.api_key),
            profile=os.environ.get("GRIPPY_PROFILE", cls.profile),
            mode=os.environ.get("GRIPPY_MODE", cls.mode),
        )

    def stamp(self) -> dict:
        """Return full provenance dict for embedding in output files.

        Includes everything needed to reproduce this run: Grippy version,
        harness git SHA, vendored prompt checksums, judge/extract model IDs,
        and all config values. Two runs with identical stamps produce
        identical results (modulo LLM non-determinism).
        """
        # Deferred imports: stamp() is called once per run, and keeping
        # these out of module scope keeps config import-time cheap.
        import hashlib
        import subprocess

        # Grippy package version; "unknown" when not installed as a package.
        try:
            from importlib.metadata import version as pkg_version

            grippy_ver = pkg_version("grippy-mcp")
        except Exception:
            grippy_ver = "unknown"

        # Harness git SHA (full, not truncated); "unknown" outside a repo
        # or when git is unavailable/times out.
        try:
            harness_sha = subprocess.check_output(
                ["git", "rev-parse", "HEAD"],
                cwd=_HERE,
                text=True,
                timeout=5,
            ).strip()
        except Exception:
            harness_sha = "unknown"

        # Vendored prompt checksums (full SHA-256, not truncated), sorted
        # by filename so the stamp is deterministic.
        prompt_checksums = {}
        prompts_dir = _HERE / "prompts"
        for p in sorted(prompts_dir.glob("*.txt")):
            h = hashlib.sha256(p.read_bytes()).hexdigest()
            prompt_checksums[p.name] = h

        return {
            "grippy_version": grippy_ver,
            "harness_git_sha": harness_sha,
            "model_id": self.model_id,
            "transport": self.transport,
            "profile": self.profile,
            "mode": self.mode,
            "martian_commit": MARTIAN_COMMIT_SHA,
            "extract_model": self.extract_model,
            "judge_model": self.judge_model,
            "prompt_checksums": prompt_checksums,
        }
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| # SPDX-License-Identifier: MIT | ||
| """Step 3: Extract discrete candidates from Grippy review comments. | ||
|
|
||
| Mirrors Martian's step2_extract_comments.py: | ||
| - Inline comments (file + line) become candidates directly. | ||
| - General comments are LLM-extracted into discrete issues. | ||
|
|
||
| Prompts vendored verbatim from Martian @ 012d682. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import json | ||
| import sys | ||
| from collections.abc import Callable | ||
| from pathlib import Path | ||
|
|
||
| from benchmarks.martian.config import BenchConfig | ||
|
|
||
| # --- Vendored Martian prompts (loaded from files, not inline) --- | ||
| _PROMPTS_DIR = Path(__file__).resolve().parent / "prompts" | ||
|
|
||
| EXTRACT_SYSTEM_PROMPT = (_PROMPTS_DIR / "extract_system.txt").read_text().strip() | ||
| EXTRACT_USER_PROMPT = (_PROMPTS_DIR / "extract.txt").read_text().strip() | ||
|
|
||
| # --- End vendored prompts --- | ||
|
|
||
|
|
||
def inline_to_candidate(inline: dict) -> dict:
    """Map one inline review comment directly onto a candidate dict.

    Inline comments already carry their file/line anchors, so no LLM
    extraction is needed; the comment body is used verbatim as the
    candidate text.
    """
    return dict(
        text=inline["body"],
        path=inline.get("path"),
        line=inline.get("line"),
        source="inline",
    )
|
|
||
|
|
||
def _llm_extract_default(
    text: str,
    *,
    model: str = "claude-sonnet-4-5-20250929",
) -> list[str]:
    """Extract discrete issues from general comment text using Claude.

    Args:
        text: Combined general-comment text to split into discrete issues.
        model: Anthropic model ID to use. Defaults to the same pinned model
            as ``BenchConfig.extract_model`` so headline runs stay
            reproducible; parameterized so callers can align it with a
            non-default config instead of relying on this inline constant.

    Returns:
        List of issue strings parsed from the model's JSON response
        (the ``"issues"`` key; empty list when absent).
    """
    # Deferred import: anthropic is only needed when general comments exist.
    import anthropic

    client = anthropic.Anthropic()
    resp = client.messages.create(
        model=model,
        max_tokens=2048,
        temperature=0.0,
        system=EXTRACT_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": EXTRACT_USER_PROMPT.format(comment=text)}],
    )
    content = resp.content[0].text.strip()
    # Strip markdown code fences if present (Claude sometimes wraps JSON).
    if content.startswith("```"):
        content = content.split("```")[1]
        if content.startswith("json"):
            content = content[4:]
        content = content.strip()
    parsed = json.loads(content)
    return parsed.get("issues", [])
|
|
||
|
|
||
def extract_candidates_for_pr(
    comments: dict,
    llm_extract_fn: Callable[[str], list[str]] | None = _llm_extract_default,
) -> list[dict]:
    """Extract candidates from a PR's comments.

    Args:
        comments: dict with "inline" and "general" keys from run_grippy.py
        llm_extract_fn: function to extract issues from general text.
            Pass None to skip general extraction (when no general comments).

    Returns:
        Candidate dicts with "text", "path", "line", and "source" keys.
    """
    # Inline findings map one-to-one onto candidates.
    candidates = [inline_to_candidate(item) for item in comments.get("inline", [])]

    # General findings are merged into one document and split into
    # discrete issues by the extraction LLM.
    general_texts = comments.get("general", [])
    if general_texts and llm_extract_fn is not None:
        merged = "\n\n---\n\n".join(general_texts)
        for issue_text in llm_extract_fn(merged):
            candidates.append(
                {
                    "text": issue_text,
                    "path": None,
                    "line": None,
                    "source": "extracted",
                }
            )

    return candidates
|
|
||
|
|
||
def extract_all(config: BenchConfig | None = None) -> None:
    """Extract candidates for all reviewed PRs.

    Reads per-PR comment JSON from <output_dir>/comments, writes candidate
    JSON to <output_dir>/candidates, and writes per-PR status to
    extract_manifest.json for unified failure accounting.
    """
    config = config or BenchConfig()
    comment_dir = config.output_dir / "comments"
    cand_dir = config.output_dir / "candidates"
    cand_dir.mkdir(parents=True, exist_ok=True)

    manifest = []
    for comment_path in sorted(comment_dir.glob("*.json")):
        # Slug (filename stem) identifies the PR across pipeline stages.
        slug = comment_path.stem
        cand_path = cand_dir / f"{slug}.json"

        try:
            comments = json.loads(comment_path.read_text())
            n_general = len(comments.get("general", []))

            print(f"{slug}: {len(comments.get('inline', []))} inline, {n_general} general")

            # Skip the LLM entirely when there is nothing to extract.
            extract_fn = _llm_extract_default if n_general > 0 else None
            candidates = extract_candidates_for_pr(comments, llm_extract_fn=extract_fn)

            cand_path.write_text(json.dumps(candidates, indent=2))
            print(f" → {len(candidates)} candidates")
            manifest.append({"pr": slug, "status": "ok", "candidates": len(candidates)})
        except Exception as e:
            # Best-effort by design: record the failure in the manifest and
            # keep going so one bad PR does not abort the whole pass.
            print(f" ERROR: {e}", file=sys.stderr)
            manifest.append({"pr": slug, "status": "failed", "reason": str(e)})

    # Write extract manifest for unified failure accounting
    manifest_path = config.output_dir / "extract_manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2))
    print(f"Done. Candidates in {cand_dir}")


if __name__ == "__main__":
    extract_all()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,99 @@ | ||
| # SPDX-License-Identifier: MIT | ||
| """Step 1: Fetch PR diffs from GitHub for benchmark corpus.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import json | ||
| import os | ||
| import sys | ||
| import time | ||
| from pathlib import Path | ||
| from urllib.parse import urlparse | ||
|
|
||
| import httpx | ||
|
|
||
| from benchmarks.martian.config import BenchConfig | ||
|
|
||
|
|
||
def parse_golden_prs(golden_dir: Path) -> list[dict]:
    """Parse all golden comment files and extract PR metadata.

    Each ``*.json`` file holds a list of entries whose ``"url"`` field
    points at a GitHub PR; owner, repo, and PR number are recovered from
    the URL path.
    """
    records: list[dict] = []
    for golden_path in sorted(golden_dir.glob("*.json")):
        for entry in json.loads(golden_path.read_text()):
            pr_url = entry["url"]
            # URL path segments: [owner, repo, "pull", number]
            segments = urlparse(pr_url).path.strip("/").split("/")
            records.append(
                {
                    "owner": segments[0],
                    "repo": segments[1],
                    "pr_number": int(segments[3]),
                    "pr_title": entry["pr_title"],
                    "golden_url": pr_url,
                    "golden_file": golden_path.name,
                    "golden_comments": entry.get("comments", []),
                }
            )
    return records
|
|
||
|
|
||
def fetch_diff(
    *,
    owner: str,
    repo: str,
    pr_number: int,
    output_dir: Path,
    token: str,
) -> str:
    """Fetch a PR diff from GitHub API. Returns diff text. Caches to disk."""
    cache_path = output_dir / f"{repo}_PR{pr_number}.diff"
    # Serve from the on-disk cache when this PR was fetched before.
    if cache_path.exists():
        return cache_path.read_text()

    api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}"
    # The diff media type makes GitHub return the raw patch body.
    response = httpx.get(
        api_url,
        headers={
            "Authorization": f"Bearer {token}",
            "Accept": "application/vnd.github.v3.diff",
        },
        timeout=30,
        follow_redirects=True,
    )
    response.raise_for_status()

    diff_text = response.text
    output_dir.mkdir(parents=True, exist_ok=True)
    cache_path.write_text(diff_text)
    return diff_text
|
|
||
|
|
||
def fetch_all(config: BenchConfig | None = None) -> None:
    """Fetch diffs for all golden PRs.

    Requires GITHUB_TOKEN in the environment (exits with status 1 when
    missing). Diffs are cached under <output_dir>/diffs; a 1-second pause
    follows each uncached fetch to respect GitHub rate limits.
    """
    config = config or BenchConfig()
    # Token comes from the environment only — never hardcoded in source.
    token = os.environ.get("GITHUB_TOKEN", "")

    if not token:
        print("ERROR: GITHUB_TOKEN env var required", file=sys.stderr)
        sys.exit(1)

    prs = parse_golden_prs(config.golden_dir)
    diff_dir = config.output_dir / "diffs"
    diff_dir.mkdir(parents=True, exist_ok=True)

    for i, pr in enumerate(prs, 1):
        # Status is decided before calling fetch_diff so the log reflects
        # whether the cache will be hit (fetch_diff checks the same path).
        cache_path = diff_dir / f"{pr['repo']}_PR{pr['pr_number']}.diff"
        status = "cached" if cache_path.exists() else "fetching"
        print(f"[{i}/{len(prs)}] {pr['repo']}#{pr['pr_number']} — {status}")

        fetch_diff(
            owner=pr["owner"],
            repo=pr["repo"],
            pr_number=pr["pr_number"],
            output_dir=diff_dir,
            token=token,
        )
        if status == "fetching":
            time.sleep(1)  # respect rate limits

    print(f"Done. {len(prs)} diffs in {diff_dir}")


if __name__ == "__main__":
    fetch_all()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| Vendored from withmartian/code-review-benchmark (MIT license). | ||
| Commit: 012d68202e56280855b85abb956410086008a7b2 | ||
| Date: 2026-03-05 | ||
| Files: sentry.json, grafana.json, cal_dot_com.json, discourse.json, keycloak.json |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🔴 CRITICAL: Hardcoded secret detected in API key assignment
Confidence: 100%
A generic secret (likely an API key) is assigned directly in code at line 37: 'api_key: str = "lm-studio"'. Even though this appears to be a placeholder or local default, committing API keys or secrets — even development ones — into tracked source is a critical security risk. If this value is ever used in production or cloud environments, it could be exfiltrated, leading to unauthorized access. This pattern is caught by secret-scanning tools and may cause repository blocks on some platforms.
Suggestion: Remove hardcoded secrets. Use environment variables or a secure configuration system and ensure 'api_key' is never checked into version control. If this is a fake/dev key, document that and enforce in code that it can never be used outside test/dev.
— Committed secrets — even fake ones — trip repository scanners instantly. Don't do this.