30 changes: 22 additions & 8 deletions benchmark/download.py
@@ -144,18 +144,30 @@ def add_worktree_for_commit(

def group_rows_by_repo(
rows: Iterable[dict[str, str]],
task: str = "qa",
) -> dict[str, list[Tuple[str, str]]]:
"""
Group rows by repo_url.
Input row should have keys: 'repo_url', 'commit_id', 'problem_id'
Group rows by repo URL.

Args:
rows: Iterable of row dictionaries
task: "qa" for legacy format, "codegen" for SWE-bench format

Returns mapping: repo_url -> list of (commit_id, problem_id)
"""
mapping: dict[str, list[Tuple[str, str]]] = {}
for r in rows:
repo = r["repo_url"]
commit = r["commit_id"]
prob = r["problem_id"]
mapping.setdefault(repo, []).append((commit, prob))
if task == "codegen":

Review comment: Instead of using task, can you just check if it's a URL or a repo slug?
(A sketch of such a check appears after this function.)

# SWE-bench format: "django/django" -> "https://github.com/django/django"
repo_url = f"https://github.com/{r['repo']}"
commit = r["base_commit"]
prob = r["instance_id"]
else:
repo_url = r["repo_url"]
commit = r["commit_id"]
prob = r["problem_id"]

mapping.setdefault(repo_url, []).append((commit, prob))
return mapping
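
Note: a minimal sketch of the reviewer's suggestion above, inferring the row format from the repo field itself instead of a task flag. The helper name group_rows_by_repo_auto is hypothetical; the field names come from the two formats shown in this diff.

from typing import Iterable, Tuple

def group_rows_by_repo_auto(
    rows: Iterable[dict[str, str]],
) -> dict[str, list[Tuple[str, str]]]:
    """Group rows by repo URL, detecting whether the repo field is a URL or a slug."""
    mapping: dict[str, list[Tuple[str, str]]] = {}
    for r in rows:
        repo = r.get("repo_url") or r.get("repo", "")
        if repo.startswith(("http://", "https://")):
            # QA rows already carry a full URL plus commit_id / problem_id.
            repo_url, commit, prob = repo, r["commit_id"], r["problem_id"]
        else:
            # SWE-bench rows carry an "owner/name" slug plus base_commit / instance_id.
            repo_url = f"https://github.com/{repo}"
            commit, prob = r["base_commit"], r["instance_id"]
        mapping.setdefault(repo_url, []).append((commit, prob))
    return mapping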


@@ -273,14 +285,16 @@ def prepare_worktrees(
batch_no: int,
max_workers: int = 8,
skip_if_exists: bool = True,
task: str = "qa",
) -> tuple[dict[tuple[str, str], dict[tuple[str, int], Path]], list[dict[str, Any]]]:
"""
rows: iterable of dicts with keys 'repo_url','commit_id','problem_id'
rows: iterable of dicts with keys 'repo_url','commit_id','problem_id' (QA) or 'repo','base_commit','instance_id' (codegen)
base_dir: root directory where we will create:
- base_dir/bare/<repo_name>.git
- base_dir/worktrees/<repo_name>/<worktree_name>
batch_no: number of batch copies to create per (commit,problem)
max_workers: number of threads for parallel repo processing
task: "qa" for legacy format, "codegen" for SWE-bench format

Review comment: "qa" is not a legacy format.

Returns a list of per-repo summaries.
"""
logger.bind(base_dir=base_dir, batch_no=batch_no, max_worker=max_workers).info(
@@ -289,7 +303,7 @@ def prepare_worktrees(
base = Path(base_dir).resolve()
base.mkdir(parents=True, exist_ok=True)

grouped = group_rows_by_repo(rows)
grouped = group_rows_by_repo(rows, task=task)
summaries = []
repo_map: dict[tuple[str, str], dict[tuple[str, int], Path]] = {}

106 changes: 75 additions & 31 deletions benchmark/evaluate.py
@@ -10,8 +10,11 @@
from loguru import logger

from .download import prepare_worktrees
from .metrics import correctness
from .potpie import get_all_st_answers as get_all_st_answers_potpie
from .metrics import code_correctness, correctness
from .potpie import (
get_all_codegen_answers as get_all_codegen_answers_potpie,
get_all_st_answers as get_all_st_answers_potpie,
)


def get_available_agents() -> list[str]:
@@ -39,6 +42,14 @@ async def main():
help="List of agents to evaluate",
)

parser.add_argument(
"--task",
type=str,
choices=["qa", "codegen"],
default="qa",
help="Type of evaluation to perform (default: qa)",
)

parser.add_argument(
"--input",
type=str,
@@ -53,18 +64,25 @@
help="Output CSV file for results (default: evaluation_results.csv)",
)

parser.add_argument(
"--batch-size",
type=int,
default=2,
help="Number of batch copies to create per problem (default: 2)",
)

args = parser.parse_args()

nbatch = 2
nbatch = args.batch_size

if not Path(args.input).exists():
print(f"Error: Input file '{args.input}' not found.", file=sys.stderr)
sys.exit(1)

assert all(agent in get_available_agents() for agent in args.agents), (
"Invalid Agent(s): {}".format(
", ".join(set(args.agents) - set(get_available_agents()))
)
assert all(
agent in get_available_agents() for agent in args.agents
), "Invalid Agent(s): {}".format(
", ".join(set(args.agents) - set(get_available_agents()))
)

logger.info("Evaluating tools: {}", ", ".join(args.agents))
@@ -78,47 +96,73 @@ async def main():
)

repo_map, summary = prepare_worktrees(
problem_sets, base_dir="/tmp/repos_batch", batch_no=nbatch, max_workers=6
problem_sets,
base_dir="/tmp/repos_batch",
batch_no=nbatch,
max_workers=6,
task=args.task,
)
# repo_dict = worktree_results

# Don't pass expected answers to the tools
problem_sets_without_expected_answers = problem_sets.remove_columns(
"expected_answer"
)

result_awaitables = []
for agent in args.agents:
if agent == "potpie":
result_awaitables.append(
get_all_st_answers_potpie(
problem_sets_without_expected_answers, repo_map
# Select metric and generation function based on task type
if args.task == "codegen":
metric = code_correctness
expected_column = "patch" # SWE-bench uses "patch" for expected output
input_column = "problem_statement"

# Don't pass expected output to the tools
problem_sets_without_expected = problem_sets.remove_columns(expected_column)

result_awaitables = []
for agent in args.agents:
if agent == "potpie":
result_awaitables.append(
get_all_codegen_answers_potpie(
problem_sets_without_expected, repo_map
)
)
else:
...
else: # args.task == "qa"
metric = correctness
expected_column = "expected_answer"
input_column = "question"

# Don't pass expected answers to the tools
problem_sets_without_expected = problem_sets.remove_columns(expected_column)

result_awaitables = []
for agent in args.agents:
if agent == "potpie":
result_awaitables.append(
get_all_st_answers_potpie(
problem_sets_without_expected, repo_map, task=args.task
)
)
)
else:
...
else:
...
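
Note: the two task branches above each pick a metric, an expected-output column, and an input column; a hedged sketch of that selection as a lookup table (the helper name is illustrative, and the import path benchmark.metrics is assumed from this PR's relative import).

from benchmark.metrics import code_correctness, correctness

def select_task_config(task: str):
    """Return (metric, expected_column, input_column) for a benchmark task."""
    table = {
        "codegen": (code_correctness, "patch", "problem_statement"),
        "qa": (correctness, "expected_answer", "question"),
    }
    return table[task]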

answers = await asyncio.gather(*result_awaitables)
print(f"len answers : {len(answers[0])}")
expected_answers = problem_sets.select_columns("expected_answer")
questions = problem_sets.select_columns("question")
questions_batched = [item["question"] for item in questions for _ in range(nbatch)]
expected_outputs = problem_sets.select_columns(expected_column)
inputs = problem_sets.select_columns(input_column)
inputs_batched = [item[input_column] for item in inputs for _ in range(nbatch)]
answers_flattened = [item for sublist in answers for item in sublist]

expected_answers_batched = [
item["expected_answer"] for item in expected_answers for _ in range(nbatch)
expected_outputs_batched = [
item[expected_column] for item in expected_outputs for _ in range(nbatch)
]
test_cases = [
LLMTestCase(
input=question, actual_output=answer, expected_output=expected_answer
input=input_text, actual_output=answer, expected_output=expected_output
)
for question, answer, expected_answer in zip(
questions_batched, answers_flattened, expected_answers_batched
for input_text, answer, expected_output in zip(
inputs_batched, answers_flattened, expected_outputs_batched
)
]
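
Note: a small worked sketch of the batching alignment above, assuming each agent returns its answers per problem with the batch copies adjacent, as the zip implies. Values are illustrative.

nbatch = 2
inputs = ["q1", "q2"]

inputs_batched = [q for q in inputs for _ in range(nbatch)]
# -> ["q1", "q1", "q2", "q2"]

answers = [["a1_run1", "a1_run2", "a2_run1", "a2_run2"]]  # one agent's answers
answers_flattened = [a for sub in answers for a in sub]

# zip pairs each repeated input with the corresponding answer run.
assert list(zip(inputs_batched, answers_flattened)) == [
    ("q1", "a1_run1"), ("q1", "a1_run2"), ("q2", "a2_run1"), ("q2", "a2_run2"),
]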

results = evaluate(
metrics=[correctness],
metrics=[metric],
test_cases=test_cases,
)
metrics = [test_result.success for test_result in results.test_results]
42 changes: 42 additions & 0 deletions benchmark/metrics.py
@@ -45,6 +45,37 @@
Do NOT include any text outside the JSON object.
"""

CODE_EVAL_CRITERIA = """
You are a senior software engineer evaluating a code fix or implementation.
Your task is to assess the quality and correctness of the GENERATED CODE compared to the EXPECTED SOLUTION (or based on the PROBLEM DESCRIPTION).
Evaluate using the following criteria:
1. Functional Correctness (50%): Does the code solve the stated problem?
2. Code Quality & Best Practices (30%): Is the code idiomatic, safe, and maintainable?
3. Logic Similarity (20%): Does it follow a similar (or better) valid approach to the reference solution?
Output ONLY a JSON object with:
{
"score": <number_between_0_and_1>,
"reason": "<brief explanation>"
}
Where:
- 0.0 = The generated code fails to address the problem or is fundamentally incorrect.
- 1.0 = The generated code completely solves the problem with high quality and matches the reference approach or improves upon it.
Scoring Rules:
- Functional Correctness is paramount. If the code doesn't solve the problem, the score should be low (<0.5).
- Code Quality matters. Correct but messy code should be penalized slightly.
- Logic Similarity allows for different valid approaches, but they must solve the same root cause.
Your final output must contain:
- A numeric "score" between 0 and 1.
- A short "reason" describing the main factors affecting the score.
Do NOT include any text outside the JSON object.
"""

correctness = GEval(

Review comment: modify it as qa correctness?

name="Correctness",
criteria=EVAL_CRITERIA,
@@ -54,3 +85,14 @@
],
strict_mode=True,
)

code_correctness = GEval(
name="CodeCorrectness",
criteria=CODE_EVAL_CRITERIA,
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT,
],
strict_mode=True,
)
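
Note: a hedged standalone usage sketch for the new metric. The patch strings are placeholders, the import path benchmark.metrics is assumed, and running it requires a judge LLM to be configured for deepeval.

from deepeval.test_case import LLMTestCase

from benchmark.metrics import code_correctness

tc = LLMTestCase(
    input="Pagination drops the last page; fix the off-by-one in page slicing.",
    actual_output="<generated patch>",
    expected_output="<reference patch>",
)

code_correctness.measure(tc)        # scores the test case with the LLM judge
print(code_correctness.score, code_correctness.reason)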