diff --git a/benchmark/download.py b/benchmark/download.py
index cfc8d43e..6938e12d 100644
--- a/benchmark/download.py
+++ b/benchmark/download.py
@@ -144,18 +144,32 @@ def add_worktree_for_commit(
 
 
 def group_rows_by_repo(
     rows: Iterable[dict[str, str]],
+    task: str = "qa",
 ) -> dict[str, list[Tuple[str, str]]]:
     """
-    Group rows by repo_url.
-    Input row should have keys: 'repo_url', 'commit_id', 'problem_id'
+    Group rows by repo URL.
+
+    Args:
+        rows: Iterable of row dictionaries
+        task: "qa" or "codegen"
+    Returns mapping: repo_url -> list of (commit_id, problem_id)
     """
     mapping: dict[str, list[Tuple[str, str]]] = {}
     for r in rows:
-        repo = r["repo_url"]
-        commit = r["commit_id"]
-        prob = r["problem_id"]
-        mapping.setdefault(repo, []).append((commit, prob))
+        repo_url_value = r.get("repo_url", "")
+        is_url_format = repo_url_value.startswith(("http://", "https://"))
+
+        if is_url_format:
+            repo_url = r["repo_url"]
+            commit = r["commit_id"]
+            prob = r["problem_id"]
+        else:
+            repo_url = f"https://github.com/{r['repo']}"
+            commit = r["base_commit"]
+            prob = r["instance_id"]
+
+        mapping.setdefault(repo_url, []).append((commit, prob))
     return mapping
 
 
@@ -273,14 +287,16 @@ def prepare_worktrees(
     batch_no: int,
     max_workers: int = 8,
     skip_if_exists: bool = True,
+    task: str = "qa",
 ) -> tuple[dict[tuple[str, str], dict[tuple[str, int], Path]], list[dict[str, Any]]]:
     """
-    rows: iterable of dicts with keys 'repo_url','commit_id','problem_id'
+    rows: iterable of dicts with keys 'repo_url','commit_id','problem_id' (URL format) or 'repo','base_commit','instance_id' (slug format)
     base_dir: root directory where we will create:
       - base_dir/bare/.git
      - base_dir/worktrees//
     batch_no: number of batch copies to create per (commit,problem)
     max_workers: number of threads for parallel repo processing
+    task: Deprecated, kept for backward compatibility. Auto-detects format.
     Returns a list of per-repo summaries.
""" logger.bind(base_dir=base_dir, batch_no=batch_no, max_worker=max_workers).info( @@ -289,7 +305,7 @@ def prepare_worktrees( base = Path(base_dir).resolve() base.mkdir(parents=True, exist_ok=True) - grouped = group_rows_by_repo(rows) + grouped = group_rows_by_repo(rows, task=task) summaries = [] repo_map: dict[tuple[str, str], dict[tuple[str, int], Path]] = {} diff --git a/benchmark/evaluate.py b/benchmark/evaluate.py index 1fa4673d..8fc930d4 100644 --- a/benchmark/evaluate.py +++ b/benchmark/evaluate.py @@ -10,8 +10,11 @@ from loguru import logger from .download import prepare_worktrees -from .metrics import correctness -from .potpie import get_all_st_answers as get_all_st_answers_potpie +from .metrics import code_correctness, qa_correctness +from .potpie import ( + get_all_codegen_answers as get_all_codegen_answers_potpie, + get_all_st_answers as get_all_st_answers_potpie, +) def get_available_agents() -> list[str]: @@ -39,6 +42,14 @@ async def main(): help="List of agents to evaluate", ) + parser.add_argument( + "--task", + type=str, + choices=["qa", "codegen"], + default="qa", + help="Type of evaluation to perform (default: qa)", + ) + parser.add_argument( "--input", type=str, @@ -53,18 +64,25 @@ async def main(): help="Output CSV file for results (default: evaluation_results.csv)", ) + parser.add_argument( + "--batch-size", + type=int, + default=2, + help="Number of batch copies to create per problem (default: 2)", + ) + args = parser.parse_args() - nbatch = 2 + nbatch = args.batch_size if not Path(args.input).exists(): print(f"Error: Input file '{args.input}' not found.", file=sys.stderr) sys.exit(1) - assert all(agent in get_available_agents() for agent in args.agents), ( - "Invalid Agent(s): {}".format( - ", ".join(set(args.agents) - set(get_available_agents())) - ) + assert all( + agent in get_available_agents() for agent in args.agents + ), "Invalid Agent(s): {}".format( + ", ".join(set(args.agents) - set(get_available_agents())) ) logger.info("Evaluating tools: {}", ", ".join(args.agents)) @@ -78,47 +96,73 @@ async def main(): ) repo_map, summary = prepare_worktrees( - problem_sets, base_dir="/tmp/repos_batch", batch_no=nbatch, max_workers=6 + problem_sets, + base_dir="/tmp/repos_batch", + batch_no=nbatch, + max_workers=6, + task=args.task, ) # repo_dict = worktree_results - # Don't pass expected answers to the tools - problem_sets_without_expected_answers = problem_sets.remove_columns( - "expected_answer" - ) - - result_awaitables = [] - for agent in args.agents: - if agent == "potpie": - result_awaitables.append( - get_all_st_answers_potpie( - problem_sets_without_expected_answers, repo_map + # Select metric and generation function based on task type + if args.task == "codegen": + metric = code_correctness + expected_column = "patch" # SWE-bench uses "patch" for expected output + input_column = "problem_statement" + + # Don't pass expected output to the tools + problem_sets_without_expected = problem_sets.remove_columns(expected_column) + + result_awaitables = [] + for agent in args.agents: + if agent == "potpie": + result_awaitables.append( + get_all_codegen_answers_potpie( + problem_sets_without_expected, repo_map + ) + ) + else: + ... 
+    else:  # args.task == "qa"
+        metric = qa_correctness
+        expected_column = "expected_answer"
+        input_column = "question"
+
+        # Don't pass expected answers to the tools
+        problem_sets_without_expected = problem_sets.remove_columns(expected_column)
+
+        result_awaitables = []
+        for agent in args.agents:
+            if agent == "potpie":
+                result_awaitables.append(
+                    get_all_st_answers_potpie(
+                        problem_sets_without_expected, repo_map, task=args.task
+                    )
                 )
-            )
-        else:
-            ...
+            else:
+                ...
 
     answers = await asyncio.gather(*result_awaitables)
     print(f"len answers : {len(answers[0])}")
 
-    expected_answers = problem_sets.select_columns("expected_answer")
-    questions = problem_sets.select_columns("question")
-    questions_batched = [item["question"] for item in questions for _ in range(nbatch)]
+    expected_outputs = problem_sets.select_columns(expected_column)
+    inputs = problem_sets.select_columns(input_column)
+    inputs_batched = [item[input_column] for item in inputs for _ in range(nbatch)]
     answers_flattened = [item for sublist in answers for item in sublist]
-    expected_answers_batched = [
-        item["expected_answer"] for item in expected_answers for _ in range(nbatch)
+    expected_outputs_batched = [
+        item[expected_column] for item in expected_outputs for _ in range(nbatch)
     ]
 
     test_cases = [
         LLMTestCase(
-            input=question, actual_output=answer, expected_output=expected_answer
+            input=input_text, actual_output=answer, expected_output=expected_output
         )
-        for question, answer, expected_answer in zip(
-            questions_batched, answers_flattened, expected_answers_batched
+        for input_text, answer, expected_output in zip(
+            inputs_batched, answers_flattened, expected_outputs_batched
         )
     ]
     results = evaluate(
-        metrics=[correctness],
+        metrics=[metric],
         test_cases=test_cases,
     )
     metrics = [test_result.success for test_result in results.test_results]
diff --git a/benchmark/metrics.py b/benchmark/metrics.py
index fe588ef5..e2ace7e7 100644
--- a/benchmark/metrics.py
+++ b/benchmark/metrics.py
@@ -45,7 +45,38 @@
 Do NOT include any text outside the JSON object.
 """
 
-correctness = GEval(
+CODE_EVAL_CRITERIA = """
+You are a senior software engineer evaluating a code fix or implementation.
+Your task is to assess the quality and correctness of the GENERATED CODE compared to the EXPECTED SOLUTION (or based on the PROBLEM DESCRIPTION).
+
+Evaluate using the following criteria:
+1. Functional Correctness (50%): Does the code solve the stated problem?
+2. Code Quality & Best Practices (30%): Is the code idiomatic, safe, and maintainable?
+3. Logic Similarity (20%): Does it follow a similar (or better) valid approach to the reference solution?
+
+Output ONLY a JSON object with:
+{
+  "score": ,
+  "reason": ""
+}
+
+Where:
+- 0.0 = The generated code fails to address the problem or is fundamentally incorrect.
+- 1.0 = The generated code completely solves the problem with high quality and matches the reference approach or improves upon it.
+
+Scoring Rules:
+- Functional Correctness is paramount. If the code doesn't solve the problem, the score should be low (<0.5).
+- Code Quality matters. Correct but messy code should be penalized slightly.
+- Logic Similarity allows for different valid approaches, but they must solve the same root cause.
+
+Your final output must contain:
+- A numeric "score" between 0 and 1.
+- A short "reason" describing the main factors affecting the score.
+
+Do NOT include any text outside the JSON object.
+""" + +qa_correctness = GEval( name="Correctness", criteria=EVAL_CRITERIA, evaluation_params=[ @@ -54,3 +85,14 @@ ], strict_mode=True, ) + +code_correctness = GEval( + name="CodeCorrectness", + criteria=CODE_EVAL_CRITERIA, + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + strict_mode=True, +) diff --git a/benchmark/potpie.py b/benchmark/potpie.py index 2142990c..f3830365 100644 --- a/benchmark/potpie.py +++ b/benchmark/potpie.py @@ -261,14 +261,16 @@ async def send_message( coordinator: ReadinessCoordinator, project_id: str, content: str, + agent_id: str = "codebase_qna_agent", ) -> str: await coordinator.wait_ready(project_id) - return await client.send_message(project_id, content) + return await client.send_message(project_id, content, agent_id) async def get_all_st_answers( problems: list[dict[str, str]], repo_dict: dict[tuple[str, str], dict[tuple[str, int], Path]], + task: str = "qa", ): project_cache = diskcache.Cache("project_cache") user_id = os.environ["defaultUsername"] @@ -282,9 +284,16 @@ async def get_all_st_answers( question_tasks: list[Awaitable[str]] = [] try: for problem in problems: - repo_url = problem["repo_url"] - commit_id = problem["commit_id"] - problem_id = problem["problem_id"] + if task == "codegen": + # SWE-bench format: "django/django" -> "https://github.com/django/django" + repo_url = f"https://github.com/{problem['repo']}" + commit_id = problem["base_commit"] + problem_id = problem["instance_id"] + else: + repo_url = problem["repo_url"] + commit_id = problem["commit_id"] + problem_id = problem["problem_id"] + worktree_maps = repo_dict[(repo_url, commit_id)] # select a repo for project id caching # All worktrees are the same for now @@ -330,6 +339,98 @@ async def get_all_st_answers( await worker_task +async def get_all_codegen_answers( + problems: list[dict[str, str]], + repo_dict: dict[tuple[str, str], dict[tuple[str, int], Path]], +): + """Get code generation answers from Potpie using the code_generation_agent. + + Important: Unlike QA, code generation requires SEPARATE project_ids for each batch + because the agent modifies files. Each batch must operate on its own isolated worktree + to prevent test contamination. 
+
+    Args:
+        problems: List of problems with repo, base_commit, instance_id, and problem_statement
+        repo_dict: Dictionary mapping (repo_url, commit_id) to {(problem_id, batch_idx): worktree_path}
+
+    Returns:
+        List of generated code/patch strings
+    """
+    project_cache = diskcache.Cache("project_cache_codegen")
+    user_id = os.environ["defaultUsername"]
+    user_token = os.environ["INTERNAL_ADMIN_SECRET"]
+    coordinator = ReadinessCoordinator()
+    queue: asyncio.Queue[str] = asyncio.Queue()
+
+    async with PotpieClient(user_id=user_id, user_token=user_token) as client:
+        worker = PollingWorker(client, coordinator, queue)
+        worker_task = asyncio.create_task(worker.run())
+        codegen_tasks: list[Awaitable[str]] = []
+        try:
+            for problem in problems:
+                repo_url = f"https://github.com/{problem['repo']}"
+                commit_id = problem["base_commit"]
+                problem_id = problem["instance_id"]
+                worktree_maps = repo_dict[(repo_url, commit_id)]
+
+                # For code generation: Parse EACH worktree separately to get independent project_ids
+                i = 0
+                while (problem_id, i) in worktree_maps:
+                    repo_path = worktree_maps[(problem_id, i)]
+                    cache_key = f"{repo_url}_{commit_id}_{problem_id}_{i}"
+                    cached_project_id = project_cache.get(cache_key)
+                    project_id = None
+
+                    if cached_project_id is not None:
+                        existing_projects = await client.get_available_projects()
+                        existing_project_ids = {
+                            project["id"] for project in existing_projects
+                        }
+                        if cached_project_id in existing_project_ids:
+                            logger.info(
+                                "Using cached project_id for worktree",
+                                project_id=cached_project_id,
+                                repo_url=repo_url,
+                                commit_id=commit_id,
+                                problem_id=problem_id,
+                                batch=i,
+                            )
+                            project_id = str(cached_project_id)
+
+                    if project_id is None:
+                        project_id = await client.post_parse(commit_id, repo_path)
+                        project_cache[cache_key] = project_id
+                        logger.info(
+                            "Parsed new worktree for code generation",
+                            project_id=project_id,
+                            repo_url=repo_url,
+                            problem_id=problem_id,
+                            batch=i,
+                        )
+
+                    await _enqueue_project(coordinator, queue, project_id)
+
+                    # Each task uses its own project_id (pointing to its own worktree)
+                    codegen_tasks.append(
+                        send_message(
+                            client,
+                            coordinator,
+                            project_id,
+                            problem["problem_statement"],
+                            agent_id="code_generation_agent",
+                        )
+                    )
+                    i += 1
+
+            answers = await asyncio.gather(*codegen_tasks)
+            return answers
+        finally:
+            await worker.stop()
+            worker_task.cancel()
+            with contextlib.suppress(asyncio.CancelledError):
+                await worker_task
+
+
 if __name__ == "__main__":
     from dotenv import load_dotenv
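
Note on the dual row format: a minimal sketch (not part of the patch; the repo names, commits, and IDs below are made-up example values) of how the updated group_rows_by_repo groups QA-style and SWE-bench-style rows, assuming the benchmark package and its dependencies are importable:

    from benchmark.download import group_rows_by_repo

    rows = [
        # QA format: explicit repo URL plus commit_id/problem_id
        {"repo_url": "https://github.com/org/qa-repo", "commit_id": "abc123", "problem_id": "q1"},
        # SWE-bench (codegen) format: "owner/name" slug plus base_commit/instance_id
        {"repo": "django/django", "base_commit": "def456", "instance_id": "django__django-12345"},
    ]

    grouped = group_rows_by_repo(rows)
    # {
    #     "https://github.com/org/qa-repo": [("abc123", "q1")],
    #     "https://github.com/django/django": [("def456", "django__django-12345")],
    # }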
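For the codegen path, each (problem, batch) pair gets its own worktree and its own parsed project, so the shapes exchanged between prepare_worktrees and get_all_codegen_answers look roughly like this. This is a sketch with hypothetical values and worktree paths; actually running it requires a reachable Potpie backend and the defaultUsername and INTERNAL_ADMIN_SECRET environment variables:

    import asyncio
    from pathlib import Path

    from benchmark.potpie import get_all_codegen_answers

    # One SWE-bench style problem (hypothetical values).
    problems = [
        {
            "repo": "django/django",
            "base_commit": "def456",
            "instance_id": "django__django-12345",
            "problem_statement": "Describe the bug to fix ...",
        }
    ]

    # (repo_url, commit_id) -> {(problem_id, batch_idx): worktree_path},
    # i.e. one isolated worktree per batch copy (paths are hypothetical).
    repo_dict = {
        ("https://github.com/django/django", "def456"): {
            ("django__django-12345", 0): Path("/tmp/repos_batch/worktrees/example-0"),
            ("django__django-12345", 1): Path("/tmp/repos_batch/worktrees/example-1"),
        }
    }

    # Yields one generated answer per (problem, batch) pair, ordered by problem then batch,
    # which is the order evaluate.py expects when it repeats each input nbatch times.
    answers = asyncio.run(get_all_codegen_answers(problems, repo_dict))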