diff --git a/benchmark/download.py b/benchmark/download.py
index cfc8d43e..6938e12d 100644
--- a/benchmark/download.py
+++ b/benchmark/download.py
@@ -144,18 +144,32 @@ def add_worktree_for_commit(
 
 
 def group_rows_by_repo(
     rows: Iterable[dict[str, str]],
+    task: str = "qa",
 ) -> dict[str, list[Tuple[str, str]]]:
     """
-    Group rows by repo_url.
-    Input row should have keys: 'repo_url', 'commit_id', 'problem_id'
+    Group rows by repo URL.
+
+    Args:
+        rows: Iterable of row dictionaries
+        task: "qa" or "codegen"
+    Returns mapping: repo_url -> list of (commit_id, problem_id)
     """
     mapping: dict[str, list[Tuple[str, str]]] = {}
     for r in rows:
-        repo = r["repo_url"]
-        commit = r["commit_id"]
-        prob = r["problem_id"]
-        mapping.setdefault(repo, []).append((commit, prob))
+        repo_url_value = r.get("repo_url", "")
+        is_url_format = repo_url_value.startswith(("http://", "https://"))
+
+        if is_url_format:
+            repo_url = r["repo_url"]
+            commit = r["commit_id"]
+            prob = r["problem_id"]
+        else:
+            repo_url = f"https://github.com/{r['repo']}"
+            commit = r["base_commit"]
+            prob = r["instance_id"]
+
+        mapping.setdefault(repo_url, []).append((commit, prob))
     return mapping
 
 
@@ -273,14 +287,16 @@ def prepare_worktrees(
     batch_no: int,
     max_workers: int = 8,
     skip_if_exists: bool = True,
+    task: str = "qa",
 ) -> tuple[dict[tuple[str, str], dict[tuple[str, int], Path]], list[dict[str, Any]]]:
     """
-    rows: iterable of dicts with keys 'repo_url','commit_id','problem_id'
+    rows: iterable of dicts with keys 'repo_url','commit_id','problem_id' (URL format) or 'repo','base_commit','instance_id' (slug format)
     base_dir: root directory where we will create:
       - base_dir/bare/.git
      - base_dir/worktrees//
     batch_no: number of batch copies to create per (commit,problem)
     max_workers: number of threads for parallel repo processing
+    task: Deprecated, kept for backward compatibility. Auto-detects format.
     Returns a list of per-repo summaries.
""" logger.bind(base_dir=base_dir, batch_no=batch_no, max_worker=max_workers).info( @@ -289,7 +305,7 @@ def prepare_worktrees( base = Path(base_dir).resolve() base.mkdir(parents=True, exist_ok=True) - grouped = group_rows_by_repo(rows) + grouped = group_rows_by_repo(rows, task=task) summaries = [] repo_map: dict[tuple[str, str], dict[tuple[str, int], Path]] = {} diff --git a/benchmark/evaluate.py b/benchmark/evaluate.py index 1fa4673d..8fc930d4 100644 --- a/benchmark/evaluate.py +++ b/benchmark/evaluate.py @@ -10,8 +10,11 @@ from loguru import logger from .download import prepare_worktrees -from .metrics import correctness -from .potpie import get_all_st_answers as get_all_st_answers_potpie +from .metrics import code_correctness, qa_correctness +from .potpie import ( + get_all_codegen_answers as get_all_codegen_answers_potpie, + get_all_st_answers as get_all_st_answers_potpie, +) def get_available_agents() -> list[str]: @@ -39,6 +42,14 @@ async def main(): help="List of agents to evaluate", ) + parser.add_argument( + "--task", + type=str, + choices=["qa", "codegen"], + default="qa", + help="Type of evaluation to perform (default: qa)", + ) + parser.add_argument( "--input", type=str, @@ -53,18 +64,25 @@ async def main(): help="Output CSV file for results (default: evaluation_results.csv)", ) + parser.add_argument( + "--batch-size", + type=int, + default=2, + help="Number of batch copies to create per problem (default: 2)", + ) + args = parser.parse_args() - nbatch = 2 + nbatch = args.batch_size if not Path(args.input).exists(): print(f"Error: Input file '{args.input}' not found.", file=sys.stderr) sys.exit(1) - assert all(agent in get_available_agents() for agent in args.agents), ( - "Invalid Agent(s): {}".format( - ", ".join(set(args.agents) - set(get_available_agents())) - ) + assert all( + agent in get_available_agents() for agent in args.agents + ), "Invalid Agent(s): {}".format( + ", ".join(set(args.agents) - set(get_available_agents())) ) logger.info("Evaluating tools: {}", ", ".join(args.agents)) @@ -78,47 +96,73 @@ async def main(): ) repo_map, summary = prepare_worktrees( - problem_sets, base_dir="/tmp/repos_batch", batch_no=nbatch, max_workers=6 + problem_sets, + base_dir="/tmp/repos_batch", + batch_no=nbatch, + max_workers=6, + task=args.task, ) # repo_dict = worktree_results - # Don't pass expected answers to the tools - problem_sets_without_expected_answers = problem_sets.remove_columns( - "expected_answer" - ) - - result_awaitables = [] - for agent in args.agents: - if agent == "potpie": - result_awaitables.append( - get_all_st_answers_potpie( - problem_sets_without_expected_answers, repo_map + # Select metric and generation function based on task type + if args.task == "codegen": + metric = code_correctness + expected_column = "patch" # SWE-bench uses "patch" for expected output + input_column = "problem_statement" + + # Don't pass expected output to the tools + problem_sets_without_expected = problem_sets.remove_columns(expected_column) + + result_awaitables = [] + for agent in args.agents: + if agent == "potpie": + result_awaitables.append( + get_all_codegen_answers_potpie( + problem_sets_without_expected, repo_map + ) + ) + else: + ... 
+    else:  # args.task == "qa"
+        metric = qa_correctness
+        expected_column = "expected_answer"
+        input_column = "question"
+
+        # Don't pass expected answers to the tools
+        problem_sets_without_expected = problem_sets.remove_columns(expected_column)
+
+        result_awaitables = []
+        for agent in args.agents:
+            if agent == "potpie":
+                result_awaitables.append(
+                    get_all_st_answers_potpie(
+                        problem_sets_without_expected, repo_map, task=args.task
+                    )
                 )
-            )
-        else:
-            ...
+            else:
+                ...
 
     answers = await asyncio.gather(*result_awaitables)
     print(f"len answers : {len(answers[0])}")
 
-    expected_answers = problem_sets.select_columns("expected_answer")
-    questions = problem_sets.select_columns("question")
-    questions_batched = [item["question"] for item in questions for _ in range(nbatch)]
+    expected_outputs = problem_sets.select_columns(expected_column)
+    inputs = problem_sets.select_columns(input_column)
+    inputs_batched = [item[input_column] for item in inputs for _ in range(nbatch)]
     answers_flattened = [item for sublist in answers for item in sublist]
-    expected_answers_batched = [
-        item["expected_answer"] for item in expected_answers for _ in range(nbatch)
+    expected_outputs_batched = [
+        item[expected_column] for item in expected_outputs for _ in range(nbatch)
     ]
 
     test_cases = [
         LLMTestCase(
-            input=question, actual_output=answer, expected_output=expected_answer
+            input=input_text, actual_output=answer, expected_output=expected_output
         )
-        for question, answer, expected_answer in zip(
-            questions_batched, answers_flattened, expected_answers_batched
+        for input_text, answer, expected_output in zip(
+            inputs_batched, answers_flattened, expected_outputs_batched
         )
     ]
     results = evaluate(
-        metrics=[correctness],
+        metrics=[metric],
         test_cases=test_cases,
     )
     metrics = [test_result.success for test_result in results.test_results]
diff --git a/benchmark/metrics.py b/benchmark/metrics.py
index fe588ef5..e2ace7e7 100644
--- a/benchmark/metrics.py
+++ b/benchmark/metrics.py
@@ -45,7 +45,38 @@
 Do NOT include any text outside the JSON object.
 """
 
-correctness = GEval(
+CODE_EVAL_CRITERIA = """
+You are a senior software engineer evaluating a code fix or implementation.
+Your task is to assess the quality and correctness of the GENERATED CODE compared to the EXPECTED SOLUTION (or based on the PROBLEM DESCRIPTION).
+
+Evaluate using the following criteria:
+1. Functional Correctness (50%): Does the code solve the stated problem?
+2. Code Quality & Best Practices (30%): Is the code idiomatic, safe, and maintainable?
+3. Logic Similarity (20%): Does it follow a similar (or better) valid approach to the reference solution?
+
+Output ONLY a JSON object with:
+{
+  "score": ,
+  "reason": ""
+}
+
+Where:
+- 0.0 = The generated code fails to address the problem or is fundamentally incorrect.
+- 1.0 = The generated code completely solves the problem with high quality and matches the reference approach or improves upon it.
+
+Scoring Rules:
+- Functional Correctness is paramount. If the code doesn't solve the problem, the score should be low (<0.5).
+- Code Quality matters. Correct but messy code should be penalized slightly.
+- Logic Similarity allows for different valid approaches, but they must solve the same root cause.
+
+Your final output must contain:
+- A numeric "score" between 0 and 1.
+- A short "reason" describing the main factors affecting the score.
+
+Do NOT include any text outside the JSON object.
+""" + +qa_correctness = GEval( name="Correctness", criteria=EVAL_CRITERIA, evaluation_params=[ @@ -54,3 +85,14 @@ ], strict_mode=True, ) + +code_correctness = GEval( + name="CodeCorrectness", + criteria=CODE_EVAL_CRITERIA, + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + strict_mode=True, +) diff --git a/benchmark/potpie.py b/benchmark/potpie.py index 2142990c..f3830365 100644 --- a/benchmark/potpie.py +++ b/benchmark/potpie.py @@ -261,14 +261,16 @@ async def send_message( coordinator: ReadinessCoordinator, project_id: str, content: str, + agent_id: str = "codebase_qna_agent", ) -> str: await coordinator.wait_ready(project_id) - return await client.send_message(project_id, content) + return await client.send_message(project_id, content, agent_id) async def get_all_st_answers( problems: list[dict[str, str]], repo_dict: dict[tuple[str, str], dict[tuple[str, int], Path]], + task: str = "qa", ): project_cache = diskcache.Cache("project_cache") user_id = os.environ["defaultUsername"] @@ -282,9 +284,16 @@ async def get_all_st_answers( question_tasks: list[Awaitable[str]] = [] try: for problem in problems: - repo_url = problem["repo_url"] - commit_id = problem["commit_id"] - problem_id = problem["problem_id"] + if task == "codegen": + # SWE-bench format: "django/django" -> "https://github.com/django/django" + repo_url = f"https://github.com/{problem['repo']}" + commit_id = problem["base_commit"] + problem_id = problem["instance_id"] + else: + repo_url = problem["repo_url"] + commit_id = problem["commit_id"] + problem_id = problem["problem_id"] + worktree_maps = repo_dict[(repo_url, commit_id)] # select a repo for project id caching # All worktrees are the same for now @@ -330,6 +339,98 @@ async def get_all_st_answers( await worker_task +async def get_all_codegen_answers( + problems: list[dict[str, str]], + repo_dict: dict[tuple[str, str], dict[tuple[str, int], Path]], +): + """Get code generation answers from Potpie using the code_generation_agent. + + Important: Unlike QA, code generation requires SEPARATE project_ids for each batch + because the agent modifies files. Each batch must operate on its own isolated worktree + to prevent test contamination. 
+
+    Args:
+        problems: List of problems with repo, base_commit, instance_id, and problem_statement
+        repo_dict: Dictionary mapping (repo_url, commit_id) to {(problem_id, batch_idx): worktree_path}
+
+    Returns:
+        List of generated code/patch strings
+    """
+    project_cache = diskcache.Cache("project_cache_codegen")
+    user_id = os.environ["defaultUsername"]
+    user_token = os.environ["INTERNAL_ADMIN_SECRET"]
+    coordinator = ReadinessCoordinator()
+    queue: asyncio.Queue[str] = asyncio.Queue()
+
+    async with PotpieClient(user_id=user_id, user_token=user_token) as client:
+        worker = PollingWorker(client, coordinator, queue)
+        worker_task = asyncio.create_task(worker.run())
+        codegen_tasks: list[Awaitable[str]] = []
+        try:
+            for problem in problems:
+                repo_url = f"https://github.com/{problem['repo']}"
+                commit_id = problem["base_commit"]
+                problem_id = problem["instance_id"]
+                worktree_maps = repo_dict[(repo_url, commit_id)]
+
+                # For code generation: Parse EACH worktree separately to get independent project_ids
+                i = 0
+                while (problem_id, i) in worktree_maps:
+                    repo_path = worktree_maps[(problem_id, i)]
+                    cache_key = f"{repo_url}_{commit_id}_{problem_id}_{i}"
+                    cached_project_id = project_cache.get(cache_key)
+                    project_id = None
+
+                    if cached_project_id is not None:
+                        existing_projects = await client.get_available_projects()
+                        existing_project_ids = {
+                            project["id"] for project in existing_projects
+                        }
+                        if cached_project_id in existing_project_ids:
+                            logger.info(
+                                "Using cached project_id for worktree",
+                                project_id=cached_project_id,
+                                repo_url=repo_url,
+                                commit_id=commit_id,
+                                problem_id=problem_id,
+                                batch=i,
+                            )
+                            project_id = str(cached_project_id)
+
+                    if project_id is None:
+                        project_id = await client.post_parse(commit_id, repo_path)
+                        project_cache[cache_key] = project_id
+                        logger.info(
+                            "Parsed new worktree for code generation",
+                            project_id=project_id,
+                            repo_url=repo_url,
+                            problem_id=problem_id,
+                            batch=i,
+                        )
+
+                    await _enqueue_project(coordinator, queue, project_id)
+
+                    # Each task uses its own project_id (pointing to its own worktree)
+                    codegen_tasks.append(
+                        send_message(
+                            client,
+                            coordinator,
+                            project_id,
+                            problem["problem_statement"],
+                            agent_id="code_generation_agent",
+                        )
+                    )
+                    i += 1
+
+            answers = await asyncio.gather(*codegen_tasks)
+            return answers
+        finally:
+            await worker.stop()
+            worker_task.cancel()
+            with contextlib.suppress(asyncio.CancelledError):
+                await worker_task
+
+
 if __name__ == "__main__":
     from dotenv import load_dotenv
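
Note on the dual row format: a minimal sketch (not part of the patch; the repo names, commits, and IDs below are made-up example values) of how the updated group_rows_by_repo groups QA-style and SWE-bench-style rows, assuming the benchmark package and its dependencies are importable:

    from benchmark.download import group_rows_by_repo

    rows = [
        # QA format: explicit repo URL plus commit_id/problem_id
        {"repo_url": "https://github.com/org/qa-repo", "commit_id": "abc123", "problem_id": "q1"},
        # SWE-bench (codegen) format: "owner/name" slug plus base_commit/instance_id
        {"repo": "django/django", "base_commit": "def456", "instance_id": "django__django-12345"},
    ]

    grouped = group_rows_by_repo(rows)
    # {
    #     "https://github.com/org/qa-repo": [("abc123", "q1")],
    #     "https://github.com/django/django": [("def456", "django__django-12345")],
    # }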
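For the codegen path, each (problem, batch) pair gets its own worktree and its own parsed project, so the shapes exchanged between prepare_worktrees and get_all_codegen_answers look roughly like this. This is a sketch with hypothetical values and worktree paths; actually running it requires a reachable Potpie backend and the defaultUsername and INTERNAL_ADMIN_SECRET environment variables:

    import asyncio
    from pathlib import Path

    from benchmark.potpie import get_all_codegen_answers

    # One SWE-bench style problem (hypothetical values).
    problems = [
        {
            "repo": "django/django",
            "base_commit": "def456",
            "instance_id": "django__django-12345",
            "problem_statement": "Describe the bug to fix ...",
        }
    ]

    # (repo_url, commit_id) -> {(problem_id, batch_idx): worktree_path},
    # i.e. one isolated worktree per batch copy (paths are hypothetical).
    repo_dict = {
        ("https://github.com/django/django", "def456"): {
            ("django__django-12345", 0): Path("/tmp/repos_batch/worktrees/example-0"),
            ("django__django-12345", 1): Path("/tmp/repos_batch/worktrees/example-1"),
        }
    }

    # Yields one generated answer per (problem, batch) pair, ordered by problem then batch,
    # which is the order evaluate.py expects when it repeats each input nbatch times.
    answers = asyncio.run(get_all_codegen_answers(problems, repo_dict))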