30 changes: 22 additions & 8 deletions benchmark/download.py
@@ -144,18 +144,30 @@ def add_worktree_for_commit(

def group_rows_by_repo(
rows: Iterable[dict[str, str]],
task: str = "qa",
) -> dict[str, list[Tuple[str, str]]]:
"""
Group rows by repo_url.
Input row should have keys: 'repo_url', 'commit_id', 'problem_id'
Group rows by repo URL.

Args:
rows: Iterable of row dictionaries
task: "qa" for legacy format, "codegen" for SWE-bench format

Returns mapping: repo_url -> list of (commit_id, problem_id)
"""
mapping: dict[str, list[Tuple[str, str]]] = {}
for r in rows:
repo = r["repo_url"]
commit = r["commit_id"]
prob = r["problem_id"]
mapping.setdefault(repo, []).append((commit, prob))
if task == "codegen":

Review comment: Instead of using task, can you just check if it's a URL or a repo slug?
(A sketch of such a check appears after this function.)

# SWE-bench format: "django/django" -> "https://github.com/django/django"
repo_url = f"https://github.com/{r['repo']}"
commit = r["base_commit"]
prob = r["instance_id"]
else:
repo_url = r["repo_url"]
commit = r["commit_id"]
prob = r["problem_id"]

mapping.setdefault(repo_url, []).append((commit, prob))
return mapping
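
Note: a minimal sketch of the reviewer's suggestion above, inferring the row format from the repo field itself instead of a task flag. The helper name group_rows_by_repo_auto is hypothetical; the field names come from the two formats shown in this diff.

from typing import Iterable, Tuple

def group_rows_by_repo_auto(
    rows: Iterable[dict[str, str]],
) -> dict[str, list[Tuple[str, str]]]:
    """Group rows by repo URL, detecting whether the repo field is a URL or a slug."""
    mapping: dict[str, list[Tuple[str, str]]] = {}
    for r in rows:
        repo = r.get("repo_url") or r.get("repo", "")
        if repo.startswith(("http://", "https://")):
            # QA rows already carry a full URL plus commit_id / problem_id.
            repo_url, commit, prob = repo, r["commit_id"], r["problem_id"]
        else:
            # SWE-bench rows carry an "owner/name" slug plus base_commit / instance_id.
            repo_url = f"https://github.com/{repo}"
            commit, prob = r["base_commit"], r["instance_id"]
        mapping.setdefault(repo_url, []).append((commit, prob))
    return mapping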


@@ -273,14 +285,16 @@ def prepare_worktrees(
batch_no: int,
max_workers: int = 8,
skip_if_exists: bool = True,
task: str = "qa",
) -> tuple[dict[tuple[str, str], dict[tuple[str, int], Path]], list[dict[str, Any]]]:
"""
rows: iterable of dicts with keys 'repo_url','commit_id','problem_id'
rows: iterable of dicts with keys 'repo_url','commit_id','problem_id' (QA) or 'repo','base_commit','instance_id' (codegen)
base_dir: root directory where we will create:
- base_dir/bare/<repo_name>.git
- base_dir/worktrees/<repo_name>/<worktree_name>
batch_no: number of batch copies to create per (commit,problem)
max_workers: number of threads for parallel repo processing
task: "qa" for legacy format, "codegen" for SWE-bench format

Review comment: "qa" is not a legacy format.

Returns a list of per-repo summaries.
"""
logger.bind(base_dir=base_dir, batch_no=batch_no, max_worker=max_workers).info(
@@ -289,7 +303,7 @@ def prepare_worktrees(
base = Path(base_dir).resolve()
base.mkdir(parents=True, exist_ok=True)

grouped = group_rows_by_repo(rows)
grouped = group_rows_by_repo(rows, task=task)
summaries = []
repo_map: dict[tuple[str, str], dict[tuple[str, int], Path]] = {}

106 changes: 75 additions & 31 deletions benchmark/evaluate.py
@@ -10,8 +10,11 @@
from loguru import logger

from .download import prepare_worktrees
from .metrics import correctness
from .potpie import get_all_st_answers as get_all_st_answers_potpie
from .metrics import code_correctness, correctness
from .potpie import (
get_all_codegen_answers as get_all_codegen_answers_potpie,
get_all_st_answers as get_all_st_answers_potpie,
)


def get_available_agents() -> list[str]:
@@ -39,6 +42,14 @@ async def main():
help="List of agents to evaluate",
)

parser.add_argument(
"--task",
type=str,
choices=["qa", "codegen"],
default="qa",
help="Type of evaluation to perform (default: qa)",
)

parser.add_argument(
"--input",
type=str,
@@ -53,18 +64,25 @@
help="Output CSV file for results (default: evaluation_results.csv)",
)

parser.add_argument(
"--batch-size",
type=int,
default=2,
help="Number of batch copies to create per problem (default: 2)",
)

args = parser.parse_args()

nbatch = 2
nbatch = args.batch_size

if not Path(args.input).exists():
print(f"Error: Input file '{args.input}' not found.", file=sys.stderr)
sys.exit(1)

assert all(agent in get_available_agents() for agent in args.agents), (
"Invalid Agent(s): {}".format(
", ".join(set(args.agents) - set(get_available_agents()))
)
assert all(
agent in get_available_agents() for agent in args.agents
), "Invalid Agent(s): {}".format(
", ".join(set(args.agents) - set(get_available_agents()))
)

logger.info("Evaluating tools: {}", ", ".join(args.agents))
@@ -78,47 +96,73 @@ async def main():
)

repo_map, summary = prepare_worktrees(
problem_sets, base_dir="/tmp/repos_batch", batch_no=nbatch, max_workers=6
problem_sets,
base_dir="/tmp/repos_batch",
batch_no=nbatch,
max_workers=6,
task=args.task,
)
# repo_dict = worktree_results

# Don't pass expected answers to the tools
problem_sets_without_expected_answers = problem_sets.remove_columns(
"expected_answer"
)

result_awaitables = []
for agent in args.agents:
if agent == "potpie":
result_awaitables.append(
get_all_st_answers_potpie(
problem_sets_without_expected_answers, repo_map
# Select metric and generation function based on task type
if args.task == "codegen":
metric = code_correctness
expected_column = "patch" # SWE-bench uses "patch" for expected output
input_column = "problem_statement"

# Don't pass expected output to the tools
problem_sets_without_expected = problem_sets.remove_columns(expected_column)

result_awaitables = []
for agent in args.agents:
if agent == "potpie":
result_awaitables.append(
get_all_codegen_answers_potpie(
problem_sets_without_expected, repo_map
)
)
else:
...
else: # args.task == "qa"
metric = correctness
expected_column = "expected_answer"
input_column = "question"

# Don't pass expected answers to the tools
problem_sets_without_expected = problem_sets.remove_columns(expected_column)

result_awaitables = []
for agent in args.agents:
if agent == "potpie":
result_awaitables.append(
get_all_st_answers_potpie(
problem_sets_without_expected, repo_map, task=args.task
)
)
)
else:
...
else:
...
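
Note: the two task branches above each pick a metric, an expected-output column, and an input column; a hedged sketch of that selection as a lookup table (the helper name is illustrative, and the import path benchmark.metrics is assumed from this PR's relative import).

from benchmark.metrics import code_correctness, correctness

def select_task_config(task: str):
    """Return (metric, expected_column, input_column) for a benchmark task."""
    table = {
        "codegen": (code_correctness, "patch", "problem_statement"),
        "qa": (correctness, "expected_answer", "question"),
    }
    return table[task]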

answers = await asyncio.gather(*result_awaitables)
print(f"len answers : {len(answers[0])}")
expected_answers = problem_sets.select_columns("expected_answer")
questions = problem_sets.select_columns("question")
questions_batched = [item["question"] for item in questions for _ in range(nbatch)]
expected_outputs = problem_sets.select_columns(expected_column)
inputs = problem_sets.select_columns(input_column)
inputs_batched = [item[input_column] for item in inputs for _ in range(nbatch)]
answers_flattened = [item for sublist in answers for item in sublist]

expected_answers_batched = [
item["expected_answer"] for item in expected_answers for _ in range(nbatch)
expected_outputs_batched = [
item[expected_column] for item in expected_outputs for _ in range(nbatch)
]
test_cases = [
LLMTestCase(
input=question, actual_output=answer, expected_output=expected_answer
input=input_text, actual_output=answer, expected_output=expected_output
)
for question, answer, expected_answer in zip(
questions_batched, answers_flattened, expected_answers_batched
for input_text, answer, expected_output in zip(
inputs_batched, answers_flattened, expected_outputs_batched
)
]
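
Note: a small worked sketch of the batching alignment above, assuming each agent returns its answers per problem with the batch copies adjacent, as the zip implies. Values are illustrative.

nbatch = 2
inputs = ["q1", "q2"]

inputs_batched = [q for q in inputs for _ in range(nbatch)]
# -> ["q1", "q1", "q2", "q2"]

answers = [["a1_run1", "a1_run2", "a2_run1", "a2_run2"]]  # one agent's answers
answers_flattened = [a for sub in answers for a in sub]

# zip pairs each repeated input with the corresponding answer run.
assert list(zip(inputs_batched, answers_flattened)) == [
    ("q1", "a1_run1"), ("q1", "a1_run2"), ("q2", "a2_run1"), ("q2", "a2_run2"),
]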

results = evaluate(
metrics=[correctness],
metrics=[metric],
test_cases=test_cases,
)
metrics = [test_result.success for test_result in results.test_results]
42 changes: 42 additions & 0 deletions benchmark/metrics.py
@@ -45,6 +45,37 @@
Do NOT include any text outside the JSON object.
"""

CODE_EVAL_CRITERIA = """
You are a senior software engineer evaluating a code fix or implementation.
Your task is to assess the quality and correctness of the GENERATED CODE compared to the EXPECTED SOLUTION (or based on the PROBLEM DESCRIPTION).
Evaluate using the following criteria:
1. Functional Correctness (50%): Does the code solve the stated problem?
2. Code Quality & Best Practices (30%): Is the code idiomatic, safe, and maintainable?
3. Logic Similarity (20%): Does it follow a similar (or better) valid approach to the reference solution?
Output ONLY a JSON object with:
{
"score": <number_between_0_and_1>,
"reason": "<brief explanation>"
}
Where:
- 0.0 = The generated code fails to address the problem or is fundamentally incorrect.
- 1.0 = The generated code completely solves the problem with high quality and matches the reference approach or improves upon it.
Scoring Rules:
- Functional Correctness is paramount. If the code doesn't solve the problem, the score should be low (<0.5).
- Code Quality matters. Correct but messy code should be penalized slightly.
- Logic Similarity allows for different valid approaches, but they must solve the same root cause.
Your final output must contain:
- A numeric "score" between 0 and 1.
- A short "reason" describing the main factors affecting the score.
Do NOT include any text outside the JSON object.
"""

correctness = GEval(

Review comment: modify it as qa correctness?

name="Correctness",
criteria=EVAL_CRITERIA,
@@ -54,3 +85,14 @@
],
strict_mode=True,
)

code_correctness = GEval(
name="CodeCorrectness",
criteria=CODE_EVAL_CRITERIA,
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT,
],
strict_mode=True,
)
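
Note: a hedged standalone usage sketch for the new metric. The patch strings are placeholders, the import path benchmark.metrics is assumed, and running it requires a judge LLM to be configured for deepeval.

from deepeval.test_case import LLMTestCase

from benchmark.metrics import code_correctness

tc = LLMTestCase(
    input="Pagination drops the last page; fix the off-by-one in page slicing.",
    actual_output="<generated patch>",
    expected_output="<reference patch>",
)

code_correctness.measure(tc)        # scores the test case with the LLM judge
print(code_correctness.score, code_correctness.reason)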