32 changes: 24 additions & 8 deletions benchmark/download.py
@@ -144,18 +144,32 @@

def group_rows_by_repo(
rows: Iterable[dict[str, str]],
task: str = "qa",

SonarCloud Code Analysis check warning (line 147 in benchmark/download.py):
Remove the unused function parameter "task".
See https://sonarcloud.io/project/issues?id=getmomentum_momentum-server&issues=AZqlZu89XmODmi4MbjdB&open=AZqlZu89XmODmi4MbjdB&pullRequest=497
) -> dict[str, list[Tuple[str, str]]]:
"""
Group rows by repo_url.
Input row should have keys: 'repo_url', 'commit_id', 'problem_id'
Group rows by repo URL.

Args:
rows: Iterable of row dictionaries
task: "qa" or "codegen"

Returns mapping: repo_url -> list of (commit_id, problem_id)
"""
mapping: dict[str, list[Tuple[str, str]]] = {}
for r in rows:
repo = r["repo_url"]
commit = r["commit_id"]
prob = r["problem_id"]
mapping.setdefault(repo, []).append((commit, prob))
repo_url_value = r.get("repo_url", "")
is_url_format = repo_url_value.startswith(("http://", "https://"))

if is_url_format:
repo_url = r["repo_url"]
commit = r["commit_id"]
prob = r["problem_id"]
else:
repo_url = f"https://github.com/{r['repo']}"
commit = r["base_commit"]
prob = r["instance_id"]

mapping.setdefault(repo_url, []).append((commit, prob))
return mapping
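
For orientation, a minimal usage sketch of the updated grouping logic (the repository, commits, and IDs below are invented for illustration): a URL-format row and a SWE-bench-style slug row end up under the same repository key, and the unused task parameter can be left at its default.

from benchmark.download import group_rows_by_repo

rows = [
    {   # URL-format row: 'repo_url', 'commit_id', 'problem_id'
        "repo_url": "https://github.com/example/project",
        "commit_id": "abc123",
        "problem_id": "q-001",
    },
    {   # slug-format row (SWE-bench style): 'repo', 'base_commit', 'instance_id'
        "repo": "example/project",
        "base_commit": "def456",
        "instance_id": "example__project-42",
    },
]

mapping = group_rows_by_repo(rows)
# Both rows are grouped under the normalized URL:
# {"https://github.com/example/project": [("abc123", "q-001"), ("def456", "example__project-42")]}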


@@ -273,14 +287,16 @@
batch_no: int,
max_workers: int = 8,
skip_if_exists: bool = True,
task: str = "qa",
) -> tuple[dict[tuple[str, str], dict[tuple[str, int], Path]], list[dict[str, Any]]]:
"""
rows: iterable of dicts with keys 'repo_url','commit_id','problem_id'
rows: iterable of dicts with keys 'repo_url','commit_id','problem_id' (URL format) or 'repo','base_commit','instance_id' (slug format)
base_dir: root directory where we will create:
- base_dir/bare/<repo_name>.git
- base_dir/worktrees/<repo_name>/<worktree_name>
batch_no: number of batch copies to create per (commit,problem)
max_workers: number of threads for parallel repo processing
task: Deprecated, kept for backward compatibility. Auto-detects format.
Returns a list of per-repo summaries.
"""
logger.bind(base_dir=base_dir, batch_no=batch_no, max_worker=max_workers).info(
@@ -289,7 +305,7 @@
base = Path(base_dir).resolve()
base.mkdir(parents=True, exist_ok=True)

grouped = group_rows_by_repo(rows)
grouped = group_rows_by_repo(rows, task=task)
summaries = []
repo_map: dict[tuple[str, str], dict[tuple[str, int], Path]] = {}

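An illustrative call to prepare_worktrees with the new parameter, mirroring how evaluate.py invokes it; the row and paths are placeholders, and running it as-is would attempt to clone the (nonexistent) repository, so treat it as a sketch only.

from benchmark.download import prepare_worktrees

rows = [
    {"repo": "example/project", "base_commit": "def456", "instance_id": "example__project-42"},
]

repo_map, summaries = prepare_worktrees(
    rows,
    base_dir="/tmp/repos_batch",  # bare clones go to base_dir/bare/, worktrees to base_dir/worktrees/
    batch_no=2,                   # two worktree copies per (commit, problem)
    max_workers=6,
    task="codegen",               # deprecated; the row format is auto-detected from the keys
)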
106 changes: 75 additions & 31 deletions benchmark/evaluate.py
@@ -10,8 +10,11 @@
from loguru import logger

from .download import prepare_worktrees
from .metrics import correctness
from .potpie import get_all_st_answers as get_all_st_answers_potpie
from .metrics import code_correctness, qa_correctness
from .potpie import (
get_all_codegen_answers as get_all_codegen_answers_potpie,
get_all_st_answers as get_all_st_answers_potpie,
)


def get_available_agents() -> list[str]:
@@ -39,6 +42,14 @@ async def main():
help="List of agents to evaluate",
)

parser.add_argument(
"--task",
type=str,
choices=["qa", "codegen"],
default="qa",
help="Type of evaluation to perform (default: qa)",
)

parser.add_argument(
"--input",
type=str,
@@ -53,18 +64,25 @@
help="Output CSV file for results (default: evaluation_results.csv)",
)

parser.add_argument(
"--batch-size",
type=int,
default=2,
help="Number of batch copies to create per problem (default: 2)",
)

args = parser.parse_args()

nbatch = 2
nbatch = args.batch_size

if not Path(args.input).exists():
print(f"Error: Input file '{args.input}' not found.", file=sys.stderr)
sys.exit(1)

assert all(agent in get_available_agents() for agent in args.agents), (
"Invalid Agent(s): {}".format(
", ".join(set(args.agents) - set(get_available_agents()))
)
assert all(
agent in get_available_agents() for agent in args.agents
), "Invalid Agent(s): {}".format(
", ".join(set(args.agents) - set(get_available_agents()))
)

logger.info("Evaluating tools: {}", ", ".join(args.agents))
@@ -78,47 +96,73 @@
)

repo_map, summary = prepare_worktrees(
problem_sets, base_dir="/tmp/repos_batch", batch_no=nbatch, max_workers=6
problem_sets,
base_dir="/tmp/repos_batch",
batch_no=nbatch,
max_workers=6,
task=args.task,
)
# repo_dict = worktree_results

# Don't pass expected answers to the tools
problem_sets_without_expected_answers = problem_sets.remove_columns(
"expected_answer"
)

result_awaitables = []
for agent in args.agents:
if agent == "potpie":
result_awaitables.append(
get_all_st_answers_potpie(
problem_sets_without_expected_answers, repo_map
# Select metric and generation function based on task type
if args.task == "codegen":
metric = code_correctness
expected_column = "patch" # SWE-bench uses "patch" for expected output
input_column = "problem_statement"

# Don't pass expected output to the tools
problem_sets_without_expected = problem_sets.remove_columns(expected_column)

result_awaitables = []
for agent in args.agents:
if agent == "potpie":
result_awaitables.append(
get_all_codegen_answers_potpie(
problem_sets_without_expected, repo_map
)
)
else:
...
else: # args.task == "qa"
metric = qa_correctness
expected_column = "expected_answer"
input_column = "question"

# Don't pass expected answers to the tools
problem_sets_without_expected = problem_sets.remove_columns(expected_column)

result_awaitables = []
for agent in args.agents:
if agent == "potpie":
result_awaitables.append(
get_all_st_answers_potpie(
problem_sets_without_expected, repo_map, task=args.task
)
)
)
else:
...
else:
...

answers = await asyncio.gather(*result_awaitables)
print(f"len answers : {len(answers[0])}")
expected_answers = problem_sets.select_columns("expected_answer")
questions = problem_sets.select_columns("question")
questions_batched = [item["question"] for item in questions for _ in range(nbatch)]
expected_outputs = problem_sets.select_columns(expected_column)
inputs = problem_sets.select_columns(input_column)
inputs_batched = [item[input_column] for item in inputs for _ in range(nbatch)]
answers_flattened = [item for sublist in answers for item in sublist]

expected_answers_batched = [
item["expected_answer"] for item in expected_answers for _ in range(nbatch)
expected_outputs_batched = [
item[expected_column] for item in expected_outputs for _ in range(nbatch)
]
test_cases = [
LLMTestCase(
input=question, actual_output=answer, expected_output=expected_answer
input=input_text, actual_output=answer, expected_output=expected_output
)
for question, answer, expected_answer in zip(
questions_batched, answers_flattened, expected_answers_batched
for input_text, answer, expected_output in zip(
inputs_batched, answers_flattened, expected_outputs_batched
)
]

results = evaluate(
metrics=[correctness],
metrics=[metric],
test_cases=test_cases,
)
metrics = [test_result.success for test_result in results.test_results]
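A toy illustration of the batching step above: each input/expected pair is repeated nbatch times so it lines up with the flattened per-batch answers before the LLMTestCase objects are built. All values here are invented, and the sketch assumes a single agent whose answers come back in problem-major order.

nbatch = 2
inputs = ["question A", "question B"]
expected = ["answer A", "answer B"]
# One answer per (problem, batch copy), problem-major order, already flattened:
answers_flattened = ["A-run1", "A-run2", "B-run1", "B-run2"]

inputs_batched = [q for q in inputs for _ in range(nbatch)]
expected_batched = [e for e in expected for _ in range(nbatch)]

triples = list(zip(inputs_batched, answers_flattened, expected_batched))
# [('question A', 'A-run1', 'answer A'), ('question A', 'A-run2', 'answer A'),
#  ('question B', 'B-run1', 'answer B'), ('question B', 'B-run2', 'answer B')]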
44 changes: 43 additions & 1 deletion benchmark/metrics.py
@@ -45,7 +45,38 @@
Do NOT include any text outside the JSON object.
"""

correctness = GEval(
CODE_EVAL_CRITERIA = """
You are a senior software engineer evaluating a code fix or implementation.
Your task is to assess the quality and correctness of the GENERATED CODE compared to the EXPECTED SOLUTION (or based on the PROBLEM DESCRIPTION).

Evaluate using the following criteria:
1. Functional Correctness (50%): Does the code solve the stated problem?
2. Code Quality & Best Practices (30%): Is the code idiomatic, safe, and maintainable?
3. Logic Similarity (20%): Does it follow a similar (or better) valid approach to the reference solution?

Output ONLY a JSON object with:
{
"score": <number_between_0_and_1>,
"reason": "<brief explanation>"
}

Where:
- 0.0 = The generated code fails to address the problem or is fundamentally incorrect.
- 1.0 = The generated code completely solves the problem with high quality and matches the reference approach or improves upon it.

Scoring Rules:
- Functional Correctness is paramount. If the code doesn't solve the problem, the score should be low (<0.5).
- Code Quality matters. Correct but messy code should be penalized slightly.
- Logic Similarity allows for different valid approaches, but they must solve the same root cause.

Your final output must contain:
- A numeric "score" between 0 and 1.
- A short "reason" describing the main factors affecting the score.

Do NOT include any text outside the JSON object.
"""

qa_correctness = GEval(
name="Correctness",
criteria=EVAL_CRITERIA,
evaluation_params=[
@@ -54,3 +85,14 @@
],
strict_mode=True,
)

code_correctness = GEval(
name="CodeCorrectness",
criteria=CODE_EVAL_CRITERIA,
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT,
],
strict_mode=True,
)
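
A hedged usage sketch for the new code_correctness metric, evaluating one generated patch against a reference patch. It assumes the deepeval imports used by the rest of the benchmark and an LLM judge already configured for GEval; the patch strings are placeholders.

from deepeval import evaluate
from deepeval.test_case import LLMTestCase

from benchmark.metrics import code_correctness

test_case = LLMTestCase(
    input="Fix the off-by-one error in the pagination helper",  # problem_statement
    actual_output="diff --git a/pager.py b/pager.py\n...",       # agent-generated patch
    expected_output="diff --git a/pager.py b/pager.py\n...",     # SWE-bench reference patch
)

results = evaluate(test_cases=[test_case], metrics=[code_correctness])
# With strict_mode=True, a test case passes only on a perfect metric score.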