Skip to content

Commit e7c8132

Browse files
authored
Calibrating reviewer (#13)
1 parent 163f3f1 commit e7c8132

File tree

4 files changed: 86 additions and 108 deletions.

academia_mcp/llm.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -40,25 +40,37 @@ async def llm_acall(model_name: str, messages: ChatMessages, **kwargs: Any) -> s
4040

4141

4242
async def llm_acall_structured(
43-
model_name: str, messages: ChatMessages, response_format: type[T], **kwargs: Any
43+
model_name: str,
44+
messages: ChatMessages,
45+
response_format: type[T],
46+
num_parsing_retries: int = 3,
47+
**kwargs: Any,
4448
) -> T:
4549
key = settings.OPENROUTER_API_KEY
4650
assert key, "Please set OPENROUTER_API_KEY in the environment variables"
4751
base_url = settings.BASE_URL
4852

4953
client = AsyncOpenAI(base_url=base_url, api_key=key)
5054
converted_messages = [message.model_dump() for message in messages]
51-
structured_response: T | None = (
52-
(
53-
await client.chat.completions.parse(
54-
model=model_name,
55-
messages=converted_messages,
56-
response_format=response_format,
57-
**kwargs,
55+
for retry_index in range(num_parsing_retries):
56+
try:
57+
structured_response: T | None = (
58+
(
59+
await client.chat.completions.parse(
60+
model=model_name,
61+
messages=converted_messages,
62+
response_format=response_format,
63+
**kwargs,
64+
)
65+
)
66+
.choices[0]
67+
.message.parsed
5868
)
59-
)
60-
.choices[0]
61-
.message.parsed
62-
)
63-
assert structured_response, "Structured response is None"
69+
assert structured_response
70+
break
71+
except Exception:
72+
if retry_index == num_parsing_retries - 1:
73+
raise
74+
continue
75+
assert structured_response
6476
return structured_response

academia_mcp/server.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
generate_research_proposals,
4141
score_research_proposals,
4242
)
43-
from academia_mcp.tools.review import review_pdf_paper, download_pdf_paper
43+
from academia_mcp.tools.review import review_pdf_paper, download_pdf_paper, review_pdf_paper_by_url
4444
from academia_mcp.tools.image_processing import show_image, describe_image
4545
from academia_mcp.tools.speech_to_text import speech_to_text
4646
from academia_mcp.tools.yt_transcript import yt_transcript
@@ -125,6 +125,7 @@ def create_server(
125125
server.add_tool(describe_image)
126126
if settings.WORKSPACE_DIR:
127127
server.add_tool(review_pdf_paper, structured_output=True)
128+
server.add_tool(review_pdf_paper_by_url, structured_output=True)
128129
else:
129130
logger.warning("No OpenRouter API key is set, LLM-related tools will not be available!")
130131

academia_mcp/tools/review.py

Lines changed: 58 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -15,143 +15,107 @@
1515

1616
PROMPT = """
1717
You are an expert peer reviewer for top CS/ML venues (e.g., NeurIPS/ICML/ACL).
18-
Your goal is to produce a fair, rigorous, and reproducible review that is maximally useful to authors and area chairs.
18+
Produce fair, strict, rigorous, reproducible reviews maximally useful to authors and ACs.
19+
Be strict and punishing, but only for good reasons. Don't be afraid to reject a paper.
1920
Be specific: cite paper sections/figures/tables when criticizing or praising.
2021
Use actionable language ("Provide variance across 5 seeds on Dataset X; add leakage control Y").
2122
23+
2224
# Summary
23-
Briefly summarize the paper and its contributions.
24-
This is not the place to critique the paper; the authors should generally agree with a well-written summary.
25-
This is also not the place to paste the abstract—please provide the summary in your own understanding after reading.
25+
Summarize the paper and contributions in your own words (not the abstract).
26+
Authors should agree with a well-written summary. No critique here.
27+
2628
2729
# Strengths and Weaknesses
2830
Please provide a thorough assessment of the strengths and weaknesses of the paper.
2931
A good mental framing for strengths and weaknesses is to think of reasons you might accept or reject the paper.
3032
Please touch on the following dimensions:
3133
32-
## Quality
34+
## Quality (Score 1-4: poor/fair/good/excellent)
3335
Is the submission technically sound?
3436
Are claims well supported (e.g., by theoretical analysis or experimental results)?
3537
Are the methods used appropriate?
3638
Is this a complete piece of work or work in progress?
37-
Are the authors careful and honest about evaluating both the strengths and weaknesses of their work?
39+
Are the authors careful and honest about evaluating their work?
3840
39-
## Clarity
41+
## Clarity (Score 1-4: poor/fair/good/excellent)
4042
Is the submission clearly written?
4143
Is it well organized? (If not, please make constructive suggestions for improving its clarity.)
42-
Does it adequately inform the reader? (Note that a superbly written paper provides enough information for an expert reader to reproduce its results.)
44+
Does it adequately inform the reader?
45+
A superbly written paper provides enough information for an expert reader to reproduce its results.
4346
44-
## Significance
47+
## Significance (Score 1-4: poor/fair/good/excellent)
4548
Are the results impactful for the community?
4649
Are others (researchers or practitioners) likely to use the ideas or build on them?
4750
Does the submission address a difficult task in a better way than previous work?
4851
Does it advance our understanding/knowledge on the topic in a demonstrable way?
4952
Does it provide unique data, unique conclusions about existing data, or a unique theoretical or experimental approach?
5053
51-
## Originality
54+
## Originality (Score 1-4: poor/fair/good/excellent)
5255
Does the work provide new insights, deepen understanding, or highlight important properties of existing methods?
5356
Is it clear how this work differs from previous contributions, with relevant citations provided?
5457
Does the work introduce novel tasks or methods that advance the field?
5558
Does this work offer a novel combination of existing techniques, and is the reasoning behind this combination well-articulated?
5659
As the questions above indicates, originality does not necessarily require introducing an entirely new method.
5760
Rather, a work that provides novel insights by evaluating existing methods, or demonstrates improved efficiency, fairness, etc. is also equally valuable.
5861
62+
5963
# Scores
60-
Try to be specific and detailed in your assessment. Try not to set the same score for all the dimensions.
61-
62-
Quality: Based on what you discussed in the “Quality” section, please assign the paper a numerical rating on the following scale to indicate the quality of the work.
63-
4 = excellent
64-
3 = good
65-
2 = fair
66-
1 = poor
67-
68-
Clarity: Based on what you discussed in the “Clarity” section, please assign the paper a numerical rating on the following scale to indicate the clarity of the paper.
69-
4 = excellent
70-
3 = good
71-
2 = fair
72-
1 = poor
73-
74-
Significance: Based on what you discussed in the “Significance” section, please assign the paper a numerical rating on the following scale to indicate the significance of the paper.
75-
4 = excellent
76-
3 = good
77-
2 = fair
78-
1 = poor
79-
80-
Originality: Based on what you discussed in the “Originality” section, please assign the paper a numerical rating on the following scale to indicate the originality of the paper.
81-
4 = excellent
82-
3 = good
83-
2 = fair
84-
1 = poor
64+
Try to be specific and detailed in your assessment.
65+
Try not to set the same score for all the dimensions.
66+
The scores for all dimensions should be independent of each other.
67+
Scores should rely on strengths and weaknesses of the paper in each dimension.
68+
If there are many substantial weaknesses, the score should be low.
69+
8570
8671
# Questions
87-
Please list up and carefully describe questions and suggestions for the authors, which should focus on key points (ideally around 3–5) that are actionable with clear guidance.
88-
Think of the things where a response from the author can change your opinion, clarify a confusion or address a limitation.
89-
You are strongly encouraged to state the clear criteria under which your evaluation score could increase or decrease.
90-
This can be very important for a productive rebuttal and discussion phase with the authors.
72+
List 3-5 key actionable questions/suggestions.
73+
Focus on points where author response could change your opinion or clarify confusion.
74+
State clear criteria for your score changes.
75+
9176
9277
# Limitations
9378
Have the authors adequately addressed the limitations and potential negative societal impact of their work?
94-
If so, simply leave “yes”; if not, please include constructive suggestions for improvement.
79+
Please include constructive suggestions for improvement.
9580
In general, authors should be rewarded rather than punished for being up front about the limitations of their work and any potential negative societal impact.
9681
You are encouraged to think through whether any critical points are missing and provide these as feedback for the authors.
9782
9883
99-
# Overall
100-
Please provide an "overall score" for this submission. Choices:
101-
6: Strong Accept: Technically flawless paper with groundbreaking impact on one or more areas of AI, with exceptionally strong evaluation, reproducibility, and resources, and no unaddressed ethical considerations.
102-
5: Accept: Technically solid paper, with high impact on at least one sub-area of AI or moderate-to-high impact on more than one area of AI, with good-to-excellent evaluation, resources, reproducibility, and no unaddressed ethical considerations.
103-
4: Borderline accept: Technically solid paper where reasons to accept outweigh reasons to reject, e.g., limited evaluation. Please use sparingly.
104-
3: Borderline reject: Technically solid paper where reasons to reject, e.g., limited evaluation, outweigh reasons to accept, e.g., good evaluation. Please use sparingly.
105-
2: Reject: For instance, a paper with technical flaws, weak evaluation, inadequate reproducibility and incompletely addressed ethical considerations.
106-
1: Strong Reject: For instance, a paper with well-known results or unaddressed ethical considerations
84+
# Overall Score
85+
6: Strong Accept - Flawless, groundbreaking impact, exceptional evaluation/reproducibility, no ethical issues
86+
5: Accept - Solid, high impact (≥1 sub-area) or moderate-high (multiple areas), good-excellent evaluation/resources/reproducibility, no ethical issues
87+
4: Borderline Accept - Solid, accept reasons outweigh reject (e.g., limited evaluation).
88+
3: Borderline Reject - Solid, reject reasons outweigh accept.
89+
2: Reject - Technical flaws, weak evaluation, inadequate reproducibility, incompletely addressed ethics
90+
1: Strong Reject - Known results or unaddressed ethical issues
91+
92+
Don't be afraid to use 1, 2, 5, and 6.
93+
10794
10895
# Confidence
109-
Please provide a "confidence score" for your assessment of this submission to indicate how confident you are in your evaluation. Choices
110-
5: You are absolutely certain about your assessment. You are very familiar with the related work and checked the math/other details carefully.
111-
4: You are confident in your assessment, but not absolutely certain. It is unlikely, but not impossible, that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work.
112-
3: You are fairly confident in your assessment. It is possible that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
113-
2: You are willing to defend your assessment, but it is quite likely that you did not understand the central parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
114-
1: Your assessment is an educated guess. The submission is not in your area or the submission was difficult to understand. Math/other details were not carefully checked.
96+
5: Absolutely certain, very familiar with related work, checked math/details carefully
97+
4: Confident but not certain, unlikely missed something
98+
3: Fairly confident, possibly missed parts or unfamiliar with some work, details not carefully checked
99+
2: Willing to defend but likely missed central parts, details not checked
100+
1: Educated guess, not your area or hard to understand, details not checked
101+
115102
116103
# Format issues
117104
Find problems with the paper formatting. Report them separately.
118105
119106
# Result
120107
Return the result as a JSON object in the following format:
121108
{
122-
"summary": "Summary of the paper",
123-
"quality": {
124-
"strengths": ["...", "..."],
125-
"weaknesses": ["...", "..."],
126-
"score": ...,
127-
},
128-
"clarity": {
129-
"strengths": ["...", "..."],
130-
"weaknesses": ["...", "..."],
131-
"score": ...,
132-
},
133-
"significance": {
134-
"strengths": ["...", "..."],
135-
"weaknesses": ["...", "..."],
136-
"score": ...,
137-
},
138-
"originality": {
139-
"strengths": ["...", "..."],
140-
"weaknesses": ["...", "..."],
141-
"score": ...,
142-
},
143-
"questions": ["Questions and suggestions for the authors", "..."],
144-
"limitations": ["Limitations of the paper", "..."],
145-
"overall": {
146-
"strengths": ["...", "..."],
147-
"weaknesses": ["...", "..."],
148-
"score": ...,
149-
},
150-
"confidence": {
151-
"description": "Confidence score and its description",
152-
"score": ...,
153-
},
154-
"format_issues": ["Format issues", "..."]
109+
"summary": "...",
110+
"quality": {"strengths": ["..."], "weaknesses": ["..."], "reasoning": "...", "score": ...},
111+
"clarity": {"strengths": ["..."], "weaknesses": ["..."], "reasoning": "...", "score": ...},
112+
"significance": {"strengths": ["..."], "weaknesses": ["..."], "reasoning": "...", "score": ...},
113+
"originality": {"strengths": ["..."], "weaknesses": ["..."], "reasoning": "...", "score": ...},
114+
"questions": ["..."],
115+
"limitations": ["..."],
116+
"overall": {"reasoning": "...", "score": ...},
117+
"confidence": {"reasoning": "...", "score": ...},
118+
"format_issues": ["..."]
155119
}
156120
157121
Always produce a correct JSON object.
@@ -161,12 +125,13 @@
161125
class AspectItem(BaseModel): # type: ignore
162126
strengths: List[str] = Field(description="Strengths of the paper in a specific aspect")
163127
weaknesses: List[str] = Field(description="Weaknesses of the paper in a specific aspect")
164-
score: int = Field(description="Overall score of this aspect")
128+
reasoning: str = Field(description="Reasoning about this aspect")
129+
score: int = Field(description="Score of this aspect")
165130

166131

167-
class ConfidenceItem(BaseModel): # type: ignore
168-
description: str = Field(description="Description of the confidence score")
169-
score: int = Field(description="Confidence score")
132+
class ReasoningItem(BaseModel): # type: ignore
133+
reasoning: str = Field(description="Reasoning about this aspect")
134+
score: int = Field(description="Score of this aspect")
170135

171136

172137
class ReviewResponse(BaseModel): # type: ignore
@@ -177,8 +142,8 @@ class ReviewResponse(BaseModel): # type: ignore
177142
originality: AspectItem = Field(description="Originality of the paper")
178143
questions: List[str] = Field(description="Questions and suggestions for the authors")
179144
limitations: List[str] = Field(description="Limitations of the paper")
180-
overall: AspectItem = Field(description="Overall score and its strengths and weaknesses")
181-
confidence: ConfidenceItem = Field(description="Confidence score and description")
145+
overall: ReasoningItem = Field(description="Overall score and reasoning")
146+
confidence: ReasoningItem = Field(description="Confidence score and reasoning")
182147
format_issues: List[str] = Field(description="Format issues")
183148

184149

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "academia-mcp"
7-
version = "1.11.7"
7+
version = "1.11.8"
88
description = "MCP server that provides different tools to search for scientific publications"
99
readme = "README.md"
1010
authors = [

0 commit comments

Comments
 (0)