Calibrating reviewer (#13)

IlyaGusev · web-flow · commit e7c813293fc5 · 2025-11-01T15:09:49.000+01:00
diff --git a/academia_mcp/llm.py b/academia_mcp/llm.py
@@ -40,25 +40,58 @@ async def llm_acall(model_name: str, messages: ChatMessages, **kwargs: Any) -> s
 
 
 async def llm_acall_structured(
-    model_name: str, messages: ChatMessages, response_format: type[T], **kwargs: Any
+    model_name: str,
+    messages: ChatMessages,
+    response_format: type[T],
+    num_parsing_retries: int = 3,
+    **kwargs: Any,
 ) -> T:
+    """Call the model and return its reply parsed into ``response_format``.
+
+    A failed attempt (request error, parsing error, or an empty parsed
+    result) is retried; the exception from the final attempt propagates.
+
+    Args:
+        model_name: Model identifier passed to the completions endpoint.
+        messages: Chat messages; each is serialized with ``model_dump()``.
+        response_format: Type the reply is parsed into.
+        num_parsing_retries: Total number of attempts. Values below 1 are
+            treated as 1 so that at least one request is always made.
+        **kwargs: Extra arguments forwarded to ``chat.completions.parse``.
+
+    Returns:
+        The parsed, non-empty structured response.
+
+    Raises:
+        AssertionError: If the API key is unset or the last attempt yields
+            an empty parsed result.
+        Exception: Whatever the final attempt raised.
+    """
     key = settings.OPENROUTER_API_KEY
     assert key, "Please set OPENROUTER_API_KEY in the environment variables"
     base_url = settings.BASE_URL
 
     client = AsyncOpenAI(base_url=base_url, api_key=key)
     converted_messages = [message.model_dump() for message in messages]
-    structured_response: T | None = (
-        (
-            await client.chat.completions.parse(
-                model=model_name,
-                messages=converted_messages,
-                response_format=response_format,
-                **kwargs,
-            )
-        )
-        .choices[0]
-        .message.parsed
-    )
-    assert structured_response, "Structured response is None"
+    # Always attempt at least once: with a non-positive count the loop body
+    # would never run and nothing would be bound for the return below.
+    num_attempts = max(num_parsing_retries, 1)
+    structured_response: T | None = None
+    for retry_index in range(num_attempts):
+        try:
+            completion = await client.chat.completions.parse(
+                model=model_name,
+                messages=converted_messages,
+                response_format=response_format,
+                **kwargs,
+            )
+            structured_response = completion.choices[0].message.parsed
+            assert structured_response, "Structured response is None"
+            break
+        except Exception:
+            # The empty-result assertion lands here too, so it is retried
+            # like any other failure; only the last attempt re-raises.
+            if retry_index == num_attempts - 1:
+                raise
+    assert structured_response, "Structured response is None"
     return structured_response
diff --git a/academia_mcp/server.py b/academia_mcp/server.py
@@ -40,7 +40,7 @@
     generate_research_proposals,
     score_research_proposals,
 )
-from academia_mcp.tools.review import review_pdf_paper, download_pdf_paper
+from academia_mcp.tools.review import review_pdf_paper, download_pdf_paper, review_pdf_paper_by_url
 from academia_mcp.tools.image_processing import show_image, describe_image
 from academia_mcp.tools.speech_to_text import speech_to_text
 from academia_mcp.tools.yt_transcript import yt_transcript
@@ -125,6 +125,7 @@ def create_server(
         server.add_tool(describe_image)
         if settings.WORKSPACE_DIR:
             server.add_tool(review_pdf_paper, structured_output=True)
+            server.add_tool(review_pdf_paper_by_url, structured_output=True)
     else:
         logger.warning("No OpenRouter API key is set, LLM-related tools will not be available!")
 
diff --git a/academia_mcp/tools/review.py b/academia_mcp/tools/review.py
@@ -15,143 +15,107 @@
 
 PROMPT = """
 You are an expert peer reviewer for top CS/ML venues (e.g., NeurIPS/ICML/ACL).
-Your goal is to produce a fair, rigorous, and reproducible review that is maximally useful to authors and area chairs.
+Produce fair, strict, rigorous, reproducible reviews maximally useful to authors and ACs.
+Be strict and punishing, but only for good reasons. Don't be afraid to reject a paper.
 Be specific: cite paper sections/figures/tables when criticizing or praising.
 Use actionable language ("Provide variance across 5 seeds on Dataset X; add leakage control Y").
 
+
 # Summary
-Briefly summarize the paper and its contributions.
-This is not the place to critique the paper; the authors should generally agree with a well-written summary.
-This is also not the place to paste the abstract—please provide the summary in your own understanding after reading.
+Summarize the paper and contributions in your own words (not the abstract).
+Authors should agree with a well-written summary. No critique here.
+
 
 # Strengths and Weaknesses
 Please provide a thorough assessment of the strengths and weaknesses of the paper.
 A good mental framing for strengths and weaknesses is to think of reasons you might accept or reject the paper.
 Please touch on the following dimensions:
 
-## Quality
+## Quality (Score 1-4: poor/fair/good/excellent)
 Is the submission technically sound?
 Are claims well supported (e.g., by theoretical analysis or experimental results)?
 Are the methods used appropriate?
 Is this a complete piece of work or work in progress?
-Are the authors careful and honest about evaluating both the strengths and weaknesses of their work?
+Are the authors careful and honest about evaluating their work?
 
-## Clarity
+## Clarity (Score 1-4: poor/fair/good/excellent)
 Is the submission clearly written?
 Is it well organized? (If not, please make constructive suggestions for improving its clarity.)
-Does it adequately inform the reader? (Note that a superbly written paper provides enough information for an expert reader to reproduce its results.)
+Does it adequately inform the reader?
+A superbly written paper provides enough information for an expert reader to reproduce its results.
 
-## Significance
+## Significance (Score 1-4: poor/fair/good/excellent)
 Are the results impactful for the community?
 Are others (researchers or practitioners) likely to use the ideas or build on them?
 Does the submission address a difficult task in a better way than previous work?
 Does it advance our understanding/knowledge on the topic in a demonstrable way?
 Does it provide unique data, unique conclusions about existing data, or a unique theoretical or experimental approach?
 
-## Originality
+## Originality (Score 1-4: poor/fair/good/excellent)
 Does the work provide new insights, deepen understanding, or highlight important properties of existing methods?
 Is it clear how this work differs from previous contributions, with relevant citations provided?
 Does the work introduce novel tasks or methods that advance the field?
 Does this work offer a novel combination of existing techniques, and is the reasoning behind this combination well-articulated?
 As the questions above indicates, originality does not necessarily require introducing an entirely new method.
 Rather, a work that provides novel insights by evaluating existing methods, or demonstrates improved efficiency, fairness, etc. is also equally valuable.
 
+
 # Scores
-Try to be specific and detailed in your assessment. Try not to set the same score for all the dimensions.
-
-Quality: Based on what you discussed in the “Quality” section, please assign the paper a numerical rating on the following scale to indicate the quality of the work.
-4 = excellent
-3 = good
-2 = fair
-1 = poor
-
-Clarity: Based on what you discussed in the “Clarity” section, please assign the paper a numerical rating on the following scale to indicate the clarity of the paper.
-4 = excellent
-3 = good
-2 = fair
-1 = poor
-
-Significance: Based on what you discussed in the “Significance” section, please assign the paper a numerical rating on the following scale to indicate the significance of the paper.
-4 = excellent
-3 = good
-2 = fair
-1 = poor
-
-Originality: Based on what you discussed in the “Originality” section, please assign the paper a numerical rating on the following scale to indicate the originality of the paper.
-4 = excellent
-3 = good
-2 = fair
-1 = poor
+Try to be specific and detailed in your assessment.
+Try not to set the same score for all the dimensions.
+The scores for all dimensions should be independent of each other.
+Scores should rely on strengths and weaknesses of the paper in each dimension.
+If there are many substantial weaknesses, the score should be low.
+
 
 # Questions
-Please list up and carefully describe questions and suggestions for the authors, which should focus on key points (ideally around 3–5) that are actionable with clear guidance.
-Think of the things where a response from the author can change your opinion, clarify a confusion or address a limitation.
-You are strongly encouraged to state the clear criteria under which your evaluation score could increase or decrease.
-This can be very important for a productive rebuttal and discussion phase with the authors.
+List 3-5 key actionable questions/suggestions.
+Focus on points where author response could change your opinion or clarify confusion.
+State clear criteria for your score changes.
+
 
 # Limitations
 Have the authors adequately addressed the limitations and potential negative societal impact of their work?
-If so, simply leave “yes”; if not, please include constructive suggestions for improvement.
+Please include constructive suggestions for improvement.
 In general, authors should be rewarded rather than punished for being up front about the limitations of their work and any potential negative societal impact.
 You are encouraged to think through whether any critical points are missing and provide these as feedback for the authors.
 
 
-# Overall
-Please provide an "overall score" for this submission. Choices:
-6: Strong Accept: Technically flawless paper with groundbreaking impact on one or more areas of AI, with exceptionally strong evaluation, reproducibility, and resources, and no unaddressed ethical considerations.
-5: Accept: Technically solid paper, with high impact on at least one sub-area of AI or moderate-to-high impact on more than one area of AI, with good-to-excellent evaluation, resources, reproducibility, and no unaddressed ethical considerations.
-4: Borderline accept: Technically solid paper where reasons to accept outweigh reasons to reject, e.g., limited evaluation. Please use sparingly.
-3: Borderline reject: Technically solid paper where reasons to reject, e.g., limited evaluation, outweigh reasons to accept, e.g., good evaluation. Please use sparingly.
-2: Reject: For instance, a paper with technical flaws, weak evaluation, inadequate reproducibility and incompletely addressed ethical considerations.
-1: Strong Reject: For instance, a paper with well-known results or unaddressed ethical considerations
+# Overall Score
+6: Strong Accept - Flawless, groundbreaking impact, exceptional evaluation/reproducibility, no ethical issues
+5: Accept - Solid, high impact (≥1 sub-area) or moderate-high (multiple areas), good-excellent evaluation/resources/reproducibility, no ethical issues
+4: Borderline Accept - Solid, accept reasons outweigh reject (e.g., limited evaluation).
+3: Borderline Reject - Solid, reject reasons outweigh accept.
+2: Reject - Technical flaws, weak evaluation, inadequate reproducibility, incompletely addressed ethics
+1: Strong Reject - Known results or unaddressed ethical issues
+
+Don't be afraid to use 1, 2, 5, and 6.
+
 
 # Confidence
-Please provide a "confidence score" for your assessment of this submission to indicate how confident you are in your evaluation.  Choices
-5: You are absolutely certain about your assessment. You are very familiar with the related work and checked the math/other details carefully.
-4: You are confident in your assessment, but not absolutely certain. It is unlikely, but not impossible, that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work.
-3: You are fairly confident in your assessment. It is possible that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
-2: You are willing to defend your assessment, but it is quite likely that you did not understand the central parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
-1: Your assessment is an educated guess. The submission is not in your area or the submission was difficult to understand. Math/other details were not carefully checked.
+5: Absolutely certain, very familiar with related work, checked math/details carefully
+4: Confident but not certain, unlikely missed something
+3: Fairly confident, possibly missed parts or unfamiliar with some work, details not carefully checked
+2: Willing to defend but likely missed central parts, details not checked
+1: Educated guess, not your area or hard to understand, details not checked
+
 
 # Format issues
 Find problems with the paper formatting. Report them separately.
 
 # Result
 Return the result as a JSON object in the following format:
 {
-    "summary": "Summary of the paper",
-    "quality": {
-        "strengths": ["...", "..."],
-        "weaknesses": ["...", "..."],
-        "score": ...,
-    },
-    "clarity": {
-        "strengths": ["...", "..."],
-        "weaknesses": ["...", "..."],
-        "score": ...,
-    },
-    "significance": {
-        "strengths": ["...", "..."],
-        "weaknesses": ["...", "..."],
-        "score": ...,
-    },
-    "originality": {
-        "strengths": ["...", "..."],
-        "weaknesses": ["...", "..."],
-        "score": ...,
-    },
-    "questions": ["Questions and suggestions for the authors", "..."],
-    "limitations": ["Limitations of the paper", "..."],
-    "overall": {
-        "strengths": ["...", "..."],
-        "weaknesses": ["...", "..."],
-        "score": ...,
-    },
-    "confidence": {
-        "description": "Confidence score and its description",
-        "score": ...,
-    },
-    "format_issues": ["Format issues", "..."]
+    "summary": "...",
+    "quality": {"strengths": ["..."], "weaknesses": ["..."], "reasoning": "...", "score": ...},
+    "clarity": {"strengths": ["..."], "weaknesses": ["..."], "reasoning": "...", "score": ...},
+    "significance": {"strengths": ["..."], "weaknesses": ["..."], "reasoning": "...", "score": ...},
+    "originality": {"strengths": ["..."], "weaknesses": ["..."], "reasoning": "...", "score": ...},
+    "questions": ["..."],
+    "limitations": ["..."],
+    "overall": {"reasoning": "...", "score": ...},
+    "confidence": {"reasoning": "...", "score": ...},
+    "format_issues": ["..."]
 }
 
 Always produce a correct JSON object.
@@ -161,12 +125,13 @@
 class AspectItem(BaseModel):  # type: ignore
     strengths: List[str] = Field(description="Strengths of the paper in a specific aspect")
     weaknesses: List[str] = Field(description="Weaknesses of the paper in a specific aspect")
-    score: int = Field(description="Overall score of this aspect")
+    reasoning: str = Field(description="Reasoning about this aspect")
+    score: int = Field(description="Score of this aspect")
 
 
-class ConfidenceItem(BaseModel):  # type: ignore
-    description: str = Field(description="Description of the confidence score")
-    score: int = Field(description="Confidence score")
+class ReasoningItem(BaseModel):  # type: ignore
+    reasoning: str = Field(description="Reasoning about this aspect")
+    score: int = Field(description="Score of this aspect")
 
 
 class ReviewResponse(BaseModel):  # type: ignore
@@ -177,8 +142,8 @@ class ReviewResponse(BaseModel):  # type: ignore
     originality: AspectItem = Field(description="Originality of the paper")
     questions: List[str] = Field(description="Questions and suggestions for the authors")
     limitations: List[str] = Field(description="Limitations of the paper")
-    overall: AspectItem = Field(description="Overall score and its strengths and weaknesses")
-    confidence: ConfidenceItem = Field(description="Confidence score and description")
+    overall: ReasoningItem = Field(description="Overall score and reasoning")
+    confidence: ReasoningItem = Field(description="Confidence score and reasoning")
     format_issues: List[str] = Field(description="Format issues")
 
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "academia-mcp"
-version = "1.11.7"
+version = "1.11.8"
 description = "MCP server that provides different tools to search for scientific publications"
 readme = "README.md"
 authors = [