diff --git a/src/paperqa/clients/semantic_scholar.py b/src/paperqa/clients/semantic_scholar.py index 375f89167..120fe8977 100644 --- a/src/paperqa/clients/semantic_scholar.py +++ b/src/paperqa/clients/semantic_scholar.py @@ -40,7 +40,8 @@ "volume": {"journal"}, "pages": {"journal"}, "journal": {"journal"}, - "url": {"url", "openAccessPdf"}, + "url": {"url"}, + "pdf_url": {"openAccessPdf"}, "bibtex": {"citationStyles"}, "doi_url": {"url"}, "other": {"isOpenAccess", "influentialCitationCount", "publicationTypes", "venue"}, @@ -189,6 +190,8 @@ async def parse_s2_to_doc_details( journal_data = paper_data.get("journal") or {} + maybe_pdf_url = (paper_data.get("openAccessPdf") or {}).get("url") + doc_details = DocDetails( key=None if not bibtex else bibtex.split("{")[1].split(",")[0], bibtex_type="article", # s2 should be basically all articles @@ -199,7 +202,8 @@ async def parse_s2_to_doc_details( volume=journal_data.get("volume"), pages=journal_data.get("pages"), journal=journal_data.get("name"), - url=(paper_data.get("openAccessPdf") or {}).get("url"), + url=maybe_pdf_url, + pdf_url=maybe_pdf_url, title=paper_data.get("title"), citation_count=paper_data.get("citationCount"), doi=doi, diff --git a/src/paperqa/types.py b/src/paperqa/types.py index a7b80dbe7..36180f2bc 100644 --- a/src/paperqa/types.py +++ b/src/paperqa/types.py @@ -783,7 +783,9 @@ class DocDetails(Doc): " that could be preprint). None means unknown/unset." ), ) - pdf_url: str | None = None + pdf_url: str | None = Field( + default=None, description="URL to the PDF of the paper, if known." + ) other: dict[str, Any] = Field( default_factory=dict, description="Other metadata besides the above standardized fields.",