Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions packages/paper-qa-nemotron/src/paperqa_nemotron/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,8 @@ async def _call_nvidia_api(
if response.choices[0].finish_reason == "length":
raise NemotronLengthError(
f"Model response {response} indicates the input"
f" image of shape {image.shape} is too large or the model started babbling."
f" image of shape {image.shape} is too large or the model started babbling.",
response.choices[0], # Include if callers want
)
if (
response.choices[0].finish_reason != "tool_calls"
Expand Down Expand Up @@ -438,7 +439,8 @@ async def _call_sagemaker_api(
if response.choices[0].finish_reason == "length":
raise NemotronLengthError(
f"Model response {response} indicates the input"
f" image of shape {image.shape} is too large or the model started babbling."
f" image of shape {image.shape} is too large or the model started babbling.",
response.choices[0], # Include if callers want
)
if (
response.choices[0].finish_reason != "stop"
Expand Down
18 changes: 15 additions & 3 deletions packages/paper-qa-nemotron/src/paperqa_nemotron/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import Any, cast

import pypdfium2 as pdfium
from lmi.utils import gather_with_concurrency
from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText
from paperqa.utils import ImpossibleParsingError
from tenacity import RetryError
Expand All @@ -31,6 +32,7 @@ async def parse_pdf_to_pages(
full_page: bool = False,
dpi: int | None = 300,
api_params: Mapping[str, Any] | None = None,
concurrency: int | asyncio.Semaphore | None = 128,
**_: Any,
) -> ParsedText:
"""Parse a PDF using Nvidia's nemotron-parse VLM.
Expand All @@ -48,6 +50,10 @@ async def parse_pdf_to_pages(
dpi: Optional DPI (dots per inch) for image resolution,
if set as None then pypdfium2's default 1 scale will be employed.
api_params: Optional parameters to pass to the nemotron-parse API.
concurrency: Optional limit on concurrent processing of pages, given as either an
integer or an asyncio.Semaphore; use it to put a ceiling on memory usage. Default is
128 to prioritize reader speed over memory, while not getting obliterated by huge 1000-page PDFs.
Set as None to disable concurrency limits, processing all pages at once.
**_: Thrown away kwargs.

Returns:
Expand Down Expand Up @@ -203,9 +209,15 @@ async def process_page(

content: dict[str, str | tuple[str, list[ParsedMedia]]] = {}
total_length = count_media = 0
for page_num, page_content in await asyncio.gather(
*(process_page(i) for i in range(start_page, end_page))
):

gather = (
asyncio.gather(*(process_page(i) for i in range(start_page, end_page)))
if concurrency is None
else gather_with_concurrency(
concurrency, (process_page(i) for i in range(start_page, end_page))
)
)
for page_num, page_content in await gather:
content[page_num] = page_content
if parse_media:
page_text, page_media = page_content # type: ignore[misc]
Expand Down