Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions packages/paper-qa-docling/src/paperqa_docling/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from docling_core.types.doc import (
DescriptionAnnotation,
DocItem,
FormulaItem,
PictureItem,
TableItem,
TextItem,
Expand Down Expand Up @@ -123,10 +124,14 @@ def parse_pdf_to_pages( # noqa: PLR0912
# NOTE: docling pages are 1-indexed
page_nums = [prov.page_no for prov in item.prov]

if isinstance(item, TextItem):
if isinstance(item, TextItem | FormulaItem): # Handle items with text
item_text = item.text
if not item_text and isinstance(item, FormulaItem) and item.orig:
# Sanitization of formula text sometimes fails and leaves `text` empty, so fall back to the original (`orig`) text
item_text = item.orig
for page_num in page_nums:
new_text = (
item.text if not content[str(page_num)][0] else "\n\n" + item.text
item_text if not content[str(page_num)][0] else "\n\n" + item_text
)
total_length += len(new_text)
if page_size_limit and total_length > page_size_limit:
Expand All @@ -137,7 +142,9 @@ def parse_pdf_to_pages( # noqa: PLR0912
)
content[str(page_num)][0] += new_text

if parse_media and isinstance(item, PictureItem): # Handle images
if parse_media and isinstance( # Handle images and formulae
item, PictureItem | FormulaItem
):
image_data = item.get_image(doc)
if image_data:
try:
Expand All @@ -154,14 +161,16 @@ def parse_pdf_to_pages( # noqa: PLR0912
img_bytes.seek(0) # Reset pointer before read to avoid empty data

media_metadata = {
"type": "picture",
"type": "formula" if isinstance(item, FormulaItem) else "picture",
"width": image_data.width,
"height": image_data.height,
"bbox": item.prov[0].bbox.as_tuple(),
"images_scale": pipeline_options.images_scale,
}
annotations = [
x for x in item.annotations if isinstance(x, DescriptionAnnotation)
x
for x in getattr(item, "annotations", [])
if isinstance(x, DescriptionAnnotation)
]
if len(annotations) == 1:
# We don't set this text in ParsedMedia.text because it's
Expand Down
6 changes: 3 additions & 3 deletions packages/paper-qa-docling/tests/test_paperqa_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,12 +199,12 @@ def test_media_deduplication() -> None:
all_media = [m for _, media in parsed_text.content.values() for m in media] # type: ignore[misc]

all_images = [m for m in all_media if m.info.get("type") == "picture"]
# We allow for one table to be misinterpreted as an image
# We allow for one table to be misinterpreted as an image, and one logo to be missed
assert (
10 <= len(all_images) <= 11
3 * 5 - 1 <= len(all_images) <= 3 * 5 + 1
), "Expected each image (one/page) and equation (one/page) to be read"
assert (
len({m for m in all_images if cast(int, m.info["page_num"]) > 1}) <= 2
len({m for m in all_images if cast(int, m.info["page_num"]) > 1}) <= 3
), "Expected images/equations on all pages beyond 1 to be deduplicated"

all_tables = [m for m in all_media if m.info.get("type") == "table"]
Expand Down
22 changes: 21 additions & 1 deletion tests/duplicate_media_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
<!-- pyml disable-num-lines 5 line-length -->

[//]: # "To generate `stub_data/duplicate_media.pdf` from this:"
[//]: # "1. `pandoc duplicate_media_template.md --standalone --self-contained --katex -t html -o temp.html`"
[//]: # "1. `pandoc duplicate_media_template.md --standalone --katex -t html -o temp.html`"
[//]: # "2. `Chromium --headless --disable-gpu --print-to-pdf=stub_data/duplicate_media.pdf --no-pdf-header-footer temp.html`"
[//]: # "3. `rm temp.html`"

<div style="text-align: right; margin-bottom: 10px;">
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Wikimedia_logo_text_RGB.svg/330px-Wikimedia_logo_text_RGB.svg.png" alt="Wikimedia logo" height="100"/>
</div>

<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>

Text under image 1.
Expand All @@ -28,6 +32,10 @@ $$

<div style="page-break-after: always;"></div>

<div style="text-align: right; margin-bottom: 10px;">
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Wikimedia_logo_text_RGB.svg/330px-Wikimedia_logo_text_RGB.svg.png" alt="Wikimedia logo" height="100"/>
</div>

<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>

Text under image 2.
Expand All @@ -49,6 +57,10 @@ $$

<div style="page-break-after: always;"></div>

<div style="text-align: right; margin-bottom: 10px;">
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Wikimedia_logo_text_RGB.svg/330px-Wikimedia_logo_text_RGB.svg.png" alt="Wikimedia logo" height="100"/>
</div>

<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>

Text under image 3.
Expand All @@ -70,6 +82,10 @@ $$

<div style="page-break-after: always;"></div>

<div style="text-align: right; margin-bottom: 10px;">
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Wikimedia_logo_text_RGB.svg/330px-Wikimedia_logo_text_RGB.svg.png" alt="Wikimedia logo" height="100"/>
</div>

<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>

Text under image 4.
Expand All @@ -91,6 +107,10 @@ $$

<div style="page-break-after: always;"></div>

<div style="text-align: right; margin-bottom: 10px;">
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Wikimedia_logo_text_RGB.svg/330px-Wikimedia_logo_text_RGB.svg.png" alt="Wikimedia logo" height="100"/>
</div>

<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>

Text under image 5.
Expand Down
Binary file modified tests/stub_data/duplicate_media.pdf
Binary file not shown.