Future-House · jamesbraza · Dec 4, 2025 · Dec 2, 2025 · Dec 2, 2025
diff --git a/packages/paper-qa-docling/src/paperqa_docling/reader.py b/packages/paper-qa-docling/src/paperqa_docling/reader.py
@@ -18,6 +18,7 @@
 from docling_core.types.doc import (
     DescriptionAnnotation,
     DocItem,
+    FormulaItem,
     PictureItem,
     TableItem,
     TextItem,
@@ -123,10 +124,14 @@ def parse_pdf_to_pages(  # noqa: PLR0912
         # NOTE: docling pages are 1-indexed
         page_nums = [prov.page_no for prov in item.prov]
 
-        if isinstance(item, TextItem):
+        if isinstance(item, TextItem | FormulaItem):  # Handle items with text
+            item_text = item.text
+            if not item_text and isinstance(item, FormulaItem) and item.orig:
+                # If formula text sanitization left it empty, fall back to the original
+                item_text = item.orig
             for page_num in page_nums:
                 new_text = (
-                    item.text if not content[str(page_num)][0] else "\n\n" + item.text
+                    item_text if not content[str(page_num)][0] else "\n\n" + item_text
                 )
                 total_length += len(new_text)
                 if page_size_limit and total_length > page_size_limit:
@@ -137,7 +142,9 @@ def parse_pdf_to_pages(  # noqa: PLR0912
                     )
                 content[str(page_num)][0] += new_text
 
-        if parse_media and isinstance(item, PictureItem):  # Handle images
+        if parse_media and isinstance(  # Handle images and formulae
+            item, PictureItem | FormulaItem
+        ):
             image_data = item.get_image(doc)
             if image_data:
                 try:
@@ -154,14 +161,16 @@ def parse_pdf_to_pages(  # noqa: PLR0912
                 img_bytes.seek(0)  # Reset pointer before read to avoid empty data
 
                 media_metadata = {
-                    "type": "picture",
+                    "type": "formula" if isinstance(item, FormulaItem) else "picture",
                     "width": image_data.width,
                     "height": image_data.height,
                     "bbox": item.prov[0].bbox.as_tuple(),
                     "images_scale": pipeline_options.images_scale,
                 }
                 annotations = [
-                    x for x in item.annotations if isinstance(x, DescriptionAnnotation)
+                    x
+                    for x in getattr(item, "annotations", [])
+                    if isinstance(x, DescriptionAnnotation)
                 ]
                 if len(annotations) == 1:
                     # We don't set this text in ParsedMedia.text because it's

diff --git a/packages/paper-qa-docling/tests/test_paperqa_docling.py b/packages/paper-qa-docling/tests/test_paperqa_docling.py
@@ -199,12 +199,12 @@ def test_media_deduplication() -> None:
     all_media = [m for _, media in parsed_text.content.values() for m in media]  # type: ignore[misc]
 
     all_images = [m for m in all_media if m.info.get("type") == "picture"]
-    # We allow for one table to be misinterpreted as an image
+    # Allow one table to be misread as an image (+1) and one logo to be missed (-1)
     assert (
-        10 <= len(all_images) <= 11
+        3 * 5 - 1 <= len(all_images) <= 3 * 5 + 1
     ), "Expected each image (one/page) and equation (one/page) to be read"
     assert (
-        len({m for m in all_images if cast(int, m.info["page_num"]) > 1}) <= 2
+        len({m for m in all_images if cast(int, m.info["page_num"]) > 1}) <= 3
     ), "Expected images/equations on all pages beyond 1 to be deduplicated"
 
     all_tables = [m for m in all_media if m.info.get("type") == "table"]

diff --git a/tests/duplicate_media_template.md b/tests/duplicate_media_template.md
@@ -3,10 +3,14 @@
 <!-- pyml disable-num-lines 5 line-length -->
 
 [//]: # "To generate `stub_data/duplicate_media.pdf` from this:"
-[//]: # "1. `pandoc duplicate_media_template.md --standalone --self-contained --katex -t html -o temp.html`"
+[//]: # "1. `pandoc duplicate_media_template.md --standalone --katex -t html -o temp.html`"
 [//]: # "2. `Chromium --headless --disable-gpu --print-to-pdf=stub_data/duplicate_media.pdf --no-pdf-header-footer temp.html`"
 [//]: # "3. `rm temp.html`"
 
+<div style="text-align: right; margin-bottom: 10px;">
+<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Wikimedia_logo_text_RGB.svg/330px-Wikimedia_logo_text_RGB.svg.png" alt="Wikimedia logo" height="100"/>
+</div>
+
 <img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>
 
 Text under image 1.
@@ -28,6 +32,10 @@ $$
 
 <div style="page-break-after: always;"></div>
 
+<div style="text-align: right; margin-bottom: 10px;">
+<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Wikimedia_logo_text_RGB.svg/330px-Wikimedia_logo_text_RGB.svg.png" alt="Wikimedia logo" height="100"/>
+</div>
+
 <img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>
 
 Text under image 2.
@@ -49,6 +57,10 @@ $$
 
 <div style="page-break-after: always;"></div>
 
+<div style="text-align: right; margin-bottom: 10px;">
+<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Wikimedia_logo_text_RGB.svg/330px-Wikimedia_logo_text_RGB.svg.png" alt="Wikimedia logo" height="100"/>
+</div>
+
 <img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>
 
 Text under image 3.
@@ -70,6 +82,10 @@ $$
 
 <div style="page-break-after: always;"></div>
 
+<div style="text-align: right; margin-bottom: 10px;">
+<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Wikimedia_logo_text_RGB.svg/330px-Wikimedia_logo_text_RGB.svg.png" alt="Wikimedia logo" height="100"/>
+</div>
+
 <img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>
 
 Text under image 4.
@@ -91,6 +107,10 @@ $$
 
 <div style="page-break-after: always;"></div>
 
+<div style="text-align: right; margin-bottom: 10px;">
+<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/12/Wikimedia_logo_text_RGB.svg/330px-Wikimedia_logo_text_RGB.svg.png" alt="Wikimedia logo" height="100"/>
+</div>
+
 <img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>
 
 Text under image 5.

diff --git a/tests/stub_data/duplicate_media.pdf b/tests/stub_data/duplicate_media.pdf