diff --git a/packages/paper-qa-docling/src/paperqa_docling/reader.py b/packages/paper-qa-docling/src/paperqa_docling/reader.py index fa3846c61..311d2d2aa 100644 --- a/packages/paper-qa-docling/src/paperqa_docling/reader.py +++ b/packages/paper-qa-docling/src/paperqa_docling/reader.py @@ -18,6 +18,7 @@ from docling_core.types.doc import ( DescriptionAnnotation, DocItem, + FormulaItem, PictureItem, TableItem, TextItem, @@ -123,10 +124,14 @@ def parse_pdf_to_pages( # noqa: PLR0912 # NOTE: docling pages are 1-indexed page_nums = [prov.page_no for prov in item.prov] - if isinstance(item, TextItem): + if isinstance(item, TextItem | FormulaItem): # Handle items with text + item_text = item.text + if not item_text and isinstance(item, FormulaItem) and item.orig: + # Sometimes the sanitization of formula text fails, so use the original + item_text = item.orig for page_num in page_nums: new_text = ( - item.text if not content[str(page_num)][0] else "\n\n" + item.text + item_text if not content[str(page_num)][0] else "\n\n" + item_text ) total_length += len(new_text) if page_size_limit and total_length > page_size_limit: @@ -137,7 +142,9 @@ def parse_pdf_to_pages( # noqa: PLR0912 ) content[str(page_num)][0] += new_text - if parse_media and isinstance(item, PictureItem): # Handle images + if parse_media and isinstance( # Handle images and formulae + item, PictureItem | FormulaItem + ): image_data = item.get_image(doc) if image_data: try: @@ -154,14 +161,16 @@ def parse_pdf_to_pages( # noqa: PLR0912 img_bytes.seek(0) # Reset pointer before read to avoid empty data media_metadata = { - "type": "picture", + "type": "formula" if isinstance(item, FormulaItem) else "picture", "width": image_data.width, "height": image_data.height, "bbox": item.prov[0].bbox.as_tuple(), "images_scale": pipeline_options.images_scale, } annotations = [ - x for x in item.annotations if isinstance(x, DescriptionAnnotation) + x + for x in getattr(item, "annotations", []) + if isinstance(x, DescriptionAnnotation) ] if len(annotations) == 1: # We don't set this text in ParsedMedia.text because it's diff --git a/packages/paper-qa-docling/tests/test_paperqa_docling.py b/packages/paper-qa-docling/tests/test_paperqa_docling.py index 75e286e08..c0d327096 100644 --- a/packages/paper-qa-docling/tests/test_paperqa_docling.py +++ b/packages/paper-qa-docling/tests/test_paperqa_docling.py @@ -199,12 +199,12 @@ def test_media_deduplication() -> None: all_media = [m for _, media in parsed_text.content.values() for m in media] # type: ignore[misc] all_images = [m for m in all_media if m.info.get("type") == "picture"] - # We allow for one table to be misinterpreted as an image + # We allow for one table to be misinterpreted as an image, and one logo to be missed assert ( - 10 <= len(all_images) <= 11 + 3 * 5 - 1 <= len(all_images) <= 3 * 5 + 1 ), "Expected each image (one/page) and equation (one/page) to be read" assert ( - len({m for m in all_images if cast(int, m.info["page_num"]) > 1}) <= 2 + len({m for m in all_images if cast(int, m.info["page_num"]) > 1}) <= 3 ), "Expected images/equations on all pages beyond 1 to be deduplicated" all_tables = [m for m in all_media if m.info.get("type") == "table"] diff --git a/tests/duplicate_media_template.md b/tests/duplicate_media_template.md index 0c9f5fc87..4ed13834a 100644 --- a/tests/duplicate_media_template.md +++ b/tests/duplicate_media_template.md @@ -3,10 +3,14 @@ [//]: # "To generate `stub_data/duplicate_media.pdf` from this:" -[//]: # "1. `pandoc duplicate_media_template.md --standalone --self-contained --katex -t html -o temp.html`" +[//]: # "1. `pandoc duplicate_media_template.md --standalone --katex -t html -o temp.html`" [//]: # "2. `Chromium --headless --disable-gpu --print-to-pdf=stub_data/duplicate_media.pdf --no-pdf-header-footer temp.html`" [//]: # "3. `rm temp.html`" +
+Wikimedia logo +
+ Map of SF districts Text under image 1. @@ -28,6 +32,10 @@ $$
+
+Wikimedia logo +
+ Map of SF districts Text under image 2. @@ -49,6 +57,10 @@ $$
+
+Wikimedia logo +
+ Map of SF districts Text under image 3. @@ -70,6 +82,10 @@ $$
+
+Wikimedia logo +
+ Map of SF districts Text under image 4. @@ -91,6 +107,10 @@ $$
+
+Wikimedia logo +
+ Map of SF districts Text under image 5. diff --git a/tests/stub_data/duplicate_media.pdf b/tests/stub_data/duplicate_media.pdf index 22f662d77..4f3361761 100644 Binary files a/tests/stub_data/duplicate_media.pdf and b/tests/stub_data/duplicate_media.pdf differ