From 0bf5463a1e0b52378274041b31f15f72f8a07047 Mon Sep 17 00:00:00 2001 From: Jacques Dumora Date: Thu, 12 Mar 2026 17:38:53 +0100 Subject: [PATCH 1/3] feat: add Python bindings via PyO3 Expose the pdf-inspector Rust library as a Python package using PyO3 + maturin. Python users can now `pip install` and use `import pdf_inspector` for PDF classification, text extraction, and markdown conversion with native Rust speed. Adds: - src/python.rs: PyO3 bindings (process_pdf, detect_pdf, extract_text, etc.) - pyproject.toml: maturin build configuration - pdf_inspector.pyi: type stubs for IDE support - tests/test_python.py: 21 pytest tests covering all Python API functions - examples/basic_usage.py: example script demonstrating all features - Updated README with Python quick start and API reference Co-Authored-By: Claude Opus 4.6 --- Cargo.toml | 8 ++ README.md | 65 +++++++++- examples/basic_usage.py | 76 ++++++++++++ pdf_inspector.pyi | 56 +++++++++ pyproject.toml | 21 ++++ src/lib.rs | 3 + src/python.rs | 269 ++++++++++++++++++++++++++++++++++++++++ tests/test_python.py | 188 ++++++++++++++++++++++++++++ 8 files changed, 684 insertions(+), 2 deletions(-) create mode 100644 examples/basic_usage.py create mode 100644 pdf_inspector.pyi create mode 100644 pyproject.toml create mode 100644 src/python.rs create mode 100644 tests/test_python.py diff --git a/Cargo.toml b/Cargo.toml index d625bde..f534f95 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,14 @@ description = "Fast PDF inspection, classification, and text extraction with sma license = "MIT" repository = "https://github.com/firecrawl/pdf-inspector" +[lib] +name = "pdf_inspector" +crate-type = ["lib", "cdylib"] + [dependencies] +# Python bindings +pyo3 = { version = "0.22", features = ["extension-module"], optional = true } + # PDF parsing lopdf = { git = "https://github.com/firecrawl/lopdf", branch = "firecrawl/zlib-checksum-encrypted", features = ["rayon"] } @@ -34,6 +41,7 @@ tempfile = "3.3" [features] default = [] +python = ["pyo3"] [[bin]] name = "pdf2md" diff --git a/README.md b/README.md index 762927b..cf1026f 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,70 @@ Built by [Firecrawl](https://firecrawl.dev) to handle text-based PDFs locally in - **Encoding issue detection** — Automatically flags broken font encodings (garbled text, replacement characters) so callers can fall back to OCR. - **Single document load** — The document is parsed once and shared between detection and extraction, avoiding redundant I/O. - **Lightweight** — Pure Rust, no ML models, no external services. Single dependency on `lopdf` for PDF parsing. +- **Python bindings** — Use from Python via PyO3. Install with `pip install pdf-inspector` or build from source with `maturin`. ## Quick start -### As a library +### Python + +Install from source (requires Rust toolchain): + +```bash +pip install maturin +maturin develop --release +``` + +Use it: + +```python +import pdf_inspector + +# Full processing: detect + extract + convert to Markdown +result = pdf_inspector.process_pdf("document.pdf") +print(result.pdf_type) # "text_based", "scanned", "image_based", "mixed" +print(result.confidence) # 0.0 - 1.0 +print(result.page_count) # number of pages +print(result.markdown) # Markdown string or None + +# Process specific pages only +result = pdf_inspector.process_pdf("document.pdf", pages=[1, 3, 5]) + +# Process from bytes (no filesystem needed) +with open("document.pdf", "rb") as f: + result = pdf_inspector.process_pdf_bytes(f.read()) + +# Fast detection only (no text extraction) +result = pdf_inspector.detect_pdf("document.pdf") +if result.pdf_type == "text_based": + print("Can extract locally!") +else: + print(f"Pages needing OCR: {result.pages_needing_ocr}") + +# Plain text extraction +text = pdf_inspector.extract_text("document.pdf") + +# Positioned text items with font info +items = pdf_inspector.extract_text_with_positions("document.pdf") +for item in items[:5]: + print(f"'{item.text}' at ({item.x:.0f}, {item.y:.0f}) size={item.font_size}") +``` + +#### Python API reference + +| Function | Description | +|---|---| +| `process_pdf(path, pages=None)` | Full processing (detect + extract + markdown) | +| `process_pdf_bytes(data, pages=None)` | Full processing from bytes | +| `detect_pdf(path)` | Fast detection only | +| `detect_pdf_bytes(data)` | Fast detection from bytes | +| `extract_text(path)` | Plain text extraction | +| `extract_text_with_positions(path, pages=None)` | Text with X/Y coords and font info | + +**`PdfResult` fields:** `pdf_type`, `markdown`, `page_count`, `processing_time_ms`, `pages_needing_ocr`, `title`, `confidence`, `is_complex_layout`, `pages_with_tables`, `pages_with_columns`, `has_encoding_issues` + +**`TextItem` fields:** `text`, `x`, `y`, `width`, `height`, `font`, `font_size`, `page`, `is_bold`, `is_italic`, `item_type` + +### Rust Add to your `Cargo.toml`: @@ -159,6 +219,7 @@ The document is loaded **once** via `load_document_from_path` / `load_document_f ``` src/ lib.rs — Public API, PdfOptions builder, convenience functions + python.rs — PyO3 Python bindings types.rs — Shared types: TextItem, TextLine, PdfRect, ItemType text_utils.rs — Character/text helpers (CJK, RTL, ligatures, bold/italic) process_mode.rs — ProcessMode enum (DetectOnly, Analyze, Full) @@ -189,7 +250,7 @@ This detects 300+ page PDFs in milliseconds. The result includes `pages_needing_ | `Sample(n)` | Sample `n` evenly distributed pages (first, last, middle) | Very large PDFs where speed matters more than precision | | `Pages(vec)` | Only scan specific 1-indexed page numbers | When the caller knows which pages to check | -## API +## Rust API ### Processing modes diff --git a/examples/basic_usage.py b/examples/basic_usage.py new file mode 100644 index 0000000..3ab14a1 --- /dev/null +++ b/examples/basic_usage.py @@ -0,0 +1,76 @@ +"""Basic usage examples for pdf-inspector Python library.""" + +import sys +import pdf_inspector + + +def main(): + if len(sys.argv) < 2: + print("Usage: python basic_usage.py ") + sys.exit(1) + + path = sys.argv[1] + + # 1. Full processing: detect + extract + markdown + print("=" * 60) + print("Full processing") + print("=" * 60) + result = pdf_inspector.process_pdf(path) + print(f"Type: {result.pdf_type}") + print(f"Pages: {result.page_count}") + print(f"Confidence: {result.confidence:.0%}") + print(f"Time: {result.processing_time_ms}ms") + print(f"Title: {result.title}") + print(f"Complex: {result.is_complex_layout}") + print(f"Tables on: {result.pages_with_tables}") + print(f"Columns on: {result.pages_with_columns}") + print(f"Encoding: {'issues detected' if result.has_encoding_issues else 'ok'}") + print(f"OCR needed: {result.pages_needing_ocr or 'none'}") + if result.markdown: + print(f"\n--- Markdown ({len(result.markdown)} chars) ---") + print(result.markdown[:500]) + if len(result.markdown) > 500: + print(f"\n... ({len(result.markdown) - 500} more chars)") + + # 2. Fast detection only + print("\n" + "=" * 60) + print("Detection only") + print("=" * 60) + info = pdf_inspector.detect_pdf(path) + print(f"Type: {info.pdf_type}") + print(f"Confidence: {info.confidence:.0%}") + print(f"Time: {info.processing_time_ms}ms") + + # 3. From bytes + print("\n" + "=" * 60) + print("From bytes") + print("=" * 60) + with open(path, "rb") as f: + data = f.read() + result = pdf_inspector.process_pdf_bytes(data) + print(f"Type: {result.pdf_type}, Pages: {result.page_count}") + + # 4. Plain text + print("\n" + "=" * 60) + print("Plain text extraction") + print("=" * 60) + text = pdf_inspector.extract_text(path) + print(text[:300]) + + # 5. Positioned items + print("\n" + "=" * 60) + print("Positioned text items (first 10)") + print("=" * 60) + items = pdf_inspector.extract_text_with_positions(path, pages=[1]) + for item in items[:10]: + bold = " [B]" if item.is_bold else "" + italic = " [I]" if item.is_italic else "" + print( + f" p{item.page} ({item.x:6.1f}, {item.y:6.1f}) " + f"size={item.font_size:5.1f}{bold}{italic} " + f"'{item.text}'" + ) + + +if __name__ == "__main__": + main() diff --git a/pdf_inspector.pyi b/pdf_inspector.pyi new file mode 100644 index 0000000..993a59e --- /dev/null +++ b/pdf_inspector.pyi @@ -0,0 +1,56 @@ +"""Type stubs for pdf_inspector.""" + +from typing import Optional + +class PdfResult: + """Result of processing a PDF file.""" + pdf_type: str + """'text_based', 'scanned', 'image_based', or 'mixed'.""" + markdown: Optional[str] + page_count: int + processing_time_ms: int + pages_needing_ocr: list[int] + title: Optional[str] + confidence: float + is_complex_layout: bool + pages_with_tables: list[int] + pages_with_columns: list[int] + has_encoding_issues: bool + +class TextItem: + """A positioned text item extracted from a PDF.""" + text: str + x: float + y: float + width: float + height: float + font: str + font_size: float + page: int + is_bold: bool + is_italic: bool + item_type: str + +def process_pdf(path: str, pages: Optional[list[int]] = None) -> PdfResult: + """Process a PDF: detect type, extract text, convert to Markdown.""" + ... + +def process_pdf_bytes(data: bytes, pages: Optional[list[int]] = None) -> PdfResult: + """Process a PDF from bytes in memory.""" + ... + +def detect_pdf(path: str) -> PdfResult: + """Fast detection only — no text extraction.""" + ... + +def detect_pdf_bytes(data: bytes) -> PdfResult: + """Fast detection from bytes.""" + ... + +def extract_text(path: str) -> str: + """Extract plain text from a PDF.""" + ... + +def extract_text_with_positions(path: str, pages: Optional[list[int]] = None) -> list[TextItem]: + """Extract text with position information.""" + ... diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f925a0e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,21 @@ +[build-system] +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" + +[project] +name = "pdf-inspector" +version = "0.1.0" +description = "Fast PDF inspection, classification, and text extraction with smart scanned vs text-based detection" +license = { text = "MIT" } +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Topic :: Text Processing", +] + +[tool.maturin] +features = ["python"] diff --git a/src/lib.rs b/src/lib.rs index aebab7a..f80718c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,6 +22,9 @@ //! ).unwrap(); //! ``` +#[cfg(feature = "python")] +pub mod python; + pub mod adobe_korea1; pub mod detector; pub mod extractor; diff --git a/src/python.rs b/src/python.rs new file mode 100644 index 0000000..3fb645d --- /dev/null +++ b/src/python.rs @@ -0,0 +1,269 @@ +//! PyO3 Python bindings for pdf-inspector. + +use pyo3::prelude::*; +use pyo3::exceptions::PyValueError; +use std::collections::HashSet; + +use crate::detector::PdfType; +use crate::types::ItemType; + +// --------------------------------------------------------------------------- +// Result wrapper +// --------------------------------------------------------------------------- + +/// Result of processing a PDF file. +#[pyclass(name = "PdfResult")] +#[derive(Clone)] +pub struct PyPdfResult { + /// The detected PDF type: "text_based", "scanned", "image_based", or "mixed". + #[pyo3(get)] + pub pdf_type: String, + /// Markdown output (None if detect-only or scanned PDF). + #[pyo3(get)] + pub markdown: Option, + /// Total number of pages. + #[pyo3(get)] + pub page_count: u32, + /// Processing time in milliseconds. + #[pyo3(get)] + pub processing_time_ms: u64, + /// 1-indexed page numbers that need OCR. + #[pyo3(get)] + pub pages_needing_ocr: Vec, + /// Title from PDF metadata. + #[pyo3(get)] + pub title: Option, + /// Detection confidence (0.0-1.0). + #[pyo3(get)] + pub confidence: f32, + /// Whether the layout is complex (tables/columns detected). + #[pyo3(get)] + pub is_complex_layout: bool, + /// Pages with tables detected. + #[pyo3(get)] + pub pages_with_tables: Vec, + /// Pages with multi-column layout. + #[pyo3(get)] + pub pages_with_columns: Vec, + /// Whether encoding issues were detected. + #[pyo3(get)] + pub has_encoding_issues: bool, +} + +#[pymethods] +impl PyPdfResult { + fn __repr__(&self) -> String { + format!( + "PdfResult(pdf_type='{}', pages={}, confidence={:.2})", + self.pdf_type, self.page_count, self.confidence + ) + } +} + +fn pdf_type_str(t: PdfType) -> String { + match t { + PdfType::TextBased => "text_based".into(), + PdfType::Scanned => "scanned".into(), + PdfType::ImageBased => "image_based".into(), + PdfType::Mixed => "mixed".into(), + } +} + +fn to_py_result(r: crate::PdfProcessResult) -> PyPdfResult { + PyPdfResult { + pdf_type: pdf_type_str(r.pdf_type), + markdown: r.markdown, + page_count: r.page_count, + processing_time_ms: r.processing_time_ms, + pages_needing_ocr: r.pages_needing_ocr, + title: r.title, + confidence: r.confidence, + is_complex_layout: r.layout.is_complex, + pages_with_tables: r.layout.pages_with_tables, + pages_with_columns: r.layout.pages_with_columns, + has_encoding_issues: r.has_encoding_issues, + } +} + +fn to_py_err(e: crate::PdfError) -> PyErr { + PyValueError::new_err(e.to_string()) +} + +// --------------------------------------------------------------------------- +// Text item wrapper +// --------------------------------------------------------------------------- + +/// A positioned text item extracted from a PDF. +#[pyclass(name = "TextItem")] +#[derive(Clone)] +pub struct PyTextItem { + #[pyo3(get)] + pub text: String, + #[pyo3(get)] + pub x: f32, + #[pyo3(get)] + pub y: f32, + #[pyo3(get)] + pub width: f32, + #[pyo3(get)] + pub height: f32, + #[pyo3(get)] + pub font: String, + #[pyo3(get)] + pub font_size: f32, + #[pyo3(get)] + pub page: u32, + #[pyo3(get)] + pub is_bold: bool, + #[pyo3(get)] + pub is_italic: bool, + #[pyo3(get)] + pub item_type: String, +} + +#[pymethods] +impl PyTextItem { + fn __repr__(&self) -> String { + format!( + "TextItem(text='{}', page={}, x={:.1}, y={:.1})", + self.text.chars().take(40).collect::(), + self.page, + self.x, + self.y, + ) + } +} + +fn item_type_str(t: &ItemType) -> String { + match t { + ItemType::Text => "text".into(), + ItemType::Image => "image".into(), + ItemType::Link(url) => format!("link:{url}"), + ItemType::FormField => "form_field".into(), + } +} + +// --------------------------------------------------------------------------- +// Public Python API +// --------------------------------------------------------------------------- + +/// Process a PDF file: detect type, extract text, and convert to Markdown. +/// +/// Args: +/// path: Path to the PDF file. +/// pages: Optional list of 1-indexed page numbers to process. +/// +/// Returns: +/// PdfResult with markdown, pdf_type, and metadata. +#[pyfunction] +#[pyo3(signature = (path, pages=None))] +fn process_pdf(path: &str, pages: Option>) -> PyResult { + let mut opts = crate::PdfOptions::new(); + if let Some(p) = pages { + opts = opts.pages(p); + } + let result = crate::process_pdf_with_options(path, opts).map_err(to_py_err)?; + Ok(to_py_result(result)) +} + +/// Process a PDF from bytes in memory. +/// +/// Args: +/// data: PDF file contents as bytes. +/// pages: Optional list of 1-indexed page numbers to process. +/// +/// Returns: +/// PdfResult with markdown, pdf_type, and metadata. +#[pyfunction] +#[pyo3(signature = (data, pages=None))] +fn process_pdf_bytes(data: &[u8], pages: Option>) -> PyResult { + let mut opts = crate::PdfOptions::new(); + if let Some(p) = pages { + opts = opts.pages(p); + } + let result = crate::process_pdf_mem_with_options(data, opts).map_err(to_py_err)?; + Ok(to_py_result(result)) +} + +/// Fast detection only — no text extraction or markdown. +/// +/// Args: +/// path: Path to the PDF file. +/// +/// Returns: +/// PdfResult with pdf_type and metadata (markdown will be None). +#[pyfunction] +fn detect_pdf(path: &str) -> PyResult { + let result = crate::detect_pdf(path).map_err(to_py_err)?; + Ok(to_py_result(result)) +} + +/// Fast detection from bytes — no text extraction or markdown. +#[pyfunction] +fn detect_pdf_bytes(data: &[u8]) -> PyResult { + let result = crate::detect_pdf_mem(data).map_err(to_py_err)?; + Ok(to_py_result(result)) +} + +/// Extract plain text from a PDF file. +/// +/// Args: +/// path: Path to the PDF file. +/// +/// Returns: +/// Extracted text as a string. +#[pyfunction] +fn extract_text(path: &str) -> PyResult { + crate::extract_text(path).map_err(to_py_err) +} + +/// Extract text with position information. +/// +/// Args: +/// path: Path to the PDF file. +/// pages: Optional list of 1-indexed page numbers. +/// +/// Returns: +/// List of TextItem objects with text, position, font info. +#[pyfunction] +#[pyo3(signature = (path, pages=None))] +fn extract_text_with_positions(path: &str, pages: Option>) -> PyResult> { + let items = match pages { + Some(p) => { + let page_set: HashSet = p.into_iter().collect(); + crate::extract_text_with_positions_pages(path, Some(&page_set)).map_err(to_py_err)? + } + None => crate::extract_text_with_positions(path).map_err(to_py_err)?, + }; + + Ok(items + .into_iter() + .map(|item| PyTextItem { + text: item.text, + x: item.x, + y: item.y, + width: item.width, + height: item.height, + font: item.font, + font_size: item.font_size, + page: item.page, + is_bold: item.is_bold, + is_italic: item.is_italic, + item_type: item_type_str(&item.item_type), + }) + .collect()) +} + +/// Python module definition. +#[pymodule] +fn pdf_inspector(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_function(wrap_pyfunction!(process_pdf, m)?)?; + m.add_function(wrap_pyfunction!(process_pdf_bytes, m)?)?; + m.add_function(wrap_pyfunction!(detect_pdf, m)?)?; + m.add_function(wrap_pyfunction!(detect_pdf_bytes, m)?)?; + m.add_function(wrap_pyfunction!(extract_text, m)?)?; + m.add_function(wrap_pyfunction!(extract_text_with_positions, m)?)?; + Ok(()) +} diff --git a/tests/test_python.py b/tests/test_python.py new file mode 100644 index 0000000..379f8b4 --- /dev/null +++ b/tests/test_python.py @@ -0,0 +1,188 @@ +"""Tests for the pdf_inspector Python bindings.""" + +import os +import pytest +import pdf_inspector + +FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures") + + +def fixture_path(name: str) -> str: + return os.path.join(FIXTURES_DIR, name) + + +# --------------------------------------------------------------------------- +# process_pdf +# --------------------------------------------------------------------------- + + +class TestProcessPdf: + def test_basic(self): + result = pdf_inspector.process_pdf(fixture_path("thermo-freon12.pdf")) + assert result.pdf_type == "text_based" + assert result.page_count == 3 + assert result.confidence > 0.0 + assert result.markdown is not None + assert len(result.markdown) > 0 + + def test_result_repr(self): + result = pdf_inspector.process_pdf(fixture_path("thermo-freon12.pdf")) + r = repr(result) + assert "PdfResult" in r + assert "text_based" in r + + def test_with_pages(self): + result = pdf_inspector.process_pdf( + fixture_path("thermo-freon12.pdf"), pages=[1] + ) + assert result.page_count == 3 # total pages in doc + assert result.markdown is not None + + def test_result_fields(self): + result = pdf_inspector.process_pdf(fixture_path("thermo-freon12.pdf")) + # All fields should be accessible + assert isinstance(result.pdf_type, str) + assert isinstance(result.page_count, int) + assert isinstance(result.processing_time_ms, int) + assert isinstance(result.pages_needing_ocr, list) + assert isinstance(result.confidence, float) + assert isinstance(result.is_complex_layout, bool) + assert isinstance(result.pages_with_tables, list) + assert isinstance(result.pages_with_columns, list) + assert isinstance(result.has_encoding_issues, bool) + # title can be None or str + assert result.title is None or isinstance(result.title, str) + + +# --------------------------------------------------------------------------- +# process_pdf_bytes +# --------------------------------------------------------------------------- + + +class TestProcessPdfBytes: + def test_basic(self): + with open(fixture_path("thermo-freon12.pdf"), "rb") as f: + data = f.read() + result = pdf_inspector.process_pdf_bytes(data) + assert result.pdf_type == "text_based" + assert result.markdown is not None + + def test_with_pages(self): + with open(fixture_path("thermo-freon12.pdf"), "rb") as f: + data = f.read() + result = pdf_inspector.process_pdf_bytes(data, pages=[1, 2]) + assert result.markdown is not None + + +# --------------------------------------------------------------------------- +# detect_pdf / detect_pdf_bytes +# --------------------------------------------------------------------------- + + +class TestDetectPdf: + def test_detect_file(self): + result = pdf_inspector.detect_pdf(fixture_path("thermo-freon12.pdf")) + assert result.pdf_type == "text_based" + assert result.markdown is None # detect only — no markdown + assert result.page_count == 3 + + def test_detect_bytes(self): + with open(fixture_path("thermo-freon12.pdf"), "rb") as f: + data = f.read() + result = pdf_inspector.detect_pdf_bytes(data) + assert result.pdf_type == "text_based" + assert result.markdown is None + + +# --------------------------------------------------------------------------- +# extract_text +# --------------------------------------------------------------------------- + + +class TestExtractText: + def test_basic(self): + text = pdf_inspector.extract_text(fixture_path("thermo-freon12.pdf")) + assert isinstance(text, str) + assert len(text) > 0 + + +# --------------------------------------------------------------------------- +# extract_text_with_positions +# --------------------------------------------------------------------------- + + +class TestExtractTextWithPositions: + def test_basic(self): + items = pdf_inspector.extract_text_with_positions( + fixture_path("thermo-freon12.pdf") + ) + assert len(items) > 0 + item = items[0] + assert isinstance(item.text, str) + assert isinstance(item.x, float) + assert isinstance(item.y, float) + assert isinstance(item.width, float) + assert isinstance(item.height, float) + assert isinstance(item.font, str) + assert isinstance(item.font_size, float) + assert isinstance(item.page, int) + assert isinstance(item.is_bold, bool) + assert isinstance(item.is_italic, bool) + assert isinstance(item.item_type, str) + + def test_with_pages(self): + items = pdf_inspector.extract_text_with_positions( + fixture_path("thermo-freon12.pdf"), pages=[1] + ) + assert len(items) > 0 + assert all(item.page == 1 for item in items) + + def test_repr(self): + items = pdf_inspector.extract_text_with_positions( + fixture_path("thermo-freon12.pdf") + ) + r = repr(items[0]) + assert "TextItem" in r + + +# --------------------------------------------------------------------------- +# Error handling +# --------------------------------------------------------------------------- + + +class TestErrors: + def test_nonexistent_file(self): + with pytest.raises(ValueError): + pdf_inspector.process_pdf("/nonexistent/file.pdf") + + def test_not_a_pdf(self): + with pytest.raises(ValueError): + pdf_inspector.process_pdf_bytes(b"this is not a pdf") + + def test_empty_bytes(self): + with pytest.raises(ValueError): + pdf_inspector.process_pdf_bytes(b"") + + +# --------------------------------------------------------------------------- +# Multiple fixtures +# --------------------------------------------------------------------------- + + +class TestMultipleFixtures: + """Run basic processing on all available test fixtures.""" + + @pytest.mark.parametrize( + "filename", + [f for f in os.listdir(FIXTURES_DIR) if f.endswith(".pdf")], + ) + def test_process_all_fixtures(self, filename): + result = pdf_inspector.process_pdf(fixture_path(filename)) + assert result.pdf_type in ( + "text_based", + "scanned", + "image_based", + "mixed", + ) + assert result.page_count > 0 + assert result.confidence >= 0.0 From 506b2a0c7097223e9ba82dd18f3c48a9d15e7e7c Mon Sep 17 00:00:00 2001 From: Abimael Martell Date: Thu, 2 Apr 2026 11:48:18 -0700 Subject: [PATCH 2/3] unify NAPI and Python binding APIs for consistent surface Both bindings now expose the same 6 function families: process, detect, classify, extractText, extractTextWithPositions, and extractTextInRegions. Bumps PyO3 from 0.22 to 0.25 for Python 3.14 support. Co-Authored-By: Claude Opus 4.6 --- Cargo.toml | 2 +- README.md | 52 ++++++- examples/basic_usage.py | 22 +++ napi/build.rs | 2 +- napi/index.d.ts | 51 ++++++- napi/index.js | 4 + napi/src/lib.rs | 278 +++++++++++++++++++++++++--------- napi/test.mjs | 98 ++++++++++++ pdf_inspector.pyi | 61 ++++++++ src/python.rs | 328 +++++++++++++++++++++++++++++++--------- tests/test_python.py | 152 ++++++++++++++++++- 11 files changed, 893 insertions(+), 157 deletions(-) create mode 100644 napi/test.mjs diff --git a/Cargo.toml b/Cargo.toml index f1d7a8b..69c0af6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ crate-type = ["lib", "cdylib"] [dependencies] # Python bindings -pyo3 = { version = "0.22", features = ["extension-module"], optional = true } +pyo3 = { version = "0.25", features = ["extension-module"], optional = true } # PDF parsing lopdf = { git = "https://github.com/J-F-Liu/lopdf", rev = "052674053814a9f4897af94f0b8e46a545c9b329", features = ["rayon"] } diff --git a/README.md b/README.md index cf1026f..d4a6fbd 100644 --- a/README.md +++ b/README.md @@ -69,15 +69,65 @@ for item in items[:5]: |---|---| | `process_pdf(path, pages=None)` | Full processing (detect + extract + markdown) | | `process_pdf_bytes(data, pages=None)` | Full processing from bytes | -| `detect_pdf(path)` | Fast detection only | +| `detect_pdf(path)` | Fast detection only (returns PdfResult) | | `detect_pdf_bytes(data)` | Fast detection from bytes | +| `classify_pdf(path)` | Lightweight classification (returns PdfClassification) | +| `classify_pdf_bytes(data)` | Lightweight classification from bytes | | `extract_text(path)` | Plain text extraction | +| `extract_text_bytes(data)` | Plain text extraction from bytes | | `extract_text_with_positions(path, pages=None)` | Text with X/Y coords and font info | +| `extract_text_with_positions_bytes(data, pages=None)` | Text with positions from bytes | +| `extract_text_in_regions(path, page_regions)` | Extract text in bounding-box regions | +| `extract_text_in_regions_bytes(data, page_regions)` | Region extraction from bytes | **`PdfResult` fields:** `pdf_type`, `markdown`, `page_count`, `processing_time_ms`, `pages_needing_ocr`, `title`, `confidence`, `is_complex_layout`, `pages_with_tables`, `pages_with_columns`, `has_encoding_issues` +**`PdfClassification` fields:** `pdf_type`, `page_count`, `pages_needing_ocr` (0-indexed), `confidence` + **`TextItem` fields:** `text`, `x`, `y`, `width`, `height`, `font`, `font_size`, `page`, `is_bold`, `is_italic`, `item_type` +**`RegionText` fields:** `text`, `needs_ocr` + +**`PageRegionTexts` fields:** `page` (0-indexed), `regions` (list of RegionText) + +### Node.js (NAPI) + +```bash +npm install @firecrawl/pdf-inspector-js +``` + +```javascript +import { readFileSync } from 'fs'; +import { processPdf, classifyPdf, extractTextInRegions } from '@firecrawl/pdf-inspector-js'; + +const buffer = readFileSync('document.pdf'); + +// Full processing +const result = processPdf(buffer); +console.log(result.pdfType); // "TextBased", "Scanned", "ImageBased", "Mixed" +console.log(result.markdown); // Markdown string or null + +// Lightweight classification +const cls = classifyPdf(buffer); +console.log(cls.pdfType, cls.pagesNeedingOcr); + +// Region-based extraction (for hybrid OCR pipelines) +const regions = extractTextInRegions(buffer, [ + { page: 0, regions: [[0, 0, 600, 100]] } +]); +``` + +#### Node.js API reference + +| Function | Description | +|---|---| +| `processPdf(buffer, pages?)` | Full processing (detect + extract + markdown) | +| `detectPdf(buffer)` | Fast detection only (returns PdfResult) | +| `classifyPdf(buffer)` | Lightweight classification (returns PdfClassification) | +| `extractText(buffer)` | Plain text extraction | +| `extractTextWithPositions(buffer, pages?)` | Text with X/Y coords and font info | +| `extractTextInRegions(buffer, pageRegions)` | Extract text in bounding-box regions | + ### Rust Add to your `Cargo.toml`: diff --git a/examples/basic_usage.py b/examples/basic_usage.py index 3ab14a1..96401dc 100644 --- a/examples/basic_usage.py +++ b/examples/basic_usage.py @@ -71,6 +71,28 @@ def main(): f"'{item.text}'" ) + # 6. Lightweight classification + print("\n" + "=" * 60) + print("Lightweight classification") + print("=" * 60) + cls = pdf_inspector.classify_pdf(path) + print(f"Type: {cls.pdf_type}") + print(f"Pages: {cls.page_count}") + print(f"Confidence: {cls.confidence:.0%}") + print(f"OCR pages: {cls.pages_needing_ocr or 'none'} (0-indexed)") + + # 7. Region-based text extraction + print("\n" + "=" * 60) + print("Region-based text extraction (page 0, top region)") + print("=" * 60) + regions = pdf_inspector.extract_text_in_regions( + path, [(0, [[0.0, 0.0, 600.0, 200.0]])] + ) + for page_result in regions: + for i, region in enumerate(page_result.regions): + print(f" Region {i}: needs_ocr={region.needs_ocr}") + print(f" Text: {region.text[:200]}") + if __name__ == "__main__": main() diff --git a/napi/build.rs b/napi/build.rs index bbfc9e4..0f1b010 100644 --- a/napi/build.rs +++ b/napi/build.rs @@ -1,3 +1,3 @@ fn main() { - napi_build::setup(); + napi_build::setup(); } diff --git a/napi/index.d.ts b/napi/index.d.ts index 8254452..904cb76 100644 --- a/napi/index.d.ts +++ b/napi/index.d.ts @@ -1,11 +1,18 @@ /* auto-generated by NAPI-RS */ /* eslint-disable */ /** - * Classify a PDF: detect type (TextBased/Scanned/Mixed/ImageBased), - * page count, and which pages need OCR. Takes PDF bytes as Buffer. + * Lightweight PDF classification — returns type, page count, and OCR pages. + * Faster than detectPdf as it skips building the full PdfResult. + * Pages in pagesNeedingOcr are 0-indexed. */ export declare function classifyPdf(buffer: Buffer): PdfClassification +/** Fast detection only — no text extraction or markdown. */ +export declare function detectPdf(buffer: Buffer): PdfResult + +/** Extract plain text from a PDF Buffer. */ +export declare function extractText(buffer: Buffer): string + /** * Extract text within bounding-box regions from a PDF. * @@ -13,13 +20,16 @@ export declare function classifyPdf(buffer: Buffer): PdfClassification * this extracts PDF text within those regions — skipping GPU OCR * for text-based pages. * - * Each region result includes `needs_ocr` — set when the extracted text + * Each region result includes `needsOcr` — set when the extracted text * is unreliable (empty, GID-encoded fonts, garbage, encoding issues). * * Coordinates are PDF points with top-left origin. */ export declare function extractTextInRegions(buffer: Buffer, pageRegions: Array): Array +/** Extract text with position information from a PDF Buffer. */ +export declare function extractTextWithPositions(buffer: Buffer, pages?: Array | undefined | null): Array + /** A page's regions for text extraction: (page_index_0based, bboxes). */ export interface PageRegions { page: number @@ -37,13 +47,48 @@ export interface PageRegionTexts { export interface PdfClassification { pdfType: string pageCount: number + /** 0-indexed page numbers that need OCR. */ + pagesNeedingOcr: Array + confidence: number +} + +/** Full PDF processing result with markdown and metadata. */ +export interface PdfResult { + pdfType: string + markdown?: string + pageCount: number + processingTimeMs: number + /** 1-indexed page numbers that need OCR. */ pagesNeedingOcr: Array + title?: string confidence: number + isComplexLayout: boolean + pagesWithTables: Array + pagesWithColumns: Array + hasEncodingIssues: boolean } +/** Process a PDF from a Buffer: detect type, extract text, and convert to Markdown. */ +export declare function processPdf(buffer: Buffer, pages?: Array | undefined | null): PdfResult + /** Extracted text for a single region. */ export interface RegionText { text: string /** `true` when the text should not be trusted (empty, GID fonts, garbage, encoding issues). */ needsOcr: boolean } + +/** A positioned text item extracted from a PDF. */ +export interface TextItem { + text: string + x: number + y: number + width: number + height: number + font: string + fontSize: number + page: number + isBold: boolean + isItalic: boolean + itemType: string +} diff --git a/napi/index.js b/napi/index.js index 23f7a6e..be1f71d 100644 --- a/napi/index.js +++ b/napi/index.js @@ -577,4 +577,8 @@ if (!nativeBinding) { module.exports = nativeBinding module.exports.classifyPdf = nativeBinding.classifyPdf +module.exports.detectPdf = nativeBinding.detectPdf +module.exports.extractText = nativeBinding.extractText module.exports.extractTextInRegions = nativeBinding.extractTextInRegions +module.exports.extractTextWithPositions = nativeBinding.extractTextWithPositions +module.exports.processPdf = nativeBinding.processPdf diff --git a/napi/src/lib.rs b/napi/src/lib.rs index c13ffdf..ceff67c 100644 --- a/napi/src/lib.rs +++ b/napi/src/lib.rs @@ -2,58 +2,201 @@ use napi::bindgen_prelude::*; use napi_derive::napi; +use std::collections::HashSet; + +// --------------------------------------------------------------------------- +// Result types +// --------------------------------------------------------------------------- + +/// Full PDF processing result with markdown and metadata. +#[napi(object)] +pub struct PdfResult { + pub pdf_type: String, + pub markdown: Option, + pub page_count: u32, + pub processing_time_ms: u32, + /// 1-indexed page numbers that need OCR. + pub pages_needing_ocr: Vec, + pub title: Option, + pub confidence: f64, + pub is_complex_layout: bool, + pub pages_with_tables: Vec, + pub pages_with_columns: Vec, + pub has_encoding_issues: bool, +} /// Lightweight PDF classification result. #[napi(object)] pub struct PdfClassification { - pub pdf_type: String, - pub page_count: u32, - pub pages_needing_ocr: Vec, - pub confidence: f64, + pub pdf_type: String, + pub page_count: u32, + /// 0-indexed page numbers that need OCR. + pub pages_needing_ocr: Vec, + pub confidence: f64, +} + +/// A positioned text item extracted from a PDF. +#[napi(object)] +pub struct TextItem { + pub text: String, + pub x: f64, + pub y: f64, + pub width: f64, + pub height: f64, + pub font: String, + pub font_size: f64, + pub page: u32, + pub is_bold: bool, + pub is_italic: bool, + pub item_type: String, } /// A page's regions for text extraction: (page_index_0based, bboxes). #[napi(object)] pub struct PageRegions { - pub page: u32, - /// Each bbox is [x1, y1, x2, y2] in PDF points, top-left origin. - pub regions: Vec>, + pub page: u32, + /// Each bbox is [x1, y1, x2, y2] in PDF points, top-left origin. + pub regions: Vec>, } /// Extracted text for a single region. #[napi(object)] pub struct RegionText { - pub text: String, - /// `true` when the text should not be trusted (empty, GID fonts, garbage, encoding issues). - pub needs_ocr: bool, + pub text: String, + /// `true` when the text should not be trusted (empty, GID fonts, garbage, encoding issues). + pub needs_ocr: bool, } /// Extracted text for one page's regions. #[napi(object)] pub struct PageRegionTexts { - pub page: u32, - pub regions: Vec, + pub page: u32, + pub regions: Vec, +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +fn pdf_type_string(t: pdf_inspector::PdfType) -> String { + match t { + pdf_inspector::PdfType::TextBased => "TextBased".to_string(), + pdf_inspector::PdfType::Scanned => "Scanned".to_string(), + pdf_inspector::PdfType::ImageBased => "ImageBased".to_string(), + pdf_inspector::PdfType::Mixed => "Mixed".to_string(), + } } -/// Classify a PDF: detect type (TextBased/Scanned/Mixed/ImageBased), -/// page count, and which pages need OCR. Takes PDF bytes as Buffer. +fn to_napi_result(r: pdf_inspector::PdfProcessResult) -> PdfResult { + PdfResult { + pdf_type: pdf_type_string(r.pdf_type), + markdown: r.markdown, + page_count: r.page_count, + processing_time_ms: r.processing_time_ms as u32, + pages_needing_ocr: r.pages_needing_ocr, + title: r.title, + confidence: r.confidence as f64, + is_complex_layout: r.layout.is_complex, + pages_with_tables: r.layout.pages_with_tables, + pages_with_columns: r.layout.pages_with_columns, + has_encoding_issues: r.has_encoding_issues, + } +} + +fn item_type_string(t: &pdf_inspector::types::ItemType) -> String { + match t { + pdf_inspector::types::ItemType::Text => "text".into(), + pdf_inspector::types::ItemType::Image => "image".into(), + pdf_inspector::types::ItemType::Link(url) => format!("link:{url}"), + pdf_inspector::types::ItemType::FormField => "form_field".into(), + } +} + +fn to_napi_err(e: impl std::fmt::Display, ctx: &str) -> Error { + Error::new(Status::GenericFailure, format!("{ctx}: {e}")) +} + +// --------------------------------------------------------------------------- +// Public NAPI API +// --------------------------------------------------------------------------- + +/// Process a PDF from a Buffer: detect type, extract text, and convert to Markdown. +#[napi] +pub fn process_pdf(buffer: Buffer, pages: Option>) -> Result { + let mut opts = pdf_inspector::PdfOptions::new(); + if let Some(p) = pages { + opts = opts.pages(p); + } + let result = pdf_inspector::process_pdf_mem_with_options(&buffer, opts) + .map_err(|e| to_napi_err(e, "process_pdf"))?; + Ok(to_napi_result(result)) +} + +/// Fast detection only — no text extraction or markdown. +#[napi] +pub fn detect_pdf(buffer: Buffer) -> Result { + let result = + pdf_inspector::detect_pdf_mem(&buffer).map_err(|e| to_napi_err(e, "detect_pdf"))?; + Ok(to_napi_result(result)) +} + +/// Lightweight PDF classification — returns type, page count, and OCR pages. +/// Faster than detectPdf as it skips building the full PdfResult. +/// Pages in pagesNeedingOcr are 0-indexed. #[napi] pub fn classify_pdf(buffer: Buffer) -> Result { - let result = pdf_inspector::classify_pdf_mem(&buffer).map_err(|e| { - Error::new(Status::GenericFailure, format!("classify_pdf failed: {e}")) - })?; - - Ok(PdfClassification { - pdf_type: match result.pdf_type { - pdf_inspector::PdfType::TextBased => "TextBased".to_string(), - pdf_inspector::PdfType::Scanned => "Scanned".to_string(), - pdf_inspector::PdfType::ImageBased => "ImageBased".to_string(), - pdf_inspector::PdfType::Mixed => "Mixed".to_string(), - }, - page_count: result.page_count, - pages_needing_ocr: result.pages_needing_ocr, - confidence: result.confidence as f64, - }) + let result = + pdf_inspector::classify_pdf_mem(&buffer).map_err(|e| to_napi_err(e, "classify_pdf"))?; + + Ok(PdfClassification { + pdf_type: pdf_type_string(result.pdf_type), + page_count: result.page_count, + pages_needing_ocr: result.pages_needing_ocr, + confidence: result.confidence as f64, + }) +} + +/// Extract plain text from a PDF Buffer. +#[napi] +pub fn extract_text(buffer: Buffer) -> Result { + pdf_inspector::extractor::extract_text_mem(&buffer).map_err(|e| to_napi_err(e, "extract_text")) +} + +/// Extract text with position information from a PDF Buffer. +#[napi] +pub fn extract_text_with_positions( + buffer: Buffer, + pages: Option>, +) -> Result> { + let items = match pages { + Some(p) => { + let page_set: HashSet = p.into_iter().collect(); + pdf_inspector::extractor::extract_text_with_positions_mem_pages( + &buffer, + Some(&page_set), + ) + .map_err(|e| to_napi_err(e, "extract_text_with_positions"))? + } + None => pdf_inspector::extractor::extract_text_with_positions_mem(&buffer) + .map_err(|e| to_napi_err(e, "extract_text_with_positions"))?, + }; + + Ok(items + .into_iter() + .map(|item| TextItem { + text: item.text, + x: item.x as f64, + y: item.y as f64, + width: item.width as f64, + height: item.height as f64, + font: item.font, + font_size: item.font_size as f64, + page: item.page, + is_bold: item.is_bold, + is_italic: item.is_italic, + item_type: item_type_string(&item.item_type), + }) + .collect()) } /// Extract text within bounding-box regions from a PDF. @@ -62,55 +205,48 @@ pub fn classify_pdf(buffer: Buffer) -> Result { /// this extracts PDF text within those regions — skipping GPU OCR /// for text-based pages. /// -/// Each region result includes `needs_ocr` — set when the extracted text +/// Each region result includes `needsOcr` — set when the extracted text /// is unreliable (empty, GID-encoded fonts, garbage, encoding issues). /// /// Coordinates are PDF points with top-left origin. #[napi] pub fn extract_text_in_regions( - buffer: Buffer, - page_regions: Vec, + buffer: Buffer, + page_regions: Vec, ) -> Result> { - // Convert from napi types to the Rust API's expected format - let regions: Vec<(u32, Vec<[f32; 4]>)> = page_regions - .iter() - .map(|pr| { - let bboxes: Vec<[f32; 4]> = pr - .regions + let regions: Vec<(u32, Vec<[f32; 4]>)> = page_regions .iter() - .map(|r| { - if r.len() != 4 { - [0.0, 0.0, 0.0, 0.0] - } else { - [r[0] as f32, r[1] as f32, r[2] as f32, r[3] as f32] - } + .map(|pr| { + let bboxes: Vec<[f32; 4]> = pr + .regions + .iter() + .map(|r| { + if r.len() != 4 { + [0.0, 0.0, 0.0, 0.0] + } else { + [r[0] as f32, r[1] as f32, r[2] as f32, r[3] as f32] + } + }) + .collect(); + (pr.page, bboxes) }) .collect(); - (pr.page, bboxes) - }) - .collect(); - - let results = pdf_inspector::extract_text_in_regions_mem(&buffer, ®ions).map_err(|e| { - Error::new( - Status::GenericFailure, - format!("extract_text_in_regions failed: {e}"), - ) - })?; - - Ok( - results - .into_iter() - .map(|page_result| PageRegionTexts { - page: page_result.page, - regions: page_result - .regions - .into_iter() - .map(|r| RegionText { - text: r.text, - needs_ocr: r.needs_ocr, - }) - .collect(), - }) - .collect(), - ) + + let results = pdf_inspector::extract_text_in_regions_mem(&buffer, ®ions) + .map_err(|e| to_napi_err(e, "extract_text_in_regions"))?; + + Ok(results + .into_iter() + .map(|page_result| PageRegionTexts { + page: page_result.page, + regions: page_result + .regions + .into_iter() + .map(|r| RegionText { + text: r.text, + needs_ocr: r.needs_ocr, + }) + .collect(), + }) + .collect()) } diff --git a/napi/test.mjs b/napi/test.mjs new file mode 100644 index 0000000..64f1490 --- /dev/null +++ b/napi/test.mjs @@ -0,0 +1,98 @@ +import { readFileSync } from 'fs'; +import { strict as assert } from 'assert'; +import { + processPdf, + detectPdf, + classifyPdf, + extractText, + extractTextWithPositions, + extractTextInRegions, +} from './index.js'; + +const fixture = readFileSync('../tests/fixtures/thermo-freon12.pdf'); + +// --- processPdf --- +console.log('Testing processPdf...'); +const result = processPdf(fixture); +assert.equal(result.pdfType, 'TextBased'); +assert.equal(result.pageCount, 3); +assert.ok(result.confidence > 0); +assert.ok(result.markdown && result.markdown.length > 0); +assert.equal(typeof result.isComplexLayout, 'boolean'); +assert.ok(Array.isArray(result.pagesWithTables)); +assert.ok(Array.isArray(result.pagesWithColumns)); +assert.equal(typeof result.hasEncodingIssues, 'boolean'); +console.log(' processPdf: OK'); + +// processPdf with pages +const result2 = processPdf(fixture, [1]); +assert.ok(result2.markdown && result2.markdown.length > 0); +console.log(' processPdf with pages: OK'); + +// --- detectPdf --- +console.log('Testing detectPdf...'); +const detected = detectPdf(fixture); +assert.equal(detected.pdfType, 'TextBased'); +assert.equal(detected.pageCount, 3); +assert.equal(detected.markdown, undefined); +console.log(' detectPdf: OK'); + +// --- classifyPdf --- +console.log('Testing classifyPdf...'); +const classified = classifyPdf(fixture); +assert.equal(classified.pdfType, 'TextBased'); +assert.equal(classified.pageCount, 3); +assert.ok(classified.confidence > 0); +assert.ok(Array.isArray(classified.pagesNeedingOcr)); +console.log(' classifyPdf: OK'); + +// --- extractText --- +console.log('Testing extractText...'); +const text = extractText(fixture); +assert.equal(typeof text, 'string'); +assert.ok(text.length > 0); +console.log(' extractText: OK'); + +// --- extractTextWithPositions --- +console.log('Testing extractTextWithPositions...'); +const items = extractTextWithPositions(fixture); +assert.ok(items.length > 0); +const item = items[0]; +assert.equal(typeof item.text, 'string'); +assert.equal(typeof item.x, 'number'); +assert.equal(typeof item.y, 'number'); +assert.equal(typeof item.width, 'number'); +assert.equal(typeof item.height, 'number'); +assert.equal(typeof item.font, 'string'); +assert.equal(typeof item.fontSize, 'number'); +assert.equal(typeof item.page, 'number'); +assert.equal(typeof item.isBold, 'boolean'); +assert.equal(typeof item.isItalic, 'boolean'); +assert.equal(typeof item.itemType, 'string'); +console.log(' extractTextWithPositions: OK'); + +// with pages filter +const page1Items = extractTextWithPositions(fixture, [1]); +assert.ok(page1Items.length > 0); +assert.ok(page1Items.every(i => i.page === 1)); +console.log(' extractTextWithPositions with pages: OK'); + +// --- extractTextInRegions --- +console.log('Testing extractTextInRegions...'); +const regionResults = extractTextInRegions(fixture, [ + { page: 0, regions: [[0, 0, 600, 100]] }, +]); +assert.equal(regionResults.length, 1); +assert.equal(regionResults[0].page, 0); +assert.equal(regionResults[0].regions.length, 1); +assert.equal(typeof regionResults[0].regions[0].text, 'string'); +assert.equal(typeof regionResults[0].regions[0].needsOcr, 'boolean'); +console.log(' extractTextInRegions: OK'); + +// --- Error handling --- +console.log('Testing error handling...'); +assert.throws(() => processPdf(Buffer.from('not a pdf')), /process_pdf/); +assert.throws(() => classifyPdf(Buffer.from('')), /classify_pdf/); +console.log(' error handling: OK'); + +console.log('\nAll NAPI tests passed!'); diff --git a/pdf_inspector.pyi b/pdf_inspector.pyi index 993a59e..041e321 100644 --- a/pdf_inspector.pyi +++ b/pdf_inspector.pyi @@ -17,6 +17,15 @@ class PdfResult: pages_with_columns: list[int] has_encoding_issues: bool +class PdfClassification: + """Lightweight PDF classification result.""" + pdf_type: str + """'text_based', 'scanned', 'image_based', or 'mixed'.""" + page_count: int + pages_needing_ocr: list[int] + """0-indexed page numbers that need OCR.""" + confidence: float + class TextItem: """A positioned text item extracted from a PDF.""" text: str @@ -31,6 +40,18 @@ class TextItem: is_italic: bool item_type: str +class RegionText: + """Extracted text for a single region.""" + text: str + needs_ocr: bool + """True when the text should not be trusted.""" + +class PageRegionTexts: + """Extracted text for one page's regions.""" + page: int + """0-indexed page number.""" + regions: list[RegionText] + def process_pdf(path: str, pages: Optional[list[int]] = None) -> PdfResult: """Process a PDF: detect type, extract text, convert to Markdown.""" ... @@ -47,10 +68,50 @@ def detect_pdf_bytes(data: bytes) -> PdfResult: """Fast detection from bytes.""" ... +def classify_pdf(path: str) -> PdfClassification: + """Lightweight classification — type, page count, and OCR pages (0-indexed).""" + ... + +def classify_pdf_bytes(data: bytes) -> PdfClassification: + """Lightweight classification from bytes.""" + ... + def extract_text(path: str) -> str: """Extract plain text from a PDF.""" ... +def extract_text_bytes(data: bytes) -> str: + """Extract plain text from PDF bytes.""" + ... + def extract_text_with_positions(path: str, pages: Optional[list[int]] = None) -> list[TextItem]: """Extract text with position information.""" ... + +def extract_text_with_positions_bytes(data: bytes, pages: Optional[list[int]] = None) -> list[TextItem]: + """Extract text with position information from bytes.""" + ... + +def extract_text_in_regions( + path: str, + page_regions: list[tuple[int, list[list[float]]]], +) -> list[PageRegionTexts]: + """Extract text within bounding-box regions from a PDF file. + + Args: + path: Path to the PDF file. + page_regions: List of (page_0indexed, [[x1, y1, x2, y2], ...]) tuples. + """ + ... + +def extract_text_in_regions_bytes( + data: bytes, + page_regions: list[tuple[int, list[list[float]]]], +) -> list[PageRegionTexts]: + """Extract text within bounding-box regions from PDF bytes. + + Args: + data: PDF file contents as bytes. + page_regions: List of (page_0indexed, [[x1, y1, x2, y2], ...]) tuples. + """ + ... diff --git a/src/python.rs b/src/python.rs index 3fb645d..a9bef94 100644 --- a/src/python.rs +++ b/src/python.rs @@ -1,7 +1,7 @@ //! PyO3 Python bindings for pdf-inspector. -use pyo3::prelude::*; use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; use std::collections::HashSet; use crate::detector::PdfType; @@ -60,33 +60,86 @@ impl PyPdfResult { } } -fn pdf_type_str(t: PdfType) -> String { - match t { - PdfType::TextBased => "text_based".into(), - PdfType::Scanned => "scanned".into(), - PdfType::ImageBased => "image_based".into(), - PdfType::Mixed => "mixed".into(), +// --------------------------------------------------------------------------- +// Classification wrapper (lightweight) +// --------------------------------------------------------------------------- + +/// Lightweight PDF classification result. +#[pyclass(name = "PdfClassification")] +#[derive(Clone)] +pub struct PyPdfClassification { + /// The detected PDF type: "text_based", "scanned", "image_based", or "mixed". + #[pyo3(get)] + pub pdf_type: String, + /// Total number of pages. + #[pyo3(get)] + pub page_count: u32, + /// 0-indexed page numbers that need OCR. + #[pyo3(get)] + pub pages_needing_ocr: Vec, + /// Detection confidence (0.0-1.0). + #[pyo3(get)] + pub confidence: f32, +} + +#[pymethods] +impl PyPdfClassification { + fn __repr__(&self) -> String { + format!( + "PdfClassification(pdf_type='{}', pages={}, confidence={:.2})", + self.pdf_type, self.page_count, self.confidence + ) } } -fn to_py_result(r: crate::PdfProcessResult) -> PyPdfResult { - PyPdfResult { - pdf_type: pdf_type_str(r.pdf_type), - markdown: r.markdown, - page_count: r.page_count, - processing_time_ms: r.processing_time_ms, - pages_needing_ocr: r.pages_needing_ocr, - title: r.title, - confidence: r.confidence, - is_complex_layout: r.layout.is_complex, - pages_with_tables: r.layout.pages_with_tables, - pages_with_columns: r.layout.pages_with_columns, - has_encoding_issues: r.has_encoding_issues, +// --------------------------------------------------------------------------- +// Region extraction wrappers +// --------------------------------------------------------------------------- + +/// Extracted text for a single region. +#[pyclass(name = "RegionText")] +#[derive(Clone)] +pub struct PyRegionText { + /// Extracted text content. + #[pyo3(get)] + pub text: String, + /// True when the text should not be trusted (empty, GID fonts, garbage, encoding issues). + #[pyo3(get)] + pub needs_ocr: bool, +} + +#[pymethods] +impl PyRegionText { + fn __repr__(&self) -> String { + format!( + "RegionText(text='{}', needs_ocr={})", + self.text.chars().take(40).collect::(), + self.needs_ocr + ) } } -fn to_py_err(e: crate::PdfError) -> PyErr { - PyValueError::new_err(e.to_string()) +/// Extracted text for one page's regions. +#[pyclass(name = "PageRegionTexts")] +#[derive(Clone)] +pub struct PyPageRegionTexts { + /// 0-indexed page number. + #[pyo3(get)] + pub page: u32, + /// Per-region results, parallel to the input regions. + #[pyo3(get)] + pub regions: Vec, +} + +#[pymethods] +impl PyPageRegionTexts { + fn __repr__(&self) -> String { + format!( + "PageRegionTexts(page={}, regions={})", + self.page, + self.regions.len() + ) + } } // --------------------------------------------------------------------------- @@ -134,6 +187,39 @@ impl PyTextItem { } } +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +fn pdf_type_str(t: PdfType) -> String { + match t { + PdfType::TextBased => "text_based".into(), + PdfType::Scanned => "scanned".into(), + PdfType::ImageBased => "image_based".into(), + PdfType::Mixed => "mixed".into(), + } +} + +fn to_py_result(r: crate::PdfProcessResult) -> PyPdfResult { + PyPdfResult { + pdf_type: pdf_type_str(r.pdf_type), + markdown: r.markdown, + page_count: r.page_count, + processing_time_ms: r.processing_time_ms, + pages_needing_ocr: r.pages_needing_ocr, + title: r.title, + confidence: r.confidence, + is_complex_layout: r.layout.is_complex, + pages_with_tables: r.layout.pages_with_tables, + pages_with_columns: r.layout.pages_with_columns, + has_encoding_issues: r.has_encoding_issues, + } +} + +fn to_py_err(e: crate::PdfError) -> PyErr { + PyValueError::new_err(e.to_string()) +} + fn item_type_str(t: &ItemType) -> String { match t { ItemType::Text => "text".into(), @@ -143,18 +229,66 @@ fn item_type_str(t: &ItemType) -> String { } } +fn convert_text_items(items: Vec) -> Vec { + items + .into_iter() + .map(|item| PyTextItem { + text: item.text, + x: item.x, + y: item.y, + width: item.width, + height: item.height, + font: item.font, + font_size: item.font_size, + page: item.page, + is_bold: item.is_bold, + is_italic: item.is_italic, + item_type: item_type_str(&item.item_type), + }) + .collect() +} + +fn parse_page_regions(page_regions: Vec<(u32, Vec>)>) -> Vec<(u32, Vec<[f32; 4]>)> { + page_regions + .into_iter() + .map(|(page, regions)| { + let bboxes: Vec<[f32; 4]> = regions + .iter() + .map(|r| { + if r.len() != 4 { + [0.0, 0.0, 0.0, 0.0] + } else { + [r[0] as f32, r[1] as f32, r[2] as f32, r[3] as f32] + } + }) + .collect(); + (page, bboxes) + }) + .collect() +} + +fn convert_region_results(results: Vec) -> Vec { + results + .into_iter() + .map(|page_result| PyPageRegionTexts { + page: page_result.page, + regions: page_result + .regions + .into_iter() + .map(|r| PyRegionText { + text: r.text, + needs_ocr: r.needs_ocr, + }) + .collect(), + }) + .collect() +} + // --------------------------------------------------------------------------- // Public Python API // --------------------------------------------------------------------------- /// Process a PDF file: detect type, extract text, and convert to Markdown. -/// -/// Args: -/// path: Path to the PDF file. -/// pages: Optional list of 1-indexed page numbers to process. -/// -/// Returns: -/// PdfResult with markdown, pdf_type, and metadata. #[pyfunction] #[pyo3(signature = (path, pages=None))] fn process_pdf(path: &str, pages: Option>) -> PyResult { @@ -167,13 +301,6 @@ fn process_pdf(path: &str, pages: Option>) -> PyResult { } /// Process a PDF from bytes in memory. -/// -/// Args: -/// data: PDF file contents as bytes. -/// pages: Optional list of 1-indexed page numbers to process. -/// -/// Returns: -/// PdfResult with markdown, pdf_type, and metadata. #[pyfunction] #[pyo3(signature = (data, pages=None))] fn process_pdf_bytes(data: &[u8], pages: Option>) -> PyResult { @@ -186,12 +313,6 @@ fn process_pdf_bytes(data: &[u8], pages: Option>) -> PyResult PyResult { let result = crate::detect_pdf(path).map_err(to_py_err)?; @@ -205,26 +326,41 @@ fn detect_pdf_bytes(data: &[u8]) -> PyResult { Ok(to_py_result(result)) } +/// Lightweight PDF classification — returns type, page count, and OCR pages. +/// Faster than detect_pdf as it skips building the full PdfProcessResult. +/// Pages in pages_needing_ocr are 0-indexed. +#[pyfunction] +fn classify_pdf(path: &str) -> PyResult { + let data = std::fs::read(path).map_err(|e| PyValueError::new_err(e.to_string()))?; + classify_pdf_bytes(&data) +} + +/// Lightweight PDF classification from bytes. +/// Pages in pages_needing_ocr are 0-indexed. +#[pyfunction] +fn classify_pdf_bytes(data: &[u8]) -> PyResult { + let result = crate::classify_pdf_mem(data).map_err(to_py_err)?; + Ok(PyPdfClassification { + pdf_type: pdf_type_str(result.pdf_type), + page_count: result.page_count, + pages_needing_ocr: result.pages_needing_ocr, + confidence: result.confidence, + }) +} + /// Extract plain text from a PDF file. -/// -/// Args: -/// path: Path to the PDF file. -/// -/// Returns: -/// Extracted text as a string. #[pyfunction] fn extract_text(path: &str) -> PyResult { crate::extract_text(path).map_err(to_py_err) } -/// Extract text with position information. -/// -/// Args: -/// path: Path to the PDF file. -/// pages: Optional list of 1-indexed page numbers. -/// -/// Returns: -/// List of TextItem objects with text, position, font info. +/// Extract plain text from PDF bytes. +#[pyfunction] +fn extract_text_bytes(data: &[u8]) -> PyResult { + crate::extractor::extract_text_mem(data).map_err(to_py_err) +} + +/// Extract text with position information from a file. #[pyfunction] #[pyo3(signature = (path, pages=None))] fn extract_text_with_positions(path: &str, pages: Option>) -> PyResult> { @@ -235,35 +371,83 @@ fn extract_text_with_positions(path: &str, pages: Option>) -> PyResult< } None => crate::extract_text_with_positions(path).map_err(to_py_err)?, }; + Ok(convert_text_items(items)) +} - Ok(items - .into_iter() - .map(|item| PyTextItem { - text: item.text, - x: item.x, - y: item.y, - width: item.width, - height: item.height, - font: item.font, - font_size: item.font_size, - page: item.page, - is_bold: item.is_bold, - is_italic: item.is_italic, - item_type: item_type_str(&item.item_type), - }) - .collect()) +/// Extract text with position information from bytes. +#[pyfunction] +#[pyo3(signature = (data, pages=None))] +fn extract_text_with_positions_bytes( + data: &[u8], + pages: Option>, +) -> PyResult> { + let items = match pages { + Some(p) => { + let page_set: HashSet = p.into_iter().collect(); + crate::extractor::extract_text_with_positions_mem_pages(data, Some(&page_set)) + .map_err(to_py_err)? + } + None => crate::extractor::extract_text_with_positions_mem(data).map_err(to_py_err)?, + }; + Ok(convert_text_items(items)) +} + +/// Extract text within bounding-box regions from a PDF file. +/// +/// Args: +/// path: Path to the PDF file. +/// page_regions: List of (page_0indexed, [[x1, y1, x2, y2], ...]) tuples. +/// Coordinates are PDF points with top-left origin. +/// +/// Returns: +/// List of PageRegionTexts with per-region text and needs_ocr flag. +#[pyfunction] +fn extract_text_in_regions( + path: &str, + page_regions: Vec<(u32, Vec>)>, +) -> PyResult> { + let data = std::fs::read(path).map_err(|e| PyValueError::new_err(e.to_string()))?; + extract_text_in_regions_bytes(&data, page_regions) +} + +/// Extract text within bounding-box regions from PDF bytes. +/// +/// Args: +/// data: PDF file contents as bytes. +/// page_regions: List of (page_0indexed, [[x1, y1, x2, y2], ...]) tuples. +/// Coordinates are PDF points with top-left origin. +/// +/// Returns: +/// List of PageRegionTexts with per-region text and needs_ocr flag. +#[pyfunction] +fn extract_text_in_regions_bytes( + data: &[u8], + page_regions: Vec<(u32, Vec>)>, +) -> PyResult> { + let regions = parse_page_regions(page_regions); + let results = crate::extract_text_in_regions_mem(data, ®ions).map_err(to_py_err)?; + Ok(convert_region_results(results)) } /// Python module definition. #[pymodule] fn pdf_inspector(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; + m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_function(wrap_pyfunction!(process_pdf, m)?)?; m.add_function(wrap_pyfunction!(process_pdf_bytes, m)?)?; m.add_function(wrap_pyfunction!(detect_pdf, m)?)?; m.add_function(wrap_pyfunction!(detect_pdf_bytes, m)?)?; + m.add_function(wrap_pyfunction!(classify_pdf, m)?)?; + m.add_function(wrap_pyfunction!(classify_pdf_bytes, m)?)?; m.add_function(wrap_pyfunction!(extract_text, m)?)?; + m.add_function(wrap_pyfunction!(extract_text_bytes, m)?)?; m.add_function(wrap_pyfunction!(extract_text_with_positions, m)?)?; + m.add_function(wrap_pyfunction!(extract_text_with_positions_bytes, m)?)?; + m.add_function(wrap_pyfunction!(extract_text_in_regions, m)?)?; + m.add_function(wrap_pyfunction!(extract_text_in_regions_bytes, m)?)?; Ok(()) } diff --git a/tests/test_python.py b/tests/test_python.py index 379f8b4..f8daffa 100644 --- a/tests/test_python.py +++ b/tests/test_python.py @@ -11,6 +11,11 @@ def fixture_path(name: str) -> str: return os.path.join(FIXTURES_DIR, name) +def fixture_bytes(name: str) -> bytes: + with open(fixture_path(name), "rb") as f: + return f.read() + + # --------------------------------------------------------------------------- # process_pdf # --------------------------------------------------------------------------- @@ -61,15 +66,13 @@ def test_result_fields(self): class TestProcessPdfBytes: def test_basic(self): - with open(fixture_path("thermo-freon12.pdf"), "rb") as f: - data = f.read() + data = fixture_bytes("thermo-freon12.pdf") result = pdf_inspector.process_pdf_bytes(data) assert result.pdf_type == "text_based" assert result.markdown is not None def test_with_pages(self): - with open(fixture_path("thermo-freon12.pdf"), "rb") as f: - data = f.read() + data = fixture_bytes("thermo-freon12.pdf") result = pdf_inspector.process_pdf_bytes(data, pages=[1, 2]) assert result.markdown is not None @@ -87,15 +90,48 @@ def test_detect_file(self): assert result.page_count == 3 def test_detect_bytes(self): - with open(fixture_path("thermo-freon12.pdf"), "rb") as f: - data = f.read() + data = fixture_bytes("thermo-freon12.pdf") result = pdf_inspector.detect_pdf_bytes(data) assert result.pdf_type == "text_based" assert result.markdown is None # --------------------------------------------------------------------------- -# extract_text +# classify_pdf / classify_pdf_bytes +# --------------------------------------------------------------------------- + + +class TestClassifyPdf: + def test_classify_file(self): + result = pdf_inspector.classify_pdf(fixture_path("thermo-freon12.pdf")) + assert result.pdf_type == "text_based" + assert result.page_count == 3 + assert result.confidence > 0.0 + assert isinstance(result.pages_needing_ocr, list) + + def test_classify_bytes(self): + data = fixture_bytes("thermo-freon12.pdf") + result = pdf_inspector.classify_pdf_bytes(data) + assert result.pdf_type == "text_based" + assert result.page_count == 3 + assert result.confidence > 0.0 + + def test_classify_repr(self): + result = pdf_inspector.classify_pdf(fixture_path("thermo-freon12.pdf")) + r = repr(result) + assert "PdfClassification" in r + assert "text_based" in r + + def test_classify_fields(self): + result = pdf_inspector.classify_pdf(fixture_path("thermo-freon12.pdf")) + assert isinstance(result.pdf_type, str) + assert isinstance(result.page_count, int) + assert isinstance(result.pages_needing_ocr, list) + assert isinstance(result.confidence, float) + + +# --------------------------------------------------------------------------- +# extract_text / extract_text_bytes # --------------------------------------------------------------------------- @@ -105,9 +141,20 @@ def test_basic(self): assert isinstance(text, str) assert len(text) > 0 + def test_bytes(self): + data = fixture_bytes("thermo-freon12.pdf") + text = pdf_inspector.extract_text_bytes(data) + assert isinstance(text, str) + assert len(text) > 0 + + def test_bytes_matches_file(self): + text_file = pdf_inspector.extract_text(fixture_path("thermo-freon12.pdf")) + text_bytes = pdf_inspector.extract_text_bytes(fixture_bytes("thermo-freon12.pdf")) + assert text_file == text_bytes + # --------------------------------------------------------------------------- -# extract_text_with_positions +# extract_text_with_positions / extract_text_with_positions_bytes # --------------------------------------------------------------------------- @@ -144,6 +191,77 @@ def test_repr(self): r = repr(items[0]) assert "TextItem" in r + def test_bytes(self): + data = fixture_bytes("thermo-freon12.pdf") + items = pdf_inspector.extract_text_with_positions_bytes(data) + assert len(items) > 0 + assert isinstance(items[0].text, str) + + def test_bytes_with_pages(self): + data = fixture_bytes("thermo-freon12.pdf") + items = pdf_inspector.extract_text_with_positions_bytes(data, pages=[1]) + assert len(items) > 0 + assert all(item.page == 1 for item in items) + + +# --------------------------------------------------------------------------- +# extract_text_in_regions / extract_text_in_regions_bytes +# --------------------------------------------------------------------------- + + +class TestExtractTextInRegions: + def test_file(self): + results = pdf_inspector.extract_text_in_regions( + fixture_path("thermo-freon12.pdf"), + [(0, [[0.0, 0.0, 600.0, 100.0]])], + ) + assert len(results) == 1 + assert results[0].page == 0 + assert len(results[0].regions) == 1 + assert isinstance(results[0].regions[0].text, str) + assert isinstance(results[0].regions[0].needs_ocr, bool) + + def test_bytes(self): + data = fixture_bytes("thermo-freon12.pdf") + results = pdf_inspector.extract_text_in_regions_bytes( + data, + [(0, [[0.0, 0.0, 600.0, 100.0]])], + ) + assert len(results) == 1 + assert results[0].page == 0 + assert len(results[0].regions) == 1 + assert isinstance(results[0].regions[0].text, str) + + def test_repr(self): + results = pdf_inspector.extract_text_in_regions( + fixture_path("thermo-freon12.pdf"), + [(0, [[0.0, 0.0, 600.0, 100.0]])], + ) + r = repr(results[0]) + assert "PageRegionTexts" in r + r2 = repr(results[0].regions[0]) + assert "RegionText" in r2 + + def test_multiple_regions(self): + results = pdf_inspector.extract_text_in_regions( + fixture_path("thermo-freon12.pdf"), + [(0, [[0.0, 0.0, 300.0, 100.0], [300.0, 0.0, 600.0, 100.0]])], + ) + assert len(results) == 1 + assert len(results[0].regions) == 2 + + def test_multiple_pages(self): + results = pdf_inspector.extract_text_in_regions( + fixture_path("thermo-freon12.pdf"), + [ + (0, [[0.0, 0.0, 600.0, 100.0]]), + (1, [[0.0, 0.0, 600.0, 100.0]]), + ], + ) + assert len(results) == 2 + assert results[0].page == 0 + assert results[1].page == 1 + # --------------------------------------------------------------------------- # Error handling @@ -163,6 +281,24 @@ def test_empty_bytes(self): with pytest.raises(ValueError): pdf_inspector.process_pdf_bytes(b"") + def test_classify_not_a_pdf(self): + with pytest.raises(ValueError): + pdf_inspector.classify_pdf_bytes(b"not a pdf") + + def test_classify_nonexistent(self): + with pytest.raises((ValueError, OSError)): + pdf_inspector.classify_pdf("/nonexistent/file.pdf") + + def test_extract_text_bytes_not_a_pdf(self): + with pytest.raises(ValueError): + pdf_inspector.extract_text_bytes(b"not a pdf") + + def test_regions_not_a_pdf(self): + with pytest.raises(ValueError): + pdf_inspector.extract_text_in_regions_bytes( + b"not a pdf", [(0, [[0.0, 0.0, 100.0, 100.0]])] + ) + # --------------------------------------------------------------------------- # Multiple fixtures From 2e874f6cfd6991b1c23740b20cc0d0414dacf523 Mon Sep 17 00:00:00 2001 From: Abimael Martell Date: Thu, 2 Apr 2026 12:18:06 -0700 Subject: [PATCH 3/3] update napi index.js version strings to match 0.2.2 Co-Authored-By: Claude Opus 4.6 --- napi/index.js | 104 +++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/napi/index.js b/napi/index.js index 41d142e..df6cd7e 100644 --- a/napi/index.js +++ b/napi/index.js @@ -77,8 +77,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-android-arm64') const bindingPackageVersion = require('firecrawl-pdf-inspector-android-arm64/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -93,8 +93,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-android-arm-eabi') const bindingPackageVersion = require('firecrawl-pdf-inspector-android-arm-eabi/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -114,8 +114,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-win32-x64-gnu') const bindingPackageVersion = require('firecrawl-pdf-inspector-win32-x64-gnu/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -130,8 +130,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-win32-x64-msvc') const bindingPackageVersion = require('firecrawl-pdf-inspector-win32-x64-msvc/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -147,8 +147,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-win32-ia32-msvc') const bindingPackageVersion = require('firecrawl-pdf-inspector-win32-ia32-msvc/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -163,8 +163,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-win32-arm64-msvc') const bindingPackageVersion = require('firecrawl-pdf-inspector-win32-arm64-msvc/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -182,8 +182,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-darwin-universal') const bindingPackageVersion = require('firecrawl-pdf-inspector-darwin-universal/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -198,8 +198,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-darwin-x64') const bindingPackageVersion = require('firecrawl-pdf-inspector-darwin-x64/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -214,8 +214,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-darwin-arm64') const bindingPackageVersion = require('firecrawl-pdf-inspector-darwin-arm64/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -234,8 +234,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-freebsd-x64') const bindingPackageVersion = require('firecrawl-pdf-inspector-freebsd-x64/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -250,8 +250,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-freebsd-arm64') const bindingPackageVersion = require('firecrawl-pdf-inspector-freebsd-arm64/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -271,8 +271,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-x64-musl') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-x64-musl/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -287,8 +287,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-x64-gnu') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-x64-gnu/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -305,8 +305,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-arm64-musl') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-arm64-musl/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -321,8 +321,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-arm64-gnu') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-arm64-gnu/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -339,8 +339,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-arm-musleabihf') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-arm-musleabihf/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -355,8 +355,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-arm-gnueabihf') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-arm-gnueabihf/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -373,8 +373,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-loong64-musl') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-loong64-musl/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -389,8 +389,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-loong64-gnu') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-loong64-gnu/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -407,8 +407,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-riscv64-musl') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-riscv64-musl/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -423,8 +423,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-riscv64-gnu') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-riscv64-gnu/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -440,8 +440,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-ppc64-gnu') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-ppc64-gnu/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -456,8 +456,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-linux-s390x-gnu') const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-s390x-gnu/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -476,8 +476,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-openharmony-arm64') const bindingPackageVersion = require('firecrawl-pdf-inspector-openharmony-arm64/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -492,8 +492,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-openharmony-x64') const bindingPackageVersion = require('firecrawl-pdf-inspector-openharmony-x64/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) { @@ -508,8 +508,8 @@ function requireNative() { try { const binding = require('firecrawl-pdf-inspector-openharmony-arm') const bindingPackageVersion = require('firecrawl-pdf-inspector-openharmony-arm/package.json').version - if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { - throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) + if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') { + throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`) } return binding } catch (e) {