From 0bf5463a1e0b52378274041b31f15f72f8a07047 Mon Sep 17 00:00:00 2001
From: Jacques Dumora <jacq.dumora@gmail.com>
Date: Thu, 12 Mar 2026 17:38:53 +0100
Subject: [PATCH 1/3] feat: add Python bindings via PyO3

Expose the pdf-inspector Rust library as a Python package using PyO3 + maturin.
Python users can now `pip install` and use `import pdf_inspector` for PDF
classification, text extraction, and markdown conversion with native Rust speed.

Adds:
- src/python.rs: PyO3 bindings (process_pdf, detect_pdf, extract_text, etc.)
- pyproject.toml: maturin build configuration
- pdf_inspector.pyi: type stubs for IDE support
- tests/test_python.py: 21 pytest tests covering all Python API functions
- examples/basic_usage.py: example script demonstrating all features
- Updated README with Python quick start and API reference

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 Cargo.toml              |   8 ++
 README.md               |  65 +++++++++-
 examples/basic_usage.py |  76 ++++++++++++
 pdf_inspector.pyi       |  56 +++++++++
 pyproject.toml          |  21 ++++
 src/lib.rs              |   3 +
 src/python.rs           | 269 ++++++++++++++++++++++++++++++++++++++++
 tests/test_python.py    | 188 ++++++++++++++++++++++++++++
 8 files changed, 684 insertions(+), 2 deletions(-)
 create mode 100644 examples/basic_usage.py
 create mode 100644 pdf_inspector.pyi
 create mode 100644 pyproject.toml
 create mode 100644 src/python.rs
 create mode 100644 tests/test_python.py

diff --git a/Cargo.toml b/Cargo.toml
index d625bde..f534f95 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,7 +8,14 @@ description = "Fast PDF inspection, classification, and text extraction with sma
 license = "MIT"
 repository = "https://github.com/firecrawl/pdf-inspector"
 
+[lib]
+name = "pdf_inspector"  # also the name of the Python module exported from src/python.rs
+crate-type = ["lib", "cdylib"]  # "cdylib": shared library packaged as the Python extension; "lib": keeps the crate linkable from Rust
+
 [dependencies]
+# Python bindings
+pyo3 = { version = "0.22", features = ["extension-module"], optional = true }
+
 # PDF parsing
 lopdf = { git = "https://github.com/firecrawl/lopdf", branch = "firecrawl/zlib-checksum-encrypted", features = ["rayon"] }
 
@@ -34,6 +41,7 @@ tempfile = "3.3"
 
 [features]
 default = []
+python = ["pyo3"]
 
 [[bin]]
 name = "pdf2md"
diff --git a/README.md b/README.md
index 762927b..cf1026f 100644
--- a/README.md
+++ b/README.md
@@ -15,10 +15,70 @@ Built by [Firecrawl](https://firecrawl.dev) to handle text-based PDFs locally in
 - **Encoding issue detection** — Automatically flags broken font encodings (garbled text, replacement characters) so callers can fall back to OCR.
 - **Single document load** — The document is parsed once and shared between detection and extraction, avoiding redundant I/O.
 - **Lightweight** — Pure Rust, no ML models, no external services. Single dependency on `lopdf` for PDF parsing.
+- **Python bindings** — Use from Python via PyO3. Install with `pip install pdf-inspector` or build from source with `maturin`.
 
 ## Quick start
 
-### As a library
+### Python
+
+Install from source (requires Rust toolchain):
+
+```bash
+pip install maturin
+maturin develop --release
+```
+
+Use it:
+
+```python
+import pdf_inspector
+
+# Full processing: detect + extract + convert to Markdown
+result = pdf_inspector.process_pdf("document.pdf")
+print(result.pdf_type)      # "text_based", "scanned", "image_based", "mixed"
+print(result.confidence)     # 0.0 - 1.0
+print(result.page_count)     # number of pages
+print(result.markdown)       # Markdown string or None
+
+# Process specific pages only
+result = pdf_inspector.process_pdf("document.pdf", pages=[1, 3, 5])
+
+# Process from bytes (no filesystem needed)
+with open("document.pdf", "rb") as f:
+    result = pdf_inspector.process_pdf_bytes(f.read())
+
+# Fast detection only (no text extraction)
+result = pdf_inspector.detect_pdf("document.pdf")
+if result.pdf_type == "text_based":
+    print("Can extract locally!")
+else:
+    print(f"Pages needing OCR: {result.pages_needing_ocr}")
+
+# Plain text extraction
+text = pdf_inspector.extract_text("document.pdf")
+
+# Positioned text items with font info
+items = pdf_inspector.extract_text_with_positions("document.pdf")
+for item in items[:5]:
+    print(f"'{item.text}' at ({item.x:.0f}, {item.y:.0f}) size={item.font_size}")
+```
+
+#### Python API reference
+
+| Function | Description |
+|---|---|
+| `process_pdf(path, pages=None)` | Full processing (detect + extract + markdown) |
+| `process_pdf_bytes(data, pages=None)` | Full processing from bytes |
+| `detect_pdf(path)` | Fast detection only |
+| `detect_pdf_bytes(data)` | Fast detection from bytes |
+| `extract_text(path)` | Plain text extraction |
+| `extract_text_with_positions(path, pages=None)` | Text with X/Y coords and font info |
+
+**`PdfResult` fields:** `pdf_type`, `markdown`, `page_count`, `processing_time_ms`, `pages_needing_ocr`, `title`, `confidence`, `is_complex_layout`, `pages_with_tables`, `pages_with_columns`, `has_encoding_issues`
+
+**`TextItem` fields:** `text`, `x`, `y`, `width`, `height`, `font`, `font_size`, `page`, `is_bold`, `is_italic`, `item_type`
+
+### Rust
 
 Add to your `Cargo.toml`:
 
@@ -159,6 +219,7 @@ The document is loaded **once** via `load_document_from_path` / `load_document_f
 ```
 src/
   lib.rs                — Public API, PdfOptions builder, convenience functions
+  python.rs             — PyO3 Python bindings
   types.rs              — Shared types: TextItem, TextLine, PdfRect, ItemType
   text_utils.rs         — Character/text helpers (CJK, RTL, ligatures, bold/italic)
   process_mode.rs       — ProcessMode enum (DetectOnly, Analyze, Full)
@@ -189,7 +250,7 @@ This detects 300+ page PDFs in milliseconds. The result includes `pages_needing_
 | `Sample(n)` | Sample `n` evenly distributed pages (first, last, middle) | Very large PDFs where speed matters more than precision |
 | `Pages(vec)` | Only scan specific 1-indexed page numbers | When the caller knows which pages to check |
 
-## API
+## Rust API
 
 ### Processing modes
 
diff --git a/examples/basic_usage.py b/examples/basic_usage.py
new file mode 100644
index 0000000..3ab14a1
--- /dev/null
+++ b/examples/basic_usage.py
@@ -0,0 +1,76 @@
+"""Basic usage examples for pdf-inspector Python library."""
+
+import sys
+import pdf_inspector
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python basic_usage.py <path-to-pdf>")
+        sys.exit(1)
+
+    path = sys.argv[1]
+
+    # 1. Full processing: detect + extract + markdown
+    print("=" * 60)
+    print("Full processing")
+    print("=" * 60)
+    result = pdf_inspector.process_pdf(path)
+    print(f"Type:       {result.pdf_type}")
+    print(f"Pages:      {result.page_count}")
+    print(f"Confidence: {result.confidence:.0%}")
+    print(f"Time:       {result.processing_time_ms}ms")
+    print(f"Title:      {result.title}")
+    print(f"Complex:    {result.is_complex_layout}")
+    print(f"Tables on:  {result.pages_with_tables}")
+    print(f"Columns on: {result.pages_with_columns}")
+    print(f"Encoding:   {'issues detected' if result.has_encoding_issues else 'ok'}")
+    print(f"OCR needed: {result.pages_needing_ocr or 'none'}")
+    if result.markdown:
+        print(f"\n--- Markdown ({len(result.markdown)} chars) ---")
+        print(result.markdown[:500])
+        if len(result.markdown) > 500:
+            print(f"\n... ({len(result.markdown) - 500} more chars)")
+
+    # 2. Fast detection only
+    print("\n" + "=" * 60)
+    print("Detection only")
+    print("=" * 60)
+    info = pdf_inspector.detect_pdf(path)
+    print(f"Type:       {info.pdf_type}")
+    print(f"Confidence: {info.confidence:.0%}")
+    print(f"Time:       {info.processing_time_ms}ms")
+
+    # 3. From bytes
+    print("\n" + "=" * 60)
+    print("From bytes")
+    print("=" * 60)
+    with open(path, "rb") as f:
+        data = f.read()
+    result = pdf_inspector.process_pdf_bytes(data)
+    print(f"Type: {result.pdf_type}, Pages: {result.page_count}")
+
+    # 4. Plain text
+    print("\n" + "=" * 60)
+    print("Plain text extraction")
+    print("=" * 60)
+    text = pdf_inspector.extract_text(path)
+    print(text[:300])
+
+    # 5. Positioned items
+    print("\n" + "=" * 60)
+    print("Positioned text items (first 10)")
+    print("=" * 60)
+    items = pdf_inspector.extract_text_with_positions(path, pages=[1])
+    for item in items[:10]:
+        bold = " [B]" if item.is_bold else ""
+        italic = " [I]" if item.is_italic else ""
+        print(
+            f"  p{item.page} ({item.x:6.1f}, {item.y:6.1f}) "
+            f"size={item.font_size:5.1f}{bold}{italic} "
+            f"'{item.text}'"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pdf_inspector.pyi b/pdf_inspector.pyi
new file mode 100644
index 0000000..993a59e
--- /dev/null
+++ b/pdf_inspector.pyi
@@ -0,0 +1,56 @@
+"""Type stubs for pdf_inspector."""
+
+from typing import Optional
+
+class PdfResult:
+    """Result of processing a PDF file."""
+    pdf_type: str
+    """'text_based', 'scanned', 'image_based', or 'mixed'."""
+    markdown: Optional[str]
+    page_count: int
+    processing_time_ms: int
+    pages_needing_ocr: list[int]
+    title: Optional[str]
+    confidence: float
+    is_complex_layout: bool
+    pages_with_tables: list[int]
+    pages_with_columns: list[int]
+    has_encoding_issues: bool
+
+class TextItem:
+    """A positioned text item extracted from a PDF."""
+    text: str
+    x: float
+    y: float
+    width: float
+    height: float
+    font: str
+    font_size: float
+    page: int
+    is_bold: bool
+    is_italic: bool
+    item_type: str
+
+def process_pdf(path: str, pages: Optional[list[int]] = None) -> PdfResult:
+    """Process a PDF: detect type, extract text, convert to Markdown."""
+    ...
+
+def process_pdf_bytes(data: bytes, pages: Optional[list[int]] = None) -> PdfResult:
+    """Process a PDF from bytes in memory."""
+    ...
+
+def detect_pdf(path: str) -> PdfResult:
+    """Fast detection only — no text extraction."""
+    ...
+
+def detect_pdf_bytes(data: bytes) -> PdfResult:
+    """Fast detection from bytes."""
+    ...
+
+def extract_text(path: str) -> str:
+    """Extract plain text from a PDF."""
+    ...
+
+def extract_text_with_positions(path: str, pages: Optional[list[int]] = None) -> list[TextItem]:
+    """Extract text with position information."""
+    ...
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..f925a0e
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,21 @@
+[build-system]
+requires = ["maturin>=1.0,<2.0"]  # any maturin 1.x release
+build-backend = "maturin"
+
+[project]
+name = "pdf-inspector"  # distribution name; the module imported from Python is `pdf_inspector`
+version = "0.1.0"
+description = "Fast PDF inspection, classification, and text extraction with smart scanned vs text-based detection"
+license = { text = "MIT" }
+requires-python = ">=3.8"
+classifiers = [
+    "Programming Language :: Rust",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Topic :: Text Processing",
+]
+
+[tool.maturin]
+features = ["python"]  # Cargo feature that enables the optional pyo3 dependency and compiles src/python.rs
diff --git a/src/lib.rs b/src/lib.rs
index aebab7a..f80718c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -22,6 +22,9 @@
 //! ).unwrap();
 //! ```
 
+#[cfg(feature = "python")]
+pub mod python;
+
 pub mod adobe_korea1;
 pub mod detector;
 pub mod extractor;
diff --git a/src/python.rs b/src/python.rs
new file mode 100644
index 0000000..3fb645d
--- /dev/null
+++ b/src/python.rs
@@ -0,0 +1,269 @@
+//! PyO3 Python bindings for pdf-inspector.
+
+use pyo3::prelude::*;
+use pyo3::exceptions::PyValueError;
+use std::collections::HashSet;
+
+use crate::detector::PdfType;
+use crate::types::ItemType;
+
+// ---------------------------------------------------------------------------
+// Result wrapper
+// ---------------------------------------------------------------------------
+
+/// Result of processing a PDF file.
+#[pyclass(name = "PdfResult")]
+#[derive(Clone)]
+pub struct PyPdfResult {
+    /// The detected PDF type: "text_based", "scanned", "image_based", or "mixed".
+    #[pyo3(get)]
+    pub pdf_type: String,
+    /// Markdown output (None if detect-only or scanned PDF).
+    #[pyo3(get)]
+    pub markdown: Option<String>,
+    /// Total number of pages.
+    #[pyo3(get)]
+    pub page_count: u32,
+    /// Processing time in milliseconds.
+    #[pyo3(get)]
+    pub processing_time_ms: u64,
+    /// 1-indexed page numbers that need OCR.
+    #[pyo3(get)]
+    pub pages_needing_ocr: Vec<u32>,
+    /// Title from PDF metadata.
+    #[pyo3(get)]
+    pub title: Option<String>,
+    /// Detection confidence (0.0-1.0).
+    #[pyo3(get)]
+    pub confidence: f32,
+    /// Whether the layout is complex (tables/columns detected).
+    #[pyo3(get)]
+    pub is_complex_layout: bool,
+    /// Pages with tables detected.
+    #[pyo3(get)]
+    pub pages_with_tables: Vec<u32>,
+    /// Pages with multi-column layout.
+    #[pyo3(get)]
+    pub pages_with_columns: Vec<u32>,
+    /// Whether encoding issues were detected.
+    #[pyo3(get)]
+    pub has_encoding_issues: bool,
+}
+
+#[pymethods]
+impl PyPdfResult {
+    fn __repr__(&self) -> String {
+        format!(
+            "PdfResult(pdf_type='{}', pages={}, confidence={:.2})",
+            self.pdf_type, self.page_count, self.confidence
+        )
+    }
+}
+
+fn pdf_type_str(t: PdfType) -> String {
+    match t {
+        PdfType::TextBased => "text_based".into(),
+        PdfType::Scanned => "scanned".into(),
+        PdfType::ImageBased => "image_based".into(),
+        PdfType::Mixed => "mixed".into(),
+    }
+}
+
+fn to_py_result(r: crate::PdfProcessResult) -> PyPdfResult {
+    PyPdfResult {
+        pdf_type: pdf_type_str(r.pdf_type),
+        markdown: r.markdown,
+        page_count: r.page_count,
+        processing_time_ms: r.processing_time_ms,
+        pages_needing_ocr: r.pages_needing_ocr,
+        title: r.title,
+        confidence: r.confidence,
+        is_complex_layout: r.layout.is_complex,
+        pages_with_tables: r.layout.pages_with_tables,
+        pages_with_columns: r.layout.pages_with_columns,
+        has_encoding_issues: r.has_encoding_issues,
+    }
+}
+
+fn to_py_err(e: crate::PdfError) -> PyErr {
+    PyValueError::new_err(e.to_string())
+}
+
+// ---------------------------------------------------------------------------
+// Text item wrapper
+// ---------------------------------------------------------------------------
+
+/// A positioned text item extracted from a PDF.
+#[pyclass(name = "TextItem")]
+#[derive(Clone)]
+pub struct PyTextItem {
+    #[pyo3(get)]
+    pub text: String,
+    #[pyo3(get)]
+    pub x: f32,
+    #[pyo3(get)]
+    pub y: f32,
+    #[pyo3(get)]
+    pub width: f32,
+    #[pyo3(get)]
+    pub height: f32,
+    #[pyo3(get)]
+    pub font: String,
+    #[pyo3(get)]
+    pub font_size: f32,
+    #[pyo3(get)]
+    pub page: u32,
+    #[pyo3(get)]
+    pub is_bold: bool,
+    #[pyo3(get)]
+    pub is_italic: bool,
+    #[pyo3(get)]
+    pub item_type: String,
+}
+
+#[pymethods]
+impl PyTextItem {
+    fn __repr__(&self) -> String {
+        format!(
+            "TextItem(text='{}', page={}, x={:.1}, y={:.1})",
+            self.text.chars().take(40).collect::<String>(),
+            self.page,
+            self.x,
+            self.y,
+        )
+    }
+}
+
+fn item_type_str(t: &ItemType) -> String {
+    match t {
+        ItemType::Text => "text".into(),
+        ItemType::Image => "image".into(),
+        ItemType::Link(url) => format!("link:{url}"),
+        ItemType::FormField => "form_field".into(),
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Public Python API
+// ---------------------------------------------------------------------------
+
+/// Process a PDF file: detect type, extract text, and convert to Markdown.
+///
+/// Args:
+///     path: Path to the PDF file.
+///     pages: Optional list of 1-indexed page numbers to process.
+///
+/// Returns:
+///     PdfResult with markdown, pdf_type, and metadata.
+#[pyfunction]
+#[pyo3(signature = (path, pages=None))]
+fn process_pdf(path: &str, pages: Option<Vec<u32>>) -> PyResult<PyPdfResult> {
+    let mut opts = crate::PdfOptions::new();
+    if let Some(p) = pages {
+        opts = opts.pages(p);
+    }
+    let result = crate::process_pdf_with_options(path, opts).map_err(to_py_err)?;
+    Ok(to_py_result(result))
+}
+
+/// Process a PDF from bytes in memory.
+///
+/// Args:
+///     data: PDF file contents as bytes.
+///     pages: Optional list of 1-indexed page numbers to process.
+///
+/// Returns:
+///     PdfResult with markdown, pdf_type, and metadata.
+#[pyfunction]
+#[pyo3(signature = (data, pages=None))]
+fn process_pdf_bytes(data: &[u8], pages: Option<Vec<u32>>) -> PyResult<PyPdfResult> {
+    let mut opts = crate::PdfOptions::new();
+    if let Some(p) = pages {
+        opts = opts.pages(p);
+    }
+    let result = crate::process_pdf_mem_with_options(data, opts).map_err(to_py_err)?;
+    Ok(to_py_result(result))
+}
+
+/// Fast detection only — no text extraction or markdown.
+///
+/// Args:
+///     path: Path to the PDF file.
+///
+/// Returns:
+///     PdfResult with pdf_type and metadata (markdown will be None).
+#[pyfunction]
+fn detect_pdf(path: &str) -> PyResult<PyPdfResult> {
+    let result = crate::detect_pdf(path).map_err(to_py_err)?;
+    Ok(to_py_result(result))
+}
+
+/// Fast detection from bytes — no text extraction or markdown.
+#[pyfunction]
+fn detect_pdf_bytes(data: &[u8]) -> PyResult<PyPdfResult> {
+    let result = crate::detect_pdf_mem(data).map_err(to_py_err)?;
+    Ok(to_py_result(result))
+}
+
+/// Extract plain text from a PDF file.
+///
+/// Args:
+///     path: Path to the PDF file.
+///
+/// Returns:
+///     Extracted text as a string.
+#[pyfunction]
+fn extract_text(path: &str) -> PyResult<String> {
+    crate::extract_text(path).map_err(to_py_err)
+}
+
+/// Extract text with position information.
+///
+/// Args:
+///     path: Path to the PDF file.
+///     pages: Optional list of 1-indexed page numbers.
+///
+/// Returns:
+///     List of TextItem objects with text, position, font info.
+#[pyfunction]
+#[pyo3(signature = (path, pages=None))]
+fn extract_text_with_positions(path: &str, pages: Option<Vec<u32>>) -> PyResult<Vec<PyTextItem>> {
+    let items = match pages {
+        Some(p) => {
+            let page_set: HashSet<u32> = p.into_iter().collect();
+            crate::extract_text_with_positions_pages(path, Some(&page_set)).map_err(to_py_err)?
+        }
+        None => crate::extract_text_with_positions(path).map_err(to_py_err)?,
+    };
+
+    Ok(items
+        .into_iter()
+        .map(|item| PyTextItem {
+            text: item.text,
+            x: item.x,
+            y: item.y,
+            width: item.width,
+            height: item.height,
+            font: item.font,
+            font_size: item.font_size,
+            page: item.page,
+            is_bold: item.is_bold,
+            is_italic: item.is_italic,
+            item_type: item_type_str(&item.item_type),
+        })
+        .collect())
+}
+
+/// Python module definition.
+#[pymodule]
+fn pdf_inspector(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<PyPdfResult>()?;
+    m.add_class::<PyTextItem>()?;
+    m.add_function(wrap_pyfunction!(process_pdf, m)?)?;
+    m.add_function(wrap_pyfunction!(process_pdf_bytes, m)?)?;
+    m.add_function(wrap_pyfunction!(detect_pdf, m)?)?;
+    m.add_function(wrap_pyfunction!(detect_pdf_bytes, m)?)?;
+    m.add_function(wrap_pyfunction!(extract_text, m)?)?;
+    m.add_function(wrap_pyfunction!(extract_text_with_positions, m)?)?;
+    Ok(())
+}
diff --git a/tests/test_python.py b/tests/test_python.py
new file mode 100644
index 0000000..379f8b4
--- /dev/null
+++ b/tests/test_python.py
@@ -0,0 +1,188 @@
+"""Tests for the pdf_inspector Python bindings."""
+
+import os
+import pytest
+import pdf_inspector
+
+FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
+
+
+def fixture_path(name: str) -> str:
+    return os.path.join(FIXTURES_DIR, name)
+
+
+# ---------------------------------------------------------------------------
+# process_pdf
+# ---------------------------------------------------------------------------
+
+
+class TestProcessPdf:
+    def test_basic(self):
+        result = pdf_inspector.process_pdf(fixture_path("thermo-freon12.pdf"))
+        assert result.pdf_type == "text_based"
+        assert result.page_count == 3
+        assert result.confidence > 0.0
+        assert result.markdown is not None
+        assert len(result.markdown) > 0
+
+    def test_result_repr(self):
+        result = pdf_inspector.process_pdf(fixture_path("thermo-freon12.pdf"))
+        r = repr(result)
+        assert "PdfResult" in r
+        assert "text_based" in r
+
+    def test_with_pages(self):
+        result = pdf_inspector.process_pdf(
+            fixture_path("thermo-freon12.pdf"), pages=[1]
+        )
+        assert result.page_count == 3  # total pages in doc
+        assert result.markdown is not None
+
+    def test_result_fields(self):
+        result = pdf_inspector.process_pdf(fixture_path("thermo-freon12.pdf"))
+        # All fields should be accessible
+        assert isinstance(result.pdf_type, str)
+        assert isinstance(result.page_count, int)
+        assert isinstance(result.processing_time_ms, int)
+        assert isinstance(result.pages_needing_ocr, list)
+        assert isinstance(result.confidence, float)
+        assert isinstance(result.is_complex_layout, bool)
+        assert isinstance(result.pages_with_tables, list)
+        assert isinstance(result.pages_with_columns, list)
+        assert isinstance(result.has_encoding_issues, bool)
+        # title can be None or str
+        assert result.title is None or isinstance(result.title, str)
+
+
+# ---------------------------------------------------------------------------
+# process_pdf_bytes
+# ---------------------------------------------------------------------------
+
+
+class TestProcessPdfBytes:
+    def test_basic(self):
+        with open(fixture_path("thermo-freon12.pdf"), "rb") as f:
+            data = f.read()
+        result = pdf_inspector.process_pdf_bytes(data)
+        assert result.pdf_type == "text_based"
+        assert result.markdown is not None
+
+    def test_with_pages(self):
+        with open(fixture_path("thermo-freon12.pdf"), "rb") as f:
+            data = f.read()
+        result = pdf_inspector.process_pdf_bytes(data, pages=[1, 2])
+        assert result.markdown is not None
+
+
+# ---------------------------------------------------------------------------
+# detect_pdf / detect_pdf_bytes
+# ---------------------------------------------------------------------------
+
+
+class TestDetectPdf:
+    def test_detect_file(self):
+        result = pdf_inspector.detect_pdf(fixture_path("thermo-freon12.pdf"))
+        assert result.pdf_type == "text_based"
+        assert result.markdown is None  # detect only — no markdown
+        assert result.page_count == 3
+
+    def test_detect_bytes(self):
+        with open(fixture_path("thermo-freon12.pdf"), "rb") as f:
+            data = f.read()
+        result = pdf_inspector.detect_pdf_bytes(data)
+        assert result.pdf_type == "text_based"
+        assert result.markdown is None
+
+
+# ---------------------------------------------------------------------------
+# extract_text
+# ---------------------------------------------------------------------------
+
+
+class TestExtractText:
+    def test_basic(self):
+        text = pdf_inspector.extract_text(fixture_path("thermo-freon12.pdf"))
+        assert isinstance(text, str)
+        assert len(text) > 0
+
+
+# ---------------------------------------------------------------------------
+# extract_text_with_positions
+# ---------------------------------------------------------------------------
+
+
+class TestExtractTextWithPositions:
+    def test_basic(self):
+        items = pdf_inspector.extract_text_with_positions(
+            fixture_path("thermo-freon12.pdf")
+        )
+        assert len(items) > 0
+        item = items[0]
+        assert isinstance(item.text, str)
+        assert isinstance(item.x, float)
+        assert isinstance(item.y, float)
+        assert isinstance(item.width, float)
+        assert isinstance(item.height, float)
+        assert isinstance(item.font, str)
+        assert isinstance(item.font_size, float)
+        assert isinstance(item.page, int)
+        assert isinstance(item.is_bold, bool)
+        assert isinstance(item.is_italic, bool)
+        assert isinstance(item.item_type, str)
+
+    def test_with_pages(self):
+        items = pdf_inspector.extract_text_with_positions(
+            fixture_path("thermo-freon12.pdf"), pages=[1]
+        )
+        assert len(items) > 0
+        assert all(item.page == 1 for item in items)
+
+    def test_repr(self):
+        items = pdf_inspector.extract_text_with_positions(
+            fixture_path("thermo-freon12.pdf")
+        )
+        r = repr(items[0])
+        assert "TextItem" in r
+
+
+# ---------------------------------------------------------------------------
+# Error handling
+# ---------------------------------------------------------------------------
+
+
+class TestErrors:
+    def test_nonexistent_file(self):
+        with pytest.raises(ValueError):
+            pdf_inspector.process_pdf("/nonexistent/file.pdf")
+
+    def test_not_a_pdf(self):
+        with pytest.raises(ValueError):
+            pdf_inspector.process_pdf_bytes(b"this is not a pdf")
+
+    def test_empty_bytes(self):
+        with pytest.raises(ValueError):
+            pdf_inspector.process_pdf_bytes(b"")
+
+
+# ---------------------------------------------------------------------------
+# Multiple fixtures
+# ---------------------------------------------------------------------------
+
+
+class TestMultipleFixtures:
+    """Run basic processing on all available test fixtures."""
+
+    @pytest.mark.parametrize(
+        "filename",
+        [f for f in os.listdir(FIXTURES_DIR) if f.endswith(".pdf")],
+    )
+    def test_process_all_fixtures(self, filename):
+        result = pdf_inspector.process_pdf(fixture_path(filename))
+        assert result.pdf_type in (
+            "text_based",
+            "scanned",
+            "image_based",
+            "mixed",
+        )
+        assert result.page_count > 0
+        assert result.confidence >= 0.0

From 506b2a0c7097223e9ba82dd18f3c48a9d15e7e7c Mon Sep 17 00:00:00 2001
From: Abimael Martell <abimex@gmail.com>
Date: Thu, 2 Apr 2026 11:48:18 -0700
Subject: [PATCH 2/3] unify NAPI and Python binding APIs for consistent surface

Both bindings now expose the same 6 function families: process, detect,
classify, extractText, extractTextWithPositions, and extractTextInRegions.
Bumps PyO3 from 0.22 to 0.25 for Python 3.14 support.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 Cargo.toml              |   2 +-
 README.md               |  52 ++++++-
 examples/basic_usage.py |  22 +++
 napi/build.rs           |   2 +-
 napi/index.d.ts         |  51 ++++++-
 napi/index.js           |   4 +
 napi/src/lib.rs         | 278 +++++++++++++++++++++++++---------
 napi/test.mjs           |  98 ++++++++++++
 pdf_inspector.pyi       |  61 ++++++++
 src/python.rs           | 328 +++++++++++++++++++++++++++++++---------
 tests/test_python.py    | 152 ++++++++++++++++++-
 11 files changed, 893 insertions(+), 157 deletions(-)
 create mode 100644 napi/test.mjs

diff --git a/Cargo.toml b/Cargo.toml
index f1d7a8b..69c0af6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,7 @@ crate-type = ["lib", "cdylib"]
 
 [dependencies]
 # Python bindings
-pyo3 = { version = "0.22", features = ["extension-module"], optional = true }
+pyo3 = { version = "0.25", features = ["extension-module"], optional = true }
 
 # PDF parsing
 lopdf = { git = "https://github.com/J-F-Liu/lopdf", rev = "052674053814a9f4897af94f0b8e46a545c9b329", features = ["rayon"] }
diff --git a/README.md b/README.md
index cf1026f..d4a6fbd 100644
--- a/README.md
+++ b/README.md
@@ -69,15 +69,65 @@ for item in items[:5]:
 |---|---|
 | `process_pdf(path, pages=None)` | Full processing (detect + extract + markdown) |
 | `process_pdf_bytes(data, pages=None)` | Full processing from bytes |
-| `detect_pdf(path)` | Fast detection only |
+| `detect_pdf(path)` | Fast detection only (returns PdfResult) |
 | `detect_pdf_bytes(data)` | Fast detection from bytes |
+| `classify_pdf(path)` | Lightweight classification (returns PdfClassification) |
+| `classify_pdf_bytes(data)` | Lightweight classification from bytes |
 | `extract_text(path)` | Plain text extraction |
+| `extract_text_bytes(data)` | Plain text extraction from bytes |
 | `extract_text_with_positions(path, pages=None)` | Text with X/Y coords and font info |
+| `extract_text_with_positions_bytes(data, pages=None)` | Text with positions from bytes |
+| `extract_text_in_regions(path, page_regions)` | Extract text in bounding-box regions |
+| `extract_text_in_regions_bytes(data, page_regions)` | Region extraction from bytes |
 
 **`PdfResult` fields:** `pdf_type`, `markdown`, `page_count`, `processing_time_ms`, `pages_needing_ocr`, `title`, `confidence`, `is_complex_layout`, `pages_with_tables`, `pages_with_columns`, `has_encoding_issues`
 
+**`PdfClassification` fields:** `pdf_type`, `page_count`, `pages_needing_ocr` (0-indexed), `confidence`
+
 **`TextItem` fields:** `text`, `x`, `y`, `width`, `height`, `font`, `font_size`, `page`, `is_bold`, `is_italic`, `item_type`
 
+**`RegionText` fields:** `text`, `needs_ocr`
+
+**`PageRegionTexts` fields:** `page` (0-indexed), `regions` (list of RegionText)
+
+### Node.js (NAPI)
+
+```bash
+npm install @firecrawl/pdf-inspector-js
+```
+
+```javascript
+import { readFileSync } from 'fs';
+import { processPdf, classifyPdf, extractTextInRegions } from '@firecrawl/pdf-inspector-js';
+
+const buffer = readFileSync('document.pdf');
+
+// Full processing
+const result = processPdf(buffer);
+console.log(result.pdfType);   // "TextBased", "Scanned", "ImageBased", "Mixed"
+console.log(result.markdown);  // Markdown string or null
+
+// Lightweight classification
+const cls = classifyPdf(buffer);
+console.log(cls.pdfType, cls.pagesNeedingOcr);
+
+// Region-based extraction (for hybrid OCR pipelines)
+const regions = extractTextInRegions(buffer, [
+  { page: 0, regions: [[0, 0, 600, 100]] }
+]);
+```
+
+#### Node.js API reference
+
+| Function | Description |
+|---|---|
+| `processPdf(buffer, pages?)` | Full processing (detect + extract + markdown) |
+| `detectPdf(buffer)` | Fast detection only (returns PdfResult) |
+| `classifyPdf(buffer)` | Lightweight classification (returns PdfClassification) |
+| `extractText(buffer)` | Plain text extraction |
+| `extractTextWithPositions(buffer, pages?)` | Text with X/Y coords and font info |
+| `extractTextInRegions(buffer, pageRegions)` | Extract text in bounding-box regions |
+
 ### Rust
 
 Add to your `Cargo.toml`:
diff --git a/examples/basic_usage.py b/examples/basic_usage.py
index 3ab14a1..96401dc 100644
--- a/examples/basic_usage.py
+++ b/examples/basic_usage.py
@@ -71,6 +71,28 @@ def main():
             f"'{item.text}'"
         )
 
+    # 6. Lightweight classification
+    print("\n" + "=" * 60)
+    print("Lightweight classification")
+    print("=" * 60)
+    cls = pdf_inspector.classify_pdf(path)
+    print(f"Type:       {cls.pdf_type}")
+    print(f"Pages:      {cls.page_count}")
+    print(f"Confidence: {cls.confidence:.0%}")
+    print(f"OCR pages:  {cls.pages_needing_ocr or 'none'} (0-indexed)")
+
+    # 7. Region-based text extraction
+    print("\n" + "=" * 60)
+    print("Region-based text extraction (page 0, top region)")
+    print("=" * 60)
+    regions = pdf_inspector.extract_text_in_regions(
+        path, [(0, [[0.0, 0.0, 600.0, 200.0]])]
+    )
+    for page_result in regions:
+        for i, region in enumerate(page_result.regions):
+            print(f"  Region {i}: needs_ocr={region.needs_ocr}")
+            print(f"  Text: {region.text[:200]}")
+
 
 if __name__ == "__main__":
     main()
diff --git a/napi/build.rs b/napi/build.rs
index bbfc9e4..0f1b010 100644
--- a/napi/build.rs
+++ b/napi/build.rs
@@ -1,3 +1,3 @@
 fn main() {
-  napi_build::setup();
+    napi_build::setup();
 }
diff --git a/napi/index.d.ts b/napi/index.d.ts
index 8254452..904cb76 100644
--- a/napi/index.d.ts
+++ b/napi/index.d.ts
@@ -1,11 +1,18 @@
 /* auto-generated by NAPI-RS */
 /* eslint-disable */
 /**
- * Classify a PDF: detect type (TextBased/Scanned/Mixed/ImageBased),
- * page count, and which pages need OCR. Takes PDF bytes as Buffer.
+ * Lightweight PDF classification — returns type, page count, and OCR pages.
+ * Faster than detectPdf as it skips building the full PdfResult.
+ * Pages in pagesNeedingOcr are 0-indexed.
  */
 export declare function classifyPdf(buffer: Buffer): PdfClassification
 
+/** Fast detection only — no text extraction or markdown. */
+export declare function detectPdf(buffer: Buffer): PdfResult
+
+/** Extract plain text from a PDF Buffer. */
+export declare function extractText(buffer: Buffer): string
+
 /**
  * Extract text within bounding-box regions from a PDF.
  *
@@ -13,13 +20,16 @@ export declare function classifyPdf(buffer: Buffer): PdfClassification
  * this extracts PDF text within those regions — skipping GPU OCR
  * for text-based pages.
  *
- * Each region result includes `needs_ocr` — set when the extracted text
+ * Each region result includes `needsOcr` — set when the extracted text
  * is unreliable (empty, GID-encoded fonts, garbage, encoding issues).
  *
  * Coordinates are PDF points with top-left origin.
  */
 export declare function extractTextInRegions(buffer: Buffer, pageRegions: Array<PageRegions>): Array<PageRegionTexts>
 
+/** Extract text with position information from a PDF Buffer. */
+export declare function extractTextWithPositions(buffer: Buffer, pages?: Array<number> | undefined | null): Array<TextItem>
+
 /** A page's regions for text extraction: (page_index_0based, bboxes). */
 export interface PageRegions {
   page: number
@@ -37,13 +47,48 @@ export interface PageRegionTexts {
 export interface PdfClassification {
   pdfType: string
   pageCount: number
+  /** 0-indexed page numbers that need OCR. */
+  pagesNeedingOcr: Array<number>
+  confidence: number
+}
+
+/** Full PDF processing result with markdown and metadata. */
+export interface PdfResult {
+  pdfType: string
+  markdown?: string
+  pageCount: number
+  processingTimeMs: number
+  /** 1-indexed page numbers that need OCR. */
   pagesNeedingOcr: Array<number>
+  title?: string
   confidence: number
+  isComplexLayout: boolean
+  pagesWithTables: Array<number>
+  pagesWithColumns: Array<number>
+  hasEncodingIssues: boolean
 }
 
+/** Process a PDF from a Buffer: detect type, extract text, and convert to Markdown. */
+export declare function processPdf(buffer: Buffer, pages?: Array<number> | undefined | null): PdfResult
+
 /** Extracted text for a single region. */
 export interface RegionText {
   text: string
   /** `true` when the text should not be trusted (empty, GID fonts, garbage, encoding issues). */
   needsOcr: boolean
 }
+
+/** A positioned text item extracted from a PDF. */
+export interface TextItem {
+  text: string
+  x: number
+  y: number
+  width: number
+  height: number
+  font: string
+  fontSize: number
+  page: number
+  isBold: boolean
+  isItalic: boolean
+  itemType: string
+}
diff --git a/napi/index.js b/napi/index.js
index 23f7a6e..be1f71d 100644
--- a/napi/index.js
+++ b/napi/index.js
@@ -577,4 +577,8 @@ if (!nativeBinding) {
 
 module.exports = nativeBinding
 module.exports.classifyPdf = nativeBinding.classifyPdf
+module.exports.detectPdf = nativeBinding.detectPdf
+module.exports.extractText = nativeBinding.extractText
 module.exports.extractTextInRegions = nativeBinding.extractTextInRegions
+module.exports.extractTextWithPositions = nativeBinding.extractTextWithPositions
+module.exports.processPdf = nativeBinding.processPdf
diff --git a/napi/src/lib.rs b/napi/src/lib.rs
index c13ffdf..ceff67c 100644
--- a/napi/src/lib.rs
+++ b/napi/src/lib.rs
@@ -2,58 +2,201 @@
 
 use napi::bindgen_prelude::*;
 use napi_derive::napi;
+use std::collections::HashSet;
+
+// ---------------------------------------------------------------------------
+// Result types
+// ---------------------------------------------------------------------------
+
+/// Full PDF processing result with markdown and metadata.
+#[napi(object)]
+pub struct PdfResult {
+    pub pdf_type: String,
+    pub markdown: Option<String>,
+    pub page_count: u32,
+    pub processing_time_ms: u32,
+    /// 1-indexed page numbers that need OCR.
+    pub pages_needing_ocr: Vec<u32>,
+    pub title: Option<String>,
+    pub confidence: f64,
+    pub is_complex_layout: bool,
+    pub pages_with_tables: Vec<u32>,
+    pub pages_with_columns: Vec<u32>,
+    pub has_encoding_issues: bool,
+}
 
 /// Lightweight PDF classification result.
 #[napi(object)]
 pub struct PdfClassification {
-  pub pdf_type: String,
-  pub page_count: u32,
-  pub pages_needing_ocr: Vec<u32>,
-  pub confidence: f64,
+    pub pdf_type: String,
+    pub page_count: u32,
+    /// 0-indexed page numbers that need OCR.
+    pub pages_needing_ocr: Vec<u32>,
+    pub confidence: f64,
+}
+
+/// A positioned text item extracted from a PDF.
+#[napi(object)]
+pub struct TextItem {
+    pub text: String,
+    pub x: f64,
+    pub y: f64,
+    pub width: f64,
+    pub height: f64,
+    pub font: String,
+    pub font_size: f64,
+    pub page: u32,
+    pub is_bold: bool,
+    pub is_italic: bool,
+    pub item_type: String,
 }
 
 /// A page's regions for text extraction: (page_index_0based, bboxes).
 #[napi(object)]
 pub struct PageRegions {
-  pub page: u32,
-  /// Each bbox is [x1, y1, x2, y2] in PDF points, top-left origin.
-  pub regions: Vec<Vec<f64>>,
+    pub page: u32,
+    /// Each bbox is [x1, y1, x2, y2] in PDF points, top-left origin.
+    pub regions: Vec<Vec<f64>>,
 }
 
 /// Extracted text for a single region.
 #[napi(object)]
 pub struct RegionText {
-  pub text: String,
-  /// `true` when the text should not be trusted (empty, GID fonts, garbage, encoding issues).
-  pub needs_ocr: bool,
+    pub text: String,
+    /// `true` when the text should not be trusted (empty, GID fonts, garbage, encoding issues).
+    pub needs_ocr: bool,
 }
 
 /// Extracted text for one page's regions.
 #[napi(object)]
 pub struct PageRegionTexts {
-  pub page: u32,
-  pub regions: Vec<RegionText>,
+    pub page: u32,
+    pub regions: Vec<RegionText>,
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+fn pdf_type_string(t: pdf_inspector::PdfType) -> String {
+    match t {
+        pdf_inspector::PdfType::TextBased => "TextBased".to_string(),
+        pdf_inspector::PdfType::Scanned => "Scanned".to_string(),
+        pdf_inspector::PdfType::ImageBased => "ImageBased".to_string(),
+        pdf_inspector::PdfType::Mixed => "Mixed".to_string(),
+    }
 }
 
-/// Classify a PDF: detect type (TextBased/Scanned/Mixed/ImageBased),
-/// page count, and which pages need OCR. Takes PDF bytes as Buffer.
+fn to_napi_result(r: pdf_inspector::PdfProcessResult) -> PdfResult {
+    PdfResult {
+        pdf_type: pdf_type_string(r.pdf_type),
+        markdown: r.markdown,
+        page_count: r.page_count,
+        processing_time_ms: r.processing_time_ms as u32,
+        pages_needing_ocr: r.pages_needing_ocr,
+        title: r.title,
+        confidence: r.confidence as f64,
+        is_complex_layout: r.layout.is_complex,
+        pages_with_tables: r.layout.pages_with_tables,
+        pages_with_columns: r.layout.pages_with_columns,
+        has_encoding_issues: r.has_encoding_issues,
+    }
+}
+
+fn item_type_string(t: &pdf_inspector::types::ItemType) -> String {
+    match t {
+        pdf_inspector::types::ItemType::Text => "text".into(),
+        pdf_inspector::types::ItemType::Image => "image".into(),
+        pdf_inspector::types::ItemType::Link(url) => format!("link:{url}"),
+        pdf_inspector::types::ItemType::FormField => "form_field".into(),
+    }
+}
+
+fn to_napi_err(e: impl std::fmt::Display, ctx: &str) -> Error {
+    Error::new(Status::GenericFailure, format!("{ctx}: {e}"))
+}
+
+// ---------------------------------------------------------------------------
+// Public NAPI API
+// ---------------------------------------------------------------------------
+
+/// Process a PDF from a Buffer: detect type, extract text, and convert to Markdown.
+#[napi]
+pub fn process_pdf(buffer: Buffer, pages: Option<Vec<u32>>) -> Result<PdfResult> {
+    let mut opts = pdf_inspector::PdfOptions::new();
+    if let Some(p) = pages {
+        opts = opts.pages(p);
+    }
+    let result = pdf_inspector::process_pdf_mem_with_options(&buffer, opts)
+        .map_err(|e| to_napi_err(e, "process_pdf"))?;
+    Ok(to_napi_result(result))
+}
+
+/// Fast detection only — no text extraction or markdown.
+#[napi]
+pub fn detect_pdf(buffer: Buffer) -> Result<PdfResult> {
+    let result =
+        pdf_inspector::detect_pdf_mem(&buffer).map_err(|e| to_napi_err(e, "detect_pdf"))?;
+    Ok(to_napi_result(result))
+}
+
+/// Lightweight PDF classification — returns type, page count, and OCR pages.
+/// Faster than detectPdf as it skips building the full PdfResult.
+/// Pages in pagesNeedingOcr are 0-indexed.
 #[napi]
 pub fn classify_pdf(buffer: Buffer) -> Result<PdfClassification> {
-  let result = pdf_inspector::classify_pdf_mem(&buffer).map_err(|e| {
-    Error::new(Status::GenericFailure, format!("classify_pdf failed: {e}"))
-  })?;
-
-  Ok(PdfClassification {
-    pdf_type: match result.pdf_type {
-      pdf_inspector::PdfType::TextBased => "TextBased".to_string(),
-      pdf_inspector::PdfType::Scanned => "Scanned".to_string(),
-      pdf_inspector::PdfType::ImageBased => "ImageBased".to_string(),
-      pdf_inspector::PdfType::Mixed => "Mixed".to_string(),
-    },
-    page_count: result.page_count,
-    pages_needing_ocr: result.pages_needing_ocr,
-    confidence: result.confidence as f64,
-  })
+    let result =
+        pdf_inspector::classify_pdf_mem(&buffer).map_err(|e| to_napi_err(e, "classify_pdf"))?;
+
+    Ok(PdfClassification {
+        pdf_type: pdf_type_string(result.pdf_type),
+        page_count: result.page_count,
+        pages_needing_ocr: result.pages_needing_ocr,
+        confidence: result.confidence as f64,
+    })
+}
+
+/// Extract plain text from a PDF Buffer.
+#[napi]
+pub fn extract_text(buffer: Buffer) -> Result<String> {
+    pdf_inspector::extractor::extract_text_mem(&buffer).map_err(|e| to_napi_err(e, "extract_text"))
+}
+
+/// Extract text with position information from a PDF Buffer.
+#[napi]
+pub fn extract_text_with_positions(
+    buffer: Buffer,
+    pages: Option<Vec<u32>>,
+) -> Result<Vec<TextItem>> {
+    let items = match pages {
+        Some(p) => {
+            let page_set: HashSet<u32> = p.into_iter().collect();
+            pdf_inspector::extractor::extract_text_with_positions_mem_pages(
+                &buffer,
+                Some(&page_set),
+            )
+            .map_err(|e| to_napi_err(e, "extract_text_with_positions"))?
+        }
+        None => pdf_inspector::extractor::extract_text_with_positions_mem(&buffer)
+            .map_err(|e| to_napi_err(e, "extract_text_with_positions"))?,
+    };
+
+    Ok(items
+        .into_iter()
+        .map(|item| TextItem {
+            text: item.text,
+            x: item.x as f64,
+            y: item.y as f64,
+            width: item.width as f64,
+            height: item.height as f64,
+            font: item.font,
+            font_size: item.font_size as f64,
+            page: item.page,
+            is_bold: item.is_bold,
+            is_italic: item.is_italic,
+            item_type: item_type_string(&item.item_type),
+        })
+        .collect())
 }
 
 /// Extract text within bounding-box regions from a PDF.
@@ -62,55 +205,48 @@ pub fn classify_pdf(buffer: Buffer) -> Result<PdfClassification> {
 /// this extracts PDF text within those regions — skipping GPU OCR
 /// for text-based pages.
 ///
-/// Each region result includes `needs_ocr` — set when the extracted text
+/// Each region result includes `needsOcr` — set when the extracted text
 /// is unreliable (empty, GID-encoded fonts, garbage, encoding issues).
 ///
 /// Coordinates are PDF points with top-left origin.
 #[napi]
 pub fn extract_text_in_regions(
-  buffer: Buffer,
-  page_regions: Vec<PageRegions>,
+    buffer: Buffer,
+    page_regions: Vec<PageRegions>,
 ) -> Result<Vec<PageRegionTexts>> {
-  // Convert from napi types to the Rust API's expected format
-  let regions: Vec<(u32, Vec<[f32; 4]>)> = page_regions
-    .iter()
-    .map(|pr| {
-      let bboxes: Vec<[f32; 4]> = pr
-        .regions
+    let regions: Vec<(u32, Vec<[f32; 4]>)> = page_regions
         .iter()
-        .map(|r| {
-          if r.len() != 4 {
-            [0.0, 0.0, 0.0, 0.0]
-          } else {
-            [r[0] as f32, r[1] as f32, r[2] as f32, r[3] as f32]
-          }
+        .map(|pr| {
+            let bboxes: Vec<[f32; 4]> = pr
+                .regions
+                .iter()
+                .map(|r| {
+                    if r.len() != 4 {
+                        [0.0, 0.0, 0.0, 0.0]
+                    } else {
+                        [r[0] as f32, r[1] as f32, r[2] as f32, r[3] as f32]
+                    }
+                })
+                .collect();
+            (pr.page, bboxes)
         })
         .collect();
-      (pr.page, bboxes)
-    })
-    .collect();
-
-  let results = pdf_inspector::extract_text_in_regions_mem(&buffer, &regions).map_err(|e| {
-    Error::new(
-      Status::GenericFailure,
-      format!("extract_text_in_regions failed: {e}"),
-    )
-  })?;
-
-  Ok(
-    results
-      .into_iter()
-      .map(|page_result| PageRegionTexts {
-        page: page_result.page,
-        regions: page_result
-          .regions
-          .into_iter()
-          .map(|r| RegionText {
-            text: r.text,
-            needs_ocr: r.needs_ocr,
-          })
-          .collect(),
-      })
-      .collect(),
-  )
+
+    let results = pdf_inspector::extract_text_in_regions_mem(&buffer, &regions)
+        .map_err(|e| to_napi_err(e, "extract_text_in_regions"))?;
+
+    Ok(results
+        .into_iter()
+        .map(|page_result| PageRegionTexts {
+            page: page_result.page,
+            regions: page_result
+                .regions
+                .into_iter()
+                .map(|r| RegionText {
+                    text: r.text,
+                    needs_ocr: r.needs_ocr,
+                })
+                .collect(),
+        })
+        .collect())
 }
diff --git a/napi/test.mjs b/napi/test.mjs
new file mode 100644
index 0000000..64f1490
--- /dev/null
+++ b/napi/test.mjs
@@ -0,0 +1,98 @@
+import { readFileSync } from 'fs';
+import { strict as assert } from 'assert';
+import {
+  processPdf,
+  detectPdf,
+  classifyPdf,
+  extractText,
+  extractTextWithPositions,
+  extractTextInRegions,
+} from './index.js';
+
+const fixture = readFileSync('../tests/fixtures/thermo-freon12.pdf');
+
+// --- processPdf ---
+console.log('Testing processPdf...');
+const result = processPdf(fixture);
+assert.equal(result.pdfType, 'TextBased');
+assert.equal(result.pageCount, 3);
+assert.ok(result.confidence > 0);
+assert.ok(result.markdown && result.markdown.length > 0);
+assert.equal(typeof result.isComplexLayout, 'boolean');
+assert.ok(Array.isArray(result.pagesWithTables));
+assert.ok(Array.isArray(result.pagesWithColumns));
+assert.equal(typeof result.hasEncodingIssues, 'boolean');
+console.log('  processPdf: OK');
+
+// processPdf with pages
+const result2 = processPdf(fixture, [1]);
+assert.ok(result2.markdown && result2.markdown.length > 0);
+console.log('  processPdf with pages: OK');
+
+// --- detectPdf ---
+console.log('Testing detectPdf...');
+const detected = detectPdf(fixture);
+assert.equal(detected.pdfType, 'TextBased');
+assert.equal(detected.pageCount, 3);
+assert.equal(detected.markdown, undefined);
+console.log('  detectPdf: OK');
+
+// --- classifyPdf ---
+console.log('Testing classifyPdf...');
+const classified = classifyPdf(fixture);
+assert.equal(classified.pdfType, 'TextBased');
+assert.equal(classified.pageCount, 3);
+assert.ok(classified.confidence > 0);
+assert.ok(Array.isArray(classified.pagesNeedingOcr));
+console.log('  classifyPdf: OK');
+
+// --- extractText ---
+console.log('Testing extractText...');
+const text = extractText(fixture);
+assert.equal(typeof text, 'string');
+assert.ok(text.length > 0);
+console.log('  extractText: OK');
+
+// --- extractTextWithPositions ---
+console.log('Testing extractTextWithPositions...');
+const items = extractTextWithPositions(fixture);
+assert.ok(items.length > 0);
+const item = items[0];
+assert.equal(typeof item.text, 'string');
+assert.equal(typeof item.x, 'number');
+assert.equal(typeof item.y, 'number');
+assert.equal(typeof item.width, 'number');
+assert.equal(typeof item.height, 'number');
+assert.equal(typeof item.font, 'string');
+assert.equal(typeof item.fontSize, 'number');
+assert.equal(typeof item.page, 'number');
+assert.equal(typeof item.isBold, 'boolean');
+assert.equal(typeof item.isItalic, 'boolean');
+assert.equal(typeof item.itemType, 'string');
+console.log('  extractTextWithPositions: OK');
+
+// with pages filter
+const page1Items = extractTextWithPositions(fixture, [1]);
+assert.ok(page1Items.length > 0);
+assert.ok(page1Items.every(i => i.page === 1));
+console.log('  extractTextWithPositions with pages: OK');
+
+// --- extractTextInRegions ---
+console.log('Testing extractTextInRegions...');
+const regionResults = extractTextInRegions(fixture, [
+  { page: 0, regions: [[0, 0, 600, 100]] },
+]);
+assert.equal(regionResults.length, 1);
+assert.equal(regionResults[0].page, 0);
+assert.equal(regionResults[0].regions.length, 1);
+assert.equal(typeof regionResults[0].regions[0].text, 'string');
+assert.equal(typeof regionResults[0].regions[0].needsOcr, 'boolean');
+console.log('  extractTextInRegions: OK');
+
+// --- Error handling ---
+console.log('Testing error handling...');
+assert.throws(() => processPdf(Buffer.from('not a pdf')), /process_pdf/);
+assert.throws(() => classifyPdf(Buffer.from('')), /classify_pdf/);
+console.log('  error handling: OK');
+
+console.log('\nAll NAPI tests passed!');
diff --git a/pdf_inspector.pyi b/pdf_inspector.pyi
index 993a59e..041e321 100644
--- a/pdf_inspector.pyi
+++ b/pdf_inspector.pyi
@@ -17,6 +17,15 @@ class PdfResult:
     pages_with_columns: list[int]
     has_encoding_issues: bool
 
+class PdfClassification:
+    """Lightweight PDF classification result."""
+    pdf_type: str
+    """'text_based', 'scanned', 'image_based', or 'mixed'."""
+    page_count: int
+    pages_needing_ocr: list[int]
+    """0-indexed page numbers that need OCR."""
+    confidence: float
+
 class TextItem:
     """A positioned text item extracted from a PDF."""
     text: str
@@ -31,6 +40,18 @@ class TextItem:
     is_italic: bool
     item_type: str
 
+class RegionText:
+    """Extracted text for a single region."""
+    text: str
+    needs_ocr: bool
+    """True when the text should not be trusted (empty, GID fonts, garbage, encoding issues)."""
+
+class PageRegionTexts:
+    """Extracted text for one page's regions."""
+    page: int
+    """0-indexed page number."""
+    regions: list[RegionText]
+
 def process_pdf(path: str, pages: Optional[list[int]] = None) -> PdfResult:
     """Process a PDF: detect type, extract text, convert to Markdown."""
     ...
@@ -47,10 +68,50 @@ def detect_pdf_bytes(data: bytes) -> PdfResult:
     """Fast detection from bytes."""
     ...
 
+def classify_pdf(path: str) -> PdfClassification:
+    """Lightweight classification — type, page count, and OCR pages (0-indexed)."""
+    ...
+
+def classify_pdf_bytes(data: bytes) -> PdfClassification:
+    """Lightweight classification from bytes — type, page count, and OCR pages (0-indexed)."""
+    ...
+
 def extract_text(path: str) -> str:
     """Extract plain text from a PDF."""
     ...
 
+def extract_text_bytes(data: bytes) -> str:
+    """Extract plain text from PDF bytes."""
+    ...
+
 def extract_text_with_positions(path: str, pages: Optional[list[int]] = None) -> list[TextItem]:
     """Extract text with position information."""
     ...
+
+def extract_text_with_positions_bytes(data: bytes, pages: Optional[list[int]] = None) -> list[TextItem]:
+    """Extract text with position information from bytes."""
+    ...
+
+def extract_text_in_regions(
+    path: str,
+    page_regions: list[tuple[int, list[list[float]]]],
+) -> list[PageRegionTexts]:
+    """Extract text within bounding-box regions from a PDF file.
+
+    Args:
+        path: Path to the PDF file.
+        page_regions: List of (page_0indexed, [[x1, y1, x2, y2], ...]) tuples.
+    """
+    ...
+
+def extract_text_in_regions_bytes(
+    data: bytes,
+    page_regions: list[tuple[int, list[list[float]]]],
+) -> list[PageRegionTexts]:
+    """Extract text within bounding-box regions from PDF bytes.
+
+    Args:
+        data: PDF file contents as bytes.
+        page_regions: List of (page_0indexed, [[x1, y1, x2, y2], ...]) tuples.
+    """
+    ...
diff --git a/src/python.rs b/src/python.rs
index 3fb645d..a9bef94 100644
--- a/src/python.rs
+++ b/src/python.rs
@@ -1,7 +1,7 @@
 //! PyO3 Python bindings for pdf-inspector.
 
-use pyo3::prelude::*;
 use pyo3::exceptions::PyValueError;
+use pyo3::prelude::*;
 use std::collections::HashSet;
 
 use crate::detector::PdfType;
@@ -60,33 +60,86 @@ impl PyPdfResult {
     }
 }
 
-fn pdf_type_str(t: PdfType) -> String {
-    match t {
-        PdfType::TextBased => "text_based".into(),
-        PdfType::Scanned => "scanned".into(),
-        PdfType::ImageBased => "image_based".into(),
-        PdfType::Mixed => "mixed".into(),
+// ---------------------------------------------------------------------------
+// Classification wrapper (lightweight)
+// ---------------------------------------------------------------------------
+
+/// Lightweight PDF classification result.
+#[pyclass(name = "PdfClassification")]
+#[derive(Clone)]
+pub struct PyPdfClassification {
+    /// The detected PDF type: "text_based", "scanned", "image_based", or "mixed".
+    #[pyo3(get)]
+    pub pdf_type: String,
+    /// Total number of pages.
+    #[pyo3(get)]
+    pub page_count: u32,
+    /// 0-indexed page numbers that need OCR.
+    #[pyo3(get)]
+    pub pages_needing_ocr: Vec<u32>,
+    /// Detection confidence (0.0-1.0).
+    #[pyo3(get)]
+    pub confidence: f32,
+}
+
+#[pymethods]
+impl PyPdfClassification {
+    fn __repr__(&self) -> String {
+        format!(
+            "PdfClassification(pdf_type='{}', pages={}, confidence={:.2})",
+            self.pdf_type, self.page_count, self.confidence
+        )
     }
 }
 
-fn to_py_result(r: crate::PdfProcessResult) -> PyPdfResult {
-    PyPdfResult {
-        pdf_type: pdf_type_str(r.pdf_type),
-        markdown: r.markdown,
-        page_count: r.page_count,
-        processing_time_ms: r.processing_time_ms,
-        pages_needing_ocr: r.pages_needing_ocr,
-        title: r.title,
-        confidence: r.confidence,
-        is_complex_layout: r.layout.is_complex,
-        pages_with_tables: r.layout.pages_with_tables,
-        pages_with_columns: r.layout.pages_with_columns,
-        has_encoding_issues: r.has_encoding_issues,
+// ---------------------------------------------------------------------------
+// Region extraction wrappers
+// ---------------------------------------------------------------------------
+
+/// Extracted text for a single region.
+#[pyclass(name = "RegionText")]
+#[derive(Clone)]
+pub struct PyRegionText {
+    /// Extracted text content.
+    #[pyo3(get)]
+    pub text: String,
+    /// True when the text should not be trusted (empty, GID fonts, garbage, encoding issues).
+    #[pyo3(get)]
+    pub needs_ocr: bool,
+}
+
+#[pymethods]
+impl PyRegionText {
+    fn __repr__(&self) -> String {
+        format!(
+            "RegionText(text='{}', needs_ocr={})",
+            self.text.chars().take(40).collect::<String>(),
+            self.needs_ocr
+        )
     }
 }
 
-fn to_py_err(e: crate::PdfError) -> PyErr {
-    PyValueError::new_err(e.to_string())
+/// Extracted text for one page's regions.
+#[pyclass(name = "PageRegionTexts")]
+#[derive(Clone)]
+pub struct PyPageRegionTexts {
+    /// 0-indexed page number.
+    #[pyo3(get)]
+    pub page: u32,
+    /// Per-region results, parallel to the input regions.
+    #[pyo3(get)]
+    pub regions: Vec<PyRegionText>,
+}
+
+#[pymethods]
+impl PyPageRegionTexts {
+    fn __repr__(&self) -> String {
+        format!(
+            "PageRegionTexts(page={}, regions={})",
+            self.page,
+            self.regions.len()
+        )
+    }
 }
 
 // ---------------------------------------------------------------------------
@@ -134,6 +187,39 @@ impl PyTextItem {
     }
 }
 
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+fn pdf_type_str(t: PdfType) -> String {
+    match t {
+        PdfType::TextBased => "text_based".into(),
+        PdfType::Scanned => "scanned".into(),
+        PdfType::ImageBased => "image_based".into(),
+        PdfType::Mixed => "mixed".into(),
+    }
+}
+
+fn to_py_result(r: crate::PdfProcessResult) -> PyPdfResult {
+    PyPdfResult {
+        pdf_type: pdf_type_str(r.pdf_type),
+        markdown: r.markdown,
+        page_count: r.page_count,
+        processing_time_ms: r.processing_time_ms,
+        pages_needing_ocr: r.pages_needing_ocr,
+        title: r.title,
+        confidence: r.confidence,
+        is_complex_layout: r.layout.is_complex,
+        pages_with_tables: r.layout.pages_with_tables,
+        pages_with_columns: r.layout.pages_with_columns,
+        has_encoding_issues: r.has_encoding_issues,
+    }
+}
+
+fn to_py_err(e: crate::PdfError) -> PyErr {
+    PyValueError::new_err(e.to_string())
+}
+
 fn item_type_str(t: &ItemType) -> String {
     match t {
         ItemType::Text => "text".into(),
@@ -143,18 +229,66 @@ fn item_type_str(t: &ItemType) -> String {
     }
 }
 
+fn convert_text_items(items: Vec<crate::TextItem>) -> Vec<PyTextItem> {
+    items
+        .into_iter()
+        .map(|item| PyTextItem {
+            text: item.text,
+            x: item.x,
+            y: item.y,
+            width: item.width,
+            height: item.height,
+            font: item.font,
+            font_size: item.font_size,
+            page: item.page,
+            is_bold: item.is_bold,
+            is_italic: item.is_italic,
+            item_type: item_type_str(&item.item_type),
+        })
+        .collect()
+}
+
+fn parse_page_regions(page_regions: Vec<(u32, Vec<Vec<f64>>)>) -> Vec<(u32, Vec<[f32; 4]>)> {
+    page_regions
+        .into_iter()
+        .map(|(page, regions)| {
+            let bboxes: Vec<[f32; 4]> = regions
+                .iter()
+                .map(|r| {
+                    if r.len() != 4 {
+                        [0.0, 0.0, 0.0, 0.0]
+                    } else {
+                        [r[0] as f32, r[1] as f32, r[2] as f32, r[3] as f32]
+                    }
+                })
+                .collect();
+            (page, bboxes)
+        })
+        .collect()
+}
+
+fn convert_region_results(results: Vec<crate::PageRegionResult>) -> Vec<PyPageRegionTexts> {
+    results
+        .into_iter()
+        .map(|page_result| PyPageRegionTexts {
+            page: page_result.page,
+            regions: page_result
+                .regions
+                .into_iter()
+                .map(|r| PyRegionText {
+                    text: r.text,
+                    needs_ocr: r.needs_ocr,
+                })
+                .collect(),
+        })
+        .collect()
+}
+
 // ---------------------------------------------------------------------------
 // Public Python API
 // ---------------------------------------------------------------------------
 
 /// Process a PDF file: detect type, extract text, and convert to Markdown.
-///
-/// Args:
-///     path: Path to the PDF file.
-///     pages: Optional list of 1-indexed page numbers to process.
-///
-/// Returns:
-///     PdfResult with markdown, pdf_type, and metadata.
 #[pyfunction]
 #[pyo3(signature = (path, pages=None))]
 fn process_pdf(path: &str, pages: Option<Vec<u32>>) -> PyResult<PyPdfResult> {
@@ -167,13 +301,6 @@ fn process_pdf(path: &str, pages: Option<Vec<u32>>) -> PyResult<PyPdfResult> {
 }
 
 /// Process a PDF from bytes in memory.
-///
-/// Args:
-///     data: PDF file contents as bytes.
-///     pages: Optional list of 1-indexed page numbers to process.
-///
-/// Returns:
-///     PdfResult with markdown, pdf_type, and metadata.
 #[pyfunction]
 #[pyo3(signature = (data, pages=None))]
 fn process_pdf_bytes(data: &[u8], pages: Option<Vec<u32>>) -> PyResult<PyPdfResult> {
@@ -186,12 +313,6 @@ fn process_pdf_bytes(data: &[u8], pages: Option<Vec<u32>>) -> PyResult<PyPdfResu
 }
 
 /// Fast detection only — no text extraction or markdown.
-///
-/// Args:
-///     path: Path to the PDF file.
-///
-/// Returns:
-///     PdfResult with pdf_type and metadata (markdown will be None).
 #[pyfunction]
 fn detect_pdf(path: &str) -> PyResult<PyPdfResult> {
     let result = crate::detect_pdf(path).map_err(to_py_err)?;
@@ -205,26 +326,41 @@ fn detect_pdf_bytes(data: &[u8]) -> PyResult<PyPdfResult> {
     Ok(to_py_result(result))
 }
 
+/// Lightweight PDF classification — returns type, page count, and OCR pages.
+/// Faster than detect_pdf as it skips building the full PdfProcessResult.
+/// Pages in pages_needing_ocr are 0-indexed.
+#[pyfunction]
+fn classify_pdf(path: &str) -> PyResult<PyPdfClassification> {
+    let data = std::fs::read(path).map_err(|e| PyValueError::new_err(e.to_string()))?;
+    classify_pdf_bytes(&data)
+}
+
+/// Lightweight PDF classification from bytes.
+/// Pages in pages_needing_ocr are 0-indexed.
+#[pyfunction]
+fn classify_pdf_bytes(data: &[u8]) -> PyResult<PyPdfClassification> {
+    let result = crate::classify_pdf_mem(data).map_err(to_py_err)?;
+    Ok(PyPdfClassification {
+        pdf_type: pdf_type_str(result.pdf_type),
+        page_count: result.page_count,
+        pages_needing_ocr: result.pages_needing_ocr,
+        confidence: result.confidence,
+    })
+}
+
 /// Extract plain text from a PDF file.
-///
-/// Args:
-///     path: Path to the PDF file.
-///
-/// Returns:
-///     Extracted text as a string.
 #[pyfunction]
 fn extract_text(path: &str) -> PyResult<String> {
     crate::extract_text(path).map_err(to_py_err)
 }
 
-/// Extract text with position information.
-///
-/// Args:
-///     path: Path to the PDF file.
-///     pages: Optional list of 1-indexed page numbers.
-///
-/// Returns:
-///     List of TextItem objects with text, position, font info.
+/// Extract plain text from PDF bytes.
+#[pyfunction]
+fn extract_text_bytes(data: &[u8]) -> PyResult<String> {
+    crate::extractor::extract_text_mem(data).map_err(to_py_err)
+}
+
+/// Extract text with position information from a file (`pages`: optional list of 1-indexed page numbers).
 #[pyfunction]
 #[pyo3(signature = (path, pages=None))]
 fn extract_text_with_positions(path: &str, pages: Option<Vec<u32>>) -> PyResult<Vec<PyTextItem>> {
@@ -235,35 +371,83 @@ fn extract_text_with_positions(path: &str, pages: Option<Vec<u32>>) -> PyResult<
         }
         None => crate::extract_text_with_positions(path).map_err(to_py_err)?,
     };
+    Ok(convert_text_items(items))
+}
 
-    Ok(items
-        .into_iter()
-        .map(|item| PyTextItem {
-            text: item.text,
-            x: item.x,
-            y: item.y,
-            width: item.width,
-            height: item.height,
-            font: item.font,
-            font_size: item.font_size,
-            page: item.page,
-            is_bold: item.is_bold,
-            is_italic: item.is_italic,
-            item_type: item_type_str(&item.item_type),
-        })
-        .collect())
+/// Extract text with position information from bytes; `pages` optionally restricts which pages are read.
+#[pyfunction]
+#[pyo3(signature = (data, pages=None))]
+fn extract_text_with_positions_bytes(
+    data: &[u8],
+    pages: Option<Vec<u32>>, // looks 1-indexed per tests (pages=[1] -> item.page == 1); confirm
+) -> PyResult<Vec<PyTextItem>> {
+    let items = match pages {
+        Some(p) => {
+            let page_set: HashSet<u32> = p.into_iter().collect(); // set drops duplicate page numbers
+            crate::extractor::extract_text_with_positions_mem_pages(data, Some(&page_set))
+                .map_err(to_py_err)?
+        }
+        None => crate::extractor::extract_text_with_positions_mem(data).map_err(to_py_err)?,
+    };
+    Ok(convert_text_items(items))
+}
+
+/// Extract text within bounding-box regions from a PDF file.
+///
+/// Args:
+///     path: Path to the PDF file (read fully into memory; ValueError if unreadable).
+///     page_regions: List of (page_0indexed, [[x1, y1, x2, y2], ...]) tuples.
+///         Coordinates are PDF points with top-left origin.
+///
+/// Returns:
+///     List of PageRegionTexts (page + regions); each region has text and a needs_ocr flag.
+#[pyfunction]
+fn extract_text_in_regions(
+    path: &str,
+    page_regions: Vec<(u32, Vec<Vec<f64>>)>,
+) -> PyResult<Vec<PyPageRegionTexts>> {
+    let data = std::fs::read(path).map_err(|e| PyValueError::new_err(e.to_string()))?;
+    extract_text_in_regions_bytes(&data, page_regions)
+}
+
+/// Extract text within bounding-box regions from PDF bytes.
+///
+/// Args:
+///     data: PDF file contents as bytes.
+///     page_regions: List of (page_0indexed, [[x1, y1, x2, y2], ...]) tuples.
+///         Coordinates are PDF points with top-left origin.
+///
+/// Returns:
+///     List of PageRegionTexts (page + regions); each region has text and a needs_ocr flag.
+#[pyfunction]
+fn extract_text_in_regions_bytes(
+    data: &[u8],
+    page_regions: Vec<(u32, Vec<Vec<f64>>)>,
+) -> PyResult<Vec<PyPageRegionTexts>> {
+    let regions = parse_page_regions(page_regions);
+    let results = crate::extract_text_in_regions_mem(data, &regions).map_err(to_py_err)?;
+    Ok(convert_region_results(results))
 }
 
 /// Python module definition.
 #[pymodule]
 fn pdf_inspector(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyPdfResult>()?;
+    m.add_class::<PyPdfClassification>()?;
     m.add_class::<PyTextItem>()?;
+    m.add_class::<PyRegionText>()?;
+    m.add_class::<PyPageRegionTexts>()?;
     m.add_function(wrap_pyfunction!(process_pdf, m)?)?;
     m.add_function(wrap_pyfunction!(process_pdf_bytes, m)?)?;
     m.add_function(wrap_pyfunction!(detect_pdf, m)?)?;
     m.add_function(wrap_pyfunction!(detect_pdf_bytes, m)?)?;
+    m.add_function(wrap_pyfunction!(classify_pdf, m)?)?;
+    m.add_function(wrap_pyfunction!(classify_pdf_bytes, m)?)?;
     m.add_function(wrap_pyfunction!(extract_text, m)?)?;
+    m.add_function(wrap_pyfunction!(extract_text_bytes, m)?)?;
     m.add_function(wrap_pyfunction!(extract_text_with_positions, m)?)?;
+    m.add_function(wrap_pyfunction!(extract_text_with_positions_bytes, m)?)?;
+    m.add_function(wrap_pyfunction!(extract_text_in_regions, m)?)?;
+    m.add_function(wrap_pyfunction!(extract_text_in_regions_bytes, m)?)?;
     Ok(())
 }
diff --git a/tests/test_python.py b/tests/test_python.py
index 379f8b4..f8daffa 100644
--- a/tests/test_python.py
+++ b/tests/test_python.py
@@ -11,6 +11,11 @@ def fixture_path(name: str) -> str:
     return os.path.join(FIXTURES_DIR, name)
 
 
+def fixture_bytes(name: str) -> bytes:
+    with open(fixture_path(name), "rb") as f:
+        return f.read()
+
+
 # ---------------------------------------------------------------------------
 # process_pdf
 # ---------------------------------------------------------------------------
@@ -61,15 +66,13 @@ def test_result_fields(self):
 
 class TestProcessPdfBytes:
     def test_basic(self):
-        with open(fixture_path("thermo-freon12.pdf"), "rb") as f:
-            data = f.read()
+        data = fixture_bytes("thermo-freon12.pdf")
         result = pdf_inspector.process_pdf_bytes(data)
         assert result.pdf_type == "text_based"
         assert result.markdown is not None
 
     def test_with_pages(self):
-        with open(fixture_path("thermo-freon12.pdf"), "rb") as f:
-            data = f.read()
+        data = fixture_bytes("thermo-freon12.pdf")
         result = pdf_inspector.process_pdf_bytes(data, pages=[1, 2])
         assert result.markdown is not None
 
@@ -87,15 +90,48 @@ def test_detect_file(self):
         assert result.page_count == 3
 
     def test_detect_bytes(self):
-        with open(fixture_path("thermo-freon12.pdf"), "rb") as f:
-            data = f.read()
+        data = fixture_bytes("thermo-freon12.pdf")
         result = pdf_inspector.detect_pdf_bytes(data)
         assert result.pdf_type == "text_based"
         assert result.markdown is None
 
 
 # ---------------------------------------------------------------------------
-# extract_text
+# classify_pdf / classify_pdf_bytes
+# ---------------------------------------------------------------------------
+
+
+class TestClassifyPdf:  # covers classify_pdf (path) and classify_pdf_bytes (bytes)
+    def test_classify_file(self):
+        result = pdf_inspector.classify_pdf(fixture_path("thermo-freon12.pdf"))
+        assert result.pdf_type == "text_based"
+        assert result.page_count == 3
+        assert result.confidence > 0.0
+        assert isinstance(result.pages_needing_ocr, list)
+
+    def test_classify_bytes(self):
+        data = fixture_bytes("thermo-freon12.pdf")
+        result = pdf_inspector.classify_pdf_bytes(data)
+        assert result.pdf_type == "text_based"
+        assert result.page_count == 3
+        assert result.confidence > 0.0
+
+    def test_classify_repr(self):
+        result = pdf_inspector.classify_pdf(fixture_path("thermo-freon12.pdf"))
+        r = repr(result)
+        assert "PdfClassification" in r
+        assert "text_based" in r  # repr includes the classified type
+
+    def test_classify_fields(self):  # pins the Python-side type of every result field
+        result = pdf_inspector.classify_pdf(fixture_path("thermo-freon12.pdf"))
+        assert isinstance(result.pdf_type, str)
+        assert isinstance(result.page_count, int)
+        assert isinstance(result.pages_needing_ocr, list)
+        assert isinstance(result.confidence, float)
+
+# ---------------------------------------------------------------------------
+# extract_text / extract_text_bytes
 # ---------------------------------------------------------------------------
 
 
@@ -105,9 +141,20 @@ def test_basic(self):
         assert isinstance(text, str)
         assert len(text) > 0
 
+    def test_bytes(self):
+        data = fixture_bytes("thermo-freon12.pdf")
+        text = pdf_inspector.extract_text_bytes(data)
+        assert isinstance(text, str)
+        assert len(text) > 0
+
+    def test_bytes_matches_file(self):
+        text_file = pdf_inspector.extract_text(fixture_path("thermo-freon12.pdf"))
+        text_bytes = pdf_inspector.extract_text_bytes(fixture_bytes("thermo-freon12.pdf"))
+        assert text_file == text_bytes
+
 
 # ---------------------------------------------------------------------------
-# extract_text_with_positions
+# extract_text_with_positions / extract_text_with_positions_bytes
 # ---------------------------------------------------------------------------
 
 
@@ -144,6 +191,77 @@ def test_repr(self):
         r = repr(items[0])
         assert "TextItem" in r
 
+    def test_bytes(self):
+        data = fixture_bytes("thermo-freon12.pdf")
+        items = pdf_inspector.extract_text_with_positions_bytes(data)
+        assert len(items) > 0
+        assert isinstance(items[0].text, str)
+
+    def test_bytes_with_pages(self):
+        data = fixture_bytes("thermo-freon12.pdf")
+        items = pdf_inspector.extract_text_with_positions_bytes(data, pages=[1])
+        assert len(items) > 0
+        assert all(item.page == 1 for item in items)  # requesting page 1 yields only page-1 items
+
+
+# ---------------------------------------------------------------------------
+# extract_text_in_regions / extract_text_in_regions_bytes
+# ---------------------------------------------------------------------------
+
+
+class TestExtractTextInRegions:  # region input is [(page, [[x1, y1, x2, y2], ...]), ...]
+    def test_file(self):
+        results = pdf_inspector.extract_text_in_regions(
+            fixture_path("thermo-freon12.pdf"),
+            [(0, [[0.0, 0.0, 600.0, 100.0]])],  # page 0, a single box
+        )
+        assert len(results) == 1
+        assert results[0].page == 0
+        assert len(results[0].regions) == 1
+        assert isinstance(results[0].regions[0].text, str)
+        assert isinstance(results[0].regions[0].needs_ocr, bool)
+
+    def test_bytes(self):
+        data = fixture_bytes("thermo-freon12.pdf")
+        results = pdf_inspector.extract_text_in_regions_bytes(
+            data,
+            [(0, [[0.0, 0.0, 600.0, 100.0]])],
+        )
+        assert len(results) == 1
+        assert results[0].page == 0
+        assert len(results[0].regions) == 1
+        assert isinstance(results[0].regions[0].text, str)
+
+    def test_repr(self):
+        results = pdf_inspector.extract_text_in_regions(
+            fixture_path("thermo-freon12.pdf"),
+            [(0, [[0.0, 0.0, 600.0, 100.0]])],
+        )
+        r = repr(results[0])
+        assert "PageRegionTexts" in r
+        r2 = repr(results[0].regions[0])
+        assert "RegionText" in r2
+
+    def test_multiple_regions(self):
+        results = pdf_inspector.extract_text_in_regions(
+            fixture_path("thermo-freon12.pdf"),
+            [(0, [[0.0, 0.0, 300.0, 100.0], [300.0, 0.0, 600.0, 100.0]])],  # two boxes, same page
+        )
+        assert len(results) == 1
+        assert len(results[0].regions) == 2  # one RegionText per requested box
+
+    def test_multiple_pages(self):
+        results = pdf_inspector.extract_text_in_regions(
+            fixture_path("thermo-freon12.pdf"),
+            [
+                (0, [[0.0, 0.0, 600.0, 100.0]]),
+                (1, [[0.0, 0.0, 600.0, 100.0]]),
+            ],
+        )
+        assert len(results) == 2  # one PageRegionTexts per requested page entry
+        assert results[0].page == 0
+        assert results[1].page == 1
+
 
 # ---------------------------------------------------------------------------
 # Error handling
@@ -163,6 +281,24 @@ def test_empty_bytes(self):
         with pytest.raises(ValueError):
             pdf_inspector.process_pdf_bytes(b"")
 
+    def test_classify_not_a_pdf(self):
+        with pytest.raises(ValueError):
+            pdf_inspector.classify_pdf_bytes(b"not a pdf")
+
+    def test_classify_nonexistent(self):
+        with pytest.raises((ValueError, OSError)):
+            pdf_inspector.classify_pdf("/nonexistent/file.pdf")
+
+    def test_extract_text_bytes_not_a_pdf(self):
+        with pytest.raises(ValueError):
+            pdf_inspector.extract_text_bytes(b"not a pdf")
+
+    def test_regions_not_a_pdf(self):
+        with pytest.raises(ValueError):
+            pdf_inspector.extract_text_in_regions_bytes(
+                b"not a pdf", [(0, [[0.0, 0.0, 100.0, 100.0]])]
+            )
+
 
 # ---------------------------------------------------------------------------
 # Multiple fixtures

From 2e874f6cfd6991b1c23740b20cc0d0414dacf523 Mon Sep 17 00:00:00 2001
From: Abimael Martell <abimex@gmail.com>
Date: Thu, 2 Apr 2026 12:18:06 -0700
Subject: [PATCH 3/3] update napi index.js version strings to match 0.2.2

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 napi/index.js | 104 +++++++++++++++++++++++++-------------------------
 1 file changed, 52 insertions(+), 52 deletions(-)

diff --git a/napi/index.js b/napi/index.js
index 41d142e..df6cd7e 100644
--- a/napi/index.js
+++ b/napi/index.js
@@ -77,8 +77,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-android-arm64')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-android-arm64/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -93,8 +93,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-android-arm-eabi')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-android-arm-eabi/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -114,8 +114,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-win32-x64-gnu')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-win32-x64-gnu/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -130,8 +130,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-win32-x64-msvc')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-win32-x64-msvc/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -147,8 +147,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-win32-ia32-msvc')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-win32-ia32-msvc/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -163,8 +163,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-win32-arm64-msvc')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-win32-arm64-msvc/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -182,8 +182,8 @@ function requireNative() {
     try {
       const binding = require('firecrawl-pdf-inspector-darwin-universal')
       const bindingPackageVersion = require('firecrawl-pdf-inspector-darwin-universal/package.json').version
-      if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-        throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+      if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+        throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
       }
       return binding
     } catch (e) {
@@ -198,8 +198,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-darwin-x64')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-darwin-x64/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -214,8 +214,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-darwin-arm64')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-darwin-arm64/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -234,8 +234,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-freebsd-x64')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-freebsd-x64/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -250,8 +250,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-freebsd-arm64')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-freebsd-arm64/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -271,8 +271,8 @@ function requireNative() {
         try {
           const binding = require('firecrawl-pdf-inspector-linux-x64-musl')
           const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-x64-musl/package.json').version
-          if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-            throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+          if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+            throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
           }
           return binding
         } catch (e) {
@@ -287,8 +287,8 @@ function requireNative() {
         try {
           const binding = require('firecrawl-pdf-inspector-linux-x64-gnu')
           const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-x64-gnu/package.json').version
-          if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-            throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+          if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+            throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
           }
           return binding
         } catch (e) {
@@ -305,8 +305,8 @@ function requireNative() {
         try {
           const binding = require('firecrawl-pdf-inspector-linux-arm64-musl')
           const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-arm64-musl/package.json').version
-          if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-            throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+          if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+            throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
           }
           return binding
         } catch (e) {
@@ -321,8 +321,8 @@ function requireNative() {
         try {
           const binding = require('firecrawl-pdf-inspector-linux-arm64-gnu')
           const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-arm64-gnu/package.json').version
-          if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-            throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+          if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+            throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
           }
           return binding
         } catch (e) {
@@ -339,8 +339,8 @@ function requireNative() {
         try {
           const binding = require('firecrawl-pdf-inspector-linux-arm-musleabihf')
           const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-arm-musleabihf/package.json').version
-          if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-            throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+          if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+            throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
           }
           return binding
         } catch (e) {
@@ -355,8 +355,8 @@ function requireNative() {
         try {
           const binding = require('firecrawl-pdf-inspector-linux-arm-gnueabihf')
           const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-arm-gnueabihf/package.json').version
-          if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-            throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+          if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+            throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
           }
           return binding
         } catch (e) {
@@ -373,8 +373,8 @@ function requireNative() {
         try {
           const binding = require('firecrawl-pdf-inspector-linux-loong64-musl')
           const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-loong64-musl/package.json').version
-          if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-            throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+          if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+            throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
           }
           return binding
         } catch (e) {
@@ -389,8 +389,8 @@ function requireNative() {
         try {
           const binding = require('firecrawl-pdf-inspector-linux-loong64-gnu')
           const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-loong64-gnu/package.json').version
-          if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-            throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+          if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+            throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
           }
           return binding
         } catch (e) {
@@ -407,8 +407,8 @@ function requireNative() {
         try {
           const binding = require('firecrawl-pdf-inspector-linux-riscv64-musl')
           const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-riscv64-musl/package.json').version
-          if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-            throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+          if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+            throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
           }
           return binding
         } catch (e) {
@@ -423,8 +423,8 @@ function requireNative() {
         try {
           const binding = require('firecrawl-pdf-inspector-linux-riscv64-gnu')
           const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-riscv64-gnu/package.json').version
-          if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-            throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+          if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+            throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
           }
           return binding
         } catch (e) {
@@ -440,8 +440,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-linux-ppc64-gnu')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-ppc64-gnu/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -456,8 +456,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-linux-s390x-gnu')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-linux-s390x-gnu/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -476,8 +476,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-openharmony-arm64')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-openharmony-arm64/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -492,8 +492,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-openharmony-x64')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-openharmony-x64/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {
@@ -508,8 +508,8 @@ function requireNative() {
       try {
         const binding = require('firecrawl-pdf-inspector-openharmony-arm')
         const bindingPackageVersion = require('firecrawl-pdf-inspector-openharmony-arm/package.json').version
-        if (bindingPackageVersion !== '0.2.0' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
-          throw new Error(`Native binding package version mismatch, expected 0.2.0 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
+        if (bindingPackageVersion !== '0.2.2' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
+          throw new Error(`Native binding package version mismatch, expected 0.2.2 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
         }
         return binding
       } catch (e) {