Merged
8 changes: 8 additions & 0 deletions Cargo.toml
@@ -8,7 +8,14 @@ description = "Fast PDF inspection, classification, and text extraction with sma
license = "MIT"
repository = "https://github.com/firecrawl/pdf-inspector"

[lib]
name = "pdf_inspector"
crate-type = ["lib", "cdylib"]

[dependencies]
# Python bindings
pyo3 = { version = "0.25", features = ["extension-module"], optional = true }

# PDF parsing
lopdf = { git = "https://github.com/J-F-Liu/lopdf", rev = "052674053814a9f4897af94f0b8e46a545c9b329", features = ["rayon"] }

@@ -35,6 +42,7 @@ tempfile = "3.3"

[features]
default = []
python = ["pyo3"]

[[bin]]
name = "pdf2md"
115 changes: 113 additions & 2 deletions README.md
@@ -15,10 +15,120 @@ Built by [Firecrawl](https://firecrawl.dev) to handle text-based PDFs locally in
- **Encoding issue detection** — Automatically flags broken font encodings (garbled text, replacement characters) so callers can fall back to OCR.
- **Single document load** — The document is parsed once and shared between detection and extraction, avoiding redundant I/O.
- **Lightweight** — Pure Rust, no ML models, no external services. Single dependency on `lopdf` for PDF parsing.
- **Python bindings** — Use from Python via PyO3. Install with `pip install pdf-inspector` or build from source with `maturin`.

## Quick start

### As a library
### Python

Install from source (requires Rust toolchain):

```bash
pip install maturin
maturin develop --release
```

Use it:

```python
import pdf_inspector

# Full processing: detect + extract + convert to Markdown
result = pdf_inspector.process_pdf("document.pdf")
print(result.pdf_type) # "text_based", "scanned", "image_based", "mixed"
print(result.confidence) # 0.0 - 1.0
print(result.page_count) # number of pages
print(result.markdown) # Markdown string or None

# Process specific pages only
result = pdf_inspector.process_pdf("document.pdf", pages=[1, 3, 5])

# Process from bytes (no filesystem needed)
with open("document.pdf", "rb") as f:
    result = pdf_inspector.process_pdf_bytes(f.read())

# Fast detection only (no text extraction)
result = pdf_inspector.detect_pdf("document.pdf")
if result.pdf_type == "text_based":
    print("Can extract locally!")
else:
    print(f"Pages needing OCR: {result.pages_needing_ocr}")

# Plain text extraction
text = pdf_inspector.extract_text("document.pdf")

# Positioned text items with font info
items = pdf_inspector.extract_text_with_positions("document.pdf")
for item in items[:5]:
    print(f"'{item.text}' at ({item.x:.0f}, {item.y:.0f}) size={item.font_size}")
```

#### Python API reference

| Function | Description |
|---|---|
| `process_pdf(path, pages=None)` | Full processing (detect + extract + markdown) |
| `process_pdf_bytes(data, pages=None)` | Full processing from bytes |
| `detect_pdf(path)` | Fast detection only (returns PdfResult) |
| `detect_pdf_bytes(data)` | Fast detection from bytes |
| `classify_pdf(path)` | Lightweight classification (returns PdfClassification) |
| `classify_pdf_bytes(data)` | Lightweight classification from bytes |
| `extract_text(path)` | Plain text extraction |
| `extract_text_bytes(data)` | Plain text extraction from bytes |
| `extract_text_with_positions(path, pages=None)` | Text with X/Y coords and font info |
| `extract_text_with_positions_bytes(data, pages=None)` | Text with positions from bytes |
| `extract_text_in_regions(path, page_regions)` | Extract text in bounding-box regions |
| `extract_text_in_regions_bytes(data, page_regions)` | Region extraction from bytes |

**`PdfResult` fields:** `pdf_type`, `markdown`, `page_count`, `processing_time_ms`, `pages_needing_ocr`, `title`, `confidence`, `is_complex_layout`, `pages_with_tables`, `pages_with_columns`, `has_encoding_issues`

**`PdfClassification` fields:** `pdf_type`, `page_count`, `pages_needing_ocr` (0-indexed), `confidence`

**`TextItem` fields:** `text`, `x`, `y`, `width`, `height`, `font`, `font_size`, `page`, `is_bold`, `is_italic`, `item_type`

**`RegionText` fields:** `text`, `needs_ocr`

**`PageRegionTexts` fields:** `page` (0-indexed), `regions` (list of RegionText)

### Node.js (NAPI)

```bash
npm install @firecrawl/pdf-inspector-js
```

```javascript
import { readFileSync } from 'fs';
import { processPdf, classifyPdf, extractTextInRegions } from '@firecrawl/pdf-inspector-js';

const buffer = readFileSync('document.pdf');

// Full processing
const result = processPdf(buffer);
console.log(result.pdfType); // "TextBased", "Scanned", "ImageBased", "Mixed"
console.log(result.markdown); // Markdown string or null

// Lightweight classification
const cls = classifyPdf(buffer);
console.log(cls.pdfType, cls.pagesNeedingOcr);

// Region-based extraction (for hybrid OCR pipelines)
const regions = extractTextInRegions(buffer, [
  { page: 0, regions: [[0, 0, 600, 100]] }
]);
```

#### Node.js API reference

| Function | Description |
|---|---|
| `processPdf(buffer, pages?)` | Full processing (detect + extract + markdown) |
| `detectPdf(buffer)` | Fast detection only (returns PdfResult) |
| `classifyPdf(buffer)` | Lightweight classification (returns PdfClassification) |
| `extractText(buffer)` | Plain text extraction |
| `extractTextWithPositions(buffer, pages?)` | Text with X/Y coords and font info |
| `extractTextInRegions(buffer, pageRegions)` | Extract text in bounding-box regions |

### Rust

Add to your `Cargo.toml`:

@@ -159,6 +269,7 @@ The document is loaded **once** via `load_document_from_path` / `load_document_f
```
src/
lib.rs — Public API, PdfOptions builder, convenience functions
python.rs — PyO3 Python bindings
types.rs — Shared types: TextItem, TextLine, PdfRect, ItemType
text_utils.rs — Character/text helpers (CJK, RTL, ligatures, bold/italic)
process_mode.rs — ProcessMode enum (DetectOnly, Analyze, Full)
@@ -189,7 +300,7 @@ This detects 300+ page PDFs in milliseconds. The result includes `pages_needing_
| `Sample(n)` | Sample `n` evenly distributed pages (first, last, middle) | Very large PDFs where speed matters more than precision |
| `Pages(vec)` | Only scan specific 1-indexed page numbers | When the caller knows which pages to check |

## API
## Rust API

### Processing modes

98 changes: 98 additions & 0 deletions examples/basic_usage.py
@@ -0,0 +1,98 @@
"""Basic usage examples for pdf-inspector Python library."""

import sys
import pdf_inspector


def main():
    if len(sys.argv) < 2:
        print("Usage: python basic_usage.py <path-to-pdf>")
        sys.exit(1)

    path = sys.argv[1]

    # 1. Full processing: detect + extract + markdown
    print("=" * 60)
    print("Full processing")
    print("=" * 60)
    result = pdf_inspector.process_pdf(path)
    print(f"Type: {result.pdf_type}")
    print(f"Pages: {result.page_count}")
    print(f"Confidence: {result.confidence:.0%}")
    print(f"Time: {result.processing_time_ms}ms")
    print(f"Title: {result.title}")
    print(f"Complex: {result.is_complex_layout}")
    print(f"Tables on: {result.pages_with_tables}")
    print(f"Columns on: {result.pages_with_columns}")
    print(f"Encoding: {'issues detected' if result.has_encoding_issues else 'ok'}")
    print(f"OCR needed: {result.pages_needing_ocr or 'none'}")
    if result.markdown:
        print(f"\n--- Markdown ({len(result.markdown)} chars) ---")
        print(result.markdown[:500])
        if len(result.markdown) > 500:
            print(f"\n... ({len(result.markdown) - 500} more chars)")

    # 2. Fast detection only
    print("\n" + "=" * 60)
    print("Detection only")
    print("=" * 60)
    info = pdf_inspector.detect_pdf(path)
    print(f"Type: {info.pdf_type}")
    print(f"Confidence: {info.confidence:.0%}")
    print(f"Time: {info.processing_time_ms}ms")

    # 3. From bytes
    print("\n" + "=" * 60)
    print("From bytes")
    print("=" * 60)
    with open(path, "rb") as f:
        data = f.read()
    result = pdf_inspector.process_pdf_bytes(data)
    print(f"Type: {result.pdf_type}, Pages: {result.page_count}")

    # 4. Plain text
    print("\n" + "=" * 60)
    print("Plain text extraction")
    print("=" * 60)
    text = pdf_inspector.extract_text(path)
    print(text[:300])

    # 5. Positioned items
    print("\n" + "=" * 60)
    print("Positioned text items (first 10)")
    print("=" * 60)
    items = pdf_inspector.extract_text_with_positions(path, pages=[1])
    for item in items[:10]:
        bold = " [B]" if item.is_bold else ""
        italic = " [I]" if item.is_italic else ""
        print(
            f" p{item.page} ({item.x:6.1f}, {item.y:6.1f}) "
            f"size={item.font_size:5.1f}{bold}{italic} "
            f"'{item.text}'"
        )

    # 6. Lightweight classification
    print("\n" + "=" * 60)
    print("Lightweight classification")
    print("=" * 60)
    cls = pdf_inspector.classify_pdf(path)
    print(f"Type: {cls.pdf_type}")
    print(f"Pages: {cls.page_count}")
    print(f"Confidence: {cls.confidence:.0%}")
    print(f"OCR pages: {cls.pages_needing_ocr or 'none'} (0-indexed)")

    # 7. Region-based text extraction
    print("\n" + "=" * 60)
    print("Region-based text extraction (page 0, top region)")
    print("=" * 60)
    regions = pdf_inspector.extract_text_in_regions(
        path, [(0, [[0.0, 0.0, 600.0, 200.0]])]
    )
    for page_result in regions:
        for i, region in enumerate(page_result.regions):
            print(f" Region {i}: needs_ocr={region.needs_ocr}")
            print(f" Text: {region.text[:200]}")


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion napi/build.rs
@@ -1,3 +1,3 @@
fn main() {
    napi_build::setup();
}
51 changes: 48 additions & 3 deletions napi/index.d.ts
@@ -1,25 +1,35 @@
/* auto-generated by NAPI-RS */
/* eslint-disable */
/**
* Classify a PDF: detect type (TextBased/Scanned/Mixed/ImageBased),
* page count, and which pages need OCR. Takes PDF bytes as Buffer.
* Lightweight PDF classification — returns type, page count, and OCR pages.
* Faster than detectPdf as it skips building the full PdfResult.
* Pages in pagesNeedingOcr are 0-indexed.
*/
export declare function classifyPdf(buffer: Buffer): PdfClassification

/** Fast detection only — no text extraction or markdown. */
export declare function detectPdf(buffer: Buffer): PdfResult

/** Extract plain text from a PDF Buffer. */
export declare function extractText(buffer: Buffer): string

/**
* Extract text within bounding-box regions from a PDF.
*
* For hybrid OCR: layout model detects regions in rendered images,
* this extracts PDF text within those regions — skipping GPU OCR
* for text-based pages.
*
* Each region result includes `needs_ocr` — set when the extracted text
* Each region result includes `needsOcr` — set when the extracted text
* is unreliable (empty, GID-encoded fonts, garbage, encoding issues).
*
* Coordinates are PDF points with top-left origin.
*/
export declare function extractTextInRegions(buffer: Buffer, pageRegions: Array<PageRegions>): Array<PageRegionTexts>

/** Extract text with position information from a PDF Buffer. */
export declare function extractTextWithPositions(buffer: Buffer, pages?: Array<number> | undefined | null): Array<TextItem>

/** A page's regions for text extraction: (page_index_0based, bboxes). */
export interface PageRegions {
page: number
Expand All @@ -37,13 +47,48 @@ export interface PageRegionTexts {
export interface PdfClassification {
pdfType: string
pageCount: number
/** 0-indexed page numbers that need OCR. */
pagesNeedingOcr: Array<number>
confidence: number
}

/** Full PDF processing result with markdown and metadata. */
export interface PdfResult {
pdfType: string
markdown?: string
pageCount: number
processingTimeMs: number
/** 1-indexed page numbers that need OCR. */
pagesNeedingOcr: Array<number>
title?: string
confidence: number
isComplexLayout: boolean
pagesWithTables: Array<number>
pagesWithColumns: Array<number>
hasEncodingIssues: boolean
}

/** Process a PDF from a Buffer: detect type, extract text, and convert to Markdown. */
export declare function processPdf(buffer: Buffer, pages?: Array<number> | undefined | null): PdfResult

/** Extracted text for a single region. */
export interface RegionText {
text: string
/** `true` when the text should not be trusted (empty, GID fonts, garbage, encoding issues). */
needsOcr: boolean
}

/** A positioned text item extracted from a PDF. */
export interface TextItem {
text: string
x: number
y: number
width: number
height: number
font: string
fontSize: number
page: number
isBold: boolean
isItalic: boolean
itemType: string
}