zalun · zalun · Apr 3, 2026 · Apr 3, 2026 · Apr 3, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
+## [0.1.5] - 2026-04-03
+
+### Added
+- `src/docproc/vision.py` — Async Vision LLM extraction via DeepFellow OpenAI-compatible API
+- PDF-to-image conversion using PyMuPDF (zero system dependencies)
+- Base64 image encoding and per-page Vision API calls
+- Retry logic with exponential backoff (3 attempts, 1s initial delay, 2x factor)
+- `pymupdf` dependency for local PDF rendering
+- Test suite for Vision module (~27 tests)
+
 ## [0.1.4] - 2026-02-27
 
 ### Added

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "docproc"
-version = "0.1.4"
+version = "0.1.5"
 requires-python = ">=3.14"
 dependencies = [
     "watchdog>=4.0.0",
@@ -10,6 +10,7 @@ dependencies = [
     "gradio>=4.0.0",
     "python-dotenv>=1.0.0",
     "httpx>=0.28.0",
+    "pymupdf>=1.25.0",
 ]
 
 [build-system]

diff --git a/src/docproc/__init__.py b/src/docproc/__init__.py
@@ -1 +1 @@
-__version__ = "0.1.4"
+__version__ = "0.1.5"
diff --git a/src/docproc/vision.py b/src/docproc/vision.py
@@ -0,0 +1,197 @@
+"""Vision LLM extraction via DeepFellow OpenAI-compatible API.
+
+Converts PDF pages to images using PyMuPDF, sends base64-encoded images
+to a vision model via chat completions, and returns structured markdown.
+Runs async for parallel execution with OCR extraction.
+"""
+
+import asyncio
+import base64
+import logging
+from pathlib import Path
+
+import pymupdf
+from openai import APIConnectionError, APIStatusError, AsyncOpenAI
+
+from docproc.config import Config
+from docproc.models import VisionResult
+
+logger = logging.getLogger(__name__)
+
+# File suffixes (compared lower-cased) that _validate_file accepts.
+SUPPORTED_EXTENSIONS = frozenset({".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".tif"})
+# Suffix that routes a file through PDF page rendering instead of a raw read.
+_PDF_EXTENSION = ".pdf"
+# Resolution passed to PyMuPDF when rendering a PDF page to a pixmap.
+_IMAGE_DPI = 150
+# Retry policy for one Vision API call: total attempts, first sleep in
+# seconds, and the multiplier applied to the sleep after each failed attempt.
+_MAX_RETRIES = 3
+_INITIAL_DELAY = 1.0
+_BACKOFF_FACTOR = 2.0
+# Sent as max_tokens on every chat completion request.
+_MAX_TOKENS = 4096
+
+# Text part of every request; it is sent alongside the page image.
+_EXTRACTION_PROMPT = """\
+Extract all text content from this document image.
+Preserve the structure including headers, paragraphs, and lists.
+Format tables as markdown tables.
+Note any structural elements (letterhead, signatures, stamps).
+Output everything in clean markdown format.\
+"""
+
+
+class VisionError(Exception):
+    """Raised when Vision extraction fails.
+
+    Used by this module for invalid or unreadable input files, PDFs that
+    cannot be opened or rendered, non-retryable API errors, empty API
+    responses, and exhausted retries. Where an underlying exception exists
+    it is chained as the cause via ``raise ... from``.
+    """
+
+
+def _validate_file(file_path: Path) -> None:
+    """Check that *file_path* is an existing regular file of a supported type.
+
+    Raises:
+        VisionError: If the path is not a regular file, or its lower-cased
+            suffix is not in ``SUPPORTED_EXTENSIONS``.
+    """
+    if not file_path.is_file():
+        msg = f"File not found or not a regular file: {file_path}"
+        raise VisionError(msg)
+
+    if (suffix := file_path.suffix.lower()) not in SUPPORTED_EXTENSIONS:
+        msg = f"Unsupported file type: {suffix}"
+        raise VisionError(msg)
+
+
+def _pdf_to_images(file_path: Path) -> list[bytes]:
+    """Render every page of a PDF to PNG-encoded bytes, in page order.
+
+    Raises:
+        VisionError: If the document cannot be opened or a page fails to
+            render; the original exception is chained as the cause.
+    """
+    try:
+        document = pymupdf.open(file_path)
+    except Exception as exc:
+        msg = f"Failed to open PDF: {file_path}: {exc}"
+        raise VisionError(msg) from exc
+
+    try:
+        return [
+            page.get_pixmap(dpi=_IMAGE_DPI).tobytes("png") for page in document
+        ]
+    except Exception as exc:
+        msg = f"Failed to render PDF pages: {file_path}: {exc}"
+        raise VisionError(msg) from exc
+    finally:
+        # Always release the document handle, whether rendering succeeded or not.
+        document.close()
+
+
+def _encode_image(image_bytes: bytes) -> str:
+    """Return *image_bytes* as base64 text, ready to embed in a data URL."""
+    encoded = base64.b64encode(image_bytes)
+    return str(encoded, "utf-8")
+
+
+# Base64 spellings of the first three raw bytes of common image signatures,
+# mapped to the MIME type that should be declared in the data URL.
+_MIME_BY_BASE64_PREFIX = (
+    ("iVBO", "image/png"),  # 89 50 4E
+    ("/9j/", "image/jpeg"),  # FF D8 FF
+    ("SUkq", "image/tiff"),  # "II*" (little-endian TIFF)
+    ("TU0A", "image/tiff"),  # "MM\0" (big-endian TIFF)
+)
+# 4xx statuses that signal a transient condition and are worth retrying.
+_RETRYABLE_CLIENT_STATUSES = frozenset({408, 429})
+
+
+def _detect_mime_type(encoded_image: str) -> str:
+    """Guess the image MIME type from the start of its base64 text.
+
+    Falls back to ``image/png`` (the format produced by PDF rendering) when
+    the signature is not recognised.
+    """
+    for prefix, mime_type in _MIME_BY_BASE64_PREFIX:
+        if encoded_image.startswith(prefix):
+            return mime_type
+    return "image/png"
+
+
+async def _call_vision_api(
+    client: AsyncOpenAI,
+    model: str,
+    encoded_image: str,
+) -> str:
+    """Send a single image to the vision model with retry on failures.
+
+    Server errors (HTTP >= 500), transient client statuses (408, 429) and
+    connection errors are retried up to ``_MAX_RETRIES`` attempts with
+    exponential backoff. Any other HTTP error status fails immediately.
+
+    Args:
+        client: OpenAI-compatible async client used for the request.
+        model: Name of the vision model to query.
+        encoded_image: Base64 text of the image; PNG, JPEG and TIFF
+            signatures are recognised so the data URL declares the right
+            MIME type.
+
+    Returns:
+        The model's text output, or ``""`` if the response content is empty.
+
+    Raises:
+        VisionError: On a non-retryable HTTP error, an empty ``choices``
+            list, or when every attempt has failed.
+    """
+    data_url = f"data:{_detect_mime_type(encoded_image)};base64,{encoded_image}"
+    delay = _INITIAL_DELAY
+    last_error: Exception | None = None
+
+    for attempt in range(1, _MAX_RETRIES + 1):
+        # Keep the try body to the network call so only API failures are
+        # routed into the retry handlers below.
+        try:
+            response = await client.chat.completions.create(
+                model=model,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": _EXTRACTION_PROMPT},
+                            {"type": "image_url", "image_url": {"url": data_url}},
+                        ],
+                    }
+                ],
+                max_tokens=_MAX_TOKENS,
+            )
+        except APIStatusError as exc:
+            retryable = (
+                exc.status_code >= 500
+                or exc.status_code in _RETRYABLE_CLIENT_STATUSES
+            )
+            if not retryable:
+                msg = f"Client error {exc.status_code}: {exc}"
+                raise VisionError(msg) from exc
+            last_error = exc
+            logger.warning(
+                "Vision attempt %d/%d failed (HTTP %d): %s",
+                attempt,
+                _MAX_RETRIES,
+                exc.status_code,
+                str(exc)[:200],
+            )
+        except APIConnectionError as exc:
+            last_error = exc
+            logger.warning(
+                "Vision attempt %d/%d failed with connection error: %s",
+                attempt,
+                _MAX_RETRIES,
+                exc,
+            )
+        else:
+            if not response.choices:
+                msg = "Vision API returned empty choices"
+                raise VisionError(msg)
+            content = response.choices[0].message.content
+            if not content:
+                # Best effort: an empty page is reported, not treated as fatal.
+                logger.warning(
+                    "Vision API returned empty content on attempt %d",
+                    attempt,
+                )
+            return content or ""
+
+        # Reached only after a retryable failure; skip the sleep once no
+        # attempts remain.
+        if attempt < _MAX_RETRIES:
+            await asyncio.sleep(delay)
+            delay *= _BACKOFF_FACTOR
+
+    msg = f"Vision extraction failed after {_MAX_RETRIES} attempts"
+    raise VisionError(msg) from last_error
+
+
+async def extract_with_vision(file_path: Path, config: Config) -> VisionResult:
+    """Extract content from a document using a Vision LLM model.
+
+    Converts PDF pages to images (other supported files are sent as a single
+    image), sends each page to the vision model in order, and joins the
+    per-page output with blank lines.
+
+    Args:
+        file_path: Path to PDF or image file.
+        config: Application configuration; ``config.deepfellow`` supplies
+            ``base_url``, ``api_key`` and ``vision_model``.
+
+    Returns:
+        VisionResult with markdown content.
+
+    Raises:
+        VisionError: If the file is invalid or unreadable, or extraction
+            fails after retries.
+    """
+    _validate_file(file_path)
+
+    if file_path.suffix.lower() == _PDF_EXTENSION:
+        # NOTE: rendering is synchronous and runs on the calling event loop.
+        image_pages = _pdf_to_images(file_path)
+    else:
+        try:
+            image_pages = [file_path.read_bytes()]
+        except OSError as exc:
+            msg = f"Failed to read file {file_path}: {exc}"
+            raise VisionError(msg) from exc
+
+    page_count = len(image_pages)
+    logger.info(
+        "Starting Vision extraction: %s (%d pages)",
+        file_path.name,
+        page_count,
+    )
+
+    client = AsyncOpenAI(
+        base_url=config.deepfellow.base_url,
+        api_key=config.deepfellow.api_key,
+    )
+
+    page_results = []
+    # The client owns an HTTP connection pool; the context manager closes it
+    # on success and on failure so connections are not leaked per document.
+    async with client:
+        for i, page_bytes in enumerate(image_pages, 1):
+            encoded = _encode_image(page_bytes)
+            text = await _call_vision_api(
+                client, config.deepfellow.vision_model, encoded
+            )
+            page_results.append(text)
+            logger.debug("Vision page %d/%d complete", i, page_count)
+
+    content = "\n\n".join(page_results)
+    logger.info("Vision complete: %s (%d pages)", file_path.name, page_count)
+    return VisionResult(content=content)
diff --git a/tests/test_init.py b/tests/test_init.py
@@ -2,4 +2,4 @@
 
 
 def test_version_matches_expected():
-    assert __version__ == "0.1.4"
+    assert __version__ == "0.1.5"
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,4 +2,4 @@


		def test_version_matches_expected():
		-    assert __version__ == "0.1.4"
		+    assert __version__ == "0.1.5"