Open-Source-Legal · JSv4 · Dec 29, 2025 · Dec 27, 2025 · Dec 27, 2025 · Dec 27, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+#### LlamaParse Document Parser Integration (Issue #692)
+- **New LlamaParseParser** (`opencontractserver/pipeline/parsers/llamaparse_parser.py`): Full integration with LlamaParse API for document parsing with layout extraction
+  - Supports PDF and DOCX file types
+  - Extracts structural annotations (Title, Heading, Paragraph, Table, Figure, List, etc.) with bounding boxes
+  - Generates PAWLS tokens from LlamaParse layout data for PDF annotation display
+  - Supports multiple bounding box formats (fractional 0-1, absolute coordinates, array format)
+  - Configurable via environment variables or Django settings
+- **Environment variable configuration**:
+  - `LLAMAPARSE_API_KEY` / `LLAMA_CLOUD_API_KEY`: API key for LlamaParse authentication
+  - `LLAMAPARSE_RESULT_TYPE`: Output type ("json", "markdown", "text") - default: "json"
+  - `LLAMAPARSE_EXTRACT_LAYOUT`: Enable layout extraction with bounding boxes - default: True
+  - `LLAMAPARSE_NUM_WORKERS`: Parallel processing workers - default: 4
+  - `LLAMAPARSE_LANGUAGE`: Document language - default: "en"
+  - `LLAMAPARSE_VERBOSE`: Enable verbose logging - default: False
+- **Parser selection via environment variable**:
+  - `PDF_PARSER`: Set to "llamaparse", "docling" (default), or "nlm" to select the default PDF parser (the same selection is also applied to DOCX and PPTX documents)
+  - Location: `config/settings/base.py:740-765`
+- **Comprehensive test suite** (`opencontractserver/tests/test_doc_parser_llamaparse.py`):
+  - Tests for successful parsing with layout extraction
+  - Tests for markdown mode without layout
+  - Tests for bounding box format conversion (fractional, absolute, array)
+  - Tests for annotation creation and token generation
+  - Tests for error handling (missing API key, API errors, empty results)
+  - Tests for configuration via settings and kwargs override
+
 #### Thread/Message Triggered Corpus Actions for Automated Moderation
 - **Extended CorpusActionTrigger enum** with `NEW_THREAD` and `NEW_MESSAGE` triggers (`opencontractserver/corpuses/models.py:849-854`) to enable automated moderation of discussion threads
 - **New moderation tools** (`opencontractserver/llms/tools/moderation_tools.py`): 9 tools for thread moderation including:
@@ -44,8 +69,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Automated responses (e.g., welcome messages for new threads)
 - Content classification (e.g., auto-pin important announcements)
 
-### Added
-
 #### Proactive Apollo Cache Management System (PR #725)
 - **New `CacheManager` service** (`frontend/src/services/cacheManager.ts`): Centralized Apollo cache management with debouncing, targeted invalidation, and auth-aware cache operations
   - `resetOnAuthChange()`: Full cache clear with optional refetch for login/logout transitions
@@ -56,6 +79,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - **New `useCacheManager` hook** (`frontend/src/hooks/useCacheManager.ts`): React hook with memoized CacheManager instance and stable callback references
 - **Comprehensive test suite** (`frontend/src/services/__tests__/cacheManager.test.ts`, `frontend/src/hooks/__tests__/useCacheManager.test.tsx`): 30+ tests covering debouncing, error handling, lifecycle, singleton management, and auth scenarios
 
+### Technical Details
+
+#### LlamaParse Parser Architecture
+- Uses `llama-parse` library for API communication
+- JSON mode with `extract_layout=True` provides bounding boxes as fractions of page dimensions (0-1)
+- Converts LlamaParse layout elements to OpenContracts structural annotations
+- Generates PAWLS tokens by splitting text into words and distributing them across the bounding box
+- Element type mapping converts LlamaParse labels (title, paragraph, table, etc.) to OpenContracts annotation labels
+- Falls back to text extraction mode when layout extraction is disabled
+
 ### Fixed
 
 #### Cache Management Race Condition Fix (PR #725)

diff --git a/config/settings/base.py b/config/settings/base.py
@@ -657,6 +657,16 @@
 )
 use_cloud_run_iam_auth = True
 
+# LlamaParse settings - read from the environment for the LlamaParse document parser
+# Prefers LLAMAPARSE_API_KEY; if empty, uses LLAMA_CLOUD_API_KEY (LlamaIndex's default)
+_llamaparse_key = env.str("LLAMAPARSE_API_KEY", default="")
+LLAMAPARSE_API_KEY = _llamaparse_key or env.str("LLAMA_CLOUD_API_KEY", default="")
+LLAMAPARSE_RESULT_TYPE = env.str("LLAMAPARSE_RESULT_TYPE", default="json")
+LLAMAPARSE_EXTRACT_LAYOUT = env.bool("LLAMAPARSE_EXTRACT_LAYOUT", default=True)
+LLAMAPARSE_NUM_WORKERS = env.int("LLAMAPARSE_NUM_WORKERS", default=4)
+LLAMAPARSE_LANGUAGE = env.str("LLAMAPARSE_LANGUAGE", default="en")
+LLAMAPARSE_VERBOSE = env.bool("LLAMAPARSE_VERBOSE", default=False)
+
 # LLM SETTING
 OPENAI_API_KEY = env.str("OPENAI_API_KEY", default="")
 OPENAI_MODEL = env.str("OPENAI_MODEL", default="gpt-4o")
@@ -728,13 +738,29 @@
     "SENTENCE_TRANSFORMER_MODELS_PATH", default="/models/sentence-transformers"
 )
 
+# Parser selection via the PDF_PARSER environment variable (case-insensitive)
+# Options: "docling" (default), "llamaparse", "nlm"; unknown values use docling
+PDF_PARSER = env.str("PDF_PARSER", default="docling")
+
+# Map parser names to their dotted import paths
+_PDF_PARSER_MAP = {
+    "docling": "opencontractserver.pipeline.parsers.docling_parser_rest.DoclingParser",
+    "llamaparse": "opencontractserver.pipeline.parsers.llamaparse_parser.LlamaParseParser",
+    "nlm": "opencontractserver.pipeline.parsers.nlm_ingest_parser.NLMIngestParser",
+}
+
+# NOTE(review): also used for DOCX/PPTX in PREFERRED_PARSERS - confirm parser support
+_SELECTED_PDF_PARSER = _PDF_PARSER_MAP.get(
+    PDF_PARSER.lower(), _PDF_PARSER_MAP["docling"]
+)
+
 # Preferred parsers for each MIME type
 PREFERRED_PARSERS = {
-    "application/pdf": "opencontractserver.pipeline.parsers.docling_parser_rest.DoclingParser",
+    "application/pdf": _SELECTED_PDF_PARSER,
     "text/plain": "opencontractserver.pipeline.parsers.oc_text_parser.TxtParser",
     "application/txt": "opencontractserver.pipeline.parsers.oc_text_parser.TxtParser",
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "opencontractserver.pipeline.parsers.docling_parser_rest.DoclingParser",  # noqa
-    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "opencontractserver.pipeline.parsers.docling_parser_rest.DoclingParser",  # noqa
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": _SELECTED_PDF_PARSER,  # noqa
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": _SELECTED_PDF_PARSER,  # noqa
     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "opencontractserver.pipeline.parsers.docling_parser_rest.DoclingParser",  # noqa
 }
 
@@ -802,6 +828,14 @@
         "api_key": "",
         "use_ocr": True,
     },
+    "opencontractserver.pipeline.parsers.llamaparse_parser.LlamaParseParser": {
+        "api_key": LLAMAPARSE_API_KEY,  # values mirror the LLAMAPARSE_* env settings
+        "result_type": LLAMAPARSE_RESULT_TYPE,
+        "extract_layout": LLAMAPARSE_EXTRACT_LAYOUT,
+        "num_workers": LLAMAPARSE_NUM_WORKERS,
+        "language": LLAMAPARSE_LANGUAGE,
+        "verbose": LLAMAPARSE_VERBOSE,
+    },
 }
 
 # Analyzers

diff --git a/docs/pipelines/docling_parser.md b/docs/pipelines/docling_parser.md
@@ -316,6 +316,8 @@ Common issues and solutions:
 ## See Also
 
 - [Pipeline Overview](pipeline_overview.md)
+- [LlamaParse Parser](llamaparse_parser.md) - Cloud-based alternative
+- [NLM-Ingest Parser](nlm_ingest_parser.md) - Another local alternative
 - [PDF Data Layer Architecture](../architecture/PDF-data-layer.md)
 - [Document Processing Flow](../architecture/asynchronous-processing.md)
 - [Docling Library](https://github.com/DS4SD/docling)