diff --git a/CHANGELOG.md b/CHANGELOG.md index c82b78d75..a6dd4083b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +#### LlamaParse Document Parser Integration (Issue #692) +- **New LlamaParseParser** (`opencontractserver/pipeline/parsers/llamaparse_parser.py`): Full integration with LlamaParse API for document parsing with layout extraction + - Supports PDF and DOCX file types + - Extracts structural annotations (Title, Heading, Paragraph, Table, Figure, List, etc.) with bounding boxes + - Generates PAWLS page structures with element-level bounding boxes from LlamaParse layout data (no token-level/word-level positions; annotations render as bounding-box outlines) + - Supports multiple bounding box formats (fractional 0-1, absolute coordinates, array format) + - Configurable via environment variables or Django settings +- **Environment variable configuration**: + - `LLAMAPARSE_API_KEY` / `LLAMA_CLOUD_API_KEY`: API key for LlamaParse authentication + - `LLAMAPARSE_RESULT_TYPE`: Output type ("json", "markdown", "text") - default: "json" + - `LLAMAPARSE_EXTRACT_LAYOUT`: Enable layout extraction with bounding boxes - default: True + - `LLAMAPARSE_NUM_WORKERS`: Parallel processing workers - default: 4 + - `LLAMAPARSE_LANGUAGE`: Document language - default: "en" + - `LLAMAPARSE_VERBOSE`: Enable verbose logging - default: False +- **Parser selection via environment variable**: + - `PDF_PARSER`: Set to "llamaparse", "docling" (default), or "nlm" to select default PDF parser + - Location: `config/settings/base.py:740-765` +- **Comprehensive test suite** (`opencontractserver/tests/test_doc_parser_llamaparse.py`): + - Tests for successful parsing with layout extraction + - Tests for markdown mode without layout + - Tests for bounding box format conversion (fractional, absolute, array) + - Tests for annotation creation and token generation + - Tests for error handling (missing API key, API errors, empty results) + - Tests for configuration via settings and kwargs 
override + #### Thread/Message Triggered Corpus Actions for Automated Moderation - **Extended CorpusActionTrigger enum** with `NEW_THREAD` and `NEW_MESSAGE` triggers (`opencontractserver/corpuses/models.py:849-854`) to enable automated moderation of discussion threads - **New moderation tools** (`opencontractserver/llms/tools/moderation_tools.py`): 9 tools for thread moderation including: @@ -44,8 +69,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Automated responses (e.g., welcome messages for new threads) - Content classification (e.g., auto-pin important announcements) -### Added - #### Proactive Apollo Cache Management System (PR #725) - **New `CacheManager` service** (`frontend/src/services/cacheManager.ts`): Centralized Apollo cache management with debouncing, targeted invalidation, and auth-aware cache operations - `resetOnAuthChange()`: Full cache clear with optional refetch for login/logout transitions @@ -56,6 +79,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **New `useCacheManager` hook** (`frontend/src/hooks/useCacheManager.ts`): React hook with memoized CacheManager instance and stable callback references - **Comprehensive test suite** (`frontend/src/services/__tests__/cacheManager.test.ts`, `frontend/src/hooks/__tests__/useCacheManager.test.tsx`): 30+ tests covering debouncing, error handling, lifecycle, singleton management, and auth scenarios +### Technical Details + +#### LlamaParse Parser Architecture +- Uses `llama-parse` library for API communication +- JSON mode with `extract_layout=True` provides bounding boxes as fractions of page dimensions (0-1) +- Converts LlamaParse layout elements to OpenContracts structural annotations +- Generates PAWLS tokens by splitting text into words and distributing across bounding box +- Element type mapping converts LlamaParse labels (title, paragraph, table, etc.) 
to OpenContracts annotation labels +- Falls back to text extraction mode when layout extraction is disabled + ### Fixed #### Cache Management Race Condition Fix (PR #725) diff --git a/config/settings/base.py b/config/settings/base.py index 0a407a1b8..44f8061d4 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -657,6 +657,16 @@ ) use_cloud_run_iam_auth = True +# LlamaParse Settings - for LlamaParse document parser +# Supports both LLAMAPARSE_API_KEY and LLAMA_CLOUD_API_KEY (LlamaIndex's default env var) +_llamaparse_key = env.str("LLAMAPARSE_API_KEY", default="") +LLAMAPARSE_API_KEY = _llamaparse_key or env.str("LLAMA_CLOUD_API_KEY", default="") +LLAMAPARSE_RESULT_TYPE = env.str("LLAMAPARSE_RESULT_TYPE", default="json") +LLAMAPARSE_EXTRACT_LAYOUT = env.bool("LLAMAPARSE_EXTRACT_LAYOUT", default=True) +LLAMAPARSE_NUM_WORKERS = env.int("LLAMAPARSE_NUM_WORKERS", default=4) +LLAMAPARSE_LANGUAGE = env.str("LLAMAPARSE_LANGUAGE", default="en") +LLAMAPARSE_VERBOSE = env.bool("LLAMAPARSE_VERBOSE", default=False) + # LLM SETTING OPENAI_API_KEY = env.str("OPENAI_API_KEY", default="") OPENAI_MODEL = env.str("OPENAI_MODEL", default="gpt-4o") @@ -728,13 +738,29 @@ "SENTENCE_TRANSFORMER_MODELS_PATH", default="/models/sentence-transformers" ) +# Parser selection via environment variable +# Options: "docling" (default), "llamaparse", "nlm" +PDF_PARSER = env.str("PDF_PARSER", default="docling") + +# Map parser names to their full paths +_PDF_PARSER_MAP = { + "docling": "opencontractserver.pipeline.parsers.docling_parser_rest.DoclingParser", + "llamaparse": "opencontractserver.pipeline.parsers.llamaparse_parser.LlamaParseParser", + "nlm": "opencontractserver.pipeline.parsers.nlm_ingest_parser.NLMIngestParser", +} + +# Get the selected PDF parser (with fallback to docling) +_SELECTED_PDF_PARSER = _PDF_PARSER_MAP.get( + PDF_PARSER.lower(), _PDF_PARSER_MAP["docling"] +) + # Preferred parsers for each MIME type PREFERRED_PARSERS = { - "application/pdf": 
"opencontractserver.pipeline.parsers.docling_parser_rest.DoclingParser", + "application/pdf": _SELECTED_PDF_PARSER, "text/plain": "opencontractserver.pipeline.parsers.oc_text_parser.TxtParser", "application/txt": "opencontractserver.pipeline.parsers.oc_text_parser.TxtParser", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "opencontractserver.pipeline.parsers.docling_parser_rest.DoclingParser", # noqa - "application/vnd.openxmlformats-officedocument.presentationml.presentation": "opencontractserver.pipeline.parsers.docling_parser_rest.DoclingParser", # noqa + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": _SELECTED_PDF_PARSER, # noqa + "application/vnd.openxmlformats-officedocument.presentationml.presentation": _SELECTED_PDF_PARSER, # noqa "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "opencontractserver.pipeline.parsers.docling_parser_rest.DoclingParser", # noqa } @@ -802,6 +828,14 @@ "api_key": "", "use_ocr": True, }, + "opencontractserver.pipeline.parsers.llamaparse_parser.LlamaParseParser": { + "api_key": LLAMAPARSE_API_KEY, + "result_type": "json", + "extract_layout": True, + "num_workers": 4, + "language": "en", + "verbose": False, + }, } # Analyzers diff --git a/docs/pipelines/docling_parser.md b/docs/pipelines/docling_parser.md index aa24241d1..4d5446a48 100644 --- a/docs/pipelines/docling_parser.md +++ b/docs/pipelines/docling_parser.md @@ -316,6 +316,8 @@ Common issues and solutions: ## See Also - [Pipeline Overview](pipeline_overview.md) +- [LlamaParse Parser](llamaparse_parser.md) - Cloud-based alternative +- [NLM-Ingest Parser](nlm_ingest_parser.md) - Another local alternative - [PDF Data Layer Architecture](../architecture/PDF-data-layer.md) - [Document Processing Flow](../architecture/asynchronous-processing.md) - [Docling Library](https://github.com/DS4SD/docling) diff --git a/docs/pipelines/llamaparse_parser.md b/docs/pipelines/llamaparse_parser.md new file mode 100644 
index 000000000..913c57b43 --- /dev/null +++ b/docs/pipelines/llamaparse_parser.md @@ -0,0 +1,376 @@ +# LlamaParse Parser + +## Intro + +The LlamaParse Parser integrates with [LlamaParse](https://cloud.llamaindex.ai/) (from LlamaIndex) to parse PDF and DOCX documents with advanced layout extraction. It provides high-quality structural annotations with bounding boxes, making it ideal for complex document layouts. + +LlamaParse is a cloud-based API service that uses advanced ML models to extract document structure, including titles, headings, paragraphs, tables, figures, and more. Unlike the Docling parser which runs as a local microservice, LlamaParse requires an API key and sends documents to LlamaIndex's cloud infrastructure. + +## Architecture + +```mermaid +sequenceDiagram + participant U as User + participant LP as LlamaParseParser + participant API as LlamaParse Cloud API + participant DB as Database + + U->>LP: parse_document(user_id, doc_id) + LP->>DB: Load document + LP->>LP: Write to temp file + LP->>API: HTTP POST with document + API->>API: ML-based parsing + API->>API: Layout extraction + API-->>LP: JSON with layout data + LP->>LP: Parse bounding boxes + LP->>LP: Create annotations (bbox only) + LP->>DB: Store parsed data + LP-->>U: OpenContractDocExport +``` + +## Features + +- **Cloud-based API**: Uses LlamaIndex's managed parsing infrastructure +- **Layout Extraction**: Returns bounding boxes for all document elements +- **Multiple Output Formats**: Supports JSON (with layout), markdown, and plain text +- **Structural Annotations**: Automatically creates annotations for document structure +- **Multi-format Support**: Parses both PDF and DOCX files +- **Parallel Processing**: Configurable worker count for batch processing +- **Automatic OCR**: Handles scanned documents automatically + +## Configuration + +### Environment Variables + +Configure the parser using environment variables: + +```bash +# Required: API key (either variable works) 
+LLAMAPARSE_API_KEY=llx-your-api-key-here +# OR use LlamaIndex's default env var name: +LLAMA_CLOUD_API_KEY=llx-your-api-key-here + +# Optional: Output format ("json", "markdown", "text") +# Default: "json" - required for layout extraction +LLAMAPARSE_RESULT_TYPE=json + +# Optional: Enable layout extraction with bounding boxes +# Default: True +LLAMAPARSE_EXTRACT_LAYOUT=True + +# Optional: Number of parallel workers for batch processing +# Default: 4 +LLAMAPARSE_NUM_WORKERS=4 + +# Optional: Document language code +# Default: "en" +LLAMAPARSE_LANGUAGE=en + +# Optional: Enable verbose logging +# Default: False +LLAMAPARSE_VERBOSE=False + +# Select LlamaParse as the default PDF parser +PDF_PARSER=llamaparse +``` + +### Django Settings + +The parser is configured in `config/settings/base.py`: + +```python +# LlamaParse Settings +LLAMAPARSE_API_KEY = env.str("LLAMAPARSE_API_KEY", default="") +LLAMAPARSE_RESULT_TYPE = env.str("LLAMAPARSE_RESULT_TYPE", default="json") +LLAMAPARSE_EXTRACT_LAYOUT = env.bool("LLAMAPARSE_EXTRACT_LAYOUT", default=True) +LLAMAPARSE_NUM_WORKERS = env.int("LLAMAPARSE_NUM_WORKERS", default=4) +LLAMAPARSE_LANGUAGE = env.str("LLAMAPARSE_LANGUAGE", default="en") +LLAMAPARSE_VERBOSE = env.bool("LLAMAPARSE_VERBOSE", default=False) + +# Parser selection +PDF_PARSER = env.str("PDF_PARSER", default="docling") # Set to "llamaparse" +``` + +### Parser Registration + +The parser is automatically registered in `PREFERRED_PARSERS` when `PDF_PARSER=llamaparse`: + +```python +PREFERRED_PARSERS = { + "application/pdf": "opencontractserver.pipeline.parsers.llamaparse_parser.LlamaParseParser", + # ... 
other mime types +} +``` + +## Usage + +### Basic Usage + +```python +from opencontractserver.pipeline.parsers.llamaparse_parser import LlamaParseParser + +parser = LlamaParseParser() +result = parser.parse_document(user_id=1, doc_id=123) +``` + +### With Options Override + +```python +# Override default settings for a specific parse +result = parser.parse_document( + user_id=1, + doc_id=123, + result_type="json", + extract_layout=True, + language="en", + verbose=True, +) +``` + +### Text-Only Mode (No Layout) + +```python +# For faster parsing without bounding boxes +result = parser.parse_document( + user_id=1, + doc_id=123, + result_type="markdown", # or "text" + extract_layout=False, +) +``` + +## Output + +The parser returns an `OpenContractDocExport` dictionary containing: + +```python +{ + "title": str, # Document title + "description": str, # Document description + "content": str, # Full text content + "page_count": int, # Number of pages + "pawls_file_content": List[dict], # PAWLS token data per page + "labelled_text": List[dict], # Structural annotations + "relationships": List[dict], # (Empty - no relationships extracted) + "doc_labels": List[dict], # (Empty - no doc labels extracted) +} +``` + +## Element Type Mapping + +LlamaParse elements are mapped to OpenContracts annotation labels: + +| LlamaParse Type | OpenContracts Label | +|-----------------|---------------------| +| `title` | Title | +| `section_header` | Section Header | +| `heading` | Heading | +| `text` | Text Block | +| `paragraph` | Paragraph | +| `table` | Table | +| `figure` | Figure | +| `image` | Image | +| `list` | List | +| `list_item` | List Item | +| `caption` | Caption | +| `footnote` | Footnote | +| `header` | Page Header | +| `footer` | Page Footer | +| `page_number` | Page Number | +| `equation` | Equation | +| `code` | Code Block | + +## Processing Steps + +1. 
**Document Loading** + - Loads document from Django storage + - Writes to temporary file (LlamaParse requires file path) + +2. **API Call** + - Sends document to LlamaParse cloud API + - Uses `get_json_result()` for layout mode + - Uses `load_data()` for text/markdown mode + +3. **Bounding Box Conversion** + - LlamaParse returns coordinates in various formats (fractional 0-1 or absolute) + - Converts to absolute page coordinates + - Handles multiple bbox formats (`x/y/w/h`, `left/top/right/bottom`, `x1/y1/x2/y2`, arrays) + - Applies sanity checks and bounds clamping + +4. **Annotation Creation** + - Maps element types to OpenContracts labels + - Creates structural annotations with bounding boxes + - Annotations use empty `tokensJsons` (see [Limitations](#limitations)) + +5. **Cleanup** + - Removes temporary file + - Returns OpenContractDocExport + +## Comparison with Other Parsers + +| Feature | LlamaParse | Docling | NLM-Ingest | +|---------|------------|---------|------------| +| Deployment | Cloud API | Local microservice | Local microservice | +| API Key Required | Yes | No | No | +| Layout Extraction | Yes | Yes | Yes | +| Relationship Detection | No | Yes (groups) | Limited | +| OCR Support | Yes (automatic) | Yes (Tesseract) | Yes | +| DOCX Support | Yes | Yes | No | +| Cost | Per-page pricing | Free | Free | +| Privacy | Cloud processing | Local processing | Local processing | + +## Error Handling + +The parser handles errors gracefully: + +- **Missing API Key**: Returns None with error log +- **Document Not Found**: Returns None with error log +- **API Errors**: Returns None with detailed error message +- **Import Errors**: Returns None if `llama-parse` not installed +- **Empty Results**: Returns None with warning + +Example error handling: + +```python +result = parser.parse_document(user_id=1, doc_id=123) +if result is None: + # Check logs for error details + logger.error("Parsing failed") +``` + +## Troubleshooting + +### Common Issues + +1. 
**API Key Not Configured** + ``` + LlamaParse API key not configured. Set LLAMAPARSE_API_KEY or LLAMA_CLOUD_API_KEY environment variable. + ``` + - Set `LLAMAPARSE_API_KEY` in your environment + - Verify the key is valid at [cloud.llamaindex.ai](https://cloud.llamaindex.ai/) + +2. **Library Not Installed** + ``` + llama-parse library not installed. Install with: pip install llama-parse + ``` + - Install the library: `pip install llama-parse` + - Or add to requirements: `llama-parse>=0.4.0` + +3. **Empty Results** + ``` + LlamaParse returned empty results + ``` + - Verify document is readable (not corrupted) + - Check if document has extractable text + - Try with `verbose=True` for more details + +4. **No Bounding Boxes** + - Ensure `result_type="json"` (not "markdown" or "text") + - Ensure `extract_layout=True` + - Some document types may not support layout extraction + +5. **Rate Limiting** + - LlamaParse has API rate limits + - Reduce `num_workers` for batch processing + - Implement retry logic for production use + +### Debug Mode + +Enable verbose logging for troubleshooting: + +```bash +LLAMAPARSE_VERBOSE=True +``` + +Or in code: + +```python +result = parser.parse_document(user_id=1, doc_id=123, verbose=True) +``` + +## Performance Considerations + +- **Network Latency**: Cloud API adds network round-trip time +- **Per-page Pricing**: LlamaParse charges per page processed +- **Parallel Workers**: Increase `LLAMAPARSE_NUM_WORKERS` for batch jobs +- **Result Type**: "markdown" and "text" modes are faster but lack layout +- **File Size**: Large documents may take longer to upload and process + +## Security Considerations + +- **API Key Security**: Store API key in environment variables, not code +- **Data Privacy**: Documents are sent to LlamaIndex cloud for processing +- **Temporary Files**: Parser cleans up temp files after processing +- **Logging**: API key is redacted from log output + +## Dependencies + +- `llama-parse>=0.4.0`: LlamaParse Python client +- 
`llama-index-core`: Core LlamaIndex library (installed with llama-parse) + +Add to requirements: + +``` +llama-parse>=0.4.0 +``` + +## Limitations + +LlamaParse has several limitations compared to other parsers like Docling: + +### No Token-Level Data + +LlamaParse only provides **element-level bounding boxes**, not token-level (word-level) positions. This means: + +- Annotations display as bounding box outlines only, without individual word highlighting +- The `tokensJsons` field in annotations is empty +- Text selection and word-level interactions are not available for LlamaParse-generated annotations +- The frontend handles this gracefully by showing just the bounding box boundary + +**Workaround**: If you need token-level precision, use the Docling parser instead, which provides full PAWLS token data. + +### No Parent-Child Relationships + +LlamaParse returns **flat layout blocks** without hierarchical structure: + +- No parent/child relationships between elements (e.g., list items under a list) +- No nesting information for sections/subsections +- The `relationships` field in the export is always empty +- Document structure must be inferred from element types and spatial positioning + +**Workaround**: Use the Docling parser for relationship detection, which can group related elements. + +### Cloud Processing Required + +- Documents are sent to LlamaIndex's cloud infrastructure for processing +- Requires internet connectivity +- Subject to LlamaIndex's data handling policies +- Not suitable for highly sensitive documents that cannot leave your network + +**Workaround**: Use Docling or NLM-Ingest for fully local processing. 
+ +### Per-Page Pricing + +- LlamaParse charges per page processed (with layout extraction: 1 extra credit per page) +- Costs can add up for large document volumes +- Free tier has limited credits + +### Bounding Box Precision + +- Bounding boxes may be slightly larger or smaller than the actual content +- Complex layouts (multi-column, overlapping elements) may have less accurate boxes +- Tables and figures are detected as single blocks without internal structure + +### No Streaming Support + +- Entire document must be uploaded and processed before results are returned +- Large documents may have significant processing time +- No progress indicators during parsing + +## See Also + +- [Pipeline Overview](pipeline_overview.md) +- [Docling Parser](docling_parser.md) - Local ML-based alternative with token-level data and relationships +- [NLM-Ingest Parser](nlm_ingest_parser.md) - Another local alternative +- [LlamaParse Documentation](https://developers.llamaindex.ai/python/cloud/llamaparse/) +- [LlamaIndex Cloud](https://cloud.llamaindex.ai/) diff --git a/docs/pipelines/nlm_ingest_parser.md b/docs/pipelines/nlm_ingest_parser.md index 4014d2134..2d8e31f13 100644 --- a/docs/pipelines/nlm_ingest_parser.md +++ b/docs/pipelines/nlm_ingest_parser.md @@ -170,16 +170,19 @@ class NLMIngestParser(BaseParser): - **File Size**: Can handle large PDF files efficiently - **Concurrent Processing**: Thread-safe for parallel processing -## Comparison with Docling Parser - -| Feature | NLM Ingest | Docling | -|---------|------------|---------| -| Speed | Faster | Slower | -| Accuracy | Good | Excellent | -| OCR Support | Limited | Full | -| Table Extraction | Good | Excellent | -| Memory Usage | Lower | Higher | -| Dependencies | Simpler | Complex | +## Comparison with Other Parsers + +| Feature | NLM Ingest | Docling | LlamaParse | +|---------|------------|---------|------------| +| Deployment | Local | Local microservice | Cloud API | +| Speed | Faster | Slower | Network-dependent | 
+| Accuracy | Good | Excellent | Excellent | +| OCR Support | Limited | Full | Full (automatic) | +| Table Extraction | Good | Excellent | Good | +| Memory Usage | Lower | Higher | Minimal (cloud) | +| Dependencies | Simpler | Complex | Simple (API client) | +| Cost | Free | Free | Per-page pricing | +| Privacy | Local | Local | Cloud processing | ## Best Practices @@ -252,6 +255,7 @@ Required Python packages: ## See Also - [Pipeline Overview](pipeline_overview.md) -- [Docling Parser](docling_parser.md) +- [Docling Parser](docling_parser.md) - ML-based local parser with OCR +- [LlamaParse Parser](llamaparse_parser.md) - Cloud-based alternative - [PDF Data Layer Architecture](../architecture/PDF-data-layer.md) - [NLM Ingest Library](https://github.com/nlmatics/nlm-ingestor) diff --git a/docs/pipelines/pipeline_overview.md b/docs/pipelines/pipeline_overview.md index 3530e38b1..71c10c55b 100644 --- a/docs/pipelines/pipeline_overview.md +++ b/docs/pipelines/pipeline_overview.md @@ -23,6 +23,7 @@ graph TD B --> B1[DoclingParser REST] B --> B2[NLMIngestParser] B --> B3[TxtParser] + B --> B4[LlamaParseParser] C --> C1[PdfThumbnailGenerator] C --> C2[TextThumbnailGenerator] @@ -88,6 +89,7 @@ class BaseParser(ABC): Current implementations: - **DoclingParser**: Advanced PDF parser using machine learning (REST microservice) +- **LlamaParseParser**: Cloud-based parser using LlamaParse API with layout extraction - **NLMIngestParser**: Alternative PDF parser using NLM Ingest library - **TxtParser**: Simple text file parser diff --git a/frontend/src/graphql/mutations.ts b/frontend/src/graphql/mutations.ts index 9b6fe1bf3..9dd4a1e76 100644 --- a/frontend/src/graphql/mutations.ts +++ b/frontend/src/graphql/mutations.ts @@ -3318,7 +3318,7 @@ export interface RestoreDeletedDocumentOutput { } export const PERMANENTLY_DELETE_DOCUMENT = gql` - mutation PermanentlyDeleteDocument($documentId: ID!, $corpusId: ID!) 
{ + mutation PermanentlyDeleteDocument($documentId: String!, $corpusId: String!) { permanentlyDeleteDocument(documentId: $documentId, corpusId: $corpusId) { ok message @@ -3339,7 +3339,7 @@ export interface PermanentlyDeleteDocumentOutput { } export const EMPTY_TRASH = gql` - mutation EmptyTrash($corpusId: ID!) { + mutation EmptyTrash($corpusId: String!) { emptyTrash(corpusId: $corpusId) { ok message diff --git a/opencontractserver/pipeline/parsers/llamaparse_parser.py b/opencontractserver/pipeline/parsers/llamaparse_parser.py new file mode 100644 index 000000000..5475482ff --- /dev/null +++ b/opencontractserver/pipeline/parsers/llamaparse_parser.py @@ -0,0 +1,663 @@ +""" +LlamaParse Parser for OpenContracts. + +This parser uses the LlamaParse API (from LlamaIndex) to parse PDF documents +and extract structural annotations with bounding boxes. +""" + +import logging +import os +import tempfile +from typing import Any, Optional + +from django.conf import settings +from django.core.files.storage import default_storage + +from opencontractserver.annotations.models import TOKEN_LABEL +from opencontractserver.documents.models import Document +from opencontractserver.pipeline.base.file_types import FileTypeEnum +from opencontractserver.pipeline.base.parser import BaseParser +from opencontractserver.types.dicts import ( + BoundingBoxPythonType, + OpenContractDocExport, + OpenContractsAnnotationPythonType, + OpenContractsSinglePageAnnotationType, + PawlsPagePythonType, + PawlsTokenPythonType, +) + +logger = logging.getLogger(__name__) + + +class LlamaParseParser(BaseParser): + """ + A parser that uses the LlamaParse API to parse PDF documents. + + LlamaParse provides advanced document parsing with layout extraction, + returning bounding boxes for various document elements (titles, text, + tables, figures, lists). 
+ + Configuration via environment variables: + - LLAMAPARSE_API_KEY: API key for LlamaParse (required) + - LLAMAPARSE_RESULT_TYPE: Output type (default: "json") + - LLAMAPARSE_EXTRACT_LAYOUT: Whether to extract layout (default: True) + - LLAMAPARSE_NUM_WORKERS: Number of parallel workers (default: 4) + - LLAMAPARSE_LANGUAGE: Document language (default: "en") + - LLAMAPARSE_VERBOSE: Enable verbose logging (default: False) + """ + + title = "LlamaParse Parser" + description = ( + "Parses PDF documents using the LlamaParse API with layout extraction." + ) + author = "OpenContracts Team" + dependencies = ["llama-parse"] + supported_file_types = [FileTypeEnum.PDF, FileTypeEnum.DOCX] + + # Mapping from LlamaParse element types to OpenContracts annotation labels + ELEMENT_TYPE_MAPPING = { + "title": "Title", + "section_header": "Section Header", + "heading": "Heading", + "text": "Text Block", + "paragraph": "Paragraph", + "table": "Table", + "figure": "Figure", + "image": "Image", + "list": "List", + "list_item": "List Item", + "caption": "Caption", + "footnote": "Footnote", + "header": "Page Header", + "footer": "Page Footer", + "page_number": "Page Number", + "equation": "Equation", + "code": "Code Block", + } + + def __init__(self): + """Initialize the LlamaParse parser with configuration from settings.""" + super().__init__() + + # Get API key from settings (which reads from env vars, supporting both + # LLAMAPARSE_API_KEY and LLAMA_CLOUD_API_KEY) + self.api_key = getattr(settings, "LLAMAPARSE_API_KEY", "") + + # Get other configuration options + self.result_type = getattr(settings, "LLAMAPARSE_RESULT_TYPE", "json") + self.extract_layout = getattr(settings, "LLAMAPARSE_EXTRACT_LAYOUT", True) + self.num_workers = getattr(settings, "LLAMAPARSE_NUM_WORKERS", 4) + self.language = getattr(settings, "LLAMAPARSE_LANGUAGE", "en") + self.verbose = getattr(settings, "LLAMAPARSE_VERBOSE", False) + + logger.info( + f"LlamaParseParser initialized with 
extract_layout={self.extract_layout}, " + f"language={self.language}" + ) + + def _parse_document_impl( + self, user_id: int, doc_id: int, **all_kwargs + ) -> Optional[OpenContractDocExport]: + """ + Parse a document using the LlamaParse API. + + Args: + user_id: ID of the user requesting the parse. + doc_id: ID of the document to parse. + **all_kwargs: Additional configuration options that can override defaults: + - api_key: Override the API key + - result_type: Output type ("json", "markdown", "text") + - extract_layout: Whether to extract layout/bounding boxes + - num_workers: Number of parallel workers + - language: Document language + - verbose: Enable verbose logging + + Returns: + OpenContractDocExport with the parsed document data, or None if parsing failed. + """ + # Redact sensitive keys before logging + safe_kwargs = { + k: ("***" if k == "api_key" else v) for k, v in all_kwargs.items() + } + logger.info( + f"LlamaParseParser - Parsing doc {doc_id} for user {user_id} " + f"with effective kwargs: {safe_kwargs}" + ) + + # Override settings with kwargs if provided + api_key = all_kwargs.get("api_key", self.api_key) + result_type = all_kwargs.get("result_type", self.result_type) + extract_layout = all_kwargs.get("extract_layout", self.extract_layout) + num_workers = all_kwargs.get("num_workers", self.num_workers) + language = all_kwargs.get("language", self.language) + verbose = all_kwargs.get("verbose", self.verbose) + + if not api_key: + logger.error( + "LlamaParse API key not configured. Set LLAMAPARSE_API_KEY or " + "LLAMA_CLOUD_API_KEY environment variable." 
+ ) + return None + + # Get the document + try: + document = Document.objects.get(pk=doc_id) + except Document.DoesNotExist: + logger.error(f"Document {doc_id} not found") + return None + + # Determine which file to use + if document.pdf_file and document.pdf_file.name: + doc_path = document.pdf_file.name + else: + logger.error(f"No PDF file found for document {doc_id}") + return None + + try: + # Import llama-parse here to avoid import errors if not installed + from llama_parse import LlamaParse + + # Initialize the parser + parser = LlamaParse( + api_key=api_key, + result_type=result_type, + num_workers=num_workers, + verbose=verbose, + language=language, + ) + + # Read the file from storage and write to a temp file + # (LlamaParse needs a file path) + with default_storage.open(doc_path, "rb") as doc_file: + doc_bytes = doc_file.read() + + # Determine file extension from document type + file_type = document.file_type.lower() if document.file_type else "pdf" + suffix = f".{file_type}" if file_type in ("pdf", "docx") else ".pdf" + + # Create a temporary file - use a nested try-finally to ensure cleanup + # on all exit paths (success, error, or early return) + temp_file_path = None + try: + with tempfile.NamedTemporaryFile( + suffix=suffix, delete=False + ) as temp_file: + temp_file.write(doc_bytes) + temp_file_path = temp_file.name + + # Parse the document + logger.info("Sending document to LlamaParse API...") + + # Use get_json_result for JSON with layout data + if result_type == "json" and extract_layout: + # For JSON with layout, we need to use the async API or + # get_json_result method + json_results = parser.get_json_result(temp_file_path) + + if not json_results: + logger.error("LlamaParse returned empty results") + return None + + # Convert to OpenContracts format + return self._convert_json_to_opencontracts( + document, json_results, extract_layout + ) + else: + # For markdown/text output, use load_data + documents = parser.load_data(temp_file_path) + + if 
not documents: + logger.error("LlamaParse returned empty results") + return None + + # Convert simple text/markdown output + return self._convert_text_to_opencontracts(document, documents) + + finally: + # Clean up temp file - always runs on any exit path + if temp_file_path and os.path.exists(temp_file_path): + os.unlink(temp_file_path) + + except ImportError: + logger.error( + "llama-parse library not installed. " + "Install with: pip install llama-parse" + ) + return None + except Exception as e: + import traceback + + stacktrace = traceback.format_exc() + logger.error(f"LlamaParse parsing failed: {e}\n{stacktrace}") + return None + + def _convert_json_to_opencontracts( + self, + document: Document, + json_results: list[dict[str, Any]], + extract_layout: bool = True, + ) -> OpenContractDocExport: + """ + Convert LlamaParse JSON results to OpenContracts format. + + Args: + document: The Document model instance. + json_results: List of JSON results from LlamaParse. + extract_layout: Whether layout data with bounding boxes is included. + + Returns: + OpenContractDocExport with parsed data. 
+ """ + # The first result contains the parsed document + result = json_results[0] if json_results else {} + pages = result.get("pages", []) + + # Build the full text content + full_text_parts = [] + pawls_pages: list[PawlsPagePythonType] = [] + annotations: list[OpenContractsAnnotationPythonType] = [] + + # Track annotation IDs + annotation_id_counter = 0 + + for page_idx, page in enumerate(pages): + page_text = page.get("text", "") + full_text_parts.append(page_text) + + # Log full page structure on first page for debugging + if page_idx == 0: + page_keys = list(page.keys()) + logger.info(f"DEBUG: Page keys: {page_keys}") + + # Get page dimensions (default to standard US Letter size in points: 8.5" x 11") + # Note: A4 size would be 595 x 842 points + # LlamaParse may use different key names for dimensions + DEFAULT_WIDTH = 612 + DEFAULT_HEIGHT = 792 + page_width = page.get("width", page.get("w", page.get("pageWidth"))) + page_height = page.get("height", page.get("h", page.get("pageHeight"))) + + # Validate dimensions - must be positive numbers + if page_width is None or page_width <= 0: + page_width = DEFAULT_WIDTH + logger.warning( + f"Page {page_idx} has invalid width, using default: {page_width}" + ) + if page_height is None or page_height <= 0: + page_height = DEFAULT_HEIGHT + logger.warning( + f"Page {page_idx} has invalid height, using default: {page_height}" + ) + + # Create PAWLS page structure + pawls_page: PawlsPagePythonType = { + "page": { + "width": page_width, + "height": page_height, + "index": page_idx, + }, + "tokens": [], + } + + # Extract layout elements if available + layout_elements = page.get("layout", []) if extract_layout else [] + items = page.get("items", []) + + # Process items (elements with text and positions) + # Debug: Log first few items to understand bbox format + if page_idx == 0 and items: + logger.info(f"DEBUG: Page dimensions: {page_width}x{page_height}") + # Log full structure of first item for debugging + if items: + 
logger.info(f"DEBUG: Full first item structure: {items[0]}") + for i, debug_item in enumerate(items[:3]): + # Check all possible bbox key names + bbox_val = debug_item.get( + "bBox", + debug_item.get("bbox", debug_item.get("bounding_box", "NONE")), + ) + logger.info( + f"DEBUG: Item {i} keys: {debug_item.keys()}, " + f"bBox: {bbox_val}, " + f"text: {debug_item.get('text', debug_item.get('value', ''))[:50]}" + ) + + for item in items: + item_text = item.get("text", "") or item.get("value", "") + item_type = item.get("type", "text").lower() + # LlamaParse uses 'bBox' (camelCase), also check 'bbox' and 'bounding_box' + bbox = item.get("bBox", item.get("bbox", item.get("bounding_box", {}))) + + if not item_text.strip(): + continue + + # Parse bbox to get bounds (no tokens - LlamaParse doesn't provide them) + _, bounds = self._create_pawls_tokens_from_bbox( + item_text, + bbox, + page_width, + page_height, + annotation_id_counter, # Just used for debug logging + ) + + # Create annotation for this element + label = self.ELEMENT_TYPE_MAPPING.get(item_type, "Text Block") + annotation = self._create_annotation( + annotation_id=str(annotation_id_counter), + label=label, + raw_text=item_text, + page_idx=page_idx, + bounds=bounds, + ) + annotations.append(annotation) + annotation_id_counter += 1 + + # If no items but we have layout, process layout elements + if not items and layout_elements: + for element in layout_elements: + element_type = element.get("label", "text").lower() + # Check all possible bbox key names (bBox, bbox, bounding_box) + bbox = element.get( + "bBox", element.get("bbox", element.get("bounding_box", {})) + ) + element_text = element.get("text", "") + + if not element_text and element_type not in ["figure", "image"]: + continue + + # Parse bbox to get bounds (no tokens - LlamaParse doesn't provide them) + _, bounds = self._create_pawls_tokens_from_bbox( + element_text or f"[{element_type}]", + bbox, + page_width, + page_height, + annotation_id_counter, # 
Just used for debug logging + ) + + label = self.ELEMENT_TYPE_MAPPING.get(element_type, "Text Block") + annotation = self._create_annotation( + annotation_id=str(annotation_id_counter), + label=label, + raw_text=element_text or f"[{element_type}]", + page_idx=page_idx, + bounds=bounds, + ) + annotations.append(annotation) + annotation_id_counter += 1 + + pawls_pages.append(pawls_page) + + # Combine all text + full_text = "\n\n".join(full_text_parts) + + # Build the export + export: OpenContractDocExport = { + "title": document.title, + "content": full_text, + "description": document.description or "", + "pawls_file_content": pawls_pages, + "page_count": len(pages), + "doc_labels": [], + "labelled_text": annotations, + "relationships": [], + } + + logger.info( + f"Converted LlamaParse output: {len(pages)} pages, " + f"{len(annotations)} annotations" + ) + + return export + + def _convert_text_to_opencontracts( + self, + document: Document, + llama_documents: list[Any], + ) -> OpenContractDocExport: + """ + Convert simple text/markdown LlamaParse output to OpenContracts format. + + This is used when layout extraction is not enabled. + + Args: + document: The Document model instance. + llama_documents: List of LlamaIndex Document objects. + + Returns: + OpenContractDocExport with parsed data. 
+ """ + # Combine text from all documents + full_text = "\n\n".join(doc.text for doc in llama_documents if doc.text) + + # Without layout data, we create a minimal export + export: OpenContractDocExport = { + "title": document.title, + "content": full_text, + "description": document.description or "", + "pawls_file_content": [], + "page_count": len(llama_documents) or 1, + "doc_labels": [], + "labelled_text": [], + "relationships": [], + } + + logger.info( + f"Converted LlamaParse text output: {len(llama_documents)} documents, " + f"{len(full_text)} characters" + ) + + return export + + def _create_pawls_tokens_from_bbox( + self, + text: str, + bbox: dict[str, Any], + page_width: float, + page_height: float, + start_token_idx: int, + ) -> tuple[list[PawlsTokenPythonType], BoundingBoxPythonType]: + """ + Create PAWLS tokens from text and bounding box. + + LlamaParse returns bounding boxes as fractions (0-1) of page dimensions. + We need to convert these to absolute coordinates. + + Args: + text: The text content. + bbox: Bounding box dict with keys like 'x', 'y', 'width', 'height' or + 'left', 'top', 'right', 'bottom' (as fractions 0-1). + page_width: Page width in points. + page_height: Page height in points. + start_token_idx: Starting token index. + + Returns: + Tuple of (list of PAWLS tokens, overall bounding box). 
+ """ + tokens: list[PawlsTokenPythonType] = [] + + # Default margin constant (1 inch = 72 points) + DEFAULT_MARGIN = 72 + # Default bottom position for fallback bounding boxes (~1.4 inches from top) + # This provides reasonable vertical space for a single-line text element + DEFAULT_BOTTOM = 100 + + # Parse bounding box - handle different formats from LlamaParse + # LlamaParse may return fractional coordinates (0-1) or absolute coordinates + bbox_format = "none" + is_fractional = False + + if not bbox: + # No bbox, create a default one with standard margins + bbox_format = "default/empty" + left, top = DEFAULT_MARGIN, DEFAULT_MARGIN + right, bottom = page_width - DEFAULT_MARGIN, DEFAULT_BOTTOM + elif "x1" in bbox and "y1" in bbox: + # Format: {x1, y1, x2, y2} - corner coordinates + bbox_format = "x1/y1/x2/y2" + x1 = float(bbox.get("x1", 0)) + y1 = float(bbox.get("y1", 0)) + x2 = float(bbox.get("x2", 0)) + y2 = float(bbox.get("y2", 0)) + + # Check if fractional + is_fractional = all(0 <= v <= 1.0 for v in [x1, y1, x2, y2]) + if is_fractional: + left = x1 * page_width + top = y1 * page_height + right = x2 * page_width + bottom = y2 * page_height + else: + left, top, right, bottom = x1, y1, x2, y2 + elif "x" in bbox and "y" in bbox: + # Format: {x, y, width/w, height/h} + # LlamaParse uses 'w' and 'h' shorthand + bbox_format = "x/y/w/h" + x = float(bbox.get("x", 0)) + y = float(bbox.get("y", 0)) + w = float(bbox.get("w", bbox.get("width", 0.1))) + h = float(bbox.get("h", bbox.get("height", 0.02))) + + # Check if values are fractions (0-1) or absolute + # Heuristic: if both corners (x,y) and (x+w,y+h) are in [0,1], treat as fractional + is_fractional = ( + 0 <= x <= 1.0 + and 0 <= y <= 1.0 + and 0 <= (x + w) <= 1.0 + and 0 <= (y + h) <= 1.0 + ) + if is_fractional: + left = x * page_width + top = y * page_height + right = (x + w) * page_width + bottom = (y + h) * page_height + else: + left, top = x, y + right = x + w + bottom = y + h + elif "left" in bbox: + # Format: 
{left, top, right, bottom} + bbox_format = "left/top/right/bottom" + bbox_l = float(bbox.get("left", 0)) + bbox_t = float(bbox.get("top", 0)) + bbox_r = float(bbox.get("right", 1)) + bbox_b = float(bbox.get("bottom", 0.05)) + + # Check if ALL values are in [0,1] range - indicates fractional coordinates + is_fractional = all(0 <= v <= 1.0 for v in [bbox_l, bbox_t, bbox_r, bbox_b]) + if is_fractional: + left = bbox_l * page_width + top = bbox_t * page_height + right = bbox_r * page_width + bottom = bbox_b * page_height + else: + left, top, right, bottom = bbox_l, bbox_t, bbox_r, bbox_b + elif isinstance(bbox, (list, tuple)) and len(bbox) >= 4: + # Format: [x1, y1, x2, y2] or [left, top, right, bottom] + bbox_format = "array[4]" + vals = [float(v) for v in bbox[:4]] + # Check if ALL values are in [0,1] range - indicates fractional coordinates + is_fractional = all(0 <= v <= 1.0 for v in vals) + if is_fractional: + left = vals[0] * page_width + top = vals[1] * page_height + right = vals[2] * page_width + bottom = vals[3] * page_height + else: + left, top, right, bottom = vals + else: + # Unknown format, use defaults with standard margins + bbox_format = f"unknown:{type(bbox).__name__}" + left, top = DEFAULT_MARGIN, DEFAULT_MARGIN + right, bottom = page_width - DEFAULT_MARGIN, DEFAULT_BOTTOM + + # Sanity checks and bounds validation + # Ensure left < right and top < bottom (swap if needed) + if left > right: + left, right = right, left + if top > bottom: + top, bottom = bottom, top + + # Clamp to page bounds + left = max(0, min(left, page_width)) + right = max(0, min(right, page_width)) + top = max(0, min(top, page_height)) + bottom = max(0, min(bottom, page_height)) + + # Ensure minimum dimensions (at least 1 point) + if right - left < 1: + right = left + 1 + if bottom - top < 1: + bottom = top + 1 + + # NOTE: We do NOT create fake tokens here. LlamaParse only provides element-level + # bounding boxes, not token-level data. 
Creating fake tokens by evenly distributing + # words across the bbox produces incorrect highlights. The frontend handles + # annotations with empty tokensJsons gracefully - it just shows the bounding box + # without individual token highlights. + + # Create overall bounding box + bounds: BoundingBoxPythonType = { + "left": left, + "top": top, + "right": right, + "bottom": bottom, + } + + # Debug logging for first few conversions + if start_token_idx < 5: + logger.info( + f"DEBUG bbox: format={bbox_format}, fractional={is_fractional}, " + f"input={bbox}" + ) + logger.info( + f"DEBUG output: bounds=({left:.1f}, {top:.1f}, {right:.1f}, {bottom:.1f}), " + f"page={page_width:.0f}x{page_height:.0f}" + ) + + # Return empty tokens list - we don't have real token data from LlamaParse + return tokens, bounds + + def _create_annotation( + self, + annotation_id: str, + label: str, + raw_text: str, + page_idx: int, + bounds: BoundingBoxPythonType, + ) -> OpenContractsAnnotationPythonType: + """ + Create an OpenContracts annotation. + + Args: + annotation_id: Unique ID for the annotation. + label: The annotation label. + raw_text: The text content. + page_idx: Page index (0-based). + bounds: Bounding box. + + Returns: + OpenContractsAnnotationPythonType annotation. + """ + # NOTE: We use empty tokensJsons because LlamaParse only provides element-level + # bounding boxes, not token-level data. The frontend handles this gracefully + # by showing just the bounding box without individual token highlights. 
+ + # Create page annotation with empty token references + page_annotation: OpenContractsSinglePageAnnotationType = { + "bounds": bounds, + "tokensJsons": [], # Empty - no token data from LlamaParse + "rawText": raw_text, + } + + annotation: OpenContractsAnnotationPythonType = { + "id": annotation_id, + "annotationLabel": label, + "rawText": raw_text, + "page": page_idx, + "annotation_json": {str(page_idx): page_annotation}, + "parent_id": None, + "annotation_type": TOKEN_LABEL, + "structural": True, + } + + return annotation diff --git a/opencontractserver/tests/test_doc_parser_llamaparse.py b/opencontractserver/tests/test_doc_parser_llamaparse.py new file mode 100644 index 000000000..8938ea145 --- /dev/null +++ b/opencontractserver/tests/test_doc_parser_llamaparse.py @@ -0,0 +1,879 @@ +""" +Tests for the LlamaParseParser class. + +Tests cover: +- Successful document parsing with JSON/layout output +- Bounding box parsing and conversion +- Structural annotation creation (without token-level data) +- Error handling (missing API key, API errors, etc.) +- Configuration via environment variables + +Note: LlamaParse only provides element-level bounding boxes, not token-level data. +Annotations are created with empty tokensJsons - the frontend handles this gracefully +by showing just the bounding box outline without individual token highlights. 
import sys
from unittest.mock import MagicMock, patch

from django.contrib.auth import get_user_model
from django.core.files.base import ContentFile
from django.db import transaction
from django.test import TestCase, override_settings

from opencontractserver.documents.models import Document
from opencontractserver.pipeline.parsers.llamaparse_parser import LlamaParseParser

User = get_user_model()

# Create a mock llama_parse module for testing since it may not be installed.
# NOTE(review): this mutates sys.modules at import time, so the fake module
# leaks into every other test loaded in the same process - confirm intended.
mock_llama_parse = MagicMock()
mock_llama_parse.LlamaParse = MagicMock()
sys.modules["llama_parse"] = mock_llama_parse


class MockLlamaDocument:
    """Mock LlamaIndex Document object (only the .text attribute is read)."""

    def __init__(self, text: str):
        self.text = text


class TestLlamaParseParser(TestCase):
    """Tests for the LlamaParseParser class."""

    def setUp(self):
        """Set up test environment."""
        with transaction.atomic():
            self.user = User.objects.create_user(
                username="testuser", password="testpass123"
            )

        # Create a sample Document object with a mock PDF file
        self.doc = Document.objects.create(
            title="Test LlamaParse Document",
            description="Test Description",
            file_type="pdf",
            creator=self.user,
        )

        # Create a minimal valid PDF for testing
        pdf_content = b"%PDF-1.7\n1 0 obj\n<>\nendobj\n2 0 obj\n<>\nendobj\n3 0 obj\n<>>>\nendobj\nxref\n0 4\n0000000000 65535 f\n0000000010 00000 n\n0000000053 00000 n\n0000000102 00000 n\ntrailer\n<>\nstartxref\n178\n%%EOF\n"  # noqa: E501
        self.doc.pdf_file.save("test_llama.pdf", ContentFile(pdf_content))

        # Sample JSON response from LlamaParse with layout data
        # Note: LlamaParse uses 'bBox' (camelCase) with 'w'/'h' keys
        self.sample_json_response = [
            {
                "pages": [
                    {
                        "text": "This is the first page of the document.",
                        "width": 612,
                        "height": 792,
                        "items": [
                            {
                                "type": "title",
                                "text": "Document Title",
                                "bBox": {
                                    "x": 61.2,
                                    "y": 39.6,
                                    "w": 489.6,
                                    "h": 39.6,
                                },
                            },
                            {
                                "type": "paragraph",
                                "text": "This is a paragraph with some content.",
                                "bBox": {
                                    "x": 61.2,
                                    "y": 118.8,
                                    "w": 489.6,
                                    "h": 79.2,
                                },
                            },
                            {
                                "type": "table",
                                "text": "Column A | Column B\nValue 1 | Value 2",
                                "bBox": {
                                    "x": 61.2,
                                    "y": 237.6,
                                    "w": 489.6,
                                    "h": 158.4,
                                },
                            },
                        ],
                        "layout": [
                            {
                                "label": "title",
                                "bBox": {
                                    "x": 61.2,
                                    "y": 39.6,
                                    "w": 489.6,
                                    "h": 39.6,
                                },
                                "confidence": 0.95,
                                "isLikelyNoise": False,
                            },
                        ],
                    },
                    {
                        "text": "This is the second page.",
                        "width": 612,
                        "height": 792,
                        "items": [
                            {
                                "type": "text",
                                "text": "More content on page 2.",
                                "bBox": {
                                    "x": 61.2,
                                    "y": 79.2,
                                    "w": 489.6,
                                    "h": 79.2,
                                },
                            },
                        ],
                    },
                ]
            }
        ]

    # Decorators apply bottom-up: default_storage.open is innermost, so it is
    # the first mock argument, followed by the LlamaParse class mock.
    @override_settings(LLAMAPARSE_API_KEY="test-api-key-123")
    @patch("llama_parse.LlamaParse")
    @patch("opencontractserver.pipeline.parsers.llamaparse_parser.default_storage.open")
    def test_parse_document_success_with_layout(
        self, mock_open, mock_llama_parse_class
    ):
        """Test successful document parsing with layout extraction."""
        # Mock file reading
        mock_file = MagicMock()
        mock_file.read.return_value = b"mock pdf content"
        mock_open.return_value.__enter__.return_value = mock_file

        # Mock the LlamaParse instance
        mock_parser = MagicMock()
        mock_parser.get_json_result.return_value = self.sample_json_response
        mock_llama_parse_class.return_value = mock_parser

        # Create parser and parse document
        parser = LlamaParseParser()
        result = parser.parse_document(user_id=self.user.id, doc_id=self.doc.id)

        # Verify result structure
        self.assertIsNotNone(result)
        self.assertEqual(result["title"], "Test LlamaParse Document")
        self.assertEqual(result["page_count"], 2)

        # Verify PAWLS content was generated
        self.assertIn("pawls_file_content", result)
        self.assertEqual(len(result["pawls_file_content"]), 2)

        # Verify first page structure
        first_page = result["pawls_file_content"][0]
        self.assertEqual(first_page["page"]["index"], 0)
        self.assertEqual(first_page["page"]["width"], 612)
        self.assertEqual(first_page["page"]["height"], 792)
        # LlamaParse doesn't provide token-level data, so tokens list is empty
        self.assertEqual(len(first_page["tokens"]), 0)

        # Verify annotations were created
        self.assertIn("labelled_text", result)
        self.assertGreater(len(result["labelled_text"]), 0)

        # Verify annotation structure
        first_annotation = result["labelled_text"][0]
        self.assertEqual(first_annotation["annotationLabel"], "Title")
        self.assertEqual(first_annotation["structural"], True)
        self.assertEqual(first_annotation["annotation_type"], "TOKEN_LABEL")
        self.assertIn("annotation_json", first_annotation)

    @override_settings(LLAMAPARSE_API_KEY="test-api-key-123")
    @patch("llama_parse.LlamaParse")
    @patch("opencontractserver.pipeline.parsers.llamaparse_parser.default_storage.open")
    def test_parse_document_markdown_mode(self, mock_open, mock_llama_parse_class):
        """Test document parsing with markdown output (no layout)."""
        # Mock file reading
        mock_file = MagicMock()
        mock_file.read.return_value = b"mock pdf content"
        mock_open.return_value.__enter__.return_value = mock_file

        # Mock the LlamaParse instance for markdown mode
        mock_parser = MagicMock()
        mock_parser.load_data.return_value = [
            MockLlamaDocument("# Title\n\nThis is the document content."),
            MockLlamaDocument("## Section 2\n\nMore content here."),
        ]
        mock_llama_parse_class.return_value = mock_parser

        # Create parser and parse document with markdown mode
        parser = LlamaParseParser()
        result = parser.parse_document(
            user_id=self.user.id,
            doc_id=self.doc.id,
            result_type="markdown",
            extract_layout=False,
        )

        # Verify result structure
        self.assertIsNotNone(result)
        self.assertEqual(result["title"], "Test LlamaParse Document")
        self.assertIn("# Title", result["content"])
        self.assertEqual(result["page_count"], 2)

        # Verify no PAWLS content (markdown mode)
        self.assertEqual(result["pawls_file_content"], [])

        # Verify no annotations (markdown mode without layout)
        self.assertEqual(result["labelled_text"], [])

    def test_parse_document_no_api_key(self):
        """Test that parsing fails gracefully without API key."""
        with override_settings(LLAMAPARSE_API_KEY=""):
            parser = LlamaParseParser()
            parser.api_key = ""  # Ensure no API key

            result = parser.parse_document(user_id=self.user.id, doc_id=self.doc.id)

            self.assertIsNone(result)

    @override_settings(LLAMAPARSE_API_KEY="test-api-key-123")
    @patch("llama_parse.LlamaParse")
    @patch("opencontractserver.pipeline.parsers.llamaparse_parser.default_storage.open")
    def test_parse_document_api_error(self, mock_open, mock_llama_parse_class):
        """Test handling of API errors."""
        # Mock file reading
        mock_file = MagicMock()
        mock_file.read.return_value = b"mock pdf content"
        mock_open.return_value.__enter__.return_value = mock_file

        # Mock the LlamaParse instance to raise an error
        mock_parser = MagicMock()
        mock_parser.get_json_result.side_effect = Exception("API rate limit exceeded")
        mock_llama_parse_class.return_value = mock_parser

        # Create parser and attempt to parse
        parser = LlamaParseParser()
        result = parser.parse_document(user_id=self.user.id, doc_id=self.doc.id)

        # Should return None on error
        self.assertIsNone(result)

    @override_settings(LLAMAPARSE_API_KEY="test-api-key-123")
    @patch("llama_parse.LlamaParse")
    @patch("opencontractserver.pipeline.parsers.llamaparse_parser.default_storage.open")
    def test_parse_document_empty_result(self, mock_open, mock_llama_parse_class):
        """Test handling of empty results from API."""
        # Mock file reading
        mock_file = MagicMock()
        mock_file.read.return_value = b"mock pdf content"
        mock_open.return_value.__enter__.return_value = mock_file

        # Mock empty response
        mock_parser = MagicMock()
        mock_parser.get_json_result.return_value = []
        mock_llama_parse_class.return_value = mock_parser

        parser = LlamaParseParser()
        result = parser.parse_document(user_id=self.user.id, doc_id=self.doc.id)

        self.assertIsNone(result)

    def test_parse_document_nonexistent(self):
        """Test parsing a document that doesn't exist."""
        with override_settings(LLAMAPARSE_API_KEY="test-api-key-123"):
            parser = LlamaParseParser()
            result = parser.parse_document(user_id=self.user.id, doc_id=999999)

            self.assertIsNone(result)

    @override_settings(LLAMAPARSE_API_KEY="test-api-key-123")
    def test_parse_document_no_pdf_file(self):
        """Test parsing a document without a PDF file."""
        # Create a document without a PDF file
        doc_without_pdf = Document.objects.create(
            title="No PDF Document",
            description="Test",
            file_type="pdf",
            creator=self.user,
        )

        parser = LlamaParseParser()
        result = parser.parse_document(user_id=self.user.id, doc_id=doc_without_pdf.id)

        self.assertIsNone(result)
class TestLlamaParseParserBboxConversion(TestCase):
    """Tests for bounding box conversion methods.

    Note: LlamaParse only provides element-level bounding boxes, not token-level data.
    The _create_pawls_tokens_from_bbox method returns empty tokens list and just the bounds.
    """

    def setUp(self):
        """Set up test environment."""
        self.parser = LlamaParseParser()

    def test_bbox_fractional_xy_format(self):
        """Test conversion of fractional x,y,width,height bbox format."""
        bbox = {"x": 0.1, "y": 0.2, "width": 0.3, "height": 0.1}
        tokens, bounds = self.parser._create_pawls_tokens_from_bbox(
            text="test word",
            bbox=bbox,
            page_width=612,
            page_height=792,
            start_token_idx=0,
        )

        # Check bounds are converted to absolute coordinates
        self.assertAlmostEqual(bounds["left"], 61.2, places=1)
        self.assertAlmostEqual(bounds["top"], 158.4, places=1)
        self.assertAlmostEqual(bounds["right"], 244.8, places=1)
        self.assertAlmostEqual(bounds["bottom"], 237.6, places=1)

        # Tokens list is empty - we don't generate fake tokens
        self.assertEqual(len(tokens), 0)

    def test_bbox_llamaparse_format(self):
        """Test conversion of LlamaParse's actual format: bBox with x/y/w/h."""
        # This is the actual format LlamaParse uses (absolute coordinates)
        bbox = {"x": 72.1, "y": 35.4, "w": 467.35, "h": 151}
        tokens, bounds = self.parser._create_pawls_tokens_from_bbox(
            text="LlamaParse format test",
            bbox=bbox,
            page_width=612,
            page_height=792,
            start_token_idx=0,
        )

        # Should be treated as absolute coordinates since values > 1
        self.assertAlmostEqual(bounds["left"], 72.1, places=1)
        self.assertAlmostEqual(bounds["top"], 35.4, places=1)
        self.assertAlmostEqual(bounds["right"], 539.45, places=1)  # x + w
        self.assertAlmostEqual(bounds["bottom"], 186.4, places=1)  # y + h

        # No tokens
        self.assertEqual(len(tokens), 0)

    def test_bbox_fractional_ltrb_format(self):
        """Test conversion of fractional left,top,right,bottom bbox format."""
        bbox = {"left": 0.1, "top": 0.2, "right": 0.9, "bottom": 0.3}
        tokens, bounds = self.parser._create_pawls_tokens_from_bbox(
            text="hello world",
            bbox=bbox,
            page_width=612,
            page_height=792,
            start_token_idx=0,
        )

        self.assertAlmostEqual(bounds["left"], 61.2, places=1)
        self.assertAlmostEqual(bounds["top"], 158.4, places=1)
        self.assertAlmostEqual(bounds["right"], 550.8, places=1)
        self.assertAlmostEqual(bounds["bottom"], 237.6, places=1)

        # No tokens
        self.assertEqual(len(tokens), 0)

    def test_bbox_array_format(self):
        """Test conversion of array bbox format [x1, y1, x2, y2]."""
        bbox = [0.1, 0.2, 0.9, 0.3]
        tokens, bounds = self.parser._create_pawls_tokens_from_bbox(
            text="array format test",
            bbox=bbox,
            page_width=612,
            page_height=792,
            start_token_idx=0,
        )

        self.assertAlmostEqual(bounds["left"], 61.2, places=1)
        self.assertAlmostEqual(bounds["top"], 158.4, places=1)
        self.assertAlmostEqual(bounds["right"], 550.8, places=1)
        self.assertAlmostEqual(bounds["bottom"], 237.6, places=1)

        # No tokens
        self.assertEqual(len(tokens), 0)

    def test_bbox_absolute_coordinates(self):
        """Test handling of absolute coordinate bbox (values > 1)."""
        bbox = {"x": 100, "y": 200, "width": 300, "height": 50}
        tokens, bounds = self.parser._create_pawls_tokens_from_bbox(
            text="absolute coords",
            bbox=bbox,
            page_width=612,
            page_height=792,
            start_token_idx=0,
        )

        # When values > 1, they're treated as absolute
        self.assertEqual(bounds["left"], 100)
        self.assertEqual(bounds["top"], 200)
        self.assertEqual(bounds["right"], 400)  # x + width
        self.assertEqual(bounds["bottom"], 250)  # y + height

        # No tokens
        self.assertEqual(len(tokens), 0)

    def test_bbox_empty(self):
        """Test handling of empty/missing bbox."""
        tokens, bounds = self.parser._create_pawls_tokens_from_bbox(
            text="no bbox",
            bbox={},
            page_width=612,
            page_height=792,
            start_token_idx=0,
        )

        # Should use default margins (1 inch = 72 points)
        self.assertEqual(bounds["left"], 72)
        self.assertEqual(bounds["top"], 72)

        # No tokens
        self.assertEqual(len(tokens), 0)

    def test_bbox_x1_y1_x2_y2_format(self):
        """Test conversion of x1/y1/x2/y2 corner coordinate format."""
        bbox = {"x1": 0.1, "y1": 0.2, "x2": 0.9, "y2": 0.3}
        tokens, bounds = self.parser._create_pawls_tokens_from_bbox(
            text="corner format test",
            bbox=bbox,
            page_width=612,
            page_height=792,
            start_token_idx=0,
        )

        self.assertAlmostEqual(bounds["left"], 61.2, places=1)
        self.assertAlmostEqual(bounds["top"], 158.4, places=1)
        self.assertAlmostEqual(bounds["right"], 550.8, places=1)
        self.assertAlmostEqual(bounds["bottom"], 237.6, places=1)

        # No tokens
        self.assertEqual(len(tokens), 0)

    def test_bbox_sanity_checks(self):
        """Test that sanity checks are applied to bounding boxes."""
        # Test bounds are clamped to page
        bbox = {"x": -10, "y": -10, "w": 1000, "h": 1000}
        tokens, bounds = self.parser._create_pawls_tokens_from_bbox(
            text="out of bounds",
            bbox=bbox,
            page_width=612,
            page_height=792,
            start_token_idx=0,
        )

        # Should be clamped to page bounds
        self.assertGreaterEqual(bounds["left"], 0)
        self.assertGreaterEqual(bounds["top"], 0)
        self.assertLessEqual(bounds["right"], 612)
        self.assertLessEqual(bounds["bottom"], 792)
+ """ + + def setUp(self): + """Set up test environment.""" + self.parser = LlamaParseParser() + + def test_create_annotation_structure(self): + """Test annotation creation has correct structure.""" + bounds = {"left": 100, "top": 100, "right": 300, "bottom": 150} + + annotation = self.parser._create_annotation( + annotation_id="anno-1", + label="Title", + raw_text="Sample Title", + page_idx=0, + bounds=bounds, + ) + + # Check required fields + self.assertEqual(annotation["id"], "anno-1") + self.assertEqual(annotation["annotationLabel"], "Title") + self.assertEqual(annotation["rawText"], "Sample Title") + self.assertEqual(annotation["page"], 0) + self.assertEqual(annotation["structural"], True) + self.assertEqual(annotation["annotation_type"], "TOKEN_LABEL") + self.assertIsNone(annotation["parent_id"]) + + # Check annotation_json structure + self.assertIn("0", annotation["annotation_json"]) + page_anno = annotation["annotation_json"]["0"] + self.assertEqual(page_anno["bounds"], bounds) + self.assertEqual(page_anno["rawText"], "Sample Title") + # tokensJsons is empty - LlamaParse doesn't provide token-level data + self.assertEqual(len(page_anno["tokensJsons"]), 0) + + def test_element_type_mapping(self): + """Test that element types are properly mapped to labels.""" + type_mappings = { + "title": "Title", + "paragraph": "Paragraph", + "table": "Table", + "figure": "Figure", + "list": "List", + "heading": "Heading", + "unknown_type": "Text Block", # Default + } + + for element_type, expected_label in type_mappings.items(): + label = LlamaParseParser.ELEMENT_TYPE_MAPPING.get( + element_type, "Text Block" + ) + self.assertEqual( + label, + expected_label, + f"Element type '{element_type}' should map to '{expected_label}'", + ) + + +class TestLlamaParseParserConfiguration(TestCase): + """Tests for parser configuration.""" + + def test_default_configuration(self): + """Test default configuration values.""" + with override_settings( + LLAMAPARSE_API_KEY="test-key", + 
class TestLlamaParseParserConfiguration(TestCase):
    """Tests for parser configuration."""

    def test_default_configuration(self):
        """Test default configuration values."""
        with override_settings(
            LLAMAPARSE_API_KEY="test-key",
            LLAMAPARSE_RESULT_TYPE="json",
            LLAMAPARSE_EXTRACT_LAYOUT=True,
            LLAMAPARSE_NUM_WORKERS=4,
            LLAMAPARSE_LANGUAGE="en",
            LLAMAPARSE_VERBOSE=False,
        ):
            parser = LlamaParseParser()

            self.assertEqual(parser.result_type, "json")
            self.assertEqual(parser.extract_layout, True)
            self.assertEqual(parser.num_workers, 4)
            self.assertEqual(parser.language, "en")
            self.assertEqual(parser.verbose, False)

    def test_custom_configuration(self):
        """Test custom configuration via settings."""
        with override_settings(
            LLAMAPARSE_API_KEY="custom-key",
            LLAMAPARSE_RESULT_TYPE="markdown",
            LLAMAPARSE_EXTRACT_LAYOUT=False,
            LLAMAPARSE_NUM_WORKERS=8,
            LLAMAPARSE_LANGUAGE="de",
            LLAMAPARSE_VERBOSE=True,
        ):
            parser = LlamaParseParser()

            self.assertEqual(parser.result_type, "markdown")
            self.assertEqual(parser.extract_layout, False)
            self.assertEqual(parser.num_workers, 8)
            self.assertEqual(parser.language, "de")
            self.assertEqual(parser.verbose, True)

    @override_settings(LLAMAPARSE_API_KEY="test-key")
    @patch("llama_parse.LlamaParse")
    @patch("opencontractserver.pipeline.parsers.llamaparse_parser.default_storage.open")
    def test_kwargs_override_settings(self, mock_open, mock_llama_parse_class):
        """Test that kwargs override settings."""
        with transaction.atomic():
            user = User.objects.create_user(
                username="configtestuser", password="pass123"
            )

        doc = Document.objects.create(
            title="Config Test Doc",
            file_type="pdf",
            creator=user,
        )
        doc.pdf_file.save("config_test.pdf", ContentFile(b"%PDF-1.4 test"))

        mock_file = MagicMock()
        mock_file.read.return_value = b"mock pdf"
        mock_open.return_value.__enter__.return_value = mock_file

        mock_parser = MagicMock()
        mock_parser.get_json_result.return_value = [{"pages": []}]
        mock_llama_parse_class.return_value = mock_parser

        parser = LlamaParseParser()
        # Per-call kwargs should take precedence over the settings above.
        parser.parse_document(
            user_id=user.id,
            doc_id=doc.id,
            language="fr",
            num_workers=16,
        )

        # Verify LlamaParse was called with overridden values
        mock_llama_parse_class.assert_called_once()
        call_kwargs = mock_llama_parse_class.call_args.kwargs
        self.assertEqual(call_kwargs["language"], "fr")
        self.assertEqual(call_kwargs["num_workers"], 16)
@patch("llama_parse.LlamaParse") + @patch("opencontractserver.pipeline.parsers.llamaparse_parser.default_storage.open") + def test_parse_document_layout_only_processing( + self, mock_open, mock_llama_parse_class + ): + """Test document parsing when items are empty but layout exists. + + This tests lines 344-381 in llamaparse_parser.py. + """ + mock_file = MagicMock() + mock_file.read.return_value = b"mock pdf content" + mock_open.return_value.__enter__.return_value = mock_file + + mock_parser = MagicMock() + mock_parser.get_json_result.return_value = self.layout_only_response + mock_llama_parse_class.return_value = mock_parser + + parser = LlamaParseParser() + result = parser.parse_document(user_id=self.user.id, doc_id=self.doc.id) + + # Verify result structure + self.assertIsNotNone(result) + self.assertEqual(result["title"], "Layout Test Document") + self.assertEqual(result["page_count"], 1) + + # Verify PAWLS content structure (tokens are empty - no token-level data) + self.assertIn("pawls_file_content", result) + self.assertEqual(len(result["pawls_file_content"]), 1) + first_page = result["pawls_file_content"][0] + # LlamaParse doesn't provide token-level data + self.assertEqual(len(first_page["tokens"]), 0) + + # Verify annotations were created from layout elements + self.assertIn("labelled_text", result) + # Should have 3 annotations: title, paragraph, and figure + # The empty text "text" type should be skipped + self.assertEqual(len(result["labelled_text"]), 3) + + # Check the annotation labels + labels = [anno["annotationLabel"] for anno in result["labelled_text"]] + self.assertIn("Title", labels) + self.assertIn("Paragraph", labels) + self.assertIn("Figure", labels) + + @override_settings(LLAMAPARSE_API_KEY="test-api-key-123") + @patch("llama_parse.LlamaParse") + @patch("opencontractserver.pipeline.parsers.llamaparse_parser.default_storage.open") + def test_parse_document_layout_figure_without_text( + self, mock_open, mock_llama_parse_class + ): + """Test 
that figures/images with empty text are processed correctly. + + Figures and images should use [element_type] as placeholder text. + """ + layout_with_images = [ + { + "pages": [ + { + "text": "Page with figures", + "width": 612, + "height": 792, + "items": [], + "layout": [ + { + "label": "image", + "bBox": { + "x": 61.2, + "y": 79.2, + "w": 489.6, + "h": 316.8, + }, + "text": "", # Empty text - should use [image] + }, + { + "label": "figure", + "bBox": { + "x": 61.2, + "y": 475.2, + "w": 489.6, + "h": 237.6, + }, + "text": "", # Empty text - should use [figure] + }, + ], + } + ] + } + ] + + mock_file = MagicMock() + mock_file.read.return_value = b"mock pdf content" + mock_open.return_value.__enter__.return_value = mock_file + + mock_parser = MagicMock() + mock_parser.get_json_result.return_value = layout_with_images + mock_llama_parse_class.return_value = mock_parser + + parser = LlamaParseParser() + result = parser.parse_document(user_id=self.user.id, doc_id=self.doc.id) + + # Both figure and image should be processed + self.assertIsNotNone(result) + self.assertEqual(len(result["labelled_text"]), 2) + + # Check that placeholder text was used + for anno in result["labelled_text"]: + self.assertIn(anno["rawText"], ["[image]", "[figure]"]) + + @override_settings(LLAMAPARSE_API_KEY="test-api-key-123") + @patch("llama_parse.LlamaParse") + @patch("opencontractserver.pipeline.parsers.llamaparse_parser.default_storage.open") + def test_parse_document_layout_skips_empty_text_non_figures( + self, mock_open, mock_llama_parse_class + ): + """Test that non-figure elements with empty text are skipped.""" + layout_with_empty_text = [ + { + "pages": [ + { + "text": "Page content", + "width": 612, + "height": 792, + "items": [], + "layout": [ + { + "label": "title", + "bBox": { + "x": 61.2, + "y": 79.2, + "w": 489.6, + "h": 39.6, + }, + "text": "Valid Title", # Has text - should be included + }, + { + "label": "paragraph", + "bBox": { + "x": 61.2, + "y": 158.4, + "w": 489.6, + 
"h": 79.2, + }, + "text": "", # Empty text - should be skipped + }, + { + "label": "heading", + "bBox": { + "x": 61.2, + "y": 316.8, + "w": 489.6, + "h": 39.6, + }, + "text": "", # Empty text - should be skipped + }, + { + "label": "section_header", + "bBox": { + "x": 61.2, + "y": 396.0, + "w": 489.6, + "h": 39.6, + }, + "text": "Valid Section Header", # Has text - should be included + }, + ], + } + ] + } + ] + + mock_file = MagicMock() + mock_file.read.return_value = b"mock pdf content" + mock_open.return_value.__enter__.return_value = mock_file + + mock_parser = MagicMock() + mock_parser.get_json_result.return_value = layout_with_empty_text + mock_llama_parse_class.return_value = mock_parser + + parser = LlamaParseParser() + result = parser.parse_document(user_id=self.user.id, doc_id=self.doc.id) + + # Only 2 annotations should be created (title and section_header) + self.assertIsNotNone(result) + self.assertEqual(len(result["labelled_text"]), 2) + + labels = [anno["annotationLabel"] for anno in result["labelled_text"]] + self.assertIn("Title", labels) + self.assertIn("Section Header", labels) + self.assertNotIn("Paragraph", labels) + self.assertNotIn("Heading", labels) diff --git a/opencontractserver/tests/test_pipeline_component_queries.py b/opencontractserver/tests/test_pipeline_component_queries.py index 831e9b7fc..5ffe82197 100644 --- a/opencontractserver/tests/test_pipeline_component_queries.py +++ b/opencontractserver/tests/test_pipeline_component_queries.py @@ -344,7 +344,11 @@ def test_pipeline_components_query_with_mimetype(self): self.assertIn("Test PostProcessor", post_processor_titles) def test_pipeline_components_query_with_mimetype_no_components(self): - """Test querying pipeline components with a mimetype that has no components.""" + """Test querying pipeline components with a mimetype that has limited components. + + Note: DOCX now has LlamaParseParser support, but no thumbnailer support. 
+ This test verifies the filtering behavior for file types with partial support. + """ # Use the enum value, not the full MIME type query = """ @@ -366,13 +370,19 @@ def test_pipeline_components_query_with_mimetype_no_components(self): } """ - variables = {"mimetype": "DOCX"} # Our test components do not support DOCX + variables = {"mimetype": "DOCX"} result = self.client.execute(query, variables=variables) self.assertIsNone(result.get("errors")) data = result["data"]["pipelineComponents"] - self.assertEqual(len(data["parsers"]), 0) + + # LlamaParseParser supports DOCX, so we expect at least one parser + parsers = data["parsers"] + parser_titles = [parser["title"] for parser in parsers] + self.assertIn("LlamaParse Parser", parser_titles) + + # No thumbnailers support DOCX self.assertEqual(len(data["thumbnailers"]), 0) # Embedders are included regardless of mimetype in our utils diff --git a/requirements/ingestors/llama_parse.txt b/requirements/ingestors/llama_parse.txt new file mode 100644 index 000000000..12219944a --- /dev/null +++ b/requirements/ingestors/llama_parse.txt @@ -0,0 +1,2 @@ +# LlamaParse document parsing +llama-parse>=0.5.0