From 6a175d6c35cb62f7f02adf790e6afd0240fc129a Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 27 Dec 2025 02:11:15 +0000 Subject: [PATCH 1/6] Add comprehensive MCP interface proposal for issue #387 This proposal designs an elegant, performant MCP (Model Context Protocol) interface for OpenContracts that provides read-only access to public resources. Key features: - One-corpus-at-a-time model for focused exploration - Anonymous user permission model (public resources only) - 4 MCP resources: corpus, document, annotation, thread - 7 MCP tools for discovery and retrieval - Performance optimized using AnnotationQueryOptimizer - Respects consolidated permissioning guide - Comprehensive implementation examples and testing strategy Implementation includes: - Resource URI scheme (corpus://, document://, annotation://, thread://) - Tool definitions (list_public_corpuses, list_documents, get_document_text, list_annotations, search_corpus, list_threads, get_thread_messages) - Security considerations (rate limiting, input validation) - Performance optimizations (caching, pagination, query optimization) - Complete code examples for server.py, resources.py, tools.py - Testing strategy with unit and integration tests - Deployment instructions Addresses: #387 --- docs/mcp/mcp_interface_proposal.md | 1166 ++++++++++++++++++++++++++++ 1 file changed, 1166 insertions(+) create mode 100644 docs/mcp/mcp_interface_proposal.md diff --git a/docs/mcp/mcp_interface_proposal.md b/docs/mcp/mcp_interface_proposal.md new file mode 100644 index 000000000..3bcd7598e --- /dev/null +++ b/docs/mcp/mcp_interface_proposal.md @@ -0,0 +1,1166 @@ +# OpenContracts MCP Interface Proposal + +## Overview + +This document proposes an elegant, performant Model Context Protocol (MCP) interface for OpenContracts that provides **read-only access to public resources**. 
The interface follows a **one-corpus-at-a-time** model, allowing AI assistants and other MCP clients to explore public corpuses, documents, annotations, and discussion threads. + +## Design Principles + +1. **Public-Only Access**: Only resources where `is_public=True` are accessible +2. **Read-Only Operations**: No mutations - pure information retrieval +3. **One Corpus Context**: Users select a corpus, then explore within that scope +4. **Performance First**: Leverage existing query optimizers and manager methods +5. **Anonymous User Model**: Operate as an anonymous user with READ permissions only +6. **Respect Permission Model**: Follow existing permissioning rules (a document is exposed only when both it and its corpus are public) + +## Architecture + +### Permission Strategy + +The MCP server operates as an **anonymous user**, which means every query is filtered through `visible_to_user()` using Django's `AnonymousUser`: + +```python +# Permission checks follow anonymous user rules from permissioning guide: +# - Corpus: is_public=True +# - Document: is_public=True AND (no corpus OR corpus.is_public=True) +# - Annotation: document.is_public=True AND corpus.is_public=True +# - Thread: is_public=True +# - ChatMessage: thread.is_public=True + +from django.contrib.auth.models import AnonymousUser + +# All queries use visible_to_user() with AnonymousUser +anonymous = AnonymousUser() +public_corpuses = Corpus.objects.visible_to_user(anonymous) +``` + +### Resource Naming Convention + +MCP resources follow a hierarchical URI pattern: + +``` +corpus://{corpus_slug} +document://{corpus_slug}/{document_slug} +annotation://{corpus_slug}/{document_slug}/{annotation_id} +thread://{corpus_slug}/threads/{thread_id} +``` + +## MCP Resources + +Resources provide **static content** for context windows. They represent specific entities. + +### 1. 
Corpus Resource + +**URI**: `corpus://{corpus_slug}` + +**Content**: Full corpus metadata and summary statistics + +```json +{ + "slug": "legal-contracts-2024", + "title": "Legal Contracts Database 2024", + "description": "Curated collection of legal contracts...", + "document_count": 1247, + "annotation_count": 15632, + "thread_count": 89, + "created": "2024-01-15T10:30:00Z", + "modified": "2024-12-20T14:22:00Z", + "label_set": { + "title": "Legal Annotation Labels", + "labels": [ + {"text": "indemnification", "color": "#FF5733", "label_type": "TOKEN_LABEL"}, + {"text": "termination", "color": "#33FF57", "label_type": "SPAN_LABEL"} + ] + } +} +``` + +**Implementation**: +```python +def get_corpus_resource(corpus_slug: str) -> str: + anonymous = AnonymousUser() + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + return json.dumps({ + "slug": corpus.slug, + "title": corpus.title, + "description": corpus.description, + "document_count": corpus.document_count(), + "created": corpus.created.isoformat(), + "modified": corpus.modified.isoformat(), + # ... statistics and metadata + }) +``` + +### 2. 
Document Resource + +**URI**: `document://{corpus_slug}/{document_slug}` + +**Content**: Document metadata, extracted text, and structural information + +```json +{ + "slug": "employment-agreement-acme-2024", + "title": "Employment Agreement - Acme Corp 2024", + "description": "Standard employment contract template", + "file_type": "application/pdf", + "page_count": 12, + "text_preview": "This Employment Agreement is entered into...", + "full_text": "[Full extracted text content]", + "created": "2024-03-10T09:15:00Z", + "corpus": "legal-contracts-2024" +} +``` + +**Implementation**: +```python +def get_document_resource(corpus_slug: str, document_slug: str) -> str: + anonymous = AnonymousUser() + + # Get corpus context + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + # Get document within corpus (both must be public) + document = (Document.objects + .visible_to_user(anonymous) + .filter(corpuses=corpus, slug=document_slug) + .first()) + + if not document: + raise NotFoundError() + + # Read extracted text + full_text = "" + if document.txt_extract_file: + with document.txt_extract_file.open('r') as f: + full_text = f.read() + + return json.dumps({ + "slug": document.slug, + "title": document.title, + "description": document.description, + "page_count": document.page_count, + "full_text": full_text, + # ... + }) +``` + +### 3. 
Annotation Resource + +**URI**: `annotation://{corpus_slug}/{document_slug}/{annotation_id}` + +**Content**: Specific annotation with location and metadata + +```json +{ + "id": "12345", + "page": 3, + "raw_text": "indemnification clause", + "annotation_label": { + "text": "indemnification", + "color": "#FF5733", + "label_type": "SPAN_LABEL" + }, + "bounding_box": { + "top": 120, + "left": 50, + "right": 450, + "bottom": 145 + }, + "structural": false, + "created": "2024-03-12T11:20:00Z" +} +``` + +**Implementation**: +```python +def get_annotation_resource(corpus_slug: str, document_slug: str, annotation_id: int) -> str: + from opencontractserver.annotations.query_optimizer import AnnotationQueryOptimizer + anonymous = AnonymousUser() + + # Get corpus and document + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + document = Document.objects.visible_to_user(anonymous).get( + corpuses=corpus, slug=document_slug + ) + + # Use query optimizer for efficient permission checking + annotations = AnnotationQueryOptimizer.get_document_annotations( + document_id=document.id, + user=anonymous, + corpus_id=corpus.id + ) + + annotation = annotations.get(id=annotation_id) + + return json.dumps({ + "id": str(annotation.id), + "page": annotation.page, + "raw_text": annotation.raw_text, + # ... + }) +``` + +### 4. 
Thread Resource + +**URI**: `thread://{corpus_slug}/threads/{thread_id}` + +**Content**: Discussion thread with messages + +```json +{ + "id": "9876", + "title": "Question about indemnification clause interpretation", + "description": "Discussion about standard indemnification language", + "message_count": 12, + "is_locked": false, + "is_pinned": true, + "created_at": "2024-11-15T14:30:00Z", + "messages": [ + { + "id": "msg-1", + "content": "Can someone explain the scope of this indemnification clause?", + "msg_type": "HUMAN", + "created_at": "2024-11-15T14:30:00Z", + "upvote_count": 5, + "downvote_count": 0, + "replies": [ + { + "id": "msg-2", + "content": "This clause provides protection for...", + "msg_type": "HUMAN", + "created_at": "2024-11-15T15:10:00Z", + "upvote_count": 8, + "downvote_count": 0 + } + ] + } + ] +} +``` + +**Implementation**: +```python +def get_thread_resource(corpus_slug: str, thread_id: int, include_messages: bool = True) -> str: + anonymous = AnonymousUser() + + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + # Get public thread in this corpus + thread = (Conversation.objects + .visible_to_user(anonymous) + .filter( + conversation_type=ConversationTypeChoices.THREAD, + chat_with_corpus=corpus, + id=thread_id + ) + .first()) + + if not thread: + raise NotFoundError() + + data = { + "id": str(thread.id), + "title": thread.title, + "description": thread.description, + "is_locked": thread.is_locked, + "is_pinned": thread.is_pinned, + "created_at": thread.created_at.isoformat(), + } + + if include_messages: + messages = build_threaded_messages(thread, anonymous) + data["messages"] = messages + + return json.dumps(data) + +def build_threaded_messages(thread: Conversation, user) -> list: + """Build hierarchical message tree""" + messages = (ChatMessage.objects + .visible_to_user(user) + .filter(conversation=thread, parent_message__isnull=True) + .order_by('created_at')) + + return [format_message_with_replies(msg, 
user) for msg in messages] +``` + +## MCP Tools + +Tools provide **dynamic operations** - they execute queries and return results. + +### 1. list_public_corpuses + +**Purpose**: Discover available public corpuses + +**Parameters**: +- `limit` (optional, default=20): Number of results +- `offset` (optional, default=0): Pagination offset +- `search` (optional): Filter by title/description + +**Returns**: List of corpus summaries + +```json +{ + "total_count": 47, + "corpuses": [ + { + "slug": "legal-contracts-2024", + "title": "Legal Contracts Database 2024", + "description": "Curated collection...", + "document_count": 1247, + "created": "2024-01-15T10:30:00Z" + } + ] +} +``` + +**Implementation**: +```python +async def list_public_corpuses(limit: int = 20, offset: int = 0, search: str = "") -> dict: + anonymous = AnonymousUser() + + qs = Corpus.objects.visible_to_user(anonymous) + + if search: + from django.db.models import Q + qs = qs.filter( + Q(title__icontains=search) | Q(description__icontains=search) + ) + + total_count = qs.count() + corpuses = qs[offset:offset+limit] + + return { + "total_count": total_count, + "corpuses": [format_corpus_summary(c) for c in corpuses] + } +``` + +### 2. 
list_documents + +**Purpose**: List documents in a corpus + +**Parameters**: +- `corpus_slug` (required): Corpus identifier +- `limit` (optional, default=50): Number of results +- `offset` (optional, default=0): Pagination offset +- `search` (optional): Filter by title/description + +**Returns**: List of document summaries + +```json +{ + "total_count": 1247, + "documents": [ + { + "slug": "employment-agreement-acme-2024", + "title": "Employment Agreement - Acme Corp 2024", + "page_count": 12, + "created": "2024-03-10T09:15:00Z" + } + ] +} +``` + +**Implementation**: +```python +async def list_documents( + corpus_slug: str, + limit: int = 50, + offset: int = 0, + search: str = "" +) -> dict: + anonymous = AnonymousUser() + + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + # Get public documents in this corpus + qs = (Document.objects + .visible_to_user(anonymous) + .filter(corpuses=corpus)) + + if search: + from django.db.models import Q + qs = qs.filter( + Q(title__icontains=search) | Q(description__icontains=search) + ) + + total_count = qs.count() + documents = qs[offset:offset+limit] + + return { + "total_count": total_count, + "documents": [format_document_summary(d) for d in documents] + } +``` + +### 3. get_document_text + +**Purpose**: Retrieve full extracted text from a document + +**Parameters**: +- `corpus_slug` (required): Corpus identifier +- `document_slug` (required): Document identifier + +**Returns**: Plain text content + +```json +{ + "document_slug": "employment-agreement-acme-2024", + "page_count": 12, + "text": "This Employment Agreement is entered into as of January 1, 2024..." 
+} +``` + +**Implementation**: +```python +async def get_document_text(corpus_slug: str, document_slug: str) -> dict: + anonymous = AnonymousUser() + + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + document = (Document.objects + .visible_to_user(anonymous) + .get(corpuses=corpus, slug=document_slug)) + + full_text = "" + if document.txt_extract_file: + with document.txt_extract_file.open('r') as f: + full_text = f.read() + + return { + "document_slug": document.slug, + "page_count": document.page_count, + "text": full_text + } +``` + +### 4. list_annotations + +**Purpose**: List annotations on a document + +**Parameters**: +- `corpus_slug` (required): Corpus identifier +- `document_slug` (required): Document identifier +- `page` (optional): Filter to specific page +- `label_text` (optional): Filter by label text +- `limit` (optional, default=100): Number of results +- `offset` (optional, default=0): Pagination offset + +**Returns**: List of annotations + +```json +{ + "total_count": 156, + "annotations": [ + { + "id": "12345", + "page": 3, + "raw_text": "indemnification clause", + "annotation_label": { + "text": "indemnification", + "color": "#FF5733" + }, + "structural": false + } + ] +} +``` + +**Implementation**: +```python +async def list_annotations( + corpus_slug: str, + document_slug: str, + page: int | None = None, + label_text: str | None = None, + limit: int = 100, + offset: int = 0 +) -> dict: + from opencontractserver.annotations.query_optimizer import AnnotationQueryOptimizer + anonymous = AnonymousUser() + + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + document = Document.objects.visible_to_user(anonymous).get( + corpuses=corpus, slug=document_slug + ) + + # Use query optimizer - eliminates N+1 permission queries + qs = AnnotationQueryOptimizer.get_document_annotations( + document_id=document.id, + user=anonymous, + corpus_id=corpus.id + ) + + # Apply filters + if page is not None: + qs = 
qs.filter(page=page) + + if label_text: + qs = qs.filter(annotation_label__text=label_text) + + total_count = qs.count() + annotations = qs.select_related('annotation_label')[offset:offset+limit] + + return { + "total_count": total_count, + "annotations": [format_annotation(a) for a in annotations] + } +``` + +### 5. search_corpus + +**Purpose**: Semantic search within a corpus using vector embeddings + +**Parameters**: +- `corpus_slug` (required): Corpus identifier +- `query` (required): Search query text +- `limit` (optional, default=10): Number of results + +**Returns**: Ranked list of relevant documents and annotations + +```json +{ + "query": "indemnification provisions", + "results": [ + { + "type": "document", + "slug": "employment-agreement-acme-2024", + "title": "Employment Agreement - Acme Corp 2024", + "similarity_score": 0.89, + "snippet": "...indemnification provisions in Section 7..." + }, + { + "type": "annotation", + "document_slug": "service-agreement-beta-2024", + "id": "45678", + "raw_text": "indemnification by service provider", + "similarity_score": 0.85, + "page": 5 + } + ] +} +``` + +**Implementation**: +```python +async def search_corpus( + corpus_slug: str, + query: str, + limit: int = 10 +) -> dict: + from opencontractserver.utils.embeddings import generate_embeddings_from_text + anonymous = AnonymousUser() + + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + # Generate query embedding using corpus's preferred embedder + embedder_path, query_vector = corpus.embed_text(query) + + if not query_vector: + # Fallback to text search if embeddings unavailable + return await text_search_fallback(corpus, query, limit) + + # Search documents + doc_results = (Document.objects + .visible_to_user(anonymous) + .filter(corpuses=corpus) + .search_by_embedding(query_vector, embedder_path, top_k=limit)) + + # Search annotations (requires custom implementation using Annotation embeddings) + # ann_results = 
search_annotations_by_embedding(corpus, document, query_vector, embedder_path, limit) + + # Combine and rank results + results = [] + for doc in doc_results: + results.append({ + "type": "document", + "slug": doc.slug, + "title": doc.title, + "similarity_score": float(doc.similarity_score), + }) + + return { + "query": query, + "results": results[:limit] + } +``` + +### 6. list_threads + +**Purpose**: List discussion threads in a corpus or document + +**Parameters**: +- `corpus_slug` (required): Corpus identifier +- `document_slug` (optional): Filter to document-specific threads +- `limit` (optional, default=20): Number of results +- `offset` (optional, default=0): Pagination offset + +**Returns**: List of thread summaries + +```json +{ + "total_count": 89, + "threads": [ + { + "id": "9876", + "title": "Question about indemnification clause", + "message_count": 12, + "is_pinned": true, + "is_locked": false, + "created_at": "2024-11-15T14:30:00Z", + "last_activity": "2024-12-15T09:20:00Z" + } + ] +} +``` + +**Implementation**: +```python +async def list_threads( + corpus_slug: str, + document_slug: str | None = None, + limit: int = 20, + offset: int = 0 +) -> dict: + anonymous = AnonymousUser() + + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + qs = (Conversation.objects + .visible_to_user(anonymous) + .filter( + conversation_type=ConversationTypeChoices.THREAD, + chat_with_corpus=corpus + )) + + if document_slug: + document = Document.objects.visible_to_user(anonymous).get( + corpuses=corpus, slug=document_slug + ) + qs = qs.filter(chat_with_document=document) + + # Order by pinned first, then recent activity + qs = qs.order_by('-is_pinned', '-updated_at') + + total_count = qs.count() + threads = qs[offset:offset+limit] + + return { + "total_count": total_count, + "threads": [format_thread_summary(t) for t in threads] + } +``` + +### 7. 
get_thread_messages + +**Purpose**: Retrieve all messages in a thread with hierarchical structure + +**Parameters**: +- `corpus_slug` (required): Corpus identifier +- `thread_id` (required): Thread identifier +- `flatten` (optional, default=False): Return flat list instead of tree + +**Returns**: Thread messages in hierarchical or flat format + +```json +{ + "thread_id": "9876", + "title": "Question about indemnification clause", + "messages": [ + { + "id": "msg-1", + "content": "Can someone explain...", + "msg_type": "HUMAN", + "created_at": "2024-11-15T14:30:00Z", + "upvote_count": 5, + "replies": [ + { + "id": "msg-2", + "content": "This clause provides...", + "created_at": "2024-11-15T15:10:00Z", + "upvote_count": 8 + } + ] + } + ] +} +``` + +## Implementation Structure + +### Directory Layout + +``` +opencontractserver/ + mcp/ + __init__.py + server.py # MCP server entry point + resources.py # Resource handlers + tools.py # Tool implementations + permissions.py # Permission utilities + formatters.py # Response formatting + config.py # Configuration +``` + +### MCP Server Entry Point + +```python +# opencontractserver/mcp/server.py +import asyncio +from mcp import Server, Resource, Tool +from mcp.types import TextContent, EmbeddedResource + +from .resources import ( + get_corpus_resource, + get_document_resource, + get_annotation_resource, + get_thread_resource +) + +from .tools import ( + list_public_corpuses, + list_documents, + get_document_text, + list_annotations, + search_corpus, + list_threads, + get_thread_messages +) + +# Initialize MCP server +mcp_server = Server("opencontracts") + +# Register resources +@mcp_server.list_resources() +async def list_resources() -> list[Resource]: + """List available resource patterns""" + return [ + Resource( + uri="corpus://{corpus_slug}", + name="Public Corpus", + description="Access public corpus metadata and contents", + mimeType="application/json" + ), + Resource( + uri="document://{corpus_slug}/{document_slug}", 
+ name="Public Document", + description="Access public document with extracted text", + mimeType="application/json" + ), + Resource( + uri="annotation://{corpus_slug}/{document_slug}/{annotation_id}", + name="Document Annotation", + description="Access specific annotation on a document", + mimeType="application/json" + ), + Resource( + uri="thread://{corpus_slug}/threads/{thread_id}", + name="Discussion Thread", + description="Access public discussion thread with messages", + mimeType="application/json" + ) + ] + +@mcp_server.read_resource() +async def read_resource(uri: str) -> str: + """Resolve resource URI and return content""" + if uri.startswith("corpus://"): + corpus_slug = uri.replace("corpus://", "") + return await get_corpus_resource(corpus_slug) + + elif uri.startswith("document://"): + # Parse: document://{corpus_slug}/{document_slug} + parts = uri.replace("document://", "").split("/") + return await get_document_resource(parts[0], parts[1]) + + elif uri.startswith("annotation://"): + # Parse: annotation://{corpus_slug}/{document_slug}/{annotation_id} + parts = uri.replace("annotation://", "").split("/") + return await get_annotation_resource(parts[0], parts[1], int(parts[2])) + + elif uri.startswith("thread://"): + # Parse: thread://{corpus_slug}/threads/{thread_id} + parts = uri.replace("thread://", "").split("/") + corpus_slug = parts[0] + thread_id = int(parts[2]) # Skip "threads" part + return await get_thread_resource(corpus_slug, thread_id) + + raise ValueError(f"Unknown resource URI: {uri}") + +# Register tools +@mcp_server.list_tools() +async def list_tools() -> list[Tool]: + """List available tools""" + return [ + Tool( + name="list_public_corpuses", + description="List all publicly accessible corpuses", + inputSchema={ + "type": "object", + "properties": { + "limit": {"type": "integer", "default": 20}, + "offset": {"type": "integer", "default": 0}, + "search": {"type": "string", "default": ""} + } + } + ), + Tool( + name="list_documents", + 
description="List documents in a corpus", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": {"type": "string", "description": "Corpus identifier"}, + "limit": {"type": "integer", "default": 50}, + "offset": {"type": "integer", "default": 0}, + "search": {"type": "string", "default": ""} + }, + "required": ["corpus_slug"] + } + ), + # ... (register all other tools) + ] + +@mcp_server.call_tool() +async def call_tool(name: str, arguments: dict) -> list[TextContent]: + """Execute tool and return results""" + if name == "list_public_corpuses": + result = await list_public_corpuses(**arguments) + elif name == "list_documents": + result = await list_documents(**arguments) + elif name == "get_document_text": + result = await get_document_text(**arguments) + elif name == "list_annotations": + result = await list_annotations(**arguments) + elif name == "search_corpus": + result = await search_corpus(**arguments) + elif name == "list_threads": + result = await list_threads(**arguments) + elif name == "get_thread_messages": + result = await get_thread_messages(**arguments) + else: + raise ValueError(f"Unknown tool: {name}") + + return [TextContent(type="text", text=json.dumps(result, indent=2))] + +# Entry point +async def main(): + """Run MCP server""" + from mcp.server.stdio import stdio_server + + async with stdio_server() as streams: + await mcp_server.run( + streams[0], # read_stream + streams[1], # write_stream + mcp_server.create_initialization_options() + ) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Performance Optimizations + +### 1. 
Query Optimizer Usage + +```python +# ALWAYS use query optimizers for annotations +from opencontractserver.annotations.query_optimizer import AnnotationQueryOptimizer + +# Good: Eliminates N+1 queries +annotations = AnnotationQueryOptimizer.get_document_annotations( + document_id=doc.id, + user=anonymous, + corpus_id=corpus.id +) + +# Bad: N+1 permission queries +annotations = Annotation.objects.filter(document=doc) # Don't do this! +``` + +### 2. Select Related / Prefetch + +```python +# Eager load related objects to avoid additional queries +documents = (Document.objects + .visible_to_user(anonymous) + .select_related('creator') + .prefetch_related('doc_annotations__annotation_label')) +``` + +### 3. Pagination + +```python +# Always use limit/offset for large result sets +def list_with_pagination(queryset, limit, offset): + total_count = queryset.count() + results = queryset[offset:offset+limit] + + return { + "total_count": total_count, + "limit": limit, + "offset": offset, + "has_more": offset + limit < total_count, + "results": [format_item(r) for r in results] + } +``` + +### 4. Caching Strategy + +```python +from django.core.cache import cache +from django.utils.encoding import force_str + +def cached_corpus_summary(corpus_slug: str) -> dict: + """Cache corpus summaries for 5 minutes""" + cache_key = f"mcp:corpus_summary:{corpus_slug}" + + cached = cache.get(cache_key) + if cached: + return cached + + result = generate_corpus_summary(corpus_slug) + cache.set(cache_key, result, 300) # 5 minutes + + return result +``` + +## Security Considerations + +### 1. Public-Only Filter + +```python +# ALWAYS apply anonymous user filter +from django.contrib.auth.models import AnonymousUser + +anonymous = AnonymousUser() + +# This automatically filters to is_public=True resources +public_resources = Model.objects.visible_to_user(anonymous) +``` + +### 2. 
Input Validation + +```python +import re + +def validate_slug(slug: str) -> bool: + """Validate slug format matches OpenContracts pattern""" + # From CLAUDE.md: Case-sensitive, A-Z, a-z, 0-9, hyphen (-) + return bool(re.match(r'^[A-Za-z0-9\-]+$', slug)) + +def sanitize_inputs(corpus_slug: str, document_slug: str | None = None): + """Validate and sanitize all slug inputs""" + if not validate_slug(corpus_slug): + raise ValueError(f"Invalid corpus slug: {corpus_slug}") + + if document_slug and not validate_slug(document_slug): + raise ValueError(f"Invalid document slug: {document_slug}") +``` + +### 3. Rate Limiting + +```python +from django.core.cache import cache +from datetime import datetime, timedelta + +class RateLimiter: + """Simple rate limiter for MCP requests""" + + def __init__(self, max_requests: int = 100, window_seconds: int = 60): + self.max_requests = max_requests + self.window_seconds = window_seconds + + def check_rate_limit(self, client_id: str) -> bool: + """Returns True if request is allowed, False if rate limited""" + key = f"mcp:ratelimit:{client_id}" + + current = cache.get(key, 0) + if current >= self.max_requests: + return False + + cache.set(key, current + 1, self.window_seconds) + return True +``` + +## Configuration + +### Environment Variables + +```bash +# .env +MCP_SERVER_ENABLED=true +MCP_MAX_RESULTS_PER_PAGE=100 +MCP_RATE_LIMIT_REQUESTS=100 +MCP_RATE_LIMIT_WINDOW=60 +MCP_CACHE_TTL=300 +``` + +### Django Settings + +```python +# config/settings/base.py + +# MCP Server Configuration +MCP_SERVER = { + 'enabled': env.bool('MCP_SERVER_ENABLED', default=False), + 'max_results_per_page': env.int('MCP_MAX_RESULTS_PER_PAGE', default=100), + 'rate_limit': { + 'requests': env.int('MCP_RATE_LIMIT_REQUESTS', default=100), + 'window': env.int('MCP_RATE_LIMIT_WINDOW', default=60), + }, + 'cache_ttl': env.int('MCP_CACHE_TTL', default=300), +} +``` + +## Testing Strategy + +### Unit Tests + +```python +# opencontractserver/mcp/tests/test_resources.py 
+from django.test import TestCase +from django.contrib.auth.models import AnonymousUser +from opencontractserver.corpuses.models import Corpus +from opencontractserver.mcp.resources import get_corpus_resource + +class CorpusResourceTest(TestCase): + def setUp(self): + self.public_corpus = Corpus.objects.create( + title="Public Corpus", + description="Test corpus", + slug="public-corpus", + is_public=True, + creator=self.create_user("owner") + ) + + self.private_corpus = Corpus.objects.create( + title="Private Corpus", + description="Private test corpus", + slug="private-corpus", + is_public=False, + creator=self.create_user("owner") + ) + + def test_get_public_corpus_resource(self): + """Anonymous users can access public corpus resources""" + result = get_corpus_resource("public-corpus") + data = json.loads(result) + + self.assertEqual(data["slug"], "public-corpus") + self.assertEqual(data["title"], "Public Corpus") + + def test_get_private_corpus_resource_denied(self): + """Anonymous users cannot access private corpus resources""" + with self.assertRaises(Corpus.DoesNotExist): + get_corpus_resource("private-corpus") +``` + +### Integration Tests + +```python +# opencontractserver/mcp/tests/test_integration.py +import pytest +from mcp.client import ClientSession + +@pytest.mark.asyncio +async def test_full_corpus_exploration(): + """Test complete workflow: discover corpus → list documents → get annotations""" + async with ClientSession("opencontracts-mcp") as session: + # 1. List public corpuses + corpuses_result = await session.call_tool("list_public_corpuses", {}) + corpuses = json.loads(corpuses_result[0].text) + + assert len(corpuses["corpuses"]) > 0 + corpus_slug = corpuses["corpuses"][0]["slug"] + + # 2. 
List documents in corpus + docs_result = await session.call_tool("list_documents", { + "corpus_slug": corpus_slug, + "limit": 10 + }) + docs = json.loads(docs_result[0].text) + + assert len(docs["documents"]) > 0 + document_slug = docs["documents"][0]["slug"] + + # 3. Get document text + text_result = await session.call_tool("get_document_text", { + "corpus_slug": corpus_slug, + "document_slug": document_slug + }) + text_data = json.loads(text_result[0].text) + + assert len(text_data["text"]) > 0 + + # 4. List annotations + ann_result = await session.call_tool("list_annotations", { + "corpus_slug": corpus_slug, + "document_slug": document_slug, + "limit": 50 + }) + annotations = json.loads(ann_result[0].text) + + assert "annotations" in annotations +``` + +## Deployment + +### Standalone MCP Server + +```bash +# Run as standalone process +python -m opencontractserver.mcp.server + +# Or via Docker +docker run -p 3000:3000 opencontracts-mcp +``` + +### Integration with Claude Desktop + +```json +{ + "mcpServers": { + "opencontracts": { + "command": "python", + "args": ["-m", "opencontractserver.mcp.server"], + "env": { + "DJANGO_SETTINGS_MODULE": "config.settings.production" + } + } + } +} +``` + +## Future Enhancements + +### Phase 2: Advanced Search + +- **Full-text search** with highlighting +- **Faceted search** by label type, date range, creator +- **Cross-corpus search** (search across multiple public corpuses) + +### Phase 3: Relationship Exploration + +- **Annotation relationships** - explore connected annotations +- **Document relationships** - find related documents +- **Citation graphs** - visualize document citation networks + +### Phase 4: Analytics + +- **Usage statistics** per corpus +- **Popular annotations** (most referenced/discussed) +- **Trending threads** in discussions + +## Summary + +This MCP interface proposal provides: + +✅ **Read-only access** to public OpenContracts resources +✅ **One-corpus-at-a-time** scoping for focused exploration +✅ 
**Performance optimized** using existing query optimizers +✅ **Elegant API** with intuitive resource URIs and tools +✅ **Security first** with anonymous user model and permission checks +✅ **Comprehensive coverage** of corpuses, documents, annotations, and threads + +The implementation follows OpenContracts' established patterns and leverages the existing permissioning infrastructure for a robust, maintainable solution. From 19171dd791c42fd6614142b984f61d8069a4402d Mon Sep 17 00:00:00 2001 From: JSv4 Date: Sun, 28 Dec 2025 17:11:40 -0500 Subject: [PATCH 2/6] Improve MCP interface proposal with complete implementations - Convert async tool functions to sync with sync_to_async wrapper pattern - Add URIParser class with regex-based URI validation for safety - Add complete helper function implementations (format_corpus_summary, format_document_summary, format_annotation, format_thread_summary, format_message, format_message_with_replies, text_search_fallback) - Fix NotFoundError references to use proper Django exceptions - Add prefetch_related for threaded messages to avoid N+1 queries - Add proper imports to all resource and tool functions - Update CHANGELOG with MCP interface proposal entry Addresses review feedback on PR #726 for issue #387 --- CHANGELOG.md | 14 +- docs/mcp/mcp_interface_proposal.md | 447 ++++++++++++++++++++++++----- 2 files changed, 382 insertions(+), 79 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d2c5c03c..e199dcfe7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,19 @@ All notable changes to OpenContracts will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [Unreleased] - 2025-12-26 +## [Unreleased] - 2025-12-28 + +### Added + +#### MCP (Model Context Protocol) Interface Proposal (Issue #387) +- **Comprehensive MCP interface design** (`docs/mcp/mcp_interface_proposal.md`): Read-only access to public OpenContracts resources for AI assistants +- **4 resource types**: corpus, document, annotation, thread - with hierarchical URI patterns +- **7 tools for discovery and retrieval**: `list_public_corpuses`, `list_documents`, `get_document_text`, `list_annotations`, `search_corpus`, `list_threads`, `get_thread_messages` +- **Anonymous user permission model**: Operates as AnonymousUser with automatic filtering to `is_public=True` resources +- **Synchronous Django ORM implementation**: Uses `sync_to_async` wrapper pattern for MCP server integration +- **Performance optimizations**: Uses existing `AnnotationQueryOptimizer`, `prefetch_related` for threaded messages, and proper pagination +- **Robust URI parsing**: Regex-based URI parsing with slug validation to prevent injection attacks +- **Helper function implementations**: Complete `format_*` functions for corpus, document, annotation, thread, and message formatting ### Fixed diff --git a/docs/mcp/mcp_interface_proposal.md b/docs/mcp/mcp_interface_proposal.md index 3bcd7598e..b63fdcc91 100644 --- a/docs/mcp/mcp_interface_proposal.md +++ b/docs/mcp/mcp_interface_proposal.md @@ -127,7 +127,8 @@ def get_document_resource(corpus_slug: str, document_slug: str) -> str: .first()) if not document: - raise NotFoundError() + from opencontractserver.documents.models import Document + raise Document.DoesNotExist(f"Document '{document_slug}' not found in corpus '{corpus_slug}'") # Read extracted text full_text = "" @@ -242,8 +243,17 @@ def get_annotation_resource(corpus_slug: str, document_slug: str, annotation_id: **Implementation**: ```python def get_thread_resource(corpus_slug: str, thread_id: int, include_messages: bool = True) -> str: - anonymous = AnonymousUser() + """Get a 
discussion thread resource.""" + import json + from django.contrib.auth.models import AnonymousUser + from opencontractserver.conversations.models import ( + ChatMessage, + Conversation, + ConversationTypeChoices, + ) + from opencontractserver.corpuses.models import Corpus + anonymous = AnonymousUser() corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) # Get public thread in this corpus @@ -257,15 +267,15 @@ def get_thread_resource(corpus_slug: str, thread_id: int, include_messages: bool .first()) if not thread: - raise NotFoundError() + raise Conversation.DoesNotExist(f"Thread '{thread_id}' not found in corpus '{corpus_slug}'") data = { "id": str(thread.id), - "title": thread.title, - "description": thread.description, + "title": thread.title or "", + "description": thread.description or "", "is_locked": thread.is_locked, "is_pinned": thread.is_pinned, - "created_at": thread.created_at.isoformat(), + "created_at": thread.created.isoformat() if thread.created else None, } if include_messages: @@ -274,12 +284,20 @@ def get_thread_resource(corpus_slug: str, thread_id: int, include_messages: bool return json.dumps(data) -def build_threaded_messages(thread: Conversation, user) -> list: - """Build hierarchical message tree""" - messages = (ChatMessage.objects - .visible_to_user(user) - .filter(conversation=thread, parent_message__isnull=True) - .order_by('created_at')) + +def build_threaded_messages(thread, user) -> list: + """ + Build hierarchical message tree. + + Uses prefetch_related to avoid N+1 queries when accessing nested replies. 
+ """ + from opencontractserver.conversations.models import ChatMessage + + messages = list(ChatMessage.objects + .visible_to_user(user) + .filter(conversation=thread, parent_message__isnull=True) + .prefetch_related('replies__replies') # Prefetch 2 levels + .order_by('created_at')) return [format_message_with_replies(msg, user) for msg in messages] ``` @@ -316,24 +334,44 @@ Tools provide **dynamic operations** - they execute queries and return results. **Implementation**: ```python -async def list_public_corpuses(limit: int = 20, offset: int = 0, search: str = "") -> dict: - anonymous = AnonymousUser() +def list_public_corpuses(limit: int = 20, offset: int = 0, search: str = "") -> dict: + """ + List public corpuses visible to anonymous users. + + Note: This is a synchronous implementation. Django ORM operations are blocking, + so we keep this synchronous for simplicity. For async, wrap ORM calls with + sync_to_async from asgiref.sync. + """ + from django.contrib.auth.models import AnonymousUser + from django.db.models import Q + from opencontractserver.corpuses.models import Corpus + anonymous = AnonymousUser() qs = Corpus.objects.visible_to_user(anonymous) if search: - from django.db.models import Q qs = qs.filter( Q(title__icontains=search) | Q(description__icontains=search) ) total_count = qs.count() - corpuses = qs[offset:offset+limit] + corpuses = list(qs[offset:offset+limit]) return { "total_count": total_count, "corpuses": [format_corpus_summary(c) for c in corpuses] } + + +def format_corpus_summary(corpus) -> dict: + """Format a corpus for list display.""" + return { + "slug": corpus.slug, + "title": corpus.title, + "description": corpus.description or "", + "document_count": corpus.document_count(), + "created": corpus.created.isoformat(), + } ``` ### 2. 
list_documents @@ -364,14 +402,21 @@ async def list_public_corpuses(limit: int = 20, offset: int = 0, search: str = " **Implementation**: ```python -async def list_documents( +def list_documents( corpus_slug: str, limit: int = 50, offset: int = 0, search: str = "" ) -> dict: + """List documents in a public corpus.""" + from django.contrib.auth.models import AnonymousUser + from django.db.models import Q + from opencontractserver.corpuses.models import Corpus + from opencontractserver.documents.models import Document + anonymous = AnonymousUser() + # Get corpus (raises Corpus.DoesNotExist if not found or not public) corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) # Get public documents in this corpus @@ -380,18 +425,29 @@ async def list_documents( .filter(corpuses=corpus)) if search: - from django.db.models import Q qs = qs.filter( Q(title__icontains=search) | Q(description__icontains=search) ) total_count = qs.count() - documents = qs[offset:offset+limit] + documents = list(qs[offset:offset+limit]) return { "total_count": total_count, "documents": [format_document_summary(d) for d in documents] } + + +def format_document_summary(document) -> dict: + """Format a document for list display.""" + return { + "slug": document.slug, + "title": document.title, + "description": document.description or "", + "page_count": document.page_count, + "file_type": document.file_type or "unknown", + "created": document.created.isoformat(), + } ``` ### 3. 
get_document_text @@ -414,10 +470,18 @@ async def list_documents( **Implementation**: ```python -async def get_document_text(corpus_slug: str, document_slug: str) -> dict: +def get_document_text(corpus_slug: str, document_slug: str) -> dict: + """Retrieve full extracted text from a document.""" + from django.contrib.auth.models import AnonymousUser + from opencontractserver.corpuses.models import Corpus + from opencontractserver.documents.models import Document + anonymous = AnonymousUser() + # Raises Corpus.DoesNotExist if not found/not public corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + # Raises Document.DoesNotExist if not found/not public document = (Document.objects .visible_to_user(anonymous) .get(corpuses=corpus, slug=document_slug)) @@ -468,7 +532,7 @@ async def get_document_text(corpus_slug: str, document_slug: str) -> dict: **Implementation**: ```python -async def list_annotations( +def list_annotations( corpus_slug: str, document_slug: str, page: int | None = None, @@ -476,7 +540,12 @@ async def list_annotations( limit: int = 100, offset: int = 0 ) -> dict: + """List annotations on a document with optional filtering.""" + from django.contrib.auth.models import AnonymousUser from opencontractserver.annotations.query_optimizer import AnnotationQueryOptimizer + from opencontractserver.corpuses.models import Corpus + from opencontractserver.documents.models import Document + anonymous = AnonymousUser() corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) @@ -499,12 +568,32 @@ async def list_annotations( qs = qs.filter(annotation_label__text=label_text) total_count = qs.count() - annotations = qs.select_related('annotation_label')[offset:offset+limit] + annotations = list(qs.select_related('annotation_label')[offset:offset+limit]) return { "total_count": total_count, "annotations": [format_annotation(a) for a in annotations] } + + +def format_annotation(annotation) -> dict: + """Format an annotation for API 
response.""" + label_data = None + if annotation.annotation_label: + label_data = { + "text": annotation.annotation_label.text, + "color": annotation.annotation_label.color or "#000000", + "label_type": annotation.annotation_label.label_type, + } + + return { + "id": str(annotation.id), + "page": annotation.page, + "raw_text": annotation.raw_text or "", + "annotation_label": label_data, + "structural": annotation.structural, + "created": annotation.created.isoformat() if annotation.created else None, + } ``` ### 5. search_corpus @@ -543,31 +632,34 @@ async def list_annotations( **Implementation**: ```python -async def search_corpus( +def search_corpus( corpus_slug: str, query: str, limit: int = 10 ) -> dict: - from opencontractserver.utils.embeddings import generate_embeddings_from_text - anonymous = AnonymousUser() + """Semantic search within a corpus using vector embeddings.""" + from django.contrib.auth.models import AnonymousUser + from django.db.models import Q + from opencontractserver.corpuses.models import Corpus + from opencontractserver.documents.models import Document + anonymous = AnonymousUser() corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) # Generate query embedding using corpus's preferred embedder + # embed_text() returns (embedder_path, query_vector) tuple embedder_path, query_vector = corpus.embed_text(query) if not query_vector: # Fallback to text search if embeddings unavailable - return await text_search_fallback(corpus, query, limit) + return text_search_fallback(corpus, query, limit, anonymous) - # Search documents - doc_results = (Document.objects - .visible_to_user(anonymous) - .filter(corpuses=corpus) - .search_by_embedding(query_vector, embedder_path, top_k=limit)) - - # Search annotations (requires custom implementation using Annotation embeddings) - # ann_results = search_annotations_by_embedding(corpus, document, query_vector, embedder_path, limit) + # Search documents using vector similarity + # 
search_by_embedding adds 'similarity_score' annotation + doc_results = list(Document.objects + .visible_to_user(anonymous) + .filter(corpuses=corpus) + .search_by_embedding(query_vector, embedder_path, top_k=limit)) # Combine and rank results results = [] @@ -583,6 +675,33 @@ async def search_corpus( "query": query, "results": results[:limit] } + + +def text_search_fallback(corpus, query: str, limit: int, user) -> dict: + """Fallback to text search when embeddings are unavailable.""" + from django.db.models import Q + from opencontractserver.documents.models import Document + + # Simple text search on title and description + documents = list(Document.objects + .visible_to_user(user) + .filter(corpuses=corpus) + .filter(Q(title__icontains=query) | Q(description__icontains=query)) + [:limit]) + + results = [] + for doc in documents: + results.append({ + "type": "document", + "slug": doc.slug, + "title": doc.title, + "similarity_score": None, # No similarity score for text search + }) + + return { + "query": query, + "results": results + } ``` ### 6. 
list_threads @@ -616,14 +735,20 @@ async def search_corpus( **Implementation**: ```python -async def list_threads( +def list_threads( corpus_slug: str, document_slug: str | None = None, limit: int = 20, offset: int = 0 ) -> dict: - anonymous = AnonymousUser() + """List discussion threads in a corpus or document.""" + from django.contrib.auth.models import AnonymousUser + from django.db.models import Count + from opencontractserver.conversations.models import Conversation, ConversationTypeChoices + from opencontractserver.corpuses.models import Corpus + from opencontractserver.documents.models import Document + anonymous = AnonymousUser() corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) qs = (Conversation.objects @@ -631,7 +756,8 @@ async def list_threads( .filter( conversation_type=ConversationTypeChoices.THREAD, chat_with_corpus=corpus - )) + ) + .annotate(message_count=Count('messages'))) # Efficient count if document_slug: document = Document.objects.visible_to_user(anonymous).get( @@ -643,12 +769,26 @@ async def list_threads( qs = qs.order_by('-is_pinned', '-updated_at') total_count = qs.count() - threads = qs[offset:offset+limit] + threads = list(qs[offset:offset+limit]) return { "total_count": total_count, "threads": [format_thread_summary(t) for t in threads] } + + +def format_thread_summary(thread) -> dict: + """Format a thread for list display.""" + return { + "id": str(thread.id), + "title": thread.title or "", + "description": thread.description or "", + "message_count": getattr(thread, 'message_count', 0), + "is_pinned": thread.is_pinned, + "is_locked": thread.is_locked, + "created_at": thread.created.isoformat() if thread.created else None, + "last_activity": thread.updated.isoformat() if thread.updated else None, + } ``` ### 7. 
get_thread_messages @@ -686,6 +826,104 @@ async def list_threads( } ``` +**Implementation**: +```python +def get_thread_messages( + corpus_slug: str, + thread_id: int, + flatten: bool = False +) -> dict: + """Retrieve all messages in a thread with hierarchical structure.""" + from django.contrib.auth.models import AnonymousUser + from opencontractserver.conversations.models import ( + ChatMessage, + Conversation, + ConversationTypeChoices, + ) + from opencontractserver.corpuses.models import Corpus + + anonymous = AnonymousUser() + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + # Get the thread + thread = (Conversation.objects + .visible_to_user(anonymous) + .filter( + conversation_type=ConversationTypeChoices.THREAD, + chat_with_corpus=corpus, + id=thread_id + ) + .first()) + + if not thread: + from django.core.exceptions import ObjectDoesNotExist + raise ObjectDoesNotExist(f"Thread {thread_id} not found") + + if flatten: + # Return all messages in flat list, ordered by created_at + messages = list(ChatMessage.objects + .visible_to_user(anonymous) + .filter(conversation=thread) + .order_by('created_at')) + return { + "thread_id": str(thread.id), + "title": thread.title or "", + "messages": [format_message(m) for m in messages] + } + + # Build hierarchical structure with prefetch to avoid N+1 queries + # Prefetch 2 levels of replies (adjust depth as needed) + root_messages = list(ChatMessage.objects + .visible_to_user(anonymous) + .filter(conversation=thread, parent_message__isnull=True) + .prefetch_related('replies__replies') + .order_by('created_at')) + + return { + "thread_id": str(thread.id), + "title": thread.title or "", + "messages": [format_message_with_replies(m, anonymous) for m in root_messages] + } + + +def format_message(message) -> dict: + """Format a single message without replies.""" + return { + "id": str(message.id), + "content": message.content, + "msg_type": message.msg_type, + "created_at": 
message.created_at.isoformat() if message.created_at else None, + "upvote_count": message.upvote_count, + "downvote_count": message.downvote_count, + } + + +def format_message_with_replies(message, user, max_depth: int = 3, current_depth: int = 0) -> dict: + """ + Format a message with its replies recursively. + + Uses prefetched replies to avoid N+1 queries. + Limits recursion depth to prevent deeply nested structures. + """ + formatted = format_message(message) + + if current_depth >= max_depth: + # Stop recursion at max depth + formatted["replies"] = [] + formatted["has_more_replies"] = message.replies.exists() if hasattr(message, 'replies') else False + return formatted + + # Access prefetched replies (no additional queries) + replies = list(message.replies.all()) if hasattr(message, 'replies') else [] + + formatted["replies"] = [ + format_message_with_replies(reply, user, max_depth, current_depth + 1) + for reply in replies + ] + + return formatted +``` + ## Implementation Structure ### Directory Layout @@ -707,6 +945,11 @@ opencontractserver/ ```python # opencontractserver/mcp/server.py import asyncio +import json +import re +from typing import Optional + +from asgiref.sync import sync_to_async from mcp import Server, Resource, Tool from mcp.types import TextContent, EmbeddedResource @@ -730,6 +973,46 @@ from .tools import ( # Initialize MCP server mcp_server = Server("opencontracts") + +# URI parsing utilities with regex for safety +class URIParser: + """Parse MCP resource URIs safely using regex patterns.""" + + # Slug pattern: alphanumeric and hyphens only (matches OpenContracts slug format) + SLUG_PATTERN = r'[A-Za-z0-9\-]+' + + PATTERNS = { + 'corpus': re.compile(rf'^corpus://({SLUG_PATTERN})$'), + 'document': re.compile(rf'^document://({SLUG_PATTERN})/({SLUG_PATTERN})$'), + 'annotation': re.compile(rf'^annotation://({SLUG_PATTERN})/({SLUG_PATTERN})/(\d+)$'), + 'thread': re.compile(rf'^thread://({SLUG_PATTERN})/threads/(\d+)$'), + } + + @classmethod + 
def parse_corpus(cls, uri: str) -> Optional[str]: + """Parse corpus URI, returns corpus_slug or None.""" + match = cls.PATTERNS['corpus'].match(uri) + return match.group(1) if match else None + + @classmethod + def parse_document(cls, uri: str) -> Optional[tuple[str, str]]: + """Parse document URI, returns (corpus_slug, document_slug) or None.""" + match = cls.PATTERNS['document'].match(uri) + return (match.group(1), match.group(2)) if match else None + + @classmethod + def parse_annotation(cls, uri: str) -> Optional[tuple[str, str, int]]: + """Parse annotation URI, returns (corpus_slug, document_slug, annotation_id) or None.""" + match = cls.PATTERNS['annotation'].match(uri) + return (match.group(1), match.group(2), int(match.group(3))) if match else None + + @classmethod + def parse_thread(cls, uri: str) -> Optional[tuple[str, int]]: + """Parse thread URI, returns (corpus_slug, thread_id) or None.""" + match = cls.PATTERNS['thread'].match(uri) + return (match.group(1), int(match.group(2))) if match else None + + # Register resources @mcp_server.list_resources() async def list_resources() -> list[Resource]: @@ -761,31 +1044,35 @@ async def list_resources() -> list[Resource]: ) ] + @mcp_server.read_resource() async def read_resource(uri: str) -> str: - """Resolve resource URI and return content""" - if uri.startswith("corpus://"): - corpus_slug = uri.replace("corpus://", "") - return await get_corpus_resource(corpus_slug) - - elif uri.startswith("document://"): - # Parse: document://{corpus_slug}/{document_slug} - parts = uri.replace("document://", "").split("/") - return await get_document_resource(parts[0], parts[1]) - - elif uri.startswith("annotation://"): - # Parse: annotation://{corpus_slug}/{document_slug}/{annotation_id} - parts = uri.replace("annotation://", "").split("/") - return await get_annotation_resource(parts[0], parts[1], int(parts[2])) - - elif uri.startswith("thread://"): - # Parse: thread://{corpus_slug}/threads/{thread_id} - parts = 
uri.replace("thread://", "").split("/") - corpus_slug = parts[0] - thread_id = int(parts[2]) # Skip "threads" part - return await get_thread_resource(corpus_slug, thread_id) - - raise ValueError(f"Unknown resource URI: {uri}") + """Resolve resource URI and return content.""" + # Try corpus URI + corpus_slug = URIParser.parse_corpus(uri) + if corpus_slug: + return await sync_to_async(get_corpus_resource)(corpus_slug) + + # Try document URI + doc_parts = URIParser.parse_document(uri) + if doc_parts: + corpus_slug, document_slug = doc_parts + return await sync_to_async(get_document_resource)(corpus_slug, document_slug) + + # Try annotation URI + ann_parts = URIParser.parse_annotation(uri) + if ann_parts: + corpus_slug, document_slug, annotation_id = ann_parts + return await sync_to_async(get_annotation_resource)(corpus_slug, document_slug, annotation_id) + + # Try thread URI + thread_parts = URIParser.parse_thread(uri) + if thread_parts: + corpus_slug, thread_id = thread_parts + return await sync_to_async(get_thread_resource)(corpus_slug, thread_id) + + raise ValueError(f"Invalid or unrecognized resource URI: {uri}") + # Register tools @mcp_server.list_tools() @@ -818,31 +1105,35 @@ async def list_tools() -> list[Tool]: "required": ["corpus_slug"] } ), - # ... (register all other tools) + # ... 
(register all other tools - list_annotations, search_corpus, list_threads, get_thread_messages) ] + +# Map tool names to their implementations +TOOL_HANDLERS = { + "list_public_corpuses": list_public_corpuses, + "list_documents": list_documents, + "get_document_text": get_document_text, + "list_annotations": list_annotations, + "search_corpus": search_corpus, + "list_threads": list_threads, + "get_thread_messages": get_thread_messages, +} + + @mcp_server.call_tool() async def call_tool(name: str, arguments: dict) -> list[TextContent]: - """Execute tool and return results""" - if name == "list_public_corpuses": - result = await list_public_corpuses(**arguments) - elif name == "list_documents": - result = await list_documents(**arguments) - elif name == "get_document_text": - result = await get_document_text(**arguments) - elif name == "list_annotations": - result = await list_annotations(**arguments) - elif name == "search_corpus": - result = await search_corpus(**arguments) - elif name == "list_threads": - result = await list_threads(**arguments) - elif name == "get_thread_messages": - result = await get_thread_messages(**arguments) - else: + """Execute tool and return results.""" + handler = TOOL_HANDLERS.get(name) + if not handler: raise ValueError(f"Unknown tool: {name}") + # Run synchronous Django ORM handlers in thread pool + result = await sync_to_async(handler)(**arguments) + return [TextContent(type="text", text=json.dumps(result, indent=2))] + # Entry point async def main(): """Run MCP server""" From 80d45c329c2162eeb16ed1820b928d77286e633d Mon Sep 17 00:00:00 2001 From: JSv4 Date: Sun, 28 Dec 2025 19:15:27 -0500 Subject: [PATCH 3/6] Implement MCP server for read-only access to public resources Add Model Context Protocol (MCP) server module that provides AI assistants with structured access to public OpenContracts data: - Resources: corpus, document, annotation, thread (via custom URI schemes) - Tools: list_public_corpuses, list_documents, 
get_document_text, list_annotations, search_corpus, list_threads, get_thread_messages - Anonymous user model ensures only public resources are accessible - Uses existing visible_to_user() and AnnotationQueryOptimizer patterns - Rate limiting via Django cache - Comprehensive unit tests (19 tests) Closes #726 --- config/settings/base.py | 15 +- opencontractserver/mcp/__init__.py | 20 ++ opencontractserver/mcp/config.py | 56 ++++ opencontractserver/mcp/formatters.py | 111 ++++++ opencontractserver/mcp/permissions.py | 89 +++++ opencontractserver/mcp/resources.py | 214 ++++++++++++ opencontractserver/mcp/server.py | 285 ++++++++++++++++ opencontractserver/mcp/tests/__init__.py | 6 + opencontractserver/mcp/tests/test_mcp.py | 345 +++++++++++++++++++ opencontractserver/mcp/tools.py | 408 +++++++++++++++++++++++ requirements/base.txt | 4 + 11 files changed, 1552 insertions(+), 1 deletion(-) create mode 100644 opencontractserver/mcp/__init__.py create mode 100644 opencontractserver/mcp/config.py create mode 100644 opencontractserver/mcp/formatters.py create mode 100644 opencontractserver/mcp/permissions.py create mode 100644 opencontractserver/mcp/resources.py create mode 100644 opencontractserver/mcp/server.py create mode 100644 opencontractserver/mcp/tests/__init__.py create mode 100644 opencontractserver/mcp/tests/test_mcp.py create mode 100644 opencontractserver/mcp/tools.py diff --git a/config/settings/base.py b/config/settings/base.py index 0a407a1b8..a4bf592c9 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -720,7 +720,7 @@ "http://127.0.0.1:5173", ] -DEFAULT_IMAGE = """""" # noqa +DEFAULT_IMAGE = """""" # noqa # Model paths DOCLING_MODELS_PATH = env.str("DOCLING_MODELS_PATH", default="/models/docling") @@ -983,3 +983,16 @@ ) POSTHOG_HOST = env.str("POSTHOG_HOST", default="https://us.i.posthog.com") MODE = "LOCAL" + +# MCP Server Configuration +# ------------------------------------------------------------------------------ +# See 
docs/mcp/mcp_interface_proposal.md for details +MCP_SERVER = { + 'enabled': env.bool('MCP_SERVER_ENABLED', default=False), + 'max_results_per_page': env.int('MCP_MAX_RESULTS_PER_PAGE', default=100), + 'rate_limit': { + 'requests': env.int('MCP_RATE_LIMIT_REQUESTS', default=100), + 'window': env.int('MCP_RATE_LIMIT_WINDOW', default=60), + }, + 'cache_ttl': env.int('MCP_CACHE_TTL', default=300), +} diff --git a/opencontractserver/mcp/__init__.py b/opencontractserver/mcp/__init__.py new file mode 100644 index 000000000..f4ae0b861 --- /dev/null +++ b/opencontractserver/mcp/__init__.py @@ -0,0 +1,20 @@ +""" +MCP (Model Context Protocol) Interface for OpenContracts. + +This module provides a read-only Model Context Protocol server interface for OpenContracts, +enabling external AI assistants and tools to access public resources, documents, and corpus +information through the standardized MCP protocol. + +The MCP interface focuses on providing read-only access to: +- Public documents and their content +- Public corpus metadata +- Annotation data for accessible documents +- Document structure and relationships + +All access is subject to OpenContracts' standard permission checks, ensuring that only +publicly accessible resources are exposed; the MCP server always acts as an anonymous user. + +For more information on the Model Context Protocol, see: https://modelcontextprotocol.io +""" + +__all__ = [] diff --git a/opencontractserver/mcp/config.py b/opencontractserver/mcp/config.py new file mode 100644 index 000000000..09bf2de6c --- /dev/null +++ b/opencontractserver/mcp/config.py @@ -0,0 +1,56 @@ +""" +Configuration settings for the OpenContracts MCP (Model Context Protocol) server. + +This module provides default configuration values and utility functions for accessing +MCP-specific Django settings. 
+""" + +import re +from typing import Any, Optional + +# MCP Server Configuration Defaults +MAX_RESULTS_PER_PAGE = 100 +DEFAULT_PAGE_SIZE = 20 +RATE_LIMIT_REQUESTS = 100 +RATE_LIMIT_WINDOW = 60 # seconds +CACHE_TTL = 300 # seconds + +# URI Pattern Constants +SLUG_PATTERN = re.compile(r'^[A-Za-z0-9-]+$') + + +def get_mcp_setting(key: str, default: Optional[Any] = None) -> Any: + """ + Get MCP setting from Django settings or return default. + + Settings should be defined in Django settings.py as: + MCP_SERVER = { + 'max_results_per_page': 100, + 'cache_ttl': 300, + ... + } + + Args: + key: The setting key to retrieve + default: Default value if setting not found + + Returns: + The setting value or default + """ + from django.conf import settings + + mcp_settings = getattr(settings, 'MCP_SERVER', {}) + return mcp_settings.get(key, default) + + +def validate_slug(slug: str) -> bool: + """ + Validate that a slug matches the expected pattern. + + Args: + slug: The slug string to validate + + Returns: + True if slug is valid, False otherwise + """ + return bool(SLUG_PATTERN.match(slug)) diff --git a/opencontractserver/mcp/formatters.py b/opencontractserver/mcp/formatters.py new file mode 100644 index 000000000..08cd3766c --- /dev/null +++ b/opencontractserver/mcp/formatters.py @@ -0,0 +1,111 @@ +"""Response formatters for MCP resources and tools.""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from opencontractserver.annotations.models import Annotation + from opencontractserver.conversations.models import ChatMessage, Conversation + from opencontractserver.corpuses.models import Corpus + from opencontractserver.documents.models import Document + + +def format_corpus_summary(corpus: "Corpus") -> dict: + """Format a corpus for list display.""" + return { + "slug": corpus.slug, + "title": corpus.title, + "description": corpus.description or "", + "document_count": corpus.document_count() if hasattr(corpus, 
'document_count') else 0, + "created": corpus.created.isoformat() if corpus.created else None, + } + + +def format_document_summary(document: "Document") -> dict: + """Format a document for list display.""" + return { + "slug": document.slug, + "title": document.title or "", + "description": document.description or "", + "page_count": document.page_count or 0, + "file_type": document.file_type or "unknown", + "created": document.created.isoformat() if document.created else None, + } + + +def format_annotation(annotation: "Annotation") -> dict: + """Format an annotation for API response.""" + label_data = None + if annotation.annotation_label: + label_data = { + "text": annotation.annotation_label.text, + "color": annotation.annotation_label.color or "#000000", + "label_type": annotation.annotation_label.label_type, + } + + return { + "id": str(annotation.id), + "page": annotation.page, + "raw_text": annotation.raw_text or "", + "annotation_label": label_data, + "structural": annotation.structural, + "created": annotation.created.isoformat() if annotation.created else None, + } + + +def format_thread_summary(thread: "Conversation") -> dict: + """Format a thread for list display.""" + return { + "id": str(thread.id), + "title": thread.title or "", + "description": thread.description or "", + "message_count": getattr(thread, 'message_count', 0), + "is_pinned": thread.is_pinned, + "is_locked": thread.is_locked, + "created_at": thread.created.isoformat() if thread.created else None, + "last_activity": thread.modified.isoformat() if thread.modified else None, + } + + +def format_message(message: "ChatMessage") -> dict: + """Format a single message without replies.""" + return { + "id": str(message.id), + "content": message.content, + "msg_type": message.msg_type, + "created_at": message.created_at.isoformat() if message.created_at else None, + "upvote_count": message.upvote_count, + "downvote_count": message.downvote_count, + } + + +def format_message_with_replies( + 
# Slug validation pattern - matches OpenContracts format (A-Z, a-z, 0-9, hyphen)
SLUG_PATTERN = re.compile(r"^[A-Za-z0-9\-]+$")


def validate_slug(slug: str) -> bool:
    """Validate slug format matches OpenContracts pattern (A-Z, a-z, 0-9, hyphen).

    Args:
        slug: The slug string to validate

    Returns:
        True if the whole string is a valid slug, False otherwise (including
        for empty strings and non-string values)
    """
    # fullmatch() instead of match(): with match(), "$" also succeeds just
    # before a trailing newline, so "my-corpus\n" used to be accepted.
    return isinstance(slug, str) and SLUG_PATTERN.fullmatch(slug) is not None


def sanitize_and_validate_slugs(
    corpus_slug: str, document_slug: str | None = None
) -> tuple[str, str | None]:
    """Validate and return slugs, raising ValueError if invalid.

    The slugs are returned unchanged; this function only rejects bad input.

    Args:
        corpus_slug: The corpus slug to validate (required)
        document_slug: The document slug to validate (optional; a falsy value
            such as None or "" is passed through without validation)

    Returns:
        Tuple of (corpus_slug, document_slug) if valid

    Raises:
        ValueError: If either slug is invalid
    """
    if not validate_slug(corpus_slug):
        raise ValueError(f"Invalid corpus slug: {corpus_slug}")
    if document_slug and not validate_slug(document_slug):
        raise ValueError(f"Invalid document slug: {document_slug}")
    return corpus_slug, document_slug


def get_anonymous_user() -> AnonymousUser:
    """Get Django's AnonymousUser for permission checks.

    Returns:
        A new AnonymousUser instance for use in permission checks
    """
    return AnonymousUser()


class RateLimiter:
    """Simple fixed-window rate limiter for MCP requests using Django cache.

    One counter per client is kept in the cache. The counter is created with a
    TTL of ``window_seconds`` by the first request of a window and bumped with
    ``cache.incr()`` afterwards, so later requests do not restart the window.
    """

    def __init__(self, max_requests: int = 100, window_seconds: int = 60):
        """Initialize the rate limiter.

        Args:
            max_requests: Maximum number of requests allowed in the time window
            window_seconds: Time window in seconds
        """
        self.max_requests = max_requests
        self.window_seconds = window_seconds

    def check_rate_limit(self, client_id: str) -> bool:
        """Count one request for a client and report whether it is allowed.

        Args:
            client_id: Unique identifier for the client making the request

        Returns:
            True if request is allowed, False if rate limited
        """
        key = f"mcp:ratelimit:{client_id}"

        # add() stores the value only when the key is absent, so the TTL is
        # set once per window instead of on every request.
        if cache.add(key, 1, self.window_seconds):
            return self.max_requests >= 1

        try:
            # incr() avoids the lost updates of a separate get() + set().
            # NOTE(review): atomicity and whether incr() keeps the original
            # expiry depend on the cache backend - confirm for the deployed one.
            count = cache.incr(key)
        except ValueError:
            # The key expired between add() and incr(): start a new window.
            cache.set(key, 1, self.window_seconds)
            count = 1
        return count <= self.max_requests
def get_corpus_resource(corpus_slug: str) -> str:
    """
    Get corpus resource content.

    URI: corpus://{corpus_slug}
    Returns: JSON with corpus metadata, document count and (when the corpus
    has a label set) up to 20 of its labels.
    Raises: Corpus.DoesNotExist if the slug does not resolve to a corpus
    visible to the anonymous user.
    """
    from opencontractserver.corpuses.models import Corpus

    anonymous = AnonymousUser()
    corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug)

    # Get label set info if available
    label_set_data = None
    if corpus.label_set:
        labels = [
            {
                "text": label.text,
                "color": label.color or "#000000",
                "label_type": label.label_type,
            }
            for label in corpus.label_set.annotation_labels.all()[:20]  # Limit labels
        ]
        label_set_data = {
            "title": corpus.label_set.title or "",
            "labels": labels,
        }

    return json.dumps({
        "slug": corpus.slug,
        "title": corpus.title,
        "description": corpus.description or "",
        "document_count": corpus.document_count(),
        "created": corpus.created.isoformat() if corpus.created else None,
        "modified": corpus.modified.isoformat() if corpus.modified else None,
        "label_set": label_set_data,
    })


def get_document_resource(corpus_slug: str, document_slug: str) -> str:
    """
    Get document resource content.

    URI: document://{corpus_slug}/{document_slug}
    Returns: JSON with document metadata and extracted text. The text is ""
    when the document has no extract file or the file cannot be read.
    Raises: Corpus.DoesNotExist / Document.DoesNotExist when the corpus or the
    document is not visible to the anonymous user.
    """
    import logging

    from opencontractserver.corpuses.models import Corpus
    from opencontractserver.documents.models import Document

    anonymous = AnonymousUser()

    # Get corpus context
    corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug)

    # Get document within corpus (both must be public)
    document = (
        Document.objects
        .visible_to_user(anonymous)
        .filter(corpuses=corpus, slug=document_slug)
        .first()
    )

    if not document:
        raise Document.DoesNotExist(
            f"Document '{document_slug}' not found in corpus '{corpus_slug}'"
        )

    # Read extracted text. Still best-effort, but a failure is now logged
    # instead of silently producing an empty document.
    full_text = ""
    if document.txt_extract_file:
        try:
            with document.txt_extract_file.open('r') as f:
                full_text = f.read()
        except Exception:
            logging.getLogger(__name__).warning(
                "Could not read extracted text for document '%s' in corpus '%s'",
                document_slug,
                corpus_slug,
                exc_info=True,
            )
            full_text = ""

    return json.dumps({
        "slug": document.slug,
        "title": document.title or "",
        "description": document.description or "",
        "file_type": document.file_type or "application/pdf",
        "page_count": document.page_count or 0,
        "text_preview": full_text[:500] if full_text else "",
        "full_text": full_text,
        "created": document.created.isoformat() if document.created else None,
        "corpus": corpus_slug,
    })
def get_thread_resource(
    corpus_slug: str,
    thread_id: int,
    include_messages: bool = True
) -> str:
    """
    Serialize one public discussion thread of a corpus.

    URI: thread://{corpus_slug}/threads/{thread_id}
    Returns: JSON with the thread metadata; when ``include_messages`` is true
    it also carries the top-level messages with their nested replies.
    Raises: Corpus.DoesNotExist / Conversation.DoesNotExist when the corpus or
    the thread is not visible to the anonymous user.
    """
    from opencontractserver.conversations.models import (
        ChatMessage,
        Conversation,
        ConversationTypeChoices,
    )
    from opencontractserver.corpuses.models import Corpus

    from .formatters import format_message_with_replies

    viewer = AnonymousUser()
    corpus = Corpus.objects.visible_to_user(viewer).get(slug=corpus_slug)

    # Only THREAD-type conversations attached to this corpus qualify.
    matching_threads = Conversation.objects.visible_to_user(viewer).filter(
        conversation_type=ConversationTypeChoices.THREAD,
        chat_with_corpus=corpus,
        id=thread_id,
    )
    thread = matching_threads.first()
    if not thread:
        raise Conversation.DoesNotExist(
            f"Thread '{thread_id}' not found in corpus '{corpus_slug}'"
        )

    created = thread.created
    payload = {
        "id": str(thread.id),
        "title": thread.title or "",
        "description": thread.description or "",
        "is_locked": thread.is_locked,
        "is_pinned": thread.is_pinned,
        "created_at": created.isoformat() if created else None,
    }

    if not include_messages:
        return json.dumps(payload)

    # Root messages only; two levels of replies are prefetched up front.
    root_messages = (
        ChatMessage.objects
        .visible_to_user(viewer)
        .filter(conversation=thread, parent_message__isnull=True)
        .prefetch_related('replies__replies')
        .order_by('created_at')
    )
    payload["messages"] = [
        format_message_with_replies(root, viewer) for root in list(root_messages)
    ]
    return json.dumps(payload)
class URIParser:
    """Parse MCP resource URIs safely using regex patterns.

    Every ``parse_*`` method returns the captured parts of the URI, or ``None``
    when the URI does not have exactly the expected shape. The whole string
    must match: trailing characters, including a trailing newline, are
    rejected.
    """

    # Slug pattern: alphanumeric and hyphens only
    SLUG_PATTERN = r'[A-Za-z0-9\-]+'

    # Numeric id: ASCII digits only (\d would also accept other Unicode digits)
    ID_PATTERN = r'[0-9]+'

    PATTERNS = {
        'corpus': re.compile(rf'^corpus://({SLUG_PATTERN})$'),
        'document': re.compile(rf'^document://({SLUG_PATTERN})/({SLUG_PATTERN})$'),
        'annotation': re.compile(
            rf'^annotation://({SLUG_PATTERN})/({SLUG_PATTERN})/({ID_PATTERN})$'
        ),
        'thread': re.compile(rf'^thread://({SLUG_PATTERN})/threads/({ID_PATTERN})$'),
    }

    @classmethod
    def _groups(cls, kind: str, uri: str) -> Optional[tuple]:
        """Return the captured groups of ``uri`` for pattern ``kind``, or None."""
        # fullmatch(): with match(), "$" also succeeds before a trailing "\n".
        # str() lets URL-like objects be parsed as well as plain strings.
        match = cls.PATTERNS[kind].fullmatch(str(uri))
        return match.groups() if match else None

    @classmethod
    def parse_corpus(cls, uri: str) -> Optional[str]:
        """Parse corpus URI, returns corpus_slug or None."""
        groups = cls._groups('corpus', uri)
        return groups[0] if groups else None

    @classmethod
    def parse_document(cls, uri: str) -> Optional[tuple[str, str]]:
        """Parse document URI, returns (corpus_slug, document_slug) or None."""
        groups = cls._groups('document', uri)
        return (groups[0], groups[1]) if groups else None

    @classmethod
    def parse_annotation(cls, uri: str) -> Optional[tuple[str, str, int]]:
        """Parse annotation URI, returns (corpus_slug, document_slug, annotation_id) or None."""
        groups = cls._groups('annotation', uri)
        return (groups[0], groups[1], int(groups[2])) if groups else None

    @classmethod
    def parse_thread(cls, uri: str) -> Optional[tuple[str, int]]:
        """Parse thread URI, returns (corpus_slug, thread_id) or None."""
        groups = cls._groups('thread', uri)
        return (groups[0], int(groups[1])) if groups else None
or unrecognized resource URI: {uri}") + + +@mcp_server.list_tools() +async def list_tools() -> list[Tool]: + """List available tools.""" + return [ + Tool( + name="list_public_corpuses", + description="List all publicly accessible corpuses", + inputSchema={ + "type": "object", + "properties": { + "limit": {"type": "integer", "default": 20, "description": "Max results (1-100)"}, + "offset": {"type": "integer", "default": 0, "description": "Pagination offset"}, + "search": {"type": "string", "default": "", "description": "Search filter"} + } + } + ), + Tool( + name="list_documents", + description="List documents in a corpus", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": {"type": "string", "description": "Corpus identifier"}, + "limit": {"type": "integer", "default": 50}, + "offset": {"type": "integer", "default": 0}, + "search": {"type": "string", "default": ""} + }, + "required": ["corpus_slug"] + } + ), + Tool( + name="get_document_text", + description="Get full extracted text from a document", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": {"type": "string", "description": "Corpus identifier"}, + "document_slug": {"type": "string", "description": "Document identifier"} + }, + "required": ["corpus_slug", "document_slug"] + } + ), + Tool( + name="list_annotations", + description="List annotations on a document", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": {"type": "string"}, + "document_slug": {"type": "string"}, + "page": {"type": "integer", "description": "Filter to page number"}, + "label_text": {"type": "string", "description": "Filter by label text"}, + "limit": {"type": "integer", "default": 100}, + "offset": {"type": "integer", "default": 0} + }, + "required": ["corpus_slug", "document_slug"] + } + ), + Tool( + name="search_corpus", + description="Semantic search within a corpus", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": {"type": "string"}, + "query": 
# Map tool names to implementations
TOOL_HANDLERS = {
    "list_public_corpuses": list_public_corpuses,
    "list_documents": list_documents,
    "get_document_text": get_document_text,
    "list_annotations": list_annotations,
    "search_corpus": search_corpus,
    "list_threads": list_threads,
    "get_thread_messages": get_thread_messages,
}


@mcp_server.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    """Dispatch a tool call by name and return its result as JSON text.

    Raises ValueError when ``name`` is not a key of TOOL_HANDLERS.
    """
    tool = TOOL_HANDLERS.get(name)
    if not tool:
        raise ValueError(f"Unknown tool: {name}")

    # The handlers are synchronous (Django ORM), so run them via sync_to_async.
    run_tool = sync_to_async(tool)
    outcome = await run_tool(**arguments)

    body = json.dumps(outcome, indent=2)
    return [TextContent(type="text", text=body)]


async def main():
    """Serve the MCP server over the stdio transport."""
    async with stdio_server() as streams:
        read_stream = streams[0]
        write_stream = streams[1]
        init_options = mcp_server.create_initialization_options()
        await mcp_server.run(read_stream, write_stream, init_options)
class URIParserTest(TestCase):
    """Checks which URIs each URIParser.parse_* method accepts and rejects."""

    def test_parse_corpus_uri(self):
        """Corpus URIs yield the slug; malformed ones yield None."""
        from opencontractserver.mcp.server import URIParser

        accepted = (
            ("corpus://my-corpus", "my-corpus"),
            ("corpus://Legal-Contracts-2024", "Legal-Contracts-2024"),
        )
        for uri, slug in accepted:
            with self.subTest(uri=uri):
                self.assertEqual(URIParser.parse_corpus(uri), slug)

        rejected = (
            "corpus://",
            "document://my-corpus",
            "corpus://my corpus",  # space invalid
            "corpus://my_corpus",  # underscore invalid
        )
        for uri in rejected:
            with self.subTest(uri=uri):
                self.assertIsNone(URIParser.parse_corpus(uri))

    def test_parse_document_uri(self):
        """Document URIs yield (corpus_slug, document_slug)."""
        from opencontractserver.mcp.server import URIParser

        parse = URIParser.parse_document
        self.assertEqual(parse("document://my-corpus/my-doc"), ("my-corpus", "my-doc"))
        self.assertEqual(parse("document://corp-1/doc-2024"), ("corp-1", "doc-2024"))

        for uri in ("document://my-corpus", "document://"):
            with self.subTest(uri=uri):
                self.assertIsNone(parse(uri))

    def test_parse_annotation_uri(self):
        """Annotation URIs yield (corpus_slug, document_slug, int id)."""
        from opencontractserver.mcp.server import URIParser

        parse = URIParser.parse_annotation
        self.assertEqual(parse("annotation://corp/doc/123"), ("corp", "doc", 123))
        self.assertEqual(
            parse("annotation://my-corpus/my-doc/999"), ("my-corpus", "my-doc", 999)
        )

        for uri in ("annotation://corp/doc", "annotation://corp/doc/abc"):
            with self.subTest(uri=uri):
                self.assertIsNone(parse(uri))

    def test_parse_thread_uri(self):
        """Thread URIs yield (corpus_slug, int thread id)."""
        from opencontractserver.mcp.server import URIParser

        parse = URIParser.parse_thread
        self.assertEqual(parse("thread://my-corpus/threads/456"), ("my-corpus", 456))
        self.assertEqual(parse("thread://legal-2024/threads/1"), ("legal-2024", 1))

        for uri in ("thread://my-corpus/456", "thread://my-corpus/threads/"):
            with self.subTest(uri=uri):
                self.assertIsNone(parse(uri))
self.assertFalse(validate_slug("")) + self.assertFalse(validate_slug("my@corpus")) # special char + + def test_sanitize_and_validate_slugs(self): + """Test slug sanitization and validation.""" + from opencontractserver.mcp.permissions import sanitize_and_validate_slugs + + # Valid slugs pass through + result = sanitize_and_validate_slugs("my-corpus", "my-doc") + self.assertEqual(result, ("my-corpus", "my-doc")) + + # None document slug is allowed + result = sanitize_and_validate_slugs("my-corpus", None) + self.assertEqual(result, ("my-corpus", None)) + + # Invalid corpus slug raises + with self.assertRaises(ValueError): + sanitize_and_validate_slugs("my corpus") + + # Invalid document slug raises + with self.assertRaises(ValueError): + sanitize_and_validate_slugs("my-corpus", "my_doc") + + def test_get_anonymous_user(self): + """Test anonymous user helper.""" + from opencontractserver.mcp.permissions import get_anonymous_user + + user = get_anonymous_user() + self.assertIsInstance(user, AnonymousUser) + self.assertFalse(user.is_authenticated) + + +class MCPResourcesTest(TestCase): + """Tests for MCP resource handlers.""" + + @classmethod + def setUpTestData(cls): + """Create test data.""" + cls.owner = User.objects.create_user( + username="testowner", + email="owner@test.com", + password="testpass123" + ) + + # Create public corpus + cls.public_corpus = Corpus.objects.create( + title="Public Test Corpus", + description="A public test corpus", + creator=cls.owner, + is_public=True + ) + + # Create private corpus + cls.private_corpus = Corpus.objects.create( + title="Private Test Corpus", + description="A private test corpus", + creator=cls.owner, + is_public=False + ) + + def test_get_public_corpus_resource(self): + """Anonymous users can access public corpus resources.""" + from opencontractserver.mcp.resources import get_corpus_resource + + result = get_corpus_resource(self.public_corpus.slug) + data = json.loads(result) + + self.assertEqual(data["slug"], 
self.public_corpus.slug) + self.assertEqual(data["title"], "Public Test Corpus") + self.assertEqual(data["description"], "A public test corpus") + + def test_get_private_corpus_resource_denied(self): + """Anonymous users cannot access private corpus resources.""" + from opencontractserver.mcp.resources import get_corpus_resource + + with self.assertRaises(Corpus.DoesNotExist): + get_corpus_resource(self.private_corpus.slug) + + +class MCPToolsTest(TestCase): + """Tests for MCP tool handlers.""" + + @classmethod + def setUpTestData(cls): + """Create test data.""" + cls.owner = User.objects.create_user( + username="toolsowner", + email="tools@test.com", + password="testpass123" + ) + + # Create public corpuses + cls.corpus1 = Corpus.objects.create( + title="Corpus One", + description="First corpus", + creator=cls.owner, + is_public=True + ) + cls.corpus2 = Corpus.objects.create( + title="Corpus Two", + description="Second corpus", + creator=cls.owner, + is_public=True + ) + + # Create private corpus (should not appear) + cls.private = Corpus.objects.create( + title="Private Corpus", + creator=cls.owner, + is_public=False + ) + + def test_list_public_corpuses(self): + """Test listing public corpuses.""" + from opencontractserver.mcp.tools import list_public_corpuses + + result = list_public_corpuses() + + self.assertIn("total_count", result) + self.assertIn("corpuses", result) + + # Should only include public corpuses + slugs = [c["slug"] for c in result["corpuses"]] + self.assertIn(self.corpus1.slug, slugs) + self.assertIn(self.corpus2.slug, slugs) + self.assertNotIn(self.private.slug, slugs) + + def test_list_public_corpuses_with_search(self): + """Test searching corpuses.""" + from opencontractserver.mcp.tools import list_public_corpuses + + result = list_public_corpuses(search="One") + + slugs = [c["slug"] for c in result["corpuses"]] + self.assertIn(self.corpus1.slug, slugs) + self.assertNotIn(self.corpus2.slug, slugs) + + def 
test_list_public_corpuses_pagination(self): + """Test pagination.""" + from opencontractserver.mcp.tools import list_public_corpuses + + result = list_public_corpuses(limit=1, offset=0) + self.assertEqual(len(result["corpuses"]), 1) + + result2 = list_public_corpuses(limit=1, offset=1) + self.assertEqual(len(result2["corpuses"]), 1) + + # Different results + self.assertNotEqual( + result["corpuses"][0]["slug"], + result2["corpuses"][0]["slug"] + ) + + def test_list_public_corpuses_max_limit(self): + """Test that limit is capped at 100.""" + from opencontractserver.mcp.tools import list_public_corpuses + + # Even with a huge limit, should be capped + result = list_public_corpuses(limit=1000) + # The function caps at 100, but we only have 2 public corpuses + self.assertLessEqual(len(result["corpuses"]), 100) + + +class MCPFormattersTest(TestCase): + """Tests for MCP response formatters.""" + + @classmethod + def setUpTestData(cls): + cls.owner = User.objects.create_user( + username="formatowner", + email="format@test.com", + password="testpass123" + ) + + cls.corpus = Corpus.objects.create( + title="Format Test Corpus", + description="Testing formatters", + creator=cls.owner, + is_public=True + ) + + def test_format_corpus_summary(self): + """Test corpus summary formatting.""" + from opencontractserver.mcp.formatters import format_corpus_summary + + result = format_corpus_summary(self.corpus) + + self.assertEqual(result["slug"], self.corpus.slug) + self.assertEqual(result["title"], "Format Test Corpus") + self.assertEqual(result["description"], "Testing formatters") + self.assertIn("created", result) + self.assertIn("document_count", result) + + +class MCPConfigTest(TestCase): + """Tests for MCP configuration.""" + + def test_get_mcp_setting_with_default(self): + """Test getting settings with defaults.""" + from opencontractserver.mcp.config import get_mcp_setting + + # Non-existent key returns default + result = get_mcp_setting("nonexistent_key", "default_value") + 
class MCPRateLimiterTest(TestCase):
    """Tests for MCP rate limiter.

    Client ids are unique per test. The limiter keeps its counters in the
    Django cache, so fixed ids could leak state between tests or between runs
    that share a cache.
    """

    @staticmethod
    def _client_id(prefix: str) -> str:
        """Return a client id that no other test or earlier run has used."""
        import uuid

        return f"{prefix}-{uuid.uuid4().hex}"

    def test_rate_limiter_allows_requests(self):
        """Test that rate limiter allows requests under limit."""
        from opencontractserver.mcp.permissions import RateLimiter

        limiter = RateLimiter(max_requests=5, window_seconds=60)
        client = self._client_id("test-client")

        # First 5 requests should be allowed
        for _ in range(5):
            self.assertTrue(limiter.check_rate_limit(client))

    def test_rate_limiter_blocks_excess_requests(self):
        """Test that rate limiter blocks requests over limit."""
        from opencontractserver.mcp.permissions import RateLimiter

        limiter = RateLimiter(max_requests=2, window_seconds=60)
        client = self._client_id("test-client-2")

        # First 2 requests allowed
        self.assertTrue(limiter.check_rate_limit(client))
        self.assertTrue(limiter.check_rate_limit(client))

        # Third request blocked
        self.assertFalse(limiter.check_rate_limit(client))

    def test_rate_limiter_separate_clients(self):
        """Test that rate limiter tracks clients separately."""
        from opencontractserver.mcp.permissions import RateLimiter

        limiter = RateLimiter(max_requests=1, window_seconds=60)
        client_a = self._client_id("client-a")
        client_b = self._client_id("client-b")

        # Each client gets their own limit
        self.assertTrue(limiter.check_rate_limit(client_a))
        self.assertTrue(limiter.check_rate_limit(client_b))

        # But each is limited individually
        self.assertFalse(limiter.check_rate_limit(client_a))
        self.assertFalse(limiter.check_rate_limit(client_b))
def list_public_corpuses(
    limit: int = 20,
    offset: int = 0,
    search: str = ""
) -> dict:
    """
    List public corpuses visible to anonymous users.

    Args:
        limit: Number of results (default 20, clamped to 0-100)
        offset: Pagination offset (negative values are treated as 0)
        search: Optional search filter for title/description

    Returns:
        Dict with total_count and list of corpus summaries
    """
    from opencontractserver.corpuses.models import Corpus

    # Clamp paging input: queryset slicing rejects negative bounds.
    limit = max(0, min(limit, 100))
    offset = max(offset, 0)

    anonymous = AnonymousUser()
    qs = Corpus.objects.visible_to_user(anonymous)

    if search:
        qs = qs.filter(
            Q(title__icontains=search) | Q(description__icontains=search)
        )

    # Paging an unordered queryset can repeat or skip rows between pages;
    # fall back to primary-key order only when no ordering is in effect.
    if not qs.ordered:
        qs = qs.order_by("pk")

    total_count = qs.count()
    corpuses = list(qs[offset:offset + limit])

    return {
        "total_count": total_count,
        "corpuses": [format_corpus_summary(c) for c in corpuses]
    }


def list_documents(
    corpus_slug: str,
    limit: int = 50,
    offset: int = 0,
    search: str = ""
) -> dict:
    """
    List documents in a public corpus.

    Args:
        corpus_slug: Corpus identifier
        limit: Number of results (default 50, clamped to 0-100)
        offset: Pagination offset (negative values are treated as 0)
        search: Optional search filter for title/description

    Returns:
        Dict with total_count and list of document summaries

    Raises:
        Corpus.DoesNotExist: If the corpus is not found or not public
    """
    from opencontractserver.corpuses.models import Corpus
    from opencontractserver.documents.models import Document

    # Clamp paging input: queryset slicing rejects negative bounds.
    limit = max(0, min(limit, 100))
    offset = max(offset, 0)
    anonymous = AnonymousUser()

    # Get corpus (raises Corpus.DoesNotExist if not found or not public)
    corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug)

    # Get public documents in this corpus
    qs = (
        Document.objects
        .visible_to_user(anonymous)
        .filter(corpuses=corpus)
    )

    if search:
        qs = qs.filter(
            Q(title__icontains=search) | Q(description__icontains=search)
        )

    # Stable paging: order by primary key only when nothing else orders the rows.
    if not qs.ordered:
        qs = qs.order_by("pk")

    total_count = qs.count()
    documents = list(qs[offset:offset + limit])

    return {
        "total_count": total_count,
        "documents": [format_document_summary(d) for d in documents]
    }
def list_annotations(
    corpus_slug: str,
    document_slug: str,
    page: int | None = None,
    label_text: str | None = None,
    limit: int = 100,
    offset: int = 0
) -> dict:
    """
    List annotations on a document with optional filtering.

    Args:
        corpus_slug: Corpus identifier
        document_slug: Document identifier
        page: Optional page number filter
        label_text: Optional label text filter (exact match on the label text)
        limit: Number of results (clamped to 0-100)
        offset: Pagination offset (negative values are treated as 0)

    Returns:
        Dict with total_count and list of annotations

    Raises:
        Corpus.DoesNotExist / Document.DoesNotExist: If the corpus or document
            is not visible to the anonymous user
    """
    from opencontractserver.annotations.query_optimizer import AnnotationQueryOptimizer
    from opencontractserver.corpuses.models import Corpus
    from opencontractserver.documents.models import Document

    # Clamp paging input: queryset slicing rejects negative bounds.
    limit = max(0, min(limit, 100))
    offset = max(offset, 0)
    anonymous = AnonymousUser()

    corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug)
    document = Document.objects.visible_to_user(anonymous).get(
        corpuses=corpus, slug=document_slug
    )

    # Use query optimizer for the permission-aware annotation queryset
    qs = AnnotationQueryOptimizer.get_document_annotations(
        document_id=document.id,
        user=anonymous,
        corpus_id=corpus.id
    )

    # Apply filters
    if page is not None:
        qs = qs.filter(page=page)

    if label_text:
        qs = qs.filter(annotation_label__text=label_text)

    # Stable paging: order by primary key only when nothing else orders the rows.
    if not qs.ordered:
        qs = qs.order_by("pk")

    total_count = qs.count()
    # select_related: format_annotation reads the label of every annotation
    annotations = list(qs.select_related('annotation_label')[offset:offset + limit])

    return {
        "total_count": total_count,
        "annotations": [format_annotation(a) for a in annotations]
    }
+ + Args: + corpus_slug: Corpus identifier + query: Search query text + limit: Number of results (max 50) + + Returns: + Dict with query and ranked results + """ + from opencontractserver.corpuses.models import Corpus + from opencontractserver.documents.models import Document + + limit = min(limit, 50) + anonymous = AnonymousUser() + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + # Try to use vector search + try: + # embed_text() returns (embedder_path, query_vector) tuple + embedder_path, query_vector = corpus.embed_text(query) + + if query_vector: + # Search documents using vector similarity + doc_results = list( + Document.objects + .visible_to_user(anonymous) + .filter(corpuses=corpus) + .search_by_embedding(query_vector, embedder_path, top_k=limit) + ) + + results = [] + for doc in doc_results: + results.append({ + "type": "document", + "slug": doc.slug, + "title": doc.title or "", + "similarity_score": float(getattr(doc, 'similarity_score', 0)), + }) + + return {"query": query, "results": results} + except Exception: + pass + + # Fallback to text search + return _text_search_fallback(corpus, query, limit, anonymous) + + +def _text_search_fallback(corpus, query: str, limit: int, user) -> dict: + """Fallback to text search when embeddings are unavailable.""" + from opencontractserver.documents.models import Document + + documents = list( + Document.objects + .visible_to_user(user) + .filter(corpuses=corpus) + .filter(Q(title__icontains=query) | Q(description__icontains=query)) + [:limit] + ) + + results = [] + for doc in documents: + results.append({ + "type": "document", + "slug": doc.slug, + "title": doc.title or "", + "similarity_score": None, + }) + + return {"query": query, "results": results} + + +def list_threads( + corpus_slug: str, + document_slug: str | None = None, + limit: int = 20, + offset: int = 0 +) -> dict: + """ + List discussion threads in a corpus or document. 
+ + Args: + corpus_slug: Corpus identifier + document_slug: Optional document filter + limit: Number of results (max 100) + offset: Pagination offset + + Returns: + Dict with total_count and list of thread summaries + """ + from opencontractserver.conversations.models import ( + Conversation, + ConversationTypeChoices, + ) + from opencontractserver.corpuses.models import Corpus + from opencontractserver.documents.models import Document + + limit = min(limit, 100) + anonymous = AnonymousUser() + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + qs = ( + Conversation.objects + .visible_to_user(anonymous) + .filter( + conversation_type=ConversationTypeChoices.THREAD, + chat_with_corpus=corpus + ) + .annotate(message_count=Count('messages')) + ) + + if document_slug: + document = Document.objects.visible_to_user(anonymous).get( + corpuses=corpus, slug=document_slug + ) + qs = qs.filter(chat_with_document=document) + + # Order by pinned first, then recent activity + qs = qs.order_by('-is_pinned', '-modified') + + total_count = qs.count() + threads = list(qs[offset:offset + limit]) + + return { + "total_count": total_count, + "threads": [format_thread_summary(t) for t in threads] + } + + +def get_thread_messages( + corpus_slug: str, + thread_id: int, + flatten: bool = False +) -> dict: + """ + Retrieve all messages in a thread with hierarchical structure. 
+ + Args: + corpus_slug: Corpus identifier + thread_id: Thread identifier + flatten: If True, return flat list instead of tree + + Returns: + Dict with thread_id, title, and messages + """ + from django.core.exceptions import ObjectDoesNotExist + + from opencontractserver.conversations.models import ( + ChatMessage, + Conversation, + ConversationTypeChoices, + ) + from opencontractserver.corpuses.models import Corpus + + anonymous = AnonymousUser() + corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) + + thread = ( + Conversation.objects + .visible_to_user(anonymous) + .filter( + conversation_type=ConversationTypeChoices.THREAD, + chat_with_corpus=corpus, + id=thread_id + ) + .first() + ) + + if not thread: + raise ObjectDoesNotExist(f"Thread {thread_id} not found") + + if flatten: + messages = list( + ChatMessage.objects + .visible_to_user(anonymous) + .filter(conversation=thread) + .order_by('created_at') + ) + return { + "thread_id": str(thread.id), + "title": thread.title or "", + "messages": [format_message(m) for m in messages] + } + + # Build hierarchical structure with prefetch + root_messages = list( + ChatMessage.objects + .visible_to_user(anonymous) + .filter(conversation=thread, parent_message__isnull=True) + .prefetch_related('replies__replies') + .order_by('created_at') + ) + + return { + "thread_id": str(thread.id), + "title": thread.title or "", + "messages": [ + format_message_with_replies(m, anonymous) for m in root_messages + ] + } diff --git a/requirements/base.txt b/requirements/base.txt index bbe50c6d9..f8d257e11 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -76,6 +76,10 @@ posthog==3.8.3 # https://github.com/posthog/posthog-python # ------------------------------------------------------------------------------ jsonschema==4.25.1 +# Model Context Protocol +# ------------------------------------------------------------------------------ +mcp>=1.0.0 # https://github.com/modelcontextprotocol/python-sdk + # Not 
directly required, pinned by Snyk to avoid a vulnerability # ------------------------------------------------------------------------------ twisted>=24.7.0rc1 # not directly required, pinned by Snyk to avoid a vulnerability From 5fa50150c59ed176e3c8e52ca8b188faeebba49c Mon Sep 17 00:00:00 2001 From: JSv4 Date: Sun, 28 Dec 2025 19:30:27 -0500 Subject: [PATCH 4/6] Add SSE transport for HTTP-based MCP client access - Add ASGI routing for /mcp/* endpoints in config/asgi.py - Implement SSE transport handlers (GET /mcp/sse/, POST /mcp/messages/) - Refactor server.py to use factory pattern for cleaner initialization - Apply code formatting fixes from pre-commit hooks --- config/asgi.py | 23 +- config/settings/base.py | 12 +- opencontractserver/mcp/config.py | 4 +- opencontractserver/mcp/formatters.py | 26 +- opencontractserver/mcp/resources.py | 106 ++--- opencontractserver/mcp/server.py | 523 ++++++++++++++--------- opencontractserver/mcp/tests/test_mcp.py | 35 +- opencontractserver/mcp/tools.py | 143 +++---- 8 files changed, 491 insertions(+), 381 deletions(-) diff --git a/config/asgi.py b/config/asgi.py index 1185e54ea..7bd27a24f 100644 --- a/config/asgi.py +++ b/config/asgi.py @@ -41,6 +41,7 @@ from config.websocket.consumers.unified_agent_conversation import ( # noqa: E402 UnifiedAgentConsumer, ) +from opencontractserver.mcp.server import mcp_asgi_app # noqa: E402 logger = logging.getLogger(__name__) @@ -52,6 +53,26 @@ # This application object is used by any ASGI server configured to use this file. django_application = get_asgi_application() + +def create_http_router(django_app, mcp_app): + """ + Create an HTTP router that dispatches to MCP or Django based on path. + + Routes /mcp/* to the MCP ASGI app, everything else to Django. 
+ """ + + async def router(scope, receive, send): + path = scope.get("path", "") + if path.startswith("/mcp/"): + await mcp_app(scope, receive, send) + else: + await django_app(scope, receive, send) + + return router + + +http_application = create_http_router(django_application, mcp_asgi_app) + document_query_pattern = re_path( r"ws/document/(?P[-a-zA-Z0-9_=]+)/query/(?:corpus/(?P[-a-zA-Z0-9_=]+)/)?$", DocumentQueryConsumer.as_asgi(), @@ -119,7 +140,7 @@ # 4. URL routing application = ProtocolTypeRouter( { - "http": django_application, + "http": http_application, # Routes /mcp/* to MCP, rest to Django "websocket": websocket_auth_middleware(URLRouter(websocket_urlpatterns)), } ) diff --git a/config/settings/base.py b/config/settings/base.py index a4bf592c9..73f95fbcd 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -988,11 +988,11 @@ # ------------------------------------------------------------------------------ # See docs/mcp/mcp_interface_proposal.md for details MCP_SERVER = { - 'enabled': env.bool('MCP_SERVER_ENABLED', default=False), - 'max_results_per_page': env.int('MCP_MAX_RESULTS_PER_PAGE', default=100), - 'rate_limit': { - 'requests': env.int('MCP_RATE_LIMIT_REQUESTS', default=100), - 'window': env.int('MCP_RATE_LIMIT_WINDOW', default=60), + "enabled": env.bool("MCP_SERVER_ENABLED", default=False), + "max_results_per_page": env.int("MCP_MAX_RESULTS_PER_PAGE", default=100), + "rate_limit": { + "requests": env.int("MCP_RATE_LIMIT_REQUESTS", default=100), + "window": env.int("MCP_RATE_LIMIT_WINDOW", default=60), }, - 'cache_ttl': env.int('MCP_CACHE_TTL', default=300), + "cache_ttl": env.int("MCP_CACHE_TTL", default=300), } diff --git a/opencontractserver/mcp/config.py b/opencontractserver/mcp/config.py index 09bf2de6c..5829e9a34 100644 --- a/opencontractserver/mcp/config.py +++ b/opencontractserver/mcp/config.py @@ -16,7 +16,7 @@ CACHE_TTL = 300 # seconds # URI Pattern Constants -SLUG_PATTERN = re.compile(r'^[A-Za-z0-9-]+$') +SLUG_PATTERN 
= re.compile(r"^[A-Za-z0-9-]+$") def get_mcp_setting(key: str, default: Optional[Any] = None) -> Any: @@ -39,7 +39,7 @@ def get_mcp_setting(key: str, default: Optional[Any] = None) -> Any: """ from django.conf import settings - mcp_settings = getattr(settings, 'MCP_SERVER', {}) + mcp_settings = getattr(settings, "MCP_SERVER", {}) return mcp_settings.get(key, default) diff --git a/opencontractserver/mcp/formatters.py b/opencontractserver/mcp/formatters.py index 08cd3766c..0853569fd 100644 --- a/opencontractserver/mcp/formatters.py +++ b/opencontractserver/mcp/formatters.py @@ -1,4 +1,5 @@ """Response formatters for MCP resources and tools.""" + from __future__ import annotations from typing import TYPE_CHECKING @@ -10,18 +11,20 @@ from opencontractserver.documents.models import Document -def format_corpus_summary(corpus: "Corpus") -> dict: +def format_corpus_summary(corpus: Corpus) -> dict: """Format a corpus for list display.""" return { "slug": corpus.slug, "title": corpus.title, "description": corpus.description or "", - "document_count": corpus.document_count() if hasattr(corpus, 'document_count') else 0, + "document_count": ( + corpus.document_count() if hasattr(corpus, "document_count") else 0 + ), "created": corpus.created.isoformat() if corpus.created else None, } -def format_document_summary(document: "Document") -> dict: +def format_document_summary(document: Document) -> dict: """Format a document for list display.""" return { "slug": document.slug, @@ -33,7 +36,7 @@ def format_document_summary(document: "Document") -> dict: } -def format_annotation(annotation: "Annotation") -> dict: +def format_annotation(annotation: Annotation) -> dict: """Format an annotation for API response.""" label_data = None if annotation.annotation_label: @@ -53,13 +56,13 @@ def format_annotation(annotation: "Annotation") -> dict: } -def format_thread_summary(thread: "Conversation") -> dict: +def format_thread_summary(thread: Conversation) -> dict: """Format a thread for list 
display.""" return { "id": str(thread.id), "title": thread.title or "", "description": thread.description or "", - "message_count": getattr(thread, 'message_count', 0), + "message_count": getattr(thread, "message_count", 0), "is_pinned": thread.is_pinned, "is_locked": thread.is_locked, "created_at": thread.created.isoformat() if thread.created else None, @@ -67,7 +70,7 @@ def format_thread_summary(thread: "Conversation") -> dict: } -def format_message(message: "ChatMessage") -> dict: +def format_message(message: ChatMessage) -> dict: """Format a single message without replies.""" return { "id": str(message.id), @@ -80,10 +83,7 @@ def format_message(message: "ChatMessage") -> dict: def format_message_with_replies( - message: "ChatMessage", - user, - max_depth: int = 3, - current_depth: int = 0 + message: ChatMessage, user, max_depth: int = 3, current_depth: int = 0 ) -> dict: """ Format a message with its replies recursively. @@ -96,12 +96,12 @@ def format_message_with_replies( if current_depth >= max_depth: formatted["replies"] = [] formatted["has_more_replies"] = ( - message.replies.exists() if hasattr(message, 'replies') else False + message.replies.exists() if hasattr(message, "replies") else False ) return formatted # Access prefetched replies (no additional queries if prefetched) - replies = list(message.replies.all()) if hasattr(message, 'replies') else [] + replies = list(message.replies.all()) if hasattr(message, "replies") else [] formatted["replies"] = [ format_message_with_replies(reply, user, max_depth, current_depth + 1) diff --git a/opencontractserver/mcp/resources.py b/opencontractserver/mcp/resources.py index ca257b0c8..2645ea85d 100644 --- a/opencontractserver/mcp/resources.py +++ b/opencontractserver/mcp/resources.py @@ -2,6 +2,7 @@ Resources provide static content for context windows, representing specific entities. 
""" + from __future__ import annotations import json @@ -26,25 +27,29 @@ def get_corpus_resource(corpus_slug: str) -> str: if corpus.label_set: labels = [] for label in corpus.label_set.annotation_labels.all()[:20]: # Limit labels - labels.append({ - "text": label.text, - "color": label.color or "#000000", - "label_type": label.label_type, - }) + labels.append( + { + "text": label.text, + "color": label.color or "#000000", + "label_type": label.label_type, + } + ) label_set_data = { "title": corpus.label_set.title or "", "labels": labels, } - return json.dumps({ - "slug": corpus.slug, - "title": corpus.title, - "description": corpus.description or "", - "document_count": corpus.document_count(), - "created": corpus.created.isoformat() if corpus.created else None, - "modified": corpus.modified.isoformat() if corpus.modified else None, - "label_set": label_set_data, - }) + return json.dumps( + { + "slug": corpus.slug, + "title": corpus.title, + "description": corpus.description or "", + "document_count": corpus.document_count(), + "created": corpus.created.isoformat() if corpus.created else None, + "modified": corpus.modified.isoformat() if corpus.modified else None, + "label_set": label_set_data, + } + ) def get_document_resource(corpus_slug: str, document_slug: str) -> str: @@ -64,8 +69,7 @@ def get_document_resource(corpus_slug: str, document_slug: str) -> str: # Get document within corpus (both must be public) document = ( - Document.objects - .visible_to_user(anonymous) + Document.objects.visible_to_user(anonymous) .filter(corpuses=corpus, slug=document_slug) .first() ) @@ -79,28 +83,28 @@ def get_document_resource(corpus_slug: str, document_slug: str) -> str: full_text = "" if document.txt_extract_file: try: - with document.txt_extract_file.open('r') as f: + with document.txt_extract_file.open("r") as f: full_text = f.read() except Exception: full_text = "" - return json.dumps({ - "slug": document.slug, - "title": document.title or "", - "description": 
document.description or "", - "file_type": document.file_type or "application/pdf", - "page_count": document.page_count or 0, - "text_preview": full_text[:500] if full_text else "", - "full_text": full_text, - "created": document.created.isoformat() if document.created else None, - "corpus": corpus_slug, - }) + return json.dumps( + { + "slug": document.slug, + "title": document.title or "", + "description": document.description or "", + "file_type": document.file_type or "application/pdf", + "page_count": document.page_count or 0, + "text_preview": full_text[:500] if full_text else "", + "full_text": full_text, + "created": document.created.isoformat() if document.created else None, + "corpus": corpus_slug, + } + ) def get_annotation_resource( - corpus_slug: str, - document_slug: str, - annotation_id: int + corpus_slug: str, document_slug: str, annotation_id: int ) -> str: """ Get annotation resource content. @@ -122,9 +126,7 @@ def get_annotation_resource( # Use query optimizer for efficient permission checking annotations = AnnotationQueryOptimizer.get_document_annotations( - document_id=document.id, - user=anonymous, - corpus_id=corpus.id + document_id=document.id, user=anonymous, corpus_id=corpus.id ) annotation = annotations.get(id=annotation_id) @@ -138,21 +140,21 @@ def get_annotation_resource( "label_type": annotation.annotation_label.label_type, } - return json.dumps({ - "id": str(annotation.id), - "page": annotation.page, - "raw_text": annotation.raw_text or "", - "annotation_label": label_data, - "bounding_box": annotation.bounding_box, - "structural": annotation.structural, - "created": annotation.created.isoformat() if annotation.created else None, - }) + return json.dumps( + { + "id": str(annotation.id), + "page": annotation.page, + "raw_text": annotation.raw_text or "", + "annotation_label": label_data, + "bounding_box": annotation.bounding_box, + "structural": annotation.structural, + "created": annotation.created.isoformat() if annotation.created 
else None, + } + ) def get_thread_resource( - corpus_slug: str, - thread_id: int, - include_messages: bool = True + corpus_slug: str, thread_id: int, include_messages: bool = True ) -> str: """ Get thread resource content. @@ -174,12 +176,11 @@ def get_thread_resource( # Get public thread in this corpus thread = ( - Conversation.objects - .visible_to_user(anonymous) + Conversation.objects.visible_to_user(anonymous) .filter( conversation_type=ConversationTypeChoices.THREAD, chat_with_corpus=corpus, - id=thread_id + id=thread_id, ) .first() ) @@ -201,11 +202,10 @@ def get_thread_resource( if include_messages: # Build hierarchical message structure with prefetch messages = list( - ChatMessage.objects - .visible_to_user(anonymous) + ChatMessage.objects.visible_to_user(anonymous) .filter(conversation=thread, parent_message__isnull=True) - .prefetch_related('replies__replies') - .order_by('created_at') + .prefetch_related("replies__replies") + .order_by("created_at") ) data["messages"] = [ format_message_with_replies(msg, anonymous) for msg in messages diff --git a/opencontractserver/mcp/server.py b/opencontractserver/mcp/server.py index c3398c11e..cf3763a48 100644 --- a/opencontractserver/mcp/server.py +++ b/opencontractserver/mcp/server.py @@ -2,16 +2,19 @@ OpenContracts MCP Server. Model Context Protocol server providing read-only access to public OpenContracts resources. +Supports both SSE transport (for HTTP) and stdio transport (for CLI). 
""" + from __future__ import annotations import asyncio import json +import logging import re -from typing import Optional from asgiref.sync import sync_to_async from mcp.server import Server +from mcp.server.sse import SseServerTransport from mcp.server.stdio import stdio_server from mcp.types import Resource, TextContent, Tool @@ -31,255 +34,381 @@ search_corpus, ) +logger = logging.getLogger(__name__) + class URIParser: """Parse MCP resource URIs safely using regex patterns.""" # Slug pattern: alphanumeric and hyphens only - SLUG_PATTERN = r'[A-Za-z0-9\-]+' + SLUG_PATTERN = r"[A-Za-z0-9\-]+" PATTERNS = { - 'corpus': re.compile(rf'^corpus://({SLUG_PATTERN})$'), - 'document': re.compile(rf'^document://({SLUG_PATTERN})/({SLUG_PATTERN})$'), - 'annotation': re.compile( - rf'^annotation://({SLUG_PATTERN})/({SLUG_PATTERN})/(\d+)$' + "corpus": re.compile(rf"^corpus://({SLUG_PATTERN})$"), + "document": re.compile(rf"^document://({SLUG_PATTERN})/({SLUG_PATTERN})$"), + "annotation": re.compile( + rf"^annotation://({SLUG_PATTERN})/({SLUG_PATTERN})/(\d+)$" ), - 'thread': re.compile(rf'^thread://({SLUG_PATTERN})/threads/(\d+)$'), + "thread": re.compile(rf"^thread://({SLUG_PATTERN})/threads/(\d+)$"), } @classmethod - def parse_corpus(cls, uri: str) -> Optional[str]: + def parse_corpus(cls, uri: str) -> str | None: """Parse corpus URI, returns corpus_slug or None.""" - match = cls.PATTERNS['corpus'].match(uri) + match = cls.PATTERNS["corpus"].match(uri) return match.group(1) if match else None @classmethod - def parse_document(cls, uri: str) -> Optional[tuple[str, str]]: + def parse_document(cls, uri: str) -> tuple[str, str] | None: """Parse document URI, returns (corpus_slug, document_slug) or None.""" - match = cls.PATTERNS['document'].match(uri) + match = cls.PATTERNS["document"].match(uri) return (match.group(1), match.group(2)) if match else None @classmethod - def parse_annotation(cls, uri: str) -> Optional[tuple[str, str, int]]: + def parse_annotation(cls, uri: str) -> 
tuple[str, str, int] | None: """Parse annotation URI, returns (corpus_slug, document_slug, annotation_id) or None.""" - match = cls.PATTERNS['annotation'].match(uri) + match = cls.PATTERNS["annotation"].match(uri) return (match.group(1), match.group(2), int(match.group(3))) if match else None @classmethod - def parse_thread(cls, uri: str) -> Optional[tuple[str, int]]: + def parse_thread(cls, uri: str) -> tuple[str, int] | None: """Parse thread URI, returns (corpus_slug, thread_id) or None.""" - match = cls.PATTERNS['thread'].match(uri) + match = cls.PATTERNS["thread"].match(uri) return (match.group(1), int(match.group(2))) if match else None -# Initialize MCP server -mcp_server = Server("opencontracts") - - -@mcp_server.list_resources() -async def list_resources() -> list[Resource]: - """List available resource patterns.""" - return [ - Resource( - uri="corpus://{corpus_slug}", - name="Public Corpus", - description="Access public corpus metadata and contents", - mimeType="application/json" - ), - Resource( - uri="document://{corpus_slug}/{document_slug}", - name="Public Document", - description="Access public document with extracted text", - mimeType="application/json" - ), - Resource( - uri="annotation://{corpus_slug}/{document_slug}/{annotation_id}", - name="Document Annotation", - description="Access specific annotation on a document", - mimeType="application/json" - ), - Resource( - uri="thread://{corpus_slug}/threads/{thread_id}", - name="Discussion Thread", - description="Access public discussion thread with messages", - mimeType="application/json" - ) - ] - - -@mcp_server.read_resource() -async def read_resource(uri: str) -> str: - """Resolve resource URI and return content.""" - # Try corpus URI - corpus_slug = URIParser.parse_corpus(uri) - if corpus_slug: - return await sync_to_async(get_corpus_resource)(corpus_slug) - - # Try document URI - doc_parts = URIParser.parse_document(uri) - if doc_parts: - corpus_slug, document_slug = doc_parts - return await 
sync_to_async(get_document_resource)(corpus_slug, document_slug) - - # Try annotation URI - ann_parts = URIParser.parse_annotation(uri) - if ann_parts: - corpus_slug, document_slug, annotation_id = ann_parts - return await sync_to_async(get_annotation_resource)( - corpus_slug, document_slug, annotation_id - ) - - # Try thread URI - thread_parts = URIParser.parse_thread(uri) - if thread_parts: - corpus_slug, thread_id = thread_parts - return await sync_to_async(get_thread_resource)(corpus_slug, thread_id) - - raise ValueError(f"Invalid or unrecognized resource URI: {uri}") - - -@mcp_server.list_tools() -async def list_tools() -> list[Tool]: - """List available tools.""" - return [ - Tool( - name="list_public_corpuses", - description="List all publicly accessible corpuses", - inputSchema={ - "type": "object", - "properties": { - "limit": {"type": "integer", "default": 20, "description": "Max results (1-100)"}, - "offset": {"type": "integer", "default": 0, "description": "Pagination offset"}, - "search": {"type": "string", "default": "", "description": "Search filter"} - } - } - ), - Tool( - name="list_documents", - description="List documents in a corpus", - inputSchema={ - "type": "object", - "properties": { - "corpus_slug": {"type": "string", "description": "Corpus identifier"}, - "limit": {"type": "integer", "default": 50}, - "offset": {"type": "integer", "default": 0}, - "search": {"type": "string", "default": ""} +def create_mcp_server() -> Server: + """Create and configure the MCP server instance.""" + mcp_server = Server("opencontracts") + + @mcp_server.list_resources() + async def list_resources() -> list[Resource]: + """List available resource patterns.""" + return [ + Resource( + uri="corpus://{corpus_slug}", + name="Public Corpus", + description="Access public corpus metadata and contents", + mimeType="application/json", + ), + Resource( + uri="document://{corpus_slug}/{document_slug}", + name="Public Document", + description="Access public document with 
extracted text", + mimeType="application/json", + ), + Resource( + uri="annotation://{corpus_slug}/{document_slug}/{annotation_id}", + name="Document Annotation", + description="Access specific annotation on a document", + mimeType="application/json", + ), + Resource( + uri="thread://{corpus_slug}/threads/{thread_id}", + name="Discussion Thread", + description="Access public discussion thread with messages", + mimeType="application/json", + ), + ] + + @mcp_server.read_resource() + async def read_resource(uri: str) -> str: + """Resolve resource URI and return content.""" + # Try corpus URI + corpus_slug = URIParser.parse_corpus(uri) + if corpus_slug: + return await sync_to_async(get_corpus_resource)(corpus_slug) + + # Try document URI + doc_parts = URIParser.parse_document(uri) + if doc_parts: + corpus_slug, document_slug = doc_parts + return await sync_to_async(get_document_resource)( + corpus_slug, document_slug + ) + + # Try annotation URI + ann_parts = URIParser.parse_annotation(uri) + if ann_parts: + corpus_slug, document_slug, annotation_id = ann_parts + return await sync_to_async(get_annotation_resource)( + corpus_slug, document_slug, annotation_id + ) + + # Try thread URI + thread_parts = URIParser.parse_thread(uri) + if thread_parts: + corpus_slug, thread_id = thread_parts + return await sync_to_async(get_thread_resource)(corpus_slug, thread_id) + + raise ValueError(f"Invalid or unrecognized resource URI: {uri}") + + @mcp_server.list_tools() + async def list_tools() -> list[Tool]: + """List available tools.""" + return [ + Tool( + name="list_public_corpuses", + description="List all publicly accessible corpuses", + inputSchema={ + "type": "object", + "properties": { + "limit": { + "type": "integer", + "default": 20, + "description": "Max results (1-100)", + }, + "offset": { + "type": "integer", + "default": 0, + "description": "Pagination offset", + }, + "search": { + "type": "string", + "default": "", + "description": "Search filter", + }, + }, }, - 
"required": ["corpus_slug"] - } - ), - Tool( - name="get_document_text", - description="Get full extracted text from a document", - inputSchema={ - "type": "object", - "properties": { - "corpus_slug": {"type": "string", "description": "Corpus identifier"}, - "document_slug": {"type": "string", "description": "Document identifier"} + ), + Tool( + name="list_documents", + description="List documents in a corpus", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": { + "type": "string", + "description": "Corpus identifier", + }, + "limit": {"type": "integer", "default": 50}, + "offset": {"type": "integer", "default": 0}, + "search": {"type": "string", "default": ""}, + }, + "required": ["corpus_slug"], }, - "required": ["corpus_slug", "document_slug"] - } - ), - Tool( - name="list_annotations", - description="List annotations on a document", - inputSchema={ - "type": "object", - "properties": { - "corpus_slug": {"type": "string"}, - "document_slug": {"type": "string"}, - "page": {"type": "integer", "description": "Filter to page number"}, - "label_text": {"type": "string", "description": "Filter by label text"}, - "limit": {"type": "integer", "default": 100}, - "offset": {"type": "integer", "default": 0} + ), + Tool( + name="get_document_text", + description="Get full extracted text from a document", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": { + "type": "string", + "description": "Corpus identifier", + }, + "document_slug": { + "type": "string", + "description": "Document identifier", + }, + }, + "required": ["corpus_slug", "document_slug"], }, - "required": ["corpus_slug", "document_slug"] - } - ), - Tool( - name="search_corpus", - description="Semantic search within a corpus", - inputSchema={ - "type": "object", - "properties": { - "corpus_slug": {"type": "string"}, - "query": {"type": "string", "description": "Search query"}, - "limit": {"type": "integer", "default": 10} + ), + Tool( + name="list_annotations", + 
description="List annotations on a document", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": {"type": "string"}, + "document_slug": {"type": "string"}, + "page": { + "type": "integer", + "description": "Filter to page number", + }, + "label_text": { + "type": "string", + "description": "Filter by label text", + }, + "limit": {"type": "integer", "default": 100}, + "offset": {"type": "integer", "default": 0}, + }, + "required": ["corpus_slug", "document_slug"], }, - "required": ["corpus_slug", "query"] - } - ), - Tool( - name="list_threads", - description="List discussion threads in a corpus", - inputSchema={ - "type": "object", - "properties": { - "corpus_slug": {"type": "string"}, - "document_slug": {"type": "string", "description": "Optional document filter"}, - "limit": {"type": "integer", "default": 20}, - "offset": {"type": "integer", "default": 0} + ), + Tool( + name="search_corpus", + description="Semantic search within a corpus", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": {"type": "string"}, + "query": {"type": "string", "description": "Search query"}, + "limit": {"type": "integer", "default": 10}, + }, + "required": ["corpus_slug", "query"], }, - "required": ["corpus_slug"] - } - ), - Tool( - name="get_thread_messages", - description="Get messages in a thread", - inputSchema={ - "type": "object", - "properties": { - "corpus_slug": {"type": "string"}, - "thread_id": {"type": "integer"}, - "flatten": {"type": "boolean", "default": False, "description": "Return flat list"} + ), + Tool( + name="list_threads", + description="List discussion threads in a corpus", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": {"type": "string"}, + "document_slug": { + "type": "string", + "description": "Optional document filter", + }, + "limit": {"type": "integer", "default": 20}, + "offset": {"type": "integer", "default": 0}, + }, + "required": ["corpus_slug"], }, - "required": ["corpus_slug", 
"thread_id"] - } - ), - ] - - -# Map tool names to implementations -TOOL_HANDLERS = { - "list_public_corpuses": list_public_corpuses, - "list_documents": list_documents, - "get_document_text": get_document_text, - "list_annotations": list_annotations, - "search_corpus": search_corpus, - "list_threads": list_threads, - "get_thread_messages": get_thread_messages, -} + ), + Tool( + name="get_thread_messages", + description="Get messages in a thread", + inputSchema={ + "type": "object", + "properties": { + "corpus_slug": {"type": "string"}, + "thread_id": {"type": "integer"}, + "flatten": { + "type": "boolean", + "default": False, + "description": "Return flat list", + }, + }, + "required": ["corpus_slug", "thread_id"], + }, + ), + ] + + # Map tool names to implementations + TOOL_HANDLERS = { + "list_public_corpuses": list_public_corpuses, + "list_documents": list_documents, + "get_document_text": get_document_text, + "list_annotations": list_annotations, + "search_corpus": search_corpus, + "list_threads": list_threads, + "get_thread_messages": get_thread_messages, + } + @mcp_server.call_tool() + async def call_tool(name: str, arguments: dict) -> list[TextContent]: + """Execute tool and return results.""" + handler = TOOL_HANDLERS.get(name) + if not handler: + raise ValueError(f"Unknown tool: {name}") + + # Run synchronous Django ORM handlers in thread pool + result = await sync_to_async(handler)(**arguments) + + return [TextContent(type="text", text=json.dumps(result, indent=2))] + + return mcp_server + + +# Create the global MCP server instance +mcp_server = create_mcp_server() + +# Create SSE transport for HTTP access +# The endpoint is where clients POST messages (relative to the SSE connection) +sse_transport = SseServerTransport("/mcp/messages/") + + +async def handle_sse(scope, receive, send): + """ + ASGI handler for SSE connections (GET /mcp/sse/). + + This establishes the SSE stream for server-to-client messages. 
+ """ + logger.info("MCP SSE connection initiated") + try: + async with sse_transport.connect_sse(scope, receive, send) as streams: + await mcp_server.run( + streams[0], # read_stream + streams[1], # write_stream + mcp_server.create_initialization_options(), + ) + except Exception as e: + logger.error(f"MCP SSE error: {e}") + raise + + +async def handle_messages(scope, receive, send): + """ + ASGI handler for client messages (POST /mcp/messages/). + + This receives client requests and routes them to the appropriate session. + """ + logger.debug("MCP message received") + await sse_transport.handle_post_message(scope, receive, send) + + +def create_mcp_asgi_app(): + """ + Create an ASGI application that routes MCP requests. + + Routes: + GET /mcp/sse/ - Establish SSE connection + POST /mcp/messages/ - Send messages to server + """ + + async def app(scope, receive, send): + if scope["type"] != "http": + return + + path = scope.get("path", "") + method = scope.get("method", "GET") + + if path == "/mcp/sse/" and method == "GET": + await handle_sse(scope, receive, send) + elif path == "/mcp/messages/" and method == "POST": + await handle_messages(scope, receive, send) + else: + # Return 404 for unhandled paths + await send( + { + "type": "http.response.start", + "status": 404, + "headers": [[b"content-type", b"application/json"]], + } + ) + await send( + { + "type": "http.response.body", + "body": json.dumps( + { + "error": "Not found", + "endpoints": { + "sse": "GET /mcp/sse/", + "messages": "POST /mcp/messages/", + }, + } + ).encode(), + } + ) -@mcp_server.call_tool() -async def call_tool(name: str, arguments: dict) -> list[TextContent]: - """Execute tool and return results.""" - handler = TOOL_HANDLERS.get(name) - if not handler: - raise ValueError(f"Unknown tool: {name}") + return app - # Run synchronous Django ORM handlers in thread pool - result = await sync_to_async(handler)(**arguments) - return [TextContent(type="text", text=json.dumps(result, indent=2))] +# 
ASGI application for mounting in Django +mcp_asgi_app = create_mcp_asgi_app() async def main(): - """Run MCP server.""" + """Run MCP server with stdio transport (for CLI usage).""" async with stdio_server() as streams: await mcp_server.run( streams[0], # read_stream streams[1], # write_stream - mcp_server.create_initialization_options() + mcp_server.create_initialization_options(), ) if __name__ == "__main__": # Setup Django before running import os + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.local") import django + django.setup() asyncio.run(main()) diff --git a/opencontractserver/mcp/tests/test_mcp.py b/opencontractserver/mcp/tests/test_mcp.py index 65883bbab..36b7f882e 100644 --- a/opencontractserver/mcp/tests/test_mcp.py +++ b/opencontractserver/mcp/tests/test_mcp.py @@ -1,4 +1,5 @@ """Tests for MCP server functionality.""" + import json from django.contrib.auth import get_user_model @@ -7,7 +8,6 @@ from opencontractserver.corpuses.models import Corpus - User = get_user_model() @@ -30,7 +30,9 @@ def test_parse_corpus_uri(self): self.assertIsNone(URIParser.parse_corpus("corpus://")) self.assertIsNone(URIParser.parse_corpus("document://my-corpus")) self.assertIsNone(URIParser.parse_corpus("corpus://my corpus")) # space invalid - self.assertIsNone(URIParser.parse_corpus("corpus://my_corpus")) # underscore invalid + self.assertIsNone( + URIParser.parse_corpus("corpus://my_corpus") + ) # underscore invalid def test_parse_document_uri(self): """Test parsing document URIs.""" @@ -128,9 +130,7 @@ class MCPResourcesTest(TestCase): def setUpTestData(cls): """Create test data.""" cls.owner = User.objects.create_user( - username="testowner", - email="owner@test.com", - password="testpass123" + username="testowner", email="owner@test.com", password="testpass123" ) # Create public corpus @@ -138,7 +138,7 @@ def setUpTestData(cls): title="Public Test Corpus", description="A public test corpus", creator=cls.owner, - is_public=True + is_public=True, ) # 
Create private corpus @@ -146,7 +146,7 @@ def setUpTestData(cls): title="Private Test Corpus", description="A private test corpus", creator=cls.owner, - is_public=False + is_public=False, ) def test_get_public_corpus_resource(self): @@ -175,9 +175,7 @@ class MCPToolsTest(TestCase): def setUpTestData(cls): """Create test data.""" cls.owner = User.objects.create_user( - username="toolsowner", - email="tools@test.com", - password="testpass123" + username="toolsowner", email="tools@test.com", password="testpass123" ) # Create public corpuses @@ -185,20 +183,18 @@ def setUpTestData(cls): title="Corpus One", description="First corpus", creator=cls.owner, - is_public=True + is_public=True, ) cls.corpus2 = Corpus.objects.create( title="Corpus Two", description="Second corpus", creator=cls.owner, - is_public=True + is_public=True, ) # Create private corpus (should not appear) cls.private = Corpus.objects.create( - title="Private Corpus", - creator=cls.owner, - is_public=False + title="Private Corpus", creator=cls.owner, is_public=False ) def test_list_public_corpuses(self): @@ -238,8 +234,7 @@ def test_list_public_corpuses_pagination(self): # Different results self.assertNotEqual( - result["corpuses"][0]["slug"], - result2["corpuses"][0]["slug"] + result["corpuses"][0]["slug"], result2["corpuses"][0]["slug"] ) def test_list_public_corpuses_max_limit(self): @@ -258,16 +253,14 @@ class MCPFormattersTest(TestCase): @classmethod def setUpTestData(cls): cls.owner = User.objects.create_user( - username="formatowner", - email="format@test.com", - password="testpass123" + username="formatowner", email="format@test.com", password="testpass123" ) cls.corpus = Corpus.objects.create( title="Format Test Corpus", description="Testing formatters", creator=cls.owner, - is_public=True + is_public=True, ) def test_format_corpus_summary(self): diff --git a/opencontractserver/mcp/tools.py b/opencontractserver/mcp/tools.py index b2080ecf4..49de7479d 100644 --- a/opencontractserver/mcp/tools.py 
+++ b/opencontractserver/mcp/tools.py @@ -2,6 +2,7 @@ Tools provide dynamic operations - they execute queries and return results. """ + from __future__ import annotations from django.contrib.auth.models import AnonymousUser @@ -17,11 +18,7 @@ ) -def list_public_corpuses( - limit: int = 20, - offset: int = 0, - search: str = "" -) -> dict: +def list_public_corpuses(limit: int = 20, offset: int = 0, search: str = "") -> dict: """ List public corpuses visible to anonymous users. @@ -42,24 +39,19 @@ def list_public_corpuses( qs = Corpus.objects.visible_to_user(anonymous) if search: - qs = qs.filter( - Q(title__icontains=search) | Q(description__icontains=search) - ) + qs = qs.filter(Q(title__icontains=search) | Q(description__icontains=search)) total_count = qs.count() - corpuses = list(qs[offset:offset + limit]) + corpuses = list(qs[offset : offset + limit]) return { "total_count": total_count, - "corpuses": [format_corpus_summary(c) for c in corpuses] + "corpuses": [format_corpus_summary(c) for c in corpuses], } def list_documents( - corpus_slug: str, - limit: int = 50, - offset: int = 0, - search: str = "" + corpus_slug: str, limit: int = 50, offset: int = 0, search: str = "" ) -> dict: """ List documents in a public corpus. 
@@ -83,23 +75,17 @@ def list_documents( corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) # Get public documents in this corpus - qs = ( - Document.objects - .visible_to_user(anonymous) - .filter(corpuses=corpus) - ) + qs = Document.objects.visible_to_user(anonymous).filter(corpuses=corpus) if search: - qs = qs.filter( - Q(title__icontains=search) | Q(description__icontains=search) - ) + qs = qs.filter(Q(title__icontains=search) | Q(description__icontains=search)) total_count = qs.count() - documents = list(qs[offset:offset + limit]) + documents = list(qs[offset : offset + limit]) return { "total_count": total_count, - "documents": [format_document_summary(d) for d in documents] + "documents": [format_document_summary(d) for d in documents], } @@ -120,16 +106,14 @@ def get_document_text(corpus_slug: str, document_slug: str) -> dict: anonymous = AnonymousUser() corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) - document = ( - Document.objects - .visible_to_user(anonymous) - .get(corpuses=corpus, slug=document_slug) + document = Document.objects.visible_to_user(anonymous).get( + corpuses=corpus, slug=document_slug ) full_text = "" if document.txt_extract_file: try: - with document.txt_extract_file.open('r') as f: + with document.txt_extract_file.open("r") as f: full_text = f.read() except Exception: full_text = "" @@ -137,7 +121,7 @@ def get_document_text(corpus_slug: str, document_slug: str) -> dict: return { "document_slug": document.slug, "page_count": document.page_count or 0, - "text": full_text + "text": full_text, } @@ -147,7 +131,7 @@ def list_annotations( page: int | None = None, label_text: str | None = None, limit: int = 100, - offset: int = 0 + offset: int = 0, ) -> dict: """ List annotations on a document with optional filtering. 
@@ -177,9 +161,7 @@ def list_annotations( # Use query optimizer - eliminates N+1 permission queries qs = AnnotationQueryOptimizer.get_document_annotations( - document_id=document.id, - user=anonymous, - corpus_id=corpus.id + document_id=document.id, user=anonymous, corpus_id=corpus.id ) # Apply filters @@ -190,19 +172,15 @@ def list_annotations( qs = qs.filter(annotation_label__text=label_text) total_count = qs.count() - annotations = list(qs.select_related('annotation_label')[offset:offset + limit]) + annotations = list(qs.select_related("annotation_label")[offset : offset + limit]) return { "total_count": total_count, - "annotations": [format_annotation(a) for a in annotations] + "annotations": [format_annotation(a) for a in annotations], } -def search_corpus( - corpus_slug: str, - query: str, - limit: int = 10 -) -> dict: +def search_corpus(corpus_slug: str, query: str, limit: int = 10) -> dict: """ Semantic search within a corpus using vector embeddings. @@ -231,20 +209,21 @@ def search_corpus( if query_vector: # Search documents using vector similarity doc_results = list( - Document.objects - .visible_to_user(anonymous) + Document.objects.visible_to_user(anonymous) .filter(corpuses=corpus) .search_by_embedding(query_vector, embedder_path, top_k=limit) ) results = [] for doc in doc_results: - results.append({ - "type": "document", - "slug": doc.slug, - "title": doc.title or "", - "similarity_score": float(getattr(doc, 'similarity_score', 0)), - }) + results.append( + { + "type": "document", + "slug": doc.slug, + "title": doc.title or "", + "similarity_score": float(getattr(doc, "similarity_score", 0)), + } + ) return {"query": query, "results": results} except Exception: @@ -259,30 +238,27 @@ def _text_search_fallback(corpus, query: str, limit: int, user) -> dict: from opencontractserver.documents.models import Document documents = list( - Document.objects - .visible_to_user(user) + Document.objects.visible_to_user(user) .filter(corpuses=corpus) - 
.filter(Q(title__icontains=query) | Q(description__icontains=query)) - [:limit] + .filter(Q(title__icontains=query) | Q(description__icontains=query))[:limit] ) results = [] for doc in documents: - results.append({ - "type": "document", - "slug": doc.slug, - "title": doc.title or "", - "similarity_score": None, - }) + results.append( + { + "type": "document", + "slug": doc.slug, + "title": doc.title or "", + "similarity_score": None, + } + ) return {"query": query, "results": results} def list_threads( - corpus_slug: str, - document_slug: str | None = None, - limit: int = 20, - offset: int = 0 + corpus_slug: str, document_slug: str | None = None, limit: int = 20, offset: int = 0 ) -> dict: """ List discussion threads in a corpus or document. @@ -308,13 +284,11 @@ def list_threads( corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) qs = ( - Conversation.objects - .visible_to_user(anonymous) + Conversation.objects.visible_to_user(anonymous) .filter( - conversation_type=ConversationTypeChoices.THREAD, - chat_with_corpus=corpus + conversation_type=ConversationTypeChoices.THREAD, chat_with_corpus=corpus ) - .annotate(message_count=Count('messages')) + .annotate(message_count=Count("messages")) ) if document_slug: @@ -324,21 +298,19 @@ def list_threads( qs = qs.filter(chat_with_document=document) # Order by pinned first, then recent activity - qs = qs.order_by('-is_pinned', '-modified') + qs = qs.order_by("-is_pinned", "-modified") total_count = qs.count() - threads = list(qs[offset:offset + limit]) + threads = list(qs[offset : offset + limit]) return { "total_count": total_count, - "threads": [format_thread_summary(t) for t in threads] + "threads": [format_thread_summary(t) for t in threads], } def get_thread_messages( - corpus_slug: str, - thread_id: int, - flatten: bool = False + corpus_slug: str, thread_id: int, flatten: bool = False ) -> dict: """ Retrieve all messages in a thread with hierarchical structure. 
@@ -364,12 +336,11 @@ def get_thread_messages( corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) thread = ( - Conversation.objects - .visible_to_user(anonymous) + Conversation.objects.visible_to_user(anonymous) .filter( conversation_type=ConversationTypeChoices.THREAD, chat_with_corpus=corpus, - id=thread_id + id=thread_id, ) .first() ) @@ -379,30 +350,26 @@ def get_thread_messages( if flatten: messages = list( - ChatMessage.objects - .visible_to_user(anonymous) + ChatMessage.objects.visible_to_user(anonymous) .filter(conversation=thread) - .order_by('created_at') + .order_by("created_at") ) return { "thread_id": str(thread.id), "title": thread.title or "", - "messages": [format_message(m) for m in messages] + "messages": [format_message(m) for m in messages], } # Build hierarchical structure with prefetch root_messages = list( - ChatMessage.objects - .visible_to_user(anonymous) + ChatMessage.objects.visible_to_user(anonymous) .filter(conversation=thread, parent_message__isnull=True) - .prefetch_related('replies__replies') - .order_by('created_at') + .prefetch_related("replies__replies") + .order_by("created_at") ) return { "thread_id": str(thread.id), "title": thread.title or "", - "messages": [ - format_message_with_replies(m, anonymous) for m in root_messages - ] + "messages": [format_message_with_replies(m, anonymous) for m in root_messages], } From aebc80765e76c68d5c982e0df52740bef0ac0ea7 Mon Sep 17 00:00:00 2001 From: JSv4 Date: Sun, 28 Dec 2025 23:22:12 -0500 Subject: [PATCH 5/6] Add MCP documentation and finalize server implementation - Add docs/mcp/README.md with getting started guide, tool/resource reference, transport options, and architecture overview - Remove proposal document (replaced by actual documentation) - Switch to Streamable HTTP transport (stateless mode) for reliability - Use DocumentPath for corpus document membership (source of truth) - Add CorpusSettings embed model configuration UI --- config/asgi.py | 6 +- 
docs/mcp/README.md | 174 ++ docs/mcp/mcp_interface_proposal.md | 1457 ----------------- .../components/corpuses/CorpusSettings.tsx | 72 +- frontend/src/graphql/queries.ts | 9 + opencontractserver/mcp/resources.py | 10 +- opencontractserver/mcp/server.py | 113 +- opencontractserver/mcp/tools.py | 29 +- 8 files changed, 338 insertions(+), 1532 deletions(-) create mode 100644 docs/mcp/README.md delete mode 100644 docs/mcp/mcp_interface_proposal.md diff --git a/config/asgi.py b/config/asgi.py index 7bd27a24f..557cbc18c 100644 --- a/config/asgi.py +++ b/config/asgi.py @@ -58,12 +58,14 @@ def create_http_router(django_app, mcp_app): """ Create an HTTP router that dispatches to MCP or Django based on path. - Routes /mcp/* to the MCP ASGI app, everything else to Django. + Routes /mcp and /mcp/* to the MCP ASGI app, everything else to Django. + The MCP server uses Streamable HTTP transport in stateless mode. """ async def router(scope, receive, send): path = scope.get("path", "") - if path.startswith("/mcp/"): + # Match /mcp exactly or /mcp/* paths + if path == "/mcp" or path.startswith("/mcp/"): await mcp_app(scope, receive, send) else: await django_app(scope, receive, send) diff --git a/docs/mcp/README.md b/docs/mcp/README.md new file mode 100644 index 000000000..6f45adda2 --- /dev/null +++ b/docs/mcp/README.md @@ -0,0 +1,174 @@ +# OpenContracts MCP Server + +## TL;DR + +OpenContracts exposes a read-only [Model Context Protocol (MCP)](https://modelcontextprotocol.io/) server for AI assistants to access **public** corpuses, documents, annotations, and discussion threads. 
+ +- **Endpoint**: `POST /mcp/` (Streamable HTTP, stateless) +- **Scope**: Public resources only (anonymous user visibility) +- **Auth**: None required (public data only) + +### Claude Desktop Quick Start + +Add to `~/.config/Claude/claude_desktop_config.json`: + +```json +{ + "mcpServers": { + "opencontracts": { + "command": "npx", + "args": [ + "mcp-remote", + "https://your-instance.com/mcp/" + ] + } + } +} +``` + +--- + +## Available Tools + +| Tool | Description | +|------|-------------| +| `list_public_corpuses` | List all public corpuses (paginated, searchable) | +| `list_documents` | List documents in a corpus | +| `get_document_text` | Get full extracted text from a document | +| `list_annotations` | List annotations on a document (filter by page/label) | +| `search_corpus` | Semantic vector search within a corpus | +| `list_threads` | List discussion threads in a corpus | +| `get_thread_messages` | Get messages in a thread (flat or hierarchical) | + +## Available Resources + +Resources use URI patterns for direct access: + +| URI Pattern | Description | +|-------------|-------------| +| `corpus://{corpus_slug}` | Corpus metadata and document list | +| `document://{corpus_slug}/{document_slug}` | Document with extracted text | +| `annotation://{corpus_slug}/{document_slug}/{annotation_id}` | Specific annotation | +| `thread://{corpus_slug}/threads/{thread_id}` | Thread with messages | + +--- + +## Transport Options + +### HTTP (Streamable HTTP) + +The primary transport. Stateless mode - each request is independent. 
+ +```bash +# Test with curl +curl -X POST https://your-instance.com/mcp/ \ + -H "Content-Type: application/json" \ + -d '{"jsonrpc": "2.0", "method": "tools/list", "id": 1}' +``` + +### stdio (CLI) + +For local development or direct integration: + +```bash +cd /path/to/OpenContracts +python -m opencontractserver.mcp.server +``` + +--- + +## Example Usage + +### List Public Corpuses + +```json +{ + "jsonrpc": "2.0", + "method": "tools/call", + "params": { + "name": "list_public_corpuses", + "arguments": {"limit": 10} + }, + "id": 1 +} +``` + +### Semantic Search + +```json +{ + "jsonrpc": "2.0", + "method": "tools/call", + "params": { + "name": "search_corpus", + "arguments": { + "corpus_slug": "my-corpus", + "query": "indemnification clause", + "limit": 5 + } + }, + "id": 2 +} +``` + +### Read Resource + +```json +{ + "jsonrpc": "2.0", + "method": "resources/read", + "params": { + "uri": "document://my-corpus/contract-2024" + }, + "id": 3 +} +``` + +--- + +## Architecture + +``` +┌─────────────────┐ POST /mcp/ ┌──────────────────────┐ +│ MCP Client │ ◄────────────────► │ StreamableHTTP │ +│ (Claude, etc) │ JSON-RPC 2.0 │ Session Manager │ +└─────────────────┘ │ (stateless mode) │ + └──────────┬───────────┘ + │ + ┌──────────▼───────────┐ + │ MCP Server │ + │ - Tools (7) │ + │ - Resources (4) │ + └──────────┬───────────┘ + │ + ┌──────────▼───────────┐ + │ Django ORM │ + │ visible_to_user() │ + │ (AnonymousUser) │ + └──────────────────────┘ +``` + +**Key files**: +- `opencontractserver/mcp/server.py` - Server setup, ASGI app, URI parsing +- `opencontractserver/mcp/tools.py` - Tool implementations +- `opencontractserver/mcp/resources.py` - Resource handlers +- `opencontractserver/mcp/formatters.py` - Response formatters +- `config/asgi.py` - HTTP routing (`/mcp/*` → MCP app) + +--- + +## Security Model + +- **Read-only**: No mutations, no writes +- **Public only**: Uses `AnonymousUser` for all permission checks +- **Slug-based**: All identifiers are URL-safe slugs (no 
internal IDs exposed) +- **No auth required**: Only public resources are accessible + +--- + +## Limitations + +- No authentication (future: JWT/API key support for private resources) +- No write operations (by design) +- No streaming of large documents (text returned in full) +- Semantic search requires corpus to have embeddings configured diff --git a/docs/mcp/mcp_interface_proposal.md b/docs/mcp/mcp_interface_proposal.md deleted file mode 100644 index b63fdcc91..000000000 --- a/docs/mcp/mcp_interface_proposal.md +++ /dev/null @@ -1,1457 +0,0 @@ -# OpenContracts MCP Interface Proposal - -## Overview - -This document proposes an elegant, performant Model Context Protocol (MCP) interface for OpenContracts that provides **read-only access to public resources**. The interface follows a **one-corpus-at-a-time** model, allowing AI assistants and other MCP clients to explore public corpuses, documents, annotations, and discussion threads. - -## Design Principles - -1. **Public-Only Access**: Only resources where `is_public=True` are accessible -2. **Read-Only Operations**: No mutations - pure information retrieval -3. **One Corpus Context**: Users select a corpus, then explore within that scope -4. **Performance First**: Leverage existing query optimizers and manager methods -5. **Anonymous User Model**: Operate as anonymous user with READ permissions only -6. 
**Respect Permission Model**: Follow existing permissioning rules (document + corpus both must be public) - -## Architecture - -### Permission Strategy - -The MCP server operates as an **anonymous user**, which means: - -```python -# Permission checks follow anonymous user rules from permissioning guide: -# - Corpus: is_public=True -# - Document: is_public=True AND (no corpus OR corpus.is_public=True) -# - Annotation: document.is_public=True AND corpus.is_public=True -# - Thread: is_public=True -# - ChatMessage: thread.is_public=True - -from django.contrib.auth.models import AnonymousUser - -# All queries use visible_to_user() with AnonymousUser -anonymous = AnonymousUser() -public_corpuses = Corpus.objects.visible_to_user(anonymous) -``` - -### Resource Naming Convention - -MCP resources follow a hierarchical URI pattern: - -``` -corpus://{corpus_slug} -document://{corpus_slug}/{document_slug} -annotation://{corpus_slug}/{document_slug}/{annotation_id} -thread://{corpus_slug}/threads/{thread_id} -``` - -## MCP Resources - -Resources provide **static content** for context windows. They represent specific entities. - -### 1. 
Corpus Resource - -**URI**: `corpus://{corpus_slug}` - -**Content**: Full corpus metadata and summary statistics - -```json -{ - "slug": "legal-contracts-2024", - "title": "Legal Contracts Database 2024", - "description": "Curated collection of legal contracts...", - "document_count": 1247, - "annotation_count": 15632, - "thread_count": 89, - "created": "2024-01-15T10:30:00Z", - "modified": "2024-12-20T14:22:00Z", - "label_set": { - "title": "Legal Annotation Labels", - "labels": [ - {"text": "indemnification", "color": "#FF5733", "label_type": "TOKEN_LABEL"}, - {"text": "termination", "color": "#33FF57", "label_type": "SPAN_LABEL"} - ] - } -} -``` - -**Implementation**: -```python -def get_corpus_resource(corpus_slug: str) -> str: - anonymous = AnonymousUser() - corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) - - return json.dumps({ - "slug": corpus.slug, - "title": corpus.title, - "description": corpus.description, - "document_count": corpus.document_count(), - "created": corpus.created.isoformat(), - "modified": corpus.modified.isoformat(), - # ... statistics and metadata - }) -``` - -### 2. 
Document Resource - -**URI**: `document://{corpus_slug}/{document_slug}` - -**Content**: Document metadata, extracted text, and structural information - -```json -{ - "slug": "employment-agreement-acme-2024", - "title": "Employment Agreement - Acme Corp 2024", - "description": "Standard employment contract template", - "file_type": "application/pdf", - "page_count": 12, - "text_preview": "This Employment Agreement is entered into...", - "full_text": "[Full extracted text content]", - "created": "2024-03-10T09:15:00Z", - "corpus": "legal-contracts-2024" -} -``` - -**Implementation**: -```python -def get_document_resource(corpus_slug: str, document_slug: str) -> str: - anonymous = AnonymousUser() - - # Get corpus context - corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) - - # Get document within corpus (both must be public) - document = (Document.objects - .visible_to_user(anonymous) - .filter(corpuses=corpus, slug=document_slug) - .first()) - - if not document: - from opencontractserver.documents.models import Document - raise Document.DoesNotExist(f"Document '{document_slug}' not found in corpus '{corpus_slug}'") - - # Read extracted text - full_text = "" - if document.txt_extract_file: - with document.txt_extract_file.open('r') as f: - full_text = f.read() - - return json.dumps({ - "slug": document.slug, - "title": document.title, - "description": document.description, - "page_count": document.page_count, - "full_text": full_text, - # ... - }) -``` - -### 3. 
Annotation Resource - -**URI**: `annotation://{corpus_slug}/{document_slug}/{annotation_id}` - -**Content**: Specific annotation with location and metadata - -```json -{ - "id": "12345", - "page": 3, - "raw_text": "indemnification clause", - "annotation_label": { - "text": "indemnification", - "color": "#FF5733", - "label_type": "SPAN_LABEL" - }, - "bounding_box": { - "top": 120, - "left": 50, - "right": 450, - "bottom": 145 - }, - "structural": false, - "created": "2024-03-12T11:20:00Z" -} -``` - -**Implementation**: -```python -def get_annotation_resource(corpus_slug: str, document_slug: str, annotation_id: int) -> str: - from opencontractserver.annotations.query_optimizer import AnnotationQueryOptimizer - anonymous = AnonymousUser() - - # Get corpus and document - corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) - document = Document.objects.visible_to_user(anonymous).get( - corpuses=corpus, slug=document_slug - ) - - # Use query optimizer for efficient permission checking - annotations = AnnotationQueryOptimizer.get_document_annotations( - document_id=document.id, - user=anonymous, - corpus_id=corpus.id - ) - - annotation = annotations.get(id=annotation_id) - - return json.dumps({ - "id": str(annotation.id), - "page": annotation.page, - "raw_text": annotation.raw_text, - # ... - }) -``` - -### 4. 
Thread Resource - -**URI**: `thread://{corpus_slug}/threads/{thread_id}` - -**Content**: Discussion thread with messages - -```json -{ - "id": "9876", - "title": "Question about indemnification clause interpretation", - "description": "Discussion about standard indemnification language", - "message_count": 12, - "is_locked": false, - "is_pinned": true, - "created_at": "2024-11-15T14:30:00Z", - "messages": [ - { - "id": "msg-1", - "content": "Can someone explain the scope of this indemnification clause?", - "msg_type": "HUMAN", - "created_at": "2024-11-15T14:30:00Z", - "upvote_count": 5, - "downvote_count": 0, - "replies": [ - { - "id": "msg-2", - "content": "This clause provides protection for...", - "msg_type": "HUMAN", - "created_at": "2024-11-15T15:10:00Z", - "upvote_count": 8, - "downvote_count": 0 - } - ] - } - ] -} -``` - -**Implementation**: -```python -def get_thread_resource(corpus_slug: str, thread_id: int, include_messages: bool = True) -> str: - """Get a discussion thread resource.""" - import json - from django.contrib.auth.models import AnonymousUser - from opencontractserver.conversations.models import ( - ChatMessage, - Conversation, - ConversationTypeChoices, - ) - from opencontractserver.corpuses.models import Corpus - - anonymous = AnonymousUser() - corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug) - - # Get public thread in this corpus - thread = (Conversation.objects - .visible_to_user(anonymous) - .filter( - conversation_type=ConversationTypeChoices.THREAD, - chat_with_corpus=corpus, - id=thread_id - ) - .first()) - - if not thread: - raise Conversation.DoesNotExist(f"Thread '{thread_id}' not found in corpus '{corpus_slug}'") - - data = { - "id": str(thread.id), - "title": thread.title or "", - "description": thread.description or "", - "is_locked": thread.is_locked, - "is_pinned": thread.is_pinned, - "created_at": thread.created.isoformat() if thread.created else None, - } - - if include_messages: - messages = 
build_threaded_messages(thread, anonymous)
    data["messages"] = messages

    return json.dumps(data)


def build_threaded_messages(thread, user) -> list:
    """
    Build hierarchical message tree.

    Loads the root messages of `thread` that are visible to `user`, ordered by
    creation time, and formats each one together with its replies.

    Uses prefetch_related to avoid N+1 queries for the first two reply levels.
    """
    from opencontractserver.conversations.models import ChatMessage

    # NOTE(review): the prefetched replies are not passed through
    # visible_to_user(); this relies on message visibility following the
    # thread's visibility - confirm against the permissioning guide.
    messages = list(ChatMessage.objects
        .visible_to_user(user)
        .filter(conversation=thread, parent_message__isnull=True)
        .prefetch_related('replies__replies')  # Prefetch 2 levels
        .order_by('created_at'))

    return [format_message_with_replies(msg, user) for msg in messages]
```

## MCP Tools

Tools provide **dynamic operations** - they execute queries and return results.

### 1. list_public_corpuses

**Purpose**: Discover available public corpuses

**Parameters**:
- `limit` (optional, default=20, capped at 100): Number of results
- `offset` (optional, default=0): Pagination offset
- `search` (optional): Filter by title/description

**Returns**: List of corpus summaries

```json
{
  "total_count": 47,
  "corpuses": [
    {
      "slug": "legal-contracts-2024",
      "title": "Legal Contracts Database 2024",
      "description": "Curated collection...",
      "document_count": 1247,
      "created": "2024-01-15T10:30:00Z"
    }
  ]
}
```

**Implementation**:
```python
def list_public_corpuses(limit: int = 20, offset: int = 0, search: str = "") -> dict:
    """
    List public corpuses visible to anonymous users.

    Args:
        limit: Page size; clamped to the range 0..100.
        offset: Number of rows to skip; negative values are treated as 0.
        search: Optional case-insensitive filter on title/description.

    Returns:
        {"total_count": int, "corpuses": [corpus summary, ...]}

    Note: This is a synchronous implementation. Django ORM operations are blocking,
    so we keep this synchronous for simplicity. For async, wrap ORM calls with
    sync_to_async from asgiref.sync.
    """
    from django.contrib.auth.models import AnonymousUser
    from django.db.models import Q
    from opencontractserver.corpuses.models import Corpus

    # Arguments come from untrusted MCP clients: cap the page size (same
    # ceiling as the MCP_MAX_RESULTS_PER_PAGE default) and drop negative
    # values, which Django queryset slicing rejects.
    limit = max(0, min(limit, 100))
    offset = max(0, offset)

    anonymous = AnonymousUser()
    qs = Corpus.objects.visible_to_user(anonymous)

    if search:
        qs = qs.filter(
            Q(title__icontains=search) | Q(description__icontains=search)
        )

    # limit/offset pages are only stable over an ordered queryset.
    if not qs.ordered:
        qs = qs.order_by("pk")

    total_count = qs.count()
    corpuses = list(qs[offset:offset + limit])

    return {
        "total_count": total_count,
        "corpuses": [format_corpus_summary(c) for c in corpuses]
    }


def format_corpus_summary(corpus) -> dict:
    """Format a corpus for list display."""
    return {
        "slug": corpus.slug,
        "title": corpus.title,
        "description": corpus.description or "",
        # NOTE(review): if document_count() runs a query, this costs one query
        # per listed corpus - confirm, and consider annotating the count instead.
        "document_count": corpus.document_count(),
        "created": corpus.created.isoformat(),
    }
```

### 2. list_documents

**Purpose**: List documents in a corpus

**Parameters**:
- `corpus_slug` (required): Corpus identifier
- `limit` (optional, default=50, capped at 100): Number of results
- `offset` (optional, default=0): Pagination offset
- `search` (optional): Filter by title/description

**Returns**: List of document summaries

```json
{
  "total_count": 1247,
  "documents": [
    {
      "slug": "employment-agreement-acme-2024",
      "title": "Employment Agreement - Acme Corp 2024",
      "page_count": 12,
      "created": "2024-03-10T09:15:00Z"
    }
  ]
}
```

**Implementation**:
```python
def list_documents(
    corpus_slug: str,
    limit: int = 50,
    offset: int = 0,
    search: str = ""
) -> dict:
    """
    List documents in a public corpus.

    Args:
        corpus_slug: Slug of a corpus visible to anonymous users.
        limit: Page size; clamped to the range 0..100.
        offset: Number of rows to skip; negative values are treated as 0.
        search: Optional case-insensitive filter on title/description.

    Returns:
        {"total_count": int, "documents": [document summary, ...]}

    Raises:
        Corpus.DoesNotExist: If the corpus is not found or not public.
    """
    from django.contrib.auth.models import AnonymousUser
    from django.db.models import Q
    from opencontractserver.corpuses.models import Corpus
    from opencontractserver.documents.models import Document

    # Untrusted client input: cap the page size and drop negative values,
    # which Django queryset slicing rejects.
    limit = max(0, min(limit, 100))
    offset = max(0, offset)

    anonymous = AnonymousUser()

    # Get corpus (raises Corpus.DoesNotExist if not found or not public)
    corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug)

    # Get public documents in this corpus
    qs = (Document.objects
        .visible_to_user(anonymous)
        .filter(corpuses=corpus))

    if search:
        qs = qs.filter(
            Q(title__icontains=search) | Q(description__icontains=search)
        )

    # limit/offset pages are only stable over an ordered queryset.
    if not qs.ordered:
        qs = qs.order_by("pk")

    total_count = qs.count()
    documents = list(qs[offset:offset + limit])

    return {
        "total_count": total_count,
        "documents": [format_document_summary(d) for d in documents]
    }


def format_document_summary(document) -> dict:
    """Format a document for list display."""
    return {
        "slug": document.slug,
        "title": document.title,
        "description": document.description or "",
        "page_count": document.page_count,
        "file_type": document.file_type or "unknown",
        "created": document.created.isoformat(),
    }
```

### 3. get_document_text

**Purpose**: Retrieve full extracted text from a document

**Parameters**:
- `corpus_slug` (required): Corpus identifier
- `document_slug` (required): Document identifier

**Returns**: Plain text content

```json
{
  "document_slug": "employment-agreement-acme-2024",
  "page_count": 12,
  "text": "This Employment Agreement is entered into as of January 1, 2024..."
}
```

**Implementation**:
```python
def get_document_text(corpus_slug: str, document_slug: str) -> dict:
    """
    Retrieve full extracted text from a document.

    Returns an empty "text" when the document has no txt_extract_file.

    Raises:
        Corpus.DoesNotExist: If the corpus is not found or not public.
        Document.DoesNotExist: If the document is not found or not public.
    """
    from django.contrib.auth.models import AnonymousUser
    from opencontractserver.corpuses.models import Corpus
    from opencontractserver.documents.models import Document

    anonymous = AnonymousUser()

    # Raises Corpus.DoesNotExist if not found/not public
    corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug)

    # Raises Document.DoesNotExist if not found/not public
    document = (Document.objects
        .visible_to_user(anonymous)
        .get(corpuses=corpus, slug=document_slug))

    full_text = ""
    if document.txt_extract_file:
        with document.txt_extract_file.open('r') as f:
            full_text = f.read()

    return {
        "document_slug": document.slug,
        "page_count": document.page_count,
        "text": full_text
    }
```

### 4.
list_annotations

**Purpose**: List annotations on a document

**Parameters**:
- `corpus_slug` (required): Corpus identifier
- `document_slug` (required): Document identifier
- `page` (optional): Filter to specific page
- `label_text` (optional): Filter by label text
- `limit` (optional, default=100, capped at 100): Number of results
- `offset` (optional, default=0): Pagination offset

**Returns**: List of annotations

```json
{
  "total_count": 156,
  "annotations": [
    {
      "id": "12345",
      "page": 3,
      "raw_text": "indemnification clause",
      "annotation_label": {
        "text": "indemnification",
        "color": "#FF5733"
      },
      "structural": false
    }
  ]
}
```

**Implementation**:
```python
def list_annotations(
    corpus_slug: str,
    document_slug: str,
    page: int | None = None,
    label_text: str | None = None,
    limit: int = 100,
    offset: int = 0
) -> dict:
    """
    List annotations on a document with optional filtering.

    Args:
        corpus_slug: Slug of a corpus visible to anonymous users.
        document_slug: Slug of a visible document inside that corpus.
        page: When given, only annotations on this page are returned.
        label_text: When given, only annotations whose label text matches exactly.
        limit: Page size; clamped to the range 0..100.
        offset: Number of rows to skip; negative values are treated as 0.

    Returns:
        {"total_count": int, "annotations": [formatted annotation, ...]}

    Raises:
        Corpus.DoesNotExist / Document.DoesNotExist: If either lookup fails.
    """
    from django.contrib.auth.models import AnonymousUser
    from opencontractserver.annotations.query_optimizer import AnnotationQueryOptimizer
    from opencontractserver.corpuses.models import Corpus
    from opencontractserver.documents.models import Document

    # Untrusted client input: cap the page size and drop negative values,
    # which Django queryset slicing rejects.
    limit = max(0, min(limit, 100))
    offset = max(0, offset)

    anonymous = AnonymousUser()

    corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug)
    document = Document.objects.visible_to_user(anonymous).get(
        corpuses=corpus, slug=document_slug
    )

    # Use query optimizer - eliminates N+1 permission queries
    qs = AnnotationQueryOptimizer.get_document_annotations(
        document_id=document.id,
        user=anonymous,
        corpus_id=corpus.id
    )

    # Apply filters
    if page is not None:
        qs = qs.filter(page=page)

    if label_text:
        qs = qs.filter(annotation_label__text=label_text)

    # limit/offset pages are only stable over an ordered queryset.
    if not qs.ordered:
        qs = qs.order_by("pk")

    total_count = qs.count()
    # select_related so format_annotation() can read the label without a
    # query per annotation.
    annotations = list(qs.select_related('annotation_label')[offset:offset + limit])

    return {
        "total_count": total_count,
        "annotations": [format_annotation(a) for a in annotations]
    }


def format_annotation(annotation) -> dict:
    """
    Format an annotation for API response.

    "annotation_label" is None when the annotation has no label; a label
    without a color is reported as "#000000".
    """
    label_data = None
    if annotation.annotation_label:
        label_data = {
            "text": annotation.annotation_label.text,
            "color": annotation.annotation_label.color or "#000000",
            "label_type": annotation.annotation_label.label_type,
        }

    return {
        "id": str(annotation.id),
        "page": annotation.page,
        "raw_text": annotation.raw_text or "",
        "annotation_label": label_data,
        "structural": annotation.structural,
        "created": annotation.created.isoformat() if annotation.created else None,
    }
```

### 5. search_corpus

**Purpose**: Semantic search within a corpus using vector embeddings

**Parameters**:
- `corpus_slug` (required): Corpus identifier
- `query` (required): Search query text
- `limit` (optional, default=10): Number of results

**Returns**: Ranked list of relevant documents and annotations

```json
{
  "query": "indemnification provisions",
  "results": [
    {
      "type": "document",
      "slug": "employment-agreement-acme-2024",
      "title": "Employment Agreement - Acme Corp 2024",
      "similarity_score": 0.89,
      "snippet": "...indemnification provisions in Section 7..."
},
    {
      "type": "annotation",
      "document_slug": "service-agreement-beta-2024",
      "id": "45678",
      "raw_text": "indemnification by service provider",
      "similarity_score": 0.85,
      "page": 5
    }
  ]
}
```

**Note**: The example above shows the target response shape. The reference implementation below ranks **documents only** and does not yet produce `snippet` values or `annotation` results.

**Implementation**:
```python
def search_corpus(
    corpus_slug: str,
    query: str,
    limit: int = 10
) -> dict:
    """
    Semantic search within a corpus using vector embeddings.

    Falls back to a title/description text search when the corpus cannot
    produce an embedding for `query`.

    Args:
        corpus_slug: Slug of a corpus visible to anonymous users.
        query: Free-text search query.
        limit: Maximum number of results; clamped to the range 0..100.

    Returns:
        {"query": str, "results": [{"type": "document", ...}, ...]}

    Raises:
        Corpus.DoesNotExist: If the corpus is not found or not public.
    """
    from django.contrib.auth.models import AnonymousUser
    from opencontractserver.corpuses.models import Corpus
    from opencontractserver.documents.models import Document

    # Untrusted client input: keep the result size bounded.
    limit = max(0, min(limit, 100))

    anonymous = AnonymousUser()
    corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug)

    # Generate query embedding using corpus's preferred embedder
    # embed_text() returns (embedder_path, query_vector) tuple
    embedder_path, query_vector = corpus.embed_text(query)

    # Explicit emptiness test: plain truthiness is ambiguous for array-like
    # vectors (e.g. numpy arrays raise on bool()).
    if query_vector is None or len(query_vector) == 0:
        # Fallback to text search if embeddings unavailable
        return text_search_fallback(corpus, query, limit, anonymous)

    # Search documents using vector similarity
    # search_by_embedding adds 'similarity_score' annotation
    doc_results = list(Document.objects
        .visible_to_user(anonymous)
        .filter(corpuses=corpus)
        .search_by_embedding(query_vector, embedder_path, top_k=limit))

    # Combine and rank results
    results = []
    for doc in doc_results:
        results.append({
            "type": "document",
            "slug": doc.slug,
            "title": doc.title,
            "similarity_score": float(doc.similarity_score),
        })

    return {
        "query": query,
        "results": results[:limit]
    }


def text_search_fallback(corpus, query: str, limit: int, user) -> dict:
    """
    Fallback to text search when embeddings are unavailable.

    Returns the same shape as search_corpus(), with "similarity_score" set to
    None on every result.
    """
    from django.db.models import Q
    from opencontractserver.documents.models import Document

    # Simple text search on title and description
    documents = list(Document.objects
        .visible_to_user(user)
        .filter(corpuses=corpus)
        .filter(Q(title__icontains=query) | Q(description__icontains=query))
        [:limit])

    results = []
    for doc in documents:
        results.append({
            "type": "document",
            "slug": doc.slug,
            "title": doc.title,
            "similarity_score": None,  # No similarity score for text search
        })

    return {
        "query": query,
        "results": results
    }
```

### 6. list_threads

**Purpose**: List discussion threads in a corpus or document

**Parameters**:
- `corpus_slug` (required): Corpus identifier
- `document_slug` (optional): Filter to document-specific threads
- `limit` (optional, default=20, capped at 100): Number of results
- `offset` (optional, default=0): Pagination offset

**Returns**: List of thread summaries

```json
{
  "total_count": 89,
  "threads": [
    {
      "id": "9876",
      "title": "Question about indemnification clause",
      "message_count": 12,
      "is_pinned": true,
      "is_locked": false,
      "created_at": "2024-11-15T14:30:00Z",
      "last_activity": "2024-12-15T09:20:00Z"
    }
  ]
}
```

**Implementation**:
```python
def list_threads(
    corpus_slug: str,
    document_slug: str | None = None,
    limit: int = 20,
    offset: int = 0
) -> dict:
    """
    List discussion threads in a corpus or document.

    Args:
        corpus_slug: Slug of a corpus visible to anonymous users.
        document_slug: When given, only threads attached to this document.
        limit: Page size; clamped to the range 0..100.
        offset: Number of rows to skip; negative values are treated as 0.

    Returns:
        {"total_count": int, "threads": [thread summary, ...]}, pinned
        threads first.

    Raises:
        Corpus.DoesNotExist / Document.DoesNotExist: If either lookup fails.
    """
    from django.contrib.auth.models import AnonymousUser
    from django.db.models import Count
    from opencontractserver.conversations.models import Conversation, ConversationTypeChoices
    from opencontractserver.corpuses.models import Corpus
    from opencontractserver.documents.models import Document

    # Untrusted client input: cap the page size and drop negative values,
    # which Django queryset slicing rejects.
    limit = max(0, min(limit, 100))
    offset = max(0, offset)

    anonymous = AnonymousUser()
    corpus = Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug)

    qs = (Conversation.objects
        .visible_to_user(anonymous)
        .filter(
            conversation_type=ConversationTypeChoices.THREAD,
            chat_with_corpus=corpus
        )
        .annotate(message_count=Count('messages')))  # Efficient count

    if document_slug:
        document = Document.objects.visible_to_user(anonymous).get(
            corpuses=corpus, slug=document_slug
        )
        qs = qs.filter(chat_with_document=document)

    # Order by pinned first, then recent activity
    # NOTE(review): this orders by `updated_at` while format_thread_summary()
    # reads `thread.updated` / `thread.created` - confirm the field names
    # against the Conversation model.
    qs = qs.order_by('-is_pinned', '-updated_at')

    total_count = qs.count()
    threads = list(qs[offset:offset + limit])

    return {
        "total_count": total_count,
        "threads": [format_thread_summary(t) for t in threads]
    }


def format_thread_summary(thread) -> dict:
    """
    Format a thread for list display.

    "message_count" comes from the annotation added by list_threads() and
    defaults to 0 when the thread was loaded without it.
    """
    return {
        "id": str(thread.id),
        "title": thread.title or "",
        "description": thread.description or "",
        "message_count": getattr(thread, 'message_count', 0),
        "is_pinned": thread.is_pinned,
        "is_locked": thread.is_locked,
        "created_at": thread.created.isoformat() if thread.created else None,
        "last_activity": thread.updated.isoformat() if thread.updated else None,
    }
```

### 7. get_thread_messages

**Purpose**: Retrieve all messages in a thread with hierarchical structure

**Parameters**:
- `corpus_slug` (required): Corpus identifier
- `thread_id` (required): Thread identifier
- `flatten` (optional, default=False): Return flat list instead of tree

**Returns**: Thread messages in hierarchical or flat format

```json
{
  "thread_id": "9876",
  "title": "Question about indemnification clause",
  "messages": [
    {
      "id": "msg-1",
      "content": "Can someone explain...",
      "msg_type": "HUMAN",
      "created_at": "2024-11-15T14:30:00Z",
      "upvote_count": 5,
      "replies": [
        {
          "id": "msg-2",
          "content": "This clause provides...",
          "created_at": "2024-11-15T15:10:00Z",
          "upvote_count": 8
        }
      ]
    }
  ]
}
```

**Implementation**:
```python
def get_thread_messages(
    corpus_slug: str,
    thread_id: int,
    flatten: bool = False
) -> dict:
    """Retrieve all messages in a thread with hierarchical structure."""
    from django.contrib.auth.models import AnonymousUser
    from opencontractserver.conversations.models import (
        ChatMessage,
        Conversation,
        ConversationTypeChoices,
    )
    from opencontractserver.corpuses.models import Corpus

    anonymous = AnonymousUser()
    corpus =
Corpus.objects.visible_to_user(anonymous).get(slug=corpus_slug)

    # Get the thread
    thread = (Conversation.objects
        .visible_to_user(anonymous)
        .filter(
            conversation_type=ConversationTypeChoices.THREAD,
            chat_with_corpus=corpus,
            id=thread_id
        )
        .first())

    if not thread:
        from django.core.exceptions import ObjectDoesNotExist
        raise ObjectDoesNotExist(f"Thread {thread_id} not found")

    if flatten:
        # Return all messages in flat list, ordered by created_at
        messages = list(ChatMessage.objects
            .visible_to_user(anonymous)
            .filter(conversation=thread)
            .order_by('created_at'))
        return {
            "thread_id": str(thread.id),
            "title": thread.title or "",
            "messages": [format_message(m) for m in messages]
        }

    # Build hierarchical structure with prefetch to avoid N+1 queries
    # Prefetch 2 levels of replies (adjust depth as needed)
    root_messages = list(ChatMessage.objects
        .visible_to_user(anonymous)
        .filter(conversation=thread, parent_message__isnull=True)
        .prefetch_related('replies__replies')
        .order_by('created_at'))

    return {
        "thread_id": str(thread.id),
        "title": thread.title or "",
        "messages": [format_message_with_replies(m, anonymous) for m in root_messages]
    }


def format_message(message) -> dict:
    """Format a single message without replies."""
    return {
        "id": str(message.id),
        "content": message.content,
        "msg_type": message.msg_type,
        "created_at": message.created_at.isoformat() if message.created_at else None,
        "upvote_count": message.upvote_count,
        "downvote_count": message.downvote_count,
    }


def format_message_with_replies(message, user, max_depth: int = 3, current_depth: int = 0) -> dict:
    """
    Format a message with its replies recursively.

    Reply levels that the caller prefetched are read without extra queries.
    The callers in this document prefetch two levels ('replies__replies')
    while max_depth defaults to 3, so replies below the second level - and
    the has_more_replies check at the depth limit - may each cost one query
    per message. Prefetch deeper or lower max_depth if that matters.

    Limits recursion depth to prevent deeply nested structures.

    Args:
        message: The message to format.
        user: Passed through to the recursive calls; not used for filtering here.
        max_depth: Depth at which replies are no longer expanded.
        current_depth: Depth of `message` (0 for root messages).

    Returns:
        format_message() output plus "replies"; messages at max_depth also
        carry "has_more_replies".
    """
    formatted = format_message(message)

    if current_depth >= max_depth:
        # Stop recursion at max depth
        formatted["replies"] = []
        formatted["has_more_replies"] = message.replies.exists() if hasattr(message, 'replies') else False
        return formatted

    # Served from the prefetch cache when this level was prefetched
    replies = list(message.replies.all()) if hasattr(message, 'replies') else []

    formatted["replies"] = [
        format_message_with_replies(reply, user, max_depth, current_depth + 1)
        for reply in replies
    ]

    return formatted
```

## Implementation Structure

### Directory Layout

```
opencontractserver/
  mcp/
    __init__.py
    server.py           # MCP server entry point
    resources.py        # Resource handlers
    tools.py            # Tool implementations
    permissions.py      # Permission utilities
    formatters.py       # Response formatting
    config.py           # Configuration
```

### MCP Server Entry Point

```python
# opencontractserver/mcp/server.py
import asyncio
import json
import re
from typing import Optional

from asgiref.sync import sync_to_async
# The low-level Server class lives in mcp.server; protocol models in mcp.types.
from mcp.server import Server
from mcp.types import EmbeddedResource, Resource, TextContent, Tool

from .resources import (
    get_corpus_resource,
    get_document_resource,
    get_annotation_resource,
    get_thread_resource
)

from .tools import (
    list_public_corpuses,
    list_documents,
    get_document_text,
    list_annotations,
    search_corpus,
    list_threads,
    get_thread_messages
)

# Initialize MCP server
mcp_server = Server("opencontracts")


# URI parsing utilities with regex for safety
class URIParser:
    """
    Parse MCP resource URIs safely using regex patterns.

    Every parse_* method returns None when the URI does not match its scheme,
    so callers can try the parsers in turn. Matching uses fullmatch(): with
    match(), the trailing `$` would also accept a URI that ends in a newline.
    """

    # Slug pattern: alphanumeric and hyphens only (matches OpenContracts slug format)
    SLUG_PATTERN = r'[A-Za-z0-9\-]+'

    PATTERNS = {
        'corpus': re.compile(rf'^corpus://({SLUG_PATTERN})$'),
        'document': re.compile(rf'^document://({SLUG_PATTERN})/({SLUG_PATTERN})$'),
        'annotation': re.compile(rf'^annotation://({SLUG_PATTERN})/({SLUG_PATTERN})/(\d+)$'),
        'thread': re.compile(rf'^thread://({SLUG_PATTERN})/threads/(\d+)$'),
    }

    @classmethod
    def parse_corpus(cls, uri: str) -> Optional[str]:
        """Parse corpus URI, returns corpus_slug or None."""
        match = cls.PATTERNS['corpus'].fullmatch(uri)
        return match.group(1) if match else None

    @classmethod
    def parse_document(cls, uri: str) -> Optional[tuple[str, str]]:
        """Parse document URI, returns (corpus_slug, document_slug) or None."""
        match = cls.PATTERNS['document'].fullmatch(uri)
        return (match.group(1), match.group(2)) if match else None

    @classmethod
    def parse_annotation(cls, uri: str) -> Optional[tuple[str, str, int]]:
        """Parse annotation URI, returns (corpus_slug, document_slug, annotation_id) or None."""
        match = cls.PATTERNS['annotation'].fullmatch(uri)
        return (match.group(1), match.group(2), int(match.group(3))) if match else None

    @classmethod
    def parse_thread(cls, uri: str) -> Optional[tuple[str, int]]:
        """Parse thread URI, returns (corpus_slug, thread_id) or None."""
        match = cls.PATTERNS['thread'].fullmatch(uri)
        return (match.group(1), int(match.group(2))) if match else None


# Register resources
# NOTE(review): these URIs are templates ("{corpus_slug}"), not concrete
# resources - confirm whether the SDK expects them to be published as
# resource templates instead.
@mcp_server.list_resources()
async def list_resources() -> list[Resource]:
    """List available resource patterns"""
    return [
        Resource(
            uri="corpus://{corpus_slug}",
            name="Public Corpus",
            description="Access public corpus metadata and contents",
            mimeType="application/json"
        ),
        Resource(
            uri="document://{corpus_slug}/{document_slug}",
            name="Public Document",
            description="Access public document with extracted text",
            mimeType="application/json"
        ),
        Resource(
            uri="annotation://{corpus_slug}/{document_slug}/{annotation_id}",
            name="Document Annotation",
            description="Access specific annotation on a document",
            mimeType="application/json"
        ),
        Resource(
            uri="thread://{corpus_slug}/threads/{thread_id}",
            name="Discussion Thread",
description="Access public discussion thread with messages",
            mimeType="application/json"
        )
    ]


@mcp_server.read_resource()
async def read_resource(uri: str) -> str:
    """
    Resolve resource URI and return content.

    The URI is tried against the corpus, document, annotation and thread
    parsers in that order; the first match selects the synchronous resource
    handler, which is run through sync_to_async.

    Raises:
        ValueError: If the URI matches none of the supported patterns.
    """
    # Try corpus URI
    corpus_slug = URIParser.parse_corpus(uri)
    if corpus_slug:
        return await sync_to_async(get_corpus_resource)(corpus_slug)

    # Try document URI
    doc_parts = URIParser.parse_document(uri)
    if doc_parts:
        corpus_slug, document_slug = doc_parts
        return await sync_to_async(get_document_resource)(corpus_slug, document_slug)

    # Try annotation URI
    ann_parts = URIParser.parse_annotation(uri)
    if ann_parts:
        corpus_slug, document_slug, annotation_id = ann_parts
        return await sync_to_async(get_annotation_resource)(corpus_slug, document_slug, annotation_id)

    # Try thread URI
    thread_parts = URIParser.parse_thread(uri)
    if thread_parts:
        corpus_slug, thread_id = thread_parts
        return await sync_to_async(get_thread_resource)(corpus_slug, thread_id)

    raise ValueError(f"Invalid or unrecognized resource URI: {uri}")


# Register tools
@mcp_server.list_tools()
async def list_tools() -> list[Tool]:
    """
    List available tools.

    Only the first two tool schemas are spelled out in this example; the
    remaining tools still have to be registered the same way.
    """
    return [
        Tool(
            name="list_public_corpuses",
            description="List all publicly accessible corpuses",
            inputSchema={
                "type": "object",
                "properties": {
                    "limit": {"type": "integer", "default": 20},
                    "offset": {"type": "integer", "default": 0},
                    "search": {"type": "string", "default": ""}
                }
            }
        ),
        Tool(
            name="list_documents",
            description="List documents in a corpus",
            inputSchema={
                "type": "object",
                "properties": {
                    "corpus_slug": {"type": "string", "description": "Corpus identifier"},
                    "limit": {"type": "integer", "default": 50},
                    "offset": {"type": "integer", "default": 0},
                    "search": {"type": "string", "default": ""}
                },
                "required": ["corpus_slug"]
            }
        ),
        # ... (register all other tools - list_annotations, search_corpus, list_threads, get_thread_messages)
    ]


# Map tool names to their implementations
TOOL_HANDLERS = {
    "list_public_corpuses": list_public_corpuses,
    "list_documents": list_documents,
    "get_document_text": get_document_text,
    "list_annotations": list_annotations,
    "search_corpus": search_corpus,
    "list_threads": list_threads,
    "get_thread_messages": get_thread_messages,
}


@mcp_server.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    """
    Execute tool and return results.

    Looks the handler up in TOOL_HANDLERS, calls it with `arguments` expanded
    as keyword arguments, and returns the result as one JSON text block.

    Raises:
        ValueError: If `name` is not a registered tool.
    """
    handler = TOOL_HANDLERS.get(name)
    if not handler:
        raise ValueError(f"Unknown tool: {name}")

    # Run synchronous Django ORM handlers in thread pool
    result = await sync_to_async(handler)(**arguments)

    return [TextContent(type="text", text=json.dumps(result, indent=2))]


# Entry point
async def main():
    """Run MCP server over the stdio transport until the streams close."""
    from mcp.server.stdio import stdio_server

    async with stdio_server() as streams:
        await mcp_server.run(
            streams[0],  # read_stream
            streams[1],  # write_stream
            mcp_server.create_initialization_options()
        )

if __name__ == "__main__":
    asyncio.run(main())
```

## Performance Optimizations

### 1. Query Optimizer Usage

```python
# ALWAYS use query optimizers for annotations
from opencontractserver.annotations.query_optimizer import AnnotationQueryOptimizer

# Good: Eliminates N+1 queries
annotations = AnnotationQueryOptimizer.get_document_annotations(
    document_id=doc.id,
    user=anonymous,
    corpus_id=corpus.id
)

# Bad: N+1 permission queries
annotations = Annotation.objects.filter(document=doc)  # Don't do this!
```

### 2. Select Related / Prefetch

```python
# Eager load related objects to avoid additional queries
documents = (Document.objects
    .visible_to_user(anonymous)
    .select_related('creator')
    .prefetch_related('doc_annotations__annotation_label'))
```

### 3.
Pagination

```python
# Always use limit/offset for large result sets
def list_with_pagination(queryset, limit, offset):
    """
    Return one page of `queryset` plus paging metadata.

    "has_more" is True while rows remain after this page.
    """
    total_count = queryset.count()
    results = queryset[offset:offset+limit]

    return {
        "total_count": total_count,
        "limit": limit,
        "offset": offset,
        "has_more": offset + limit < total_count,
        "results": [format_item(r) for r in results]
    }
```

### 4. Caching Strategy

```python
from django.core.cache import cache
from django.utils.encoding import force_str

def cached_corpus_summary(corpus_slug: str) -> dict:
    """Cache corpus summaries for 5 minutes"""
    cache_key = f"mcp:corpus_summary:{corpus_slug}"

    cached = cache.get(cache_key)
    # Compare against None: an empty (falsy) summary is still a cache hit and
    # must not be regenerated on every call.
    if cached is not None:
        return cached

    result = generate_corpus_summary(corpus_slug)
    cache.set(cache_key, result, 300)  # 5 minutes

    return result
```

## Security Considerations

### 1. Public-Only Filter

```python
# ALWAYS apply anonymous user filter
from django.contrib.auth.models import AnonymousUser

anonymous = AnonymousUser()

# This automatically filters to is_public=True resources
public_resources = Model.objects.visible_to_user(anonymous)
```

### 2. Input Validation

```python
import re

def validate_slug(slug: str) -> bool:
    """
    Validate slug format matches OpenContracts pattern.

    Returns True only when the whole string consists of A-Z, a-z, 0-9 and
    hyphens, with at least one character.
    """
    # From CLAUDE.md: Case-sensitive, A-Z, a-z, 0-9, hyphen (-)
    # fullmatch(), not match() with `$`: `$` also matches just before a
    # trailing newline, so "slug\n" would otherwise pass validation.
    return bool(re.fullmatch(r'[A-Za-z0-9\-]+', slug))

def sanitize_inputs(corpus_slug: str, document_slug: str | None = None):
    """
    Validate all slug inputs.

    Raises:
        ValueError: If corpus_slug, or a given document_slug, is not a valid slug.
    """
    if not validate_slug(corpus_slug):
        raise ValueError(f"Invalid corpus slug: {corpus_slug}")

    if document_slug and not validate_slug(document_slug):
        raise ValueError(f"Invalid document slug: {document_slug}")
```

### 3.
Rate Limiting

```python
from django.core.cache import cache
from datetime import datetime, timedelta

class RateLimiter:
    """
    Simple fixed-window rate limiter for MCP requests.

    Each client may make `max_requests` requests per `window_seconds` window.
    The window starts with the client's first request and is not extended by
    later ones.
    """

    def __init__(self, max_requests: int = 100, window_seconds: int = 60):
        self.max_requests = max_requests
        self.window_seconds = window_seconds

    def check_rate_limit(self, client_id: str) -> bool:
        """Returns True if request is allowed, False if rate limited"""
        if self.max_requests <= 0:
            return False

        key = f"mcp:ratelimit:{client_id}"

        # add() only stores the key when it is absent, so the TTL is set once
        # per window instead of being refreshed by every request.
        if cache.add(key, 1, self.window_seconds):
            return True

        try:
            # incr() is atomic on backends that support it, so concurrent
            # requests cannot both read the same counter value.
            current = cache.incr(key)
        except ValueError:
            # The key expired between add() and incr(): start a new window.
            cache.set(key, 1, self.window_seconds)
            return True

        return current <= self.max_requests
```

## Configuration

### Environment Variables

```bash
# .env
MCP_SERVER_ENABLED=true
MCP_MAX_RESULTS_PER_PAGE=100
MCP_RATE_LIMIT_REQUESTS=100
MCP_RATE_LIMIT_WINDOW=60
MCP_CACHE_TTL=300
```

### Django Settings

```python
# config/settings/base.py

# MCP Server Configuration
MCP_SERVER = {
    'enabled': env.bool('MCP_SERVER_ENABLED', default=False),
    'max_results_per_page': env.int('MCP_MAX_RESULTS_PER_PAGE', default=100),
    'rate_limit': {
        'requests': env.int('MCP_RATE_LIMIT_REQUESTS', default=100),
        'window': env.int('MCP_RATE_LIMIT_WINDOW', default=60),
    },
    'cache_ttl': env.int('MCP_CACHE_TTL', default=300),
}
```

## Testing Strategy

### Unit Tests

```python
# opencontractserver/mcp/tests/test_resources.py
import json

from django.contrib.auth import get_user_model
from django.test import TestCase
from django.contrib.auth.models import AnonymousUser
from opencontractserver.corpuses.models import Corpus
from opencontractserver.mcp.resources import get_corpus_resource

class CorpusResourceTest(TestCase):
    def setUp(self):
        # TestCase has no create_user() helper; create the owner once through
        # the user model and share it between both corpuses.
        owner = get_user_model().objects.create_user(
            username="owner", password="test-password"
        )

        self.public_corpus = Corpus.objects.create(
            title="Public Corpus",
            description="Test corpus",
            slug="public-corpus",
            is_public=True,
            creator=owner
        )

        self.private_corpus = Corpus.objects.create(
            title="Private Corpus",
            description="Private test corpus",
            slug="private-corpus",
            is_public=False,
            creator=owner
        )

    def test_get_public_corpus_resource(self):
        """Anonymous users can access public corpus resources"""
        result = get_corpus_resource("public-corpus")
        data = json.loads(result)

        self.assertEqual(data["slug"], "public-corpus")
        self.assertEqual(data["title"], "Public Corpus")

    def test_get_private_corpus_resource_denied(self):
        """Anonymous users cannot access private corpus resources"""
        with self.assertRaises(Corpus.DoesNotExist):
            get_corpus_resource("private-corpus")
```

### Integration Tests

```python
# opencontractserver/mcp/tests/test_integration.py
import json

import pytest
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

# Launch the server the same way an MCP client would (stdio transport).
SERVER = StdioServerParameters(
    command="python",
    args=["-m", "opencontractserver.mcp.server"],
)


def tool_payload(result) -> dict:
    """Decode the JSON text block returned by a tool call."""
    return json.loads(result.content[0].text)


@pytest.mark.asyncio
async def test_full_corpus_exploration():
    """Test complete workflow: discover corpus → list documents → get annotations"""
    async with stdio_client(SERVER) as (read_stream, write_stream):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()

            # 1. List public corpuses
            corpuses = tool_payload(
                await session.call_tool("list_public_corpuses", {})
            )

            assert len(corpuses["corpuses"]) > 0
            corpus_slug = corpuses["corpuses"][0]["slug"]

            # 2. List documents in corpus
            docs = tool_payload(await session.call_tool("list_documents", {
                "corpus_slug": corpus_slug,
                "limit": 10
            }))

            assert len(docs["documents"]) > 0
            document_slug = docs["documents"][0]["slug"]

            # 3. Get document text
            text_data = tool_payload(await session.call_tool("get_document_text", {
                "corpus_slug": corpus_slug,
                "document_slug": document_slug
            }))

            assert len(text_data["text"]) > 0

            # 4. List annotations
            annotations = tool_payload(await session.call_tool("list_annotations", {
                "corpus_slug": corpus_slug,
                "document_slug": document_slug,
                "limit": 50
            }))

            assert "annotations" in annotations
```

## Deployment

### Standalone MCP Server

```bash
# Run as standalone process
python -m opencontractserver.mcp.server

# Or via Docker. The server speaks MCP over stdio, so keep stdin open (-i);
# there is no network port to publish.
docker run -i --rm opencontracts-mcp
```

### Integration with Claude Desktop

```json
{
  "mcpServers": {
    "opencontracts": {
      "command": "python",
      "args": ["-m", "opencontractserver.mcp.server"],
      "env": {
        "DJANGO_SETTINGS_MODULE": "config.settings.production"
      }
    }
  }
}
```

## Future Enhancements

### Phase 2: Advanced Search

- **Full-text search** with highlighting
- **Faceted search** by label type, date range, creator
- **Cross-corpus search** (search across multiple public corpuses)

### Phase 3: Relationship Exploration

- **Annotation relationships** - explore connected annotations
- **Document relationships** - find related documents
- **Citation graphs** - visualize document citation networks

### Phase 4: Analytics

- **Usage statistics** per corpus
- **Popular annotations** (most referenced/discussed)
- **Trending threads** in discussions

## Summary

This MCP interface proposal provides:

✅ **Read-only access** to public OpenContracts resources
✅ **One-corpus-at-a-time** scoping for focused exploration
✅ **Performance optimized** using existing query optimizers
✅ **Elegant API** with intuitive resource URIs and tools
✅ **Security first** with anonymous user model and permission checks
✅ **Comprehensive coverage** of corpuses, documents, annotations, and threads

The implementation follows OpenContracts' established patterns and leverages the existing permissioning infrastructure for a robust, maintainable solution.
diff --git a/frontend/src/components/corpuses/CorpusSettings.tsx b/frontend/src/components/corpuses/CorpusSettings.tsx index 605ee05c2..15d28c5a7 100644 --- a/frontend/src/components/corpuses/CorpusSettings.tsx +++ b/frontend/src/components/corpuses/CorpusSettings.tsx @@ -12,7 +12,7 @@ import { useQuery, useReactiveVar, useMutation } from "@apollo/client"; import { toast } from "react-toastify"; import { useNavigate } from "react-router-dom"; import styled from "styled-components"; -import { editingCorpus } from "../../graphql/cache"; +import { editingCorpus, backendUserObj } from "../../graphql/cache"; import { GET_CORPUS_ACTIONS, GetCorpusActionsInput, @@ -52,6 +52,7 @@ interface CorpusSettingsProps { preferredEmbedder?: string | null; slug?: string | null; creator?: { + id?: string; email: string; username?: string; slug?: string; @@ -627,6 +628,7 @@ const SettingsContainer = styled.div` export const CorpusSettings: React.FC = ({ corpus }) => { const navigate = useNavigate(); + const currentUser = useReactiveVar(backendUserObj); // Check if myPermissions is already processed (array of PermissionTypes) or raw const permissions = @@ -639,6 +641,31 @@ export const CorpusSettings: React.FC = ({ corpus }) => { const canUpdate = permissions.includes(PermissionTypes.CAN_UPDATE); const canPermission = permissions.includes(PermissionTypes.CAN_PERMISSION); + + // Owner can always change visibility (matches backend SetCorpusVisibility permission check) + // Compare by ID first, fallback to email comparison for reliability + const isOwnerByIdentity = Boolean( + currentUser && + corpus.creator && + ((currentUser.id && + corpus.creator.id && + currentUser.id === corpus.creator.id) || + (currentUser.email && + corpus.creator.email && + currentUser.email === corpus.creator.email)) + ); + + // Fallback: If user has all core owner permissions, they're effectively the owner + // This handles cases where currentUser isn't loaded yet but permissions are + const 
hasFullOwnerPermissions = + permissions.includes(PermissionTypes.CAN_CREATE) && + permissions.includes(PermissionTypes.CAN_UPDATE) && + permissions.includes(PermissionTypes.CAN_READ) && + permissions.includes(PermissionTypes.CAN_PUBLISH) && + permissions.includes(PermissionTypes.CAN_REMOVE); + + const isOwner = isOwnerByIdentity || hasFullOwnerPermissions; + const canChangeVisibility = isOwner || canPermission; const [slugDraft, setSlugDraft] = useState(""); const [publicDraft, setPublicDraft] = useState( Boolean(corpus.isPublic) @@ -893,7 +920,7 @@ export const CorpusSettings: React.FC = ({ corpus }) => { Visibility & Slug - {!canUpdate && !canPermission && ( + {!canUpdate && !canChangeVisibility && (
= ({ corpus }) => { fontSize: "0.875rem", textTransform: "uppercase", letterSpacing: "0.08em", - color: !canPermission ? "#cbd5e1" : "#64748b", + color: !canChangeVisibility ? "#cbd5e1" : "#64748b", marginBottom: "0.75rem", fontWeight: 600, display: "flex", @@ -940,7 +967,7 @@ export const CorpusSettings: React.FC = ({ corpus }) => { }} > Public visibility - {!canPermission && ( + {!canChangeVisibility && ( = ({ corpus }) => { alignItems: "center", gap: "0.875rem", padding: "0.875rem 1rem", - background: !canPermission + background: !canChangeVisibility ? "linear-gradient(135deg, #f8fafc 0%, #f1f5f9 100%)" : "linear-gradient(135deg, #ffffff 0%, #fafbfc 100%)", border: "2px solid", - borderColor: !canPermission ? "#e2e8f0" : "#cbd5e1", + borderColor: !canChangeVisibility ? "#e2e8f0" : "#cbd5e1", borderRadius: "10px", transition: "all 0.3s ease", }} @@ -976,7 +1003,7 @@ export const CorpusSettings: React.FC = ({ corpus }) => { display: "flex", alignItems: "center", gap: "0.75rem", - cursor: !canPermission ? "not-allowed" : "pointer", + cursor: !canChangeVisibility ? "not-allowed" : "pointer", width: "100%", }} > @@ -984,13 +1011,15 @@ export const CorpusSettings: React.FC = ({ corpus }) => { id="corpus-is-public-checkbox" type="checkbox" checked={publicDraft} - disabled={!canPermission} + disabled={!canChangeVisibility} onChange={(e) => setPublicDraft(e.target.checked)} style={{ width: "20px", height: "20px", - cursor: !canPermission ? "not-allowed" : "pointer", - opacity: !canPermission ? 0.5 : 1, + cursor: !canChangeVisibility + ? "not-allowed" + : "pointer", + opacity: !canChangeVisibility ? 0.5 : 1, accentColor: "#6366f1", }} /> @@ -998,10 +1027,10 @@ export const CorpusSettings: React.FC = ({ corpus }) => { style={{ fontSize: "0.9375rem", fontWeight: 600, - color: !canPermission ? "#94a3b8" : "#1e293b", + color: !canChangeVisibility ? "#94a3b8" : "#1e293b", }} > - {publicDraft ? "Public" : "Private"} + Make corpus publicly accessible
@@ -1072,7 +1101,7 @@ export const CorpusSettings: React.FC = ({ corpus }) => {