From 0d8135567d96833d70f66562fca7f742b90978a3 Mon Sep 17 00:00:00 2001 From: jreakin Date: Tue, 27 Jan 2026 22:38:18 -0500 Subject: [PATCH] Initial project setup --- .env.example | 60 +++++ .gitignore | 54 +++++ .python-version | 1 + AGENTS.md | 384 +++++++++++++++++++++++++++++++ ARCHITECTURE.md | 370 +++++++++++++++++++++++++++++ GUARDRAILS.md | 160 +++++++++++++ PROJECT_STRUCTURE.md | 209 +++++++++++++++++ main.py | 6 + migrations/001_initial.sql | 64 ++++++ pyproject.toml | 30 +++ railway.toml | 9 + src/__init__.py | 1 + src/api/__init__.py | 1 + src/api/health.py | 11 + src/api/setup.py | 1 + src/api/webhook.py | 50 ++++ src/cli/__init__.py | 1 + src/cli/setup_cli.py | 108 +++++++++ src/config.py | 69 ++++++ src/db/__init__.py | 1 + src/db/client.py | 11 + src/db/repository.py | 158 +++++++++++++ src/main.py | 80 +++++++ src/models/__init__.py | 1 + src/models/agent_models.py | 19 ++ src/models/config_models.py | 35 +++ src/models/messenger.py | 23 ++ src/services/__init__.py | 1 + src/services/agent_service.py | 73 ++++++ src/services/copilot_service.py | 118 ++++++++++ src/services/facebook_service.py | 30 +++ src/services/reference_doc.py | 30 +++ src/services/scraper.py | 61 +++++ 33 files changed, 2230 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 .python-version create mode 100644 AGENTS.md create mode 100644 ARCHITECTURE.md create mode 100644 GUARDRAILS.md create mode 100644 PROJECT_STRUCTURE.md create mode 100644 main.py create mode 100644 migrations/001_initial.sql create mode 100644 pyproject.toml create mode 100644 railway.toml create mode 100644 src/__init__.py create mode 100644 src/api/__init__.py create mode 100644 src/api/health.py create mode 100644 src/api/setup.py create mode 100644 src/api/webhook.py create mode 100644 src/cli/__init__.py create mode 100644 src/cli/setup_cli.py create mode 100644 src/config.py create mode 100644 src/db/__init__.py create mode 100644 src/db/client.py create 
mode 100644 src/db/repository.py create mode 100644 src/main.py create mode 100644 src/models/__init__.py create mode 100644 src/models/agent_models.py create mode 100644 src/models/config_models.py create mode 100644 src/models/messenger.py create mode 100644 src/services/__init__.py create mode 100644 src/services/agent_service.py create mode 100644 src/services/copilot_service.py create mode 100644 src/services/facebook_service.py create mode 100644 src/services/reference_doc.py create mode 100644 src/services/scraper.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..868f3f7 --- /dev/null +++ b/.env.example @@ -0,0 +1,60 @@ +# Facebook Messenger Configuration +# Get these from your Facebook App settings at https://developers.facebook.com/ +FACEBOOK_PAGE_ACCESS_TOKEN=your_page_access_token_here +FACEBOOK_VERIFY_TOKEN=your_verify_token_here +FACEBOOK_APP_SECRET=your_app_secret_here # Optional, for signature verification + +# Supabase Configuration +# Get these from your Supabase project settings +SUPABASE_URL=https://your-project.supabase.co +SUPABASE_SERVICE_KEY=your_service_role_key_here + +# GitHub Copilot SDK Configuration +# Default Copilot CLI host (usually runs on localhost:5909) +COPILOT_CLI_HOST=http://localhost:5909 +# Enable/disable Copilot SDK (set to False to use OpenAI fallback) +COPILOT_ENABLED=True + +# OpenAI API Key (fallback when Copilot is unavailable) +# Get from https://platform.openai.com/api-keys +OPENAI_API_KEY=your_openai_api_key_here + +# Environment +# Options: local, railway, prod +ENV=local + +# Railway Configuration (automatically set by Railway) +# PORT=8000 # Railway sets this automatically, don't override + +## Variable Descriptions + +### Facebook Configuration +# FACEBOOK_PAGE_ACCESS_TOKEN: Long-lived page access token for your Facebook Page. +# Generate this in Facebook App Settings → Messenger → Access Tokens. +# FACEBOOK_VERIFY_TOKEN: Custom token you create for webhook verification. 
+# Must match the token you set in Facebook App webhook settings. +# FACEBOOK_APP_SECRET: Optional. Used for signature verification of incoming webhook requests. + +### Supabase Configuration +# SUPABASE_URL: Your Supabase project URL. Found in Project Settings → API. +# SUPABASE_SERVICE_KEY: Service role key (has admin privileges). +# Found in Project Settings → API. **Keep this secret!** + +### Copilot SDK Configuration +# COPILOT_CLI_HOST: Base URL for the GitHub Copilot CLI. Default is http://localhost:5909. +# COPILOT_ENABLED: Boolean flag to enable/disable Copilot SDK. +# Set to False to always use OpenAI fallback. + +### OpenAI Configuration +# OPENAI_API_KEY: API key for OpenAI (used as fallback when Copilot is unavailable). +# Get from https://platform.openai.com/api-keys + +### Environment +# ENV: Current environment. Options: local, railway, prod. +# Used for environment-specific behavior. + +## Security Notes +# - Never commit .env file to version control +# - Use .env.example as a template only +# - Rotate secrets regularly +# - Use different tokens/keys for different environments diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bd25dea --- /dev/null +++ b/.gitignore @@ -0,0 +1,54 @@ +# Python-generated files +__pycache__/ +*.py[oc] +*.pyc +*.pyo +*.pyd +.Python +build/ +dist/ +wheels/ +*.egg-info +.eggs/ + +# Virtual environments +.venv +venv/ +ENV/ +env/ + +# Environment variables +.env +.env.local +.env.*.local + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# Type checking +.mypy_cache/ +.dmypy.json +dmypy.json + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log +logs/ + +# Database +*.db +*.sqlite +*.sqlite3 diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..04e2079 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12.8 diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..507c039 --- /dev/null +++ b/AGENTS.md @@ -0,0 
+1,384 @@ +You are an expert Python software engineer working on this project. + +--- + +## Project Overview + +**Tech Stack:** +- Python 3.12+ with uv for package management +- FastAPI for API services +- PostgreSQL/Supabase for database +- PydanticAI for AI agent framework +- GitHub Copilot SDK for LLM operations +- Pytest + Hypothesis for testing +- Ruff for linting and formatting +- Pre-commit hooks for code quality + +**Architecture:** +- Async-first design (asyncio, async/await throughout) +- Dependency injection via FastAPI's DI system +- Repository pattern for database operations +- PydanticAI agent for message handling +- Copilot SDK with OpenAI fallback + +**File Structure:** +``` +src/ +├── api/ # FastAPI routers and endpoints +├── models/ # Pydantic models for validation +├── services/ # Business logic (scraper, agent, copilot, facebook) +├── db/ # Database client and repositories +└── cli/ # CLI commands for setup +tests/ +├── unit/ # Fast, isolated unit tests +├── integration/ # Database and service integration tests +└── e2e/ # End-to-end API tests +migrations/ # Database migrations +``` + +--- + +## Commands You Must Know + +### Development +```bash +uv sync # Install/sync dependencies +uv run pytest # Run all tests +uv run pytest -v --cov=src # Run with coverage report +uv run pytest -k "test_name" # Run specific test +uv run ruff check . # Lint code +uv run ruff format . 
# Format code +uv run uvicorn src.main:app --reload # Start dev server +``` + +### Testing Strategy +```bash +# Fast feedback loop - run frequently +uv run pytest tests/unit -x # Stop on first failure + +# Before commits - comprehensive +uv run pytest --cov=src --cov-report=term-missing + +# Property-based testing with Hypothesis +uv run pytest tests/unit/test_validation.py -v --hypothesis-show-statistics +``` + +### Database +```bash +# Apply migrations to Supabase +# Use Supabase Dashboard SQL Editor or Supabase CLI +supabase db push # If using Supabase CLI +``` + +### CLI Setup +```bash +uv run python -m src.cli.setup_cli setup # Interactive bot setup +``` + +--- + +## Code Style Standards + +### Naming Conventions + +| Type | Convention | Examples | +|------|------------|----------| +| Functions/variables | `snake_case` | `get_bot_config`, `scrape_website` | +| Classes | `PascalCase` | `CopilotService`, `MessengerAgentService` | +| Constants | `UPPER_SNAKE_CASE` | `MAX_RETRIES`, `COPILOT_CLI_HOST` | +| Private methods | `_leading_underscore` | `_validate_input`, `_fallback_to_openai` | +| Type variables | `PascalCase` with `T` prefix | `TModel`, `TResponse` | + +### Modern Python Patterns + +✅ **GOOD** — Type hints, async/await, context managers + +```python +from typing import List +from contextlib import asynccontextmanager + +async def get_bot_configuration_by_page_id(page_id: str) -> BotConfiguration | None: + """Fetch bot configuration by Facebook Page ID.""" + if not page_id: + raise ValueError("page_id is required") + + supabase = get_supabase_client() + result = supabase.table("bot_configurations").select("*").eq("page_id", page_id).execute() + + if not result.data: + return None + + return BotConfiguration(**result.data[0]) +``` + +❌ **BAD** — No types, sync code, poor error handling + +```python +def get_bot(id): + # Synchronous, no type hints, swallows errors + try: + bot = db.query(Bot).filter(Bot.id == id).first() + return bot + except: + return 
None +``` + +### FastAPI Route Standards + +✅ **GOOD** — Proper dependencies, response models, error handling + +```python +from fastapi import APIRouter, Request, Response +from fastapi.responses import PlainTextResponse +from src.config import get_settings + +router = APIRouter() + +@router.get("") +async def verify_webhook(request: Request): + """Facebook webhook verification endpoint.""" + settings = get_settings() + + mode = request.query_params.get("hub.mode") + token = request.query_params.get("hub.verify_token") + challenge = request.query_params.get("hub.challenge") + + if mode == "subscribe" and token == settings.facebook_verify_token: + return PlainTextResponse(challenge) + + return Response(status_code=403) +``` + +❌ **BAD** — No dependency injection, dict response, poor status codes + +```python +@app.get("/webhook") +async def webhook(data: dict): + if data.get("token") == "secret": # Hardcoded! + return {"status": "ok"} # Wrong! Use proper response + return {"error": "invalid"} # Wrong! 
Use HTTPException +``` + +--- + +## Testing Practices + +### Write Tests That Cover Edge Cases + +```python +import pytest +from hypothesis import given, strategies as st +from src.services.scraper import scrape_website + +class TestWebsiteScraper: + """Test website scraping comprehensively.""" + + def test_valid_url_accepted(self): + """Valid URLs should be accepted.""" + # Test with mock HTTP response + pass + + @pytest.mark.parametrize("invalid_url", [ + "", + "not-a-url", + "ftp://example.com", + None, + ]) + def test_invalid_urls_rejected(self, invalid_url): + """Invalid URLs should be rejected.""" + with pytest.raises(ValueError): + scrape_website(invalid_url) + + @pytest.mark.asyncio + async def test_scraping_handles_timeout(self): + """Scraper should handle timeouts gracefully.""" + # Test timeout handling + pass +``` + +### Use Fixtures for Shared Setup + +```python +@pytest.fixture +def mock_supabase_client(): + """Provide mock Supabase client for tests.""" + # Mock implementation + pass + +@pytest.fixture +async def sample_bot_config(mock_supabase_client): + """Create a sample bot configuration for tests.""" + # Create test bot config + pass +``` + +--- + +## Git Workflow + +### Commit Messages +Follow conventional commits: +``` +feat: add Facebook webhook verification endpoint +fix: resolve async issue in scraper service +refactor: extract Copilot service to separate module +test: add integration tests for agent service +docs: update README with deployment instructions +``` + +### Before Every Commit +```bash +uv run ruff format . # Format code +uv run ruff check . 
--fix # Fix auto-fixable issues +uv run pytest # All tests must pass +``` + +### PR Requirements +- Title format: `[component] Brief description` +- All tests passing in CI +- Code coverage > 85% +- No `ruff` warnings +- Updated documentation if API changes + +--- + +## Security & Best Practices + +### Environment Variables +```python +# ✅ GOOD - Use pydantic-settings +from pydantic_settings import BaseSettings, SettingsConfigDict + +class Settings(BaseSettings): + model_config = SettingsConfigDict(env_file=".env") + + facebook_page_access_token: str + supabase_service_key: str + copilot_enabled: bool = True + +settings = Settings() +``` + +### Secrets Management +- Store secrets in `.env` (gitignored) +- Use Railway environment variables for production +- Never log sensitive data (tokens, keys) +- Sanitize error messages in production + +### Database Queries +✅ **GOOD** — Use Supabase client (parameterized queries) + +```python +async def get_bot_by_page_id(page_id: str) -> BotConfiguration | None: + supabase = get_supabase_client() + result = supabase.table("bot_configurations").select("*").eq("page_id", page_id).execute() + # Supabase client handles parameterization + return BotConfiguration(**result.data[0]) if result.data else None +``` + +❌ **BAD** — SQL injection risk (never use raw SQL strings) + +```python +async def get_bot_by_page_id(page_id: str): + query = f"SELECT * FROM bot_configurations WHERE page_id = '{page_id}'" # NEVER DO THIS + return await db.execute(query) +``` + +--- + +## Boundaries & Guardrails + +### ✅ ALWAYS DO +- Write type hints for all functions and classes +- Use async/await for I/O operations +- Add docstrings to public functions +- Write tests for new features +- Run `ruff format` before committing +- Use dependency injection via FastAPI's `Depends()` +- Validate all external input with Pydantic models +- Use context managers for resources (HTTP clients, DB sessions) +- Handle Copilot SDK failures gracefully with OpenAI fallback + 
+### ⚠️ ASK FIRST +- Adding new dependencies (check with `uv add`) +- Changing database schema (needs migration review) +- Modifying API contracts (breaks Facebook webhook) +- Changing authentication/authorization logic +- Altering CI/CD configuration +- Modifying Copilot SDK integration + +### 🚫 NEVER DO +- Commit secrets, API keys, or credentials +- Use `print()` instead of logging +- Swallow exceptions with bare `except:` +- Modify code in `site-packages/` or `.venv/` +- Store passwords in plain text +- Use `eval()` or `exec()` with user input +- Import from `__init__.py` files you didn't create +- Hardcode Facebook tokens or Supabase keys +- Skip error handling for external API calls + +--- + +## Common Patterns in This Project + +### Background Tasks +```python +from fastapi import BackgroundTasks + +@router.post("/webhook") +async def handle_webhook( + payload: MessengerWebhookPayload, + background_tasks: BackgroundTasks, +): + # Process message in background + background_tasks.add_task(process_message, payload) + return {"status": "ok"} +``` + +### Error Handling +```python +from fastapi import HTTPException, status + +try: + bot_config = await get_bot_configuration_by_page_id(page_id) +except ValueError as e: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=str(e) + ) +``` + +### Dependency Injection +```python +from typing import Annotated +from fastapi import Depends, Request + +async def get_copilot_service(request: Request) -> CopilotService: + """Get Copilot service from app state.""" + return request.app.state.copilot + +CopilotDep = Annotated[CopilotService, Depends(get_copilot_service)] + +@router.post("/message") +async def process_message( + message: str, + copilot: CopilotDep +): + # Use copilot service + pass +``` + +### Async Service Calls +```python +async def scrape_and_synthesize(url: str) -> str: + """Scrape website and synthesize reference document.""" + # Scrape + chunks = await scrape_website(url) + + # 
Synthesize via Copilot + copilot = CopilotService(base_url=settings.copilot_cli_host) + reference_doc = await copilot.synthesize_reference(url, chunks) + + return reference_doc +``` diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..228cc66 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,370 @@ +# Architecture + +Define your agent system's structure, decision flows, and component interactions. + +--- + +## System Overview + +The Facebook Messenger AI Bot is a production-ready FastAPI application that creates AI-powered Facebook Messenger bots. The system uses a single-agent architecture with PydanticAI, powered by GitHub Copilot SDK (with OpenAI fallback), to answer questions based on synthesized reference documents from scraped websites. + +**High-Level Flow:** +``` +Facebook Messenger → Webhook → FastAPI → Agent Service → Copilot SDK → Response → Facebook Messenger +``` + +**Key Components:** +- **FastAPI Application**: Webhook endpoints for Facebook Messenger +- **PydanticAI Agent**: Message processing and response generation +- **Copilot SDK Service**: LLM operations with fallback +- **Scraper Service**: Website content extraction +- **Reference Document Service**: Content synthesis +- **Supabase Database**: Configuration and message history storage +- **Facebook Service**: Message sending via Graph API + +--- + +## Agent Roles & Responsibilities + +| Agent Name | Purpose | Tools | Output | +|------------|---------|-------|--------| +| MessengerAgentService | Process user messages and generate responses | CopilotService (chat), Reference Document (read), Message History (read) | AgentResponse (message, confidence, escalation flags) | + +**Single Agent Architecture:** +- One primary agent handles all message processing +- Agent uses reference document as knowledge base +- Agent maintains conversation context via recent messages +- Agent escalates to human when confidence is low or out of scope + +--- + +## Decision Flow + +``` 
+User Message (Facebook Messenger) + ↓ +Webhook Endpoint (FastAPI) + ↓ +Parse Message & Extract sender_id, page_id + ↓ +Lookup Bot Configuration (Supabase) + ↓ +Build AgentContext (reference_doc + tone + recent_messages) + ↓ +MessengerAgentService.respond() + ↓ + ├─→ Low Confidence (< 0.7) → Escalate to Human + ├─→ Out of Scope → Escalate to Human + └─→ Valid Response → Send via Facebook Graph API + ↓ +Save to Message History (Supabase) +``` + +**Setup Flow:** +``` +CLI Setup Command + ↓ +Scrape Website → Text Chunks + ↓ +Copilot SDK: Synthesize Reference Document + ↓ +Store Reference Document (Supabase) + ↓ +Create Bot Configuration (Supabase) + ↓ +Ready for Messages +``` + +--- + +## Data Flow + +### Input Schemas + +**Webhook Payload:** +```python +class MessengerWebhookPayload(BaseModel): + object: str + entry: list[dict] # Facebook webhook entry structure +``` + +**Message Input:** +```python +class MessengerMessageIn(BaseModel): + sender_id: str + recipient_id: str + text: str | None + timestamp: int +``` + +### State Management + +**AgentContext:** +```python +class AgentContext(BaseModel): + bot_config_id: str + reference_doc: str # Full markdown reference document + tone: str # Communication tone (professional, friendly, etc.) 
+ recent_messages: list[str] # Last 3 messages for context +``` + +**AgentResponse:** +```python +class AgentResponse(BaseModel): + message: str # Response text (max 300 chars) + confidence: float # 0.0 to 1.0 + requires_escalation: bool + escalation_reason: str | None +``` + +### Output Formats + +- **Success**: AgentResponse with message and confidence > 0.7 +- **Escalation**: AgentResponse with requires_escalation = True +- **Error**: HTTPException with appropriate status code + +--- + +## Orchestration Pattern + +**Used Pattern:** Single-agent with tools + +**Reasoning:** +- Simple use case: Answer questions based on reference document +- No need for complex multi-agent coordination +- Single agent can handle all message types +- Easier to maintain and debug +- Lower latency (no agent handoffs) + +**Agent Tools:** +1. **CopilotService.chat()**: LLM chat interface +2. **Reference Document Access**: Read-only access to synthesized content +3. **Message History**: Read recent conversation context +4. 
**Facebook Service**: Send messages (called after agent response) + +--- + +## Tools & External Systems + +### Tool Registry + +| Tool | Risk | Description | +|------|------|-------------| +| `scrape_website` | 🟢 LOW | Read-only website scraping, timeout limits | +| `build_reference_doc` | 🟢 LOW | Content synthesis via Copilot SDK | +| `get_bot_configuration` | 🟢 LOW | Read-only database query | +| `get_reference_document` | 🟢 LOW | Read-only database query | +| `agent_service.respond` | 🟡 MEDIUM | AI response generation, confidence-based | +| `send_message` (Facebook) | 🟡 MEDIUM | Send message via Facebook Graph API | +| `save_message_history` | 🟡 MEDIUM | Write message to database | +| `create_bot_configuration` | 🟠 HIGH | Create new bot (CLI only, requires validation) | + +### External Systems + +**GitHub Copilot SDK:** +- Primary LLM provider +- Endpoint: `COPILOT_CLI_HOST` (default: http://localhost:5909) +- Fallback: OpenAI API if Copilot unavailable +- Operations: Chat completion, content synthesis + +**Facebook Graph API:** +- Send messages to users +- Endpoint: `https://graph.facebook.com/v18.0/me/messages` +- Authentication: Page Access Token +- Rate limits: Handled by Facebook + +**Supabase (PostgreSQL):** +- Database for bot configurations +- Reference documents storage +- Message history logging +- Connection: Via Supabase Python client + +--- + +## Error Recovery & Fallback Logic + +### Copilot SDK Failures + +**Detection:** +- Health check failures +- HTTP timeout errors +- Invalid response format + +**Recovery:** +1. Check `copilot.is_available()` before use +2. If unavailable, automatically fall back to OpenAI +3. Log fallback event for monitoring +4.
Continue processing with OpenAI + +**Fallback Implementation:** +```python +async def chat(self, system_prompt: str, messages: list[dict]) -> str: + if not await self.is_available(): + logger.warning("Copilot SDK unavailable, using OpenAI fallback") + return await self._fallback_to_openai(system_prompt, messages) + # ... use Copilot SDK +``` + +### Facebook API Failures + +**Detection:** +- HTTP error codes (4xx, 5xx) +- Invalid token responses +- Rate limit errors + +**Recovery:** +1. Retry with exponential backoff (max 3 retries) +2. Log error for monitoring +3. If persistent, alert admin +4. Continue processing (don't block other messages) + +### Database Failures + +**Detection:** +- Connection timeouts +- Query errors +- Transaction failures + +**Recovery:** +1. Retry with backoff (max 3 retries) +2. Use cached bot configurations if available +3. Log error for monitoring +4. Alert admin if persistent + +### Agent Response Failures + +**Detection:** +- Low confidence scores (< 0.7) +- Out-of-scope queries +- Invalid response format + +**Recovery:** +1. Set `requires_escalation = True` +2. Return default escalation message +3. Log for human review +4. 
Continue processing other messages + +--- + +## Component Interactions + +### Request Flow + +``` +┌─────────────────┐ +│ Facebook │ +│ Messenger │ +└────────┬────────┘ + │ POST /webhook + ↓ +┌─────────────────┐ +│ FastAPI │ +│ Webhook Handler │ +└────────┬────────┘ + │ + ├─→ Parse payload + ├─→ Extract page_id + │ + ↓ +┌─────────────────┐ +│ Repository │ +│ (Supabase) │ +└────────┬────────┘ + │ + ├─→ Get bot_config + ├─→ Get reference_doc + ├─→ Get recent messages + │ + ↓ +┌─────────────────┐ +│ Agent Service │ +│ (PydanticAI) │ +└────────┬────────┘ + │ + ├─→ Build context + ├─→ Call Copilot SDK + ├─→ Generate response + │ + ↓ +┌─────────────────┐ +│ Facebook │ +│ Service │ +└────────┬────────┘ + │ + ├─→ Send message + │ + ↓ +┌─────────────────┐ +│ Repository │ +│ (Save history) │ +└─────────────────┘ +``` + +### Setup Flow + +``` +┌─────────────────┐ +│ CLI Setup │ +└────────┬────────┘ + │ + ├─→ Get website URL + │ + ↓ +┌─────────────────┐ +│ Scraper Service │ +└────────┬────────┘ + │ + ├─→ Scrape website + ├─→ Chunk text + │ + ↓ +┌─────────────────┐ +│ Copilot Service │ +└────────┬────────┘ + │ + ├─→ Synthesize reference doc + │ + ↓ +┌─────────────────┐ +│ Repository │ +│ (Save config) │ +└─────────────────┘ +``` + +--- + +## Scalability Considerations + +### Current Architecture +- Single FastAPI instance +- Single agent per message +- Direct database connections +- Synchronous message processing + +### Future Scaling Options +- **Horizontal Scaling**: Multiple FastAPI instances behind load balancer +- **Message Queue**: Use Redis/RabbitMQ for async message processing +- **Caching**: Redis cache for bot configurations +- **Database Connection Pooling**: Supabase connection pooling +- **Agent Pooling**: Multiple agent instances for concurrent processing + +--- + +## Security Architecture + +### Authentication & Authorization +- Facebook webhook verification via verify_token +- Page Access Token validation +- Supabase service key for database access + +### Data 
Protection +- Environment variables for secrets +- Encrypted database connections (Supabase) +- HTTPS for all external communications +- PII masking in logs + +### Input Validation +- Pydantic models for all inputs +- URL validation for website scraping +- Message length limits +- Rate limiting per user diff --git a/GUARDRAILS.md b/GUARDRAILS.md new file mode 100644 index 0000000..94cc8b4 --- /dev/null +++ b/GUARDRAILS.md @@ -0,0 +1,160 @@ +This document defines the safety boundaries, validation rules, and risk mitigation strategies for the Facebook Messenger AI Bot. + +--- + +## Risk Classification + +### Input Validation +- Reject inputs > **1000** characters (Facebook Messenger limit is 2000, but we enforce a stricter limit) +- Blocklist: list of blocked keywords/patterns for prompt injection +- Jailbreak detection: use a relevance classifier for suspicious patterns +- URL validation: Only allow HTTP/HTTPS URLs for website scraping +- Message rate limiting: Max 10 messages per user per minute + +--- + +### Tool Risk Levels + +| Tool | Risk | Guardrail | Human Approval?
| +|------|------|-----------|-----------------| +| `scrape_website` | 🟢 LOW | URL validation, timeout limits | No | +| `build_reference_doc` | 🟢 LOW | Content size limits, hash verification | No | +| `get_bot_configuration` | 🟢 LOW | Read-only operation | No | +| `send_message` (Facebook) | 🟡 MEDIUM | Message validation, rate limiting | No | +| `agent_service.respond` | 🟡 MEDIUM | Confidence threshold, escalation flags | Conditional | +| `create_bot_configuration` | 🟠 HIGH | Validation + audit log | No (CLI only) | +| `update_bot_configuration` | 🟠 HIGH | Approval gate + audit log | **Yes** | +| `delete_bot_configuration` | 🔴 CRITICAL | Approval gate + 24hr delay | **Yes** | + +--- + +### PII Handling + +| Field | Policy | +|------|--------| +| Fields to redact | Facebook user IDs, phone numbers, email addresses (if detected) | +| Logging policy | Log message metadata but mask PII in logs | +| Data retention | Message history: 90 days, Bot configs: Indefinite (until deletion) | +| Facebook data | Only store message text and sender_id (no profile data) | + +--- + +### Moderation & Safety + +🛡️ **Use OpenAI Moderation API for:** +- Hate speech +- Violence/harm +- Harassment +- Self-harm content + +**Custom safety classifier** for: +- Political misinformation detection +- Off-topic queries (outside reference document scope) +- Spam detection + +**Response Filtering:** +- Agent responses must be under 300 characters (Facebook Messenger best practice) +- Escalate if confidence < 0.7 +- Flag for human review if requires_escalation = True + +--- + +## Escalation Rules + +🚨 **When agent should hand off to human:** +- Confidence score < 0.7 for specific task +- Agent exceeds 3 retries for same message +- User provides conflicting/incomplete info +- Attempting high-risk action (config updates, deletions) +- Detected prompt injection attempt +- Message contains PII that needs special handling +- Response would exceed 300 characters (requires summarization) + +**Escalation 
Actions:** +1. Log incident with full context +2. Set `requires_escalation = True` in AgentResponse +3. Send default message: "I'm not sure about that. Let me connect you with a team member who can help." +4. Create escalation ticket in monitoring system +5. Notify admin via configured alert channel + +--- + +## Incident Response + +How to detect, log, and respond to: + +| Incident Type | Detection | Response | +|---------------|-----------|----------| +| Prompt injection attempts | Pattern matching (e.g., "ignore previous instructions", "system:", "you are now"), anomaly detection | Log + block + alert admin | +| System prompt leaks | Output scanning for internal prompts or system messages | Immediate termination + review + rotate tokens | +| Unauthorized tool access | Permission checks, page_id validation | Deny + audit log + alert | +| Hallucinations | Fact-checking against reference doc, confidence thresholds | Flag for review + escalate + update reference doc if needed | +| Rate limit exceeded | Message count tracking per sender_id | Throttle + return rate limit message | +| Copilot SDK failure | Health check failures, timeout errors | Automatic fallback to OpenAI + log incident | +| Facebook API errors | HTTP error codes, invalid token responses | Log + retry with exponential backoff + alert if persistent | +| Database connection failures | Connection timeout, query errors | Retry with backoff + fallback to cached configs + alert | + +--- + +## Content Safety + +### Message Content Validation + +**Before Processing:** +- Check message length (max 1000 chars) +- Scan for blocklisted patterns +- Validate sender_id format +- Check rate limits + +**During Processing:** +- Monitor for prompt injection patterns +- Track confidence scores +- Validate response length +- Check against reference document scope + +**After Processing:** +- Sanitize response (remove any system prompts) +- Validate response format +- Check for PII leakage +- Log for audit trail + +### 
Reference Document Safety + +- Content must come from a verified website URL +- Content hash verification to detect tampering +- Maximum document size: 50,000 characters +- Regular content updates to prevent staleness + +--- + +## Monitoring & Alerting + +### Key Metrics to Monitor +- Message processing latency (p50, p95, p99) +- Agent confidence scores (distribution) +- Escalation rate (% of messages requiring human) +- Copilot SDK availability and fallback rate +- Facebook API error rate +- Database query performance + +### Alert Thresholds +- 🟡 Warning: Escalation rate > 20% +- 🟠 High: Copilot SDK unavailable > 5 minutes +- 🔴 Critical: Facebook API error rate > 10% +- 🔴 Critical: Database connection failures > 3 consecutive + +--- + +## Compliance & Privacy + +### Facebook Messenger Compliance +- Comply with Facebook Messenger Platform policies +- Respect user privacy and data protection +- Provide opt-out mechanisms +- Handle user data deletion requests + +### Data Protection +- Encrypt sensitive data at rest (Supabase handles this) +- Use HTTPS for all external communications +- Rotate API keys and tokens regularly +- Audit log access to sensitive operations diff --git a/PROJECT_STRUCTURE.md b/PROJECT_STRUCTURE.md new file mode 100644 index 0000000..89f3369 --- /dev/null +++ b/PROJECT_STRUCTURE.md @@ -0,0 +1,209 @@ +# Project Structure + +## Overview + +Complete directory structure for the Facebook Messenger AI Bot project with explanations of each file and directory.
+ +## Directory Tree + +```text +messenger_bot/ +├── src/ +│ ├── __init__.py # Package initialization +│ ├── main.py # FastAPI app initialization +│ ├── config.py # Settings (Pydantic BaseSettings) +│ ├── api/ # API routes and endpoints +│ │ ├── __init__.py +│ │ ├── webhook.py # Facebook webhook endpoints +│ │ ├── setup.py # HTTP setup endpoints (optional) +│ │ └── health.py # /health for Railway +│ ├── models/ # Pydantic models for data validation +│ │ ├── __init__.py +│ │ ├── messenger.py # Incoming/outgoing FB models +│ │ ├── config_models.py # Bot + FB config models +│ │ └── agent_models.py # AgentContext, AgentResponse +│ ├── services/ # Business logic and service layer +│ │ ├── __init__.py +│ │ ├── scraper.py # Website scraping & chunking +│ │ ├── copilot_service.py # Copilot SDK wrapper +│ │ ├── reference_doc.py # Build reference doc via Copilot +│ │ ├── agent_service.py # PydanticAI agent +│ │ └── facebook_service.py # Send messages to FB Graph API +│ ├── db/ # Database client and repository +│ │ ├── __init__.py +│ │ ├── client.py # Supabase client +│ │ └── repository.py # Bot config / message history +│ └── cli/ # CLI commands for setup +│ ├── __init__.py +│ └── setup_cli.py # Typer-based interactive setup +├── migrations/ # Database migrations +│ └── 001_initial.sql # Initial schema +├── pyproject.toml # Project configuration +├── .env.example # Environment variables template +├── railway.toml # Railway deployment config +└── README.md # Project documentation +``` + +## File Descriptions + +### Root Level + +- **pyproject.toml**: Python project configuration, dependencies, and build system +- **.env.example**: Template for environment variables +- **railway.toml**: Railway deployment configuration +- **README.md**: Project documentation +- **PROJECT_STRUCTURE.md**: This file + +### src/ Directory + +Main application package. 
+ +#### src/main.py + +FastAPI application initialization: +- Creates FastAPI app instance +- Registers routers (health, webhook; the optional setup router is not wired up yet) +- Lifespan startup handler (Supabase init, Copilot availability check) +- Uvicorn configuration for Railway + +#### src/config.py + +Application configuration using Pydantic BaseSettings: +- Loads environment variables +- Provides type-safe settings access +- Includes `get_settings()` helper with `lru_cache()` + +### src/api/ Directory + +API routes and endpoints. + +#### src/api/webhook.py + +Facebook webhook endpoints: +- `GET /webhook`: Webhook verification +- `POST /webhook`: Message event handling + +#### src/api/health.py + +Health check endpoint: +- `GET /health`: Returns `{"status": "ok"}` for Railway + +#### src/api/setup.py + +Optional HTTP setup endpoints (alternative to CLI); currently a placeholder module. + +### src/models/ Directory + +Pydantic models for data validation. + +#### src/models/messenger.py + +Facebook Messenger models: +- `MessengerEntry`: Webhook entry +- `MessengerMessageIn`: Incoming message +- `MessengerWebhookPayload`: Full webhook payload + +#### src/models/config_models.py + +Configuration models: +- `WebsiteInput`: Website URL input +- `TonePreference`: Communication tone +- `FacebookConfig`: Facebook app configuration +- `BotConfiguration`: Complete bot configuration + +#### src/models/agent_models.py + +Agent models: +- `AgentContext`: Context for agent responses +- `AgentResponse`: Agent response with confidence and escalation flags + +### src/services/ Directory + +Business logic and service layer.
+ +#### src/services/scraper.py + +Website scraping service: +- Uses `httpx.AsyncClient` for async requests +- Parses HTML with BeautifulSoup +- Chunks text into segments of about 650 words (midpoint of the 500-800 word target range; the final chunk may be shorter) + +#### src/services/copilot_service.py + +GitHub Copilot SDK wrapper: +- Async wrapper over Copilot SDK runtime +- Falls back to OpenAI when Copilot is unavailable (fallback not yet implemented) +- Methods: `is_available()`, `synthesize_reference()`, `chat()` + +#### src/services/reference_doc.py + +Reference document builder: +- Uses Copilot to synthesize markdown from chunks +- Returns the markdown and its SHA-256 content hash; storage in the `reference_documents` table is handled by `src/db/repository.py` + +#### src/services/agent_service.py + +PydanticAI agent service: +- Builds system prompt with reference doc and tone +- Returns typed `AgentResponse` +- Handles message context + +#### src/services/facebook_service.py + +Facebook Graph API wrapper: +- Sends messages via `me/messages` endpoint +- Uses page access token + +### src/db/ Directory + +Database client and repository layer. + +#### src/db/client.py + +Supabase client initialization: +- Creates and configures Supabase client +- Handles connection management + +#### src/db/repository.py + +Database repository: +- Bot configuration creation and lookup +- Message history storage +- Reference document creation and queries + +### src/cli/ Directory + +CLI commands for setup and management.
+ +#### src/cli/setup_cli.py + +Interactive setup CLI using Typer: +- Website URL input and validation +- Scraping and reference doc generation +- Tone selection +- Facebook configuration +- Bot persistence + +### migrations/ Directory + +Database migration files: +- **001_initial.sql**: Initial schema with all tables, indexes, and triggers + +## Import Path Examples + +```python +# From src/main.py +from src.api import webhook, health +from src.config import get_settings +from src.db.client import get_supabase_client + +# From services +from src.services.scraper import scrape_website +from src.services.copilot_service import CopilotService +from src.services.agent_service import MessengerAgentService + +# From models +from src.models.messenger import MessengerWebhookPayload +from src.models.config_models import BotConfiguration +from src.models.agent_models import AgentContext, AgentResponse +``` diff --git a/main.py b/main.py new file mode 100644 index 0000000..bb27ea0 --- /dev/null +++ b/main.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from facebook-messenger-scrape-bot!") + + +if __name__ == "__main__": + main() diff --git a/migrations/001_initial.sql b/migrations/001_initial.sql new file mode 100644 index 0000000..77f55bb --- /dev/null +++ b/migrations/001_initial.sql @@ -0,0 +1,64 @@ +-- Initial database schema for Facebook Messenger AI Bot +-- Supabase PostgreSQL migration + +-- Bot configurations table +-- Stores configuration for each Facebook Page bot instance +create table bot_configurations ( + id uuid primary key default gen_random_uuid(), + page_id text not null unique, + website_url text not null, + reference_doc_id uuid not null, + tone text not null, + facebook_page_access_token text not null, + facebook_verify_token text not null, + created_at timestamptz default now(), + updated_at timestamptz default now(), + is_active boolean default true +); + +-- Reference documents table +-- Stores synthesized reference documents created from scraped 
website content +create table reference_documents ( + id uuid primary key default gen_random_uuid(), + bot_id uuid not null references bot_configurations(id) on delete cascade, + content text not null, + source_url text not null, + content_hash text not null, + created_at timestamptz default now() +); + +-- Message history table +-- Stores conversation history for analytics and debugging +create table message_history ( + id uuid primary key default gen_random_uuid(), + bot_id uuid not null references bot_configurations(id) on delete cascade, + sender_id text not null, + message_text text not null, + response_text text not null, + confidence float, + requires_escalation boolean default false, + created_at timestamptz default now() +); + +-- Indexes for performance +create index idx_bot_configurations_page_id on bot_configurations(page_id); +create index idx_bot_configurations_is_active on bot_configurations(is_active); +create index idx_reference_documents_bot_id on reference_documents(bot_id); +create index idx_message_history_bot_id on message_history(bot_id); +create index idx_message_history_sender_id on message_history(sender_id); +create index idx_message_history_created_at on message_history(created_at); + +-- Function to update updated_at timestamp +create or replace function update_updated_at_column() +returns trigger as $$ +begin + new.updated_at = now(); + return new; +end; +$$ language plpgsql; + +-- Trigger to automatically update updated_at on bot_configurations +create trigger update_bot_configurations_updated_at + before update on bot_configurations + for each row + execute function update_updated_at_column(); diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..36e6e69 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,30 @@ +[project] +name = "facebook-messenger-scrape-bot" +version = "0.1.0" +description = "Production-ready FastAPI application for AI-powered Facebook Messenger bots using Copilot SDK and PydanticAI" +readme = 
"README.md" +requires-python = ">=3.12.8" +dependencies = [ + "fastapi>=0.104.0", + "uvicorn[standard]>=0.24.0", + "pydantic>=2.0.0", + "pydantic-ai>=0.0.10", + "pydantic-settings>=2.0.0", + "httpx>=0.25.0", + "beautifulsoup4>=4.12.0", + "supabase>=2.0.0", + "typer>=0.9.0", + "python-dotenv>=1.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.4.0", + "pytest-asyncio>=0.21.0", + "ruff>=0.1.0", + "mypy>=1.6.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" diff --git a/railway.toml b/railway.toml new file mode 100644 index 0000000..002a9cc --- /dev/null +++ b/railway.toml @@ -0,0 +1,9 @@ +[build] +builder = "NIXPACKS" + +[deploy] +startCommand = "uvicorn src.main:app --host 0.0.0.0 --port $PORT" +healthcheckPath = "/health" +healthcheckTimeout = 100 +restartPolicyType = "ON_FAILURE" +restartPolicyMaxRetries = 10 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..877e485 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +"""Facebook Messenger AI Bot - Main package.""" diff --git a/src/api/__init__.py b/src/api/__init__.py new file mode 100644 index 0000000..980330b --- /dev/null +++ b/src/api/__init__.py @@ -0,0 +1 @@ +"""API routes and endpoints.""" diff --git a/src/api/health.py b/src/api/health.py new file mode 100644 index 0000000..5f415d3 --- /dev/null +++ b/src/api/health.py @@ -0,0 +1,11 @@ +"""Health check endpoint for Railway.""" + +from fastapi import APIRouter + +router = APIRouter() + + +@router.get("/health") +async def health_check(): + """Health check endpoint for Railway.""" + return {"status": "ok"} diff --git a/src/api/setup.py b/src/api/setup.py new file mode 100644 index 0000000..092e0c3 --- /dev/null +++ b/src/api/setup.py @@ -0,0 +1 @@ +"""HTTP setup endpoints (optional).""" diff --git a/src/api/webhook.py b/src/api/webhook.py new file mode 100644 index 0000000..2938164 --- /dev/null +++ b/src/api/webhook.py @@ -0,0 +1,50 @@ +"""Facebook webhook endpoints.""" + 
+from fastapi import APIRouter, Request, Response +from fastapi.responses import PlainTextResponse + +from src.config import get_settings + +router = APIRouter() + + +@router.get("") +async def verify_webhook(request: Request): + """ + Facebook webhook verification endpoint. + + Facebook sends a GET request with: + - hub.mode: "subscribe" + - hub.verify_token: The token you set + - hub.challenge: A random string + + Returns the challenge if verify_token matches. + """ + settings = get_settings() + + mode = request.query_params.get("hub.mode") + token = request.query_params.get("hub.verify_token") + challenge = request.query_params.get("hub.challenge") + + if mode == "subscribe" and token == settings.facebook_verify_token: + return PlainTextResponse(challenge) + + return Response(status_code=403) + + +@router.post("") +async def handle_webhook(request: Request): + """ + Facebook webhook message handler. + + Processes incoming Messenger messages and responds via agent. + """ + # TODO: Implement message handling + # 1. Parse webhook payload + # 2. Extract message text and sender + # 3. Look up bot configuration by page_id + # 4. Build AgentContext + # 5. Call MessengerAgentService.respond + # 6. 
Send reply via Facebook Graph API + + return {"status": "ok"} diff --git a/src/cli/__init__.py b/src/cli/__init__.py new file mode 100644 index 0000000..127b79e --- /dev/null +++ b/src/cli/__init__.py @@ -0,0 +1 @@ +"""CLI commands for setup and management.""" diff --git a/src/cli/setup_cli.py b/src/cli/setup_cli.py new file mode 100644 index 0000000..4c3dcc2 --- /dev/null +++ b/src/cli/setup_cli.py @@ -0,0 +1,108 @@ +"""Typer-based interactive setup CLI.""" + +import asyncio +import typer +from typing_extensions import Annotated + +from src.services.scraper import scrape_website +from src.services.reference_doc import build_reference_doc +from src.services.copilot_service import CopilotService +from src.db.repository import create_bot_configuration, create_reference_document +from src.db.client import get_supabase_client +from src.config import get_settings + +app = typer.Typer() + + +@app.command() +def setup(): + """ + Interactive setup: + 1) Ask for website + 2) Scrape + build reference doc via Copilot + 3) Ask for tone (with recommendations) + 4) Ask for Facebook Page config + 5) Persist bot config in Supabase + """ + settings = get_settings() + supabase = get_supabase_client() + copilot = CopilotService( + base_url=settings.copilot_cli_host, + enabled=settings.copilot_enabled + ) + + # Step 1: Website URL + website_url = typer.prompt("What website should the bot be based on?") + + typer.echo(f"Scraping {website_url}...") + try: + text_chunks = asyncio.run(scrape_website(website_url)) + typer.echo(f"✓ Scraped {len(text_chunks)} text chunks") + except Exception as e: + typer.echo(f"✗ Error scraping website: {e}", err=True) + raise typer.Exit(1) + + # Step 2: Build reference doc + typer.echo("Generating reference document via Copilot...") + try: + markdown_content, content_hash = asyncio.run(build_reference_doc( + copilot, website_url, text_chunks + )) + typer.echo("✓ Reference document generated") + except Exception as e: + typer.echo(f"✗ Error generating 
reference doc: {e}", err=True) + raise typer.Exit(1) + + # Step 3: Tone selection + # TODO: Use Copilot to suggest tones from content + recommended_tones = ["Professional", "Friendly", "Casual"] + typer.echo(f"\nRecommended tones: {', '.join(recommended_tones)}") + tone = typer.prompt("Select a tone", default="Professional") + + # Step 4: Facebook configuration + page_id = typer.prompt("Facebook Page ID") + page_access_token = typer.prompt("Facebook Page Access Token") + verify_token = typer.prompt("Verify Token (for webhook)", default=typer.style("random-token-123", fg=typer.colors.YELLOW)) + + # Step 5: Store reference document + typer.echo("\nStoring reference document...") + try: + reference_doc_id = create_reference_document( + content=markdown_content, + source_url=website_url, + content_hash=content_hash + ) + typer.echo("✓ Reference document stored") + except Exception as e: + typer.echo(f"✗ Error storing reference document: {e}", err=True) + raise typer.Exit(1) + + # Step 6: Create bot configuration + typer.echo("\nCreating bot configuration...") + try: + bot_config = create_bot_configuration( + page_id=page_id, + website_url=website_url, + reference_doc_id=reference_doc_id, + tone=tone, + facebook_page_access_token=page_access_token, + facebook_verify_token=verify_token + ) + typer.echo("✓ Bot configuration created") + except Exception as e: + typer.echo(f"✗ Error creating bot configuration: {e}", err=True) + raise typer.Exit(1) + + # Step 7: Print webhook URL + typer.echo("\n" + "="*60) + typer.echo("✓ Setup complete!") + typer.echo("\nNext steps:") + typer.echo(f"1. Configure webhook URL in Facebook App settings:") + typer.echo(f" https://your-railway-url.railway.app/webhook") + typer.echo(f"2. Set verify token: {verify_token}") + typer.echo(f"3. 
Subscribe to 'messages' events") + typer.echo("="*60) + + +if __name__ == "__main__": + app() diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..1362e18 --- /dev/null +++ b/src/config.py @@ -0,0 +1,69 @@ +"""Application configuration using Pydantic BaseSettings.""" + +from functools import lru_cache +from typing import Literal + +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + """Application settings loaded from environment variables.""" + + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + case_sensitive=False + ) + + # Facebook Configuration + facebook_page_access_token: str = Field( + ..., + description="Facebook Page access token" + ) + facebook_verify_token: str = Field( + ..., + description="Webhook verification token" + ) + facebook_app_secret: str | None = Field( + default=None, + description="Facebook App secret (optional, for signature verification)" + ) + + # Supabase Configuration + supabase_url: str = Field( + ..., + description="Supabase project URL" + ) + supabase_service_key: str = Field( + ..., + description="Supabase service role key" + ) + + # Copilot SDK Configuration + copilot_cli_host: str = Field( + default="http://localhost:5909", + description="GitHub Copilot CLI host URL" + ) + copilot_enabled: bool = Field( + default=True, + description="Enable Copilot SDK (False to use OpenAI fallback)" + ) + + # OpenAI Configuration (Fallback) + openai_api_key: str = Field( + default="", + description="OpenAI API key (used as fallback)" + ) + + # Environment + env: Literal["local", "railway", "prod"] = Field( + default="local", + description="Current environment" + ) + + +@lru_cache() +def get_settings() -> Settings: + """Get cached settings instance.""" + return Settings() diff --git a/src/db/__init__.py b/src/db/__init__.py new file mode 100644 index 0000000..62cceb2 --- /dev/null +++ b/src/db/__init__.py @@ 
-0,0 +1 @@ +"""Database client and repository layer.""" diff --git a/src/db/client.py b/src/db/client.py new file mode 100644 index 0000000..0dcec51 --- /dev/null +++ b/src/db/client.py @@ -0,0 +1,11 @@ +"""Supabase client initialization.""" + +from supabase import create_client, Client + +from src.config import get_settings + + +def get_supabase_client() -> Client: + """Get Supabase client instance.""" + settings = get_settings() + return create_client(settings.supabase_url, settings.supabase_service_key) diff --git a/src/db/repository.py b/src/db/repository.py new file mode 100644 index 0000000..0f8c7ae --- /dev/null +++ b/src/db/repository.py @@ -0,0 +1,158 @@ +"""Bot configuration and message history repository.""" + +from datetime import datetime +from typing import Optional +import uuid + +from src.db.client import get_supabase_client +from src.models.config_models import BotConfiguration + + +def create_reference_document( + content: str, + source_url: str, + content_hash: str, +) -> str: + """ + Create a reference document (without bot_id initially). 
+ + Args: + content: Markdown content + source_url: Source website URL + content_hash: SHA256 hash of content + + Returns: + Reference document ID + """ + supabase = get_supabase_client() + + data = { + "content": content, + "source_url": source_url, + "content_hash": content_hash + } + + result = supabase.table("reference_documents").insert(data).execute() + + if not result.data: + raise ValueError("Failed to create reference document") + + return result.data[0]["id"] + + +def link_reference_document_to_bot(doc_id: str, bot_id: str) -> None: + """Link reference document to bot configuration.""" + supabase = get_supabase_client() + + supabase.table("reference_documents").update({ + "bot_id": bot_id + }).eq("id", doc_id).execute() + + +def create_bot_configuration( + page_id: str, + website_url: str, + reference_doc_id: str, + tone: str, + facebook_page_access_token: str, + facebook_verify_token: str, +) -> BotConfiguration: + """ + Create a new bot configuration. + + Args: + page_id: Facebook Page ID + website_url: Source website URL + reference_doc_id: Reference document UUID + tone: Communication tone + facebook_page_access_token: Page access token + facebook_verify_token: Webhook verify token + + Returns: + Created BotConfiguration + """ + supabase = get_supabase_client() + + now = datetime.utcnow() + bot_id = str(uuid.uuid4()) + + data = { + "id": bot_id, + "page_id": page_id, + "website_url": website_url, + "reference_doc_id": reference_doc_id, + "tone": tone, + "facebook_page_access_token": facebook_page_access_token, + "facebook_verify_token": facebook_verify_token, + "created_at": now.isoformat(), + "updated_at": now.isoformat(), + "is_active": True + } + + result = supabase.table("bot_configurations").insert(data).execute() + + if not result.data: + raise ValueError("Failed to create bot configuration") + + # Link reference document to bot + link_reference_document_to_bot(reference_doc_id, bot_id) + + return BotConfiguration(**result.data[0]) + + +def 
get_bot_configuration_by_page_id(page_id: str) -> Optional[BotConfiguration]: + """ + Get bot configuration by Facebook Page ID. + + Returns: + BotConfiguration if found, None otherwise + """ + supabase = get_supabase_client() + + result = supabase.table("bot_configurations").select("*").eq("page_id", page_id).eq("is_active", True).execute() + + if not result.data: + return None + + return BotConfiguration(**result.data[0]) + + +def get_reference_document(doc_id: str) -> Optional[dict]: + """ + Get reference document by ID. + + Returns: + Document dict with 'content' and other fields, or None + """ + supabase = get_supabase_client() + + result = supabase.table("reference_documents").select("*").eq("id", doc_id).execute() + + if not result.data: + return None + + return result.data[0] + + +def save_message_history( + bot_id: str, + sender_id: str, + message_text: str, + response_text: str, + confidence: float, + requires_escalation: bool = False, +) -> None: + """Save message to history.""" + supabase = get_supabase_client() + + data = { + "bot_id": bot_id, + "sender_id": sender_id, + "message_text": message_text, + "response_text": response_text, + "confidence": confidence, + "requires_escalation": requires_escalation, + "created_at": datetime.utcnow().isoformat() + } + + supabase.table("message_history").insert(data).execute() diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..fa34f52 --- /dev/null +++ b/src/main.py @@ -0,0 +1,80 @@ +"""FastAPI application initialization.""" + +import os +from contextlib import asynccontextmanager + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +from src.api import health, webhook +from src.config import get_settings +from src.db.client import get_supabase_client +from src.services.copilot_service import CopilotService + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Application lifespan manager for startup and shutdown.""" + # Startup + settings = 
get_settings() + + # Initialize Supabase client + supabase = get_supabase_client() + app.state.supabase = supabase + + # Check Copilot availability + copilot = CopilotService( + base_url=settings.copilot_cli_host, + enabled=settings.copilot_enabled + ) + app.state.copilot = copilot + + if settings.copilot_enabled: + is_available = await copilot.is_available() + if not is_available: + print("Warning: Copilot SDK not available, will use OpenAI fallback") + + yield + + # Shutdown + # Cleanup if needed + + +# Create FastAPI app +app = FastAPI( + title="Facebook Messenger AI Bot", + description="AI-powered Facebook Messenger bot using Copilot SDK and PydanticAI", + version="0.1.0", + lifespan=lifespan +) + +# CORS middleware (if needed for webhook testing) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # Configure appropriately for production + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Register routers +app.include_router(health.router, tags=["health"]) +app.include_router(webhook.router, prefix="/webhook", tags=["webhook"]) + + +@app.get("/") +def root(): + """Root endpoint.""" + return {"message": "Facebook Messenger AI Bot API"} + + +if __name__ == "__main__": + import uvicorn + + port = int(os.getenv("PORT", 8000)) + uvicorn.run( + "src.main:app", + host="0.0.0.0", + port=port, + reload=os.getenv("ENV") == "local" + ) diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..65b3a46 --- /dev/null +++ b/src/models/__init__.py @@ -0,0 +1 @@ +"""Pydantic models for data validation.""" diff --git a/src/models/agent_models.py b/src/models/agent_models.py new file mode 100644 index 0000000..acf36c9 --- /dev/null +++ b/src/models/agent_models.py @@ -0,0 +1,19 @@ +"""Agent context and response models.""" + +from pydantic import BaseModel + + +class AgentContext(BaseModel): + """Context for agent responses.""" + bot_config_id: str + reference_doc: str + tone: str + recent_messages: 
list[str] # Keep simple for now + + +class AgentResponse(BaseModel): + """Agent response with confidence and escalation flags.""" + message: str + confidence: float + requires_escalation: bool = False + escalation_reason: str | None = None diff --git a/src/models/config_models.py b/src/models/config_models.py new file mode 100644 index 0000000..9ad8143 --- /dev/null +++ b/src/models/config_models.py @@ -0,0 +1,35 @@ +"""Bot and Facebook configuration models.""" + +from datetime import datetime +from pydantic import BaseModel, Field +import uuid + + +class WebsiteInput(BaseModel): + """Website URL input for scraping.""" + url: str + + +class TonePreference(BaseModel): + """Communication tone preference.""" + tone: str # e.g. "professional", "friendly", "casual", "formal", "humorous" + description: str | None = None + + +class FacebookConfig(BaseModel): + """Facebook app configuration.""" + page_id: str + page_access_token: str + verify_token: str + + +class BotConfiguration(BaseModel): + """Complete bot configuration.""" + id: str = Field(default_factory=lambda: str(uuid.uuid4())) + page_id: str + website_url: str + reference_doc_id: str + tone: str + created_at: datetime + updated_at: datetime + is_active: bool = True diff --git a/src/models/messenger.py b/src/models/messenger.py new file mode 100644 index 0000000..97da3a6 --- /dev/null +++ b/src/models/messenger.py @@ -0,0 +1,23 @@ +"""Incoming/outgoing Facebook Messenger models.""" + +from pydantic import BaseModel + + +class MessengerEntry(BaseModel): + """Facebook webhook entry.""" + id: str + time: int + + +class MessengerMessageIn(BaseModel): + """Incoming Facebook Messenger message.""" + sender_id: str + recipient_id: str + text: str | None = None + timestamp: int + + +class MessengerWebhookPayload(BaseModel): + """Facebook webhook payload.""" + object: str + entry: list[dict] # Can be refined later with specific entry models diff --git a/src/services/__init__.py b/src/services/__init__.py new file mode 
100644 index 0000000..4cbffd4 --- /dev/null +++ b/src/services/__init__.py @@ -0,0 +1 @@ +"""Business logic and service layer.""" diff --git a/src/services/agent_service.py b/src/services/agent_service.py new file mode 100644 index 0000000..f7027d6 --- /dev/null +++ b/src/services/agent_service.py @@ -0,0 +1,73 @@ +"""PydanticAI agent service.""" + +from src.models.agent_models import AgentContext, AgentResponse +from src.services.copilot_service import CopilotService + + +class MessengerAgentService: + """Service for generating AI agent responses.""" + + def __init__(self, copilot: CopilotService): + """ + Initialize agent service. + + Args: + copilot: CopilotService instance + """ + self.copilot = copilot + + async def respond( + self, + context: AgentContext, + user_message: str, + ) -> AgentResponse: + """ + Generate agent response to user message. + + Args: + context: Agent context with reference doc and tone + user_message: User's message text + + Returns: + AgentResponse with message and confidence + """ + # Build system prompt + system_prompt = f"""You are a {context.tone} assistant for a political/business Facebook page. +Use ONLY the following reference document as your source of truth: + +{context.reference_doc} + +Answer in under 300 characters where possible. +If the user asks about something not covered, say you don't know and suggest a human follow-up. 
+""" + + # Build messages list + messages = [ + {"role": "user", "content": user_message} + ] + + # Add recent messages for context + for msg in context.recent_messages[-3:]: # Last 3 messages + messages.insert(-1, {"role": "user", "content": msg}) + + # Call Copilot service + response_text = await self.copilot.chat(system_prompt, messages) + + # Parse response and determine confidence + # TODO: Use PydanticAI for structured output if needed + confidence = 0.8 # Placeholder - could be determined by Copilot response + + # Check if escalation is needed + requires_escalation = False + escalation_reason = None + + if "don't know" in response_text.lower() or "human" in response_text.lower(): + requires_escalation = True + escalation_reason = "Question outside knowledge base" + + return AgentResponse( + message=response_text, + confidence=confidence, + requires_escalation=requires_escalation, + escalation_reason=escalation_reason + ) diff --git a/src/services/copilot_service.py b/src/services/copilot_service.py new file mode 100644 index 0000000..374e83c --- /dev/null +++ b/src/services/copilot_service.py @@ -0,0 +1,118 @@ +"""GitHub Copilot SDK wrapper service.""" + +import logging +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + + +class CopilotService: + """Wrapper for GitHub Copilot SDK runtime.""" + + def __init__(self, base_url: str, enabled: bool = True): + """ + Initialize Copilot service. 
+ + Args: + base_url: Base URL for Copilot CLI (e.g., http://localhost:5909) + enabled: Whether Copilot SDK is enabled + """ + self.base_url = base_url.rstrip('/') + self.enabled = enabled + + async def is_available(self) -> bool: + """Check if Copilot SDK is available.""" + if not self.enabled: + return False + + try: + async with httpx.AsyncClient(timeout=2.0) as client: + # Simple health check - adjust endpoint based on Copilot SDK API + response = await client.get(f"{self.base_url}/health") + return response.status_code == 200 + except Exception: + return False + + async def synthesize_reference( + self, + website_url: str, + text_chunks: list[str], + ) -> str: + """ + Use Copilot to synthesize a reference doc markdown string. + + Args: + website_url: Source website URL + text_chunks: List of text chunks from scraping + + Returns: + Synthesized markdown reference document + """ + system_prompt = ( + "You are a content synthesis assistant. Produce a concise but thorough " + "reference document for an AI agent that will answer questions about this website. " + "Focus on policies, services, FAQs, contact, and important positions." + ) + + user_prompt = f""" + Website URL: {website_url} + + Please synthesize the following content into a structured markdown reference document + with headings: Overview, Key Topics, Common Questions, Important Details. + + Content chunks: + {chr(10).join(f"--- Chunk {i+1} ---{chr(10)}{chunk}" for i, chunk in enumerate(text_chunks))} + """ + + return await self.chat(system_prompt, [ + {"role": "user", "content": user_prompt} + ]) + + async def chat( + self, + system_prompt: str, + messages: list[dict[str, str]], + ) -> str: + """ + General chat wrapper used by agent_service. 
+ + Args: + system_prompt: System prompt for the conversation + messages: List of message dicts with 'role' and 'content' + + Returns: + Response text from Copilot + """ + if not self.enabled or not await self.is_available(): + # Fallback to OpenAI or other LLM + logger.warning("Copilot SDK not available, falling back to OpenAI") + return await self._fallback_to_openai(system_prompt, messages) + + try: + # TODO: Implement actual Copilot SDK API call + # This is a placeholder - adjust based on actual Copilot SDK API + async with httpx.AsyncClient(timeout=60.0) as client: + response = await client.post( + f"{self.base_url}/chat", + json={ + "system_prompt": system_prompt, + "messages": messages + } + ) + response.raise_for_status() + return response.json()["content"] + except Exception as e: + logger.error(f"Copilot SDK error: {e}, falling back to OpenAI") + return await self._fallback_to_openai(system_prompt, messages) + + async def _fallback_to_openai( + self, + system_prompt: str, + messages: list[dict[str, str]], + ) -> str: + """Fallback to OpenAI when Copilot is unavailable.""" + # TODO: Implement OpenAI API call + # This requires openai package and API key from settings + raise NotImplementedError("OpenAI fallback not yet implemented") diff --git a/src/services/facebook_service.py b/src/services/facebook_service.py new file mode 100644 index 0000000..79396a5 --- /dev/null +++ b/src/services/facebook_service.py @@ -0,0 +1,30 @@ +"""Send messages to Facebook Graph API service.""" + +import httpx + + +async def send_message( + page_access_token: str, + recipient_id: str, + text: str, +) -> None: + """ + Send message via Facebook Graph API. 
+ + Args: + page_access_token: Facebook Page access token + recipient_id: Facebook user ID to send message to + text: Message text to send + """ + url = "https://graph.facebook.com/v18.0/me/messages" + + params = {"access_token": page_access_token} + + payload = { + "recipient": {"id": recipient_id}, + "message": {"text": text} + } + + async with httpx.AsyncClient(timeout=10.0) as client: + response = await client.post(url, params=params, json=payload) + response.raise_for_status() diff --git a/src/services/reference_doc.py b/src/services/reference_doc.py new file mode 100644 index 0000000..d692aae --- /dev/null +++ b/src/services/reference_doc.py @@ -0,0 +1,30 @@ +"""Build reference document via Copilot service.""" + +import hashlib + +from src.services.copilot_service import CopilotService + + +async def build_reference_doc( + copilot: CopilotService, + website_url: str, + text_chunks: list[str], +) -> tuple[str, str]: + """ + Build reference document from text chunks using Copilot. + + Args: + copilot: CopilotService instance + website_url: Source website URL + text_chunks: List of text chunks from scraping + + Returns: + Tuple of (markdown_content, content_hash) + """ + # Synthesize reference document + markdown_content = await copilot.synthesize_reference(website_url, text_chunks) + + # Generate content hash + content_hash = hashlib.sha256(markdown_content.encode()).hexdigest() + + return markdown_content, content_hash diff --git a/src/services/scraper.py b/src/services/scraper.py new file mode 100644 index 0000000..9730693 --- /dev/null +++ b/src/services/scraper.py @@ -0,0 +1,61 @@ +"""Website scraping and chunking service.""" + +import re +from typing import List + +import httpx +from bs4 import BeautifulSoup + + +async def scrape_website(url: str, max_pages: int = 5) -> List[str]: + """ + Scrape website and return text chunks. 
+ + Args: + url: Root URL to scrape + max_pages: Maximum number of pages to scrape + + Returns: + List of text chunks (500-800 words each) + """ + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: + try: + response = await client.get(url) + response.raise_for_status() + except httpx.HTTPError as e: + raise ValueError(f"Failed to fetch {url}: {e}") + + soup = BeautifulSoup(response.text, "html.parser") + + # Remove script and style elements + for script in soup(["script", "style", "nav", "footer"]): + script.decompose() + + # Extract text + text = soup.get_text() + + # Normalize whitespace + text = re.sub(r'\s+', ' ', text) + text = text.strip() + + # Chunk into 500-800 word segments + words = text.split() + chunks = [] + current_chunk = [] + current_word_count = 0 + target_words = 650 # Target middle of 500-800 range + + for word in words: + current_chunk.append(word) + current_word_count += 1 + + if current_word_count >= target_words: + chunks.append(' '.join(current_chunk)) + current_chunk = [] + current_word_count = 0 + + # Add remaining words + if current_chunk: + chunks.append(' '.join(current_chunk)) + + return chunks