Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions migrations/006_scraped_pages_index.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
-- Scraped pages index: per-page metadata and vector embeddings for semantic search
-- Requires pgvector extension (Supabase has it built-in)

create extension if not exists vector;

-- Per-page metadata and content from scraped websites.
-- One row per page; uniqueness per (normalized_url, reference_doc_id) is
-- enforced by idx_scraped_pages_normalized_url_reference_doc below.
create table scraped_pages (
id uuid primary key default gen_random_uuid(),
-- Owning reference document; deleting it cascades to its pages (and chunks).
reference_doc_id uuid not null references reference_documents(id) on delete cascade,
url text not null,
-- Canonicalized form of url, used for de-duplication across scrapes.
normalized_url text not null,
title text,
raw_content text not null,
word_count int not null,
scraped_at timestamptz not null,
created_at timestamptz default now()
);

-- Chunks with vector embeddings for semantic search.
-- Deleting a scraped page cascades to its chunks.
create table page_chunks (
id uuid primary key default gen_random_uuid(),
scraped_page_id uuid not null references scraped_pages(id) on delete cascade,
-- 0-based position of the chunk within its page.
chunk_index int not null,
content text not null,
-- 1536 dims matches OpenAI text-embedding-3-small; keep in sync with the
-- application's embedding_dimensions setting and the casts in
-- search_page_chunks below.
embedding vector(1536) not null,
word_count int not null,
created_at timestamptz default now()
);

-- Indexes
create index idx_scraped_pages_reference_doc_id on scraped_pages(reference_doc_id);
-- Prevents indexing the same page twice for one reference document.
create unique index idx_scraped_pages_normalized_url_reference_doc
on scraped_pages(normalized_url, reference_doc_id);

-- IVFFlat index for approximate nearest-neighbor search (cosine distance)
-- lists = 100 is a reasonable default for small-to-medium datasets
-- NOTE(review): ivfflat derives its cluster centroids from the rows present
-- at CREATE INDEX time; this table is empty when the migration runs, so
-- recall may be poor until the index is rebuilt (REINDEX) after data load —
-- confirm against pgvector guidance.
create index idx_page_chunks_embedding_cosine on page_chunks
using ivfflat (embedding vector_cosine_ops)
with (lists = 100);

create index idx_page_chunks_scraped_page_id on page_chunks(scraped_page_id);

-- RPC for semantic search: returns chunks with source URL, ordered by cosine distance.
-- query_embedding_text is a string like '[0.1, 0.2, ...]' so Supabase/PostgREST can pass it.
-- Marked STABLE: reads tables but performs no writes.
create or replace function search_page_chunks(
query_embedding_text text,
ref_doc_id uuid,
match_limit int default 5
)
returns table (
id uuid,
scraped_page_id uuid,
chunk_index int,
content text,
word_count int,
page_url text,
distance float
)
language sql stable
as $$
select
pc.id,
pc.scraped_page_id,
pc.chunk_index,
pc.content,
pc.word_count,
sp.url as page_url,
-- <=> is pgvector's cosine-distance operator (smaller = more similar).
(pc.embedding <=> query_embedding_text::vector(1536)) as distance
from page_chunks pc
join scraped_pages sp on sp.id = pc.scraped_page_id
-- Qualified with the function name to disambiguate the parameter from columns.
where sp.reference_doc_id = search_page_chunks.ref_doc_id
order by pc.embedding <=> query_embedding_text::vector(1536)
limit match_limit;
$$;
1 change: 1 addition & 0 deletions src/api/webhook.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ async def process_message(page_id: str, sender_id: str, message_text: str):

context = AgentContext(
bot_config_id=bot_config.id,
reference_doc_id=bot_config.reference_doc_id,
reference_doc=ref_doc["content"],
tone=bot_config.tone,
recent_messages=recent_messages,
Expand Down
92 changes: 89 additions & 3 deletions src/cli/setup_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@
import questionary
import typer

from src.services.scraper import scrape_website
from src.services.scraper import chunk_text, scrape_website
from src.services.reference_doc import build_reference_document
from src.services.embedding_service import generate_embeddings
from src.db.repository import (
create_bot_configuration,
create_reference_document,
create_page_chunks,
create_scraped_page,
create_test_session,
get_reference_document_by_source_url,
get_scraped_pages_by_reference_doc,
save_test_message,
)
from src.models.agent_models import AgentContext
Expand Down Expand Up @@ -282,6 +286,7 @@ def _run_test_repl(
"""
context = AgentContext(
bot_config_id="cli-test",
reference_doc_id=reference_doc_id,
reference_doc=ref_doc_content,
tone=tone,
recent_messages=[],
Expand Down Expand Up @@ -379,12 +384,55 @@ def setup():
ref_doc_content = existing_doc["content"]
typer.echo(f"✓ Found existing reference document for {normalized_url}")
typer.echo(" Skipping scrape and document generation.")
# If no page index exists yet, scrape and index pages only (do not modify reference doc)
existing_pages = get_scraped_pages_by_reference_doc(reference_doc_id)
if not existing_pages:
typer.echo(" No page index found. Scraping pages for search index only...")
try:
scrape_result = _run_async_with_cleanup(scrape_website(normalized_url))
typer.echo(f" ✓ Scraped {len(scrape_result.pages)} pages")
typer.echo(" Indexing pages and generating embeddings...")
async def _index_pages_and_chunks():
for page in scrape_result.pages:
scraped_page_id = create_scraped_page(
reference_doc_id=reference_doc_id,
url=page.url,
normalized_url=page.normalized_url,
title=page.title,
raw_content=page.content,
word_count=page.word_count,
scraped_at=page.scraped_at,
)
page_chunk_tuples = chunk_text(page.content)
if not page_chunk_tuples:
continue
chunk_texts = [t[0] for t in page_chunk_tuples]
embeddings = await generate_embeddings(chunk_texts)
chunks_with_embeddings = [
(chunk_texts[i], embeddings[i], page_chunk_tuples[i][1])
for i in range(len(chunk_texts))
]
create_page_chunks(scraped_page_id, chunks_with_embeddings)
return len(scrape_result.pages)
Comment on lines +395 to +416
Copy link

Copilot AI Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The page indexing implementation (_index_pages_and_chunks) is duplicated in both the existing-doc resume path and the new-doc path. This duplication makes it easy for the two flows to drift (e.g., different chunking/embedding behavior, different error handling). Consider extracting a single helper (e.g., index_scrape_result(reference_doc_id, scrape_result)) and calling it from both branches.

Copilot uses AI. Check for mistakes.
page_count = _run_async_with_cleanup(_index_pages_and_chunks())
typer.echo(f" ✓ Indexed {page_count} pages with embeddings")
except Exception as e:
typer.echo(
typer.style(
f" ⚠ Page indexing failed (search_pages tool will be empty): {e}",
fg=typer.colors.YELLOW,
),
err=True,
)
else:
typer.echo(f" Page index already has {len(existing_pages)} pages.")
Comment on lines +387 to +428
Copy link

Copilot AI Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the “existing reference doc” resume path, the decision to skip indexing is based only on get_scraped_pages_by_reference_doc(reference_doc_id). If indexing fails after inserting some scraped_pages rows (but before inserting page_chunks), subsequent runs will see existing_pages as non-empty and will skip indexing, leaving search_pages permanently empty for that doc unless the DB is manually cleaned up. Consider checking for existing chunks (not just pages), or making the indexing step idempotent/re-runnable (e.g., upsert pages + replace chunks, or delete partially-created pages on failure).

Suggested change
# If no page index exists yet, scrape and index pages only (do not modify reference doc)
existing_pages = get_scraped_pages_by_reference_doc(reference_doc_id)
if not existing_pages:
typer.echo(" No page index found. Scraping pages for search index only...")
try:
scrape_result = _run_async_with_cleanup(scrape_website(normalized_url))
typer.echo(f" ✓ Scraped {len(scrape_result.pages)} pages")
typer.echo(" Indexing pages and generating embeddings...")
async def _index_pages_and_chunks():
for page in scrape_result.pages:
scraped_page_id = create_scraped_page(
reference_doc_id=reference_doc_id,
url=page.url,
normalized_url=page.normalized_url,
title=page.title,
raw_content=page.content,
word_count=page.word_count,
scraped_at=page.scraped_at,
)
page_chunk_tuples = chunk_text(page.content)
if not page_chunk_tuples:
continue
chunk_texts = [t[0] for t in page_chunk_tuples]
embeddings = await generate_embeddings(chunk_texts)
chunks_with_embeddings = [
(chunk_texts[i], embeddings[i], page_chunk_tuples[i][1])
for i in range(len(chunk_texts))
]
create_page_chunks(scraped_page_id, chunks_with_embeddings)
return len(scrape_result.pages)
page_count = _run_async_with_cleanup(_index_pages_and_chunks())
typer.echo(f" ✓ Indexed {page_count} pages with embeddings")
except Exception as e:
typer.echo(
typer.style(
f" ⚠ Page indexing failed (search_pages tool will be empty): {e}",
fg=typer.colors.YELLOW,
),
err=True,
)
else:
typer.echo(f" Page index already has {len(existing_pages)} pages.")
# Always (re-)scrape and index pages for the search index to ensure idempotency.
typer.echo(" Scraping pages and building search index (this may take a moment)...")
try:
scrape_result = _run_async_with_cleanup(scrape_website(normalized_url))
typer.echo(f" ✓ Scraped {len(scrape_result.pages)} pages")
typer.echo(" Indexing pages and generating embeddings...")
async def _index_pages_and_chunks():
for page in scrape_result.pages:
scraped_page_id = create_scraped_page(
reference_doc_id=reference_doc_id,
url=page.url,
normalized_url=page.normalized_url,
title=page.title,
raw_content=page.content,
word_count=page.word_count,
scraped_at=page.scraped_at,
)
page_chunk_tuples = chunk_text(page.content)
if not page_chunk_tuples:
continue
chunk_texts = [t[0] for t in page_chunk_tuples]
embeddings = await generate_embeddings(chunk_texts)
chunks_with_embeddings = [
(chunk_texts[i], embeddings[i], page_chunk_tuples[i][1])
for i in range(len(chunk_texts))
]
create_page_chunks(scraped_page_id, chunks_with_embeddings)
return len(scrape_result.pages)
page_count = _run_async_with_cleanup(_index_pages_and_chunks())
typer.echo(f" ✓ Indexed {page_count} pages with embeddings")
except Exception as e:
typer.echo(
typer.style(
f" ⚠ Page indexing failed (search_pages tool may be incomplete or empty): {e}",
fg=typer.colors.YELLOW,
),
err=True,
)

Copilot uses AI. Check for mistakes.
else:
# Step 2a: Scrape
typer.echo(f"Scraping {normalized_url}...")
try:
text_chunks = _run_async_with_cleanup(scrape_website(normalized_url))
typer.echo(f"✓ Scraped {len(text_chunks)} text chunks")
scrape_result = _run_async_with_cleanup(scrape_website(normalized_url))
text_chunks = scrape_result.chunks
typer.echo(f"✓ Scraped {len(text_chunks)} text chunks from {len(scrape_result.pages)} pages")
except Exception as e:
typer.echo(f"✗ Error scraping website: {e}", err=True)
raise typer.Exit(1)
Expand Down Expand Up @@ -415,6 +463,44 @@ def setup():
raise typer.Exit(1)
ref_doc_content = markdown_content

# Step 2d: Index scraped pages and chunks with embeddings for semantic search
typer.echo("Indexing pages and generating embeddings...")
try:

async def _index_pages_and_chunks():
for page in scrape_result.pages:
scraped_page_id = create_scraped_page(
reference_doc_id=reference_doc_id,
url=page.url,
normalized_url=page.normalized_url,
title=page.title,
raw_content=page.content,
word_count=page.word_count,
scraped_at=page.scraped_at,
)
page_chunk_tuples = chunk_text(page.content)
if not page_chunk_tuples:
continue
chunk_texts = [t[0] for t in page_chunk_tuples]
embeddings = await generate_embeddings(chunk_texts)
chunks_with_embeddings = [
(chunk_texts[i], embeddings[i], page_chunk_tuples[i][1])
for i in range(len(chunk_texts))
]
create_page_chunks(scraped_page_id, chunks_with_embeddings)
return len(scrape_result.pages)

page_count = _run_async_with_cleanup(_index_pages_and_chunks())
typer.echo(f"✓ Indexed {page_count} pages with embeddings")
except Exception as e:
typer.echo(
typer.style(
f"⚠ Indexing failed (search_pages tool will be empty): {e}",
fg=typer.colors.YELLOW,
),
err=True,
)

# Step 3: Action menu (arrow-key); loop so user can Test then Continue or Exit
while True:
action = _action_menu()
Expand Down
14 changes: 14 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,20 @@ class Settings(BaseSettings):
description="Fallback Anthropic model if primary fails",
)

# Embedding (via PydanticAI Gateway)
embedding_model: str = Field(
default="gateway/openai:text-embedding-3-small",
description="Embedding model via PAIG (e.g. gateway/openai:text-embedding-3-small)",
)
embedding_dimensions: int = Field(
default=1536,
description="Embedding vector dimension (matches text-embedding-3-small)",
)
Comment on lines +58 to +61
Copy link

Copilot AI Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

embedding_dimensions is introduced in settings, but the rest of the implementation hard-codes 1536 in the DB schema and tests, and the runtime code doesn’t validate that the embedder actually returns vectors of this size. This creates a sharp edge if embedding_model is changed (or if the gateway returns a different dimension): inserts/search RPC casts can start failing at runtime. Either (a) validate embedding lengths against settings.embedding_dimensions in generate_embeddings/embed_query and keep schema in sync, or (b) remove embedding_dimensions to avoid implying it’s configurable.

Suggested change
embedding_dimensions: int = Field(
default=1536,
description="Embedding vector dimension (matches text-embedding-3-small)",
)

Copilot uses AI. Check for mistakes.
search_result_limit: int = Field(
default=5,
description="Max number of chunks to return from page search",
)

# OpenAI Configuration (kept for direct fallback if needed)
openai_api_key: str = Field(
default="", description="OpenAI API key (legacy fallback)"
Expand Down
105 changes: 104 additions & 1 deletion src/db/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import time
from datetime import datetime
from typing import Optional
from typing import Any, List, Optional
import uuid

import logfire
Expand Down Expand Up @@ -232,6 +232,109 @@ def get_reference_document_by_source_url(source_url: str) -> Optional[dict]:
return result.data[0]


def _embedding_to_text(embedding: List[float]) -> str:
"""Format embedding list as pgvector text literal '[a,b,c,...]'."""
return "[" + ",".join(str(x) for x in embedding) + "]"


def create_scraped_page(
    reference_doc_id: str,
    url: str,
    normalized_url: str,
    title: str,
    raw_content: str,
    word_count: int,
    scraped_at: datetime,
) -> str:
    """
    Insert a single scraped page row.

    Args:
        reference_doc_id: Owning reference document id.
        url: Original page URL.
        normalized_url: Canonicalized URL used for de-duplication.
        title: Page title; stored as "" when falsy.
        raw_content: Full scraped page text.
        word_count: Word count of raw_content.
        scraped_at: When the page was scraped; serialized via isoformat()
            when it is a datetime, otherwise passed through unchanged.

    Returns:
        scraped_page id (uuid string)

    Raises:
        ValueError: If the insert returned no row.
    """
    client = get_supabase_client()
    # Supabase expects an ISO-8601 string; tolerate callers that already
    # pass a string instead of a datetime.
    if hasattr(scraped_at, "isoformat"):
        scraped_at_value = scraped_at.isoformat()
    else:
        scraped_at_value = scraped_at
    row = {
        "reference_doc_id": reference_doc_id,
        "url": url,
        "normalized_url": normalized_url,
        "title": title or "",
        "raw_content": raw_content,
        "word_count": word_count,
        "scraped_at": scraped_at_value,
    }
    response = client.table("scraped_pages").insert(row).execute()
    if not response.data:
        raise ValueError("Failed to create scraped_page")
    return response.data[0]["id"]


def create_page_chunks(
    scraped_page_id: str,
    chunks_with_embeddings: List[tuple[str, List[float], int]],
) -> None:
    """
    Batch insert page chunks with embeddings.

    Args:
        scraped_page_id: Parent scraped_pages row id.
        chunks_with_embeddings: List of (content, embedding, word_count)
            tuples, one per chunk; chunk_index is assigned from list order.

    No-op when chunks_with_embeddings is empty.
    """
    if not chunks_with_embeddings:
        return
    supabase = get_supabase_client()
    # chunk_index preserves the chunks' original order within the page.
    rows: List[dict[str, Any]] = [
        {
            "scraped_page_id": scraped_page_id,
            "chunk_index": idx,
            "content": content,
            "embedding": embedding,  # Supabase accepts list for vector column
            "word_count": word_count,
        }
        for idx, (content, embedding, word_count) in enumerate(chunks_with_embeddings)
    ]
    supabase.table("page_chunks").insert(rows).execute()
    logfire.info(
        "Page chunks created",
        scraped_page_id=scraped_page_id,
        chunk_count=len(rows),
    )


def search_page_chunks(
    query_embedding: List[float],
    reference_doc_id: str,
    limit: int = 5,
) -> List[dict[str, Any]]:
    """
    Semantic search over page chunks for a given reference document.

    Delegates to the search_page_chunks Postgres RPC, passing the embedding
    as a pgvector text literal.

    Returns list of dicts with id, scraped_page_id, chunk_index, content,
    word_count, page_url, distance.
    """
    client = get_supabase_client()
    params = {
        "query_embedding_text": _embedding_to_text(query_embedding),
        "ref_doc_id": reference_doc_id,
        "match_limit": limit,
    }
    response = client.rpc("search_page_chunks", params).execute()
    return list(response.data) if response.data else []


def get_scraped_pages_by_reference_doc(reference_doc_id: str) -> List[dict[str, Any]]:
    """Return all scraped pages for a reference document, ordered by created_at."""
    client = get_supabase_client()
    query = (
        client.table("scraped_pages")
        .select("*")
        .eq("reference_doc_id", reference_doc_id)
        .order("created_at")
    )
    response = query.execute()
    if not response.data:
        return []
    return list(response.data)


def get_user_profile(sender_id: str, page_id: str) -> dict | None:
"""
Get user profile by sender_id (unique per user).
Expand Down
1 change: 1 addition & 0 deletions src/models/agent_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class AgentContext(BaseModel):
"""Context for agent responses."""

bot_config_id: str
reference_doc_id: str
reference_doc: str
tone: str
recent_messages: list[str] = Field(default_factory=list)
Expand Down
26 changes: 26 additions & 0 deletions src/models/scraper_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Models for scraper results: per-page data and scrape result."""

from dataclasses import dataclass
from datetime import datetime
from typing import List


@dataclass
class ScrapedPage:
"""Metadata and content for a single scraped page."""

url: str
normalized_url: str
title: str
content: str
word_count: int
scraped_at: datetime


@dataclass
class ScrapeResult:
"""Result of a multi-page scrape: pages and combined chunks."""

pages: List[ScrapedPage]
chunks: List[str]
content_hash: str
Loading
Loading