Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions migrations/006_scraped_pages_index.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
-- Scraped pages index: per-page metadata and vector embeddings for semantic search
-- Requires pgvector extension (Supabase has it built-in)

create extension if not exists vector;

-- Per-page metadata and content from scraped websites.
-- One row per page; uniqueness per (normalized_url, reference_doc_id) is
-- enforced by idx_scraped_pages_normalized_url_reference_doc below.
create table scraped_pages (
id uuid primary key default gen_random_uuid(),
-- Owning reference document; deleting it cascades to its pages (and chunks).
reference_doc_id uuid not null references reference_documents(id) on delete cascade,
url text not null,
-- Canonicalized form of url, used for de-duplication across scrapes.
normalized_url text not null,
title text,
raw_content text not null,
word_count int not null,
scraped_at timestamptz not null,
created_at timestamptz default now()
);

-- Chunks with vector embeddings for semantic search.
-- Deleting a scraped page cascades to its chunks.
create table page_chunks (
id uuid primary key default gen_random_uuid(),
scraped_page_id uuid not null references scraped_pages(id) on delete cascade,
-- 0-based position of the chunk within its page.
chunk_index int not null,
content text not null,
-- 1536 dims matches OpenAI text-embedding-3-small; keep in sync with the
-- application's embedding_dimensions setting and the casts in
-- search_page_chunks below.
embedding vector(1536) not null,
word_count int not null,
created_at timestamptz default now()
);

-- Indexes
create index idx_scraped_pages_reference_doc_id on scraped_pages(reference_doc_id);
-- Prevents indexing the same page twice for one reference document.
create unique index idx_scraped_pages_normalized_url_reference_doc
on scraped_pages(normalized_url, reference_doc_id);

-- IVFFlat index for approximate nearest-neighbor search (cosine distance)
-- lists = 100 is a reasonable default for small-to-medium datasets
-- NOTE(review): ivfflat derives its cluster centroids from the rows present
-- at CREATE INDEX time; this table is empty when the migration runs, so
-- recall may be poor until the index is rebuilt (REINDEX) after data load —
-- confirm against pgvector guidance.
create index idx_page_chunks_embedding_cosine on page_chunks
using ivfflat (embedding vector_cosine_ops)
with (lists = 100);

create index idx_page_chunks_scraped_page_id on page_chunks(scraped_page_id);

-- RPC for semantic search: returns chunks with source URL, ordered by cosine distance.
-- query_embedding_text is a string like '[0.1, 0.2, ...]' so Supabase/PostgREST can pass it.
-- Marked STABLE: reads tables but performs no writes.
create or replace function search_page_chunks(
query_embedding_text text,
ref_doc_id uuid,
match_limit int default 5
)
returns table (
id uuid,
scraped_page_id uuid,
chunk_index int,
content text,
word_count int,
page_url text,
distance float
)
language sql stable
as $$
select
pc.id,
pc.scraped_page_id,
pc.chunk_index,
pc.content,
pc.word_count,
sp.url as page_url,
-- <=> is pgvector's cosine-distance operator (smaller = more similar).
(pc.embedding <=> query_embedding_text::vector(1536)) as distance
from page_chunks pc
join scraped_pages sp on sp.id = pc.scraped_page_id
-- Qualified with the function name to disambiguate the parameter from columns.
where sp.reference_doc_id = search_page_chunks.ref_doc_id
order by pc.embedding <=> query_embedding_text::vector(1536)
limit match_limit;
$$;
1 change: 1 addition & 0 deletions src/api/webhook.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ async def process_message(page_id: str, sender_id: str, message_text: str):

context = AgentContext(
bot_config_id=bot_config.id,
reference_doc_id=bot_config.reference_doc_id,
reference_doc=ref_doc["content"],
tone=bot_config.tone,
recent_messages=recent_messages,
Expand Down
92 changes: 89 additions & 3 deletions src/cli/setup_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@
import questionary
import typer

from src.services.scraper import scrape_website
from src.services.scraper import chunk_text, scrape_website
from src.services.reference_doc import build_reference_document
from src.services.embedding_service import generate_embeddings
from src.db.repository import (
create_bot_configuration,
create_reference_document,
create_page_chunks,
create_scraped_page,
create_test_session,
get_reference_document_by_source_url,
get_scraped_pages_by_reference_doc,
save_test_message,
)
from src.models.agent_models import AgentContext
Expand Down Expand Up @@ -282,6 +286,7 @@ def _run_test_repl(
"""
context = AgentContext(
bot_config_id="cli-test",
reference_doc_id=reference_doc_id,
reference_doc=ref_doc_content,
tone=tone,
recent_messages=[],
Expand Down Expand Up @@ -379,12 +384,55 @@ def setup():
ref_doc_content = existing_doc["content"]
typer.echo(f"✓ Found existing reference document for {normalized_url}")
typer.echo(" Skipping scrape and document generation.")
# If no page index exists yet, scrape and index pages only (do not modify reference doc)
existing_pages = get_scraped_pages_by_reference_doc(reference_doc_id)
if not existing_pages:
typer.echo(" No page index found. Scraping pages for search index only...")
try:
scrape_result = _run_async_with_cleanup(scrape_website(normalized_url))
typer.echo(f" ✓ Scraped {len(scrape_result.pages)} pages")
typer.echo(" Indexing pages and generating embeddings...")
async def _index_pages_and_chunks():
for page in scrape_result.pages:
scraped_page_id = create_scraped_page(
reference_doc_id=reference_doc_id,
url=page.url,
normalized_url=page.normalized_url,
title=page.title,
raw_content=page.content,
word_count=page.word_count,
scraped_at=page.scraped_at,
)
page_chunk_tuples = chunk_text(page.content)
if not page_chunk_tuples:
continue
chunk_texts = [t[0] for t in page_chunk_tuples]
embeddings = await generate_embeddings(chunk_texts)
chunks_with_embeddings = [
(chunk_texts[i], embeddings[i], page_chunk_tuples[i][1])
for i in range(len(chunk_texts))
]
create_page_chunks(scraped_page_id, chunks_with_embeddings)
return len(scrape_result.pages)
Comment on lines +395 to +416
Copy link

Copilot AI Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The page indexing implementation (_index_pages_and_chunks) is duplicated in both the existing-doc resume path and the new-doc path. This duplication makes it easy for the two flows to drift (e.g., different chunking/embedding behavior, different error handling). Consider extracting a single helper (e.g., index_scrape_result(reference_doc_id, scrape_result)) and calling it from both branches.

Copilot uses AI. Check for mistakes.
page_count = _run_async_with_cleanup(_index_pages_and_chunks())
typer.echo(f" ✓ Indexed {page_count} pages with embeddings")
except Exception as e:
typer.echo(
typer.style(
f" ⚠ Page indexing failed (search_pages tool will be empty): {e}",
fg=typer.colors.YELLOW,
),
err=True,
)
else:
typer.echo(f" Page index already has {len(existing_pages)} pages.")
Comment on lines +387 to +428
Copy link

Copilot AI Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the “existing reference doc” resume path, the decision to skip indexing is based only on get_scraped_pages_by_reference_doc(reference_doc_id). If indexing fails after inserting some scraped_pages rows (but before inserting page_chunks), subsequent runs will see existing_pages as non-empty and will skip indexing, leaving search_pages permanently empty for that doc unless the DB is manually cleaned up. Consider checking for existing chunks (not just pages), or making the indexing step idempotent/re-runnable (e.g., upsert pages + replace chunks, or delete partially-created pages on failure).

Suggested change
# If no page index exists yet, scrape and index pages only (do not modify reference doc)
existing_pages = get_scraped_pages_by_reference_doc(reference_doc_id)
if not existing_pages:
typer.echo(" No page index found. Scraping pages for search index only...")
try:
scrape_result = _run_async_with_cleanup(scrape_website(normalized_url))
typer.echo(f" ✓ Scraped {len(scrape_result.pages)} pages")
typer.echo(" Indexing pages and generating embeddings...")
async def _index_pages_and_chunks():
for page in scrape_result.pages:
scraped_page_id = create_scraped_page(
reference_doc_id=reference_doc_id,
url=page.url,
normalized_url=page.normalized_url,
title=page.title,
raw_content=page.content,
word_count=page.word_count,
scraped_at=page.scraped_at,
)
page_chunk_tuples = chunk_text(page.content)
if not page_chunk_tuples:
continue
chunk_texts = [t[0] for t in page_chunk_tuples]
embeddings = await generate_embeddings(chunk_texts)
chunks_with_embeddings = [
(chunk_texts[i], embeddings[i], page_chunk_tuples[i][1])
for i in range(len(chunk_texts))
]
create_page_chunks(scraped_page_id, chunks_with_embeddings)
return len(scrape_result.pages)
page_count = _run_async_with_cleanup(_index_pages_and_chunks())
typer.echo(f" ✓ Indexed {page_count} pages with embeddings")
except Exception as e:
typer.echo(
typer.style(
f" ⚠ Page indexing failed (search_pages tool will be empty): {e}",
fg=typer.colors.YELLOW,
),
err=True,
)
else:
typer.echo(f" Page index already has {len(existing_pages)} pages.")
# Always (re-)scrape and index pages for the search index to ensure idempotency.
typer.echo(" Scraping pages and building search index (this may take a moment)...")
try:
scrape_result = _run_async_with_cleanup(scrape_website(normalized_url))
typer.echo(f" ✓ Scraped {len(scrape_result.pages)} pages")
typer.echo(" Indexing pages and generating embeddings...")
async def _index_pages_and_chunks():
for page in scrape_result.pages:
scraped_page_id = create_scraped_page(
reference_doc_id=reference_doc_id,
url=page.url,
normalized_url=page.normalized_url,
title=page.title,
raw_content=page.content,
word_count=page.word_count,
scraped_at=page.scraped_at,
)
page_chunk_tuples = chunk_text(page.content)
if not page_chunk_tuples:
continue
chunk_texts = [t[0] for t in page_chunk_tuples]
embeddings = await generate_embeddings(chunk_texts)
chunks_with_embeddings = [
(chunk_texts[i], embeddings[i], page_chunk_tuples[i][1])
for i in range(len(chunk_texts))
]
create_page_chunks(scraped_page_id, chunks_with_embeddings)
return len(scrape_result.pages)
page_count = _run_async_with_cleanup(_index_pages_and_chunks())
typer.echo(f" ✓ Indexed {page_count} pages with embeddings")
except Exception as e:
typer.echo(
typer.style(
f" ⚠ Page indexing failed (search_pages tool may be incomplete or empty): {e}",
fg=typer.colors.YELLOW,
),
err=True,
)

Copilot uses AI. Check for mistakes.
else:
# Step 2a: Scrape
typer.echo(f"Scraping {normalized_url}...")
try:
text_chunks = _run_async_with_cleanup(scrape_website(normalized_url))
typer.echo(f"✓ Scraped {len(text_chunks)} text chunks")
scrape_result = _run_async_with_cleanup(scrape_website(normalized_url))
text_chunks = scrape_result.chunks
typer.echo(f"✓ Scraped {len(text_chunks)} text chunks from {len(scrape_result.pages)} pages")
except Exception as e:
typer.echo(f"✗ Error scraping website: {e}", err=True)
raise typer.Exit(1)
Expand Down Expand Up @@ -415,6 +463,44 @@ def setup():
raise typer.Exit(1)
ref_doc_content = markdown_content

# Step 2d: Index scraped pages and chunks with embeddings for semantic search
typer.echo("Indexing pages and generating embeddings...")
try:

async def _index_pages_and_chunks():
for page in scrape_result.pages:
scraped_page_id = create_scraped_page(
reference_doc_id=reference_doc_id,
url=page.url,
normalized_url=page.normalized_url,
title=page.title,
raw_content=page.content,
word_count=page.word_count,
scraped_at=page.scraped_at,
)
page_chunk_tuples = chunk_text(page.content)
if not page_chunk_tuples:
continue
chunk_texts = [t[0] for t in page_chunk_tuples]
embeddings = await generate_embeddings(chunk_texts)
chunks_with_embeddings = [
(chunk_texts[i], embeddings[i], page_chunk_tuples[i][1])
for i in range(len(chunk_texts))
]
create_page_chunks(scraped_page_id, chunks_with_embeddings)
return len(scrape_result.pages)

page_count = _run_async_with_cleanup(_index_pages_and_chunks())
typer.echo(f"✓ Indexed {page_count} pages with embeddings")
except Exception as e:
typer.echo(
typer.style(
f"⚠ Indexing failed (search_pages tool will be empty): {e}",
fg=typer.colors.YELLOW,
),
err=True,
)

# Step 3: Action menu (arrow-key); loop so user can Test then Continue or Exit
while True:
action = _action_menu()
Expand Down
14 changes: 14 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,20 @@ class Settings(BaseSettings):
description="Fallback Anthropic model if primary fails",
)

# Embedding (via PydanticAI Gateway)
embedding_model: str = Field(
default="gateway/openai:text-embedding-3-small",
description="Embedding model via PAIG (e.g. gateway/openai:text-embedding-3-small)",
)
embedding_dimensions: int = Field(
default=1536,
description="Embedding vector dimension (matches text-embedding-3-small)",
)
Comment on lines +58 to +61
Copy link

Copilot AI Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

embedding_dimensions is introduced in settings, but the rest of the implementation hard-codes 1536 in the DB schema and tests, and the runtime code doesn’t validate that the embedder actually returns vectors of this size. This creates a sharp edge if embedding_model is changed (or if the gateway returns a different dimension): inserts/search RPC casts can start failing at runtime. Either (a) validate embedding lengths against settings.embedding_dimensions in generate_embeddings/embed_query and keep schema in sync, or (b) remove embedding_dimensions to avoid implying it’s configurable.

Suggested change
embedding_dimensions: int = Field(
default=1536,
description="Embedding vector dimension (matches text-embedding-3-small)",
)

Copilot uses AI. Check for mistakes.
search_result_limit: int = Field(
default=5,
description="Max number of chunks to return from page search",
)

# OpenAI Configuration (kept for direct fallback if needed)
openai_api_key: str = Field(
default="", description="OpenAI API key (legacy fallback)"
Expand Down
105 changes: 104 additions & 1 deletion src/db/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import time
from datetime import datetime
from typing import Optional
from typing import Any, List, Optional
import uuid

import logfire
Expand Down Expand Up @@ -232,6 +232,109 @@ def get_reference_document_by_source_url(source_url: str) -> Optional[dict]:
return result.data[0]


def _embedding_to_text(embedding: List[float]) -> str:
"""Format embedding list as pgvector text literal '[a,b,c,...]'."""
return "[" + ",".join(str(x) for x in embedding) + "]"


def create_scraped_page(
    reference_doc_id: str,
    url: str,
    normalized_url: str,
    title: str,
    raw_content: str,
    word_count: int,
    scraped_at: datetime,
) -> str:
    """
    Insert a single scraped page row.

    Args:
        reference_doc_id: Owning reference document id.
        url: Original page URL.
        normalized_url: Canonicalized URL used for de-duplication.
        title: Page title; stored as "" when falsy.
        raw_content: Full scraped page text.
        word_count: Word count of raw_content.
        scraped_at: When the page was scraped; serialized via isoformat()
            when it is a datetime, otherwise passed through unchanged.

    Returns:
        scraped_page id (uuid string)

    Raises:
        ValueError: If the insert returned no row.
    """
    client = get_supabase_client()
    # Supabase expects an ISO-8601 string; tolerate callers that already
    # pass a string instead of a datetime.
    if hasattr(scraped_at, "isoformat"):
        scraped_at_value = scraped_at.isoformat()
    else:
        scraped_at_value = scraped_at
    row = {
        "reference_doc_id": reference_doc_id,
        "url": url,
        "normalized_url": normalized_url,
        "title": title or "",
        "raw_content": raw_content,
        "word_count": word_count,
        "scraped_at": scraped_at_value,
    }
    response = client.table("scraped_pages").insert(row).execute()
    if not response.data:
        raise ValueError("Failed to create scraped_page")
    return response.data[0]["id"]


def create_page_chunks(
    scraped_page_id: str,
    chunks_with_embeddings: List[tuple[str, List[float], int]],
) -> None:
    """
    Batch insert page chunks with embeddings.

    Args:
        scraped_page_id: Parent scraped_pages row id.
        chunks_with_embeddings: List of (content, embedding, word_count)
            tuples, one per chunk; chunk_index is assigned from list order.

    No-op when chunks_with_embeddings is empty.
    """
    if not chunks_with_embeddings:
        return
    supabase = get_supabase_client()
    # chunk_index preserves the chunks' original order within the page.
    rows: List[dict[str, Any]] = [
        {
            "scraped_page_id": scraped_page_id,
            "chunk_index": idx,
            "content": content,
            "embedding": embedding,  # Supabase accepts list for vector column
            "word_count": word_count,
        }
        for idx, (content, embedding, word_count) in enumerate(chunks_with_embeddings)
    ]
    supabase.table("page_chunks").insert(rows).execute()
    logfire.info(
        "Page chunks created",
        scraped_page_id=scraped_page_id,
        chunk_count=len(rows),
    )


def search_page_chunks(
    query_embedding: List[float],
    reference_doc_id: str,
    limit: int = 5,
) -> List[dict[str, Any]]:
    """
    Semantic search over page chunks for a given reference document.

    Delegates to the search_page_chunks Postgres RPC, passing the embedding
    as a pgvector text literal.

    Returns list of dicts with id, scraped_page_id, chunk_index, content,
    word_count, page_url, distance.
    """
    client = get_supabase_client()
    params = {
        "query_embedding_text": _embedding_to_text(query_embedding),
        "ref_doc_id": reference_doc_id,
        "match_limit": limit,
    }
    response = client.rpc("search_page_chunks", params).execute()
    return list(response.data) if response.data else []


def get_scraped_pages_by_reference_doc(reference_doc_id: str) -> List[dict[str, Any]]:
    """Return all scraped pages for a reference document, ordered by created_at."""
    client = get_supabase_client()
    query = (
        client.table("scraped_pages")
        .select("*")
        .eq("reference_doc_id", reference_doc_id)
        .order("created_at")
    )
    response = query.execute()
    if not response.data:
        return []
    return list(response.data)


def get_user_profile(sender_id: str, page_id: str) -> dict | None:
"""
Get user profile by sender_id (unique per user).
Expand Down
1 change: 1 addition & 0 deletions src/models/agent_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class AgentContext(BaseModel):
"""Context for agent responses."""

bot_config_id: str
reference_doc_id: str
reference_doc: str
tone: str
recent_messages: list[str] = Field(default_factory=list)
Expand Down
26 changes: 26 additions & 0 deletions src/models/scraper_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Models for scraper results: per-page data and scrape result."""

from dataclasses import dataclass
from datetime import datetime
from typing import List


@dataclass
class ScrapedPage:
"""Metadata and content for a single scraped page."""

url: str
normalized_url: str
title: str
content: str
word_count: int
scraped_at: datetime


@dataclass
class ScrapeResult:
"""Result of a multi-page scrape: pages and combined chunks."""

pages: List[ScrapedPage]
chunks: List[str]
content_hash: str
Loading
Loading