Store and search web content in PlanetScale using Plasmate's Semantic Object Model (SOM).
Plasmate converts HTML to structured JSON with 10-100x token compression. PlanetScale provides serverless MySQL with branching support. Together, they enable efficient web content storage with full-text search and Git-like branch workflows.
pip install planetscale-plasmate

Prerequisites:
- Plasmate CLI installed and in PATH
- PlanetScale database with credentials
- Optional: pscale CLI for branch operations
from planetscale_plasmate import PlasmatePlanetScale, migrate
# Run migrations (creates tables)
migrate(
host="aws.connect.psdb.cloud",
database="my_database",
user="username",
password="password"
)
# Initialize client
client = PlasmatePlanetScale(
host="aws.connect.psdb.cloud",
database="my_database",
user="username",
password="password"
)
# Fetch and store a URL
content_id = client.fetch_and_store("https://example.com")
# Search stored content
results = client.search("example domain")
for row in results:
    print(f"{row['url']}: {row['extracted_text'][:100]}...")

# Single URL with metadata
content_id = client.fetch_and_store(
"https://example.com",
metadata={"source": "crawler", "tags": ["docs"]}
)
# Get stored content
content = client.get_content(content_id)
print(content["som"]) # Structured JSON
print(content["extracted_text"])  # Readable text

# Fetch multiple URLs with connection pooling
urls = [
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page3",
]
results = client.batch_fetch_and_store(urls, max_workers=5)
for url, result in results.items():
if isinstance(result, Exception):
print(f"Failed: {url} - {result}")
else:
        print(f"Stored: {url} -> {result}")

# Natural language search
results = client.search("machine learning tutorial", limit=10)
for row in results:
print(f"[{row['relevance']:.2f}] {row['url']}")
print(f" {row['extracted_text'][:200]}...")
# Filter by crawl session
results = client.search("python", session_id="session-uuid")

# Create a session for batch crawls
session_id = "my-crawl-session"
client.fetch_and_store("https://example.com/1", session_id=session_id)
client.fetch_and_store("https://example.com/2", session_id=session_id)
# Get session with all content
session = client.get_session(session_id)
print(f"URLs crawled: {len(session['content'])}")

| Column | Type | Description |
|---|---|---|
| id | VARCHAR(36) | UUID primary key |
| url | VARCHAR(2048) | Source URL |
| som | JSON | Plasmate SOM output |
| extracted_text | LONGTEXT | Readable text (full-text indexed) |
| metadata | JSON | Custom metadata |
| session_id | VARCHAR(36) | Optional crawl session |
| fetched_at | DATETIME | Fetch timestamp |
| Column | Type | Description |
|---|---|---|
| id | VARCHAR(36) | UUID primary key |
| name | VARCHAR(255) | Session name |
| started_at | DATETIME | Start timestamp |
| completed_at | DATETIME | Completion timestamp |
| status | ENUM | active, completed, failed, cancelled |
| url_count | INT | Total URLs in session |
| success_count | INT | Successfully fetched URLs |
PlanetScale supports Git-like branching for databases. Use branches to stage content changes before deploying to production.
from planetscale_plasmate import create_content_branch, deploy_request
# Create a staging branch
branch, credentials = create_content_branch(
organization="my-org",
database="my-database",
branch_name="content-update-april"
)
# Use branch credentials for content operations
staging_client = PlasmatePlanetScale(
host=credentials["host"],
database=credentials["database"],
user=credentials["username"],
password=credentials["password"]
)
# Add content to staging
staging_client.fetch_and_store("https://new-content.com")

from planetscale_plasmate import with_branch
with with_branch(
organization="my-org",
database="my-database",
branch_name="staging",
create_if_missing=True,
delete_on_exit=False
) as credentials:
client = PlasmatePlanetScale(**credentials)
    client.fetch_and_store("https://example.com")

# Create deploy request to merge to main
dr = deploy_request(
organization="my-org",
database="my-database",
branch="content-update-april",
notes="Add April content updates"
)
print(f"Deploy request #{dr.id} created")

from planetscale_plasmate import BranchManager
manager = BranchManager(
organization="my-org",
database="my-database"
)
# List branches
branches = manager.list_branches()
# Create branch
branch = manager.create_branch("feature-branch", parent_branch="main")
# Get credentials
creds = manager.get_branch_credentials("feature-branch")
# Create and deploy
dr = manager.create_deploy_request("feature-branch")
manager.deploy(dr.id)
# Cleanup
manager.delete_branch("feature-branch")

export PLANETSCALE_HOST="aws.connect.psdb.cloud"
export PLANETSCALE_DATABASE="my_database"
export PLANETSCALE_USER="username"
export PLANETSCALE_PASSWORD="password"
# For branch operations (pscale CLI)
export PLANETSCALE_ORG="my-org"
export PLANETSCALE_SERVICE_TOKEN="token"
export PLANETSCALE_SERVICE_TOKEN_ID="token-id"

from planetscale_plasmate import migrate, check_schema
# Check current schema status
status = check_schema(host, database, user, password)
print(f"Ready: {status['ready']}")
print(f"Tables: {list(status['tables'].keys())}")
# Run migrations
result = migrate(host, database, user, password)
print(f"Created: {result['tables_created']}")
# Reset schema (development only)
result = migrate(host, database, user, password, drop_existing=True)

The raw SQL schema is available at sql/schema.sql:
-- Key features:
-- - JSON columns for SOM and metadata
-- - FULLTEXT index on extracted_text
-- - Foreign key from web_content to crawl_sessions
-- - Indexes on url, session_id, and fetched_at

MIT