Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ yarn dev:frontend
**Terminal 2 - Backend:**
```bash
cd src
yarn dev:backend
hatch -e dev run dev-backend
```

- Frontend: http://localhost:3000
Expand Down Expand Up @@ -171,5 +171,4 @@ This project is licensed under the Databricks License - see the [LICENSE.txt](LI

---

**Version**: 0.4.0
**Maintained by**: [Databricks](https://databricks.com)
9 changes: 9 additions & 0 deletions src/backend/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,15 @@ LLM_ENABLED=False
# Security: First-phase injection detection prompt (advanced - usually not changed)
# LLM_INJECTION_CHECK_PROMPT="You are a security analyzer. Analyze the following content..."

# --- Graph Explorer Safety Limits ---
# Maximum nodes/edges returned from initial graph load (prevents browser OOM)
# GRAPH_MAX_NODES=5000
# GRAPH_MAX_EDGES=10000
# Default max neighbors per expansion
# GRAPH_NEIGHBOR_LIMIT=50
# SQL statement timeout for graph queries (string with a unit suffix, e.g. 30s)
# GRAPH_QUERY_TIMEOUT=30s

# --- Self-Service Sandbox Policy Settings ---
# These settings control which catalogs and schemas users can create objects in
# via the self-service dialog. This is a global security boundary separate from
Expand Down
143 changes: 71 additions & 72 deletions src/backend/src/app.py
Original file line number Diff line number Diff line change
@@ -1,83 +1,64 @@
# Initialize configuration and logging first
from src.common.config import get_settings, init_config
from src.common.logging import setup_logging, get_logger
from src.common.logging import get_logger, setup_logging

init_config()
settings = get_settings()
setup_logging(level=settings.LOG_LEVEL, log_file=settings.LOG_FILE)
logger = get_logger(__name__)

import mimetypes
import os
import time
from pathlib import Path

# Server startup timestamp for cache invalidation
SERVER_STARTUP_TIME = int(time.time())

from fastapi import Depends, FastAPI, Request
from fastapi import Depends, FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse
from fastapi.staticfiles import StaticFiles
from starlette.responses import Response
from fastapi import HTTPException, status

from src.common.middleware import ErrorHandlingMiddleware, LoggingMiddleware
from src.routes import (
access_grants_routes,
audit_routes,
catalog_commander_routes,
data_catalog_routes,
compliance_routes,
change_log_routes,
comments_routes,
compliance_routes,
costs_routes,
data_asset_reviews_routes,
data_catalog_routes,
data_contracts_routes,
data_domains_routes,
data_product_routes,
datasets_routes,
entitlements_routes,
entitlements_sync_routes,
estate_manager_routes,
industry_ontology_routes,
jobs_routes,
llm_search_routes,
mcp_routes,
mcp_tokens_routes,
mdm_routes,
metadata_routes,
notifications_routes,
projects_routes,
search_routes,
security_features_routes,
self_service_routes,
settings_routes,
semantic_models_routes,
semantic_links_routes,
user_routes,
audit_routes,
change_log_routes,
workspace_routes,
semantic_models_routes,
settings_routes,
tags_routes,
teams_routes,
projects_routes,
costs_routes,
user_routes,
graph_explorer_routes,
workflows_routes,
workspace_routes,
)

from src.common.database import init_db, get_session_factory, SQLAlchemySession
from src.controller.data_products_manager import DataProductsManager
from src.controller.data_asset_reviews_manager import DataAssetReviewManager
from src.controller.data_contracts_manager import DataContractsManager
from src.controller.semantic_models_manager import SemanticModelsManager
from src.controller.search_manager import SearchManager
from src.common.workspace_client import get_workspace_client
from src.controller.settings_manager import SettingsManager
from src.controller.users_manager import UsersManager
from src.controller.authorization_manager import AuthorizationManager
from src.utils.startup_tasks import (
initialize_database,
initialize_managers,
startup_event_handler,
shutdown_event_handler
)

from src.utils.startup_tasks import initialize_database, initialize_managers

logger.info(f"Starting application in {settings.ENV} mode.")
logger.info(f"Debug mode: {settings.DEBUG}")
Expand All @@ -94,37 +75,42 @@

# --- Application Lifecycle Events ---


# Application Startup Event
async def startup_event():
import os

# Skip startup tasks if running tests
if os.getenv('SKIP_STARTUP_TASKS') == 'true':
if os.getenv("SKIP_STARTUP_TASKS") == "true":
logger.info("SKIP_STARTUP_TASKS=true detected - skipping startup tasks (test mode)")
return

logger.info("Running application startup event...")
settings = get_settings()

initialize_database(settings=settings)
initialize_managers(app) # Handles DB-backed manager init

# Initialize Git service for indirect delivery mode
try:
logger.info("Initializing Git service...")
from src.common.git import init_git_service

git_service = init_git_service(settings)
app.state.git_service = git_service
logger.info(f"Git service initialized (status: {git_service.get_status().clone_status.value})")
logger.info(
f"Git service initialized (status: {git_service.get_status().clone_status.value})"
)
except Exception as e:
logger.warning(f"Failed initializing Git service: {e}", exc_info=True)
app.state.git_service = None

# Initialize Grant Manager for direct delivery mode
try:
logger.info("Initializing Grant Manager...")
from src.controller.grant_manager import init_grant_manager
from src.common.workspace_client import get_workspace_client
from src.controller.grant_manager import init_grant_manager

ws_client = get_workspace_client(settings=settings)
grant_manager = init_grant_manager(ws_client=ws_client, settings=settings)
app.state.grant_manager = grant_manager
Expand All @@ -137,30 +123,36 @@ async def startup_event():
try:
logger.info("Initializing Delivery Service...")
from src.controller.delivery_service import init_delivery_service

delivery_service = init_delivery_service(
settings=settings,
git_service=getattr(app.state, 'git_service', None),
grant_manager=getattr(app.state, 'grant_manager', None),
notifications_manager=getattr(app.state, 'notifications_manager', None),
git_service=getattr(app.state, "git_service", None),
grant_manager=getattr(app.state, "grant_manager", None),
notifications_manager=getattr(app.state, "notifications_manager", None),
)
app.state.delivery_service = delivery_service
logger.info(f"Delivery Service initialized (active modes: {[m.value for m in delivery_service.get_active_modes()]})")
logger.info(
f"Delivery Service initialized (active modes: {[m.value for m in delivery_service.get_active_modes()]})"
)
except Exception as e:
logger.warning(f"Failed initializing Delivery Service: {e}", exc_info=True)
app.state.delivery_service = None

# Demo data is loaded on-demand via POST /api/settings/demo-data/load
# See: src/backend/src/data/demo_data.sql

# Ensure SearchManager is initialized and index built
try:
from src.common.search_interfaces import SearchableAsset
from src.controller.search_manager import SearchManager

logger.info("Initializing SearchManager after data load (app.py)...")
searchable_managers_instances = []
for attr_name, manager_instance in list(getattr(app.state, '_state', {}).items()):
for attr_name, manager_instance in list(getattr(app.state, "_state", {}).items()):
try:
if isinstance(manager_instance, SearchableAsset) and hasattr(manager_instance, 'get_search_index_items'):
if isinstance(manager_instance, SearchableAsset) and hasattr(
manager_instance, "get_search_index_items"
):
searchable_managers_instances.append(manager_instance)
except Exception:
continue
Expand All @@ -172,11 +164,13 @@ async def startup_event():

logger.info("Application startup complete.")


# Application Shutdown Event
async def shutdown_event():
    """Log the beginning and the end of application shutdown.

    The handler performs no cleanup of its own; it only emits the two
    informational log records below, in order.
    """
    shutdown_messages = (
        "Running application shutdown event...",
        "Application shutdown complete.",
    )
    for message in shutdown_messages:
        logger.info(message)


# --- FastAPI App Instantiation (AFTER defining lifecycle functions) ---

# Define paths
Expand All @@ -202,29 +196,23 @@ async def shutdown_event():
{"name": "Datasets", "description": "Manage datasets and dataset instances"},
{"name": "Data Contracts", "description": "Manage data contracts for data products"},
{"name": "Data Products", "description": "Manage data products and subscriptions"},

# Governance - Standards and approval workflows
{"name": "Compliance", "description": "Manage compliance policies and runs"},
{"name": "Approvals", "description": "Manage approval workflows"},
{"name": "Process Workflows", "description": "Manage process workflows"},
{"name": "Data Asset Reviews", "description": "Manage data asset review workflows"},

# Business Glossary - Semantic models and ontologies
{"name": "Semantic Models", "description": "Manage semantic models and ontologies"},
{"name": "Semantic Links", "description": "Manage semantic links between entities"},
{"name": "Industry Ontologies", "description": "Industry Ontology Library for importing standard ontologies"},

# Operations - Monitoring and technical management
{"name": "Estates", "description": "Manage data estates"},
{"name": "Master Data Management", "description": "Master data management features"},
{"name": "Catalog Commander", "description": "Dual-pane catalog explorer"},

# Security - Access control and security features
{"name": "Security Features", "description": "Advanced security features"},
{"name": "Entitlements", "description": "Manage entitlements and personas"},
{"name": "Entitlements Sync", "description": "Sync entitlements from external sources"},
{"name": "Access Grants", "description": "Manage time-limited access grants"},

# System - Utilities, configuration, auxiliary services
{"name": "Metadata", "description": "Manage metadata attachments"},
{"name": "Workspace", "description": "Workspace asset operations"},
Expand All @@ -250,7 +238,7 @@ async def shutdown_event():
dependencies=[Depends(get_settings)],
on_startup=[startup_event],
on_shutdown=[shutdown_event],
openapi_tags=openapi_tags
openapi_tags=openapi_tags,
)

# Configure CORS
Expand Down Expand Up @@ -278,7 +266,7 @@ async def shutdown_event():
app.add_middleware(LoggingMiddleware)

# Mount static files for the React application (skip in test mode)
if not os.environ.get('TESTING'):
if not os.environ.get("TESTING"):
app.mount("/static", StaticFiles(directory=STATIC_ASSETS_PATH, html=True), name="static")

# Data Products - Core data lifecycle
Expand All @@ -291,12 +279,12 @@ async def shutdown_event():
data_contracts_routes.register_routes(app)
data_product_routes.register_routes(app)
from src.routes import approvals_routes

approvals_routes.register_routes(app)

# Governance - Standards and approval workflows
semantic_models_routes.register_routes(app)
semantic_links_routes.register_routes(app)
industry_ontology_routes.register_routes(app) # Industry Ontology Library
data_asset_reviews_routes.register_routes(app)
data_catalog_routes.register_routes(app)

Expand Down Expand Up @@ -326,50 +314,61 @@ async def shutdown_event():
mcp_routes.register_routes(app)
mcp_tokens_routes.register_routes(app)
self_service_routes.register_routes(app)
graph_explorer_routes.register_routes(app)
workflows_routes.register_routes(app)
settings_routes.register_routes(app)


# Define other specific API routes BEFORE the catch-all
@app.get("/api/time")
async def get_current_time():
    """Return the server's current time (mostly useful for testing).

    Returns:
        A dict with a single ``time`` key holding ``time.time()``.
    """
    now = time.time()
    return {"time": now}


@app.get("/api/cache-version")
async def get_cache_version():
    """Return the server cache version used for client-side cache invalidation.

    Returns:
        A dict with ``version`` (the module-level ``SERVER_STARTUP_TIME``)
        and ``timestamp`` (the current time truncated to whole seconds).
    """
    payload = {"version": SERVER_STARTUP_TIME}
    payload["timestamp"] = int(time.time())
    return payload


@app.get("/api/version")
async def get_app_version():
    """Return the application version together with server timing details.

    Returns:
        A dict with ``version`` (``__version__``), ``startTime``
        (``SERVER_STARTUP_TIME``) and ``timestamp`` (the current time
        truncated to whole seconds).
    """
    current_timestamp = int(time.time())
    return {
        "version": __version__,
        "startTime": SERVER_STARTUP_TIME,
        "timestamp": current_timestamp,
    }


# Define the SPA catch-all route LAST (skip in test mode)
if not os.environ.get("TESTING"):

    @app.get("/{full_path:path}")
    def serve_spa(full_path: str):
        """Serve the SPA entry point for any path not claimed by an earlier route.

        This catch-all must be registered last so that API routers, the
        static mount and the API docs get the first chance to match.

        Args:
            full_path: The request path without its leading slash.

        Returns:
            ``index.html`` as a ``FileResponse`` when it exists, otherwise a
            small HTML page with status 404 explaining that the frontend
            build is missing.

        Raises:
            HTTPException: 404 when the path is under ``api/`` or ``static/``
                or is one of the API documentation paths but reached this
                handler anyway.
        """
        # Function-scope import keeps this block self-contained.
        from fastapi import HTTPException

        is_reserved = full_path.startswith(("api/", "static/")) or full_path in (
            "docs",
            "redoc",
            "openapi.json",
        )
        if is_reserved:
            # This route has already matched, so falling through and returning
            # None would be sent as "200 OK" with a `null` JSON body rather
            # than a 404. Raise explicitly so unknown API/static paths are
            # reported as missing.
            raise HTTPException(status_code=404, detail="Not Found")

        spa_index = STATIC_ASSETS_PATH / "index.html"
        if spa_index.is_file():
            return FileResponse(spa_index, media_type="text/html")

        # The frontend has not been built (or index.html was removed).
        logger.error(f"SPA index.html not found at {spa_index}")
        return HTMLResponse(
            content="<html><body>Frontend not built or index.html missing.</body></html>",
            status_code=404,
        )


logger.info("All routes registered.")

if __name__ == '__main__':
if __name__ == "__main__":
import uvicorn

uvicorn.run(app, host="0.0.0.0", port=8000)
6 changes: 6 additions & 0 deletions src/backend/src/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,12 @@ class Settings(BaseSettings):
env='LLM_INJECTION_CHECK_PROMPT'
)

# Graph Explorer safety limits
GRAPH_MAX_NODES: int = Field(5000, env='GRAPH_MAX_NODES') # Max nodes returned from initial graph load
GRAPH_MAX_EDGES: int = Field(10000, env='GRAPH_MAX_EDGES') # Max edges returned from initial graph load
GRAPH_QUERY_TIMEOUT: str = Field("30s", env='GRAPH_QUERY_TIMEOUT') # SQL statement timeout
GRAPH_NEIGHBOR_LIMIT: int = Field(50, env='GRAPH_NEIGHBOR_LIMIT') # Default max neighbors per expansion

# Sandbox allowlist settings
sandbox_default_schema: str = Field('sandbox', validation_alias=AliasChoices('SANDBOX_DEFAULT_SCHEMA', 'sandbox_default_schema'))
sandbox_allowed_catalog_prefixes: List[str] = Field(default_factory=lambda: ['user_'], validation_alias=AliasChoices('SANDBOX_ALLOWED_CATALOG_PREFIXES', 'sandbox_allowed_catalog_prefixes'))
Expand Down
Loading