diff --git a/data-warehousing/README.md b/data-warehousing/README.md index 40d4d85..44ae1ed 100644 --- a/data-warehousing/README.md +++ b/data-warehousing/README.md @@ -1 +1,15 @@ # Data Warehousing + +## Projects + +### [Databricks Metric Views](./dbrx-metric-views/) + +A demo showcasing how to use Unity Catalog Metric Views in Databricks to define semantic models directly on the platform. Built on top of the [Retail Store Star Schema Dataset](https://www.kaggle.com/datasets/shrinivasv/retail-store-star-schema-dataset?select=fact_sales_denormalized.csv), it demonstrates how embedding your semantic layer in Databricks provides unified governance through Unity Catalog alongside optimal query performance — eliminating the need for external semantic modeling tools. + +### [Genie Space CI/CD](./genie-cicd/) + +An automated CI/CD pipeline for promoting Databricks AI/BI Genie spaces across environments. The project uses Databricks Asset Bundles (DABs) to export a Genie space configuration from a Dev workspace, version-control it in Git, and deploy it to a Prod workspace with automatic Unity Catalog catalog/schema reference replacement. It supports both creating new and updating existing Genie spaces, runs on serverless compute by default, and is ready to integrate with CI/CD platforms like GitHub Actions or Azure DevOps. + +### [Genie Room Creation](./genie-room-creation/) + +A Databricks notebook that enables programmatic creation of AI/BI Genie spaces using the Databricks Python SDK and interactive widgets. It provides a guided, widget-driven experience for configuring a new Genie space — including title, description, warehouse selection, table identifiers, and sample instructions — all without writing manual HTTP requests. The notebook also demonstrates advanced patterns such as listing existing spaces, customizing data sources with sample questions, and leveraging the SDK's built-in authentication and retry capabilities. 
diff --git a/data-warehousing/dbrx-metric-views/0_IngestData.ipynb b/data-warehousing/dbrx-metric-views/0_IngestData.ipynb new file mode 100644 index 0000000..88aa5cd --- /dev/null +++ b/data-warehousing/dbrx-metric-views/0_IngestData.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "318cb8e6-9c5e-4fc3-bf7f-c3222c1c47b6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "!pip install kagglehub\n", + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9918dd73-dce5-4f93-92d8-fefdcbcb26c8", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.widgets.text(\"CATALOG_NAME\", \"\", \"Catalog Name\")\n", + "dbutils.widgets.text(\"SCHEMA_NAME\", \"\", \"Schema Name\")\n", + "CATALOG_NAME = dbutils.widgets.get(\"CATALOG_NAME\")\n", + "SCHEMA_NAME = dbutils.widgets.get(\"SCHEMA_NAME\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Catalog, Schema, and Volume" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"CREATE CATALOG IF NOT EXISTS {CATALOG_NAME}\")\n", + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{SCHEMA_NAME}\")\n", + "spark.sql(f\"CREATE VOLUME IF NOT EXISTS {CATALOG_NAME}.{SCHEMA_NAME}.kaggle_files\")" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure Kaggle Download Path" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "038ad156-d93b-4c82-81a2-415c23c1bb52", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "VOLUME_PATH = f\"/Volumes/{CATALOG_NAME}/{SCHEMA_NAME}/kaggle_files\"\n", + "os.environ[\"KAGGLEHUB_CACHE\"] = VOLUME_PATH" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download Dataset from Kaggle" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9277a6c9-63b8-4280-9995-f392d718d481", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import kagglehub\n", + "\n", + "# Download latest version\n", + "path = kagglehub.dataset_download(\"shrinivasv/retail-store-star-schema-dataset\")\n", + "\n", + "print(\"Path to dataset files:\", path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load CSV Files into Delta Tables" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "80f1d37c-096a-4876-a7e4-1e782dc013f7", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import glob\n", + "\n", + "csv_folder = path # path variable from kagglehub.dataset_download\n", + "csv_files = glob.glob(f\"{csv_folder}/*.csv\")\n", + "\n", + "for csv_file in csv_files:\n", + " table_name = 
os.path.splitext(os.path.basename(csv_file))[0]\n", + " df = spark.read.csv(csv_file, header=True, inferSchema=True)\n", + " # Clean column names: replace spaces with underscores\n", + " for c in df.columns:\n", + " df = df.withColumnRenamed(c, c.replace(\" \", \"_\"))\n", + " full_table_name = f\"{CATALOG_NAME}.{SCHEMA_NAME}.{table_name}\"\n", + " spark.sql(f\"DROP TABLE IF EXISTS {full_table_name}\")\n", + " df.write.saveAsTable(full_table_name)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": { + "hardware": { + "accelerator": null, + "gpuPoolId": null, + "memory": null + } + }, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 7680324554559970, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "0_IngestData", + "widgets": { + "CATALOG_NAME": { + "currentValue": "pedroz_catalog", + "nuid": "0f8987ce-14af-4a61-aec6-8f2bdf7e0859", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "pedroz_catalog", + "label": "Catalog Name", + "name": "CATALOG_NAME", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "pedroz_catalog", + "label": "Catalog Name", + "name": "CATALOG_NAME", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "SCHEMA_NAME": { + "currentValue": "metric_views_schema", + "nuid": "4f108163-255b-4738-83a3-72a2e2d7dc19", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "metric_views_schema", + "label": "Schema Name", + "name": "SCHEMA_NAME", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + 
"defaultValue": "metric_views_schema", + "label": "Schema Name", + "name": "SCHEMA_NAME", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/data-warehousing/dbrx-metric-views/1_CreateMetricView.ipynb b/data-warehousing/dbrx-metric-views/1_CreateMetricView.ipynb new file mode 100644 index 0000000..5194db5 --- /dev/null +++ b/data-warehousing/dbrx-metric-views/1_CreateMetricView.ipynb @@ -0,0 +1,596 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ae146916-a24a-4973-bd09-ad7cc50c8ea0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "The metric view below condenses the joins depicted in this ERD, and adds calculated measures and columns on top of it:\n", + "\n", + "\n", + "Many advanced settings were explored to create this Metric View, such as:\n", + "- Using joins (nested and un-nested): https://docs.databricks.com/aws/en/metric-views/data-modeling/joins\n", + "- Adding semantic metadata: https://docs.databricks.com/aws/en/metric-views/data-modeling/semantic-metadata\n", + "- Using window measures: https://docs.databricks.com/aws/en/metric-views/data-modeling/window-measures\n", + "\n", + "Adding your semantic models directly in Databricks lets you benefit from using the Databricks compute for querying your data, while also being integrated with Unity Catalog, allowing for truly unified governance and optimal performance. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "89da16d1-c4f9-4e8d-8232-035e8263d083", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.widgets.text(\"CATALOG_NAME\", \"\", \"Catalog Name\")\n", + "dbutils.widgets.text(\"SCHEMA_NAME\", \"\", \"Schema Name\")\n", + "CATALOG_NAME = dbutils.widgets.get(\"CATALOG_NAME\")\n", + "SCHEMA_NAME = dbutils.widgets.get(\"SCHEMA_NAME\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Metric View" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c1749c4c-ed90-440e-be91-63a07dbb0993", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "sql_statement = f\"\"\"\n", + "CREATE OR REPLACE VIEW {CATALOG_NAME}.{SCHEMA_NAME}.sales_relationships WITH METRICS LANGUAGE YAML AS \n", + "$$\n", + "version: 1.1\n", + "\n", + "source: {CATALOG_NAME}.{SCHEMA_NAME}.fact_sales_normalized\n", + "\n", + "joins:\n", + " # Star join between fact table and dim_campaigns\n", + " - name: dim_campaigns\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_campaigns\n", + " \"on\": source.campaign_sk = dim_campaigns.campaign_sk\n", + " # Snowflake joins from dim_campaigns to campaign start/end dates\n", + " joins:\n", + " - name: campaign_start_date\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_dates\n", + " \"on\": dim_campaigns.start_date_sk = campaign_start_date.date_sk\n", + " - name: campaign_end_date\n", + " source: 
{CATALOG_NAME}.{SCHEMA_NAME}.dim_dates\n", + " \"on\": dim_campaigns.end_date_sk = campaign_end_date.date_sk\n", + " \n", + " # Star join between fact table and dim_customers\n", + " - name: dim_customers\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_customers\n", + " \"on\": source.customer_sk = dim_customers.customer_sk\n", + " \n", + " # Star join between fact table and dim_dates\n", + " - name: dim_dates\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_dates\n", + " \"on\": source.sales_date = dim_dates.full_date\n", + " \n", + " # Star join between fact table and dim_products\n", + " - name: dim_products\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_products\n", + " \"on\": source.product_sk = dim_products.product_sk\n", + " \n", + " # Star join between fact table and dim_salespersons\n", + " - name: dim_salespersons\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_salespersons\n", + " \"on\": source.salesperson_sk = dim_salespersons.salesperson_sk\n", + "\n", + " # Star join between fact table and dim_stores\n", + " - name: dim_stores\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_stores\n", + " \"on\": source.store_sk = dim_stores.store_sk\n", + " # Snowflake join from dim_stores to store_manager_salesperson\n", + " joins:\n", + " - name: store_manager_salesperson\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_salespersons\n", + " \"on\": dim_stores.store_manager_sk = store_manager_salesperson.salesperson_sk\n", + "\n", + "comment: \"Metric view for analyzing sales relationships across campaigns, customers, dates, products, salespersons, and stores. Includes time-based dimensions and aggregated sales dimensions. 
Also includes all columns from dimension tables for richer analysis.\"\n", + "\n", + "dimensions:\n", + "\n", + " # Columns from fact table\n", + " - name: sales_date\n", + " expr: date(source.sales_date)\n", + " comment: Sales Date\n", + " display_name: Sales Date\n", + " synonyms: ['date of sale']\n", + " - name: total_amount\n", + " expr: total_amount\n", + " comment: Sales amount\n", + " display_name: Sales amount\n", + " synonyms: ['total amount', 'sale amount']\n", + "\n", + " # Dimension columns from dim_customers\n", + " - name: first_name\n", + " expr: dim_customers.first_name\n", + " comment: Customer first name\n", + " display_name: First Name\n", + " synonyms: ['customer first', 'given name']\n", + " - name: last_name\n", + " expr: dim_customers.last_name\n", + " comment: Customer last name\n", + " display_name: Last Name\n", + " synonyms: ['customer last', 'surname']\n", + " - name: customer_segment\n", + " expr: dim_customers.customer_segment\n", + " comment: Customer segment\n", + " display_name: Customer Segment\n", + " synonyms: ['segment', 'customer type']\n", + " - name: customer_email\n", + " expr: dim_customers.email\n", + " comment: Customer email\n", + " display_name: Customer Email\n", + " synonyms: ['email', 'contact email']\n", + " - name: customer_residential_location\n", + " expr: dim_customers.residential_location\n", + " comment: Customer residential location\n", + " display_name: Customer Residential Location\n", + " synonyms: ['residence', 'home location']\n", + " # Calculated customer columns\n", + " - name: customer_name\n", + " expr: dim_customers.first_name || ' ' || dim_customers.last_name\n", + " comment: Customer name\n", + " display_name: Customer Name\n", + " synonyms: ['name', 'full name']\n", + "\n", + " # Dimension columns from dim_stores\n", + " - name: store_name\n", + " expr: dim_stores.store_name\n", + " comment: Store name\n", + " display_name: Store Name\n", + " synonyms: ['location name', 'retail name']\n", + " - 
name: store_type\n", + " expr: dim_stores.store_type\n", + " comment: Store type\n", + " display_name: Store Type\n", + " synonyms: ['type', 'retail type']\n", + " - name: store_location\n", + " expr: dim_stores.store_location\n", + " comment: Store location\n", + " display_name: Store Location\n", + " synonyms: ['location', 'address']\n", + "\n", + " # Dimension columns from dim_products\n", + " - name: product_name\n", + " expr: dim_products.product_name\n", + " comment: Product name\n", + " display_name: Product Name\n", + " synonyms: ['item name', 'sku name']\n", + " - name: product_category\n", + " expr: dim_products.category\n", + " comment: Product category\n", + " display_name: Product Category\n", + " synonyms: ['category', 'product type']\n", + " - name: product_brand\n", + " expr: dim_products.brand\n", + " comment: Product brand\n", + " display_name: Brand\n", + " synonyms: ['brand', 'manufacturer']\n", + " - name: product_origin_location\n", + " expr: dim_products.origin_location\n", + " comment: Product origin location\n", + " display_name: Origin Location\n", + " synonyms: ['origin', 'source location']\n", + "\n", + " # Dimension columns from dim_campaigns\n", + " - name: campaign_name\n", + " expr: dim_campaigns.campaign_name\n", + " comment: Campaign name\n", + " display_name: Campaign Name\n", + " synonyms: ['promotion name', 'marketing name']\n", + " - name: campaign_budget\n", + " expr: dim_campaigns.campaign_budget\n", + " comment: Campaign budget\n", + " display_name: Campaign Budget\n", + " synonyms: ['budget', 'promotion budget']\n", + "\n", + " # Dimension columns from dim_dates\n", + " - name: full_date\n", + " expr: dim_dates.full_date\n", + " comment: Full date\n", + " display_name: Full Date\n", + " synonyms: ['date', 'transaction date']\n", + " - name: year\n", + " expr: dim_dates.year\n", + " comment: Year\n", + " display_name: Year\n", + " synonyms: ['calendar year', 'fiscal year']\n", + " - name: month\n", + " expr: 
dim_dates.month\n", + " comment: Month\n", + " display_name: Month\n", + " synonyms: ['calendar month', 'fiscal month']\n", + " - name: day\n", + " expr: dim_dates.day\n", + " comment: Day\n", + " display_name: Day\n", + " synonyms: ['calendar day', 'date day']\n", + " - name: weekday\n", + " expr: dim_dates.weekday\n", + " comment: Weekday\n", + " display_name: Weekday\n", + " synonyms: ['day of week', 'weekday name']\n", + " - name: quarter\n", + " expr: dim_dates.quarter\n", + " comment: Quarter\n", + " display_name: Quarter\n", + " synonyms: ['fiscal quarter', 'calendar quarter']\n", + " # Calculated datetime columns\n", + " - name: week\n", + " expr: \"date_trunc('week', sales_date)\"\n", + " comment: Week of the sale\n", + " display_name: Week\n", + " format:\n", + " type: date\n", + " date_format: year_week\n", + " leading_zeros: false\n", + " synonyms: ['sales week', 'transaction week']\n", + " - name: dayOfWeek\n", + " expr: dayofweek(sales_date)\n", + " comment: \"Day of the week for the sale (1=Sunday, 7=Saturday)\"\n", + " display_name: Day of Week\n", + " format:\n", + " type: number\n", + " synonyms: ['weekday number', 'day index']\n", + " - name: YearMonth\n", + " expr: \"date_trunc('month', sales_date)\"\n", + " comment: Year and month of the sale\n", + " display_name: Year Month\n", + " format:\n", + " type: date\n", + " date_format: locale_number_month\n", + " leading_zeros: false\n", + " synonyms: ['month', 'year and month']\n", + "\n", + " # Dimension columns from dim_salespersons\n", + " - name: salesperson_name\n", + " expr: dim_salespersons.salesperson_name\n", + " comment: Salesperson name\n", + " display_name: Salesperson Name\n", + " synonyms: ['rep name', 'employee name']\n", + " - name: salesperson_role\n", + " expr: dim_salespersons.salesperson_role\n", + " comment: Salesperson role\n", + " display_name: Salesperson Role\n", + " synonyms: ['role', 'job title']\n", + "\n", + "measures:\n", + " # Calculated measures\n", + " - name: 
sales_sum\n", + " expr: SUM(source.total_amount)\n", + " comment: Total sales amount\n", + " display_name: Total Sales\n", + " synonyms: ['revenue', 'total sales']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_avg\n", + " expr: AVG(source.total_amount)\n", + " comment: Average sales amount\n", + " display_name: Average Sales\n", + " synonyms: ['average revenue', 'mean sales']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_stddev\n", + " expr: STDDEV(source.total_amount)\n", + " comment: Standard deviation of sales amount\n", + " display_name: Sales Standard Deviation\n", + " synonyms: ['sales variability', 'sales stddev']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_median\n", + " expr: percentile(source.total_amount, 0.5)\n", + " comment: Median sales amount\n", + " display_name: Sales Median\n", + " synonyms: ['median revenue', 'median sales']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_quartile1\n", + " expr: percentile(source.total_amount, 0.25)\n", + " comment: First quartile (Q1) of sales amount\n", + " display_name: Sales Quartile 1\n", + " synonyms: ['Q1 sales', 'first quartile']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_quartile2\n", + " expr: percentile(source.total_amount, 0.5)\n", + " comment: Second quartile (Q2/Median) of sales amount\n", + " display_name: Sales Quartile 2 (Median)\n", + 
" synonyms: ['Q2 sales', 'median']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_quartile3\n", + " expr: percentile(source.total_amount, 0.75)\n", + " comment: Third quartile (Q3) of sales amount\n", + " display_name: Sales Quartile 3\n", + " synonyms: ['Q3 sales', 'third quartile']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_quartile4\n", + " expr: percentile(source.total_amount, 1.0)\n", + " comment: Fourth quartile (Q4/Max) of sales amount\n", + " display_name: Sales Quartile 4 (Max)\n", + " synonyms: ['Q4 sales', 'max sales']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_largest\n", + " expr: MAX(source.total_amount)\n", + " comment: Largest Sale\n", + " display_name: Largest Sale\n", + " synonyms: ['max sale', 'highest sale']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_smallest\n", + " expr: MIN(source.total_amount)\n", + " comment: Smallest Sale\n", + " display_name: Smallest Sale\n", + " synonyms: ['min sale', 'lowest sale']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_mode\n", + " expr: mode(source.total_amount)\n", + " comment: Mode of sales amount\n", + " display_name: Sales Mode\n", + " synonyms: ['most common sale', 'mode']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + "\n", 
+ " # Window measures\n", + " - name: previous_day_sales\n", + " expr: SUM(total_amount)\n", + " comment: Previous Day Sales\n", + " display_name: Previous Day Sales\n", + " synonyms: ['last day sales', 'yesterday sales']\n", + " window:\n", + " - order: sales_date\n", + " range: trailing 1 day\n", + " semiadditive: last\n", + " - name: current_day_sales\n", + " expr: SUM(total_amount)\n", + " comment: Current Day Sales\n", + " display_name: Current Day Sales\n", + " synonyms: ['today sales']\n", + " window:\n", + " - order: sales_date\n", + " range: current\n", + " semiadditive: last\n", + " - name: day_over_day_growth\n", + " expr: (MEASURE(current_day_sales) - MEASURE(previous_day_sales)) / MEASURE(previous_day_sales) * 100\n", + "\n", + " - name: running_total_sales\n", + " expr: SUM(total_amount)\n", + " comment: Running Total Sales\n", + " display_name: Running Total Sales\n", + " synonyms: ['running sales']\n", + " window:\n", + " - order: sales_date\n", + " range: cumulative\n", + " semiadditive: last\n", + "\n", + " - name: ytd_sales\n", + " expr: SUM(total_amount)\n", + " comment: YTD Sales\n", + " display_name: YTD Sales\n", + " synonyms: ['year-to-date sales']\n", + " window:\n", + " - order: sales_date\n", + " range: cumulative\n", + " semiadditive: last\n", + " - order: year\n", + " range: current\n", + " semiadditive: last\n", + "\n", + " - name: t7d_customers\n", + " expr: COUNT(DISTINCT customer_sk)\n", + " comment: Customers last 7 days\n", + " display_name: Customers last 7 days\n", + " synonyms: ['last 7 days customers', 'number of customers last 7 days']\n", + " window:\n", + " - order: sales_date\n", + " range: trailing 7 day\n", + " semiadditive: last\n", + "\n", + " - name: t30d_customers\n", + " expr: COUNT(DISTINCT customer_sk)\n", + " comment: Customers last 30 days\n", + " display_name: Customers last 30 days\n", + " synonyms: ['last 30 days customers', 'number of customers last 30 days']\n", + " window:\n", + " - order: sales_date\n", + 
" range: trailing 30 day\n", + " semiadditive: last\n", + "\n", + "$$\n", + "\"\"\"\n", + "\n", + "spark.sql(sql_statement)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 7680324554526009, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "1_CreateMetricView", + "widgets": { + "CATALOG_NAME": { + "currentValue": "pedroz_catalog", + "nuid": "a1da4dd4-31bf-443a-957c-041e6fa5b700", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "pedroz_catalog", + "label": "Catalog Name", + "name": "CATALOG_NAME", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "pedroz_catalog", + "label": "Catalog Name", + "name": "CATALOG_NAME", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "SCHEMA_NAME": { + "currentValue": "metric_views_schema", + "nuid": "27f37404-389a-4949-bb98-a07b86e3d7f0", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "metric_views_schema", + "label": "Schema Name", + "name": "SCHEMA_NAME", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "metric_views_schema", + "label": "Schema Name", + "name": "SCHEMA_NAME", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/data-warehousing/dbrx-metric-views/2_QueryMetricView.ipynb 
b/data-warehousing/dbrx-metric-views/2_QueryMetricView.ipynb new file mode 100644 index 0000000..967ffab --- /dev/null +++ b/data-warehousing/dbrx-metric-views/2_QueryMetricView.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b2f360c8-b4ec-44cf-81b4-67fa3e4be576", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "!pip install databricks-sdk -U\n", + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query Metric View" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c3a3034c-fa9b-4f1c-b84e-3accfd298a12", + "showTitle": false, + "tableResultSettingsMap": { + "0": { + "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1766450269016}", + "filterBlob": null, + "queryPlanFiltersBlob": null, + "tableResultIndex": 0 + } + }, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.widgets.text(\"CATALOG_NAME\", \"\", \"Catalog Name\")\n", + "dbutils.widgets.text(\"SCHEMA_NAME\", \"\", \"Schema Name\")\n", + "CATALOG_NAME = dbutils.widgets.get(\"CATALOG_NAME\")\n", + "SCHEMA_NAME = dbutils.widgets.get(\"SCHEMA_NAME\")\n", + "\n", + "# This is just a simple query example, demonstrating that you can query Metric Views just like any other view!\n", + "df = spark.sql(f\"\"\"\n", + "SELECT\n", + " 
dayOfWeek,\n", + " MEASURE(`sales_sum`),\n", + " MEASURE(`sales_avg`),\n", + " MEASURE(`sales_stddev`),\n", + " MEASURE(`sales_quartile1`),\n", + " MEASURE(`sales_quartile2`),\n", + " MEASURE(`sales_quartile3`),\n", + " MEASURE(`sales_quartile4`),\n", + " MEASURE(`sales_largest`),\n", + " MEASURE(`sales_smallest`)\n", + "FROM {CATALOG_NAME}.{SCHEMA_NAME}.sales_relationships\n", + "GROUP BY ALL\n", + "ORDER BY dayOfWeek\n", + "\"\"\")\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Genie Space" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "101f839d-1f1a-44fd-8b94-a93e283a3c98", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "You can also consume Metric Views in Genie:\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a90ba13a-b72b-462d-bc5f-c6374485adbc", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Check out the sample serialized Genie space [./dashboard_and_genie/genie_space.json](./dashboard_and_genie/genie_space.json). \n", + "You can use the code snippet below to automatically create it in your workspace; it uses the [Create Genie Space](https://docs.databricks.com/api/workspace/genie/createspace) API. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "78273a45-f363-404c-b7e8-1737e98d718c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import json\n", + "from databricks.sdk import WorkspaceClient\n", + "\n", + "# Load the serialized Genie space from the JSON file\n", + "with open('./dashboard_and_genie/genie_space.json', 'r') as f:\n", + " genie_space_json_str = f.read()\n", + "\n", + "# Replace CATALOG_NAME and SCHEMA_NAME placeholders with parameter values\n", + "catalog_name = dbutils.widgets.get(\"CATALOG_NAME\")\n", + "schema_name = dbutils.widgets.get(\"SCHEMA_NAME\")\n", + "genie_space_json_str = genie_space_json_str.replace(\"CATALOG_NAME\", catalog_name).replace(\"SCHEMA_NAME\", schema_name)\n", + "\n", + "# Specify the warehouse ID (replace with your actual warehouse ID)\n", + "dbutils.widgets.text(\"warehouse_id\", \"\", \"Warehouse ID\")\n", + "warehouse_id = dbutils.widgets.get(\"warehouse_id\")\n", + "\n", + "# Prepare the payload as required\n", + "payload = {\n", + " \"serialized_space\": genie_space_json_str,\n", + " \"warehouse_id\": warehouse_id,\n", + " \"title\": \"Sample Sales Genie Space (created via API)\"\n", + "}\n", + "\n", + "# Initialize WorkspaceClient (will use DATABRICKS_HOST and DATABRICKS_TOKEN from env)\n", + "w = WorkspaceClient()\n", + "workspace_url = w.config.host\n", + "pat_token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()\n", + "\n", + "import requests\n", + "\n", + "headers = {\n", + " \"Authorization\": f\"Bearer {pat_token}\",\n", + " \"Content-Type\": \"application/json\"\n", + "}\n", + "\n", + "# POST to Genie Create Space API\n", + "response = requests.post(\n", + " f\"{workspace_url}/api/2.0/genie/spaces\",\n", + " headers=headers,\n", + " 
data=json.dumps(payload)\n", + ")\n", + "\n", + "response.raise_for_status()\n", + "space_info = response.json()\n", + "print(f\"Created Genie space with ID *{space_info['space_id']}* and name *{space_info['title']}*\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create AIBI Dashboards" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8281f8e3-dd67-4639-be89-98a61f268eb4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "And create dashboards using it! \n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "07d7f356-cd20-4182-ac6f-f19c34aba167", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Check out the [./dashboard_and_genie/dashboard.lvdash.json](./dashboard_and_genie/dashboard.lvdash.json) dashboard for a simple dashboard that can be used as an example to show how you can consume Metric Views using AIBI Dashboards. \n", + "\n", + "**Note**: You will need to replace the catalog and schema names from the dashboard.lvdash.json file from **CATALOG_NAME** and **SCHEMA_NAME** to your desired catalog and schema names. 
The script below does that automatically, and creates another .lvdash.json file which you can edit called [./dashboard_and_genie/dashboard_parameterized.lvdash.json](./dashboard_and_genie/dashboard_parameterized.lvdash.json)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "27d714f9-39f9-4df3-8c7f-8d6b8f1ff6e3", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "catalog_name = dbutils.widgets.get(\"CATALOG_NAME\")\n", + "schema_name = dbutils.widgets.get(\"SCHEMA_NAME\")\n", + "\n", + "with open('./dashboard_and_genie/dashboard.lvdash.json', 'r') as f:\n", + " dashboard_json_str = f.read()\n", + "\n", + "dashboard_json_str = dashboard_json_str.replace(\"CATALOG_NAME\", catalog_name).replace(\"SCHEMA_NAME\", schema_name)\n", + "\n", + "with open('./dashboard_and_genie/dashboard_parameterized.lvdash.json', 'w') as f:\n", + " f.write(dashboard_json_str)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": -1, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "2_QueryMetricView", + "widgets": { + "CATALOG_NAME": { + "currentValue": "pedroz_catalog", + "nuid": "06f74b0f-eecd-4def-b995-d03bad17de04", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "pedroz_catalog", + "label": null, + "name": "CATALOG_NAME", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "pedroz_catalog", + "label": null, + 
"name": "CATALOG_NAME", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "SCHEMA_NAME": { + "currentValue": "metric_views_schema", + "nuid": "d3fbedb2-1ada-41bb-b5d0-168b130f970a", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "metric_views_schema", + "label": null, + "name": "SCHEMA_NAME", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "metric_views_schema", + "label": null, + "name": "SCHEMA_NAME", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "warehouse_id": { + "currentValue": "8baced1ff014912d", + "nuid": "7ed415c1-0a75-4920-9494-41bd4c6edfa7", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "8baced1ff014912d", + "label": null, + "name": "warehouse_id", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "8baced1ff014912d", + "label": null, + "name": "warehouse_id", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/data-warehousing/dbrx-metric-views/3_MaterializeMetricView.ipynb b/data-warehousing/dbrx-metric-views/3_MaterializeMetricView.ipynb new file mode 100644 index 0000000..d2ef817 --- /dev/null +++ b/data-warehousing/dbrx-metric-views/3_MaterializeMetricView.ipynb @@ -0,0 +1,789 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "aaffdcfa-ecd7-444b-8e36-e3cdc6490ad2", + "showTitle": false, + "tableResultSettingsMap": {}, + 
"title": "" + } + }, + "source": [ + "Materialization for metric views is an [experimental](https://docs.databricks.com/aws/en/release-notes/release-types) feature. The idea is that you can create a materialized view on top of a metric view definition, and that will improve query performance. \n", + "\n", + "Check out more details about it here: https://docs.databricks.com/aws/en/metric-views/materialization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9c2362d4-df39-4726-a68a-0879a54f74a0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.widgets.text(\"CATALOG_NAME\", \"\", \"Catalog Name\")\n", + "dbutils.widgets.text(\"SCHEMA_NAME\", \"\", \"Schema Name\")\n", + "CATALOG_NAME = dbutils.widgets.get(\"CATALOG_NAME\")\n", + "SCHEMA_NAME = dbutils.widgets.get(\"SCHEMA_NAME\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the metric view definition below is basically the same definition that we applied for the creation of the metric view from script ./1_CreateMetricView.ipynb. \n", + "\n", + "The main difference lies on the \"materialization\" key, which is introduced in this notebook, and shows how you can materialize a metric view (either unaggregated or aggregated) in a schedule." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "23ad13dc-894b-4c3f-9b26-4390697ebb59", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "sql_statement = f\"\"\"\n", + "CREATE OR REPLACE VIEW {CATALOG_NAME}.{SCHEMA_NAME}.materialized_sales_relationships WITH METRICS LANGUAGE YAML AS \n", + "$$\n", + "version: 1.1\n", + "\n", + "source: {CATALOG_NAME}.{SCHEMA_NAME}.fact_sales_normalized\n", + "\n", + "joins:\n", + " # Star join between fact table and dim_campaigns\n", + " - name: dim_campaigns\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_campaigns\n", + " \"on\": source.campaign_sk = dim_campaigns.campaign_sk\n", + " # Snowflake joins from dim_campaigns to campaign start/end dates\n", + " joins:\n", + " - name: campaign_start_date\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_dates\n", + " \"on\": dim_campaigns.start_date_sk = campaign_start_date.date_sk\n", + " - name: campaign_end_date\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_dates\n", + " \"on\": dim_campaigns.end_date_sk = campaign_end_date.date_sk\n", + " \n", + " # Star join between fact table and dim_customers\n", + " - name: dim_customers\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_customers\n", + " \"on\": source.customer_sk = dim_customers.customer_sk\n", + " \n", + " # Star join between fact table and dim_dates\n", + " - name: dim_dates\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_dates\n", + " \"on\": source.sales_date = dim_dates.full_date\n", + " \n", + " # Star join between fact table and dim_products\n", + " - name: dim_products\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_products\n", + " \"on\": source.product_sk = dim_products.product_sk\n", + " \n", + " # Star join between fact table and dim_salespersons\n", + " - name: 
dim_salespersons\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_salespersons\n", + " \"on\": source.salesperson_sk = dim_salespersons.salesperson_sk\n", + "\n", + " # Star join between fact table and dim_stores\n", + " - name: dim_stores\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_stores\n", + " \"on\": source.store_sk = dim_stores.store_sk\n", + " # Snowflake join from dim_stores to store_manager_salesperson\n", + " joins:\n", + " - name: store_manager_salesperson\n", + " source: {CATALOG_NAME}.{SCHEMA_NAME}.dim_salespersons\n", + " \"on\": dim_stores.store_manager_sk = store_manager_salesperson.salesperson_sk\n", + "\n", + "comment: \"Metric view for analyzing sales relationships across campaigns, customers, dates, products, salespersons, and stores. Includes time-based dimensions and aggregated sales dimensions. Also includes all columns from dimension tables for richer analysis.\"\n", + "\n", + "dimensions:\n", + "\n", + " # Columns from fact table\n", + " - name: sales_date\n", + " expr: date(source.sales_date)\n", + " comment: Sales Date\n", + " display_name: Sales Date\n", + " synonyms: ['date of sale']\n", + " - name: total_amount\n", + " expr: total_amount\n", + " comment: Sales amount\n", + " display_name: Sales amount\n", + " synonyms: ['total amount', 'sale amount']\n", + "\n", + " # Dimension columns from dim_customers\n", + " - name: first_name\n", + " expr: dim_customers.first_name\n", + " comment: Customer first name\n", + " display_name: First Name\n", + " synonyms: ['customer first', 'given name']\n", + " - name: last_name\n", + " expr: dim_customers.last_name\n", + " comment: Customer last name\n", + " display_name: Last Name\n", + " synonyms: ['customer last', 'surname']\n", + " - name: customer_segment\n", + " expr: dim_customers.customer_segment\n", + " comment: Customer segment\n", + " display_name: Customer Segment\n", + " synonyms: ['segment', 'customer type']\n", + " - name: customer_email\n", + " expr: dim_customers.email\n", + " 
comment: Customer email\n", + " display_name: Customer Email\n", + " synonyms: ['email', 'contact email']\n", + " - name: customer_residential_location\n", + " expr: dim_customers.residential_location\n", + " comment: Customer residential location\n", + " display_name: Customer Residential Location\n", + " synonyms: ['residence', 'home location']\n", + " # Calculated customer columns\n", + " - name: customer_name\n", + " expr: dim_customers.first_name || ' ' || dim_customers.last_name\n", + " comment: Customer name\n", + " display_name: Customer Name\n", + " synonyms: ['name', 'full name']\n", + "\n", + " # Dimension columns from dim_stores\n", + " - name: store_name\n", + " expr: dim_stores.store_name\n", + " comment: Store name\n", + " display_name: Store Name\n", + " synonyms: ['location name', 'retail name']\n", + " - name: store_type\n", + " expr: dim_stores.store_type\n", + " comment: Store type\n", + " display_name: Store Type\n", + " synonyms: ['type', 'retail type']\n", + " - name: store_location\n", + " expr: dim_stores.store_location\n", + " comment: Store location\n", + " display_name: Store Location\n", + " synonyms: ['location', 'address']\n", + "\n", + " # Dimension columns from dim_products\n", + " - name: product_name\n", + " expr: dim_products.product_name\n", + " comment: Product name\n", + " display_name: Product Name\n", + " synonyms: ['item name', 'sku name']\n", + " - name: product_category\n", + " expr: dim_products.category\n", + " comment: Product category\n", + " display_name: Product Category\n", + " synonyms: ['category', 'product type']\n", + " - name: product_brand\n", + " expr: dim_products.brand\n", + " comment: Product brand\n", + " display_name: Brand\n", + " synonyms: ['brand', 'manufacturer']\n", + " - name: product_origin_location\n", + " expr: dim_products.origin_location\n", + " comment: Product origin location\n", + " display_name: Origin Location\n", + " synonyms: ['origin', 'source location']\n", + "\n", + " # Dimension 
columns from dim_campaigns\n", + " - name: campaign_name\n", + " expr: dim_campaigns.campaign_name\n", + " comment: Campaign name\n", + " display_name: Campaign Name\n", + " synonyms: ['promotion name', 'marketing name']\n", + " - name: campaign_budget\n", + " expr: dim_campaigns.campaign_budget\n", + " comment: Campaign budget\n", + " display_name: Campaign Budget\n", + " synonyms: ['budget', 'promotion budget']\n", + "\n", + " # Dimension columns from dim_dates\n", + " - name: full_date\n", + " expr: dim_dates.full_date\n", + " comment: Full date\n", + " display_name: Full Date\n", + " synonyms: ['date', 'transaction date']\n", + " - name: year\n", + " expr: dim_dates.year\n", + " comment: Year\n", + " display_name: Year\n", + " synonyms: ['calendar year', 'fiscal year']\n", + " - name: month\n", + " expr: dim_dates.month\n", + " comment: Month\n", + " display_name: Month\n", + " synonyms: ['calendar month', 'fiscal month']\n", + " - name: day\n", + " expr: dim_dates.day\n", + " comment: Day\n", + " display_name: Day\n", + " synonyms: ['calendar day', 'date day']\n", + " - name: weekday\n", + " expr: dim_dates.weekday\n", + " comment: Weekday\n", + " display_name: Weekday\n", + " synonyms: ['day of week', 'weekday name']\n", + " - name: quarter\n", + " expr: dim_dates.quarter\n", + " comment: Quarter\n", + " display_name: Quarter\n", + " synonyms: ['fiscal quarter', 'calendar quarter']\n", + " # Calculated datetime columns\n", + " - name: week\n", + " expr: \"date_trunc('week', sales_date)\"\n", + " comment: Week of the sale\n", + " display_name: Week\n", + " format:\n", + " type: date\n", + " date_format: year_week\n", + " leading_zeros: false\n", + " synonyms: ['sales week', 'transaction week']\n", + " - name: dayOfWeek\n", + " expr: dayofweek(sales_date)\n", + " comment: \"Day of the week for the sale (1=Sunday, 7=Saturday)\"\n", + " display_name: Day of Week\n", + " format:\n", + " type: number\n", + " synonyms: ['weekday number', 'day index']\n", + " - name: 
YearMonth\n", + " expr: \"date_trunc('month', sales_date)\"\n", + " comment: Year and month of the sale\n", + " display_name: Year Month\n", + " format:\n", + " type: date\n", + " date_format: locale_number_month\n", + " leading_zeros: false\n", + " synonyms: ['month', 'year and month']\n", + "\n", + " # Dimension columns from dim_salespersons\n", + " - name: salesperson_name\n", + " expr: dim_salespersons.salesperson_name\n", + " comment: Salesperson name\n", + " display_name: Salesperson Name\n", + " synonyms: ['rep name', 'employee name']\n", + " - name: salesperson_role\n", + " expr: dim_salespersons.salesperson_role\n", + " comment: Salesperson role\n", + " display_name: Salesperson Role\n", + " synonyms: ['role', 'job title']\n", + "\n", + "measures:\n", + " # Calculated measures\n", + " - name: sales_sum\n", + " expr: SUM(total_amount)\n", + " comment: Total sales amount\n", + " display_name: Total Sales\n", + " synonyms: ['revenue', 'total sales']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_avg\n", + " expr: AVG(total_amount)\n", + " comment: Average sales amount\n", + " display_name: Average Sales\n", + " synonyms: ['average revenue', 'mean sales']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_stddev\n", + " expr: STDDEV(total_amount)\n", + " comment: Standard deviation of sales amount\n", + " display_name: Sales Standard Deviation\n", + " synonyms: ['sales variability', 'sales stddev']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_median\n", + " expr: percentile(total_amount, 0.5)\n", + " comment: Median sales amount\n", + " display_name: Sales Median\n", + " 
synonyms: ['median revenue', 'median sales']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_quartile1\n", + " expr: percentile(total_amount, 0.25)\n", + " comment: First quartile (Q1) of sales amount\n", + " display_name: Sales Quartile 1\n", + " synonyms: ['Q1 sales', 'first quartile']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_quartile2\n", + " expr: percentile(total_amount, 0.5)\n", + " comment: Second quartile (Q2/Median) of sales amount\n", + " display_name: Sales Quartile 2 (Median)\n", + " synonyms: ['Q2 sales', 'median']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_quartile3\n", + " expr: percentile(total_amount, 0.75)\n", + " comment: Third quartile (Q3) of sales amount\n", + " display_name: Sales Quartile 3\n", + " synonyms: ['Q3 sales', 'third quartile']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_quartile4\n", + " expr: percentile(total_amount, 1.0)\n", + " comment: Fourth quartile (Q4/Max) of sales amount\n", + " display_name: Sales Quartile 4 (Max)\n", + " synonyms: ['Q4 sales', 'max sales']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_largest\n", + " expr: MAX(total_amount)\n", + " comment: Largest Sale\n", + " display_name: Largest Sale\n", + " synonyms: ['max sale', 'highest sale']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: 
exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_smallest\n", + " expr: MIN(total_amount)\n", + " comment: Smallest Sale\n", + " display_name: Smallest Sale\n", + " synonyms: ['min sale', 'lowest sale']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + " - name: sales_mode\n", + " expr: mode(total_amount)\n", + " comment: Mode of sales amount\n", + " display_name: Sales Mode\n", + " synonyms: ['most common sale', 'mode']\n", + " format:\n", + " type: currency\n", + " currency_code: USD\n", + " decimal_places:\n", + " type: exact\n", + " places: 2\n", + " abbreviation: compact\n", + "\n", + " # Window measures\n", + " - name: previous_day_sales\n", + " expr: SUM(total_amount)\n", + " comment: Previous Day Sales\n", + " display_name: Previous Day Sales\n", + " synonyms: ['last day sales', 'yesterday sales']\n", + " window:\n", + " - order: sales_date\n", + " range: trailing 1 day\n", + " semiadditive: last\n", + " - name: current_day_sales\n", + " expr: SUM(total_amount)\n", + " comment: Current Day Sales\n", + " display_name: Current Day Sales\n", + " synonyms: ['today sales']\n", + " window:\n", + " - order: sales_date\n", + " range: current\n", + " semiadditive: last\n", + " - name: day_over_day_growth\n", + " expr: (MEASURE(current_day_sales) - MEASURE(previous_day_sales)) / MEASURE(previous_day_sales) * 100\n", + "\n", + " - name: running_total_sales\n", + " expr: SUM(total_amount)\n", + " comment: Running Total Sales\n", + " display_name: Running Total Sales\n", + " synonyms: ['running sales']\n", + " window:\n", + " - order: sales_date\n", + " range: cumulative\n", + " semiadditive: last\n", + "\n", + " - name: ytd_sales\n", + " expr: SUM(total_amount)\n", + " comment: YTD Sales\n", + " display_name: YTD Sales\n", + " synonyms: ['year-to-date sales']\n", + " window:\n", + " - order: sales_date\n", + " range: 
cumulative\n", + " semiadditive: last\n", + " - order: year\n", + " range: current\n", + " semiadditive: last\n", + "\n", + " - name: t7d_customers\n", + " expr: COUNT(DISTINCT customer_sk)\n", + " comment: Customers last 7 days\n", + " display_name: Customers last 7 days\n", + " synonyms: ['last 7 days customers', 'number of customers last 7 days']\n", + " window:\n", + " - order: sales_date\n", + " range: trailing 7 day\n", + " semiadditive: last\n", + "\n", + " - name: t30d_customers\n", + " expr: COUNT(DISTINCT customer_sk)\n", + " comment: Customers last 30 days\n", + " display_name: Customers last 30 days\n", + " synonyms: ['last 30 days customers', 'number of customers last 30 days']\n", + " window:\n", + " - order: sales_date\n", + " range: trailing 30 day\n", + " semiadditive: last\n", + "\n", + "materialization:\n", + " schedule: every 10 hours\n", + " mode: relaxed\n", + "\n", + " materialized_views:\n", + " - name: baseline\n", + " type: unaggregated\n", + " \n", + " - name: day_of_week_aggregation\n", + " type: aggregated\n", + " dimensions:\n", + " - dayOfWeek\n", + " measures:\n", + " - sales_sum\n", + " - sales_avg\n", + " - sales_stddev\n", + " - sales_quartile1\n", + " - sales_quartile2\n", + " - sales_quartile3\n", + " - sales_quartile4\n", + " - sales_largest\n", + " - sales_smallest\n", + "\n", + "$$\n", + "\"\"\"\n", + "\n", + "spark.sql(sql_statement)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4ae59ecf-2b57-4763-b0eb-0c13349ab7c4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "display(spark.sql(f\"DESCRIBE EXTENDED {CATALOG_NAME}.{SCHEMA_NAME}.materialized_sales_relationships\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Describe Metric View" + 
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query Non-Materialized Metric View" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a191f22e-2e5b-4d0a-9fe3-bb0186d52e58", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# QUERY METRIC VIEW DIRECTLY\n", + "df = spark.sql(f\"\"\"\n", + "SELECT\n", + " dayOfWeek,\n", + " MEASURE(`sales_sum`),\n", + " MEASURE(`sales_avg`),\n", + " MEASURE(`sales_stddev`),\n", + " MEASURE(`sales_quartile1`),\n", + " MEASURE(`sales_quartile2`),\n", + " MEASURE(`sales_quartile3`),\n", + " MEASURE(`sales_quartile4`),\n", + " MEASURE(`sales_largest`),\n", + " MEASURE(`sales_smallest`)\n", + "FROM {CATALOG_NAME}.{SCHEMA_NAME}.sales_relationships\n", + "GROUP BY ALL\n", + "ORDER BY dayOfWeek\n", + "\"\"\")\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query Materialized Metric View" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "dcac9bae-2a84-471a-b1dd-76d6fbb5db00", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# QUERY MATERIALIZED METRIC VIEW\n", + "df = spark.sql(f\"\"\"\n", + "SELECT\n", + " dayOfWeek,\n", + " MEASURE(`sales_sum`),\n", + " MEASURE(`sales_avg`),\n", + " MEASURE(`sales_stddev`),\n", + " MEASURE(`sales_quartile1`),\n", + " MEASURE(`sales_quartile2`),\n", + " MEASURE(`sales_quartile3`),\n", + " MEASURE(`sales_quartile4`),\n", + " MEASURE(`sales_largest`),\n", + " MEASURE(`sales_smallest`)\n", + "FROM 
{CATALOG_NAME}.{SCHEMA_NAME}.materialized_sales_relationships\n", + "GROUP BY ALL\n", + "ORDER BY dayOfWeek\n", + "\"\"\")\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Queries like the one below should run much faster, as the query engine will understand that it can use the materialized results!\n", + "\n", + "To check if a query is using a materialized view, run `EXPLAIN EXTENDED` on your query to see the query plan. If the query is using materialized views, the leaf node includes `__materialization_mat` and the name of the materialization from the YAML file. \n", + "\n", + "You can also check the materialized metric view in UC UI, and from there you'll be able to see the refresh status and frequency of updates.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4e43fede-1398-43b1-843f-bc33305dd6c9", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# QUERY MATERIALIZED METRIC VIEW\n", + "df = spark.sql(f\"\"\"\n", + "EXPLAIN EXTENDED\n", + "SELECT\n", + " dayOfWeek,\n", + " MEASURE(`sales_sum`),\n", + " MEASURE(`sales_avg`),\n", + " MEASURE(`sales_stddev`),\n", + " MEASURE(`sales_quartile1`),\n", + " MEASURE(`sales_quartile2`),\n", + " MEASURE(`sales_quartile3`),\n", + " MEASURE(`sales_quartile4`),\n", + " MEASURE(`sales_largest`),\n", + " MEASURE(`sales_smallest`)\n", + "FROM {CATALOG_NAME}.{SCHEMA_NAME}.materialized_sales_relationships\n", + "GROUP BY ALL\n", + "ORDER BY dayOfWeek\n", + "\"\"\")\n", + "display(df)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + 
"inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 5175179492167482, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "3_MaterializeMetricView", + "widgets": { + "CATALOG_NAME": { + "currentValue": "pedroz_catalog", + "nuid": "85e74c7a-766e-4e9e-a772-155cf1597384", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "pedroz_catalog", + "label": "Catalog Name", + "name": "CATALOG_NAME", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "pedroz_catalog", + "label": "Catalog Name", + "name": "CATALOG_NAME", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "SCHEMA_NAME": { + "currentValue": "metric_views_schema", + "nuid": "de4765bb-0385-4c48-aeaf-8b41fdb96113", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "metric_views_schema", + "label": "Schema Name", + "name": "SCHEMA_NAME", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "metric_views_schema", + "label": "Schema Name", + "name": "SCHEMA_NAME", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/data-warehousing/dbrx-metric-views/README.md b/data-warehousing/dbrx-metric-views/README.md new file mode 100644 index 0000000..f31015a --- /dev/null +++ b/data-warehousing/dbrx-metric-views/README.md @@ -0,0 +1,99 @@ +# Databricks Metric Views - Retail Store Dimensional Model + +dimensional model + +This demo shows how to use [Metric Views](https://docs.databricks.com/aws/en/metric-views/) in Databricks to 
create semantic models directly on the platform. Adding your semantic models in Databricks lets you benefit from Databricks compute for querying your data, while being fully integrated with Unity Catalog for unified governance and optimal performance. + +## Dataset + +This demo uses the [Retail Store Star Schema Dataset](https://www.kaggle.com/datasets/shrinivasv/retail-store-star-schema-dataset) from Kaggle. The dataset contains a star schema with a `fact_sales_normalized` fact table and the following dimension tables: + +- `dim_campaigns` - Marketing campaigns with budget and date ranges +- `dim_customers` - Customer demographics and segmentation +- `dim_dates` - Date dimension for time-based analysis +- `dim_products` - Product catalog with categories and brands +- `dim_salespersons` - Sales team roles and details +- `dim_stores` - Store locations and types + +## Prerequisites + +Before running the notebooks, ensure you have: + +1. **Unity Catalog-enabled workspace** with permissions to create catalogs, schemas, and volumes. +2. **Serverless compute** enabled on your workspace (materialized metric views require serverless). +3. **A SQL Warehouse** - needed for the Genie Space and Dashboard creation in notebook 2. To find your warehouse ID: + - Navigate to **SQL Warehouses** in the Databricks sidebar + - Click on your warehouse + - The warehouse ID is in the URL: `https://<workspace-url>/sql/warehouses/<warehouse-id>` +4. **Kaggle access** - the dataset is downloaded via `kagglehub` (no API key required for public datasets). 
+ +## What Gets Created + +| Resource | Name | Description | +|----------|------|-------------| +| Catalog | `<CATALOG_NAME>` | Unity Catalog catalog for all objects | +| Schema | `<CATALOG_NAME>.<SCHEMA_NAME>` | Schema containing tables and views | +| Volume | `<CATALOG_NAME>.<SCHEMA_NAME>.kaggle_files` | Volume for staging Kaggle CSV files | +| Tables | 7 Delta tables | Fact and dimension tables from the dataset | +| Metric View | `sales_relationships` | Metric view with joins, dimensions, measures, and window measures | +| Materialized Metric View | `materialized_sales_relationships` | Same metric view with materialization for improved query performance | +| Genie Space | `Sample Sales Genie Space (created via API)` | AI-powered natural language query interface | +| Dashboard | `dashboard_parameterized.lvdash.json` | AI/BI dashboard consuming the metric view | + +## Notebooks + +Run the notebooks in order: + +### [`0_IngestData.ipynb`](./0_IngestData.ipynb) +Downloads the Kaggle dataset into a UC Volume and loads each CSV file as a Delta table. Creates the catalog, schema, and volume if they don't exist. + +**Parameters:** `CATALOG_NAME`, `SCHEMA_NAME`, `VOLUME_PATH` + +### [`1_CreateMetricView.ipynb`](./1_CreateMetricView.ipynb) +Creates the `sales_relationships` metric view with a YAML definition including: +- Star and snowflake joins across all dimension tables +- Dimensions with semantic metadata (display names, synonyms, comments) +- Aggregate measures (sum, avg, stddev, percentiles, min, max, mode) +- Window measures (day-over-day growth, running totals, YTD, trailing 7d/30d customers) + +**Parameters:** `CATALOG_NAME`, `SCHEMA_NAME` + +### [`2_QueryMetricView.ipynb`](./2_QueryMetricView.ipynb) +Demonstrates querying the metric view using `MEASURE()` syntax, and optionally creates a Genie Space and a parameterized AI/BI Dashboard from serialized templates. 
+ +**Parameters:** `CATALOG_NAME`, `SCHEMA_NAME`, `warehouse_id` + +### [`3_MaterializeMetricView.ipynb`](./3_MaterializeMetricView.ipynb) +Creates a materialized version of the metric view with scheduled refreshes. Compares query performance between the non-materialized and materialized versions, and shows how to verify materialization usage via `EXPLAIN EXTENDED`. + +**Parameters:** `CATALOG_NAME`, `SCHEMA_NAME` + +## Directory Structure + +``` +dbrx-metric-views/ +├── 0_IngestData.ipynb # Data ingestion from Kaggle +├── 1_CreateMetricView.ipynb # Metric view creation +├── 2_QueryMetricView.ipynb # Querying, Genie Space, and Dashboard +├── 3_MaterializeMetricView.ipynb # Materialized metric view +├── README.md +├── dashboard_and_genie/ +│ ├── dashboard.lvdash.json # Template dashboard definition +│ ├── dashboard_parameterized.lvdash.json # Parameterized dashboard (generated) +│ └── genie_space.json # Serialized Genie Space definition +└── figures/ + ├── dimensional_model.png # Star schema ERD + ├── materialized_metric_view_ui.png # UC UI showing materialization status + ├── metric_view_in_dashboards.png # Dashboard screenshot + └── metric_view_in_genie.png # Genie Space screenshot +``` + +## Resources + +- [Docs - Unity Catalog metric views](https://docs.databricks.com/aws/en/metric-views/) +- [Docs - Metric view joins](https://docs.databricks.com/aws/en/metric-views/data-modeling/joins) +- [Docs - Semantic metadata](https://docs.databricks.com/aws/en/metric-views/data-modeling/semantic-metadata) +- [Docs - Window measures](https://docs.databricks.com/aws/en/metric-views/data-modeling/window-measures) +- [Docs - Materialization](https://docs.databricks.com/aws/en/metric-views/materialization) +- [Video - Unity Catalog Metric Views Overview](https://www.databricks.com/resources/demos/videos/unity-catalog-metric-views-overview) +- [Video - Understanding Your Business With Unity Catalog Metric 
Views](https://www.databricks.com/resources/demos/videos/understanding-your-business-with-unity-catalog-metric-view) diff --git a/data-warehousing/dbrx-metric-views/dashboard_and_genie/dashboard.lvdash.json b/data-warehousing/dbrx-metric-views/dashboard_and_genie/dashboard.lvdash.json new file mode 100644 index 0000000..0adbfb9 --- /dev/null +++ b/data-warehousing/dbrx-metric-views/dashboard_and_genie/dashboard.lvdash.json @@ -0,0 +1,989 @@ +{ + "datasets": [ + { + "name": "1b4ef37c", + "displayName": "sales_relationships", + "asset_name": "CATALOG_NAME.SCHEMA_NAME.sales_relationships" + } + ], + "pages": [ + { + "name": "5abd0258", + "displayName": "Main", + "layout": [ + { + "widget": { + "name": "sales-by-store-type", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "1b4ef37c", + "fields": [ + { + "name": "store_name", + "expression": "`store_name`" + }, + { + "name": "sum(total_amount)", + "expression": "SUM(`total_amount`)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 3, + "widgetType": "bar", + "encodings": { + "x": { + "fieldName": "store_name", + "scale": { + "type": "categorical", + "sort": { + "by": "y-reversed" + } + } + }, + "y": { + "fieldName": "sum(total_amount)", + "scale": { + "type": "quantitative" + } + } + }, + "frame": { + "title": "Sales by Store", + "showTitle": true + } + } + }, + "position": { + "x": 0, + "y": 1, + "width": 6, + "height": 6 + } + }, + { + "widget": { + "name": "product-categories-breakdown", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "1b4ef37c", + "fields": [ + { + "name": "sum(total_amount)", + "expression": "SUM(`total_amount`)" + }, + { + "name": "product_category", + "expression": "`product_category`" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 3, + "widgetType": "pie", + "encodings": { + "angle": { + "fieldName": "sum(total_amount)", + "scale": { + "type": "quantitative" + } + }, + "color": { + "fieldName": 
"product_category", + "scale": { + "type": "categorical" + } + }, + "label": { + "show": true + } + }, + "frame": { + "title": "Product Categories Breakdown", + "showTitle": true + } + } + }, + "position": { + "x": 0, + "y": 7, + "width": 3, + "height": 8 + } + }, + { + "widget": { + "name": "salesperson-performance", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "1b4ef37c", + "fields": [ + { + "name": "salesperson_name", + "expression": "`salesperson_name`" + }, + { + "name": "total_amount", + "expression": "`total_amount`" + } + ], + "disaggregated": true + } + } + ], + "spec": { + "version": 1, + "widgetType": "table", + "encodings": { + "columns": [ + { + "fieldName": "salesperson_name", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "type": "string", + "displayAs": "string", + "visible": true, + "order": 25, + "title": "salesperson_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "fieldName": "total_amount", + "numberFormat": "0.00", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "type": "float", + "displayAs": "number", + "visible": true, + "order": 27, + "title": "total_amount", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + } + ] + }, + "invisibleColumns": [ + { + "dateTimeFormat": "DD/MM/YYYY HH:mm:ss.SSS", + "booleanValues": [ 
+ "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "sales_date", + "type": "datetime", + "displayAs": "datetime", + "order": 0, + "title": "sales_date", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "first_name", + "type": "string", + "displayAs": "string", + "order": 1, + "title": "first_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "last_name", + "type": "string", + "displayAs": "string", + "order": 2, + "title": "last_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "customer_segment", + "type": "string", + "displayAs": 
"string", + "order": 3, + "title": "customer_segment", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "customer_email", + "type": "string", + "displayAs": "string", + "order": 4, + "title": "customer_email", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "customer_residential_location", + "type": "string", + "displayAs": "string", + "order": 5, + "title": "customer_residential_location", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "customer_name", + "type": "string", + "displayAs": "string", + "order": 6, + "title": "customer_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + 
"imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "store_name", + "type": "string", + "displayAs": "string", + "order": 7, + "title": "store_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "store_type", + "type": "string", + "displayAs": "string", + "order": 8, + "title": "store_type", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "store_location", + "type": "string", + "displayAs": "string", + "order": 9, + "title": "store_location", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "product_name", + "type": "string", + "displayAs": "string", + "order": 10, 
+ "title": "product_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "product_category", + "type": "string", + "displayAs": "string", + "order": 11, + "title": "product_category", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "product_brand", + "type": "string", + "displayAs": "string", + "order": 12, + "title": "product_brand", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "product_origin_location", + "type": "string", + "displayAs": "string", + "order": 13, + "title": "product_origin_location", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + 
"imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "campaign_name", + "type": "string", + "displayAs": "string", + "order": 14, + "title": "campaign_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "campaign_budget", + "type": "integer", + "displayAs": "number", + "order": 15, + "title": "campaign_budget", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "dateTimeFormat": "DD/MM/YYYY", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "full_date", + "type": "date", + "displayAs": "datetime", + "order": 16, + "title": "full_date", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "year", + "type": 
"integer", + "displayAs": "number", + "order": 17, + "title": "year", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "month", + "type": "integer", + "displayAs": "number", + "order": 18, + "title": "month", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "day", + "type": "integer", + "displayAs": "number", + "order": 19, + "title": "day", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "weekday", + "type": "integer", + "displayAs": "number", + "order": 20, + "title": "weekday", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": 
[ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "quarter", + "type": "integer", + "displayAs": "number", + "order": 21, + "title": "quarter", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "dateTimeFormat": "DD/MM/YYYY HH:mm:ss.SSS", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "week", + "type": "datetime", + "displayAs": "datetime", + "order": 22, + "title": "week", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "dayOfWeek", + "type": "integer", + "displayAs": "number", + "order": 23, + "title": "dayOfWeek", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "dateTimeFormat": "DD/MM/YYYY HH:mm:ss.SSS", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + 
"linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "YearMonth", + "type": "datetime", + "displayAs": "datetime", + "order": 24, + "title": "YearMonth", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "salesperson_role", + "type": "string", + "displayAs": "string", + "order": 26, + "title": "salesperson_role", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + } + ], + "allowHTMLByDefault": false, + "itemsPerPage": 10, + "paginationSize": "default", + "condensed": true, + "withRowNumber": false, + "frame": { + "title": "Top 10 Salespersons", + "showTitle": true + } + } + }, + "position": { + "x": 3, + "y": 7, + "width": 3, + "height": 8 + } + }, + { + "widget": { + "name": "product-brand", + "queries": [ + { + "name": "dashboards/01f0e0376f8d1dceb14d5659f8f1e298/datasets/01f0e0377c9a18b7a1625c6032cf7d3a_product_brand", + "query": { + "datasetName": "1b4ef37c", + "fields": [ + { + "name": "product_brand", + "expression": "`product_brand`" + }, + { + "name": "product_brand_associativity", + "expression": "COUNT_IF(`associative_filter_predicate_group`)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 2, + "widgetType": "filter-multi-select", + "encodings": { + "fields": [ + { + "fieldName": "product_brand", + "queryName": "dashboards/01f0e0376f8d1dceb14d5659f8f1e298/datasets/01f0e0377c9a18b7a1625c6032cf7d3a_product_brand" + } + ] + }, + "frame": { + "showTitle": true, + "title": "Product Brand" + } + } + }, + 
"position": { + "x": 0, + "y": 0, + "width": 3, + "height": 1 + } + }, + { + "widget": { + "name": "product-category", + "queries": [ + { + "name": "dashboards/01f0e0376f8d1dceb14d5659f8f1e298/datasets/01f0e0377c9a18b7a1625c6032cf7d3a_product_category", + "query": { + "datasetName": "1b4ef37c", + "fields": [ + { + "name": "product_category", + "expression": "`product_category`" + }, + { + "name": "product_category_associativity", + "expression": "COUNT_IF(`associative_filter_predicate_group`)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 2, + "widgetType": "filter-single-select", + "encodings": { + "fields": [ + { + "fieldName": "product_category", + "queryName": "dashboards/01f0e0376f8d1dceb14d5659f8f1e298/datasets/01f0e0377c9a18b7a1625c6032cf7d3a_product_category" + } + ] + }, + "frame": { + "showTitle": true, + "title": "Product Category" + } + } + }, + "position": { + "x": 3, + "y": 0, + "width": 3, + "height": 1 + } + } + ], + "pageType": "PAGE_TYPE_CANVAS" + } + ], + "uiSettings": { + "theme": { + "widgetHeaderAlignment": "ALIGNMENT_UNSPECIFIED" + }, + "applyModeEnabled": false + } +} diff --git a/data-warehousing/dbrx-metric-views/dashboard_and_genie/dashboard_parameterized.lvdash.json b/data-warehousing/dbrx-metric-views/dashboard_and_genie/dashboard_parameterized.lvdash.json new file mode 100644 index 0000000..a0befe8 --- /dev/null +++ b/data-warehousing/dbrx-metric-views/dashboard_and_genie/dashboard_parameterized.lvdash.json @@ -0,0 +1,989 @@ +{ + "datasets": [ + { + "name": "1b4ef37c", + "displayName": "sales_relationships", + "asset_name": "pedroz_catalog.metric_views_schema.sales_relationships" + } + ], + "pages": [ + { + "name": "5abd0258", + "displayName": "Main", + "layout": [ + { + "widget": { + "name": "sales-by-store-type", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "1b4ef37c", + "fields": [ + { + "name": "store_name", + "expression": "`store_name`" + }, + { + "name": "sum(total_amount)", 
+ "expression": "SUM(`total_amount`)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 3, + "widgetType": "bar", + "encodings": { + "x": { + "fieldName": "store_name", + "scale": { + "type": "categorical", + "sort": { + "by": "y-reversed" + } + } + }, + "y": { + "fieldName": "sum(total_amount)", + "scale": { + "type": "quantitative" + } + } + }, + "frame": { + "title": "Sales by Store", + "showTitle": true + } + } + }, + "position": { + "x": 0, + "y": 1, + "width": 6, + "height": 6 + } + }, + { + "widget": { + "name": "product-categories-breakdown", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "1b4ef37c", + "fields": [ + { + "name": "sum(total_amount)", + "expression": "SUM(`total_amount`)" + }, + { + "name": "product_category", + "expression": "`product_category`" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 3, + "widgetType": "pie", + "encodings": { + "angle": { + "fieldName": "sum(total_amount)", + "scale": { + "type": "quantitative" + } + }, + "color": { + "fieldName": "product_category", + "scale": { + "type": "categorical" + } + }, + "label": { + "show": true + } + }, + "frame": { + "title": "Product Categories Breakdown", + "showTitle": true + } + } + }, + "position": { + "x": 0, + "y": 7, + "width": 3, + "height": 8 + } + }, + { + "widget": { + "name": "salesperson-performance", + "queries": [ + { + "name": "main_query", + "query": { + "datasetName": "1b4ef37c", + "fields": [ + { + "name": "salesperson_name", + "expression": "`salesperson_name`" + }, + { + "name": "total_amount", + "expression": "`total_amount`" + } + ], + "disaggregated": true + } + } + ], + "spec": { + "version": 1, + "widgetType": "table", + "encodings": { + "columns": [ + { + "fieldName": "salesperson_name", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + 
"linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "type": "string", + "displayAs": "string", + "visible": true, + "order": 25, + "title": "salesperson_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "fieldName": "total_amount", + "numberFormat": "0.00", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "type": "float", + "displayAs": "number", + "visible": true, + "order": 27, + "title": "total_amount", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + } + ] + }, + "invisibleColumns": [ + { + "dateTimeFormat": "DD/MM/YYYY HH:mm:ss.SSS", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "sales_date", + "type": "datetime", + "displayAs": "datetime", + "order": 0, + "title": "sales_date", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "first_name", + "type": "string", + "displayAs": "string", + "order": 1, + "title": 
"first_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "last_name", + "type": "string", + "displayAs": "string", + "order": 2, + "title": "last_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "customer_segment", + "type": "string", + "displayAs": "string", + "order": 3, + "title": "customer_segment", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "customer_email", + "type": "string", + "displayAs": "string", + "order": 4, + "title": "customer_email", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": 
"", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "customer_residential_location", + "type": "string", + "displayAs": "string", + "order": 5, + "title": "customer_residential_location", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "customer_name", + "type": "string", + "displayAs": "string", + "order": 6, + "title": "customer_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "store_name", + "type": "string", + "displayAs": "string", + "order": 7, + "title": "store_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "store_type", + "type": "string", + "displayAs": "string", + "order": 8, + "title": "store_type", + "allowSearch": false, 
+ "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "store_location", + "type": "string", + "displayAs": "string", + "order": 9, + "title": "store_location", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "product_name", + "type": "string", + "displayAs": "string", + "order": 10, + "title": "product_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "product_category", + "type": "string", + "displayAs": "string", + "order": 11, + "title": "product_category", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + 
"linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "product_brand", + "type": "string", + "displayAs": "string", + "order": 12, + "title": "product_brand", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "product_origin_location", + "type": "string", + "displayAs": "string", + "order": 13, + "title": "product_origin_location", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "campaign_name", + "type": "string", + "displayAs": "string", + "order": 14, + "title": "campaign_name", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "campaign_budget", + "type": "integer", + "displayAs": "number", + "order": 15, + "title": "campaign_budget", + "allowSearch": 
false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "dateTimeFormat": "DD/MM/YYYY", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "full_date", + "type": "date", + "displayAs": "datetime", + "order": 16, + "title": "full_date", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "year", + "type": "integer", + "displayAs": "number", + "order": 17, + "title": "year", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "month", + "type": "integer", + "displayAs": "number", + "order": 18, + "title": "month", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + 
"imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "day", + "type": "integer", + "displayAs": "number", + "order": 19, + "title": "day", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "weekday", + "type": "integer", + "displayAs": "number", + "order": 20, + "title": "weekday", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "quarter", + "type": "integer", + "displayAs": "number", + "order": 21, + "title": "quarter", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "dateTimeFormat": "DD/MM/YYYY HH:mm:ss.SSS", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "week", + "type": "datetime", + 
"displayAs": "datetime", + "order": 22, + "title": "week", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "numberFormat": "0", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "dayOfWeek", + "type": "integer", + "displayAs": "number", + "order": 23, + "title": "dayOfWeek", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "dateTimeFormat": "DD/MM/YYYY HH:mm:ss.SSS", + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "YearMonth", + "type": "datetime", + "displayAs": "datetime", + "order": 24, + "title": "YearMonth", + "allowSearch": false, + "alignContent": "right", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + }, + { + "booleanValues": [ + "false", + "true" + ], + "imageUrlTemplate": "{{ @ }}", + "imageTitleTemplate": "{{ @ }}", + "imageWidth": "", + "imageHeight": "", + "linkUrlTemplate": "{{ @ }}", + "linkTextTemplate": "{{ @ }}", + "linkTitleTemplate": "{{ @ }}", + "linkOpenInNewTab": true, + "name": "salesperson_role", + "type": "string", + "displayAs": "string", + "order": 26, + "title": "salesperson_role", + "allowSearch": false, + "alignContent": "left", + "allowHTML": false, + "highlightLinks": false, + "useMonospaceFont": false, + "preserveWhitespace": false + } + ], + 
"allowHTMLByDefault": false, + "itemsPerPage": 10, + "paginationSize": "default", + "condensed": true, + "withRowNumber": false, + "frame": { + "title": "Top 10 Salespersons", + "showTitle": true + } + } + }, + "position": { + "x": 3, + "y": 7, + "width": 3, + "height": 8 + } + }, + { + "widget": { + "name": "product-brand", + "queries": [ + { + "name": "dashboards/01f0e0376f8d1dceb14d5659f8f1e298/datasets/01f0e0377c9a18b7a1625c6032cf7d3a_product_brand", + "query": { + "datasetName": "1b4ef37c", + "fields": [ + { + "name": "product_brand", + "expression": "`product_brand`" + }, + { + "name": "product_brand_associativity", + "expression": "COUNT_IF(`associative_filter_predicate_group`)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 2, + "widgetType": "filter-multi-select", + "encodings": { + "fields": [ + { + "fieldName": "product_brand", + "queryName": "dashboards/01f0e0376f8d1dceb14d5659f8f1e298/datasets/01f0e0377c9a18b7a1625c6032cf7d3a_product_brand" + } + ] + }, + "frame": { + "showTitle": true, + "title": "Product Brand" + } + } + }, + "position": { + "x": 0, + "y": 0, + "width": 3, + "height": 1 + } + }, + { + "widget": { + "name": "product-category", + "queries": [ + { + "name": "dashboards/01f0e0376f8d1dceb14d5659f8f1e298/datasets/01f0e0377c9a18b7a1625c6032cf7d3a_product_category", + "query": { + "datasetName": "1b4ef37c", + "fields": [ + { + "name": "product_category", + "expression": "`product_category`" + }, + { + "name": "product_category_associativity", + "expression": "COUNT_IF(`associative_filter_predicate_group`)" + } + ], + "disaggregated": false + } + } + ], + "spec": { + "version": 2, + "widgetType": "filter-single-select", + "encodings": { + "fields": [ + { + "fieldName": "product_category", + "queryName": "dashboards/01f0e0376f8d1dceb14d5659f8f1e298/datasets/01f0e0377c9a18b7a1625c6032cf7d3a_product_category" + } + ] + }, + "frame": { + "showTitle": true, + "title": "Product Category" + } + } + }, + "position": { + "x": 
3, + "y": 0, + "width": 3, + "height": 1 + } + } + ], + "pageType": "PAGE_TYPE_CANVAS" + } + ], + "uiSettings": { + "theme": { + "widgetHeaderAlignment": "ALIGNMENT_UNSPECIFIED" + }, + "applyModeEnabled": false + } +} diff --git a/data-warehousing/dbrx-metric-views/dashboard_and_genie/genie_space.json b/data-warehousing/dbrx-metric-views/dashboard_and_genie/genie_space.json new file mode 100644 index 0000000..7145c20 --- /dev/null +++ b/data-warehousing/dbrx-metric-views/dashboard_and_genie/genie_space.json @@ -0,0 +1,379 @@ +{ + "version": 1, + "data_sources": { + "metric_views": [ + { + "identifier": "CATALOG_NAME.SCHEMA_NAME.sales_relationships", + "column_configs": [ + { + "column_name": "YearMonth", + "get_example_values": true + }, + { + "column_name": "campaign_budget", + "get_example_values": true + }, + { + "column_name": "campaign_name", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "current_day_sales" + }, + { + "column_name": "customer_email", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "customer_name", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "customer_residential_location", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "customer_segment", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "day", + "get_example_values": true + }, + { + "column_name": "dayOfWeek", + "get_example_values": true + }, + { + "column_name": "day_over_day_growth" + }, + { + "column_name": "first_name", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "full_date", + "get_example_values": true + }, + { + "column_name": "last_name", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "month", + "get_example_values": true + }, + { + "column_name": "previous_day_sales" + }, + { + 
"column_name": "product_brand", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "product_category", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "product_name", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "product_origin_location", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "quarter", + "get_example_values": true + }, + { + "column_name": "running_total_sales" + }, + { + "column_name": "sales_avg" + }, + { + "column_name": "sales_date", + "get_example_values": true + }, + { + "column_name": "sales_largest" + }, + { + "column_name": "sales_median" + }, + { + "column_name": "sales_mode" + }, + { + "column_name": "sales_quartile1" + }, + { + "column_name": "sales_quartile2" + }, + { + "column_name": "sales_quartile3" + }, + { + "column_name": "sales_quartile4" + }, + { + "column_name": "sales_smallest" + }, + { + "column_name": "sales_stddev" + }, + { + "column_name": "sales_sum" + }, + { + "column_name": "salesperson_name", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "salesperson_role", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "store_location", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "store_name", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "store_type", + "get_example_values": true, + "build_value_dictionary": true + }, + { + "column_name": "t30d_customers" + }, + { + "column_name": "t7d_customers" + }, + { + "column_name": "total_amount", + "get_example_values": true + }, + { + "column_name": "week", + "get_example_values": true + }, + { + "column_name": "weekday", + "get_example_values": true + }, + { + "column_name": "year", + "get_example_values": true + }, + { + "column_name": "ytd_sales" + } + ] + } + ] 
+ }, + "instructions": { + "text_instructions": [ + { + "id": "01f0e27a754a1a85aa25fdb8f17c51a6", + "content": [ + "* Only display currencies in US Dollars." + ] + } + ], + "example_question_sqls": [ + { + "id": "01f0e034f31e1e5f95d6589791837873", + "question": [ + "What are the sales statistics by day of the week?" + ], + "sql": [ + "-- This is just a simple query example, demonstrating that you can query Metric Views just like any other view!\n", + "SELECT\n", + " dayOfWeek,\n", + " MEASURE(`sales_sum`),\n", + " MEASURE(`sales_avg`),\n", + " MEASURE(`sales_stddev`),\n", + " MEASURE(`sales_quartile1`),\n", + " MEASURE(`sales_quartile2`),\n", + " MEASURE(`sales_quartile3`),\n", + " MEASURE(`sales_quartile4`),\n", + " MEASURE(`sales_largest`),\n", + " MEASURE(`sales_smallest`)\n", + "FROM CATALOG_NAME.SCHEMA_NAME.sales_relationships\n", + "GROUP BY ALL\n", + "ORDER BY dayOfWeek" + ] + }, + { + "id": "01f0e034f32015eda08110633923590b", + "question": [ + "What are the sales statistics by year and month?" + ], + "sql": [ + "SELECT\n", + " YearMonth,\n", + " MEASURE(`sales_sum`),\n", + " MEASURE(`sales_avg`) AS `sales_avg`,\n", + " MEASURE(`sales_stddev`) AS `sales_stddev`,\n", + " MEASURE(`sales_median`) AS `sales_median`,\n", + " MEASURE(`sales_quartile1`) AS `sales_quartile1`,\n", + " MEASURE(`sales_quartile2`) AS `sales_quartile2`,\n", + " MEASURE(`sales_quartile3`) AS `sales_quartile3`,\n", + " MEASURE(`sales_quartile4`) AS `sales_quartile4`,\n", + " MEASURE(`sales_largest`) AS `sales_largest`,\n", + " MEASURE(`sales_smallest`) AS `sales_smallest`\n", + "FROM CATALOG_NAME.SCHEMA_NAME.sales_relationships\n", + "GROUP BY ALL" + ] + }, + { + "id": "01f0e034f3211da6b78032d63bbb8411", + "question": [ + "What are the total sales and percentage of relevance of sales by year and month?" 
+ ], + "sql": [ + "SELECT\n", + " YearMonth,\n", + " MEASURE(`total_sales`) AS `total_sales`, \n", + " MEASURE(`pct_of_relevance_of_sales`) AS `pct_of_relevance_of_sales`\n", + "FROM CATALOG_NAME.SCHEMA_NAME.sales_relationships GROUP BY ALL" + ] + }, + { + "id": "01f0e034f32313bc89ccc7adcb6b3c66", + "question": [ + "What are the sales statistics by day of the week?" + ], + "sql": [ + "SELECT\n", + " dayOfWeek,\n", + " MEASURE(`sales_sum`),\n", + " MEASURE(`sales_avg`) AS `sales_avg`,\n", + " MEASURE(`sales_stddev`) AS `sales_stddev`,\n", + " MEASURE(`sales_median`) AS `sales_median`,\n", + " MEASURE(`sales_quartile1`) AS `sales_quartile1`,\n", + " MEASURE(`sales_quartile2`) AS `sales_quartile2`,\n", + " MEASURE(`sales_quartile3`) AS `sales_quartile3`,\n", + " MEASURE(`sales_quartile4`) AS `sales_quartile4`,\n", + " MEASURE(`sales_largest`) AS `sales_largest`,\n", + " MEASURE(`sales_smallest`) AS `sales_smallest`\n", + "FROM CATALOG_NAME.SCHEMA_NAME.sales_relationships\n", + "GROUP BY ALL" + ] + }, + { + "id": "01f0e034f3241beea1b46f27a8cb57da", + "question": [ + "What are the total sales and percentage of relevance of sales overall?" + ], + "sql": [ + "SELECT MEASURE(`total_sales`) AS `total_sales`, MEASURE(`pct_of_relevance_of_sales`) AS `pct_of_relevance_of_sales`\n", + "FROM CATALOG_NAME.SCHEMA_NAME.sales_relationships" + ] + }, + { + "id": "01f0e034f3261044846c08976a7ce277", + "question": [ + "What are the sales sum, average, stddev, and median by year and month?" + ], + "sql": [ + "SELECT\n", + " YearMonth,\n", + " MEASURE(`sales_sum`) AS `total_sales`, \n", + " MEASURE(`sales_avg`) AS `avg_sales`, \n", + " MEASURE(`sales_stddev`) AS `stddev_sales`, \n", + " MEASURE(`sales_median`) AS `median_sales`\n", + "FROM CATALOG_NAME.SCHEMA_NAME.sales_relationships GROUP BY ALL" + ] + }, + { + "id": "01f0e034f327148d9cedb3741dc1a8bb", + "question": [ + "What are the sales statistics by year and month?" 
+ ], + "sql": [ + "SELECT\n", + " YearMonth,\n", + " MEASURE(`sales_sum`) AS `sales_sum`,\n", + " MEASURE(`sales_avg`) AS `sales_avg`,\n", + " MEASURE(`sales_stddev`) AS `sales_stddev`,\n", + " MEASURE(`sales_median`) AS `sales_median`,\n", + " MEASURE(`sales_quartile1`) AS `sales_quartile1`,\n", + " MEASURE(`sales_quartile2`) AS `sales_quartile2`,\n", + " MEASURE(`sales_quartile3`) AS `sales_quartile3`,\n", + " MEASURE(`sales_quartile4`) AS `sales_quartile4`,\n", + " MEASURE(`sales_largest`) AS `sales_largest`,\n", + " MEASURE(`sales_smallest`) AS `sales_smallest`\n", + "FROM CATALOG_NAME.SCHEMA_NAME.sales_relationships\n", + "GROUP BY ALL" + ] + }, + { + "id": "01f0e034f32819f0996e63738565d3cb", + "question": [ + "What are the sales statistics by day of the week?" + ], + "sql": [ + "SELECT\n", + " dayOfWeek,\n", + " MEASURE(`sales_sum`),\n", + " MEASURE(`sales_avg`) AS `sales_avg`,\n", + " MEASURE(`sales_stddev`) AS `sales_stddev`,\n", + " MEASURE(`sales_median`) AS `sales_median`,\n", + " MEASURE(`sales_quartile1`) AS `sales_quartile1`,\n", + " MEASURE(`sales_quartile2`) AS `sales_quartile2`,\n", + " MEASURE(`sales_quartile3`) AS `sales_quartile3`,\n", + " MEASURE(`sales_quartile4`) AS `sales_quartile4`,\n", + " MEASURE(`sales_largest`) AS `sales_largest`,\n", + " MEASURE(`sales_smallest`) AS `sales_smallest`\n", + "FROM CATALOG_NAME.SCHEMA_NAME.sales_relationships\n", + "GROUP BY ALL\n", + "ORDER BY dayOfWeek" + ] + }, + { + "id": "01f0e034f3291e7d975ad11ee97a866b", + "question": [ + "What are the sales statistics by day of the week?" 
+ ], + "sql": [ + "SELECT\n", + " dayOfWeek,\n", + " MEASURE(`sales_sum`),\n", + " MEASURE(`sales_avg`),\n", + " MEASURE(`sales_stddev`),\n", + " MEASURE(`sales_median`),\n", + " MEASURE(`sales_quartile1`),\n", + " MEASURE(`sales_quartile2`),\n", + " MEASURE(`sales_quartile3`),\n", + " MEASURE(`sales_quartile4`),\n", + " MEASURE(`sales_largest`),\n", + " MEASURE(`sales_smallest`)\n", + "FROM CATALOG_NAME.SCHEMA_NAME.sales_relationships\n", + "GROUP BY ALL\n", + "ORDER BY dayOfWeek" + ] + } + ] + } +} diff --git a/data-warehousing/dbrx-metric-views/figures/dimensional_model.png b/data-warehousing/dbrx-metric-views/figures/dimensional_model.png new file mode 100644 index 0000000..606b696 Binary files /dev/null and b/data-warehousing/dbrx-metric-views/figures/dimensional_model.png differ diff --git a/data-warehousing/dbrx-metric-views/figures/materialized_metric_view_ui.png b/data-warehousing/dbrx-metric-views/figures/materialized_metric_view_ui.png new file mode 100644 index 0000000..c9f85fe Binary files /dev/null and b/data-warehousing/dbrx-metric-views/figures/materialized_metric_view_ui.png differ diff --git a/data-warehousing/dbrx-metric-views/figures/metric_view_in_dashboards.png b/data-warehousing/dbrx-metric-views/figures/metric_view_in_dashboards.png new file mode 100644 index 0000000..e10a7e1 Binary files /dev/null and b/data-warehousing/dbrx-metric-views/figures/metric_view_in_dashboards.png differ diff --git a/data-warehousing/dbrx-metric-views/figures/metric_view_in_genie.png b/data-warehousing/dbrx-metric-views/figures/metric_view_in_genie.png new file mode 100644 index 0000000..27f1edc Binary files /dev/null and b/data-warehousing/dbrx-metric-views/figures/metric_view_in_genie.png differ diff --git a/data-warehousing/genie-cicd.zip b/data-warehousing/genie-cicd.zip new file mode 100644 index 0000000..b76d484 Binary files /dev/null and b/data-warehousing/genie-cicd.zip differ diff --git a/data-warehousing/genie-cicd/CONTRIBUTING.md 
b/data-warehousing/genie-cicd/CONTRIBUTING.md new file mode 100644 index 0000000..610e38f --- /dev/null +++ b/data-warehousing/genie-cicd/CONTRIBUTING.md @@ -0,0 +1,319 @@ +# Contributing Guide + +Thank you for your interest in contributing to the Genie Space CI/CD project! This document provides guidelines and best practices for contributing. + +## Table of Contents + +- [Getting Started](#getting-started) +- [Development Setup](#development-setup) +- [Project Structure](#project-structure) +- [Making Changes](#making-changes) +- [Testing](#testing) +- [Code Style](#code-style) +- [Pull Request Process](#pull-request-process) + +--- + +## Getting Started + +### Prerequisites + +Before contributing, ensure you have: + +1. **Databricks CLI** installed and configured +2. **Python 3.8+** installed locally +3. Access to a Databricks workspace for testing +4. Basic understanding of: + - Databricks Asset Bundles (DABs) + - Unity Catalog + - AI/BI Genie spaces + +### Forking the Repository + +1. Fork this repository to your GitHub account +2. Clone your fork locally: + ```bash + git clone https://github.com/YOUR_USERNAME/genie-cicd.git + cd genie-cicd + ``` +3. 
Add the upstream remote: + ```bash + git remote add upstream https://github.com/ORIGINAL_OWNER/genie-cicd.git + ``` + +--- + +## Development Setup + +### Local Development Environment + +```bash +# Create a virtual environment +python -m venv .venv +source .venv/bin/activate # On Windows: .venv\Scripts\activate + +# Install development dependencies +pip install databricks-sdk requests + +# Configure Databricks CLI +databricks configure --token +``` + +### Testing Configuration + +Create a test configuration by copying and modifying `databricks.yml`: + +```bash +# Validate your configuration +databricks bundle validate --target dev +``` + +--- + +## Project Structure + +``` +genie-cicd/ +├── databricks.yml # Main bundle configuration +├── README.md # Project overview +├── SETUP.md # Setup instructions +├── CONTRIBUTING.md # This file +├── .gitignore # Git ignore patterns +├── src/ +│ ├── export_genie_definition.py # Export notebook +│ ├── deploy_genie_space.py # Deploy notebook +│ └── DOCUMENTATION.md # Source code docs +└── genie_definition/ + ├── genie_space.json # Dev export (version controlled) + └── genie_space_prod.json # Prod version (auto-generated) +``` + +### Key Files + +| File | Description | When to Modify | +|------|-------------|----------------| +| `databricks.yml` | Bundle configuration | Adding jobs, variables, or targets | +| `src/*.py` | Databricks notebooks | Changing export/deploy logic | +| `src/DOCUMENTATION.md` | Code documentation | After modifying source files | +| `README.md` | Project overview | Adding features or changing usage | +| `SETUP.md` | Setup instructions | Changing setup process | + +--- + +## Making Changes + +### Branch Naming Convention + +Use descriptive branch names: + +``` +feature/add-multi-space-support +bugfix/fix-schema-replacement +docs/update-setup-guide +refactor/improve-error-handling +``` + +### Commit Messages + +Follow conventional commit format: + +``` +type(scope): short description + +Longer description if 
needed. +``` + +**Types:** +- `feat`: New feature +- `fix`: Bug fix +- `docs`: Documentation changes +- `refactor`: Code refactoring +- `test`: Adding tests +- `chore`: Maintenance tasks + +**Examples:** +``` +feat(deploy): add support for metric views replacement +fix(export): handle spaces with special characters in title +docs(readme): add troubleshooting section +``` + +--- + +## Testing + +### Local Testing Checklist + +Before submitting changes, test the following: + +#### 1. Configuration Validation + +```bash +databricks bundle validate --target dev +databricks bundle validate --target prod +``` + +#### 2. Export Functionality + +Test the export notebook: +- [ ] Exports successfully with valid space_id +- [ ] Returns proper error for invalid space_id +- [ ] Creates valid JSON output + +#### 3. Deploy Functionality + +Test the deploy notebook: +- [ ] Creates new space when space_id is empty +- [ ] Updates existing space when space_id is provided +- [ ] Catalog/schema replacement works correctly +- [ ] Handles backtick-quoted identifiers +- [ ] Handles plain identifiers + +#### 4. End-to-End Test + +```bash +# Deploy the bundle +databricks bundle deploy --target dev + +# Run the full pipeline +databricks bundle run promote_genie_to_prod --target dev +``` + +### Test Cases for Catalog Replacement + +Verify these replacement scenarios work: + +| Input | Expected Output | +|-------|-----------------| +| `main_th.schema_dev.table` | `target_cat.target_schema.table` | +| `` `main_th`.`schema_dev`.`table` `` | `` `target_cat`.`target_schema`.`table` `` | +| `SELECT * FROM main_th.schema_dev.t` | `SELECT * FROM target_cat.target_schema.t` | + +--- + +## Code Style + +### Python Guidelines + +1. **Imports**: Group imports in order: standard library, third-party, local +2. **Docstrings**: Use docstrings for all functions +3. **Type hints**: Include type hints where practical +4. 
**Comments**: Use clear section headers with `# ========` + +### Example Function Style + +```python +def replace_catalog_schema( + text: str, + source_catalog: str, + target_catalog: str, + source_schema: Optional[str] = None, + target_schema: Optional[str] = None +) -> str: + """ + Replace catalog and schema names in a text string. + + Handles both formats: + - Without backticks: catalog.schema.table + - With backticks: `catalog`.`schema`.`table` + + Args: + text: The text to search and replace in + source_catalog: The catalog name to find + target_catalog: The catalog name to replace with + source_schema: Optional schema name to find + target_schema: Optional schema name to replace with + + Returns: + The text with replacements applied + """ + # Implementation... +``` + +### YAML Guidelines (databricks.yml) + +1. Use 2-space indentation +2. Include comments for all TODO items +3. Group related variables together +4. Document all parameters + +--- + +## Pull Request Process + +### Before Submitting + +1. **Update documentation** if you changed functionality +2. **Test your changes** using the checklist above +3. **Update DOCUMENTATION.md** if you modified source code +4. **Rebase on main** to ensure clean history: + ```bash + git fetch upstream + git rebase upstream/main + ``` + +### PR Template + +When creating a PR, include: + +```markdown +## Description +Brief description of changes + +## Type of Change +- [ ] Bug fix +- [ ] New feature +- [ ] Documentation update +- [ ] Refactoring + +## Testing Done +- [ ] Validated bundle configuration +- [ ] Tested export functionality +- [ ] Tested deploy functionality +- [ ] Tested catalog/schema replacement + +## Documentation +- [ ] Updated README.md (if needed) +- [ ] Updated SETUP.md (if needed) +- [ ] Updated src/DOCUMENTATION.md (if needed) +``` + +### Review Process + +1. Submit PR to `main` branch +2. Ensure all checks pass +3. Request review from maintainers +4. Address any feedback +5. 
Once approved, maintainer will merge + +--- + +## Areas for Contribution + +### Good First Issues + +- Improve error messages +- Add more examples to documentation +- Add support for additional JSON paths in catalog replacement +- Create unit tests for replacement functions + +### Feature Ideas + +- Support for multiple catalog/schema mappings +- Dry-run mode for deployment +- Diff preview before deployment +- Support for Genie space permissions management +- GitHub Actions workflow template +- Azure DevOps pipeline template + +--- + +## Questions? + +If you have questions about contributing: + +1. Check existing documentation +2. Review closed issues and PRs +3. Open a new issue with the `question` label + +Thank you for contributing! diff --git a/data-warehousing/genie-cicd/README.md b/data-warehousing/genie-cicd/README.md new file mode 100644 index 0000000..c4a0905 --- /dev/null +++ b/data-warehousing/genie-cicd/README.md @@ -0,0 +1,313 @@ +# Genie Space CI/CD + +Automated CI/CD pipeline for Databricks AI/BI Genie spaces. Export from Dev, version control in Git, and deploy to Prod with automatic catalog/schema replacement. + +## Table of Contents + +- [Overview](#overview) +- [Architecture](#architecture) +- [File Structure](#file-structure) +- [Quick Start](#quick-start) +- [Jobs Available](#jobs-available) +- [Catalog/Schema Replacement](#catalogschema-replacement) +- [Parameters Reference](#parameters-reference) +- [Prerequisites](#prerequisites) +- [Troubleshooting](#troubleshooting) +- [API Reference](#api-reference) +- [Additional Documentation](#additional-documentation) + +--- + +## Overview + +This project provides a complete CI/CD solution for managing Databricks AI/BI Genie spaces across environments. 
It enables teams to: + +- **Export** Genie space configurations from Development workspaces +- **Version control** configurations in Git for audit trails and collaboration +- **Deploy** to Production with automatic Unity Catalog reference replacement +- **Maintain** consistency between Dev and Prod environments + +### Key Features + +- **Serverless compute by default** - Jobs run on serverless for faster startup and no cluster management +- Automatic catalog/schema replacement during deployment +- Support for both backtick-quoted and plain Unity Catalog identifiers +- Create new or update existing Genie spaces +- Configurable via Databricks Asset Bundles (DABs) +- CI/CD ready for integration with GitHub Actions, Azure DevOps, etc. + +--- + +## Architecture + +``` +┌────────────────────────┐ ┌────────────────────────┐ ┌────────────────────────┐ +│ Dev Workspace │ │ Git / CI/CD │ │ Prod Workspace │ +│ │ │ │ │ │ +│ Genie Space ──Task1 ┼──►───│ genie_space.json │──►───┼ Task2─► Genie Space │ +│ (source catalog) │ │ │ │ (target catalog) │ +└────────────────────────┘ └────────────────────────┘ └────────────────────────┘ +``` + +### Pipeline Flow + +**Job: `promote_genie_to_prod`** + +| Task | Description | Input | Output | +|------|-------------|-------|--------| +| **Task 1** (Export) | Export Genie space from Dev | Dev Space ID | `genie_definition/genie_space.json` | +| **Task 2** (Deploy) | Deploy to Prod with replacements | JSON file | New/Updated Prod Genie Space | + +--- + +## File Structure + +``` +genie-cicd/ +├── databricks.yml # DAB configuration (customize this!) 
+├── SETUP.md # Step-by-step setup guide +├── README.md # This file +├── .gitignore # Git ignore patterns +├── src/ +│ ├── export_genie_definition.py # Task 1: Export from Dev +│ ├── deploy_genie_space.py # Task 2: Deploy to Prod +│ └── DOCUMENTATION.md # Detailed source code documentation +└── genie_definition/ + ├── genie_space.json # Exported from Dev (version controlled) + └── genie_space_prod.json # Generated for Prod (auto-created) +``` + +### Key Files + +| File | Purpose | +|------|---------| +| `databricks.yml` | Main configuration file - define variables, jobs, and targets | +| `src/export_genie_definition.py` | Databricks notebook to export Genie space definitions | +| `src/deploy_genie_space.py` | Databricks notebook to deploy with catalog/schema replacement | +| `genie_definition/*.json` | Exported space definitions (keep in version control) | + +--- + +## Quick Start + +> **Detailed instructions**: See [SETUP.md](./SETUP.md) for a comprehensive step-by-step guide. + +### 1. Install Prerequisites + +```bash +# Install the Databricks CLI (v0.218+ is required for `databricks bundle`; +# the legacy `pip install databricks-cli` package does not include bundle commands) +curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh + +# Configure authentication +databricks configure --token +``` + +### 2. Configure `databricks.yml` + +Find and update all `# <-- TODO` comments: + +```bash +grep -n "TODO" databricks.yml +``` + +**Required variables:** + +| Variable | Description | Example | +|----------|-------------|---------| +| `dev_space_id` | Your Dev Genie Space ID (source to export) | `01f0fd2cfa1c16c185ec2ee3b4ea29d7` | +| `prod_space_id` | Your Prod Space ID (empty for first run) | `""` or Space ID | +| `prod_warehouse_id` | Prod SQL Warehouse ID | `81b975e2ee32b916` | +| `source_catalog` | Dev catalog name | `main_th` | +| `source_schema` | Dev schema name | `schema_dev` | +| `target_catalog` | Prod catalog name | `main_prod` | +| `target_schema` | Prod schema name | `schema_prod` | +| Workspace URLs | Dev and Prod workspace URLs | `https://xxx.cloud.databricks.com` | + +### 3. 
Deploy + +```bash +# Validate configuration +databricks bundle validate --target prod + +# Deploy bundle (first time) +databricks bundle deploy --target prod + +# Run the pipeline +databricks bundle run promote_genie_to_prod --target prod + +# ⚠️ IMPORTANT: Save the prod space_id from output, add to databricks.yml + +# Subsequent runs (updates existing Prod space) +databricks bundle run promote_genie_to_prod --target prod +``` + +--- + +## Jobs Available + +| Job | Description | Use Case | +|-----|-------------|----------| +| `promote_genie_to_prod` | Full pipeline: Export from Dev → Deploy to Prod | Regular deployments | +| `deploy_genie_only` | Deploy only: Uses existing `genie_space.json` (skip export) | Quick deploys without re-export | + +### Run Commands + +```bash +# Full pipeline (export + deploy) +databricks bundle run promote_genie_to_prod --target prod + +# Deploy only (if you already have the JSON); this job is available in the databricks.yml file +databricks bundle run deploy_genie_only --target prod +``` + +--- + +## Catalog/Schema Replacement + +The pipeline automatically replaces Unity Catalog references when deploying to Prod, ensuring your Dev configurations work seamlessly in Production. + +### How It Works + +The deployment script scans the Genie space JSON and replaces all occurrences of your source catalog/schema with the target values.
+ +### Supported Formats + +| Format | Example | Replaced With | +|--------|---------|---------------| +| Plain | `main_th.schema_dev.table_example` | `main_prod.schema_prod.table_example` | +| Backtick-quoted | `` `main_th`.`schema_dev`.`table_example` `` | `` `main_prod`.`schema_prod`.`table_example` `` | + +### What Gets Replaced + +The following JSON paths are scanned and updated: + +| JSON Path | Description | +|-----------|-------------| +| `data_sources.tables[].identifier` | Table references | +| `data_sources.metric_views[].identifier` | Metric view references | +| `instructions.example_question_sqls[].sql[]` | Example SQL queries | +| `benchmarks.questions[].answer[].content[]` | Benchmark answer SQL | + +--- + +## Parameters Reference + +### Export Task (`export_genie_definition.py`) + +| Parameter | Required | Default | Description | +|-----------|----------|---------|-------------| +| `space_id` | **Yes** | - | Dev Genie Space ID to export | +| `output_file` | Yes | `../genie_definition/genie_space.json` | Path to save the exported JSON | + +### Deploy Task (`deploy_genie_space.py`) + +| Parameter | Required | Default | Description | +|-----------|----------|---------|-------------| +| `space_id` | No | `""` (empty) | Prod Space ID. 
Empty = create new, filled = update existing | +| `input_file` | **Yes** | `./genie_definition/genie_space.json` | Path to the Dev JSON file | +| `output_file` | No | Auto-generated `_prod` suffix | Path for Prod JSON backup | +| `warehouse_id` | **Yes*** | - | SQL Warehouse ID (*required for create) | +| `title` | **Yes*** | - | Space title (*required for create) | +| `source_catalog` | No | - | Dev catalog to replace (enables replacement) | +| `target_catalog` | No | - | Prod catalog | +| `source_schema` | No | - | Dev schema to replace | +| `target_schema` | No | - | Prod schema | + +--- + +## Compute Options + +By default, jobs run on **serverless compute** which provides: +- Faster startup times (no cluster provisioning) +- No cluster management overhead +- Cost-effective for short-running tasks +- Automatic scaling + +### Alternative Compute Options + +If serverless is not available or you need specific configurations, you can use: + +| Option | Use Case | Configuration | +|--------|----------|---------------| +| **Serverless** (default) | Recommended for most use cases | No cluster config needed | +| **Existing Cluster** | Reuse an all-purpose cluster | `existing_cluster_id: ""` | +| **New Job Cluster** | Dedicated compute per job | `new_cluster: { ... }` | + +To switch from serverless, uncomment the appropriate section in `databricks.yml`. 
+ +--- + +## Prerequisites + +### Software Requirements + +- **Databricks CLI** v0.200+ installed and configured +- **Python 3.8+** (for local development/testing) + +### Permissions + +| Operation | Required Permission | Scope | +|-----------|---------------------|-------| +| Export | `CAN EDIT` | Dev Genie space | +| Create Space | `CAN MANAGE` | Prod workspace | +| Update Space | `CAN EDIT` | Prod Genie space | + +### Infrastructure + +- **Serverless compute** enabled in the workspace (for default configuration) +- **SQL Warehouse** running in Prod workspace +- **Unity Catalog** configured in both workspaces +- **Network connectivity** between workspaces (if different) + +--- + +## Troubleshooting + +| Error | Cause | Solution | +|-------|-------|----------| +| `space_id parameter is required` | Export task missing space ID | Set `dev_space_id` in `databricks.yml` | +| `warehouse_id is required` | Creating new space without warehouse | Set `prod_warehouse_id` in `databricks.yml` | +| `Permission denied` | Insufficient access | Check permissions (see Prerequisites) | +| `Space not found` | Invalid space ID | Verify space ID exists and is accessible | + +### Debugging Commands + +```bash +# Validate configuration +databricks bundle validate --target prod + +# Check job status +databricks bundle run promote_genie_to_prod --target prod +# Watch the job in Databricks UI for detailed logs + +# List available jobs +databricks bundle summary --target prod +``` + +--- + +## API Reference + +This project uses the Databricks Genie Space REST APIs: + +| Operation | API | Documentation | +|-----------|-----|---------------| +| Export | GET `/api/2.0/genie/spaces/{space_id}` | [Get Space API](https://docs.databricks.com/api/workspace/genie/getspace) | +| Create | POST `/api/2.0/genie/spaces` | [Create Space API](https://docs.databricks.com/api/workspace/genie/createspace) | +| Update | PATCH `/api/2.0/genie/spaces/{space_id}` | [Update Space 
API](https://docs.databricks.com/api/workspace/genie/updatespace) | + +--- + +## Additional Documentation + +| Document | Description | +|----------|-------------| +| [SETUP.md](./SETUP.md) | Detailed step-by-step setup guide | +| [src/DOCUMENTATION.md](./src/DOCUMENTATION.md) | Source code documentation and API details | +| [CONTRIBUTING.md](./CONTRIBUTING.md) | Guidelines for contributing to this project | + +--- + +## License + +This project is provided as-is for educational and internal use purposes. diff --git a/data-warehousing/genie-cicd/SETUP.md b/data-warehousing/genie-cicd/SETUP.md new file mode 100644 index 0000000..31ef02b --- /dev/null +++ b/data-warehousing/genie-cicd/SETUP.md @@ -0,0 +1,491 @@ +# Setup Guide + +Complete step-by-step instructions to configure and run your Genie Space CI/CD pipeline. + +## Table of Contents + +- [Prerequisites](#prerequisites) +- [Step 1: Configure databricks.yml](#step-1-configure-databricksyml) +- [Step 2: First Deployment](#step-2-first-deployment) +- [Step 3: Subsequent Deployments](#step-3-subsequent-deployments) +- [Available Jobs](#available-jobs) +- [Configuration Examples](#configuration-examples) +- [Troubleshooting](#troubleshooting) +- [Quick Reference](#quick-reference) + +--- + +## Prerequisites + +### 1. Install Databricks CLI + +Choose your preferred installation method: + +```bash +# Option 1: Install using pip (legacy CLI only - does NOT provide the `bundle` commands used in this guide; prefer Option 2 or 3) +pip install databricks-cli + +# Option 2: Install using Homebrew (macOS) +brew install databricks-cli + +# Option 3: Install using curl (Linux/macOS, recommended) +curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh +``` + +### 2. Verify Installation + +```bash +databricks --version +# Expected output: Databricks CLI v0.200.0 or higher +``` + +### 3.
Configure Authentication + +You can authenticate using one of these methods: + +**Option A: Personal Access Token (PAT)** +```bash +# Interactive configuration +databricks configure --token + +# When prompted: +# Databricks Host: https://your-workspace.cloud.databricks.com +# Personal Access Token: dapi...your-token... +``` + +**Option B: Using Environment Variables** +```bash +export DATABRICKS_HOST="https://your-workspace.cloud.databricks.com" +export DATABRICKS_TOKEN="dapi...your-token..." +``` + +**Option C: Using Databricks CLI Profiles** +```bash +# Configure a named profile +databricks configure --token --profile prod + +# Use the profile +databricks bundle deploy --target prod --profile prod +``` + +### 4. Verify Connection + +```bash +databricks workspace list / +# Should list your workspace root directories +``` + +--- + +## Step 1: Configure databricks.yml + +Open `databricks.yml` and update all values marked with `# <-- TODO`. + +### Quick Find: All TODOs + +```bash +grep -n "TODO" databricks.yml +``` + +### Required Configuration Items + +Below is a checklist of all items you need to configure: + +--- + +#### 1.1 Dev Space ID (Source) + +**What it is**: The ID of your Genie space in the Development workspace that you want to export. + +```yaml +variables: + dev_space_id: + default: "your-dev-genie-space-id" # <-- TODO: Add your Dev Space ID +``` + +**How to find it**: +1. Open your Databricks Dev workspace +2. Navigate to your Genie space +3. Copy the ID from the URL: `https://your-workspace.cloud.databricks.com/genie/spaces/{THIS_IS_YOUR_SPACE_ID}` + +--- + +#### 1.2 Prod Warehouse ID + +**What it is**: The SQL Warehouse ID in your Production workspace where queries will run. + +```yaml +variables: + prod_warehouse_id: + default: "your-warehouse-id" # <-- TODO: Add your Prod Warehouse ID +``` + +**How to find it**: +1. Open your Databricks Prod workspace +2. Go to **SQL** → **SQL Warehouses** +3. Click on your warehouse +4. 
Copy the ID from the URL: `https://your-workspace.cloud.databricks.com/sql/warehouses/{THIS_IS_YOUR_WAREHOUSE_ID}` + +--- + +#### 1.3 Catalog/Schema Mapping + +**What it is**: The mapping from your Dev Unity Catalog references to Prod. + +```yaml +variables: + # Source (Dev) - what to find and replace + source_catalog: + default: "main_th" # <-- TODO: Your Dev catalog name + source_schema: + default: "schema_dev" # <-- TODO: Your Dev schema name + + # Target (Prod) - replacement values + target_catalog: + default: "main_prod" # <-- TODO: Your Prod catalog name + target_schema: + default: "schema_prod" # <-- TODO: Your Prod schema name +``` + +**Example**: +| Environment | Catalog | Schema | Full Reference | +|-------------|---------|--------|----------------| +| Dev | `main_th` | `schema_dev` | `main_th.schema_dev.customers` | +| Prod | `main_prod` | `schema_prod` | `main_prod.schema_prod.customers` | + +--- + +#### 1.4 Workspace URLs + +**What it is**: The URLs for your Dev and Prod Databricks workspaces. + +```yaml +targets: + dev: + workspace: + host: https://your-dev-workspace.cloud.databricks.com # <-- TODO: Dev URL + prod: + workspace: + host: https://your-prod-workspace.cloud.databricks.com # <-- TODO: Prod URL +``` + +**Note**: If Dev and Prod are on the same workspace, use the same URL for both. + +--- + +#### 1.5 Compute Configuration + +**What it is**: The compute resources used to run the export and deploy tasks. + +**Default: Serverless** (recommended) + +By default, jobs run on **serverless compute**. No configuration needed - just leave the cluster settings commented out.
+ +Benefits of serverless: +- Instant startup (no cluster provisioning wait) +- No cluster management overhead +- Automatic scaling and resource optimization +- Cost-effective for short-running tasks + +**Alternative Options**: + +If serverless is not available in your workspace, choose one of the following: + +**Option A: Existing Cluster** (shared resources, requires cluster to be running) +```yaml +existing_cluster_id: "your-cluster-id" # Uncomment and set +``` + +**How to find cluster ID**: +- Go to **Compute** → Select your cluster → Copy ID from URL + +**Option B: New Job Cluster** (isolated, auto-terminates after job) +```yaml +new_cluster: + spark_version: "14.3.x-scala2.12" + num_workers: 0 + node_type_id: "i3.xlarge" # Adjust for your cloud provider + spark_conf: + "spark.databricks.cluster.profile": "singleNode" + "spark.master": "local[*]" + custom_tags: + "ResourceClass": "SingleNode" +``` + +**Node types by cloud**: +| Cloud | Recommended Node Type | +|-------|----------------------| +| AWS | `i3.xlarge`, `m5.xlarge` | +| Azure | `Standard_DS3_v2`, `Standard_D4s_v3` | +| GCP | `n1-standard-4` | + +--- + +## Step 2: First Deployment + +### 2.1 Validate Configuration + +Before deploying, validate your configuration: + +```bash +databricks bundle validate --target prod +``` + +**Expected output**: No errors, shows bundle summary. + +### 2.2 Deploy Bundle + +Deploy the bundle to your workspace: + +```bash +databricks bundle deploy --target prod +``` + +This uploads the notebooks and creates the job definitions. + +### 2.3 Run the Pipeline + +Execute the full pipeline: + +```bash +databricks bundle run promote_genie_to_prod --target prod +``` + +**What happens**: +1. **Task 1 (Export)**: Exports your Dev Genie space → `genie_definition/genie_space.json` +2. 
**Task 2 (Deploy)**: Creates a new Prod Genie space with catalog/schema replaced + +### 2.4 Save the Prod Space ID + +After the job completes successfully, check the output for: + +``` +⚠️ IMPORTANT: Save this Space ID for future updates: + space_id = "01f0e034e6cb118695218a38adc4176d" +``` + +**Critical**: Add this ID to your `databricks.yml`: + +```yaml +variables: + prod_space_id: + default: "01f0e034e6cb118695218a38adc4176d" # <-- Paste your new Prod Space ID here +``` + +--- + +## Step 3: Subsequent Deployments + +After you've saved the `prod_space_id`, subsequent deployments are simple: + +```bash +# Deploy any bundle changes first (if you modified databricks.yml) +databricks bundle deploy --target prod + +# Run the pipeline to sync changes from Dev to Prod +databricks bundle run promote_genie_to_prod --target prod +``` + +**What happens**: +1. **Export**: Fresh definition exported from Dev +2. **Update**: Existing Prod space is updated (not recreated) + +--- + +## Available Jobs + +| Job | Description | When to Use | +|-----|-------------|-------------| +| `promote_genie_to_prod` | Full pipeline: Export from Dev + Deploy to Prod | Regular sync/deployment | +| `deploy_genie_only` | Deploy only: Use existing `genie_space.json` | Quick deploy without re-export | + +### Run Commands + +```bash +# Full pipeline (recommended for most cases) +databricks bundle run promote_genie_to_prod --target prod + +# Deploy only (skip export step) +databricks bundle run deploy_genie_only --target prod +``` + +--- + +## Configuration Examples + +### Example 1: Same Workspace (Different Schemas) + +When Dev and Prod are in the same Databricks workspace but use different Unity Catalog schemas: + +```yaml +variables: + source_catalog: + default: "main" + source_schema: + default: "dev" + target_catalog: + default: "main" + target_schema: + default: "prod" + +targets: + dev: + workspace: + host: https://mycompany.cloud.databricks.com + prod: + workspace: + host:
https://mycompany.cloud.databricks.com # Same workspace +``` + +### Example 2: Different Workspaces + +When Dev and Prod are separate Databricks workspaces: + +```yaml +variables: + source_catalog: + default: "catalog_dev" + source_schema: + default: "analytics" + target_catalog: + default: "catalog_prod" + target_schema: + default: "analytics" + +targets: + dev: + workspace: + host: https://mycompany-dev.cloud.databricks.com + prod: + workspace: + host: https://mycompany-prod.cloud.databricks.com # Different workspace +``` + +### Example 3: Using Serverless (Default) + +Serverless is the default - simply leave cluster configuration commented out: + +```yaml +tasks: + - task_key: export_from_dev + notebook_task: + notebook_path: ./src/export_genie_definition.ipynb + base_parameters: + space_id: ${var.dev_space_id} + output_file: "../genie_definition/genie_space.json" + # No cluster config = serverless compute +``` + +### Example 4: Using Job Clusters + +If serverless is not available, use job clusters: + +```yaml +tasks: + - task_key: export_from_dev + notebook_task: + notebook_path: ./src/export_genie_definition.ipynb + base_parameters: + space_id: ${var.dev_space_id} + output_file: "../genie_definition/genie_space.json" + + new_cluster: + spark_version: "14.3.x-scala2.12" + num_workers: 0 + node_type_id: "i3.xlarge" + spark_conf: + "spark.databricks.cluster.profile": "singleNode" + "spark.master": "local[*]" + custom_tags: + "ResourceClass": "SingleNode" +``` + +--- + +## Troubleshooting + +### Common Errors + +| Error Message | Cause | Solution | +|---------------|-------|----------| +| `space_id parameter is required` | Export task missing space ID | Set `dev_space_id` in databricks.yml | +| `warehouse_id is required` | Creating new space without warehouse | Set `prod_warehouse_id` in databricks.yml | +| `title is required` | Creating new space without title | Set `genie_space_title` in databricks.yml | +| `Permission denied` | Insufficient workspace access | 
See [Permissions](#permissions) section below | +| `Space not found` | Invalid or inaccessible space ID | Verify the space ID exists and you have access | +| `Cluster not found` | Invalid existing cluster ID | Verify cluster exists or use job cluster instead | +| `Serverless compute not available` | Serverless not enabled | Enable serverless in workspace settings or configure `existing_cluster_id` or `new_cluster` | + +### Permissions + +Ensure you have the correct permissions: + +| Operation | Required Permission | Where | +|-----------|---------------------|-------| +| Export | `CAN EDIT` | Dev Genie Space | +| Create Space | `CAN MANAGE` | Prod Workspace | +| Update Space | `CAN EDIT` | Prod Genie Space | +| Run Jobs | `CAN MANAGE RUN` | Job in Prod Workspace | + +### Debugging Commands + +```bash +# Validate bundle configuration +databricks bundle validate --target prod + +# Show bundle summary +databricks bundle summary --target prod + +# View job runs +databricks jobs list --output JSON + +# Check workspace connectivity +databricks workspace list / +``` + +### Logs and Monitoring + +1. Run the job and note the run ID +2. In Databricks UI: **Workflows** → **Job Runs** → Select your run +3. Click on each task to view detailed logs and output + +--- + +## Quick Reference + +### Where to Find Things + +| Item | Location | +|------|----------| +| Dev Space ID | Dev Genie Space URL: `/genie/spaces/{SPACE_ID}` | +| Prod Warehouse ID | SQL Warehouses page URL: `/sql/warehouses/{WAREHOUSE_ID}` | +| Prod Space ID | Job output after first successful run | +| Cluster ID (optional) | Compute page URL: `/compute/clusters/{CLUSTER_ID}` | +| Workspace URL | Browser URL when logged in | + +**Note**: Cluster ID is only needed if you're not using serverless (the default). 
+ +### Command Cheat Sheet + +```bash +# Setup +databricks configure --token # Configure authentication +databricks workspace list / # Verify connection + +# Bundle operations +databricks bundle validate --target prod # Validate config +databricks bundle deploy --target prod # Deploy to workspace +databricks bundle summary --target prod # Show bundle info + +# Run jobs +databricks bundle run promote_genie_to_prod --target prod # Full pipeline +databricks bundle run deploy_genie_only --target prod # Deploy only +``` + +### Configuration File Locations + +| File | Purpose | +|------|---------| +| `databricks.yml` | Main bundle configuration | +| `~/.databrickscfg` | CLI authentication profiles | +| `.env` | Environment variables (optional) | diff --git a/data-warehousing/genie-cicd/databricks.yml.template b/data-warehousing/genie-cicd/databricks.yml.template new file mode 100644 index 0000000..e47edc0 --- /dev/null +++ b/data-warehousing/genie-cicd/databricks.yml.template @@ -0,0 +1,247 @@ +# Databricks Asset Bundle Configuration +# ===================================== +# This bundle exports Genie spaces from Dev and deploys them to Prod with catalog/schema replacement. 
+# +# Workflow: +# Task 1: Export Genie space definition from Dev +# Task 2: Deploy (create/update) Genie space to Prod +# +# Usage: +# databricks bundle deploy --target prod +# databricks bundle run promote_genie_to_prod --target prod + +bundle: + name: genie-space-cicd + +# ============================================================ +# VARIABLES - Customize these for your environment +# ============================================================ +variables: + # ------------------------------ + # TODO: Set your Dev Space ID (source to export) + # ------------------------------ + dev_space_id: + description: "" + default: "" # <-- TODO: Add your Dev Genie Space ID + + # ------------------------------ + # TODO: Set your Prod Space ID (target to update) + # ------------------------------ + # Leave empty ("") for first deployment - the job will create a new space + # After first run, copy the space_id from the output and paste here + prod_space_id: + description: "" + default: "" # <-- TODO: After first deployment, add your Prod space_id here + + # ------------------------------ + # TODO: Set your Prod Warehouse ID + # ------------------------------ + prod_warehouse_id: + description: "" + default: "" # <-- TODO: Add your Prod SQL Warehouse ID + + # ------------------------------ + # TODO: Set your Genie Space Title + # ------------------------------ + genie_space_title: + description: "" + default: "" # <-- TODO: Change to your desired title + + # ------------------------------ + # TODO: Set your Dev catalog/schema names (source) + # ------------------------------ + source_catalog: + description: "" + default: "" # <-- TODO: Change to your Dev catalog name + + source_schema: + description: "" + default: "" # <-- TODO: Change to your Dev schema name + + # ------------------------------ + # TODO: Set your Prod catalog/schema names (target) + # ------------------------------ + target_catalog: + description: "" + default: "" # <-- TODO: Change to your Prod catalog name 
+ + target_schema: + description: "" + default: "" # <-- TODO: Change to your Prod schema name + +# ============================================================ +# RESOURCES - Job definitions +# ============================================================ +resources: + jobs: + # ---------------------------------------------------------- + # Main CI/CD Job: Export from Dev → Deploy to Prod + # ---------------------------------------------------------- + promote_genie_to_prod: + name: "Promote Genie Space: Dev → Prod" + description: "Exports Genie space from Dev and deploys to Prod with catalog/schema replacement" + + # ------------------------------ + # TODO: Set job schedule (optional) + # ------------------------------ + # Uncomment below to run on a schedule + # schedule: + # quartz_cron_expression: "0 0 9 * * ?" # Daily at 9 AM + # timezone_id: "America/Los_Angeles" + + tasks: + # ---------------------------------------- + # Task 1: Export Genie Space from Dev + # ---------------------------------------- + - task_key: export_from_dev + description: "Export Genie space definition from Dev workspace" + + notebook_task: + notebook_path: ./src/export_genie_definition.ipynb + base_parameters: + space_id: ${var.dev_space_id} + output_file: "../genie_definition/genie_space_dev.json" + + # ------------------------------ + # Compute Configuration + # ------------------------------ + # Default: Serverless (no cluster config needed) + # The task runs on serverless compute when no cluster is specified. + # This is the recommended option for most use cases. 
+ + # ------------------------------ + # Alternative: Use an existing cluster + # ------------------------------ + # Uncomment below to run on an existing all-purpose cluster + # existing_cluster_id: "" # <-- TODO: Set your cluster ID + + # ------------------------------ + # Alternative: Create a new job cluster + # ------------------------------ + # Uncomment below to create a dedicated job cluster for this task + # new_cluster: + # spark_version: "14.3.x-scala2.12" + # num_workers: 0 + # node_type_id: "i3.xlarge" # <-- TODO: Adjust for your cloud (AWS: i3.xlarge, Azure: Standard_DS3_v2, GCP: n1-standard-4) + # spark_conf: + # "spark.databricks.cluster.profile": "singleNode" + # "spark.master": "local[*]" + # custom_tags: + # "ResourceClass": "SingleNode" + + # ---------------------------------------- + # Task 2: Deploy Genie Space to Prod + # ---------------------------------------- + - task_key: deploy_to_prod + description: "Deploy Genie space to Prod (create or update)" + depends_on: + - task_key: export_from_dev + + notebook_task: + notebook_path: ./src/deploy_genie_space.ipynb + base_parameters: + space_id: ${var.prod_space_id} + input_file: "../genie_definition/genie_space_dev.json" + output_file: "../genie_definition/genie_space_prod.json" + warehouse_id: ${var.prod_warehouse_id} + title: ${var.genie_space_title} + source_catalog: ${var.source_catalog} + target_catalog: ${var.target_catalog} + source_schema: ${var.source_schema} + target_schema: ${var.target_schema} + + # ------------------------------ + # Compute Configuration + # ------------------------------ + # Default: Serverless (no cluster config needed) + # The task runs on serverless compute when no cluster is specified. + # This is the recommended option for most use cases. 
+ + # ------------------------------ + # Alternative: Use an existing cluster + # ------------------------------ + # Uncomment below to run on an existing all-purpose cluster + # existing_cluster_id: "" # <-- TODO: Set your cluster ID + + # ------------------------------ + # Alternative: Create a new job cluster + # ------------------------------ + # Uncomment below to create a dedicated job cluster for this task + # new_cluster: + # spark_version: "14.3.x-scala2.12" + # num_workers: 0 + # node_type_id: "i3.xlarge" # <-- TODO: Adjust for your cloud (AWS: i3.xlarge, Azure: Standard_DS3_v2, GCP: n1-standard-4) + # spark_conf: + # "spark.databricks.cluster.profile": "singleNode" + # "spark.master": "local[*]" + # custom_tags: + # "ResourceClass": "SingleNode" + + # # ---------------------------------------------------------- + # # Standalone Job: Deploy Only (without export) + # # ---------------------------------------------------------- + # deploy_genie_only: + # name: "Deploy Genie Space to Prod (Deploy Only)" + # description: "Deploys existing genie_space.json to Prod without re-exporting from Dev" + + # tasks: + # - task_key: deploy_to_prod + # description: "Deploy Genie space to Prod (create or update)" + + # notebook_task: + # notebook_path: ./src/deploy_genie_space + # base_parameters: + # space_id: ${var.prod_space_id} + # input_file: "./genie_definition/genie_space.json" + # output_file: "./genie_definition/genie_space_prod.json" + # warehouse_id: ${var.prod_warehouse_id} + # title: ${var.genie_space_title} + # source_catalog: ${var.source_catalog} + # target_catalog: ${var.target_catalog} + # source_schema: ${var.source_schema} + # target_schema: ${var.target_schema} + # + # # Default: Serverless (no cluster config needed) + # # Uncomment one of the alternatives below if needed: + # + # # Alternative: Use an existing cluster + # # existing_cluster_id: "" + # + # # Alternative: Create a new job cluster + # # new_cluster: + # # spark_version: 
"14.3.x-scala2.12" + # # num_workers: 0 + # # node_type_id: "i3.xlarge" + # # spark_conf: + # # "spark.databricks.cluster.profile": "singleNode" + # # "spark.master": "local[*]" + # # custom_tags: + # # "ResourceClass": "SingleNode" + +# ============================================================ +# TARGETS - Environment configurations +# ============================================================ +targets: + # Development target (for testing the bundle) + dev: + mode: development + default: true + workspace: + # ------------------------------ + # TODO: Set your Dev workspace URL + # ------------------------------ + host: https://.cloud.databricks.com # <-- TODO: Change this + + # Production target (for actual deployment) + prod: + mode: production + workspace: + # ------------------------------ + # TODO: Set your Prod workspace URL + # ------------------------------ + host: https://.cloud.databricks.com # <-- TODO: Change this + + # Override variables for Prod (optional) + # variables: + # prod_space_id: "" + # prod_warehouse_id: "" diff --git a/data-warehousing/genie-cicd/genie_definition/genie_space_dev.json b/data-warehousing/genie-cicd/genie_definition/genie_space_dev.json new file mode 100644 index 0000000..2c1e953 --- /dev/null +++ b/data-warehousing/genie-cicd/genie_definition/genie_space_dev.json @@ -0,0 +1,129 @@ +{ + "version": 2, + "config": { + "sample_questions": [ + { + "id": "01f0fd2d918317eb9aa652fd17e86579", + "question": [ + "What is the total number of customer?" 
+ ] + } + ] + }, + "data_sources": { + "tables": [ + { + "identifier": "main_th.schema_dev.table_example", + "column_configs": [ + { + "column_name": "c_acctbal", + "enable_format_assistance": true + }, + { + "column_name": "c_address", + "enable_format_assistance": true, + "enable_entity_matching": true + }, + { + "column_name": "c_comment", + "enable_format_assistance": true, + "enable_entity_matching": true + }, + { + "column_name": "c_custkey", + "enable_format_assistance": true + }, + { + "column_name": "c_mktsegment", + "enable_format_assistance": true, + "enable_entity_matching": true + }, + { + "column_name": "c_name", + "enable_format_assistance": true, + "enable_entity_matching": true + }, + { + "column_name": "c_nationkey", + "enable_format_assistance": true + }, + { + "column_name": "c_phone", + "enable_format_assistance": true, + "enable_entity_matching": true + } + ] + } + ] + }, + "instructions": { + "text_instructions": [ + { + "id": "01f0fd2d0816128598c487ad7cde4be4", + "content": [ + "You are a responsible to answer question about the customer using the Table table_example as source.\n", + "\n", + "I updated the instruction on Dev and need to see them on Prod" + ] + } + ], + "example_question_sqls": [ + { + "id": "01f0fd3cb0541e37b9a8cb3ee3eec240", + "question": [ + "What is the total number of customer?" + ], + "sql": [ + "SELECT\n", + " COUNT(*) AS total_customers\n", + "FROM\n", + " `main_th`.`schema_dev`.`table_example`\n", + "WHERE\n", + " `c_custkey` IS NOT NULL;" + ] + }, + { + "id": "01f0fd3cc01b1fa3839f30ccde15a8cb", + "question": [ + "What is the total number of customers in each market segment?" 
+ ], + "sql": [ + "SELECT\n", + " `c_mktsegment`,\n", + " COUNT(*) AS total_customers\n", + "FROM\n", + " `main_th`.`schema_dev`.`table_example`\n", + "WHERE\n", + " `c_mktsegment` IS NOT NULL\n", + "GROUP BY\n", + " `c_mktsegment`\n", + "ORDER BY\n", + " total_customers DESC;" + ] + } + ] + }, + "benchmarks": { + "questions": [ + { + "id": "01f0fd3cd6a61691a33b3d01d69d63a6", + "question": [ + "What is the total number of customer?" + ], + "answer": [ + { + "format": "SQL", + "content": [ + "SELECT\n", + " COUNT(*) AS total_customers\n", + "FROM\n", + " `main_th`.`schema_dev`.`table_example`\n", + "WHERE\n", + " `c_custkey` IS NOT NULL;" + ] + } + ] + } + ] + } +} \ No newline at end of file diff --git a/data-warehousing/genie-cicd/genie_definition/genie_space_prod.json b/data-warehousing/genie-cicd/genie_definition/genie_space_prod.json new file mode 100644 index 0000000..d3d9756 --- /dev/null +++ b/data-warehousing/genie-cicd/genie_definition/genie_space_prod.json @@ -0,0 +1,129 @@ +{ + "version": 2, + "config": { + "sample_questions": [ + { + "id": "01f0fd2d918317eb9aa652fd17e86579", + "question": [ + "What is the total number of customer?" 
+ ] + } + ] + }, + "data_sources": { + "tables": [ + { + "identifier": "main_th.schema_prod.table_example", + "column_configs": [ + { + "column_name": "c_acctbal", + "enable_format_assistance": true + }, + { + "column_name": "c_address", + "enable_format_assistance": true, + "enable_entity_matching": true + }, + { + "column_name": "c_comment", + "enable_format_assistance": true, + "enable_entity_matching": true + }, + { + "column_name": "c_custkey", + "enable_format_assistance": true + }, + { + "column_name": "c_mktsegment", + "enable_format_assistance": true, + "enable_entity_matching": true + }, + { + "column_name": "c_name", + "enable_format_assistance": true, + "enable_entity_matching": true + }, + { + "column_name": "c_nationkey", + "enable_format_assistance": true + }, + { + "column_name": "c_phone", + "enable_format_assistance": true, + "enable_entity_matching": true + } + ] + } + ] + }, + "instructions": { + "text_instructions": [ + { + "id": "01f0fd2d0816128598c487ad7cde4be4", + "content": [ + "You are a responsible to answer question about the customer using the Table table_example as source.\n", + "\n", + "I updated the instruction on Dev and need to see them on Prod" + ] + } + ], + "example_question_sqls": [ + { + "id": "01f0fd3cb0541e37b9a8cb3ee3eec240", + "question": [ + "What is the total number of customer?" + ], + "sql": [ + "SELECT\n", + " COUNT(*) AS total_customers\n", + "FROM\n", + " `main_th`.`schema_prod`.`table_example`\n", + "WHERE\n", + " `c_custkey` IS NOT NULL;" + ] + }, + { + "id": "01f0fd3cc01b1fa3839f30ccde15a8cb", + "question": [ + "What is the total number of customers in each market segment?" 
+ ], + "sql": [ + "SELECT\n", + " `c_mktsegment`,\n", + " COUNT(*) AS total_customers\n", + "FROM\n", + " `main_th`.`schema_prod`.`table_example`\n", + "WHERE\n", + " `c_mktsegment` IS NOT NULL\n", + "GROUP BY\n", + " `c_mktsegment`\n", + "ORDER BY\n", + " total_customers DESC;" + ] + } + ] + }, + "benchmarks": { + "questions": [ + { + "id": "01f0fd3cd6a61691a33b3d01d69d63a6", + "question": [ + "What is the total number of customer?" + ], + "answer": [ + { + "format": "SQL", + "content": [ + "SELECT\n", + " COUNT(*) AS total_customers\n", + "FROM\n", + " `main_th`.`schema_prod`.`table_example`\n", + "WHERE\n", + " `c_custkey` IS NOT NULL;" + ] + } + ] + } + ] + } +} \ No newline at end of file diff --git a/data-warehousing/genie-cicd/src/DOCUMENTATION.md b/data-warehousing/genie-cicd/src/DOCUMENTATION.md new file mode 100644 index 0000000..614efce --- /dev/null +++ b/data-warehousing/genie-cicd/src/DOCUMENTATION.md @@ -0,0 +1,540 @@ +# Source Code Documentation + +This document provides detailed documentation for the Python source files (Databricks notebooks) in the `src/` directory. + +## Table of Contents + +- [Overview](#overview) +- [export_genie_definition.ipynb](#export_genie_definitionpy) +- [deploy_genie_space.ipynb](#deploy_genie_spacepy) +- [Genie Space JSON Structure](#genie-space-json-structure) +- [API Reference](#api-reference) + +--- + +## Overview + +This project contains two Databricks notebooks that work together to provide CI/CD capabilities for Genie spaces: + +| Notebook | Purpose | Databricks API | +|----------|---------|----------------| +| `export_genie_definition.ipynb` | Export a Genie space configuration from Dev | GET `/api/2.0/genie/spaces/{space_id}` | +| `deploy_genie_space.ipynb` | Deploy (create or update) a Genie space to Prod | POST/PATCH `/api/2.0/genie/spaces` | + +### Notebook Format + +These files use the **Databricks notebook source format** (indicated by `# Databricks notebook source` at the top). 
They can be: +- Run directly in Databricks notebooks +- Executed as part of Databricks Jobs +- Managed via Databricks Asset Bundles (DABs) + +### Compute Configuration + +By default, jobs run on **serverless compute**. This is the recommended option as it provides: +- Faster startup times +- No cluster management +- Cost-effective for short-running tasks + +#### Alternative Compute Options + +If serverless is not available or you need specific configurations, you can configure the following in `databricks.yml`: + +**Option 1: Serverless (Default)** +```yaml +# No cluster configuration needed - serverless is used automatically +notebook_task: + notebook_path: ./src/deploy_genie_space.ipynb + # ... parameters ... +``` + +**Option 2: Existing Cluster** +```yaml +notebook_task: + notebook_path: ./src/deploy_genie_space.ipynb + # ... parameters ... +existing_cluster_id: "" +``` + +**Option 3: New Job Cluster** +```yaml +notebook_task: + notebook_path: ./src/deploy_genie_space.ipynb + # ... parameters ... +new_cluster: + spark_version: "14.3.x-scala2.12" + num_workers: 0 + node_type_id: "i3.xlarge" # AWS: i3.xlarge, Azure: Standard_DS3_v2, GCP: n1-standard-4 + spark_conf: + "spark.databricks.cluster.profile": "singleNode" + "spark.master": "local[*]" + custom_tags: + "ResourceClass": "SingleNode" +``` + +### Dependencies + +Both notebooks require the Databricks SDK: + +```python +!pip install databricks-sdk -U +dbutils.library.restartPython() +``` + +--- + +## export_genie_definition.py + +### Purpose + +Exports a Genie space definition from a Databricks workspace and saves it as a JSON file. This is typically used to export from a Development workspace for version control and subsequent deployment to Production. 
+ +### Parameters (Widgets) + +| Parameter | Required | Default | Description | +|-----------|----------|---------|-------------| +| `space_id` | **Yes** | `""` | The Genie Space ID to export | +| `output_file` | Yes | `../genie_definition/genie_space_dev.json` | Path where the exported JSON will be saved | + +### Workflow + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ 1. Validate Parameters │ +│ └─ Ensure space_id is provided │ +├─────────────────────────────────────────────────────────────────────┤ +│ 2. Initialize Authentication │ +│ └─ Get workspace URL and PAT token from notebook context │ +├─────────────────────────────────────────────────────────────────────┤ +│ 3. Call Get Space API │ +│ └─ GET /api/2.0/genie/spaces/{space_id}?include_serialized_space │ +├─────────────────────────────────────────────────────────────────────┤ +│ 4. Extract Serialized Space │ +│ └─ Parse JSON from serialized_space field │ +├─────────────────────────────────────────────────────────────────────┤ +│ 5. Save to File │ +│ └─ Write formatted JSON to output_file │ +├─────────────────────────────────────────────────────────────────────┤ +│ 6. Return Output │ +│ └─ Return status, space_id, title, output_file via dbutils.exit │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### Code Sections + +#### 1. Widget Parameters + +```python +dbutils.widgets.text("space_id", "", "Space ID to export (required)") +dbutils.widgets.text("output_file", "../genie_definition/genie_space_dev.json", "Output JSON file path") +``` + +#### 2. Parameter Validation + +```python +SPACE_ID = dbutils.widgets.get("space_id") +if not SPACE_ID: + raise ValueError("space_id parameter is required.") +``` + +#### 3. 
Authentication + +```python +w = WorkspaceClient() +workspace_url = w.config.host +pat_token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get() +``` + +The notebook uses the current notebook's authentication context to obtain: +- **workspace_url**: The Databricks workspace URL +- **pat_token**: Personal Access Token for API calls + +#### 4. API Call + +```python +response = requests.get( + f"{workspace_url}/api/2.0/genie/spaces/{SPACE_ID}", + headers=headers, + params={"include_serialized_space": "true"} +) +``` + +The `include_serialized_space=true` parameter is critical - it returns the full space configuration. + +### Output Format + +The notebook returns a JSON object via `dbutils.notebook.exit()` (the `space_id` value is your Genie space ID): + +```json +{ + "status": "exported", + "space_id": "01f0fd2cfa1c16c185ec2ee3b4ea29d7", + "title": "My Genie Space", + "output_file": "../genie_definition/genie_space_dev.json" +} +``` + +### Error Handling + +| Error | Cause | Solution | +|-------|-------|----------| +| `ValueError: space_id parameter is required` | Missing space_id | Provide the space_id parameter | +| `requests.HTTPError` | API error (404, 403, etc.) | Check space_id and permissions | + +--- + +## deploy_genie_space.py + +### Purpose + +Deploys a Genie space to a Databricks workspace. Can either: +- **Create** a new Genie space (if `space_id` is empty) +- **Update** an existing Genie space (if `space_id` is provided) + +Optionally performs **catalog/schema replacement** to transform Dev references to Prod references. + +### Parameters (Widgets) + +| Parameter | Required | Default | Description | +|-----------|----------|---------|-------------| +| `space_id` | No | `""` | Target Space ID. 
Empty = create new, filled = update | +| `input_file` | **Yes** | `../genie_definition/genie_space_dev.json` | Path to the source JSON file | +| `output_file` | No | Auto-generated | Path for the transformed JSON (Prod version) | +| `warehouse_id` | **Yes*** | `""` | SQL Warehouse ID (*required for create) | +| `title` | **Yes*** | `""` | Space title (*required for create) | +| `source_catalog` | No | `""` | Source catalog name to replace | +| `target_catalog` | No | `""` | Target catalog name | +| `source_schema` | No | `""` | Source schema name to replace | +| `target_schema` | No | `""` | Target schema name | + +### Workflow + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ 1. Parse Parameters │ +│ └─ Determine CREATE vs UPDATE mode based on space_id │ +├─────────────────────────────────────────────────────────────────────┤ +│ 2. Validate Required Parameters │ +│ └─ For CREATE: warehouse_id and title required │ +├─────────────────────────────────────────────────────────────────────┤ +│ 3. Load Input JSON │ +│ └─ Read the exported Genie space definition │ +├─────────────────────────────────────────────────────────────────────┤ +│ 4. Apply Catalog/Schema Replacement (if configured) │ +│ └─ Transform all Unity Catalog references │ +├─────────────────────────────────────────────────────────────────────┤ +│ 5. Save Transformed JSON (optional) │ +│ └─ Write Prod version for audit trail │ +├─────────────────────────────────────────────────────────────────────┤ +│ 6. Deploy to Workspace │ +│ └─ POST (create) or PATCH (update) via Genie API │ +├─────────────────────────────────────────────────────────────────────┤ +│ 7. 
Return Output │ +│ └─ Return status, space_id, title via dbutils.notebook.exit │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### Key Functions + +#### `replace_catalog_schema(text, source_catalog, target_catalog, source_schema, target_schema)` + +Replaces catalog and schema names in a text string. Handles both plain and backtick-quoted identifiers: + +```python +# Input: "SELECT * FROM main_th.schema_dev.customers" +# Output: "SELECT * FROM main_prod.schema_prod.customers" + +# Input: "SELECT * FROM `main_th`.`schema_dev`.`customers`" +# Output: "SELECT * FROM `main_prod`.`schema_prod`.`customers`" +``` + +**Parameters:** +| Parameter | Type | Description | +|-----------|------|-------------| +| `text` | str | The text to search and replace in | +| `source_catalog` | str | The catalog name to find | +| `target_catalog` | str | The catalog name to replace with | +| `source_schema` | str (optional) | The schema name to find | +| `target_schema` | str (optional) | The schema name to replace with | + +**Returns:** `str` - The text with replacements applied + +**Regex Patterns Used** (when both `source_schema` and `target_schema` are provided; otherwise only the catalog prefix is matched and replaced): +```python +# Backtick-quoted format +rf'`{re.escape(source_catalog)}`\.`{re.escape(source_schema)}`\.' + +# Plain format +rf'\b{re.escape(source_catalog)}\.{re.escape(source_schema)}\.' +``` + +#### `update_genie_space_catalog(data, source_catalog, target_catalog, source_schema, target_schema)` + +Updates all catalog/schema references in a Genie space JSON structure. 
+ +**Parameters:** +| Parameter | Type | Description | +|-----------|------|-------------| +| `data` | dict | The Genie space JSON as a Python dict | +| `source_catalog` | str | The catalog name to find | +| `target_catalog` | str | The catalog name to replace with | +| `source_schema` | str (optional) | The schema name to find | +| `target_schema` | str (optional) | The schema name to replace with | + +**Returns:** `Tuple[dict, int]` - Updated data and count of replacements made + +**JSON Paths Updated:** +| Path | Description | +|------|-------------| +| `data_sources.tables[].identifier` | Table identifiers | +| `data_sources.metric_views[].identifier` | Metric view identifiers | +| `instructions.example_question_sqls[].sql[]` | Example SQL queries | +| `benchmarks.questions[].answer[].content[]` | Benchmark answer content | + +### Operation Modes + +#### CREATE Mode (space_id is empty) + +```python +payload = { + "serialized_space": genie_space_json_str, + "warehouse_id": WAREHOUSE_ID, + "title": TITLE +} + +response = requests.post( + f"{workspace_url}/api/2.0/genie/spaces", + headers=headers, + data=json.dumps(payload) +) +``` + +**Requirements:** +- `warehouse_id` is **required** +- `title` is **required** + +**Output includes the new space_id** - save this for future updates! 
+ +#### UPDATE Mode (space_id is provided) + +```python +payload = { + "serialized_space": genie_space_json_str +} +# Optional: title, warehouse_id overrides + +response = requests.patch( + f"{workspace_url}/api/2.0/genie/spaces/{SPACE_ID}", + headers=headers, + data=json.dumps(payload) +) +``` + +### Output Format + +The notebook returns a JSON object via `dbutils.notebook.exit()` (the `space_id` value is your Genie space ID): + +**For CREATE:** +```json +{ + "status": "created", + "space_id": "01f0e034e6cb118695218a38adc4176d", + "title": "My Genie Space Prod" +} +``` + +**For UPDATE:** +```json +{ + "status": "updated", + "space_id": "01f0e034e6cb118695218a38adc4176d", + "title": "My Genie Space Prod" +} +``` + +### Error Handling + +| Error | Cause | Solution | +|-------|-------|----------| +| `ValueError: warehouse_id is required` | Creating without warehouse_id | Provide warehouse_id parameter | +| `ValueError: title is required` | Creating without title | Provide title parameter | +| `requests.HTTPError: 404` | Space not found (update mode) | Verify space_id is correct | +| `requests.HTTPError: 403` | Permission denied | Check workspace permissions | + +--- + +## Genie Space JSON Structure + +The exported Genie space JSON follows this structure: + +```json +{ + "version": 2, + "config": { + "sample_questions": [ + { + "id": "unique-id", + "question": ["Question text"] + } + ] + }, + "data_sources": { + "tables": [ + { + "identifier": "catalog.schema.table_name", + "column_configs": [ + { + "column_name": "column_name", + "enable_format_assistance": true, + "enable_entity_matching": true + } + ] + } + ], + "metric_views": [ + { + "identifier": "catalog.schema.metric_view_name" + } + ] + }, + "instructions": { + "text_instructions": [ + { + "id": "unique-id", + "content": ["Instruction text..."] + } + ], + "example_question_sqls": [ + { + "id": "unique-id", + "question": ["Question text"], + "sql": ["SELECT...", "FROM...", "WHERE..."] + } + ] + }, + 
"benchmarks": { + "questions": [ + { + "id": "unique-id", + "question": ["Question text"], + "answer": [ + { + "format": "SQL", + "content": ["SELECT...", "FROM...", "WHERE..."] + } + ] + } + ] + } +} +``` + +### Key Fields + +| Field | Description | Catalog Replacement | +|-------|-------------|---------------------| +| `data_sources.tables[].identifier` | Unity Catalog table reference | **Yes** | +| `data_sources.metric_views[].identifier` | Metric view reference | **Yes** | +| `instructions.example_question_sqls[].sql` | Example SQL queries | **Yes** | +| `benchmarks.questions[].answer[].content` | Benchmark answers | **Yes** | +| `config.sample_questions` | UI sample questions | No | +| `instructions.text_instructions` | Natural language instructions | No | + +--- + +## API Reference + +### Get Space (Export) + +``` +GET /api/2.0/genie/spaces/{space_id} +``` + +**Query Parameters:** +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `include_serialized_space` | boolean | Yes | Set to `true` to get full configuration | + +**Response:** +```json +{ + "space_id": "string", + "title": "string", + "description": "string", + "warehouse_id": "string", + "serialized_space": "string (JSON)" +} +``` + +**Documentation:** [Get Space API](https://docs.databricks.com/api/workspace/genie/getspace) + +### Create Space + +``` +POST /api/2.0/genie/spaces +``` + +**Request Body:** +```json +{ + "serialized_space": "string (JSON)", + "warehouse_id": "string (required)", + "title": "string (required)" +} +``` + +**Response:** +```json +{ + "space_id": "string", + "title": "string" +} +``` + +**Documentation:** [Create Space API](https://docs.databricks.com/api/workspace/genie/createspace) + +### Update Space + +``` +PATCH /api/2.0/genie/spaces/{space_id} +``` + +**Request Body:** +```json +{ + "serialized_space": "string (JSON)", + "warehouse_id": "string (optional)", + "title": "string (optional)" +} +``` + +**Response:** +```json +{ 
+ "space_id": "string", + "title": "string" +} +``` + +**Documentation:** [Update Space API](https://docs.databricks.com/api/workspace/genie/updatespace) + +--- + +## Authentication + +Both notebooks use the Databricks notebook context for authentication: + +```python +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() +workspace_url = w.config.host + +# Get PAT token from notebook context +pat_token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get() +``` + +This approach: +- Uses the same authentication as the notebook +- Works in Databricks Jobs +- No need to manage separate credentials +- Automatically uses the correct workspace URL diff --git a/data-warehousing/genie-cicd/src/deploy_genie_space.ipynb b/data-warehousing/genie-cicd/src/deploy_genie_space.ipynb new file mode 100644 index 0000000..f6dc480 --- /dev/null +++ b/data-warehousing/genie-cicd/src/deploy_genie_space.ipynb @@ -0,0 +1,348 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b11200e3-25b6-4733-944a-ce96c848af34", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "!pip install databricks-sdk -U\n", + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c97ced34-2536-48d6-94d8-aeaf02b300b8", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import json\n", + "import re\n", + "import requests\n", + "import os\n", + "from databricks.sdk import WorkspaceClient\n", + "from typing import Optional, Tuple\n", + "\n", + "# ========================================\n", + "# Widget Parameters (for Job/CI-CD integration)\n", + "# 
========================================\n", + "dbutils.widgets.text(\"space_id\", \"\", \"Space ID (empty = create new)\")\n", + "dbutils.widgets.text(\"input_file\", \"./genie_definition/genie_space.json\", \"Input JSON File (Dev)\")\n", + "dbutils.widgets.text(\"output_file\", \"\", \"Output JSON File (Prod) - optional\")\n", + "dbutils.widgets.text(\"warehouse_id\", \"\", \"Warehouse ID (required)\")\n", + "dbutils.widgets.text(\"title\", \"\", \"Space Title\")\n", + "dbutils.widgets.text(\"source_catalog\", \"\", \"Source Catalog\")\n", + "dbutils.widgets.text(\"target_catalog\", \"\", \"Target Catalog\")\n", + "dbutils.widgets.text(\"source_schema\", \"\", \"Source Schema\")\n", + "dbutils.widgets.text(\"target_schema\", \"\", \"Target Schema\")\n", + "\n", + "# ========================================\n", + "# Get parameter values\n", + "# ========================================\n", + "SPACE_ID = dbutils.widgets.get(\"space_id\") or None\n", + "INPUT_FILE = dbutils.widgets.get(\"input_file\")\n", + "OUTPUT_FILE = dbutils.widgets.get(\"output_file\") or None\n", + "WAREHOUSE_ID = dbutils.widgets.get(\"warehouse_id\") or None\n", + "TITLE = dbutils.widgets.get(\"title\") or None\n", + "SOURCE_CATALOG = dbutils.widgets.get(\"source_catalog\") or None\n", + "TARGET_CATALOG = dbutils.widgets.get(\"target_catalog\") or None\n", + "SOURCE_SCHEMA = dbutils.widgets.get(\"source_schema\") or None\n", + "TARGET_SCHEMA = dbutils.widgets.get(\"target_schema\") or None\n", + "\n", + "# Generate default output file name if not provided\n", + "if not OUTPUT_FILE and SOURCE_CATALOG and TARGET_CATALOG:\n", + " base, ext = os.path.splitext(INPUT_FILE)\n", + " OUTPUT_FILE = f\"{base}_prod{ext}\"\n", + "\n", + "# Determine operation mode\n", + "OPERATION = \"UPDATE\" if SPACE_ID else \"CREATE\"\n", + "\n", + "# Validate required parameters\n", + "if OPERATION == \"CREATE\":\n", + " if not WAREHOUSE_ID:\n", + " raise ValueError(\"warehouse_id is required when creating a new 
Genie space.\")\n", + " if not TITLE:\n", + " raise ValueError(\"title is required when creating a new Genie space.\")\n", + "\n", + "print(f\"\uD83D\uDE80 Operation Mode: {OPERATION}\")\n", + "if SPACE_ID:\n", + " print(f\" Space ID: {SPACE_ID}\")\n", + "\n", + "# ========================================\n", + "# Catalog/Schema Replacement Functions\n", + "# ========================================\n", + "def replace_catalog_schema(text: str, \n", + " source_catalog: str, \n", + " target_catalog: str,\n", + " source_schema: Optional[str] = None,\n", + " target_schema: Optional[str] = None) -> str:\n", + " \"\"\"\n", + " Replace catalog and schema names in a text string.\n", + " Handles both formats:\n", + " - Without backticks: catalog.schema.table\n", + " - With backticks: `catalog`.`schema`.`table`\n", + " \"\"\"\n", + " result = text\n", + " \n", + " if source_schema and target_schema:\n", + " # Replace backtick-quoted format: `catalog`.`schema`.\n", + " pattern_backticks = rf'`{re.escape(source_catalog)}`\\.`{re.escape(source_schema)}`\\.'\n", + " replacement_backticks = f'`{target_catalog}`.`{target_schema}`.'\n", + " result = re.sub(pattern_backticks, replacement_backticks, result)\n", + " \n", + " # Replace non-quoted format: catalog.schema.\n", + " pattern_plain = rf'\\b{re.escape(source_catalog)}\\.{re.escape(source_schema)}\\.'\n", + " replacement_plain = f'{target_catalog}.{target_schema}.'\n", + " result = re.sub(pattern_plain, replacement_plain, result)\n", + " else:\n", + " # Replace backtick-quoted format: `catalog`.\n", + " pattern_backticks = rf'`{re.escape(source_catalog)}`\\.'\n", + " replacement_backticks = f'`{target_catalog}`.'\n", + " result = re.sub(pattern_backticks, replacement_backticks, result)\n", + " \n", + " # Replace non-quoted format: catalog.\n", + " pattern_plain = rf'\\b{re.escape(source_catalog)}\\.'\n", + " replacement_plain = f'{target_catalog}.'\n", + " result = re.sub(pattern_plain, replacement_plain, result)\n", + " \n", + 
" return result\n", + "\n", + "\n", + "def update_genie_space_catalog(data: dict,\n", + " source_catalog: str,\n", + " target_catalog: str,\n", + " source_schema: Optional[str] = None,\n", + " target_schema: Optional[str] = None) -> Tuple[dict, int]:\n", + " \"\"\"Update all catalog/schema references in a Genie space JSON structure.\"\"\"\n", + " updated_data = json.loads(json.dumps(data)) # Deep copy\n", + " updates_count = 0\n", + " \n", + " # Update data_sources.tables[].identifier\n", + " if 'data_sources' in updated_data and 'tables' in updated_data['data_sources']:\n", + " for table in updated_data['data_sources']['tables']:\n", + " if 'identifier' in table:\n", + " old_value = table['identifier']\n", + " new_value = replace_catalog_schema(\n", + " old_value, source_catalog, target_catalog, source_schema, target_schema\n", + " )\n", + " if old_value != new_value:\n", + " table['identifier'] = new_value\n", + " updates_count += 1\n", + " \n", + " # Update data_sources.metric_views[].identifier\n", + " if 'data_sources' in updated_data and 'metric_views' in updated_data['data_sources']:\n", + " for metric_view in updated_data['data_sources']['metric_views']:\n", + " if 'identifier' in metric_view:\n", + " old_value = metric_view['identifier']\n", + " new_value = replace_catalog_schema(\n", + " old_value, source_catalog, target_catalog, source_schema, target_schema\n", + " )\n", + " if old_value != new_value:\n", + " metric_view['identifier'] = new_value\n", + " updates_count += 1\n", + " \n", + " # Update instructions.example_question_sqls[].sql[]\n", + " if 'instructions' in updated_data and 'example_question_sqls' in updated_data['instructions']:\n", + " for example in updated_data['instructions']['example_question_sqls']:\n", + " if 'sql' in example:\n", + " for j, sql_line in enumerate(example['sql']):\n", + " old_value = sql_line\n", + " new_value = replace_catalog_schema(\n", + " old_value, source_catalog, target_catalog, source_schema, target_schema\n", 
+ " )\n", + " if old_value != new_value:\n", + " example['sql'][j] = new_value\n", + " updates_count += 1\n", + " \n", + " # Update benchmarks.questions[].answer[].content[]\n", + " if 'benchmarks' in updated_data and 'questions' in updated_data['benchmarks']:\n", + " for question in updated_data['benchmarks']['questions']:\n", + " if 'answer' in question:\n", + " for answer in question['answer']:\n", + " if 'content' in answer:\n", + " for k, content_line in enumerate(answer['content']):\n", + " old_value = content_line\n", + " new_value = replace_catalog_schema(\n", + " old_value, source_catalog, target_catalog, source_schema, target_schema\n", + " )\n", + " if old_value != new_value:\n", + " answer['content'][k] = new_value\n", + " updates_count += 1\n", + " \n", + " return updated_data, updates_count\n", + "\n", + "\n", + "# ========================================\n", + "# Initialize client and get auth token\n", + "# ========================================\n", + "w = WorkspaceClient()\n", + "workspace_url = w.config.host\n", + "pat_token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()\n", + "\n", + "headers = {\n", + " \"Authorization\": f\"Bearer {pat_token}\",\n", + " \"Content-Type\": \"application/json\"\n", + "}\n", + "\n", + "# ========================================\n", + "# Load the original space definition (Dev)\n", + "# ========================================\n", + "with open(INPUT_FILE, 'r') as f:\n", + " genie_space_data = json.load(f)\n", + "\n", + "print(f\"\\n\uD83D\uDCC4 Loaded space definition from: {INPUT_FILE}\")\n", + "\n", + "# ========================================\n", + "# Apply catalog/schema replacement if configured\n", + "# ========================================\n", + "if SOURCE_CATALOG and TARGET_CATALOG:\n", + " source_display = SOURCE_CATALOG + (f\".{SOURCE_SCHEMA}\" if SOURCE_SCHEMA else \"\")\n", + " target_display = TARGET_CATALOG + (f\".{TARGET_SCHEMA}\" if TARGET_SCHEMA else 
\"\")\n", + " \n", + " print(f\"\\n\uD83D\uDD04 Replacing catalog/schema names:\")\n", + " print(f\" Source: {source_display}\")\n", + " print(f\" Target: {target_display}\")\n", + " \n", + " genie_space_data, updates_count = update_genie_space_catalog(\n", + " genie_space_data,\n", + " source_catalog=SOURCE_CATALOG,\n", + " target_catalog=TARGET_CATALOG,\n", + " source_schema=SOURCE_SCHEMA,\n", + " target_schema=TARGET_SCHEMA\n", + " )\n", + " print(f\" ✓ Made {updates_count} replacements\")\n", + " \n", + " # Save the modified JSON to the output file (Prod version)\n", + " if OUTPUT_FILE:\n", + " with open(OUTPUT_FILE, 'w') as f:\n", + " json.dump(genie_space_data, f, indent=2)\n", + " print(f\"\\n\uD83D\uDCBE Saved Prod version to: {OUTPUT_FILE}\")\n", + " print(f\" (Original Dev version preserved at: {INPUT_FILE})\")\n", + "\n", + "# Convert to JSON string for API\n", + "genie_space_json_str = json.dumps(genie_space_data)\n", + "\n", + "# ========================================\n", + "# Deploy: Create or Update\n", + "# ========================================\n", + "if OPERATION == \"CREATE\":\n", + " # CREATE new Genie space\n", + " payload = {\n", + " \"serialized_space\": genie_space_json_str,\n", + " \"warehouse_id\": WAREHOUSE_ID,\n", + " \"title\": TITLE\n", + " }\n", + " \n", + " print(f\"\\n\uD83D\uDCE4 Creating new Genie space...\")\n", + " response = requests.post(\n", + " f\"{workspace_url}/api/2.0/genie/spaces\",\n", + " headers=headers,\n", + " data=json.dumps(payload)\n", + " )\n", + " \n", + " response.raise_for_status()\n", + " space_info = response.json()\n", + " \n", + " print(f\"\\n✅ Successfully CREATED Genie space:\")\n", + " print(f\" - Space ID: {space_info['space_id']}\")\n", + " print(f\" - Title: {space_info['title']}\")\n", + " \n", + " # Important: Output the new space_id for future runs\n", + " print(f\"\\n⚠️ IMPORTANT: Save this Space ID for future updates:\")\n", + " print(f\" space_id = \\\"{space_info['space_id']}\\\"\")\n", + 
" \n", + " # Set the space_id as notebook output for downstream tasks\n", + " dbutils.notebook.exit(json.dumps({\n", + " \"status\": \"created\",\n", + " \"space_id\": space_info['space_id'],\n", + " \"title\": space_info['title']\n", + " }))\n", + "\n", + "else:\n", + " # UPDATE existing Genie space\n", + " payload = {\n", + " \"serialized_space\": genie_space_json_str\n", + " }\n", + " \n", + " # Add optional overrides\n", + " if TITLE:\n", + " payload[\"title\"] = TITLE\n", + " if WAREHOUSE_ID:\n", + " payload[\"warehouse_id\"] = WAREHOUSE_ID\n", + " \n", + " print(f\"\\n\uD83D\uDCE4 Updating existing Genie space...\")\n", + " response = requests.patch(\n", + " f\"{workspace_url}/api/2.0/genie/spaces/{SPACE_ID}\",\n", + " headers=headers,\n", + " data=json.dumps(payload)\n", + " )\n", + " \n", + " response.raise_for_status()\n", + " space_info = response.json()\n", + " \n", + " print(f\"\\n✅ Successfully UPDATED Genie space:\")\n", + " print(f\" - Space ID: {space_info.get('space_id', SPACE_ID)}\")\n", + " print(f\" - Title: {space_info.get('title', 'N/A')}\")\n", + " \n", + " # Set output for downstream tasks\n", + " dbutils.notebook.exit(json.dumps({\n", + " \"status\": \"updated\",\n", + " \"space_id\": space_info.get('space_id', SPACE_ID),\n", + " \"title\": space_info.get('title', 'N/A')\n", + " }))\n", + "\n", + "# ========================================\n", + "# Summary\n", + "# ========================================\n", + "print(\"\\n\uD83C\uDF89 Deployment complete!\")\n", + "\n", + "if SOURCE_CATALOG and TARGET_CATALOG and OUTPUT_FILE:\n", + " print(\"\\n\" + \"=\"*50)\n", + " print(\"\uD83D\uDCC1 Files:\")\n", + " print(f\" Dev version: {INPUT_FILE}\")\n", + " print(f\" Prod version: {OUTPUT_FILE}\")\n", + " print(\"=\"*50)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "dbe_0c235d96-4bc7-4fb5-b118-17fd1dad0124", + 
"environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "deploy_genie_space", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/data-warehousing/genie-cicd/src/export_genie_definition.ipynb b/data-warehousing/genie-cicd/src/export_genie_definition.ipynb new file mode 100644 index 0000000..5349d3a --- /dev/null +++ b/data-warehousing/genie-cicd/src/export_genie_definition.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "357d7b80-3f75-41c6-a4b0-37ad40ac3963", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "!pip install databricks-sdk -U\n", + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "08e19cbb-4e9d-459e-8f25-b43a088725e9", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import json\n", + "import requests\n", + "from databricks.sdk import WorkspaceClient\n", + "\n", + "# ========================================\n", + "# Widget Parameters (for Job/CI-CD integration)\n", + "# ========================================\n", + "dbutils.widgets.text(\"space_id\", \"\", \"Space ID to export (required)\")\n", + "dbutils.widgets.text(\"output_file\", \"../genie_definition/genie_space.json\", \"Output JSON file path\")\n", + "\n", + "# ========================================\n", + "# Get parameter values\n", + "# 
========================================\n", + "SPACE_ID = dbutils.widgets.get(\"space_id\")\n", + "OUTPUT_FILE = dbutils.widgets.get(\"output_file\")\n", + "\n", + "# Validate required parameters\n", + "if not SPACE_ID:\n", + " raise ValueError(\"space_id parameter is required. Please provide the Genie Space ID to export.\")\n", + "\n", + "print(f\"🚀 Exporting Genie Space: {SPACE_ID}\")\n", + "\n", + "# ========================================\n", + "# Initialize client and get auth token\n", + "# ========================================\n", + "w = WorkspaceClient()\n", + "workspace_url = w.config.host\n", + "pat_token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()\n", + "\n", + "headers = {\n", + " \"Authorization\": f\"Bearer {pat_token}\",\n", + " \"Content-Type\": \"application/json\"\n", + "}\n", + "\n", + "# ========================================\n", + "# Call Get Space API\n", + "# ========================================\n", + "response = requests.get(\n", + " f\"{workspace_url}/api/2.0/genie/spaces/{SPACE_ID}\",\n", + " headers=headers,\n", + " params={\"include_serialized_space\": \"true\"}\n", + ")\n", + "\n", + "response.raise_for_status()\n", + "space_definition = response.json()\n", + "\n", + "print(f\"\\n✅ Successfully retrieved Genie space:\")\n", + "print(f\" - Title: {space_definition.get('title', 'N/A')}\")\n", + "print(f\" - Space ID: {space_definition.get('space_id', SPACE_ID)}\")\n", + "\n", + "# ========================================\n", + "# Extract and save the serialized space\n", + "# ========================================\n", + "# Parse the serialized space JSON string\n", + "serialized_space = json.loads(space_definition[\"serialized_space\"])\n", + " \n", + "# Save to file with pretty formatting\n", + "with open(OUTPUT_FILE, 'w') as f:\n", + " json.dump(serialized_space, f, indent=2)\n", + " \n", + "print(f\"\\n💾 Exported to: {OUTPUT_FILE}\")\n", + "\n", + "# 
========================================\n", + "# Output for downstream tasks\n", + "# ========================================\n", + "print(\"\\n🎉 Export complete!\")\n", + "\n", + "# Return output for job chaining\n", + "dbutils.notebook.exit(json.dumps({\n", + " \"status\": \"exported\",\n", + " \"space_id\": SPACE_ID,\n", + " \"title\": space_definition.get('title', 'N/A'),\n", + " \"output_file\": OUTPUT_FILE\n", + "}))" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "dbe_0c235d96-4bc7-4fb5-b118-17fd1dad0124", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "export_genie_definition", + "widgets": { + "output_file": { + "currentValue": "../genie_definition/genie_space.json", + "nuid": "8e223e36-5e14-4d16-ba66-f86ed76f6ac1", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "./genie_definition/genie_space.json", + "label": "Output JSON file path", + "name": "output_file", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "./genie_definition/genie_space.json", + "label": "Output JSON file path", + "name": "output_file", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "space_id": { + "currentValue": "01f0fd2cfa1c16c185ec2ee3b4ea29d7", + "nuid": "4227c54f-b5be-4db0-8eaf-4537f8f9761d", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "", + "label": "Space ID to export (required)", + "name": "space_id", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "", + "label": "Space ID to export (required)", + "name": "space_id", + 
"options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/data-warehousing/genie-room-creation/Create Genie Space with Widgets.ipynb b/data-warehousing/genie-room-creation/Create Genie Space with Widgets.ipynb new file mode 100644 index 0000000..07b995e --- /dev/null +++ b/data-warehousing/genie-room-creation/Create Genie Space with Widgets.ipynb @@ -0,0 +1,731 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "85d0f4e6-b0f5-45e2-b573-1a8eefa381bf", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Create Genie Space Programmatically with Databricks SDK & Widgets\n", + "\n", + "This notebook allows you to create a Genie space programmatically using the **Databricks SDK** and widgets for easy configuration. 
Perfect for automating Genie space creation across different environments!\n", + "\n", + "- Databricks SDK Documentation: [https://docs.databricks.com/dev-tools/sdk-python.html](https://docs.databricks.com/dev-tools/sdk-python.html)\n", + "- Genie API Reference: [https://docs.databricks.com/api/workspace/genie/createspace](https://docs.databricks.com/api/workspace/genie/createspace)\n", + "- More about Genie: [https://docs.databricks.com/aws/en/genie/conversation-api](https://docs.databricks.com/aws/en/genie/conversation-api)\n", + "\n", + "### Benefits of using the SDK:\n", + "- **Cleaner code** - No need to manage HTTP requests manually\n", + "- **Automatic authentication** - Uses your notebook's authentication context\n", + "- **Type safety** - Better IDE support and error checking\n", + "- **Built-in retries** - Handles transient errors automatically\n", + "- **Easy parameterization** - Use widgets for different environments\n", + "\n", + "### Prerequisites:\n", + "Make sure you have the latest Databricks SDK installed (run the cell below if needed)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7e08205f-5459-4500-b94e-81d284d88e14", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Install or upgrade the Databricks SDK to the latest version\n", + "# Run this cell if you need to install/upgrade the SDK (skip it if a recent version is already installed)\n", + "\n", + "%pip install --upgrade databricks-sdk\n", + "dbutils.library.restartPython() # Restart Python to use the new version\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "641e4f75-92f2-46b0-bd7a-7e1fb902aaec", + "showTitle": false, + 
"tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Create Databricks Widgets for configuration\n", + "dbutils.widgets.text(\"space_title\", \"My New Genie Space\", \"1. Space Title\")\n", + "dbutils.widgets.text(\"space_description\", \"\", \"2. Space Description (Optional)\")\n", + "dbutils.widgets.text(\"warehouse_id\", \"\", \"3. Warehouse ID\")\n", + "dbutils.widgets.text(\"table_identifiers\", \"\", \"4. Tables (Optional - comma separated)\")\n", + "dbutils.widgets.dropdown(\"include_sample_instructions\", \"Yes\", [\"Yes\", \"No\"], \"5. Include Sample Instructions\")\n", + "\n", + "print(\"✓ Widgets created successfully!\")\n", + "print(\"\\nNote: The SDK automatically uses your notebook's authentication context.\")\n", + "print(\"No need to provide workspace URL or credentials separately.\")\n", + "print(\"\\n💡 Tip: For tables, use format: catalog.schema.table\")\n", + "print(\" Example: main.sales.orders, main.sales.customers\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f1f9c0a2-f702-42f8-ad04-dc16588361c4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Authentication with Databricks SDK\n", + "\n", + "**Good News!** When using the Databricks SDK in a notebook, authentication is **automatic**. The SDK uses your notebook's execution context, so you don't need to:\n", + "- ❌ Manage Personal Access Tokens (PATs)\n", + "- ❌ Store secrets in secret scopes\n", + "- ❌ Handle authentication headers manually\n", + "\n", + "The SDK automatically inherits your workspace credentials! 
🎉\n", + "\n", + "#### (Optional) For Advanced Use Cases\n", + "\n", + "If you need to authenticate to a **different workspace** or use **custom credentials**, you can still do that:\n", + "\n", + "```python\n", + "from databricks.sdk import WorkspaceClient\n", + "\n", + "# Option 1: Use a specific PAT\n", + "w = WorkspaceClient(\n", + " host=\"https://your-workspace.cloud.databricks.com\",\n", + " token=dbutils.secrets.get(scope=\"my-scope\", key=\"my-key\")\n", + ")\n", + "\n", + "# Option 2: Use environment variables\n", + "# Set DATABRICKS_HOST and DATABRICKS_TOKEN in your environment\n", + "w = WorkspaceClient()\n", + "```\n", + "\n", + "But for most use cases, just use `WorkspaceClient()` without any parameters!\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8803479d-6a26-4678-ab35-1975cad8ea7d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### How to Find Your Warehouse ID\n", + "\n", + "Your **Warehouse ID** is the unique identifier for your SQL Warehouse (compute resource).\n", + "\n", + "#### Method 1: Using the Databricks UI\n", + "\n", + "1. Go to **SQL Warehouses** in your Databricks workspace sidebar\n", + "2. Click on the warehouse you want to use\n", + "3. 
Look at the URL - the warehouse ID is at the end:\n", + " - URL format: `https://<workspace-url>/sql/warehouses/<warehouse-id>`\n", + " - Or check the **Connection Details** tab\n", + "\n", + "#### Method 2: List Warehouses Programmatically\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "680830ba-e0fd-4e88-89f9-e13e73f81618", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# List all SQL Warehouses and their IDs\n", + "from databricks.sdk import WorkspaceClient\n", + "\n", + "w = WorkspaceClient()\n", + "warehouses = w.warehouses.list()\n", + "\n", + "print(\"Available SQL Warehouses:\")\n", + "print(\"-\" * 80)\n", + "for wh in warehouses:\n", + " print(f\"Name: {wh.name}\")\n", + " print(f\"ID: {wh.id}\")\n", + " print(f\"State: {wh.state}\")\n", + " print(\"-\" * 80)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "22c9d8cd-08b6-4ff4-8229-02e0f7e17d0c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from databricks.sdk import WorkspaceClient\n", + "import json\n", + "\n", + "# Initialize the Databricks SDK client\n", + "# The SDK automatically uses the notebook's authentication context\n", + "w = WorkspaceClient()\n", + "\n", + "# Get widget values\n", + "space_title = dbutils.widgets.get(\"space_title\")\n", + "space_description = dbutils.widgets.get(\"space_description\")\n", + "warehouse_id = dbutils.widgets.get(\"warehouse_id\")\n", + "table_identifiers = dbutils.widgets.get(\"table_identifiers\")\n", + "include_instructions = dbutils.widgets.get(\"include_sample_instructions\")\n", + "\n", + "# Validate required 
fields\n", + "if not space_title:\n", + " raise ValueError(\"Space Title is required\")\n", + "if not warehouse_id:\n", + " raise ValueError(\"Warehouse ID is required\")\n", + "\n", + "print(\"✓ Configuration validated successfully\")\n", + "print(f\"✓ Space Title: {space_title}\")\n", + "print(f\"✓ Warehouse ID: {warehouse_id}\")\n", + "if table_identifiers:\n", + " tables_list = [t.strip() for t in table_identifiers.split(\",\") if t.strip()]\n", + " print(f\"✓ Tables: {len(tables_list)} table(s) specified\")\n", + " for table in tables_list:\n", + " print(f\" - {table}\")\n", + "else:\n", + " print(f\"ℹ️ No tables specified\")\n", + "print(f\"✓ SDK Client initialized\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "300d356a-2a95-4aba-a0fd-6c08b127b7ca", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Build the Genie Space Configuration\n", + "\n", + "Now we'll construct the serialized space configuration. 
The structure must match the API format:\n", + "\n", + "```json\n", + "{\n", + " \"version\": 1,\n", + " \"config\": {\n", + " \"instructions\": [...],\n", + " \"sample_questions\": [...]\n", + " },\n", + " \"data_sources\": {\n", + " \"tables\": [...]\n", + " }\n", + "}\n", + "```\n", + "\n", + "Key points:\n", + "- `version` is at the **root level** (not inside a nested object)\n", + "- `version` must be an **integer 1**, not a string\n", + "- The structure has three main keys: `version`, `config`, and `data_sources`\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2d6ec309-df81-45b2-b13b-cbfb7ec3a075", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Build the serialized space configuration\n", + "# Based on the actual API structure (version at root level, not inside genie_config)\n", + "serialized_space_config = {\n", + " \"version\": 1,\n", + " \"config\": {},\n", + " \"data_sources\": {}\n", + "}\n", + "\n", + "# Add sample instructions if requested\n", + "if include_instructions == \"Yes\":\n", + " serialized_space_config[\"config\"][\"instructions\"] = [\n", + " \"Always provide clear and concise answers\",\n", + " \"When showing data, include relevant context\",\n", + " \"Explain any assumptions made in your analysis\"\n", + " ]\n", + " print(\"✓ Sample instructions included\")\n", + "else:\n", + " print(\"ℹ️ No sample instructions - using minimal configuration\")\n", + "\n", + "# Attach the tables from the \"table_identifiers\" widget so they become part of the created space\n", + "# (de-duplicated and sorted so the serialized config is deterministic)\n", + "tables_list = sorted({t.strip() for t in table_identifiers.split(\",\") if t.strip()})\n", + "if tables_list:\n", + "    serialized_space_config[\"data_sources\"][\"tables\"] = [{\"identifier\": t} for t in tables_list]\n", + "    print(f\"✓ {len(tables_list)} table(s) added to data_sources\")\n", + "\n", + "# Convert to JSON string (Genie API expects serialized_space as a JSON string)\n", + "serialized_space = json.dumps(serialized_space_config)\n", + "\n", + "print(\"\\n📋 Serialized Space Configuration:\")\n", + "print(json.dumps(serialized_space_config, indent=2))\n", + "print(\"\\n📝 Serialized as string:\")\n", + "print(serialized_space)\n" + ] + 
}, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c3f07f47-af9c-42e4-8256-418c0ecbe1a2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Create the Genie Space\n", + "\n", + "Now let's use the SDK to create the new Genie space with our configuration.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9e5a15c3-a452-479d-8d1b-43a121d9e5e1", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "print(\"🚀 Creating Genie Space using Databricks SDK...\\n\")\n", + "\n", + "try:\n", + " # Create the Genie Space using the SDK\n", + " # The SDK handles authentication, retries, and error handling automatically\n", + " # Note: The parameter is 'title' not 'display_name'\n", + " created_space = w.genie.create_space(\n", + " title=space_title,\n", + " description=space_description if space_description else None,\n", + " warehouse_id=warehouse_id,\n", + " serialized_space=serialized_space\n", + " )\n", + " \n", + " # Success! 
Display the results\n", + " print(\"✅ Genie Space created successfully!\\n\")\n", + " print(\"=\" * 60)\n", + " print(f\"Space ID: {created_space.space_id}\")\n", + " print(f\"Title: {created_space.title}\")\n", + " print(f\"Warehouse ID: {created_space.warehouse_id}\")\n", + " if created_space.description:\n", + " print(f\"Description: {created_space.description}\")\n", + " if hasattr(created_space, 'created_timestamp'):\n", + " print(f\"Created At: {created_space.created_timestamp}\")\n", + " if hasattr(created_space, 'updated_timestamp'):\n", + " print(f\"Updated At: {created_space.updated_timestamp}\")\n", + " print(\"=\" * 60)\n", + " \n", + " # Store the space_id for future use in workflow task values\n", + " if created_space.space_id:\n", + " try:\n", + " dbutils.jobs.taskValues.set(key=\"genie_space_id\", value=created_space.space_id)\n", + " print(f\"\\n✓ Space ID saved to task values: {created_space.space_id}\")\n", + " except Exception:\n", + " # Task values might not be available if not running in a job\n", + " print(f\"\\nℹ️ Space ID: {created_space.space_id} (task values not available in interactive mode)\")\n", + " \n", + " # Display the full space object for reference\n", + " print(f\"\\n📄 Full Space Object:\")\n", + " print(f\"{created_space}\")\n", + " \n", + "except Exception as e:\n", + " print(f\"❌ Error creating Genie Space:\")\n", + " print(f\"Error: {str(e)}\")\n", + " raise\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3605a1f8-3c58-4bf7-9f0a-2ba769a05f1c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Advanced: Customize Data Sources and Sample Questions (Optional)\n", + "\n", + "You can create a more advanced configuration with specific tables and sample questions:\n", + "\n", + "```python\n", + "# Example with data sources and sample 
questions\n", + "advanced_config = {\n", + " \"version\": 1,\n", + " \"config\": {\n", + " \"instructions\": [\"Focus on sales metrics\", \"Compare period over period\"],\n", + " \"sample_questions\": [\n", + " {\n", + " \"id\": \"q1\",\n", + " \"question\": [\"Show total revenue by region\"]\n", + " },\n", + " {\n", + " \"id\": \"q2\",\n", + " \"question\": [\"What are the top 10 products by sales?\"]\n", + " }\n", + " ]\n", + " },\n", + " \"data_sources\": {\n", + " \"tables\": [\n", + " {\"identifier\": \"catalog.schema.sales_table\"},\n", + " {\"identifier\": \"catalog.schema.products_table\"}\n", + " ]\n", + " }\n", + "}\n", + "\n", + "# Use this config instead of the simple one above\n", + "# serialized_space = json.dumps(advanced_config)\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e3ff7b41-582d-468a-a228-dae0285808a1", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Additional SDK Operations\n", + "\n", + "The Databricks SDK also supports other Genie operations like listing, getting, updating, and deleting spaces.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3e2712b5-b197-4022-a026-41580095d46b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# List all Genie spaces\n", + "all_spaces_response = w.genie.list_spaces()\n", + "# Fall back to an empty list so the loop below is safe when no spaces are returned\n", + "# NOTE(review): assumes `spaces` can be None on an empty response - confirm against the SDK\n", + "all_spaces = all_spaces_response.spaces or [] # Access the list attribute\n", + "\n", + "print(\"📋 All Genie Spaces in this workspace:\")\n", + "print(\"-\" * 80)\n", + "for space in all_spaces:\n", + " print(f\"Name: {space.title}\")\n", + " print(f\"Space ID: {space.space_id}\")\n", + " 
print(f\"Warehouse ID: {space.warehouse_id}\")\n", + " print(\"-\" * 80)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cdc394cb-f07b-4404-902a-26f8fe8a7ad4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Clean Up Widgets (Optional)\n", + "\n", + "Run this cell to remove all widgets when you're done.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c07930ae-768d-425d-8aba-19d646c46155", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Uncomment to remove all widgets\n", + "# dbutils.widgets.removeAll()\n", + "# print(\"All widgets removed\")\n" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "Create Genie Space with Widgets", + "widgets": { + "include_sample_instructions": { + "currentValue": "Yes", + "nuid": "1c3a9d0c-ef24-4807-9f8d-acffb1ad246d", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "Yes", + "label": "5. Include Sample Instructions", + "name": "include_sample_instructions", + "options": { + "choices": [ + "Yes", + "No" + ], + "fixedDomain": true, + "multiselect": false, + "widgetDisplayType": "Dropdown" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "Yes", + "label": "5. 
Include Sample Instructions", + "name": "include_sample_instructions", + "options": { + "autoCreated": null, + "choices": [ + "Yes", + "No" + ], + "widgetType": "dropdown" + }, + "widgetType": "dropdown" + } + }, + "space_description": { + "currentValue": "", + "nuid": "deb58081-6d4f-4a08-93b1-a44372aee743", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "", + "label": "2. Space Description (Optional)", + "name": "space_description", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "", + "label": "2. Space Description (Optional)", + "name": "space_description", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "space_title": { + "currentValue": "My New Genie Space", + "nuid": "4030dbbe-93e9-444a-8e01-0ce1ad5db41a", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "My New Genie Space", + "label": "1. Space Title", + "name": "space_title", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "My New Genie Space", + "label": "1. Space Title", + "name": "space_title", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "table_identifiers": { + "currentValue": "", + "nuid": "ca3932a1-b03c-4fde-9c0c-5f6ce859cc0a", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "", + "label": "4. Tables (Optional - comma separated)", + "name": "table_identifiers", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "", + "label": "4. 
Tables (Optional - comma separated)", + "name": "table_identifiers", + "options": { + "autoCreated": false, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + }, + "warehouse_id": { + "currentValue": "", + "nuid": "7ab5aa3a-d4e3-448d-9a68-59fd05661477", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "", + "label": "3. Warehouse ID", + "name": "warehouse_id", + "options": { + "validationRegex": null, + "widgetDisplayType": "Text" + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "defaultValue": "", + "label": "3. Warehouse ID", + "name": "warehouse_id", + "options": { + "autoCreated": null, + "validationRegex": null, + "widgetType": "text" + }, + "widgetType": "text" + } + } + } + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}