diff --git a/demo/README.md b/demo/README.md index f39f35d..409dad4 100644 --- a/demo/README.md +++ b/demo/README.md @@ -48,7 +48,7 @@ This Demo launches Bronze and Silver pipelines with following activities: 7. ```commandline python demo/launch_dais_demo.py --uc_catalog_name=<> --profile=<> ``` - - uc_catalog_name : Unity catalog name + - uc_catalog_name : UC catalog name. Names that are valid non-delimited identifiers (ASCII letters, digits, underscores, not starting with a digit) are used as-is. Names containing other characters are automatically wrapped in backticks as delimited identifiers. - you can provide `--profile=databricks_profile name` in case you already have databricks cli otherwise command prompt will ask host and token. ![dais_demo.png](../docs/static/images/dais_demo.png) @@ -86,7 +86,7 @@ This demo will launch auto generated tables(100s) inside single bronze and silve 7. ```commandline python demo/launch_techsummit_demo.py --uc_catalog_name=<> --profile=<> ``` - - uc_catalog_name : Unity catalog name + - uc_catalog_name : UC catalog name. Names that are valid non-delimited identifiers (ASCII letters, digits, underscores, not starting with a digit) are used as-is. Names containing other characters are automatically wrapped in backticks as delimited identifiers. - you can provide `--profile=databricks_profile name` in case you already have databricks cli otherwise command prompt will ask host and token ![tech_summit_demo.png](../docs/static/images/tech_summit_demo.png) @@ -128,7 +128,7 @@ This demo will perform following tasks: 7. ```commandline python demo/launch_af_cloudfiles_demo.py --uc_catalog_name=<> --source=cloudfiles --profile=<> ``` - - uc_catalog_name : Unity Catalog name + - uc_catalog_name : UC catalog name. Names that are valid non-delimited identifiers (ASCII letters, digits, underscores, not starting with a digit) are used as-is. Names containing other characters are automatically wrapped in backticks as delimited identifiers. 
- you can provide `--profile=databricks_profile name` in case you already have databricks cli otherwise command prompt will ask host and token ![af_am_demo.png](../docs/static/images/af_am_demo.png) @@ -178,7 +178,7 @@ This demo will perform following tasks: - Create databricks secrets to store producer and consumer keys using the scope created in step 2 - Following are the mandatory arguments for running EventHubs demo - - uc_catalog_name : unity catalog name e.g. ravi_dlt_meta_uc + - uc_catalog_name : UC catalog name, e.g. ravi_dlt_meta_uc. Names that are valid non-delimited identifiers (ASCII letters, digits, underscores, not starting with a digit) are used as-is. Names containing other characters are automatically wrapped in backticks as delimited identifiers. - eventhub_namespace: Eventhub namespace e.g. dltmeta - eventhub_name : Primary Eventhubname e.g. dltmeta_demo - eventhub_name_append_flow: Secondary eventhub name for appendflow feed e.g. dltmeta_demo_af @@ -232,6 +232,7 @@ This demo will perform following tasks: python demo/launch_silver_fanout_demo.py --source=cloudfiles --uc_catalog_name=<> --profile=<> ``` + - uc_catalog_name : UC catalog name. Names that are valid non-delimited identifiers (ASCII letters, digits, underscores, not starting with a digit) are used as-is. Names containing other characters are automatically wrapped in backticks as delimited identifiers. - you can provide `--profile=databricks_profile name` in case you already have databricks cli otherwise command prompt will ask host and token. a. Databricks Workspace URL: @@ -296,6 +297,9 @@ This demo will perform following tasks: ```commandline python demo/launch_acfs_demo.py --uc_catalog_name=<> --profile=<> ``` + - uc_catalog_name : UC catalog name. Names that are valid non-delimited identifiers (ASCII letters, digits, underscores, not starting with a digit) are used as-is. Names containing other characters are automatically wrapped in backticks as delimited identifiers. 
+ - you can provide `--profile=databricks_profile name` in case you already have databricks cli otherwise command prompt will ask host and token. + ![acfs.png](../docs/static/images/acfs.png) # Lakeflow Declarative Pipelines Sink Demo @@ -350,6 +354,9 @@ This demo will perform following tasks: ```commandline python demo/launch_dlt_sink_demo.py --uc_catalog_name=<> --source=kafka --kafka_source_topic=<>>> --kafka_sink_topic=<> --kafka_source_servers_secrets_scope_name=<> --kafka_source_servers_secrets_scope_key=<> --kafka_sink_servers_secret_scope_name=<> --kafka_sink_servers_secret_scope_key=<> --profile=<> ``` + - uc_catalog_name : UC catalog name. Names that are valid non-delimited identifiers (ASCII letters, digits, underscores, not starting with a digit) are used as-is. Names containing other characters are automatically wrapped in backticks as delimited identifiers. + - you can provide `--profile=databricks_profile name` in case you already have databricks cli otherwise command prompt will ask host and token. + ![dlt_demo_sink.png](../docs/static/images/dlt_demo_sink.png) ![dlt_delta_sink.png](../docs/static/images/dlt_delta_sink.png) ![dlt_kafka_sink.png](../docs/static/images/dlt_kafka_sink.png) @@ -406,6 +413,8 @@ This demo will perform following steps: ```commandline python demo/generate_dabs_resources.py --source=cloudfiles --uc_catalog_name= --profile= ``` + > Note: If uc_catalog_name contains characters not valid for a non-delimited identifier, it is automatically wrapped in backticks as a delimited identifier. + > Note: If you don't specify `--profile`, you'll be prompted for your Databricks workspace URL and access token. 7. Deploy and run the DAB bundle: diff --git a/demo/generate_dabs_resources.py b/demo/generate_dabs_resources.py index f18e4f8..b891346 100644 --- a/demo/generate_dabs_resources.py +++ b/demo/generate_dabs_resources.py @@ -48,10 +48,11 @@ def init_runner_conf(self) -> DLTMetaRunnerConf: The initialized runner configuration. 
""" run_id = uuid.uuid4().hex + uc_catalog_name = self.validate_uc_catalog_name(self.args.get("uc_catalog_name")) runner_conf = DLTMetaRunnerConf( run_id=run_id, username=self.wsi._my_username, - uc_catalog_name=self.args["uc_catalog_name"], + uc_catalog_name=uc_catalog_name, int_tests_dir="demo/dabs", dlt_meta_schema=f"dlt_meta_dataflowspecs_demo_{run_id}", bronze_schema=f"dlt_meta_bronze_demo_{run_id}", diff --git a/demo/launch_acfs_demo.py b/demo/launch_acfs_demo.py index bd28d5e..c1e90d2 100644 --- a/demo/launch_acfs_demo.py +++ b/demo/launch_acfs_demo.py @@ -54,7 +54,7 @@ def init_runner_conf(self) -> DLTMetaRunnerConf: onboarding_file_path="demo/conf/onboarding.json", env="demo" ) - runner_conf.uc_catalog_name = self.args['uc_catalog_name'] + runner_conf.uc_catalog_name = self.validate_uc_catalog_name(self.args.get('uc_catalog_name')) return runner_conf def launch_workflow(self, runner_conf: DLTMetaRunnerConf): diff --git a/demo/launch_af_cloudfiles_demo.py b/demo/launch_af_cloudfiles_demo.py index 98fdcbf..d501619 100644 --- a/demo/launch_af_cloudfiles_demo.py +++ b/demo/launch_af_cloudfiles_demo.py @@ -45,10 +45,11 @@ def init_runner_conf(self) -> DLTMetaRunnerConf: The initialized runner configuration. 
""" run_id = uuid.uuid4().hex + uc_catalog_name = self.validate_uc_catalog_name(self.args.get("uc_catalog_name")) runner_conf = DLTMetaRunnerConf( run_id=run_id, username=self.wsi._my_username, - uc_catalog_name=self.args["uc_catalog_name"], + uc_catalog_name=uc_catalog_name, int_tests_dir="demo", dlt_meta_schema=f"dlt_meta_dataflowspecs_demo_{run_id}", bronze_schema=f"dlt_meta_bronze_demo_{run_id}", diff --git a/demo/launch_af_eventhub_demo.py b/demo/launch_af_eventhub_demo.py index 9b16f24..eada96e 100644 --- a/demo/launch_af_eventhub_demo.py +++ b/demo/launch_af_eventhub_demo.py @@ -70,7 +70,7 @@ def init_runner_conf(self) -> DLTMetaRunnerConf: eventhub_namespace=self.args["eventhub_namespace"], eventhub_port=self.args["eventhub_port"] ) - runner_conf.uc_catalog_name = self.args['uc_catalog_name'] + runner_conf.uc_catalog_name = self.validate_uc_catalog_name(self.args.get('uc_catalog_name')) runner_conf.runners_full_local_path = 'demo/notebooks/afam_eventhub_runners' return runner_conf diff --git a/demo/launch_dais_demo.py b/demo/launch_dais_demo.py index ad5c984..ce01fd3 100644 --- a/demo/launch_dais_demo.py +++ b/demo/launch_dais_demo.py @@ -50,8 +50,9 @@ def init_runner_conf(self) -> DLTMetaRunnerConf: # runners_full_local_path='./demo/dbc/dais_dlt_meta_runners.dbc', onboarding_file_path='demo/conf/onboarding.json' ) - if self.args['uc_catalog_name']: - runner_conf.uc_catalog_name = self.args['uc_catalog_name'] + uc_catalog_name = self.args.get('uc_catalog_name') + if uc_catalog_name: + runner_conf.uc_catalog_name = self.validate_uc_catalog_name(uc_catalog_name) runner_conf.uc_volume_name = f"{runner_conf.uc_catalog_name}_dais_demo_{run_id}" return runner_conf diff --git a/demo/launch_dlt_sink_demo.py b/demo/launch_dlt_sink_demo.py index 2fb5308..371adca 100644 --- a/demo/launch_dlt_sink_demo.py +++ b/demo/launch_dlt_sink_demo.py @@ -45,10 +45,11 @@ def init_runner_conf(self) -> DLTMetaRunnerConf: The initialized runner configuration. 
""" run_id = uuid.uuid4().hex + uc_catalog_name = self.validate_uc_catalog_name(self.args.get("uc_catalog_name")) runner_conf = DLTMetaRunnerConf( run_id=run_id, username=self.wsi._my_username, - uc_catalog_name=self.args["uc_catalog_name"], + uc_catalog_name=uc_catalog_name, int_tests_dir="demo", dlt_meta_schema=f"dlt_meta_dataflowspecs_demo_{run_id}", bronze_schema=f"dlt_meta_bronze_demo_{run_id}", diff --git a/demo/launch_silver_fanout_demo.py b/demo/launch_silver_fanout_demo.py index d075f27..9c6bc19 100644 --- a/demo/launch_silver_fanout_demo.py +++ b/demo/launch_silver_fanout_demo.py @@ -82,7 +82,7 @@ def init_runner_conf(self) -> DLTMetaRunnerConf: onboarding_fanout_file_path="demo/conf/onboarding_fanout_cars.json", env="demo" ) - runner_conf.uc_catalog_name = self.args['uc_catalog_name'] + runner_conf.uc_catalog_name = self.validate_uc_catalog_name(self.args.get('uc_catalog_name')) runner_conf.uc_volume_name = f"{runner_conf.uc_catalog_name}_dlt_meta_fout_demo_{run_id}" return runner_conf diff --git a/demo/launch_techsummit_demo.py b/demo/launch_techsummit_demo.py index 94d76e2..53ed41b 100644 --- a/demo/launch_techsummit_demo.py +++ b/demo/launch_techsummit_demo.py @@ -100,9 +100,10 @@ def init_runner_conf(self) -> TechsummitRunnerConf: and self.args.__dict__['table_data_rows_count'] else "10"), ) - if self.args['uc_catalog_name']: - runner_conf.uc_catalog_name = self.args['uc_catalog_name'] - runner_conf.uc_volume_name = f"{self.args['uc_catalog_name']}_volume_{run_id}" + uc_catalog_name = self.args.get('uc_catalog_name') + if uc_catalog_name: + runner_conf.uc_catalog_name = self.validate_uc_catalog_name(uc_catalog_name) + runner_conf.uc_volume_name = f"{runner_conf.uc_catalog_name}_volume_{run_id}" return runner_conf def create_bronze_silver_dlt(self, runner_conf: DLTMetaRunnerConf): diff --git a/integration_tests/run_integration_tests.py b/integration_tests/run_integration_tests.py index 86dc94f..531a68a 100644 --- 
def validate_uc_catalog_name(name) -> str:
    """Validate a Unity Catalog name and return it as a usable identifier.

    A name made only of ASCII letters, digits and underscores that does not
    start with a digit is returned bare. Any other name is returned as a
    delimited identifier: wrapped in backticks, with every embedded backtick
    doubled so the result stays a single well-formed identifier.

    Normalization applied before the check:

    * Leading/trailing whitespace is removed, so padded CLI input such as
      ``" my_catalog "`` is not turned into a padded delimited name.
    * Input that is already wrapped in a pair of backticks is unwrapped and
      its doubled backticks are unescaped, which makes the function
      idempotent (validating an already validated name returns it unchanged).
    * Stray, unbalanced leading/trailing backticks are dropped.

    Args:
        name: The catalog name to validate; ``None`` is rejected.

    Returns:
        The bare name, or the backtick-delimited name when delimiting is
        required.

    Raises:
        ValueError: If the name is ``None``, blank, or consists only of
            backticks (for example an empty delimited identifier).
    """
    # Local import keeps the function usable regardless of module-level imports.
    import re

    if name is None:
        raise ValueError("'uc_catalog_name' is required but was not provided.")

    candidate = name.strip()
    if not candidate:
        raise ValueError("'uc_catalog_name' must not be empty.")

    already_delimited = (
        len(candidate) >= 2
        and candidate.startswith('`')
        and candidate.endswith('`')
    )
    if already_delimited:
        # Inside a delimited identifier a literal backtick is written doubled.
        bare = candidate[1:-1].replace('``', '`')
    else:
        bare = candidate.strip('`')
    if not bare:
        raise ValueError("'uc_catalog_name' must not be empty.")

    # fullmatch, unlike match() with '$', does not accept a trailing newline.
    if re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', bare):
        return bare
    return '`' + bare.replace('`', '``') + '`'
# Methods of CliTests (tests/test_cli.py) covering uc_catalog_name handling.

def _assert_uc_catalog_name_normalized(self, build_command):
    """Check uc_catalog_name normalization on a command built by *build_command*.

    Args:
        build_command: Callable taking a catalog name and returning a command
            object that exposes the normalized ``uc_catalog_name`` attribute.
    """
    delimit_cases = {
        "my-catalog": "`my-catalog`",
        "1starts_with_digit": "`1starts_with_digit`",
        "has space": "`has space`",
        "dot.name": "`dot.name`",
        "sp@cial": "`sp@cial`",
        # Input that is already delimited must not be wrapped a second time.
        "`my-catalog`": "`my-catalog`",
    }
    for name, expected in delimit_cases.items():
        # subTest reports every failing name instead of stopping at the first.
        with self.subTest(name=name):
            self.assertEqual(build_command(name).uc_catalog_name, expected)

    # Valid non-delimited names should stay unchanged.
    for name in ("my_catalog", "Catalog1", "_private", "ABC_123"):
        with self.subTest(name=name):
            self.assertEqual(build_command(name).uc_catalog_name, name)


def test_onboard_command_uc_catalog_name_auto_delimit(self):
    """Test OnboardCommand auto-wraps non-standard uc_catalog_name with backticks."""
    def build_onboard_command(name):
        return OnboardCommand(
            onboarding_file_path="tests/resources/onboarding.json",
            onboarding_files_dir_path="tests/resources/",
            onboard_layer="bronze",
            env="dev",
            import_author="John Doe",
            version="1.0",
            dlt_meta_schema="dlt_meta",
            uc_enabled=True,
            uc_catalog_name=name,
            serverless=True,
            bronze_dataflowspec_table="bronze_dataflowspec",
            overwrite=True,
        )

    self._assert_uc_catalog_name_normalized(build_onboard_command)


def test_deploy_command_uc_catalog_name_auto_delimit(self):
    """Test DeployCommand auto-wraps non-standard uc_catalog_name with backticks."""
    def build_deploy_command(name):
        return DeployCommand(
            layer="bronze",
            onboard_bronze_group="A1",
            dlt_meta_bronze_schema="dlt_meta",
            dataflowspec_bronze_table="dataflowspec_table",
            pipeline_name="test_pipeline",
            dlt_target_schema="target_schema",
            uc_enabled=True,
            uc_catalog_name=name,
            serverless=True,
        )

    self._assert_uc_catalog_name_normalized(build_deploy_command)


def test_validate_uc_catalog_name_empty_and_none(self):
    """Test validate_uc_catalog_name raises on None and empty strings."""
    for bad_name in (None, "", " ", "``"):
        with self.subTest(name=bad_name):
            with self.assertRaises(ValueError):
                validate_uc_catalog_name(bad_name)