diff --git a/Diagnostic/LogAnalytics/Azure_Synapse_Spark_Application.workbook b/Diagnostic/LogAnalytics/Azure_Synapse_Spark_Application.workbook index 8ab6efd..9da139e 100644 --- a/Diagnostic/LogAnalytics/Azure_Synapse_Spark_Application.workbook +++ b/Diagnostic/LogAnalytics/Azure_Synapse_Spark_Application.workbook @@ -109,7 +109,8 @@ "type": 2, "description": "Spark Application Livy Id and Name", "isRequired": true, - "query": "SparkMetrics_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and isnotempty(livyId_s)\r\n| summarize by livyId_s, applicationName_s\r\n| order by livyId_s desc\r\n| extend applicationName = substring(applicationName_s, 0, strlen(applicationName_s) - 1)\r\n| project value = livyId_s, label = strcat(livyId_s, \" | \", applicationName), selected = false", + "query": "SparkMetrics_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and isnotempty(livyId_s)\r\n| extend applicationName = column_ifexists(\"applicationName_s\", applicationId_s)\r\n| summarize by livyId_s, applicationName\r\n| order by livyId_s desc\r\n| project value = livyId_s, label = strcat(livyId_s, \" | \", applicationName), selected = false", + "value": null, "typeSettings": { "additionalResourceOptions": [], "showDefault": false @@ -182,7 +183,7 @@ "id": "9246e88b-9682-498c-b440-f53a15b6e481", "cellValue": "selectedTab", "linkTarget": "parameter", - "linkLabel": "Spark Streaming", + "linkLabel": "Streaming", "subTarget": "Streaming", "preText": "Metrics", "postText": "1", @@ -255,7 +256,7 @@ "additionalResourceOptions": [ "value::all" ], - "selectAllValue": "ALL", + "selectAllValue": "All", "showDefault": false }, "timeContext": { @@ -277,7 +278,7 @@ "comparison": "isEqualTo", "value": "Logs" }, - "name": "CustomMetricsSelector - Copy" + "name": "Logs Filter Dropdown" }, { "type": 3, @@ -566,7 +567,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let Data = SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\";\r\n\r\nunion\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, InputRows=tolong(progress_sources.numInputRows)\r\n| summarize Value=sum(InputRows)\r\n| extend Name=\"Total Input Rows\", Order=0\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Progress Events\", Order=1\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryStartedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Starts\", Order=2\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminatedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Terminateds\", Order=3\r\n)\r\n", + "query": "let Data = SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\";\r\n\r\nunion\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, 
InputRows=tolong(progress_sources.numInputRows)\r\n| summarize Value=sum(InputRows)\r\n| extend Name=\"Total Input Rows\", Order=0\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Progress Events\", Order=1\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryStartedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Starts\", Order=2\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminatedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Terminateds\", Order=3\r\n)\r\n", "size": 4, "showAnnotations": true, "showAnalytics": true, @@ -1637,7 +1638,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), ProcessedRowsPerSecond=todouble(progress_sources.processedRowsPerSecond)\r\n| summarize Value=sum(ProcessedRowsPerSecond) by TimeGenerated, Description\r\n| order by TimeGenerated asc\r\n\r\n// InputRowsPerSecond=todouble(progress_sources.inputRowsPerSecond), ", + "query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), ProcessedRowsPerSecond=todouble(progress_sources.processedRowsPerSecond)\r\n| summarize Value=sum(ProcessedRowsPerSecond) by TimeGenerated, Description\r\n| order by TimeGenerated asc\r\n\r\n// InputRowsPerSecond=todouble(progress_sources.inputRowsPerSecond), ", "size": 1, "aggregation": 5, "showAnalytics": true, @@ -1664,7 +1665,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), InputRows=tolong(progress_sources.numInputRows)\r\n| summarize sum(InputRows)/count() by bin(TimeGenerated, timespan({Interval})), Description\r\n| order by TimeGenerated asc", + "query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), InputRows=tolong(progress_sources.numInputRows)\r\n| summarize sum(InputRows)/count() by 
bin(TimeGenerated, timespan({Interval})), Description\r\n| order by TimeGenerated asc", "size": 1, "aggregation": 5, "showAnalytics": true, diff --git a/Diagnostic/SparkDiagnosticEmitter/README.md b/Diagnostic/SparkDiagnosticEmitter/README.md new file mode 100644 index 0000000..81d09c1 --- /dev/null +++ b/Diagnostic/SparkDiagnosticEmitter/README.md @@ -0,0 +1,8 @@ +# Synapse Spark Diagnostic Emitter Configuration Samples + +## Introduction + +The Azure Synapse Spark diagnostic emitter extension is a library which enables Spark application to emit the logs, event logs and metrics to one or more destinations, +including Azure Log Analytics, Azure Storage and Azure EventHub. + +The sample templates in this repo are designed to help users quickly enable this feature. \ No newline at end of file diff --git a/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-event-hub-conf.txt b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-event-hub-conf.txt new file mode 100644 index 0000000..c21e979 --- /dev/null +++ b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-event-hub-conf.txt @@ -0,0 +1,5 @@ +spark.synapse.diagnostic.emitters MyDestination1 +spark.synapse.diagnostic.emitter.MyDestination1.type AzureEventHub +spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName \ No newline at end of file diff --git a/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-log-analytics-conf.txt b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-log-analytics-conf.txt new file mode 100644 index 0000000..d94082c --- /dev/null +++ b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-log-analytics-conf.txt @@ -0,0 +1,6 @@ +spark.synapse.diagnostic.emitters MyDestination1 +spark.synapse.diagnostic.emitter.MyDestination1.type AzureLogAnalytics +spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics +spark.synapse.diagnostic.emitter.MyDestination1.workspaceId +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName \ No newline at end of file diff --git a/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-storage-conf.txt b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-storage-conf.txt new file mode 100644 index 0000000..1f0e815 --- /dev/null +++ b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-storage-conf.txt @@ -0,0 +1,6 @@ +spark.synapse.diagnostic.emitters MyDestination1 +spark.synapse.diagnostic.emitter.MyDestination1.type AzureStorage +spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics +spark.synapse.diagnostic.emitter.MyDestination1.uri https://.blob.core.windows.net// +spark.synapse.diagnostic.emitter.MyDestination1.auth AccessKey +spark.synapse.diagnostic.emitter.MyDestination1.secret \ No newline at end of file diff --git a/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-multi-destination-conf.txt b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-multi-destination-conf.txt new file mode 100644 index 0000000..065ccda --- /dev/null +++ b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-multi-destination-conf.txt @@ -0,0 +1,15 @@ +spark.synapse.diagnostic.emitters MyDestination1,MyDestination2 + +spark.synapse.diagnostic.emitter.MyDestination1.type AzureStorage 
+spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics +spark.synapse.diagnostic.emitter.MyDestination1.uri https://.blob.core.windows.net// +spark.synapse.diagnostic.emitter.MyDestination1.auth AccessKey +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName + +spark.synapse.diagnostic.emitters MyDestination2 +spark.synapse.diagnostic.emitter.MyDestination2.type AzureLogAnalytics +spark.synapse.diagnostic.emitter.MyDestination2.categories Log,EventLog,Metrics +spark.synapse.diagnostic.emitter.MyDestination2.workspaceId +spark.synapse.diagnostic.emitter.MyDestination2.secret.keyVault +spark.synapse.diagnostic.emitter.MyDestination2.secret.keyVault.secretName \ No newline at end of file diff --git a/Notebooks/PySpark/05 Using Azure Open Datasets in Synapse.ipynb b/Notebooks/PySpark/05 Using Azure Open Datasets in Synapse.ipynb index 4c356da..b3fda51 100644 --- a/Notebooks/PySpark/05 Using Azure Open Datasets in Synapse.ipynb +++ b/Notebooks/PySpark/05 Using Azure Open Datasets in Synapse.ipynb @@ -270,7 +270,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Enrich with weather data¶\n", + "## Enrich with weather data\n", "\n", "Now we append NOAA surface weather data to the taxi and holiday data. Use a similar approach to fetch the [NOAA weather history data](https://azure.microsoft.com/en-us/services/open-datasets/catalog/noaa-integrated-surface-data/) from Azure Open Datasets. " ], diff --git a/Notebooks/PySpark/07 Data Exploration and ML Modeling - NYC taxi predict using Spark MLlib.ipynb b/Notebooks/PySpark/07 Data Exploration and ML Modeling - NYC taxi predict using Spark MLlib.ipynb index 24bca0c..80b99f9 100644 --- a/Notebooks/PySpark/07 Data Exploration and ML Modeling - NYC taxi predict using Spark MLlib.ipynb +++ b/Notebooks/PySpark/07 Data Exploration and ML Modeling - NYC taxi predict using Spark MLlib.ipynb @@ -48,7 +48,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Ingest Data¶ \n", + "## Ingest Data\n", "\n", "Get a sample data of nyc yellow taxi to make it faster/easier to evaluate different approaches to prep for the modelling phase later in the notebook." 
], diff --git a/Notebooks/PySpark/Synapse Link for Cosmos DB samples/Retail/spark-notebooks/pyspark/2SalesForecastingWithAML.ipynb b/Notebooks/PySpark/Synapse Link for Cosmos DB samples/Retail/spark-notebooks/pyspark/2SalesForecastingWithAML.ipynb index f79e97c..7030b59 100644 --- a/Notebooks/PySpark/Synapse Link for Cosmos DB samples/Retail/spark-notebooks/pyspark/2SalesForecastingWithAML.ipynb +++ b/Notebooks/PySpark/Synapse Link for Cosmos DB samples/Retail/spark-notebooks/pyspark/2SalesForecastingWithAML.ipynb @@ -444,16 +444,15 @@ "\n", "df_all = align_outputs(y_predictions, X_trans, X_test, y_test, target_column_name)\n", "\n", - "from azureml.automl.core._vendor.automl.client.core.common import metrics\n", + "from azureml.automl.runtime.shared.score import scoring\n", "from matplotlib import pyplot as plt\n", "from automl.client.core.common import constants\n", "\n", - "# use automl metrics module\n", - "scores = metrics.compute_metrics_regression(\n", - " df_all['predicted'],\n", - " df_all[target_column_name],\n", - " list(constants.Metric.SCALAR_REGRESSION_SET),\n", - " None, None, None)\n", + "# use automl scoring module\n", + "scores = scoring.score_regression(\n", + " y_test=df_all[target_column_name],\n", + " y_pred=df_all['predicted'],\n", + " metrics=list(constants.Metric.SCALAR_REGRESSION_SET))\n", "\n", "print(\"[Test data scores]\\n\")\n", "for key, value in scores.items(): \n", diff --git a/Notebooks/PySpark/exampleForParameter.ipynb b/Notebooks/PySpark/exampleForParameter.ipynb new file mode 100644 index 0000000..4b565b1 --- /dev/null +++ b/Notebooks/PySpark/exampleForParameter.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "code", + "source": [ + "account_name = '' # fill in your primary account name\r\n", + "container_name = '' # fill in your container name\r\n", + "relative_path = '' # fill in your relative folder path" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "sparkpool01", + "session_id": 0, + "statement_id": 3, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-09-17T06:08:57.5750046Z", + "session_start_time": null, + "execution_start_time": "2021-09-17T06:08:57.6725894Z", + "execution_finish_time": "2021-09-17T06:08:57.6728166Z" + }, + "text/plain": "StatementMeta(sparkpool01, 0, 3, Finished, Available)" + }, + "metadata": {} + } + ], + "execution_count": 3, + "metadata": { + "tags": [ + "parameters" + ] + } + }, + { + "cell_type": "code", + "source": [ + "from pyspark.sql import SparkSession\r\n", + "from pyspark.sql.types import *\r\n", + "\r\n", + "adls_path = 'abfss://%s@%s.dfs.core.windows.net/%s' % (container_name, account_name, relative_path)\r\n", + "print('Primary storage account path: ' + adls_path)" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "sparkpool01", + "session_id": 0, + "statement_id": 4, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-09-17T06:09:00.1736428Z", + "session_start_time": null, + "execution_start_time": "2021-09-17T06:09:00.273459Z", + "execution_finish_time": "2021-09-17T06:09:00.4351164Z" + }, + "text/plain": "StatementMeta(sparkpool01, 0, 4, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Primary storage account path: abfss://test0121@lilijing0227neur.dfs.core.windows.net/test01/" + ] + } + ], + 
"execution_count": 4, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + } + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "kernelspec": { + "name": "synapse_pyspark", + "language": "Python", + "display_name": "Synapse PySpark" + }, + "kernel_info": { + "name": "synapse_pyspark" + }, + "save_output": true, + "synapse_widget": { + "version": "0.1", + "state": {} + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/Notebooks/PySpark/parameters/exampleForParameter.json b/Notebooks/PySpark/parameters/exampleForParameter.json new file mode 100644 index 0000000..308ca6d --- /dev/null +++ b/Notebooks/PySpark/parameters/exampleForParameter.json @@ -0,0 +1,5 @@ +{ + "Your primary storage account name": "nbsampletestworkspace01", + "Your container name": "test0922", + "Your relative path": "testCSV/" +} \ No newline at end of file diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb new file mode 100644 index 0000000..b2e6c7f --- /dev/null +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -0,0 +1,493 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "## Migrate schema from Snowflake to Synapse SQL Dedicated Pool\r\n", + "\r\n", + "---\r\n", + "\r\n", + "### This notebooks demos how to connect to Snowflake, read data from a schema named INFORMATION_SCHEMA, gather the list of the tables for the given schema and move those tables to a Synapse SQL dedicated pool.\r\n", + "\r\n", + "
    \r\n", + "
  • Define connection source
  • \r\n", + "
  • Specify connection options for the Snowflake instance
  • \r\n", + "
  • Read Snowflake Information_schema.tables to compile list of tables for our schema\r\n", + "
      \r\n", + "
    • Read each Snowflake table into into Spark DataFrame
    • \r\n", + "
    • Write DataFrame to table to Synapse SQL Dedicated pool\r\n", + "
    \r\n", + "
  • \r\n", + "
\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n" + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "// To use Snowflake as a data source in Spark, use the .format option to provide the Snowflake connector class name that defines the data source.\r\n", + "// Please note that you need to add spark-snowflake_2.12-2.9.0-spark_3.1.jar and snowflake-jdbc-3.13.6.jar to workspace packages as well as to cluster/session packages\r\n", + "// You can download those jar files from https://mvnrepository.com/artifact/net.snowflake/spark-snowflake?repo=snowflakedb and https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc respectevly\r\n", + "// You can find instructions how to add customized jars to cluster/session packages at https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-manage-scala-packages\r\n", + "\r\n", + "import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME\r\n", + "val SNOWFLAKE_SOURCE_NAME = \"net.snowflake.spark.snowflake\"" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "DemoCluster", + "session_id": 17, + "statement_id": 1, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-09T14:14:57.9353514Z", + "session_start_time": "2021-08-09T14:14:57.98765Z", + "execution_start_time": "2021-08-09T14:17:14.6699123Z", + "execution_finish_time": "2021-08-09T14:17:15.2227281Z" + }, + "text/plain": "StatementMeta(DemoCluster, 17, 1, Finished, Available)" + }, + "metadata": {} + } + ], + "execution_count": 1, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// setting default paramter \r\n", + "\r\n", + "val sfschema=\"existing schema\"" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "tags": [ + "parameters" + ] + } + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "// To not expose snowflake credentials, it is best practice to store user, password and account in Azure Key Vault service in your suscription\r\n", + "// Please reference https://docs.microsoft.com/en-us/azure/data-factory/store-credentials-in-key-vault how to set up secrets in Azure Key Vault service\r\n", + "// Please note that you need to link your Azure Key Vault service (AKV) to your Synapse workspace \r\n", + "// mssparkutils package let you retrive your secrets from AKV\r\n", + "\r\n", + "val user = mssparkutils.credentials.getSecret(\"Azure Key Vault name \", \"secret name for user\",\"linked service name\")\r\n", + "val password = mssparkutils.credentials.getSecret(\"Azure Key Vault name \", \"secret name for password\",\"linked service name\")\r\n", + "val account = mssparkutils.credentials.getSecret(\"Azure Key Vault name \", \"secret name for account\",\"linked service name\")\r\n", + "val account_URL = \"https://\" + account + \".azure.snowflakecomputing.com\"" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 2, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T17:58:15.5479493Z", + 
"session_start_time": null, + "execution_start_time": "2021-08-04T17:58:15.7178403Z", + "execution_finish_time": "2021-08-04T17:58:23.2651865Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 2, Finished, Available)" + }, + "metadata": {} + } + ], + "execution_count": 79, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// set up options to connect to Snowflake schema public on TESTDB database\r\n", + " \r\n", + "val sfoptions = Map( \r\n", + " \"sfUrl\" -> account_URL,\r\n", + " \"sfUser\"->user,\r\n", + " \"sfPassword\"-> password,\r\n", + " \"sfDatabase\"-> \"TESTDB\",\r\n", + " \"sfSchema\"-> \"PUBLIC\",\r\n", + " \"sfWarehouse\"-> \"COMPUTE_WH\"\r\n", + ")" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 3, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T17:58:26.7006748Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T17:58:26.8059265Z", + "execution_finish_time": "2021-08-04T17:58:28.1653268Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 3, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "sfoptions: scala.collection.immutable.Map[String,String] = Map(sfUrl -> https://yg58220.east-us-2.azure.snowflakecomputing.com, sfSchema -> PUBLIC, sfPassword -> , sfUser -> , sfWarehouse -> , sfDatabase -> TESTDB)\n" + ] + } + ], + "execution_count": 80, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// Setup options to connect to schema INFORMATION_SCHEMA. That schema in Snowflake contains your database metadata\r\n", + " \r\n", + "val sfoptions1 = Map( \r\n", + " \"sfUrl\" -> account_URL,\r\n", + " \"sfUser\"->user,\r\n", + " \"sfPassword\"-> password,\r\n", + " \"sfDatabase\"-> \"TESTDB\",\r\n", + " \"sfSchema\"-> \"INFORMATION_SCHEMA\",\r\n", + " \"sfWarehouse\"-> \"COMPUTE_WH\"\r\n", + ")\r\n" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 4, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T17:58:34.7448636Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T17:58:34.8537732Z", + "execution_finish_time": "2021-08-04T17:58:36.0081044Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 4, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "sfoptions1: scala.collection.immutable.Map[String,String] = Map(sfUrl -> https://yg58220.east-us-2.azure.snowflakecomputing.com, sfSchema -> INFORMATION_SCHEMA, sfPassword -> , sfUser -> , sfWarehouse -> , sfDatabase -> TESTDB)\n" + ] + } + ], + "execution_count": 81, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true, + "tags": [] + } + }, + { + "cell_type": "code", + "source": [ + "// read table INFORMATION_SCHEMA.TABLES into a DataFrame. 
We need it to compile list of the tables within our schema\r\n", + "\r\n", + "val df_tl=spark.read.format( SNOWFLAKE_SOURCE_NAME ).options(sfoptions1).option(\"dbtable\",\"TABLES\").load()\r\n", + "//display(df_tl)" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 5, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T17:58:42.8787864Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T17:58:42.9973772Z", + "execution_finish_time": "2021-08-04T17:58:48.579489Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 5, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "StructuredStream-spark package version: 3.0.0-2.1.1\n", + "df_tl: org.apache.spark.sql.DataFrame = [TABLE_CATALOG: string, TABLE_SCHEMA: string ... 20 more fields]\n" + ] + } + ], + "execution_count": 82, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// For easy iteration, convert selected info from DataFrame to collection\r\n", + "val df_tab_list = df_tl.select(\"table_schema\", \"table_name\").filter(\"table_schema='PUBLIC'\").collect()\r\n", + "//println(df_tab_list)" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 9, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T18:02:25.0054968Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T18:02:25.209112Z", + "execution_finish_time": "2021-08-04T18:02:34.1314757Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 9, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "df_tab_list: Array[org.apache.spark.sql.Row] = Array([PUBLIC,CUSTOMERT], [PUBLIC,CUSTOMER_TEST], [PUBLIC,NATIONT], [PUBLIC,REGIONT])\n", + "[Lorg.apache.spark.sql.Row;@6d6b91a4\n" + ] + } + ], + "execution_count": 86, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Note: \r\n", + "We are using df.write.synapsesql method to populate table in SQL dedicated pool. If your target schema anything but \"dbo\", it need to be exist before.\r\n", + "At the same time this target schema should not have the table with name specified in this method. Here is stored procedure you can run to make sure that this requirement is met:\r\n", + "\r\n", + "```sql\r\n", + "CREATE PROCEDURE set_sfschema @schemaname sysname\r\n", + "AS BEGIN\r\n", + " DECLARE @cr_stmt NVARCHAR(200) = N'CREATE SCHEMA ' + @schemaname; \r\n", + " -- to imulate cursor processing\r\n", + " CREATE TABLE #temp_tbl\r\n", + " WITH\r\n", + " ( DISTRIBUTION = ROUND_ROBIN\r\n", + " )\r\n", + " AS \r\n", + " SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS Sequence,\r\n", + " table_schema , table_name ,\r\n", + " 'DROP TABLE ' + quotename(table_schema) + '.' 
+ quotename(table_name) as sql_code\r\n", + " from information_schema.tables WHERE table_schema = @schemaname ; \r\n", + " \r\n", + " DECLARE @nbr_statements INT = (SELECT COUNT(*) FROM #temp_tbl)\r\n", + " , @i INT = 1;\r\n", + "\r\n", + " IF (0 = (SELECT COUNT(*) FROM sys.schemas WHERE name = @schemaname))\r\n", + " BEGIN\r\n", + " EXEC sp_executesql @tsql = @cr_stmt;\r\n", + " END\r\n", + " ELSE \r\n", + " WHILE @i <= @nbr_statements\r\n", + " BEGIN\r\n", + " DECLARE @sql_code NVARCHAR(60) = (SELECT sql_code FROM #temp_tbl WHERE Sequence =@i);\r\n", + " EXEC sp_executesql @sql_code;\r\n", + " SET @i +=1;\r\n", + " END\r\n", + " DROP TABLE #temp_tbl; \r\n", + "END\r\n", + "GO\r\n", + "```\r\n", + "\r\n", + "" + ] + }, + { + "cell_type": "code", + "source": [ + "// For each table in the schema read data from Snowflake table into a DataFrame and write it to Synapse SQL Dedicated Pool.\r\n", + "\r\n", + "df_tab_list.foreach(row=>\r\n", + " {\r\n", + " val tname = row.getString(1) \r\n", + " //println(tname)\r\n", + " val df_temp=spark.read.format( SNOWFLAKE_SOURCE_NAME ).options(sfoptions).option(\"dbtable\",tname).load()\r\n", + " val target_table = \"SQLdedpool1.\" + sfschema + \".\" + tname\r\n", + " println(target_table)\r\n", + " df_temp.write.synapsesql(target_table, Constants.INTERNAL)\r\n", + " })" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 10, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T18:02:57.9134366Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T18:02:58.0345256Z", + "execution_finish_time": "2021-08-04T18:04:43.7619937Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 10, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CUSTOMERT\n", + "CUSTOMER_TEST\n", + "NATIONT\n", + "REGIONT\n" + ] + } + ], + "execution_count": 87, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + } + ], + "metadata": { + "kernelspec": { + "name": "synapse_spark", + "language": "Scala", + "display_name": "Synapse Spark" + }, + "language_info": { + "name": "scala" + }, + "kernel_info": { + "name": "synapse_spark" + }, + "save_output": true, + "synapse_widget": { + "version": "0.1", + "state": {} + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Notebooks/metadata.json b/Notebooks/metadata.json new file mode 100644 index 0000000..4c6871c --- /dev/null +++ b/Notebooks/metadata.json @@ -0,0 +1,13 @@ +{ + "data": [ + { + "title": "exampleForParameter", + "path": "PySpark\\exampleForParameter.ipynb", + "description": "This notebook provides examples to show how to handle parameters for automation test.", + "tags": ["Example"], + "types": ["PySpark"], + "categories": ["Example"], + "languages": ["PySpark"] + } + ] +} diff --git a/Pipelines/ImportADFtoSynapse/README.md b/Pipelines/ImportADFtoSynapse/README.md index c795dbe..68c1bc0 100644 --- a/Pipelines/ImportADFtoSynapse/README.md +++ b/Pipelines/ImportADFtoSynapse/README.md @@ -61,14 +61,12 @@ You can get the Resource IDs for Azure Data Factory and Azure Synapse Analytics. .\importADFtoSynapseTool.ps1 -ConfigFile appsettings.json ``` -[!NOTE] -* Existing resources in destination workspace with the same name will be overwritten. 
-* Same networking setting between Azure Data Factory and Synapse is required. +>[!NOTE] +> * Existing resources in destination workspace with the same name will be overwritten. +> * Same networking setting between Azure Data Factory and Synapse is required. e.g. Managed Virtual Network of Azure Data Factory is enabled on both Azure Data Factory and Synapse. -* The migration tool does not support migration of ADF SSIS pipeline -* Refer to the [Troubleshooting Guide](./Troubleshooting.md) if you run into any issues when using the migration PowerShell script - -* * * +> * The migration tool does not support migration of ADF SSIS pipeline +> * Refer to the [Troubleshooting Guide](./Troubleshooting.md) if you run into any issues when using the migration PowerShell script ## How do I exclude specific objects from my Data Factory source factory? This migration tool will migrate all objects from the published version of your factory that is live in Azure. You will have to remove any objects that you do not wish to migrate to your Synapse workspace. If you do not wish to modify your source ADF, then you should make a copy of the existing factory, remove the objects you do not wish to migrate, and use that new factory as your source. diff --git a/Pipelines/ImportADFtoSynapse/importADFtoSynapseTool.ps1 b/Pipelines/ImportADFtoSynapse/importADFtoSynapseTool.ps1 index 28bb84b..6c519b3 100644 --- a/Pipelines/ImportADFtoSynapse/importADFtoSynapseTool.ps1 +++ b/Pipelines/ImportADFtoSynapse/importADFtoSynapseTool.ps1 @@ -354,6 +354,16 @@ function ProcessResource { Write-Host "" Write-Host "Processing $resourceType" -ForegroundColor White $resourcesToBeCopied.AddRange($srcResponse.Value); + + while ($srcResponse.PSobject.Properties.Name.Contains("nextLink")) { + Write-Host "Processing next page $srcResponse.nextLink" + $nextLink = $srcResponse.nextLink + $srcResponse = Invoke-RestMethod -UseBasicParsing -Uri $nextLink -Method Get -ContentType "application/json" -Headers @{ Authorization = "Bearer $token"} + if ($srcResponse.Value.Length -gt 0) { + $resourcesToBeCopied.AddRange($srcResponse.Value); + } + } + WriteSuccessResponse(" Migrating $($resourcesToBeCopied.Count) $resourceType") } elseif($resourcesToBeCopied.Count -le 0) { diff --git a/README.md b/README.md index 90eb451..34de4e3 100644 --- a/README.md +++ b/README.md @@ -31,5 +31,9 @@ Shows .NET for Spark and shared metadata experience between Spark created tables * [Description and notebooks/code files](Notebooks/Spark.NET%20C%23/Tweets) * [Sample Data](Data/Tweets)) +### ADF to Synapse Migration Tool + +The [ADF to Synapse Migration Tool](Pipelines/ImportADFtoSynapse) (currently PowerShell scripts) enables you to migrate Azure Data Factory pipelines, datasets, linked service, integration runtime and triggers to a Synapse Analytics Workspace. + ## Contributing This project welcomes contributions and suggestions. 
See the [Contributor's guide](https://github.com/Azure-Samples/Synapse/tree/master/CONTRIBUTE.md) diff --git a/SQL/Samples/LdwSample/SampleDB.sql b/SQL/Samples/LdwSample/SampleDB.sql index 329d603..c533051 100644 --- a/SQL/Samples/LdwSample/SampleDB.sql +++ b/SQL/Samples/LdwSample/SampleDB.sql @@ -29,6 +29,10 @@ IF (EXISTS(SELECT * FROM sys.external_file_formats WHERE name = 'NativeParquet') DROP EXTERNAL FILE FORMAT NativeParquet END GO +IF (EXISTS(SELECT * FROM sys.external_file_formats WHERE name = 'DeltaLakeFormat')) BEGIN + DROP EXTERNAL FILE FORMAT DeltaLakeFormat +END +GO DROP SCHEMA IF EXISTS parquet; GO DROP SCHEMA IF EXISTS csv; @@ -40,6 +44,10 @@ IF (EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'SqlOnDemandDemo DROP EXTERNAL DATA SOURCE SqlOnDemandDemo END +IF (EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'DeltaLakeStorage')) BEGIN + DROP EXTERNAL DATA SOURCE DeltaLakeStorage +END + IF (EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'AzureOpenData')) BEGIN DROP EXTERNAL DATA SOURCE AzureOpenData END @@ -112,6 +120,14 @@ CREATE EXTERNAL DATA SOURCE SqlOnDemandDemo WITH ( CREDENTIAL = sqlondemand ); GO + +-- Data source referencing Delta Lake folders +CREATE EXTERNAL DATA SOURCE DeltaLakeStorage WITH ( + LOCATION = 'https://sqlondemandstorage.blob.core.windows.net/delta-lake', + CREDENTIAL = sqlondemand +); +GO + -- Create publicly available external data sources CREATE EXTERNAL DATA SOURCE AzureOpenData WITH ( LOCATION = 'https://azureopendatastorage.blob.core.windows.net/') @@ -148,6 +164,11 @@ WITH ( FORMAT_TYPE = PARQUET ); GO +CREATE EXTERNAL FILE FORMAT DeltaLakeFormat +WITH ( + FORMAT_TYPE = DELTA +); +GO CREATE EXTERNAL TABLE csv.population ( diff --git a/SQL/databases/tpcds/schema-views.sql b/SQL/databases/tpcds/schema-views.sql new file mode 100644 index 0000000..c914f2c --- /dev/null +++ b/SQL/databases/tpcds/schema-views.sql @@ -0,0 +1,689 @@ +DROP VIEW IF EXISTS [call_center]; +GO + +DROP VIEW IF EXISTS [catalog_page]; +GO + +DROP VIEW IF EXISTS [catalog_returns]; +GO + +DROP VIEW IF EXISTS [catalog_sales]; +GO + +DROP VIEW IF EXISTS [customer]; +GO + +DROP VIEW IF EXISTS [customer_address]; +GO + +DROP VIEW IF EXISTS [customer_demographics]; +GO + +DROP VIEW IF EXISTS [date_dim]; +GO + +DROP VIEW IF EXISTS [household_demographics]; +GO + +DROP VIEW IF EXISTS [income_band]; +GO + +DROP VIEW IF EXISTS [inventory]; +GO + +DROP VIEW IF EXISTS [item]; +GO + +DROP VIEW IF EXISTS [promotion]; +GO + +DROP VIEW IF EXISTS [reason]; +GO + +DROP VIEW IF EXISTS [ship_mode]; +GO + +DROP VIEW IF EXISTS [store]; +GO + +DROP VIEW IF EXISTS [store_returns]; +GO + +DROP VIEW IF EXISTS [store_sales]; +GO + +DROP VIEW IF EXISTS [time_dim]; +GO + +DROP VIEW IF EXISTS [warehouse]; +GO + +DROP VIEW IF EXISTS [web_page]; +GO + +DROP VIEW IF EXISTS [web_site]; +GO + +DROP VIEW IF EXISTS [web_returns]; +GO + +DROP VIEW IF EXISTS [web_sales]; +GO + + +CREATE VIEW [call_center] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/call_center/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CC_CALL_CENTER_SK integer, + CC_CALL_CENTER_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + CC_REC_START_DATE date, + CC_REC_END_DATE date, + CC_CLOSED_DATE_SK integer, + CC_OPEN_DATE_SK integer, + CC_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + CC_CLASS varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + CC_EMPLOYEES integer, + CC_SQ_FT integer, + CC_HOURS char(20) COLLATE Latin1_General_100_BIN2_UTF8, + CC_MANAGER 
varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + CC_MKT_ID integer, + CC_MKT_CLASS char(50) COLLATE Latin1_General_100_BIN2_UTF8, + CC_MKT_DESC varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + CC_MARKET_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + CC_DIVISION integer, + CC_DIVISION_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + CC_COMPANY integer, + CC_COMPANY_NAME char(50) COLLATE Latin1_General_100_BIN2_UTF8, + CC_STREET_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CC_STREET_NAME varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + CC_STREET_TYPE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + CC_SUITE_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CC_CITY varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + CC_COUNTY varchar(30) COLLATE Latin1_General_100_BIN2_UTF8, + CC_STATE char(2) COLLATE Latin1_General_100_BIN2_UTF8, + CC_ZIP char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CC_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + CC_GMT_OFFSET decimal(5,2), + CC_TAX_PERCENTAGE decimal(5,2) +) AS call_center; +GO + +CREATE VIEW [catalog_page] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/catalog_page/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CP_CATALOG_PAGE_SK integer, + CP_CATALOG_PAGE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + CP_START_DATE_SK integer, + CP_END_DATE_SK integer, + CP_DEPARTMENT varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + CP_CATALOG_NUMBER integer, + CP_CATALOG_PAGE_NUMBER integer, + CP_DESCRIPTION varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + CP_TYPE varchar(100) COLLATE Latin1_General_100_BIN2_UTF8 +) AS catalog_page; +GO + +CREATE VIEW [catalog_returns] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/catalog_returns/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CR_RETURNED_DATE_SK integer, + CR_RETURNED_TIME_SK integer, + CR_ITEM_SK integer, + CR_REFUNDED_CUSTOMER_SK integer, + CR_REFUNDED_CDEMO_SK integer, + CR_REFUNDED_HDEMO_SK integer, + CR_REFUNDED_ADDR_SK integer, + CR_RETURNING_CUSTOMER_SK integer, + CR_RETURNING_CDEMO_SK integer, + CR_RETURNING_HDEMO_SK integer, + CR_RETURNING_ADDR_SK integer, + CR_CALL_CENTER_SK integer, + CR_CATALOG_PAGE_SK integer, + CR_SHIP_MODE_SK integer, + CR_WAREHOUSE_SK integer, + CR_REASON_SK integer, + CR_ORDER_NUMBER bigint, + CR_RETURN_QUANTITY integer, + CR_RETURN_AMOUNT decimal(7,2), + CR_RETURN_TAX decimal(7,2), + CR_RETURN_AMT_INC_TAX decimal(7,2), + CR_FEE decimal(7,2), + CR_RETURN_SHIP_COST decimal(7,2), + CR_REFUNDED_CASH decimal(7,2), + CR_REVERSED_CHARGE decimal(7,2), + CR_STORE_CREDIT decimal(7,2), + CR_NET_LOSS decimal(7,2) +) AS catalog_returns; +GO + +CREATE VIEW [catalog_sales] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/catalog_sales/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CS_SOLD_DATE_SK integer, + CS_SOLD_TIME_SK integer, + CS_SHIP_DATE_SK integer, + CS_BILL_CUSTOMER_SK integer, + CS_BILL_CDEMO_SK integer, + CS_BILL_HDEMO_SK integer, + CS_BILL_ADDR_SK integer, + CS_SHIP_CUSTOMER_SK integer, + CS_SHIP_CDEMO_SK integer, + CS_SHIP_HDEMO_SK integer, + CS_SHIP_ADDR_SK integer, + CS_CALL_CENTER_SK integer, + CS_CATALOG_PAGE_SK integer, + CS_SHIP_MODE_SK integer, + CS_WAREHOUSE_SK integer, + CS_ITEM_SK integer, + CS_PROMO_SK integer, + CS_ORDER_NUMBER bigint, + CS_QUANTITY integer, + CS_WHOLESALE_COST decimal(7,2), + CS_LIST_PRICE decimal(7,2), + CS_SALES_PRICE decimal(7,2), + CS_EXT_DISCOUNT_AMT 
decimal(7,2), + CS_EXT_SALES_PRICE decimal(7,2), + CS_EXT_WHOLESALE_COST decimal(7,2), + CS_EXT_LIST_PRICE decimal(7,2), + CS_EXT_TAX decimal(7,2), + CS_COUPON_AMT decimal(7,2), + CS_EXT_SHIP_COST decimal(7,2), + CS_NET_PAID decimal(7,2), + CS_NET_PAID_INC_TAX decimal(7,2), + CS_NET_PAID_INC_SHIP decimal(7,2), + CS_NET_PAID_INC_SHIP_TAX decimal(7,2), + CS_NET_PROFIT decimal(7,2) +) AS catalog_sales; +GO + +CREATE VIEW [customer] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/customer/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + C_CUSTOMER_SK integer, + C_CUSTOMER_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + C_CURRENT_CDEMO_SK integer, + C_CURRENT_HDEMO_SK integer, + C_CURRENT_ADDR_SK integer, + C_FIRST_SHIPTO_DATE_SK integer, + C_FIRST_SALES_DATE_SK integer, + C_SALUTATION char(10) COLLATE Latin1_General_100_BIN2_UTF8, + C_FIRST_NAME char(20) COLLATE Latin1_General_100_BIN2_UTF8, + C_LAST_NAME char(30) COLLATE Latin1_General_100_BIN2_UTF8, + C_PREFERRED_CUST_FLAG char(1) COLLATE Latin1_General_100_BIN2_UTF8, + C_BIRTH_DAY integer, + C_BIRTH_MONTH integer, + C_BIRTH_YEAR integer, + C_BIRTH_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + C_LOGIN char(13) COLLATE Latin1_General_100_BIN2_UTF8, + C_EMAIL_ADDRESS char(50) COLLATE Latin1_General_100_BIN2_UTF8, + C_LAST_REVIEW_DATE_SK integer +) AS customer; +GO + +CREATE VIEW [customer_address] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/customer_address/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CA_ADDRESS_SK integer, + CA_ADDRESS_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + CA_STREET_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CA_STREET_NAME varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + CA_STREET_TYPE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + CA_SUITE_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CA_CITY varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + CA_COUNTY varchar(30) COLLATE Latin1_General_100_BIN2_UTF8, + CA_STATE char(2) COLLATE Latin1_General_100_BIN2_UTF8, + CA_ZIP char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CA_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + CA_GMT_OFFSET decimal(5,2), + CA_LOCATION_TYPE char(20) COLLATE Latin1_General_100_BIN2_UTF8 +) AS customer_address; +GO + +CREATE VIEW [customer_demographics] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/customer_demographics/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CD_DEMO_SK integer, + CD_GENDER char(1) COLLATE Latin1_General_100_BIN2_UTF8, + CD_MARITAL_STATUS char(1) COLLATE Latin1_General_100_BIN2_UTF8, + CD_EDUCATION_STATUS char(20) COLLATE Latin1_General_100_BIN2_UTF8, + CD_PURCHASE_ESTIMATE integer, + CD_CREDIT_RATING char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CD_DEP_COUNT integer, + CD_DEP_EMPLOYED_COUNT integer, + CD_DEP_COLLEGE_COUNT integer +) AS customer_demographics; +GO + +CREATE VIEW [date_dim] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/date_dim/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + D_DATE_SK integer, + D_DATE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + D_DATE date, + D_MONTH_SEQ integer, + D_WEEK_SEQ integer, + D_QUARTER_SEQ integer, + D_YEAR integer, + D_DOW integer, + D_MOY integer, + D_DOM integer, + D_QOY integer, + D_FY_YEAR integer, + D_FY_QUARTER_SEQ integer, + D_FY_WEEK_SEQ integer, + D_DAY_NAME char(9) COLLATE Latin1_General_100_BIN2_UTF8, + D_QUARTER_NAME char(6) COLLATE 
Latin1_General_100_BIN2_UTF8, + D_HOLIDAY char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_WEEKEND char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_FOLLOWING_HOLIDAY char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_FIRST_DOM integer, + D_LAST_DOM integer, + D_SAME_DAY_LY integer, + D_SAME_DAY_LQ integer, + D_CURRENT_DAY char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_CURRENT_WEEK char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_CURRENT_MONTH char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_CURRENT_QUARTER char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_CURRENT_YEAR char(1) COLLATE Latin1_General_100_BIN2_UTF8 +) AS date_dim; +GO + +CREATE VIEW [household_demographics] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/household_demographics/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + HD_DEMO_SK integer, + HD_INCOME_BAND_SK integer, + HD_BUY_POTENTIAL char(15) COLLATE Latin1_General_100_BIN2_UTF8, + HD_DEP_COUNT integer, + HD_VEHICLE_COUNT integer +) AS household_demographics; +GO + +CREATE VIEW [income_band] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/income_band/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + IB_INCOME_BAND_SK integer, + IB_LOWER_BOUND integer, + IB_UPPER_BOUND integer +) AS income_band; +GO + +CREATE VIEW [inventory] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/inventory/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + INV_DATE_SK integer, + INV_ITEM_SK integer, + INV_WAREHOUSE_SK integer, + INV_QUANTITY_ON_HAND integer +) AS inventory; +GO + +CREATE VIEW [item] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/item/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + I_ITEM_SK integer, + I_ITEM_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + I_REC_START_DATE date, + I_REC_END_DATE date, + I_ITEM_DESC varchar(200) COLLATE Latin1_General_100_BIN2_UTF8, + I_CURRENT_PRICE decimal(7,2), + I_WHOLESALE_COST decimal(7,2), + I_BRAND_ID integer, + I_BRAND char(50) COLLATE Latin1_General_100_BIN2_UTF8, + I_CLASS_ID integer, + I_CLASS char(50) COLLATE Latin1_General_100_BIN2_UTF8, + I_CTGRY_ID integer, + I_CTGRY char(50) COLLATE Latin1_General_100_BIN2_UTF8, + I_MANUFACT_ID integer, + I_MANUFACT char(50) COLLATE Latin1_General_100_BIN2_UTF8, + I_SIZE char(20) COLLATE Latin1_General_100_BIN2_UTF8, + I_FORMULATION char(20) COLLATE Latin1_General_100_BIN2_UTF8, + I_COLOR char(20) COLLATE Latin1_General_100_BIN2_UTF8, + I_UNITS char(10) COLLATE Latin1_General_100_BIN2_UTF8, + I_CONTAINER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + I_MANAGER_ID integer, + I_PRODUCT_NAME char(50) COLLATE Latin1_General_100_BIN2_UTF8 +) AS item; +GO + +CREATE VIEW [promotion] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/promotion/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + P_PROMO_SK integer, + P_PROMO_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + P_START_DATE_SK integer, + P_END_DATE_SK integer, + P_ITEM_SK integer, + P_COST decimal(15,2), + P_RESPONSE_TARGET integer, + P_PROMO_NAME char(50) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_DMAIL char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_EMAIL char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_CATALOG char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_TV char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_RADIO char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_PRESS char(1) COLLATE 
Latin1_General_100_BIN2_UTF8, + P_CHANNEL_EVENT char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_DEMO char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_DETAILS varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + P_PURPOSE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + P_DISCOUNT_ACTIVE char(1) COLLATE Latin1_General_100_BIN2_UTF8 +) AS promotion; +GO + +CREATE VIEW [reason] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/reason/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + R_REASON_SK integer, + R_REASON_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + R_REASON_DESC char(100) COLLATE Latin1_General_100_BIN2_UTF8 +) AS reason; +GO + +CREATE VIEW [ship_mode] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/ship_mode/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + SM_SHIP_MODE_SK integer, + SM_SHIP_MODE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + SM_TYPE char(30) COLLATE Latin1_General_100_BIN2_UTF8, + SM_CODE char(10) COLLATE Latin1_General_100_BIN2_UTF8, + SM_CARRIER char(20) COLLATE Latin1_General_100_BIN2_UTF8, + SM_CONTRACT char(20) COLLATE Latin1_General_100_BIN2_UTF8 +) AS ship_mode; +GO + +CREATE VIEW [store] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/store/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + S_STORE_SK integer, + S_STORE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + S_REC_START_DATE date, + S_REC_END_DATE date, + S_CLOSED_DATE_SK integer, + S_STORE_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + S_NUMBER_EMPLOYEES integer, + S_FLOOR_SPACE integer, + S_HOURS char(20) COLLATE Latin1_General_100_BIN2_UTF8, + S_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + S_MARKET_ID integer, + S_GEOGRAPHY_CLASS varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + S_MARKET_DESC varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + S_MARKET_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + S_DIVISION_ID integer, + S_DIVISION_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + S_COMPANY_ID integer, + S_COMPANY_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + S_STREET_NUMBER varchar(10) COLLATE Latin1_General_100_BIN2_UTF8, + S_STREET_NAME varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + S_STREET_TYPE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + S_SUITE_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + S_CITY varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + S_COUNTY varchar(30) COLLATE Latin1_General_100_BIN2_UTF8, + S_STATE char(2) COLLATE Latin1_General_100_BIN2_UTF8, + S_ZIP char(10) COLLATE Latin1_General_100_BIN2_UTF8, + S_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + S_GMT_OFFSET decimal(5,2), + S_TAX_PRECENTAGE decimal(5,2) +) AS store; +GO + +CREATE VIEW [store_returns] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/store_returns/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + SR_RETURNED_DATE_SK integer, + SR_RETURN_TIME_SK integer, + SR_ITEM_SK integer, + SR_CUSTOMER_SK integer, + SR_CDEMO_SK integer, + SR_HDEMO_SK integer, + SR_ADDR_SK integer, + SR_STORE_SK integer, + SR_REASON_SK integer, + SR_TICKET_NUMBER integer, + SR_RETURN_QUANTITY integer, + SR_RETURN_AMT decimal(7,2), + SR_RETURN_TAX decimal(7,2), + SR_RETURN_AMT_INC_TAX decimal(7,2), + SR_FEE decimal(7,2), + SR_RETURN_SHIP_COST decimal(7,2), + SR_REFUNDED_CASH decimal(7,2), + SR_REVERSED_CHARGE decimal(7,2), + SR_STORE_CREDIT decimal(7,2), + SR_NET_LOSS 
decimal(7,2) +) AS store_returns; +GO + +CREATE VIEW [store_sales] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/store_sales/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + SS_SOLD_DATE_SK integer, + SS_SOLD_TIME_SK integer, + SS_ITEM_SK integer, + SS_CUSTOMER_SK integer, + SS_CDEMO_SK integer, + SS_HDEMO_SK integer, + SS_ADDR_SK integer, + SS_STORE_SK integer, + SS_PROMO_SK integer, + SS_TICKET_NUMBER integer, + SS_QUANTITY integer, + SS_WHOLESALE_COST decimal(7, 2), + SS_LIST_PRICE decimal(7, 2), + SS_SALES_PRICE decimal(7, 2), + SS_EXT_DISCOUNT_AMT decimal(7, 2), + SS_EXT_SALES_PRICE decimal(7, 2), + SS_EXT_WHOLESALE_COST decimal(7, 2), + SS_EXT_LIST_PRICE decimal(7, 2), + SS_EXT_TAX decimal(7, 2), + SS_COUPON_AMT decimal(7, 2), + SS_NET_PAID decimal(7, 2), + SS_NET_PAID_INC_TAX decimal(7, 2), + SS_NET_PROFIT decimal(7, 2) +) AS store_sales; +GO + +CREATE VIEW [time_dim] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/time_dim/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + T_TIME_SK integer, + T_TIME_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + T_TIME integer, + T_HOUR integer, + T_MINUTE integer, + T_SECOND integer, + T_AM_PM char(2) COLLATE Latin1_General_100_BIN2_UTF8, + T_SHIFT char(20) COLLATE Latin1_General_100_BIN2_UTF8, + T_SUB_SHIFT char(20) COLLATE Latin1_General_100_BIN2_UTF8, + T_MEAL_TIME char(20) COLLATE Latin1_General_100_BIN2_UTF8 +) AS time_dim; +GO + +CREATE VIEW [warehouse] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/warehouse/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + W_WAREHOUSE_SK integer, + W_WAREHOUSE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + W_WAREHOUSE_NAME varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + W_WAREHOUSE_SQ_FT integer, + W_STREET_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + W_STREET_NAME varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + W_STREET_TYPE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + W_SUITE_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + W_CITY varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + W_COUNTY varchar(30) COLLATE Latin1_General_100_BIN2_UTF8, + W_STATE char(2) COLLATE Latin1_General_100_BIN2_UTF8, + W_ZIP char(10) COLLATE Latin1_General_100_BIN2_UTF8, + W_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + W_GMT_OFFSET decimal(5,2) +) AS warehouse; +GO + +CREATE VIEW [web_page] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/web_page/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + WP_WEB_PAGE_SK integer, + WP_WEB_PAGE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + WP_REC_START_DATE date, + WP_REC_END_DATE date, + WP_CREATION_DATE_SK integer, + WP_ACCESS_DATE_SK integer, + WP_AUTOGEN_FLAG char(1) COLLATE Latin1_General_100_BIN2_UTF8, + WP_CUSTOMER_SK integer, + WP_URL varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + WP_TYPE char(50) COLLATE Latin1_General_100_BIN2_UTF8, + WP_CHAR_COUNT integer, + WP_LINK_COUNT integer, + WP_IMAGE_COUNT integer, + WP_MAX_AD_COUNT integer +) AS web_page; +GO + +CREATE VIEW [web_returns] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/web_returns/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + WR_RETURNED_DATE_SK integer, + WR_RETURNED_TIME_SK integer, + WR_ITEM_SK integer, + WR_REFUNDED_CUSTOMER_SK integer, + WR_REFUNDED_CDEMO_SK integer, + WR_REFUNDED_HDEMO_SK integer, + WR_REFUNDED_ADDR_SK integer, + WR_RETURNING_CUSTOMER_SK integer, 
+ WR_RETURNING_CDEMO_SK integer, + WR_RETURNING_HDEMO_SK integer, + WR_RETURNING_ADDR_SK integer, + WR_WEB_PAGE_SK integer, + WR_REASON_SK integer, + WR_ORDER_NUMBER integer, + WR_RETURN_QUANTITY integer, + WR_RETURN_AMT decimal(7,2), + WR_RETURN_TAX decimal(7,2), + WR_RETURN_AMT_INC_TAX decimal(7,2), + WR_FEE decimal(7,2), + WR_RETURN_SHIP_COST decimal(7,2), + WR_REFUNDED_CASH decimal(7,2), + WR_REVERSED_CHARGE decimal(7,2), + WR_ACCOUNT_CREDIT decimal(7,2), + WR_NET_LOSS decimal(7,2) +) AS web_returns; +GO + +CREATE VIEW [web_sales] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/web_sales/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + WS_SOLD_DATE_SK integer, + WS_SOLD_TIME_SK integer, + WS_SHIP_DATE_SK integer, + WS_ITEM_SK integer, + WS_BILL_CUSTOMER_SK integer, + WS_BILL_CDEMO_SK integer, + WS_BILL_HDEMO_SK integer, + WS_BILL_ADDR_SK integer, + WS_SHIP_CUSTOMER_SK integer, + WS_SHIP_CDEMO_SK integer, + WS_SHIP_HDEMO_SK integer, + WS_SHIP_ADDR_SK integer, + WS_WEB_PAGE_SK integer, + WS_WEB_SITE_SK integer, + WS_SHIP_MODE_SK integer, + WS_WAREHOUSE_SK integer, + WS_PROMO_SK integer, + WS_ORDER_NUMBER integer, + WS_QUANTITY integer, + WS_WHOLESALE_COST decimal(7,2), + WS_LIST_PRICE decimal(7,2), + WS_SALES_PRICE decimal(7,2), + WS_EXT_DISCOUNT_AMT decimal(7,2), + WS_EXT_SALES_PRICE decimal(7,2), + WS_EXT_WHOLESALE_COST decimal(7,2), + WS_EXT_LIST_PRICE decimal(7,2), + WS_EXT_TAX decimal(7,2), + WS_COUPON_AMT decimal(7,2), + WS_EXT_SHIP_COST decimal(7,2), + WS_NET_PAID decimal(7,2), + WS_NET_PAID_INC_TAX decimal(7,2), + WS_NET_PAID_INC_SHIP decimal(7,2), + WS_NET_PAID_INC_SHIP_TAX decimal(7,2), + WS_NET_PROFIT decimal(7,2) +) AS web_sales; +GO + +CREATE VIEW [web_site] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/web_site/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + WEB_SITE_SK integer, + WEB_SITE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_REC_START_DATE date, + WEB_REC_END_DATE date, + WEB_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_OPEN_DATE_SK integer, + WEB_CLOSE_DATE_SK integer, + WEB_CLASS varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_MKT_ID integer, + WEB_MKT_CLASS varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_MKT_DESC varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_MARKET_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_COMPANY_ID integer, + WEB_COMPANY_NAME char(50) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_STREET_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_STREET_NAME varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_STREET_TYPE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_SUITE_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_CITY varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_COUNTY varchar(30) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_STATE char(2) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_ZIP char(10) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_GMT_OFFSET decimal(5,2), + WEB_TAX_PERCENTAGE decimal(5,2) +) AS web_site; +GO diff --git a/docs/synapse-sql/generate-cosmosdb-query-ff-schema.html b/docs/synapse-sql/generate-cosmosdb-query-ff-schema.html new file mode 100644 index 0000000..2a43862 --- /dev/null +++ b/docs/synapse-sql/generate-cosmosdb-query-ff-schema.html @@ -0,0 +1,225 @@ + + + + + + OPENROWSET by example - Synapse link for 
CosmosDB
+ Generating OPENROWSET statement for Cosmos DB documents
+ [page markup omitted]
+ Copy the text below in some SQL editor once you generate the script
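
For context on what that page produces, below is a minimal sketch of the kind of OPENROWSET statement used to query a Cosmos DB analytical store from Synapse serverless SQL. The account, database, container, credential, and column names are illustrative placeholders, not values taken from this repository.

```sql
-- Minimal sketch only; account, database, container, credential and columns are placeholders.
SELECT TOP 10 *
FROM OPENROWSET(
    PROVIDER = 'CosmosDB',
    CONNECTION = 'Account=myCosmosAccount;Database=RetailDemo',
    OBJECT = 'Sales',
    SERVER_CREDENTIAL = 'myCosmosCredential'
)
WITH (
    id VARCHAR(64),           -- document id
    productCode VARCHAR(30),  -- example document property
    quantity INT,
    price FLOAT
) AS documents;
```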