Changes from all commits
35 commits
e3e7f60
Add entry in parent content directory, and minor format fix
chugugrace Jul 5, 2021
3f46c18
Merge pull request #108 from Azure-Samples/pathway
Jul 6, 2021
474bc98
Merge pull request #104 from chugugrace/importadftosynapsechg2
Jul 7, 2021
a247816
Merge pull request #109 from Azure-Samples/pathway
Jul 7, 2021
e8697fe
Update Spark application Log Analytics diagnostic workbook (#110)
kaiyuezhou Jul 14, 2021
168408f
Adding Delta Lake settings
jovanpop-msft Jul 15, 2021
f01e4b4
Added TPCDS views
jovanpop-msft Jul 21, 2021
ca9db82
fix bug of pagination
chugugrace Jul 27, 2021
bc70026
Merge pull request #111 from chugugrace/adfbug1
saveenr-msft Jul 29, 2021
f005a6a
Added my notebook
mlevin19 Aug 10, 2021
2dbf2a8
Add Spark diagnostic emitter spark pool configuration samples (#112)
kaiyuezhou Aug 11, 2021
f965305
Cleaned up phrasing some, but all looks good!
Shunderpooch Aug 18, 2021
0102b21
Removed an output
Shunderpooch Aug 18, 2021
7ca8e63
Removed output for credential location
Shunderpooch Aug 18, 2021
183321d
Removed last credential
Shunderpooch Aug 18, 2021
5465044
Final cleanup on credentials
Shunderpooch Aug 18, 2021
4c4f3dc
Merge pull request #1 from Shunderpooch/movefromsnowflake
mlevin19 Aug 18, 2021
d7987eb
Updated with comments regarding stored proc
mlevin19 Aug 18, 2021
9866d5c
fixed some typos
mlevin19 Aug 18, 2021
dfd392b
Create generate-cosmosdb-query-ff-schema.html
jovanpop-msft Aug 20, 2021
1adfdc8
Merge pull request #1 from Azure-Samples/main
Aug 27, 2021
f7b7d50
score fix
Aug 27, 2021
a1c211f
Merge pull request #114 from Rodrigossz/master
ruixinxu Aug 30, 2021
0e1bdff
Update Migrate_snowflake_schema.ipynb
mlevin19 Sep 1, 2021
bf5a423
Update Migrate_snowflake_schema.ipynb
mlevin19 Sep 1, 2021
cb1e62e
Update Migrate_snowflake_schema.ipynb
mlevin19 Sep 1, 2021
d076bcf
Update Migrate_snowflake_schema.ipynb
mlevin19 Sep 1, 2021
c335d6f
Merge pull request #113 from mlevin19/movefromsnowflake
ruixinxu Sep 2, 2021
8965326
Add metadata file for testing
Lijing29 Sep 13, 2021
ca231e2
Merge pull request #116 from Azure-Samples/lilijing/add-metedata-file
Jing29 Sep 13, 2021
68058f6
Test full pipeline
Lijing29 Sep 13, 2021
9c361e9
Remove invalid characters in the notebook
Lijing29 Sep 13, 2021
75334aa
Merge pull request #118 from Azure-Samples/lilijing/add-metadata-file-02
Jing29 Sep 13, 2021
681caa3
Example about a notebook with parameter
Lijing29 Sep 22, 2021
194d473
Add example in metadata.json
Lijing29 Sep 22, 2021
15 changes: 8 additions & 7 deletions Diagnostic/LogAnalytics/Azure_Synapse_Spark_Application.workbook
@@ -109,7 +109,8 @@
"type": 2,
"description": "Spark Application Livy Id and Name",
"isRequired": true,
"query": "SparkMetrics_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and isnotempty(livyId_s)\r\n| summarize by livyId_s, applicationName_s\r\n| order by livyId_s desc\r\n| extend applicationName = substring(applicationName_s, 0, strlen(applicationName_s) - 1)\r\n| project value = livyId_s, label = strcat(livyId_s, \" | \", applicationName), selected = false",
"query": "SparkMetrics_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and isnotempty(livyId_s)\r\n| extend applicationName = column_ifexists(\"applicationName_s\", applicationId_s)\r\n| summarize by livyId_s, applicationName\r\n| order by livyId_s desc\r\n| project value = livyId_s, label = strcat(livyId_s, \" | \", applicationName), selected = false",
"value": null,
"typeSettings": {
"additionalResourceOptions": [],
"showDefault": false
@@ -182,7 +183,7 @@
"id": "9246e88b-9682-498c-b440-f53a15b6e481",
"cellValue": "selectedTab",
"linkTarget": "parameter",
"linkLabel": "Spark Streaming",
"linkLabel": "Streaming",
"subTarget": "Streaming",
"preText": "Metrics",
"postText": "1",
@@ -255,7 +256,7 @@
"additionalResourceOptions": [
"value::all"
],
"selectAllValue": "ALL",
"selectAllValue": "All",
"showDefault": false
},
"timeContext": {
@@ -277,7 +278,7 @@
"comparison": "isEqualTo",
"value": "Logs"
},
"name": "CustomMetricsSelector - Copy"
"name": "Logs Filter Dropdown"
},
{
"type": 3,
@@ -566,7 +567,7 @@
"type": 3,
"content": {
"version": "KqlItem/1.0",
"query": "let Data = SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\";\r\n\r\nunion\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, InputRows=tolong(progress_sources.numInputRows)\r\n| summarize Value=sum(InputRows)\r\n| extend Name=\"Total Input Rows\", Order=0\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Progress Events\", Order=1\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryStartedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Starts\", Order=2\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminatedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Terminateds\", Order=3\r\n)\r\n",
"query": "let Data = SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\";\r\n\r\nunion\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, InputRows=tolong(progress_sources.numInputRows)\r\n| summarize Value=sum(InputRows)\r\n| extend Name=\"Total Input Rows\", Order=0\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Progress Events\", Order=1\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryStartedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Starts\", Order=2\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminatedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Terminateds\", Order=3\r\n)\r\n",
"size": 4,
"showAnnotations": true,
"showAnalytics": true,
@@ -1637,7 +1638,7 @@
"type": 3,
"content": {
"version": "KqlItem/1.0",
"query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), ProcessedRowsPerSecond=todouble(progress_sources.processedRowsPerSecond)\r\n| summarize Value=sum(ProcessedRowsPerSecond) by TimeGenerated, Description\r\n| order by TimeGenerated asc\r\n\r\n// InputRowsPerSecond=todouble(progress_sources.inputRowsPerSecond), ",
"query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), ProcessedRowsPerSecond=todouble(progress_sources.processedRowsPerSecond)\r\n| summarize Value=sum(ProcessedRowsPerSecond) by TimeGenerated, Description\r\n| order by TimeGenerated asc\r\n\r\n// InputRowsPerSecond=todouble(progress_sources.inputRowsPerSecond), ",
"size": 1,
"aggregation": 5,
"showAnalytics": true,
@@ -1664,7 +1665,7 @@
"type": 3,
"content": {
"version": "KqlItem/1.0",
"query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), InputRows=tolong(progress_sources.numInputRows)\r\n| summarize sum(InputRows)/count() by bin(TimeGenerated, timespan({Interval})), Description\r\n| order by TimeGenerated asc",
"query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), InputRows=tolong(progress_sources.numInputRows)\r\n| summarize sum(InputRows)/count() by bin(TimeGenerated, timespan({Interval})), Description\r\n| order by TimeGenerated asc",
"size": 1,
"aggregation": 5,
"showAnalytics": true,
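The workbook changes above wrap optional custom-log columns (applicationName_s, progress_sources_s) in column_ifexists() so the queries keep working against workspaces whose SparkMetrics_CL and SparkListenerEvent_CL tables were ingested before those columns existed. The same hardened query can be exercised outside the workbook with the Azure Monitor Query client library; the sketch below is illustrative only, and the workspace GUID, Synapse workspace name, and Spark pool name are placeholders.

```python
# Minimal sketch (not part of the workbook): run the hardened application-picker
# query against a Log Analytics workspace using the azure-monitor-query SDK.
from datetime import timedelta

from azure.identity import DefaultAzureCredential
from azure.monitor.query import LogsQueryClient

workspace_id = "<log-analytics-workspace-guid>"      # placeholder
synapse_workspace = "<synapse-workspace-name>"       # placeholder
spark_pool = "<spark-pool-name>"                     # placeholder

# column_ifexists() falls back to applicationId_s when applicationName_s is
# missing, which is exactly the hedge the updated workbook query introduces.
query = f"""
SparkMetrics_CL
| where workspaceName_s == "{synapse_workspace}" and clusterName_s == "{spark_pool}" and isnotempty(livyId_s)
| extend applicationName = column_ifexists("applicationName_s", applicationId_s)
| summarize by livyId_s, applicationName
| order by livyId_s desc
"""

client = LogsQueryClient(DefaultAzureCredential())
response = client.query_workspace(workspace_id, query, timespan=timedelta(days=1))
for table in response.tables:
    for row in table.rows:
        print(dict(zip(table.columns, row)))
```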
8 changes: 8 additions & 0 deletions Diagnostic/SparkDiagnosticEmitter/README.md
@@ -0,0 +1,8 @@
# Synapse Spark Diagnostic Emitter Configuration Samples

## Introduction

The Azure Synapse Spark diagnostic emitter extension is a library that enables a Spark application to emit its logs, event logs, and metrics to one or more destinations,
including Azure Log Analytics, Azure Storage, and Azure Event Hubs.

The sample templates in this repo are designed to help users quickly enable this feature.
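The sample configurations that follow are applied at the Apache Spark pool level; once a session starts on such a pool, the emitter settings show up as ordinary Spark properties. A minimal sketch for sanity-checking them from a Synapse notebook, assuming the `spark` session object that Synapse notebooks provide (property names are taken from the samples):

```python
# Sketch: confirm the diagnostic emitter settings from inside a notebook session.
# Assumes the SparkSession `spark` that Synapse notebooks expose by default.
emitters = spark.conf.get("spark.synapse.diagnostic.emitters", "")
print("Configured emitters:", emitters or "<none>")

for name in (e.strip() for e in emitters.split(",") if e.strip()):
    prefix = f"spark.synapse.diagnostic.emitter.{name}"
    dest_type = spark.conf.get(f"{prefix}.type", "<unset>")
    categories = spark.conf.get(f"{prefix}.categories", "<unset>")
    print(f"{name}: type={dest_type}, categories={categories}")
```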
New file: Spark diagnostic emitter sample configuration (Azure Event Hubs destination)
@@ -0,0 +1,5 @@
spark.synapse.diagnostic.emitters MyDestination1
spark.synapse.diagnostic.emitter.MyDestination1.type AzureEventHub
spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault <key-vault-name>
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName <key-vault-secret-name-for-connection-string>
New file: Spark diagnostic emitter sample configuration (Azure Log Analytics destination)
@@ -0,0 +1,6 @@
spark.synapse.diagnostic.emitters MyDestination1
spark.synapse.diagnostic.emitter.MyDestination1.type AzureLogAnalytics
spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics
spark.synapse.diagnostic.emitter.MyDestination1.workspaceId <azure-log-analytics-workspace-id>
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault <key-vault-name>
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName <key-vault-secret-name-for-log-analytics-key>
New file: Spark diagnostic emitter sample configuration (Azure Storage destination)
@@ -0,0 +1,6 @@
spark.synapse.diagnostic.emitters MyDestination1
spark.synapse.diagnostic.emitter.MyDestination1.type AzureStorage
spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics
spark.synapse.diagnostic.emitter.MyDestination1.uri https://<my-blob-storage>.blob.core.windows.net/<container-name>/<folder-name>
spark.synapse.diagnostic.emitter.MyDestination1.auth AccessKey
spark.synapse.diagnostic.emitter.MyDestination1.secret <storage-access-key>
New file: Spark diagnostic emitter sample configuration (multiple destinations)
@@ -0,0 +1,15 @@
spark.synapse.diagnostic.emitters MyDestination1,MyDestination2

spark.synapse.diagnostic.emitter.MyDestination1.type AzureStorage
spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics
spark.synapse.diagnostic.emitter.MyDestination1.uri https://<my-blob-storage>.blob.core.windows.net/<container-name>/<folder-name>
spark.synapse.diagnostic.emitter.MyDestination1.auth AccessKey
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault <key-vault-name>
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName <key-vault-secret-name>

spark.synapse.diagnostic.emitter.MyDestination2.type AzureLogAnalytics
spark.synapse.diagnostic.emitter.MyDestination2.categories Log,EventLog,Metrics
spark.synapse.diagnostic.emitter.MyDestination2.workspaceId <azure-log-analytics-workspace-id>
spark.synapse.diagnostic.emitter.MyDestination2.secret.keyVault <key-vault-name>
spark.synapse.diagnostic.emitter.MyDestination2.secret.keyVault.secretName <key-vault-secret-name-for-log-analytics-key>
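The Key Vault-based samples above reference secrets only by name; the secret value itself (an Event Hubs connection string, a Log Analytics workspace key, or a storage access key) must already exist in that vault. A hedged sketch of seeding such a secret with the azure-keyvault-secrets SDK, reusing the placeholder names from the samples:

```python
# Sketch only: store the secret that the emitter configuration will look up.
# The vault name, secret name, and secret value are placeholders that mirror
# the <key-vault-name> / <key-vault-secret-name-...> tokens above.
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

key_vault_name = "<key-vault-name>"
secret_name = "<key-vault-secret-name-for-log-analytics-key>"
secret_value = "<log-analytics-workspace-key>"  # or an Event Hubs connection string / storage access key

client = SecretClient(
    vault_url=f"https://{key_vault_name}.vault.azure.net",
    credential=DefaultAzureCredential(),
)
client.set_secret(secret_name, secret_value)
print(f"Stored '{secret_name}' in key vault '{key_vault_name}'.")
```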
Changes to a notebook file (weather data enrichment):
@@ -270,7 +270,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enrich with weather data\n",
"## Enrich with weather data\n",
"\n",
"Now we append NOAA surface weather data to the taxi and holiday data. Use a similar approach to fetch the [NOAA weather history data](https://azure.microsoft.com/en-us/services/open-datasets/catalog/noaa-integrated-surface-data/) from Azure Open Datasets. "
],
Changes to a notebook file (NYC taxi data ingestion):
@@ -48,7 +48,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ingest Data\n",
"## Ingest Data\n",
"\n",
"Get a sample data of nyc yellow taxi to make it faster/easier to evaluate different approaches to prep for the modelling phase later in the notebook."
],
Changes to a notebook file (AutoML regression scoring):
@@ -444,16 +444,15 @@
"\n",
"df_all = align_outputs(y_predictions, X_trans, X_test, y_test, target_column_name)\n",
"\n",
"from azureml.automl.core._vendor.automl.client.core.common import metrics\n",
"from azureml.automl.runtime.shared.score import scoring\n",
"from matplotlib import pyplot as plt\n",
"from automl.client.core.common import constants\n",
"\n",
"# use automl metrics module\n",
"scores = metrics.compute_metrics_regression(\n",
" df_all['predicted'],\n",
" df_all[target_column_name],\n",
" list(constants.Metric.SCALAR_REGRESSION_SET),\n",
" None, None, None)\n",
"# use automl scoring module\n",
"scores = scoring.score_regression(\n",
" y_test=df_all[target_column_name],\n",
" y_pred=df_all['predicted'],\n",
" metrics=list(constants.Metric.SCALAR_REGRESSION_SET))\n",
"\n",
"print(\"[Test data scores]\\n\")\n",
"for key, value in scores.items(): \n",
107 changes: 107 additions & 0 deletions Notebooks/PySpark/exampleForParameter.ipynb
@@ -0,0 +1,107 @@
{
"cells": [
{
"cell_type": "code",
"source": [
"account_name = '<Your primary storage account name>' # fill in your primary account name\r\n",
"container_name = '<Your container name>' # fill in your container name\r\n",
"relative_path = '<Your relative path>' # fill in your relative folder path"
],
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.livy.statement-meta+json": {
"spark_pool": "sparkpool01",
"session_id": 0,
"statement_id": 3,
"state": "finished",
"livy_statement_state": "available",
"queued_time": "2021-09-17T06:08:57.5750046Z",
"session_start_time": null,
"execution_start_time": "2021-09-17T06:08:57.6725894Z",
"execution_finish_time": "2021-09-17T06:08:57.6728166Z"
},
"text/plain": "StatementMeta(sparkpool01, 0, 3, Finished, Available)"
},
"metadata": {}
}
],
"execution_count": 3,
"metadata": {
"tags": [
"parameters"
]
}
},
{
"cell_type": "code",
"source": [
"from pyspark.sql import SparkSession\r\n",
"from pyspark.sql.types import *\r\n",
"\r\n",
"adls_path = 'abfss://%s@%s.dfs.core.windows.net/%s' % (container_name, account_name, relative_path)\r\n",
"print('Primary storage account path: ' + adls_path)"
],
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.livy.statement-meta+json": {
"spark_pool": "sparkpool01",
"session_id": 0,
"statement_id": 4,
"state": "finished",
"livy_statement_state": "available",
"queued_time": "2021-09-17T06:09:00.1736428Z",
"session_start_time": null,
"execution_start_time": "2021-09-17T06:09:00.273459Z",
"execution_finish_time": "2021-09-17T06:09:00.4351164Z"
},
"text/plain": "StatementMeta(sparkpool01, 0, 4, Finished, Available)"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Primary storage account path: abfss://test0121@lilijing0227neur.dfs.core.windows.net/test01/"
]
}
],
"execution_count": 4,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
}
}
],
"metadata": {
"language_info": {
"name": "python"
},
"kernelspec": {
"name": "synapse_pyspark",
"language": "Python",
"display_name": "Synapse PySpark"
},
"kernel_info": {
"name": "synapse_pyspark"
},
"save_output": true,
"synapse_widget": {
"version": "0.1",
"state": {}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
5 changes: 5 additions & 0 deletions Notebooks/PySpark/parameters/exampleForParameter.json
@@ -0,0 +1,5 @@
{
"Your primary storage account name": "nbsampletestworkspace01",
"Your container name": "test0922",
"Your relative path": "testCSV/"
}
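How the test pipeline feeds this JSON into the notebook is not shown in this change set, but the keys line up with the angle-bracket placeholders in the cell tagged `parameters` in exampleForParameter.ipynb. A minimal local sketch of that substitution, assuming the file paths listed above:

```python
# Sketch: substitute the parameter values from the JSON file into the notebook
# cell tagged "parameters". Paths assume the layout shown in this change set.
import json

notebook_path = "Notebooks/PySpark/exampleForParameter.ipynb"
params_path = "Notebooks/PySpark/parameters/exampleForParameter.json"

with open(params_path) as f:
    params = json.load(f)  # e.g. {"Your container name": "test0922", ...}

with open(notebook_path) as f:
    nb = json.load(f)

for cell in nb["cells"]:
    if "parameters" in cell.get("metadata", {}).get("tags", []):
        updated = []
        for line in cell["source"]:
            for key, value in params.items():
                line = line.replace(f"<{key}>", value)  # "<Your container name>" -> "test0922"
            updated.append(line)
        cell["source"] = updated

with open("exampleForParameter.parameterized.ipynb", "w") as f:
    json.dump(nb, f, indent=1)
```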