Changes from all commits
35 commits
e3e7f60
Add entry in parent content directory, and minor format fix
chugugrace Jul 5, 2021
3f46c18
Merge pull request #108 from Azure-Samples/pathway
Jul 6, 2021
474bc98
Merge pull request #104 from chugugrace/importadftosynapsechg2
Jul 7, 2021
a247816
Merge pull request #109 from Azure-Samples/pathway
Jul 7, 2021
e8697fe
Update Spark application Log Analytics diagnostic workbook (#110)
kaiyuezhou Jul 14, 2021
168408f
Adding Delta Lake settings
jovanpop-msft Jul 15, 2021
f01e4b4
Added TPCDS views
jovanpop-msft Jul 21, 2021
ca9db82
fix bug of pagination
chugugrace Jul 27, 2021
bc70026
Merge pull request #111 from chugugrace/adfbug1
saveenr-msft Jul 29, 2021
f005a6a
Added my notebook
mlevin19 Aug 10, 2021
2dbf2a8
Add Spark diagnostic emitter spark pool configuration samples (#112)
kaiyuezhou Aug 11, 2021
f965305
Cleaned up phrasing some, but all looks good!
Shunderpooch Aug 18, 2021
0102b21
Removed an output
Shunderpooch Aug 18, 2021
7ca8e63
Removed output for credential location
Shunderpooch Aug 18, 2021
183321d
Removed last credential
Shunderpooch Aug 18, 2021
5465044
Final cleanup on credentials
Shunderpooch Aug 18, 2021
4c4f3dc
Merge pull request #1 from Shunderpooch/movefromsnowflake
mlevin19 Aug 18, 2021
d7987eb
Updated with comments regarding stored proc
mlevin19 Aug 18, 2021
9866d5c
fixed some typos
mlevin19 Aug 18, 2021
dfd392b
Create generate-cosmosdb-query-ff-schema.html
jovanpop-msft Aug 20, 2021
1adfdc8
Merge pull request #1 from Azure-Samples/main
Aug 27, 2021
f7b7d50
score fix
Aug 27, 2021
a1c211f
Merge pull request #114 from Rodrigossz/master
ruixinxu Aug 30, 2021
0e1bdff
Update Migrate_snowflake_schema.ipynb
mlevin19 Sep 1, 2021
bf5a423
Update Migrate_snowflake_schema.ipynb
mlevin19 Sep 1, 2021
cb1e62e
Update Migrate_snowflake_schema.ipynb
mlevin19 Sep 1, 2021
d076bcf
Update Migrate_snowflake_schema.ipynb
mlevin19 Sep 1, 2021
c335d6f
Merge pull request #113 from mlevin19/movefromsnowflake
ruixinxu Sep 2, 2021
8965326
Add metadata file for testing
Lijing29 Sep 13, 2021
ca231e2
Merge pull request #116 from Azure-Samples/lilijing/add-metedata-file
Jing29 Sep 13, 2021
68058f6
Test full pipeline
Lijing29 Sep 13, 2021
9c361e9
Remove invalid characters in the notebook
Lijing29 Sep 13, 2021
75334aa
Merge pull request #118 from Azure-Samples/lilijing/add-metadata-file-02
Jing29 Sep 13, 2021
681caa3
Example about a notebook with parameter
Lijing29 Sep 22, 2021
194d473
Add example in metadata.json
Lijing29 Sep 22, 2021
15 changes: 8 additions & 7 deletions Diagnostic/LogAnalytics/Azure_Synapse_Spark_Application.workbook
@@ -109,7 +109,8 @@
"type": 2,
"description": "Spark Application Livy Id and Name",
"isRequired": true,
"query": "SparkMetrics_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and isnotempty(livyId_s)\r\n| summarize by livyId_s, applicationName_s\r\n| order by livyId_s desc\r\n| extend applicationName = substring(applicationName_s, 0, strlen(applicationName_s) - 1)\r\n| project value = livyId_s, label = strcat(livyId_s, \" | \", applicationName), selected = false",
"query": "SparkMetrics_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and isnotempty(livyId_s)\r\n| extend applicationName = column_ifexists(\"applicationName_s\", applicationId_s)\r\n| summarize by livyId_s, applicationName\r\n| order by livyId_s desc\r\n| project value = livyId_s, label = strcat(livyId_s, \" | \", applicationName), selected = false",
"value": null,
"typeSettings": {
"additionalResourceOptions": [],
"showDefault": false
@@ -182,7 +183,7 @@
"id": "9246e88b-9682-498c-b440-f53a15b6e481",
"cellValue": "selectedTab",
"linkTarget": "parameter",
"linkLabel": "Spark Streaming",
"linkLabel": "Streaming",
"subTarget": "Streaming",
"preText": "Metrics",
"postText": "1",
@@ -255,7 +256,7 @@
"additionalResourceOptions": [
"value::all"
],
"selectAllValue": "ALL",
"selectAllValue": "All",
"showDefault": false
},
"timeContext": {
@@ -277,7 +278,7 @@
"comparison": "isEqualTo",
"value": "Logs"
},
"name": "CustomMetricsSelector - Copy"
"name": "Logs Filter Dropdown"
},
{
"type": 3,
@@ -566,7 +567,7 @@
"type": 3,
"content": {
"version": "KqlItem/1.0",
"query": "let Data = SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\";\r\n\r\nunion\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, InputRows=tolong(progress_sources.numInputRows)\r\n| summarize Value=sum(InputRows)\r\n| extend Name=\"Total Input Rows\", Order=0\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Progress Events\", Order=1\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryStartedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Starts\", Order=2\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminatedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Terminateds\", Order=3\r\n)\r\n",
"query": "let Data = SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\";\r\n\r\nunion\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, InputRows=tolong(progress_sources.numInputRows)\r\n| summarize Value=sum(InputRows)\r\n| extend Name=\"Total Input Rows\", Order=0\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Progress Events\", Order=1\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryStartedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Starts\", Order=2\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminatedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Terminateds\", Order=3\r\n)\r\n",
"size": 4,
"showAnnotations": true,
"showAnalytics": true,
@@ -1637,7 +1638,7 @@
"type": 3,
"content": {
"version": "KqlItem/1.0",
"query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), ProcessedRowsPerSecond=todouble(progress_sources.processedRowsPerSecond)\r\n| summarize Value=sum(ProcessedRowsPerSecond) by TimeGenerated, Description\r\n| order by TimeGenerated asc\r\n\r\n// InputRowsPerSecond=todouble(progress_sources.inputRowsPerSecond), ",
"query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), ProcessedRowsPerSecond=todouble(progress_sources.processedRowsPerSecond)\r\n| summarize Value=sum(ProcessedRowsPerSecond) by TimeGenerated, Description\r\n| order by TimeGenerated asc\r\n\r\n// InputRowsPerSecond=todouble(progress_sources.inputRowsPerSecond), ",
"size": 1,
"aggregation": 5,
"showAnalytics": true,
@@ -1664,7 +1665,7 @@
"type": 3,
"content": {
"version": "KqlItem/1.0",
"query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), InputRows=tolong(progress_sources.numInputRows)\r\n| summarize sum(InputRows)/count() by bin(TimeGenerated, timespan({Interval})), Description\r\n| order by TimeGenerated asc",
"query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), InputRows=tolong(progress_sources.numInputRows)\r\n| summarize sum(InputRows)/count() by bin(TimeGenerated, timespan({Interval})), Description\r\n| order by TimeGenerated asc",
"size": 1,
"aggregation": 5,
"showAnalytics": true,
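The workbook changes above wrap optional custom-log columns (applicationName_s, progress_sources_s) in column_ifexists() so the queries keep working against workspaces whose SparkMetrics_CL and SparkListenerEvent_CL tables were ingested before those columns existed. The same hardened query can be exercised outside the workbook with the Azure Monitor Query client library; the sketch below is illustrative only, and the workspace GUID, Synapse workspace name, and Spark pool name are placeholders.

```python
# Minimal sketch (not part of the workbook): run the hardened application-picker
# query against a Log Analytics workspace using the azure-monitor-query SDK.
from datetime import timedelta

from azure.identity import DefaultAzureCredential
from azure.monitor.query import LogsQueryClient

workspace_id = "<log-analytics-workspace-guid>"      # placeholder
synapse_workspace = "<synapse-workspace-name>"       # placeholder
spark_pool = "<spark-pool-name>"                     # placeholder

# column_ifexists() falls back to applicationId_s when applicationName_s is
# missing, which is exactly the hedge the updated workbook query introduces.
query = f"""
SparkMetrics_CL
| where workspaceName_s == "{synapse_workspace}" and clusterName_s == "{spark_pool}" and isnotempty(livyId_s)
| extend applicationName = column_ifexists("applicationName_s", applicationId_s)
| summarize by livyId_s, applicationName
| order by livyId_s desc
"""

client = LogsQueryClient(DefaultAzureCredential())
response = client.query_workspace(workspace_id, query, timespan=timedelta(days=1))
for table in response.tables:
    for row in table.rows:
        print(dict(zip(table.columns, row)))
```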
8 changes: 8 additions & 0 deletions Diagnostic/SparkDiagnosticEmitter/README.md
@@ -0,0 +1,8 @@
# Synapse Spark Diagnostic Emitter Configuration Samples

## Introduction

The Azure Synapse Spark diagnostic emitter extension is a library that enables a Spark application to emit its logs, event logs, and metrics to one or more destinations,
including Azure Log Analytics, Azure Storage, and Azure Event Hubs.

The sample templates in this repo are designed to help users quickly enable this feature.
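The sample configurations that follow are applied at the Apache Spark pool level; once a session starts on such a pool, the emitter settings show up as ordinary Spark properties. A minimal sketch for sanity-checking them from a Synapse notebook, assuming the `spark` session object that Synapse notebooks provide (property names are taken from the samples):

```python
# Sketch: confirm the diagnostic emitter settings from inside a notebook session.
# Assumes the SparkSession `spark` that Synapse notebooks expose by default.
emitters = spark.conf.get("spark.synapse.diagnostic.emitters", "")
print("Configured emitters:", emitters or "<none>")

for name in (e.strip() for e in emitters.split(",") if e.strip()):
    prefix = f"spark.synapse.diagnostic.emitter.{name}"
    dest_type = spark.conf.get(f"{prefix}.type", "<unset>")
    categories = spark.conf.get(f"{prefix}.categories", "<unset>")
    print(f"{name}: type={dest_type}, categories={categories}")
```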
New file: Spark diagnostic emitter sample configuration (Azure Event Hubs destination)
@@ -0,0 +1,5 @@
spark.synapse.diagnostic.emitters MyDestination1
spark.synapse.diagnostic.emitter.MyDestination1.type AzureEventHub
spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault <key-vault-name>
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName <key-vault-secret-name-for-connection-string>
New file: Spark diagnostic emitter sample configuration (Azure Log Analytics destination)
@@ -0,0 +1,6 @@
spark.synapse.diagnostic.emitters MyDestination1
spark.synapse.diagnostic.emitter.MyDestination1.type AzureLogAnalytics
spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics
spark.synapse.diagnostic.emitter.MyDestination1.workspaceId <azure-log-analytics-workspace-id>
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault <key-vault-name>
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName <key-vault-secret-name-for-log-analytics-key>
New file: Spark diagnostic emitter sample configuration (Azure Storage destination)
@@ -0,0 +1,6 @@
spark.synapse.diagnostic.emitters MyDestination1
spark.synapse.diagnostic.emitter.MyDestination1.type AzureStorage
spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics
spark.synapse.diagnostic.emitter.MyDestination1.uri https://<my-blob-storage>.blob.core.windows.net/<container-name>/<folder-name>
spark.synapse.diagnostic.emitter.MyDestination1.auth AccessKey
spark.synapse.diagnostic.emitter.MyDestination1.secret <storage-access-key>
New file: Spark diagnostic emitter sample configuration (multiple destinations)
@@ -0,0 +1,15 @@
spark.synapse.diagnostic.emitters MyDestination1,MyDestination2

spark.synapse.diagnostic.emitter.MyDestination1.type AzureStorage
spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics
spark.synapse.diagnostic.emitter.MyDestination1.uri https://<my-blob-storage>.blob.core.windows.net/<container-name>/<folder-name>
spark.synapse.diagnostic.emitter.MyDestination1.auth AccessKey
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault <key-vault-name>
spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName <key-vault-secret-name>

spark.synapse.diagnostic.emitter.MyDestination2.type AzureLogAnalytics
spark.synapse.diagnostic.emitter.MyDestination2.categories Log,EventLog,Metrics
spark.synapse.diagnostic.emitter.MyDestination2.workspaceId <azure-log-analytics-workspace-id>
spark.synapse.diagnostic.emitter.MyDestination2.secret.keyVault <key-vault-name>
spark.synapse.diagnostic.emitter.MyDestination2.secret.keyVault.secretName <key-vault-secret-name-for-log-analytics-key>
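The Key Vault-based samples above reference secrets only by name; the secret value itself (an Event Hubs connection string, a Log Analytics workspace key, or a storage access key) must already exist in that vault. A hedged sketch of seeding such a secret with the azure-keyvault-secrets SDK, reusing the placeholder names from the samples:

```python
# Sketch only: store the secret that the emitter configuration will look up.
# The vault name, secret name, and secret value are placeholders that mirror
# the <key-vault-name> / <key-vault-secret-name-...> tokens above.
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient

key_vault_name = "<key-vault-name>"
secret_name = "<key-vault-secret-name-for-log-analytics-key>"
secret_value = "<log-analytics-workspace-key>"  # or an Event Hubs connection string / storage access key

client = SecretClient(
    vault_url=f"https://{key_vault_name}.vault.azure.net",
    credential=DefaultAzureCredential(),
)
client.set_secret(secret_name, secret_value)
print(f"Stored '{secret_name}' in key vault '{key_vault_name}'.")
```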
Changes to a notebook file (weather data enrichment):
@@ -270,7 +270,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Enrich with weather data\n",
"## Enrich with weather data\n",
"\n",
"Now we append NOAA surface weather data to the taxi and holiday data. Use a similar approach to fetch the [NOAA weather history data](https://azure.microsoft.com/en-us/services/open-datasets/catalog/noaa-integrated-surface-data/) from Azure Open Datasets. "
],
Changes to a notebook file (NYC taxi data ingestion):
@@ -48,7 +48,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ingest Data\n",
"## Ingest Data\n",
"\n",
"Get a sample data of nyc yellow taxi to make it faster/easier to evaluate different approaches to prep for the modelling phase later in the notebook."
],
Changes to a notebook file (AutoML regression scoring):
@@ -444,16 +444,15 @@
"\n",
"df_all = align_outputs(y_predictions, X_trans, X_test, y_test, target_column_name)\n",
"\n",
"from azureml.automl.core._vendor.automl.client.core.common import metrics\n",
"from azureml.automl.runtime.shared.score import scoring\n",
"from matplotlib import pyplot as plt\n",
"from automl.client.core.common import constants\n",
"\n",
"# use automl metrics module\n",
"scores = metrics.compute_metrics_regression(\n",
" df_all['predicted'],\n",
" df_all[target_column_name],\n",
" list(constants.Metric.SCALAR_REGRESSION_SET),\n",
" None, None, None)\n",
"# use automl scoring module\n",
"scores = scoring.score_regression(\n",
" y_test=df_all[target_column_name],\n",
" y_pred=df_all['predicted'],\n",
" metrics=list(constants.Metric.SCALAR_REGRESSION_SET))\n",
"\n",
"print(\"[Test data scores]\\n\")\n",
"for key, value in scores.items(): \n",
107 changes: 107 additions & 0 deletions Notebooks/PySpark/exampleForParameter.ipynb
@@ -0,0 +1,107 @@
{
"cells": [
{
"cell_type": "code",
"source": [
"account_name = '<Your primary storage account name>' # fill in your primary account name\r\n",
"container_name = '<Your container name>' # fill in your container name\r\n",
"relative_path = '<Your relative path>' # fill in your relative folder path"
],
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.livy.statement-meta+json": {
"spark_pool": "sparkpool01",
"session_id": 0,
"statement_id": 3,
"state": "finished",
"livy_statement_state": "available",
"queued_time": "2021-09-17T06:08:57.5750046Z",
"session_start_time": null,
"execution_start_time": "2021-09-17T06:08:57.6725894Z",
"execution_finish_time": "2021-09-17T06:08:57.6728166Z"
},
"text/plain": "StatementMeta(sparkpool01, 0, 3, Finished, Available)"
},
"metadata": {}
}
],
"execution_count": 3,
"metadata": {
"tags": [
"parameters"
]
}
},
{
"cell_type": "code",
"source": [
"from pyspark.sql import SparkSession\r\n",
"from pyspark.sql.types import *\r\n",
"\r\n",
"adls_path = 'abfss://%s@%s.dfs.core.windows.net/%s' % (container_name, account_name, relative_path)\r\n",
"print('Primary storage account path: ' + adls_path)"
],
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.livy.statement-meta+json": {
"spark_pool": "sparkpool01",
"session_id": 0,
"statement_id": 4,
"state": "finished",
"livy_statement_state": "available",
"queued_time": "2021-09-17T06:09:00.1736428Z",
"session_start_time": null,
"execution_start_time": "2021-09-17T06:09:00.273459Z",
"execution_finish_time": "2021-09-17T06:09:00.4351164Z"
},
"text/plain": "StatementMeta(sparkpool01, 0, 4, Finished, Available)"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Primary storage account path: abfss://test0121@lilijing0227neur.dfs.core.windows.net/test01/"
]
}
],
"execution_count": 4,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
}
}
],
"metadata": {
"language_info": {
"name": "python"
},
"kernelspec": {
"name": "synapse_pyspark",
"language": "Python",
"display_name": "Synapse PySpark"
},
"kernel_info": {
"name": "synapse_pyspark"
},
"save_output": true,
"synapse_widget": {
"version": "0.1",
"state": {}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
5 changes: 5 additions & 0 deletions Notebooks/PySpark/parameters/exampleForParameter.json
@@ -0,0 +1,5 @@
{
"Your primary storage account name": "nbsampletestworkspace01",
"Your container name": "test0922",
"Your relative path": "testCSV/"
}
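How the test pipeline feeds this JSON into the notebook is not shown in this change set, but the keys line up with the angle-bracket placeholders in the cell tagged `parameters` in exampleForParameter.ipynb. A minimal local sketch of that substitution, assuming the file paths listed above:

```python
# Sketch: substitute the parameter values from the JSON file into the notebook
# cell tagged "parameters". Paths assume the layout shown in this change set.
import json

notebook_path = "Notebooks/PySpark/exampleForParameter.ipynb"
params_path = "Notebooks/PySpark/parameters/exampleForParameter.json"

with open(params_path) as f:
    params = json.load(f)  # e.g. {"Your container name": "test0922", ...}

with open(notebook_path) as f:
    nb = json.load(f)

for cell in nb["cells"]:
    if "parameters" in cell.get("metadata", {}).get("tags", []):
        updated = []
        for line in cell["source"]:
            for key, value in params.items():
                line = line.replace(f"<{key}>", value)  # "<Your container name>" -> "test0922"
            updated.append(line)
        cell["source"] = updated

with open("exampleForParameter.parameterized.ipynb", "w") as f:
    json.dump(nb, f, indent=1)
```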