
Commit eb5da13

Authored by stevenlix, Ryan Lai, Prabhat, jeffbloo, and Pranav Sharma

Cherry pick fixes to release branch rel-1.3.0 (#3936)

* Fix DirectML nuget creation in Nuget pipeline (#3929)
* Added onnxruntime aarch64 wheel to pypi publishing pipeline (#3903)
  * Added onnxruntime aarch64 wheel to pypi publishing pipeline
  * Support nightly build flag
  * Add support for nightly build
* Fix error handling in LearningModelSession.cpp (#3920)
* Update DML Nuget version and DML EP Doc (#3945)
* Fix ordering of APIs. (#3951)

Co-authored-by: Ryan Lai <[email protected]>
Co-authored-by: Prabhat <[email protected]>
Co-authored-by: Jeff Bloomfield <[email protected]>
Co-authored-by: Pranav Sharma <[email protected]>
1 parent d80e15f commit eb5da13

File tree

8 files changed: +95 / -27 lines changed

cmake/external/dml.cmake

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
   set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
   set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
   get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
-  set(DML_PACKAGE_DIR ${PACKAGES_DIR}/DirectML.0.0.4)
+  set(DML_PACKAGE_DIR ${PACKAGES_DIR}/DirectML.2.1.0)
 
   # Restore nuget packages, which will pull down the DirectML redist package
   add_custom_command(

csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.cs

Lines changed: 4 additions & 2 deletions
@@ -13,6 +13,8 @@ public struct OrtApiBase
     public IntPtr GetVersionString;
 };
 
+// NOTE: The order of the APIs in this struct should match exactly that in
+// OrtApi ort_api_1_to_3 (onnxruntime_c_api.cc)
 [StructLayout(LayoutKind.Sequential)]
 public struct OrtApi
 {
@@ -38,8 +40,8 @@ public struct OrtApi
     public IntPtr EnableCpuMemArena;
     public IntPtr DisableCpuMemArena;
     public IntPtr SetSessionLogId;
-    public IntPtr SetSessionLogSeverityLevel;
     public IntPtr SetSessionLogVerbosityLevel;
+    public IntPtr SetSessionLogSeverityLevel;
     public IntPtr SetSessionGraphOptimizationLevel;
     public IntPtr SetIntraOpNumThreads;
     public IntPtr SetInterOpNumThreads;
@@ -59,8 +61,8 @@ public struct OrtApi
     public IntPtr SessionGetOutputName;
     public IntPtr SessionGetOverridableInitializerName;
     public IntPtr CreateRunOptions;
-    public IntPtr RunOptionsSetRunLogSeverityLevel;
     public IntPtr RunOptionsSetRunLogVerbosityLevel;
+    public IntPtr RunOptionsSetRunLogSeverityLevel;
     public IntPtr RunOptionsSetRunTag;
     public IntPtr RunOptionsGetRunLogVerbosityLevel;
     public IntPtr RunOptionsGetRunLogSeverityLevel;
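The reordering above matters because the C# `OrtApi` struct is a positional mirror of the native function-pointer table: each `IntPtr` field binds to whatever function sits at the same index in `ort_api_1_to_3`. A minimal Python sketch (illustrative only, not ONNX Runtime code; all names hypothetical) of how a swapped declaration order silently dispatches to the wrong function:

```python
# A "native" API exposed as an ordered function table, mirrored by a client
# that resolves functions by position, as the C# OrtApi struct mirrors the
# native OrtApi table. Names are hypothetical stand-ins.

native_table = [
    ("SetSessionLogId", lambda: "log id set"),
    ("SetSessionLogVerbosityLevel", lambda: "verbosity set"),
    ("SetSessionLogSeverityLevel", lambda: "severity set"),
]

# A client binding that declares the same entries but in the wrong order
# (severity before verbosity) resolves slot 1 to the wrong native function.
wrong_order = ["SetSessionLogId", "SetSessionLogSeverityLevel", "SetSessionLogVerbosityLevel"]
right_order = [name for name, _ in native_table]

def call(binding_order, wanted):
    # The client only knows the index at which it declared `wanted`.
    slot = binding_order.index(wanted)
    return native_table[slot][1]()  # dispatch by position, vtable-style

assert call(right_order, "SetSessionLogSeverityLevel") == "severity set"
assert call(wrong_order, "SetSessionLogSeverityLevel") == "verbosity set"  # wrong function
```

This is exactly the failure mode the added NOTE comment guards against: no compiler error, just a wrong call.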

docs/execution_providers/DirectML-ExecutionProvider.md

Lines changed: 8 additions & 7 deletions
@@ -1,16 +1,16 @@
-# DirectML Execution Provider (Preview)
+# DirectML Execution Provider
 
 DirectML is a high-performance, hardware-accelerated DirectX 12 library for machine learning on Windows. DirectML provides GPU acceleration for common machine learning tasks across a broad range of supported hardware and drivers.
 
 When used standalone, the DirectML API is a low-level DirectX 12 library and is suitable for high-performance, low-latency applications such as frameworks, games, and other real-time applications. The seamless interoperability of DirectML with Direct3D 12 as well as its low overhead and conformance across hardware makes DirectML ideal for accelerating machine learning when both high performance is desired, and the reliability and predictabiltiy of results across hardware is critical.
 
 The *DirectML Execution Provider* is an optional component of ONNX Runtime that uses DirectML to accelerate inference of ONNX models. The DirectML execution provider is capable of greatly improving evaluation time of models using commodity GPU hardware, without sacrificing broad hardware support or requiring vendor-specific extensions to be installed.
 
-The DirectML Execution Provider is currently in preview.
+The DirectML Execution Provider currently uses DirectML version 2.1.0.
 
 ## Table of contents
 
-- [DirectML Execution Provider (Preview)](#directml-execution-provider-preview)
+- [DirectML Execution Provider](#directml-execution-provider)
 - [Table of contents](#table-of-contents)
 - [Minimum requirements](#minimum-requirements)
 - [Building from source](#building-from-source)
@@ -48,7 +48,7 @@ To build onnxruntime with the DML EP included, supply the `--use_dml` parameter
 
 The DirectML execution provider supports building for both x64 (default) and x86 architectures.
 
-Note that building onnxruntime with the DirectML execution provider enabled causes the the DirectML redistributable package to be automatically downloaded as part of the build. This package contains a pre-release version of DirectML, and its use is governed by a license whose text may be found as part of the NuGet package.
+Note that building onnxruntime with the DirectML execution provider enabled causes the the DirectML redistributable package to be automatically downloaded as part of the build. Its use is governed by a license whose text may be found as part of the NuGet package.
 
 
@@ -83,7 +83,7 @@ Creates a DirectML Execution Provider using the given DirectML device, and which
 
 ### ONNX opset support
 
-The DirectML execution provider currently supports ONNX opset 9 ([ONNX v1.4](https://github.com/onnx/onnx/releases/tag/v1.4.0)). Evaluating models which require a higher opset version is not supported, and may produce unexpected results.
+The DirectML execution provider currently supports ONNX opset 11 ([ONNX v1.6](https://github.com/onnx/onnx/releases/tag/v1.6.0)). Evaluating models which require a higher opset version is not supported, and may produce unexpected results.
 
 ### Multi-threading and supported session options
 
@@ -114,8 +114,9 @@ The DirectML execution provider works most efficiently when tensor shapes are kn
 
 Normally when the shapes of model inputs are known during session creation, the shapes for the rest of the model are inferred by OnnxRuntime when a session is created. However if a model input contains a free dimension (such as for batch size), steps must be taken to retain the above performance benefits.
 
-In this case, there are two options:
-- Edit the model to replace an input's free dimension (specified through ONNX using "dim_param") with a fixed size.
+In this case, there are three options:
+- Edit the model to replace an input's free dimension (specified through ONNX using "dim_param") with a fixed size (specified through ONNX using "dim_value").
+- Specify values of named dimensions within model inputs when creating the session using the OnnxRuntime *AddFreeDimensionOverrideByName* ABI.
 - Edit the model to ensure that an input's free dimension has a [denotation](https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md) (such as "DATA_BATCH," or a custom denotation). Then when creating the session, specify the dimension size for each denotation. This can be done using the OnnxRuntime *AddFreeDimensionOverride* ABI.
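All three options in the doc change above amount to turning a named free dimension (a "dim_param" such as "batch") into a concrete size (a "dim_value"). A schematic Python sketch of the first option, using a simplified stand-in for an ONNX tensor shape rather than the real protobuf types (the function and shape encoding are illustrative, not an onnxruntime API):

```python
# Schematic sketch: replace free dimensions (dim_param strings such as
# "batch") with fixed dim_value sizes. A real implementation would walk
# model.graph.input[...].type.tensor_type.shape.dim in the onnx protobuf.

def fix_free_dimensions(shape, overrides):
    """shape: list mixing ints (dim_value) and strings (dim_param).
    overrides: mapping from dim_param name to a fixed size."""
    fixed = []
    for dim in shape:
        if isinstance(dim, str):            # a free dimension
            if dim not in overrides:
                raise ValueError(f"no override for free dimension {dim!r}")
            fixed.append(overrides[dim])    # pin it to a concrete size
        else:
            fixed.append(dim)               # already a fixed dim_value
    return fixed

# An input of shape ["batch", 3, 224, 224] pinned to batch size 1:
assert fix_free_dimensions(["batch", 3, 224, 224], {"batch": 1}) == [1, 3, 224, 224]
```

The second and third options achieve the same pinning at session-creation time, via *AddFreeDimensionOverrideByName* (by dimension name) or *AddFreeDimensionOverride* (by denotation), without editing the model file.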

packages.config

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="utf-8"?>
 <packages>
-  <package id="DirectML" version="0.0.4" targetFramework="native" />
+  <package id="DirectML" version="2.1.0" targetFramework="native" />
   <package id="GoogleTestAdapter" version="0.17.1" targetFramework="net46" />
 </packages>

tools/ci_build/github/azure-pipelines/azure-pipelines-py-packaging.yml

Lines changed: 62 additions & 0 deletions
@@ -343,3 +343,65 @@ jobs:
     ArtifactName: onnxruntime
 
 - template: templates/component-governance-component-detection-steps.yml
+
+- job: Linux_ARM_py_Wheels
+  timeoutInMinutes: 60
+  pool: 'Linux-CPU'
+  strategy:
+    matrix:
+      Py37:
+        python.include: '3.7m'
+        cp.tag: 'cp37-cp37m'
+      Py36:
+        python.include: '3.6m'
+        cp.tag: 'cp36-cp36m'
+      Py35:
+        python.include: '3.5m'
+        cp.tag: 'cp35-cp35m'
+  steps:
+  - task: CmdLine@2
+    inputs:
+      script: |
+        set -e -x
+        sudo rm -rf *
+        cd $(Build.SourcesDirectory)
+        git submodule update --init --recursive
+        cd -
+        sudo apt-get install -y qemu-user-static
+        sudo chmod a+x /usr/bin/azcopy
+
+        cat << EOF > tool-chain.cmake
+        SET(CMAKE_SYSTEM_NAME Linux)
+        SET(CMAKE_SYSTEM_VERSION 1)
+        SET(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+        SET(CMAKE_C_FLAGS "-march=armv8-a -mtune=generic -Wno-unused-parameter -Wno-type-limits")
+        SET(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+        SET(CMAKE_CXX_FLAGS "-march=armv8-a -mtune=generic -Wno-unused-parameter -Wno-type-limits")
+        SET(CMAKE_FIND_ROOT_PATH /mnt/toolchains/manylinux2014_aarch64)
+        SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+        SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+        SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+        SET(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+        EOF
+        export PATH=/mnt/toolchains/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu/bin:$PATH
+        azcopy cp https://onnxruntimetestdata.blob.core.windows.net/models/toolchains.tar.xz $(Build.BinariesDirectory)/toolchains.tar.xz
+        sudo rm -rf /mnt/toolchains
+        mkdir /mnt/toolchains
+        tar -Jxf $(Build.BinariesDirectory)/toolchains.tar.xz -C /mnt/toolchains
+        aria2c -q https://github.com/protocolbuffers/protobuf/releases/download/v3.11.1/protoc-3.11.1-linux-x86_64.zip
+        unzip protoc-3.11.1-linux-x86_64.zip
+        aria2c -q https://github.com/Kitware/CMake/releases/download/v3.17.1/cmake-3.17.1-Linux-x86_64.tar.gz
+        tar --strip=1 -zxf cmake-3.17.1-Linux-x86_64.tar.gz
+        sudo cp /mnt/toolchains/manylinux2014_aarch64/usr/include/stdlib.h /mnt/toolchains/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu/aarch64-linux-gnu/libc/usr/include/
+        bin/cmake -Donnxruntime_GCC_STATIC_CPP_RUNTIME=ON -DCMAKE_BUILD_TYPE=Release -Dprotobuf_WITH_ZLIB=OFF -DCMAKE_TOOLCHAIN_FILE=tool-chain.cmake -Donnxruntime_ENABLE_PYTHON=ON -DPYTHON_LIBRARY=dl -DPYTHON_EXECUTABLE=/mnt/toolchains/manylinux2014_aarch64/opt/python/'$(cp.tag)'/bin/python3 -Donnxruntime_BUILD_SHARED_LIB=OFF -Donnxruntime_RUN_ONNX_TESTS=OFF -Donnxruntime_DEV_MODE=ON -DONNX_CUSTOM_PROTOC_EXECUTABLE=$(Build.BinariesDirectory)/bin/protoc "-DPYTHON_INCLUDE_DIR=/mnt/toolchains/manylinux2014_aarch64/usr/include;/mnt/toolchains/manylinux2014_aarch64/opt/python/$(cp.tag)/include/python$(python.include)" -DNUMPY_INCLUDE_DIR=/mnt/toolchains $(Build.SourcesDirectory)/cmake
+        make -j$(getconf _NPROCESSORS_ONLN)
+        case $NIGHTLY_BUILD in
+          1) docker run -v /usr/bin/qemu-aarch64-static:/usr/bin/qemu-aarch64-static -v $(Build.BinariesDirectory):/tmp/a -v $(Build.SourcesDirectory):/tmp/b -w /tmp/a --rm quay.io/pypa/manylinux2014_aarch64 /opt/python/'$(cp.tag)'/bin/python3 /tmp/b/setup.py bdist_wheel --nightly_build;;
+          *) docker run -v /usr/bin/qemu-aarch64-static:/usr/bin/qemu-aarch64-static -v $(Build.BinariesDirectory):/tmp/a -v $(Build.SourcesDirectory):/tmp/b -w /tmp/a --rm quay.io/pypa/manylinux2014_aarch64 /opt/python/'$(cp.tag)'/bin/python3 /tmp/b/setup.py bdist_wheel;;
+        esac
+      workingDirectory: $(Build.BinariesDirectory)
+  - task: PublishBuildArtifacts@1
+    displayName: 'Publish Artifact: ONNXRuntime python wheel'
+    inputs:
+      PathtoPublish: '$(Build.BinariesDirectory)/dist'
+      ArtifactName: onnxruntime
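The `case $NIGHTLY_BUILD in` branch at the end of the pipeline script picks the wheel-build invocation: nightly builds append `--nightly_build` to `setup.py bdist_wheel`, everything else builds a plain wheel. The same selection logic can be sketched in Python (a hypothetical helper, not part of the pipeline):

```python
import os

def wheel_args(env=os.environ):
    # Mirror of the pipeline's NIGHTLY_BUILD case statement: only when the
    # variable is exactly "1" does the wheel get the --nightly_build flag.
    args = ["setup.py", "bdist_wheel"]
    if env.get("NIGHTLY_BUILD") == "1":
        args.append("--nightly_build")
    return args

assert wheel_args({"NIGHTLY_BUILD": "1"}) == ["setup.py", "bdist_wheel", "--nightly_build"]
assert wheel_args({}) == ["setup.py", "bdist_wheel"]
```

This matches the shell `case` semantics, where any value other than `1` (including unset) falls through to the default `*)` branch.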

tools/ci_build/github/azure-pipelines/linux-arm-ci-pipeline.yml

Lines changed: 2 additions & 2 deletions
@@ -29,9 +29,9 @@ jobs:
         SET(CMAKE_SYSTEM_NAME Linux)
         SET(CMAKE_SYSTEM_VERSION 1)
         SET(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
-        set(CMAKE_C_FLAGS "-march=armv8-a -mtune=generic -Wno-unused-parameter -Wno-type-limits")
+        SET(CMAKE_C_FLAGS "-march=armv8-a -mtune=generic -Wno-unused-parameter -Wno-type-limits")
         SET(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
-        set(CMAKE_CXX_FLAGS "-march=armv8-a -mtune=generic -Wno-unused-parameter -Wno-type-limits")
+        SET(CMAKE_CXX_FLAGS "-march=armv8-a -mtune=generic -Wno-unused-parameter -Wno-type-limits")
         SET(CMAKE_FIND_ROOT_PATH /mnt/toolchains/manylinux2014_aarch64)
         SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
         SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)

tools/nuget/generate_nuspec_for_native_nuget.py

Lines changed: 7 additions & 4 deletions
@@ -148,9 +148,12 @@ def generate_files(list, args):
         files_list.append('<file src=' + '"' + os.path.join(args.native_build_path, 'onnxruntime.pdb') + '" target="runtimes\\win-' + args.target_architecture + '\\native" />')
 
     if includes_directml:
-        files_list.append('<file src=' + '"' + os.path.join(args.native_build_path, 'DirectML.dll') + '" target="runtimes\\win-' + args.target_architecture + '\\native" />')
-        files_list.append('<file src=' + '"' + os.path.join(args.native_build_path, 'DirectML.pdb') + '" target="runtimes\\win-' + args.target_architecture + '\\native" />')
-        files_list.append('<file src=' + '"' + os.path.join(args.packages_path, 'DirectML.0.0.2\\LICENSE.txt') + '" target="DirectML_LICENSE.txt" />')
+        files_list.append('<file src=' + '"' + os.path.join(args.native_build_path, 'DirectML.dll') +
+                          '" target="runtimes\\win-' + args.target_architecture + '\\native" />')
+        files_list.append('<file src=' + '"' + os.path.join(args.native_build_path, 'DirectML.pdb') +
+                          '" target="runtimes\\win-' + args.target_architecture + '\\native" />')
+        files_list.append('<file src=' + '"' + os.path.join(args.packages_path, 'DirectML.2.1.0\\LICENSE.txt') +
+                          '" target="DirectML_LICENSE.txt" />')
 
     if includes_winml:
         # Process microsoft.ai.machinelearning import lib, dll, and pdb
@@ -251,4 +254,4 @@ def main():
     f.write('\n')
 
 if __name__ == "__main__":
-    sys.exit(main())
+    sys.exit(main())
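The diff above wraps the long `files_list.append` lines and bumps the license path from the DirectML 0.0.2 package to 2.1.0 (matching the version change in packages.config and dml.cmake). A simplified, self-contained sketch of the entries this branch of `generate_files` emits, not the actual script, with the version as a parameter so the bump is visible:

```python
import os.path

def directml_file_entries(native_build_path, packages_path, target_architecture,
                          dml_version="2.1.0"):
    # Simplified sketch of the nuspec <file> entries generated when the
    # package includes DirectML; helper name and signature are illustrative.
    runtimes = 'runtimes\\win-' + target_architecture + '\\native'
    return [
        '<file src="' + os.path.join(native_build_path, 'DirectML.dll') +
        '" target="' + runtimes + '" />',
        '<file src="' + os.path.join(native_build_path, 'DirectML.pdb') +
        '" target="' + runtimes + '" />',
        '<file src="' + os.path.join(packages_path, 'DirectML.' + dml_version, 'LICENSE.txt') +
        '" target="DirectML_LICENSE.txt" />',
    ]

entries = directml_file_entries('build', 'packages', 'x64')
assert 'DirectML.2.1.0' in entries[2]   # license now sourced from the 2.1.0 package
assert all(e.endswith('/>') for e in entries)
```

Note the bug this commit fixes: the old code referenced `DirectML.0.0.2` while the build restored `DirectML.0.0.4`, so the nuspec pointed at a package directory that did not exist.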

winml/lib/Api/LearningModelSession.cpp

Lines changed: 10 additions & 10 deletions
@@ -103,16 +103,16 @@ void LearningModelSession::Initialize() {
   engine_factory_.copy_from(model_impl->GetEngineFactory());
 
   com_ptr<_winml::IEngineBuilder> engine_builder;
-  engine_factory_->CreateEngineBuilder(engine_builder.put());
+  WINML_THROW_IF_FAILED(engine_factory_->CreateEngineBuilder(engine_builder.put()));
 
   if (device_impl->IsCpuDevice() == false) {
-    engine_builder->SetD3D12Resources(device_impl->GetD3DDevice(), device_impl->GetDeviceQueue());
-    engine_builder->SetMetacommandsEnabled(device_impl->MetacommandsEnabled());
+    WINML_THROW_IF_FAILED(engine_builder->SetD3D12Resources(device_impl->GetD3DDevice(), device_impl->GetDeviceQueue()));
+    WINML_THROW_IF_FAILED(engine_builder->SetMetacommandsEnabled(device_impl->MetacommandsEnabled()));
   }
 
   // Make onnxruntime apply the batch size override, if any
   if (session_options_ && session_options_.BatchSizeOverride() != 0) {
-    engine_builder->SetBatchSizeOverride(session_options_.BatchSizeOverride());
+    WINML_THROW_IF_FAILED(engine_builder->SetBatchSizeOverride(session_options_.BatchSizeOverride()));
   }
 
   com_ptr<_winml::IEngine> engine;
@@ -123,7 +123,7 @@ void LearningModelSession::Initialize() {
   WINML_THROW_IF_FAILED(engine->RegisterCustomRegistry(operator_registry_.get()));
 
   // Register transformers - this should probably not be exposed on IEngine, but an internal call as this configuration step is ort specific.
-  engine->RegisterGraphTransformers();
+  WINML_THROW_IF_FAILED(engine->RegisterGraphTransformers());
 
   // Load the model into the session
   WINML_THROW_IF_FAILED(engine->LoadModel(model.get()));
@@ -229,17 +229,17 @@ uint64_t LearningModelSession::Run(winrt::com_ptr<winmlp::LearningModelBinding>
                  std::back_inserter(outputs_raw),
                  [&](auto& input) { return input.get(); });
 
-  engine_->Run(input_names_raw.data(),
+  WINML_THROW_IF_FAILED(engine_->Run(input_names_raw.data(),
                inputs_raw.data(),
                input_names_raw.size(),
                output_names_raw.data(),
               outputs_raw.data(),
-               output_names_raw.size());
+               output_names_raw.size()));
 
   if (!device->IsCpuDevice()) {
     // Flush the D3D12 work from the DML execution provider and queue a fence before we release the lock.
     // This allows us to wait without holding onto the lock in GetResults.
-    engine_->FlushContext();
+    WINML_THROW_IF_FAILED(engine_->FlushContext());
     return device->GetD3DDeviceCache()->QueueFenceToD3D12();
   }
 
@@ -268,10 +268,10 @@ LearningModelSession::GetResults(
   if (is_gpu_evaluation) {
     // For DML we aren't using the Sync function because we want to make fencing the
    // completed frame thread safe while not holding the lock while waiting for the gpu.
-    engine_->ReleaseCompletedReferences();
+    WINML_THROW_IF_FAILED(engine_->ReleaseCompletedReferences());
   } else {
     // For CPU call the standard Sync function
-    engine_->Sync();
+    WINML_THROW_IF_FAILED(engine_->Sync());
   }
 
   // This isn't the best we are holding the lock while we wait for detensorize on the GPU.
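Every change in this file is the same fix: a call that returns a COM-style `HRESULT` is wrapped in `WINML_THROW_IF_FAILED`, so failures surface as exceptions instead of being silently discarded. A Python analogue of that pattern (illustrative only; the error code and helper names are stand-ins, not WinML code):

```python
# Illustrative analogue of WINML_THROW_IF_FAILED: COM-style APIs report
# errors through an HRESULT return value (negative as a signed 32-bit int
# when FAILED), which callers can all too easily ignore.

E_FAIL = -2147467259  # 0x80004005 as a signed 32-bit int

class HResultError(RuntimeError):
    def __init__(self, hr):
        super().__init__(f"call failed with HRESULT {hr & 0xFFFFFFFF:#010x}")
        self.hr = hr

def throw_if_failed(hr):
    # FAILED(hr) in COM means hr < 0; succeed silently otherwise.
    if hr < 0:
        raise HResultError(hr)
    return hr

def flaky_engine_call(ok):
    # Stand-in for an IEngine method returning S_OK (0) or E_FAIL.
    return 0 if ok else E_FAIL

throw_if_failed(flaky_engine_call(True))         # S_OK passes through
try:
    throw_if_failed(flaky_engine_call(False))    # E_FAIL now raises
except HResultError as e:
    assert e.hr == E_FAIL
```

Without the wrapper, a failed `CreateEngineBuilder` or `Run` would leave the session in a bad state and the failure would only show up later, far from its cause, which is the bug class #3920 addresses.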
