
Commit 40ff77b

Chris Sullivan authored and rkimballn1 committed
Update pad on nvgpu (#1759)
* Add pad with fill operator using the outward-in index pattern (see the indexing sketch below).
* Remove static pad and rename build_pad_dynamic -> build_pad. Update maxpool 1d padding.
* Formatting.
* Split build_pad_dynamic into build_pad and build_pad_fill.
* Add test coverage for fixed bug in op::Pad for gpu.
1 parent 519b18a commit 40ff77b
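The "outward-in" index pattern referenced in the first bullet walks each output coordinate from the outermost dimension to the innermost, peeling strides off the flat thread index and mapping the coordinate back through padding_below and padding_interior into the input tensor; elements that land outside the input take the fill value. A minimal host-side C++ sketch of that mapping, assuming row-major strides precomputed for input and output (the function and variable names here are illustrative, not nGraph APIs):

    #include <cstdint>
    #include <vector>

    // Sketch only: map one flat output index of a Pad result back to a flat input index,
    // or report that the element lies in the padding and should take the fill value.
    bool map_pad_index(uint32_t tid,
                       const std::vector<uint32_t>& out_strides,
                       const std::vector<uint32_t>& in_strides,
                       const std::vector<uint32_t>& in_shape,
                       const std::vector<uint32_t>& padding_below,
                       const std::vector<uint32_t>& padding_interior,
                       uint32_t& input_pixel)
    {
        bool in_bounds = true;
        uint32_t output_pixel = tid;
        input_pixel = 0;
        for (size_t i = 0; i < out_strides.size(); i++) // outermost -> innermost
        {
            if (i != 0)
            {
                output_pixel %= out_strides[i - 1];
            }
            // coordinate in the dilated, padded frame, shifted by the leading pad
            int32_t input_dil = static_cast<int32_t>(output_pixel / out_strides[i]) -
                                static_cast<int32_t>(padding_below[i]);
            int32_t input = input_dil / static_cast<int32_t>(padding_interior[i] + 1);
            input_dil %= static_cast<int32_t>(padding_interior[i] + 1);
            in_bounds = in_bounds && (input >= 0) &&
                        (input < static_cast<int32_t>(in_shape[i])) && (input_dil == 0);
            // input_pixel is only meaningful when in_bounds stays true
            input_pixel += static_cast<uint32_t>(input) * in_strides[i];
        }
        return in_bounds; // false => the caller writes the fill value
    }

For a 1-D input {a, b, c} with padding_below = 1 and padding_interior = 1, output index 3 maps back to input index 1 (value b), while output index 2 falls on an interior gap and takes the fill value. The CUDA kernel emitted by get_pad_fill_op further down performs this same walk, one thread per output element.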

File tree

7 files changed: +261 -219 lines changed


src/ngraph/runtime/gpu/cuda_emitter.cpp

Lines changed: 91 additions & 166 deletions
Large diffs are not rendered by default.

src/ngraph/runtime/gpu/cuda_emitter.hpp

Lines changed: 8 additions & 10 deletions
@@ -50,19 +50,17 @@ namespace ngraph
     size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);

 public:
-    size_t build_pad(const std::array<std::string, 2>& dtypes,
+    size_t build_pad(const std::vector<std::string>& dtypes,
                      NVShape input_shape,
                      NVShape output_shape,
-                     NVShape pad_below,
-                     NVShape pad_above,
-                     NVShape pad_interior,
-                     const std::string& pad_value = "");
+                     NVShape padding_below,
+                     NVShape padding_interior);

-    size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes,
-                             NVShape input_shape,
-                             NVShape output_shape,
-                             NVShape padding_below,
-                             NVShape padding_interior);
+    size_t build_pad_fill(const std::vector<std::string>& dtypes,
+                          NVShape input_shape,
+                          NVShape output_shape,
+                          NVShape padding_below,
+                          NVShape padding_interior);

     size_t build_1d_max_pool(const std::array<std::string, 2>& dtypes,
                              NVShape input_shape,
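Note the signature change: dtypes is now a std::vector<std::string>, and the zero-pad build_pad takes only {input_type, output_type}, while build_pad_fill also carries the element type of the scalar pad-value tensor (the emit_Pad change in gpu_emitter.cpp below passes three types). A hedged usage sketch — the emitter pointer, shapes, and padding values are placeholders, not taken from the repository:

    // Illustrative only: requesting the two pad primitives with the new signatures.
    // `cuda_emitter` is assumed to be the CUDA emitter obtained from the primitive
    // emitter; the shapes and padding vectors are made-up example values.
    size_t zero_pad_index =
        cuda_emitter->build_pad({"float", "float"},   // {input_type, output_type}
                                NVShape{2, 2, 4, 4},  // input_shape
                                NVShape{2, 2, 6, 6},  // output_shape
                                NVShape{0, 0, 0, 0},  // padding_below
                                NVShape{0, 0, 0, 0}); // padding_interior

    size_t fill_pad_index =
        cuda_emitter->build_pad_fill({"float", "float", "float"}, // {input, pad value, output}
                                     NVShape{2, 2, 4, 4},
                                     NVShape{2, 2, 6, 6},
                                     NVShape{0, 0, 0, 0},
                                     NVShape{0, 0, 0, 0});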

src/ngraph/runtime/gpu/cudnn_emitter.cpp

Lines changed: 25 additions & 26 deletions
@@ -418,7 +418,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
     Shape padding_interior(data_dilation_strides);

     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
@@ -431,8 +431,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
         idx_workspace = allocator.reserve_workspace(temp_size, true);

         auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-        pad_dynamic_index =
-            cuda_emitter->build_pad_dynamic({{args[0].get_element_type().c_type_string(),
+        pad_index = cuda_emitter->build_pad({{args[0].get_element_type().c_type_string(),
                                               out[0].get_element_type().c_type_string()}},
                                             input_shape,
                                             input_shape_padded,
@@ -458,11 +457,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             if (idx_workspace != std::numeric_limits<size_t>::max() &&
-                pad_dynamic_index != std::numeric_limits<size_t>::max())
+                pad_index != std::numeric_limits<size_t>::max())
             {
                 void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
                 gpu::invoke_primitive(m_ctx,
-                                      pad_dynamic_index,
+                                      pad_index,
                                       std::vector<void*>{inputs[0]}.data(),
                                       std::vector<void*>{pad_buffer}.data());
                 gpu::invoke_primitive(
@@ -542,7 +541,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     Shape padding_interior(data_dilation_strides);

     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     size_t slice_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
@@ -556,11 +555,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
         idx_workspace = allocator.reserve_workspace(temp_size, true);

         auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-        pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                            output_shape,
-                                                            output_shape_padded,
-                                                            padding_below,
-                                                            padding_interior);
+        pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                            output_shape,
+                                            output_shape_padded,
+                                            padding_below,
+                                            padding_interior);

         slice_index = cuda_emitter->build_slice({{input_type, output_type}},
                                                 output_shape_padded,
@@ -587,12 +586,12 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
                                                                          void** outputs) mutable {
         if (idx_workspace != std::numeric_limits<size_t>::max() &&
-            pad_dynamic_index != std::numeric_limits<size_t>::max() &&
+            pad_index != std::numeric_limits<size_t>::max() &&
             slice_index != std::numeric_limits<size_t>::max())
         {
             void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
             gpu::invoke_primitive(m_ctx,
-                                  pad_dynamic_index,
+                                  pad_index,
                                   std::vector<void*>{inputs[0]}.data(),
                                   std::vector<void*>{pad_buffer}.data());
             gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data());
@@ -662,7 +661,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     Shape padding_interior(data_dilation_strides);

     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
@@ -675,11 +674,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
         idx_workspace = allocator.reserve_workspace(temp_size, true);

         auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-        pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                            input_shape_0,
-                                                            input_shape_padded,
-                                                            padding_below,
-                                                            padding_interior);
+        pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                            input_shape_0,
+                                            input_shape_padded,
+                                            padding_below,
+                                            padding_interior);

         // asymetric padding has been applied, zero out padding vectors to
         // ensure cudnn does not assume padding
@@ -700,11 +699,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             if (idx_workspace != std::numeric_limits<size_t>::max() &&
-                pad_dynamic_index != std::numeric_limits<size_t>::max())
+                pad_index != std::numeric_limits<size_t>::max())
             {
                 void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
                 gpu::invoke_primitive(m_ctx,
-                                      pad_dynamic_index,
+                                      pad_index,
                                       std::vector<void*>{inputs[0]}.data(),
                                       std::vector<void*>{pad_buffer}.data());
                 gpu::invoke_primitive(
@@ -768,11 +767,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
                                          padded_size * args[0].get_element_type().size());

         auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-        pad_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                    input_shape,
-                                                    input_shape_padded,
-                                                    padding_below,
-                                                    padding_interior);
+        pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                            input_shape,
+                                            input_shape_padded,
+                                            padding_below,
+                                            padding_interior);

         // asymetric padding has been applied, zero out padding vectors to
         // ensure cuDNN does not assume padding during pooling
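Throughout these builders the renamed primitive is used the same way: the pad index and its workspace default to the size_t sentinel and are only assigned when pad_required || is_deconvolution, so the captured lambda pads into a temporary buffer before calling cuDNN only when both were actually built. A compressed sketch of that control flow, abbreviated from the hunks above and assuming an unpadded fallback branch like the ones truncated out of them:

    // Sketch of the guarded pad-then-convolve pattern (not a literal excerpt).
    size_t idx_workspace = std::numeric_limits<size_t>::max();
    size_t pad_index = std::numeric_limits<size_t>::max();
    if (pad_required || is_deconvolution)
    {
        idx_workspace = allocator.reserve_workspace(temp_size, true);
        pad_index = cuda_emitter->build_pad({{input_type, output_type}},
                                            input_shape,
                                            input_shape_padded,
                                            padding_below,
                                            padding_interior);
    }

    auto kernel = [=](void** inputs, void** outputs) mutable {
        if (idx_workspace != std::numeric_limits<size_t>::max() &&
            pad_index != std::numeric_limits<size_t>::max())
        {
            // pad inputs[0] into the workspace, then run cuDNN on the padded buffer
            void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
            gpu::invoke_primitive(m_ctx,
                                  pad_index,
                                  std::vector<void*>{inputs[0]}.data(),
                                  std::vector<void*>{pad_buffer}.data());
            gpu::invoke_primitive(
                m_ctx, conv_index, std::vector<void*>{pad_buffer}.data(), outputs);
        }
        else
        {
            // assumed fallback: no padding primitive was built, call cuDNN directly
            gpu::invoke_primitive(m_ctx, conv_index, inputs, outputs);
        }
    };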

src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp

Lines changed: 42 additions & 6 deletions
@@ -638,12 +638,10 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
     writer.block_end();
 }

-void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
-    codegen::CodeWriter& writer,
-    const std::string& name,
-    GPUKernelArgs& args,
-    const std::array<std::string, 2>& data_types,
-    size_t rank)
+void runtime::gpu::CudaKernelBuilder::get_pad_op(codegen::CodeWriter& writer,
+                                                 const std::string& name,
+                                                 GPUKernelArgs& args,
+                                                 size_t rank)
 {
     writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
     writer.block_begin();
@@ -673,6 +671,44 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
     writer.block_end();
 }

+void runtime::gpu::CudaKernelBuilder::get_pad_fill_op(codegen::CodeWriter& writer,
+                                                      const std::string& name,
+                                                      GPUKernelArgs& args,
+                                                      size_t rank)
+{
+    writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
+    writer.block_begin();
+    {
+        writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
+        writer << "if (tid < n)\n";
+        writer.block_begin();
+        {
+            writer << "bool in_bounds = true;\n";
+            writer << "uint32_t output_pixel = tid;\n";
+            writer << "uint32_t input_pixel = 0;\n";
+            writer << "int32_t input, input_dil;\n";
+            for (size_t i = 0; i < rank; i++)
+            {
+                if (i != 0)
+                {
+                    writer << "output_pixel %= output_strides" << i - 1 << ";\n";
+                }
+                writer << "input_dil = output_pixel / output_strides" << i << " - padding_below"
+                       << i << ";\n";
+
+                writer << "input = input_dil / (padding_interior" << i << " + 1);\n";
+                writer << "input_dil %= (padding_interior" << i << " + 1);\n";
+                writer << "in_bounds = in_bounds && (input >= 0) && (input < input_shape" << i
+                       << ") && (input_dil == 0);\n";
+                writer << "input_pixel += input * input_strides" << i << ";\n";
+            }
+            writer << "out[tid] = (in_bounds) ? in[input_pixel] : *pad;\n";
+        }
+        writer.block_end();
+    }
+    writer.block_end();
+}
+
 void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op(
     codegen::CodeWriter& writer,
     const std::string& name,
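To make the writer calls above concrete, here is roughly the source get_pad_fill_op would emit for a rank-2 tensor with float elements. The parameter names follow the identifiers used in the writer statements, but the exact signature and argument ordering come from GPUKernelArgs and are assumed here, as is the kernel name suffix:

    // Hypothetical rank-2 instantiation of the kernel generated by get_pad_fill_op.
    extern "C" __global__ void cuda_pad_fill(float* in, float* pad, float* out,
                                             uint32_t input_shape0, uint32_t input_shape1,
                                             uint32_t input_strides0, uint32_t input_strides1,
                                             uint32_t output_strides0, uint32_t output_strides1,
                                             uint32_t padding_below0, uint32_t padding_below1,
                                             uint32_t padding_interior0, uint32_t padding_interior1,
                                             uint32_t n)
    {
        uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n)
        {
            bool in_bounds = true;
            uint32_t output_pixel = tid;
            uint32_t input_pixel = 0;
            int32_t input, input_dil;

            // dimension 0 (outermost)
            input_dil = output_pixel / output_strides0 - padding_below0;
            input = input_dil / (padding_interior0 + 1);
            input_dil %= (padding_interior0 + 1);
            in_bounds = in_bounds && (input >= 0) && (input < input_shape0) && (input_dil == 0);
            input_pixel += input * input_strides0;

            // dimension 1 (innermost)
            output_pixel %= output_strides0;
            input_dil = output_pixel / output_strides1 - padding_below1;
            input = input_dil / (padding_interior1 + 1);
            input_dil %= (padding_interior1 + 1);
            in_bounds = in_bounds && (input >= 0) && (input < input_shape1) && (input_dil == 0);
            input_pixel += input * input_strides1;

            // copy from the input when the coordinate maps inside it, else use the fill value
            out[tid] = (in_bounds) ? in[input_pixel] : *pad;
        }
    }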

src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp

Lines changed: 9 additions & 5 deletions
@@ -130,11 +130,15 @@ namespace ngraph
                                    const std::string& math_kernel,
                                    const std::vector<std::string>& data_types);

-            static void get_pad_dynamic_op(codegen::CodeWriter& writer,
-                                           const std::string& name,
-                                           GPUKernelArgs& args,
-                                           const std::array<std::string, 2>& data_types,
-                                           size_t rank);
+            static void get_pad_op(codegen::CodeWriter& writer,
+                                   const std::string& name,
+                                   GPUKernelArgs& args,
+                                   size_t rank);
+
+            static void get_pad_fill_op(codegen::CodeWriter& writer,
+                                        const std::string& name,
+                                        GPUKernelArgs& args,
+                                        size_t rank);

             static void get_ew_collective_op(codegen::CodeWriter& writer,
                                              const std::string& name,

src/ngraph/runtime/gpu/gpu_emitter.cpp

Lines changed: 6 additions & 6 deletions
@@ -827,12 +827,12 @@ void runtime::gpu::GPU_Emitter::emit_Pad(EMIT_ARGS)

     auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();

-    auto pad_index = cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
-                                             input_shape,
-                                             output_shape,
-                                             padding_below,
-                                             padding_above,
-                                             padding_interior);
+    auto pad_index = cuda_emitter->build_pad_fill(
+        {{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
+        input_shape,
+        output_shape,
+        padding_below,
+        padding_interior);
     writer << "void* input[] = {" << node_names(args) << "};\n";
     writer << "void* output[] = {" << node_names(out) << "};\n";
     writer << "gpu::invoke_primitive(ctx, " << pad_index << ", input, output);\n";

test/backend_test.in.cpp

Lines changed: 80 additions & 0 deletions
@@ -7497,6 +7497,86 @@ NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2)
     EXPECT_EQ(expected, read_vector<float>(result));
 }

+// This test covers the case with multiple image and with asymetric pad
+// bug has been found on nvGPU side now covered by this test
+NGRAPH_TEST(${BACKEND_NAME}, pad_2channel_2image_asym)
+{
+    Shape shape_a{2, 2, 4, 4};
+    auto window_movement_strides = Strides{2, 2};
+    Shape padding_below{0, 0, 0, 0};
+    Shape padding_above{0, 0, 2, 2};
+    Shape padding_interior{0, 0, 0, 0};
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
+    Shape shape_b{};
+    auto B = make_shared<op::Parameter>(element::f32, shape_b);
+    Shape shape_r{2, 2, 6, 6};
+    auto f = make_shared<Function>(
+        make_shared<op::Pad>(A, B, padding_below, padding_above, padding_interior),
+        op::ParameterVector{A, B});
+
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f32, shape_a);
+    copy_data(a,
+              test::NDArray<float, 4>({{{{0, 1, 0, 2}, // img 0 chan 0
+                                         {0, 3, 2, 0},
+                                         {2, 0, 0, 0},
+                                         {0, 2, 1, 0}},
+
+                                        {{0, 0, 0, 2}, // img 0 chan 1
+                                         {0, 2, 3, 0},
+                                         {2, 0, 1, 0},
+                                         {2, 0, 0, 0}}},
+
+                                       {{{0, 2, 1, 1}, // img 1 chan 0
+                                         {0, 0, 2, 0},
+                                         {0, 0, 1, 2},
+                                         {0, 0, 0, 0}},
+
+                                        {{2, 1, 0, 0}, // img 1 chan 1
+                                         {0, 2, 0, 0},
+                                         {1, 1, 2, 0},
+                                         {1, 0, 0, 0}}}})
+                  .get_vector());
+
+    auto b = backend->create_tensor(element::f32, shape_b);
+    copy_data(b, vector<float>{42});
+
+    auto result = backend->create_tensor(element::f32, shape_r);
+
+    backend->call_with_validate(f, {result}, {a, b});
+    EXPECT_EQ((test::NDArray<float, 4>({{{{0, 1, 0, 2, 42, 42}, // img 0 chan 0
+                                          {0, 3, 2, 0, 42, 42},
+                                          {2, 0, 0, 0, 42, 42},
+                                          {0, 2, 1, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}},
+
+                                         {{0, 0, 0, 2, 42, 42}, // img 1 chan 0
+                                          {0, 2, 3, 0, 42, 42},
+                                          {2, 0, 1, 0, 42, 42},
+                                          {2, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}}},
+
+                                        {{{0, 2, 1, 1, 42, 42}, // img 1 chan 0
+                                          {0, 0, 2, 0, 42, 42},
+                                          {0, 0, 1, 2, 42, 42},
+                                          {0, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}},
+
+                                         {{2, 1, 0, 0, 42, 42}, // img 1 chan 1
+                                          {0, 2, 0, 0, 42, 42},
+                                          {1, 1, 2, 0, 42, 42},
+                                          {1, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}}}})
+                  .get_vector()),
+              read_vector<float>(result));
+}
+
 // Trivial case with no reduced axes.
 NGRAPH_TEST(${BACKEND_NAME}, product_trivial)
 {
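As a hand-worked check connecting this test to the fill kernel, assume row-major strides: shape_r{2, 2, 6, 6} gives output strides {72, 36, 6, 1} and shape_a{2, 2, 4, 4} gives input strides {32, 16, 4, 1}. For the output element at (0, 0, 0, 4), tid = 4: the first three dimensions all map to coordinate 0, and in the last dimension input_dil = 4 - padding_below = 4, input = 4, which fails input < input_shape = 4, so in_bounds is false and the element takes *pad = 42 — matching the expected row {0, 1, 0, 2, 42, 42}. For (0, 0, 0, 3), tid = 3 maps to input_pixel = 3 and copies in[3] = 2 from the unpadded input.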
