
Commit 40ff77b

Chris Sullivan authored and rkimballn1 committed
Update pad on nvgpu (#1759)
* Add pad with fill operator using the outward-in index pattern (see the indexing sketch below).
* Remove static pad and rename build_pad_dynamic -> build_pad. Update maxpool 1d padding.
* Formatting.
* Split build_pad_dynamic into build_pad and build_pad_fill.
* Add test coverage for fixed bug in op::Pad for gpu.
1 parent 519b18a commit 40ff77b
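The "outward-in" index pattern referenced in the first bullet walks each output coordinate from the outermost dimension to the innermost, peeling strides off the flat thread index and mapping the coordinate back through padding_below and padding_interior into the input tensor; elements that land outside the input take the fill value. A minimal host-side C++ sketch of that mapping, assuming row-major strides precomputed for input and output (the function and variable names here are illustrative, not nGraph APIs):

    #include <cstdint>
    #include <vector>

    // Sketch only: map one flat output index of a Pad result back to a flat input index,
    // or report that the element lies in the padding and should take the fill value.
    bool map_pad_index(uint32_t tid,
                       const std::vector<uint32_t>& out_strides,
                       const std::vector<uint32_t>& in_strides,
                       const std::vector<uint32_t>& in_shape,
                       const std::vector<uint32_t>& padding_below,
                       const std::vector<uint32_t>& padding_interior,
                       uint32_t& input_pixel)
    {
        bool in_bounds = true;
        uint32_t output_pixel = tid;
        input_pixel = 0;
        for (size_t i = 0; i < out_strides.size(); i++) // outermost -> innermost
        {
            if (i != 0)
            {
                output_pixel %= out_strides[i - 1];
            }
            // coordinate in the dilated, padded frame, shifted by the leading pad
            int32_t input_dil = static_cast<int32_t>(output_pixel / out_strides[i]) -
                                static_cast<int32_t>(padding_below[i]);
            int32_t input = input_dil / static_cast<int32_t>(padding_interior[i] + 1);
            input_dil %= static_cast<int32_t>(padding_interior[i] + 1);
            in_bounds = in_bounds && (input >= 0) &&
                        (input < static_cast<int32_t>(in_shape[i])) && (input_dil == 0);
            // input_pixel is only meaningful when in_bounds stays true
            input_pixel += static_cast<uint32_t>(input) * in_strides[i];
        }
        return in_bounds; // false => the caller writes the fill value
    }

For a 1-D input {a, b, c} with padding_below = 1 and padding_interior = 1, output index 3 maps back to input index 1 (value b), while output index 2 falls on an interior gap and takes the fill value. The CUDA kernel emitted by get_pad_fill_op further down performs this same walk, one thread per output element.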

File tree

7 files changed: +261 -219 lines changed


src/ngraph/runtime/gpu/cuda_emitter.cpp

Lines changed: 91 additions & 166 deletions
Large diffs are not rendered by default.

src/ngraph/runtime/gpu/cuda_emitter.hpp

Lines changed: 8 additions & 10 deletions
@@ -50,19 +50,17 @@ namespace ngraph
     size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);

 public:
-    size_t build_pad(const std::array<std::string, 2>& dtypes,
+    size_t build_pad(const std::vector<std::string>& dtypes,
                      NVShape input_shape,
                      NVShape output_shape,
-                     NVShape pad_below,
-                     NVShape pad_above,
-                     NVShape pad_interior,
-                     const std::string& pad_value = "");
+                     NVShape padding_below,
+                     NVShape padding_interior);

-    size_t build_pad_dynamic(const std::array<std::string, 2>& dtypes,
-                             NVShape input_shape,
-                             NVShape output_shape,
-                             NVShape padding_below,
-                             NVShape padding_interior);
+    size_t build_pad_fill(const std::vector<std::string>& dtypes,
+                          NVShape input_shape,
+                          NVShape output_shape,
+                          NVShape padding_below,
+                          NVShape padding_interior);

     size_t build_1d_max_pool(const std::array<std::string, 2>& dtypes,
                              NVShape input_shape,
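Note the signature change: dtypes is now a std::vector<std::string>, and the zero-pad build_pad takes only {input_type, output_type}, while build_pad_fill also carries the element type of the scalar pad-value tensor (the emit_Pad change in gpu_emitter.cpp below passes three types). A hedged usage sketch — the emitter pointer, shapes, and padding values are placeholders, not taken from the repository:

    // Illustrative only: requesting the two pad primitives with the new signatures.
    // `cuda_emitter` is assumed to be the CUDA emitter obtained from the primitive
    // emitter; the shapes and padding vectors are made-up example values.
    size_t zero_pad_index =
        cuda_emitter->build_pad({"float", "float"},   // {input_type, output_type}
                                NVShape{2, 2, 4, 4},  // input_shape
                                NVShape{2, 2, 6, 6},  // output_shape
                                NVShape{0, 0, 0, 0},  // padding_below
                                NVShape{0, 0, 0, 0}); // padding_interior

    size_t fill_pad_index =
        cuda_emitter->build_pad_fill({"float", "float", "float"}, // {input, pad value, output}
                                     NVShape{2, 2, 4, 4},
                                     NVShape{2, 2, 6, 6},
                                     NVShape{0, 0, 0, 0},
                                     NVShape{0, 0, 0, 0});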

src/ngraph/runtime/gpu/cudnn_emitter.cpp

Lines changed: 25 additions & 26 deletions
@@ -418,7 +418,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
     Shape padding_interior(data_dilation_strides);

     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
@@ -431,8 +431,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
         idx_workspace = allocator.reserve_workspace(temp_size, true);

         auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-        pad_dynamic_index =
-            cuda_emitter->build_pad_dynamic({{args[0].get_element_type().c_type_string(),
+        pad_index = cuda_emitter->build_pad({{args[0].get_element_type().c_type_string(),
                                               out[0].get_element_type().c_type_string()}},
                                             input_shape,
                                             input_shape_padded,
@@ -458,11 +457,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::Convolution* node)
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             if (idx_workspace != std::numeric_limits<size_t>::max() &&
-                pad_dynamic_index != std::numeric_limits<size_t>::max())
+                pad_index != std::numeric_limits<size_t>::max())
             {
                 void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
                 gpu::invoke_primitive(m_ctx,
-                                      pad_dynamic_index,
+                                      pad_index,
                                       std::vector<void*>{inputs[0]}.data(),
                                       std::vector<void*>{pad_buffer}.data());
                 gpu::invoke_primitive(
@@ -542,7 +541,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     Shape padding_interior(data_dilation_strides);

     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     size_t slice_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
@@ -556,11 +555,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
         idx_workspace = allocator.reserve_workspace(temp_size, true);

         auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-        pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                            output_shape,
-                                                            output_shape_padded,
-                                                            padding_below,
-                                                            padding_interior);
+        pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                            output_shape,
+                                            output_shape_padded,
+                                            padding_below,
+                                            padding_interior);

         slice_index = cuda_emitter->build_slice({{input_type, output_type}},
                                                 output_shape_padded,
@@ -587,12 +586,12 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
                                                                          void** outputs) mutable {
         if (idx_workspace != std::numeric_limits<size_t>::max() &&
-            pad_dynamic_index != std::numeric_limits<size_t>::max() &&
+            pad_index != std::numeric_limits<size_t>::max() &&
             slice_index != std::numeric_limits<size_t>::max())
         {
             void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
             gpu::invoke_primitive(m_ctx,
-                                  pad_dynamic_index,
+                                  pad_index,
                                   std::vector<void*>{inputs[0]}.data(),
                                   std::vector<void*>{pad_buffer}.data());
             gpu::invoke_primitive(m_ctx, conv_index, inputs, std::vector<void*>{pad_buffer}.data());
@@ -662,7 +661,7 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     Shape padding_interior(data_dilation_strides);

     size_t idx_workspace = std::numeric_limits<size_t>::max();
-    size_t pad_dynamic_index = std::numeric_limits<size_t>::max();
+    size_t pad_index = std::numeric_limits<size_t>::max();
     bool can_find_algo = true;
     if (pad_required || is_deconvolution)
     {
@@ -675,11 +674,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
         idx_workspace = allocator.reserve_workspace(temp_size, true);

         auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-        pad_dynamic_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                            input_shape_0,
-                                                            input_shape_padded,
-                                                            padding_below,
-                                                            padding_interior);
+        pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                            input_shape_0,
+                                            input_shape_padded,
+                                            padding_below,
+                                            padding_interior);

         // asymetric padding has been applied, zero out padding vectors to
         // ensure cudnn does not assume padding
@@ -700,11 +699,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::ConvolutionBackprop
     std::unique_ptr<gpu::primitive> kernel_launch(
         new gpu::primitive{[=](void** inputs, void** outputs) mutable {
             if (idx_workspace != std::numeric_limits<size_t>::max() &&
-                pad_dynamic_index != std::numeric_limits<size_t>::max())
+                pad_index != std::numeric_limits<size_t>::max())
             {
                 void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
                 gpu::invoke_primitive(m_ctx,
-                                      pad_dynamic_index,
+                                      pad_index,
                                       std::vector<void*>{inputs[0]}.data(),
                                       std::vector<void*>{pad_buffer}.data());
                 gpu::invoke_primitive(
@@ -768,11 +767,11 @@ size_t runtime::gpu::CUDNNEmitter::build_primitive(const op::MaxPool* node)
                                          padded_size * args[0].get_element_type().size());

         auto& cuda_emitter = m_primitive_emitter->get_cuda_emitter();
-        pad_index = cuda_emitter->build_pad_dynamic({{input_type, output_type}},
-                                                    input_shape,
-                                                    input_shape_padded,
-                                                    padding_below,
-                                                    padding_interior);
+        pad_index = cuda_emitter->build_pad({{input_type, output_type}},
+                                            input_shape,
+                                            input_shape_padded,
+                                            padding_below,
+                                            padding_interior);

         // asymetric padding has been applied, zero out padding vectors to
         // ensure cuDNN does not assume padding during pooling
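Throughout these builders the renamed primitive is used the same way: the pad index and its workspace default to the size_t sentinel and are only assigned when pad_required || is_deconvolution, so the captured lambda pads into a temporary buffer before calling cuDNN only when both were actually built. A compressed sketch of that control flow, abbreviated from the hunks above and assuming an unpadded fallback branch like the ones truncated out of them:

    // Sketch of the guarded pad-then-convolve pattern (not a literal excerpt).
    size_t idx_workspace = std::numeric_limits<size_t>::max();
    size_t pad_index = std::numeric_limits<size_t>::max();
    if (pad_required || is_deconvolution)
    {
        idx_workspace = allocator.reserve_workspace(temp_size, true);
        pad_index = cuda_emitter->build_pad({{input_type, output_type}},
                                            input_shape,
                                            input_shape_padded,
                                            padding_below,
                                            padding_interior);
    }

    auto kernel = [=](void** inputs, void** outputs) mutable {
        if (idx_workspace != std::numeric_limits<size_t>::max() &&
            pad_index != std::numeric_limits<size_t>::max())
        {
            // pad inputs[0] into the workspace, then run cuDNN on the padded buffer
            void* pad_buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
            gpu::invoke_primitive(m_ctx,
                                  pad_index,
                                  std::vector<void*>{inputs[0]}.data(),
                                  std::vector<void*>{pad_buffer}.data());
            gpu::invoke_primitive(
                m_ctx, conv_index, std::vector<void*>{pad_buffer}.data(), outputs);
        }
        else
        {
            // assumed fallback: no padding primitive was built, call cuDNN directly
            gpu::invoke_primitive(m_ctx, conv_index, inputs, outputs);
        }
    };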

src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp

Lines changed: 42 additions & 6 deletions
@@ -638,12 +638,10 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
     writer.block_end();
 }

-void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
-    codegen::CodeWriter& writer,
-    const std::string& name,
-    GPUKernelArgs& args,
-    const std::array<std::string, 2>& data_types,
-    size_t rank)
+void runtime::gpu::CudaKernelBuilder::get_pad_op(codegen::CodeWriter& writer,
+                                                 const std::string& name,
+                                                 GPUKernelArgs& args,
+                                                 size_t rank)
 {
     writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
     writer.block_begin();
@@ -673,6 +671,44 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
     writer.block_end();
 }

+void runtime::gpu::CudaKernelBuilder::get_pad_fill_op(codegen::CodeWriter& writer,
+                                                      const std::string& name,
+                                                      GPUKernelArgs& args,
+                                                      size_t rank)
+{
+    writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
+    writer.block_begin();
+    {
+        writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
+        writer << "if (tid < n)\n";
+        writer.block_begin();
+        {
+            writer << "bool in_bounds = true;\n";
+            writer << "uint32_t output_pixel = tid;\n";
+            writer << "uint32_t input_pixel = 0;\n";
+            writer << "int32_t input, input_dil;\n";
+            for (size_t i = 0; i < rank; i++)
+            {
+                if (i != 0)
+                {
+                    writer << "output_pixel %= output_strides" << i - 1 << ";\n";
+                }
+                writer << "input_dil = output_pixel / output_strides" << i << " - padding_below"
+                       << i << ";\n";
+
+                writer << "input = input_dil / (padding_interior" << i << " + 1);\n";
+                writer << "input_dil %= (padding_interior" << i << " + 1);\n";
+                writer << "in_bounds = in_bounds && (input >= 0) && (input < input_shape" << i
+                       << ") && (input_dil == 0);\n";
+                writer << "input_pixel += input * input_strides" << i << ";\n";
+            }
+            writer << "out[tid] = (in_bounds) ? in[input_pixel] : *pad;\n";
+        }
+        writer.block_end();
+    }
+    writer.block_end();
+}
+
 void runtime::gpu::CudaKernelBuilder::get_reverse_sequence_op(
     codegen::CodeWriter& writer,
     const std::string& name,
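To make the writer calls above concrete, here is roughly the source get_pad_fill_op would emit for a rank-2 tensor with float elements. The parameter names follow the identifiers used in the writer statements, but the exact signature and argument ordering come from GPUKernelArgs and are assumed here, as is the kernel name suffix:

    // Hypothetical rank-2 instantiation of the kernel generated by get_pad_fill_op.
    extern "C" __global__ void cuda_pad_fill(float* in, float* pad, float* out,
                                             uint32_t input_shape0, uint32_t input_shape1,
                                             uint32_t input_strides0, uint32_t input_strides1,
                                             uint32_t output_strides0, uint32_t output_strides1,
                                             uint32_t padding_below0, uint32_t padding_below1,
                                             uint32_t padding_interior0, uint32_t padding_interior1,
                                             uint32_t n)
    {
        uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n)
        {
            bool in_bounds = true;
            uint32_t output_pixel = tid;
            uint32_t input_pixel = 0;
            int32_t input, input_dil;

            // dimension 0 (outermost)
            input_dil = output_pixel / output_strides0 - padding_below0;
            input = input_dil / (padding_interior0 + 1);
            input_dil %= (padding_interior0 + 1);
            in_bounds = in_bounds && (input >= 0) && (input < input_shape0) && (input_dil == 0);
            input_pixel += input * input_strides0;

            // dimension 1 (innermost)
            output_pixel %= output_strides0;
            input_dil = output_pixel / output_strides1 - padding_below1;
            input = input_dil / (padding_interior1 + 1);
            input_dil %= (padding_interior1 + 1);
            in_bounds = in_bounds && (input >= 0) && (input < input_shape1) && (input_dil == 0);
            input_pixel += input * input_strides1;

            // copy from the input when the coordinate maps inside it, else use the fill value
            out[tid] = (in_bounds) ? in[input_pixel] : *pad;
        }
    }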

src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp

Lines changed: 9 additions & 5 deletions
@@ -130,11 +130,15 @@ namespace ngraph
                                    const std::string& math_kernel,
                                    const std::vector<std::string>& data_types);

-            static void get_pad_dynamic_op(codegen::CodeWriter& writer,
-                                           const std::string& name,
-                                           GPUKernelArgs& args,
-                                           const std::array<std::string, 2>& data_types,
-                                           size_t rank);
+            static void get_pad_op(codegen::CodeWriter& writer,
+                                   const std::string& name,
+                                   GPUKernelArgs& args,
+                                   size_t rank);
+
+            static void get_pad_fill_op(codegen::CodeWriter& writer,
+                                        const std::string& name,
+                                        GPUKernelArgs& args,
+                                        size_t rank);

             static void get_ew_collective_op(codegen::CodeWriter& writer,
                                              const std::string& name,

src/ngraph/runtime/gpu/gpu_emitter.cpp

Lines changed: 6 additions & 6 deletions
@@ -827,12 +827,12 @@ void runtime::gpu::GPU_Emitter::emit_Pad(EMIT_ARGS)

     auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();

-    auto pad_index = cuda_emitter->build_pad({{args[0].get_type(), out[0].get_type()}},
-                                             input_shape,
-                                             output_shape,
-                                             padding_below,
-                                             padding_above,
-                                             padding_interior);
+    auto pad_index = cuda_emitter->build_pad_fill(
+        {{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
+        input_shape,
+        output_shape,
+        padding_below,
+        padding_interior);
     writer << "void* input[] = {" << node_names(args) << "};\n";
     writer << "void* output[] = {" << node_names(out) << "};\n";
     writer << "gpu::invoke_primitive(ctx, " << pad_index << ", input, output);\n";

test/backend_test.in.cpp

Lines changed: 80 additions & 0 deletions
@@ -7497,6 +7497,86 @@ NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2)
     EXPECT_EQ(expected, read_vector<float>(result));
 }

+// This test covers the case with multiple image and with asymetric pad
+// bug has been found on nvGPU side now covered by this test
+NGRAPH_TEST(${BACKEND_NAME}, pad_2channel_2image_asym)
+{
+    Shape shape_a{2, 2, 4, 4};
+    auto window_movement_strides = Strides{2, 2};
+    Shape padding_below{0, 0, 0, 0};
+    Shape padding_above{0, 0, 2, 2};
+    Shape padding_interior{0, 0, 0, 0};
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
+    Shape shape_b{};
+    auto B = make_shared<op::Parameter>(element::f32, shape_b);
+    Shape shape_r{2, 2, 6, 6};
+    auto f = make_shared<Function>(
+        make_shared<op::Pad>(A, B, padding_below, padding_above, padding_interior),
+        op::ParameterVector{A, B});
+
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f32, shape_a);
+    copy_data(a,
+              test::NDArray<float, 4>({{{{0, 1, 0, 2}, // img 0 chan 0
+                                         {0, 3, 2, 0},
+                                         {2, 0, 0, 0},
+                                         {0, 2, 1, 0}},
+
+                                        {{0, 0, 0, 2}, // img 0 chan 1
+                                         {0, 2, 3, 0},
+                                         {2, 0, 1, 0},
+                                         {2, 0, 0, 0}}},
+
+                                       {{{0, 2, 1, 1}, // img 1 chan 0
+                                         {0, 0, 2, 0},
+                                         {0, 0, 1, 2},
+                                         {0, 0, 0, 0}},
+
+                                        {{2, 1, 0, 0}, // img 1 chan 1
+                                         {0, 2, 0, 0},
+                                         {1, 1, 2, 0},
+                                         {1, 0, 0, 0}}}})
+                  .get_vector());
+
+    auto b = backend->create_tensor(element::f32, shape_b);
+    copy_data(b, vector<float>{42});
+
+    auto result = backend->create_tensor(element::f32, shape_r);
+
+    backend->call_with_validate(f, {result}, {a, b});
+    EXPECT_EQ((test::NDArray<float, 4>({{{{0, 1, 0, 2, 42, 42}, // img 0 chan 0
+                                          {0, 3, 2, 0, 42, 42},
+                                          {2, 0, 0, 0, 42, 42},
+                                          {0, 2, 1, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}},
+
+                                         {{0, 0, 0, 2, 42, 42}, // img 1 chan 0
+                                          {0, 2, 3, 0, 42, 42},
+                                          {2, 0, 1, 0, 42, 42},
+                                          {2, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}}},
+
+                                        {{{0, 2, 1, 1, 42, 42}, // img 1 chan 0
+                                          {0, 0, 2, 0, 42, 42},
+                                          {0, 0, 1, 2, 42, 42},
+                                          {0, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}},
+
+                                         {{2, 1, 0, 0, 42, 42}, // img 1 chan 1
+                                          {0, 2, 0, 0, 42, 42},
+                                          {1, 1, 2, 0, 42, 42},
+                                          {1, 0, 0, 0, 42, 42},
+                                          {42, 42, 42, 42, 42, 42},
+                                          {42, 42, 42, 42, 42, 42}}}})
+                  .get_vector()),
+              read_vector<float>(result));
+}
+
 // Trivial case with no reduced axes.
 NGRAPH_TEST(${BACKEND_NAME}, product_trivial)
 {
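As a hand-worked check connecting this test to the fill kernel, assume row-major strides: shape_r{2, 2, 6, 6} gives output strides {72, 36, 6, 1} and shape_a{2, 2, 4, 4} gives input strides {32, 16, 4, 1}. For the output element at (0, 0, 0, 4), tid = 4: the first three dimensions all map to coordinate 0, and in the last dimension input_dil = 4 - padding_below = 4, input = 4, which fails input < input_shape = 4, so in_bounds is false and the element takes *pad = 42 — matching the expected row {0, 1, 0, 2, 42, 42}. For (0, 0, 0, 3), tid = 3 maps to input_pixel = 3 and copies in[3] = 2 from the unpadded input.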
