This repository was archived by the owner on Jan 3, 2023. It is now read-only.

Commit 5f40d95

fengleitiandiyessi authored and committed
nvgpu reduce to scalar optimization (#1491)
* add cuda reduce
* clang format
* fix bugs
* fix bug
* add 1d reduce
* clang format
* fix bugs
* unroll loop
* remove debug info
* revert tests
* unroll 1D reduce op
* add comments
* using cudnn for nd to scalar reduction
* remove cuda 1d reduction since cudnn version is faster
* remove 1D kernel
* fix bugs
* 1d multi block size
* remove debug
* change kernel name
* add reduce to scalar optimization, add test
* fix bugs and tune parameters
* clang format
* update comments
* update comments
* update comments
* clang format
* update comments
* remove wrong comments, apply clang format
* resolve Bob's comment
* clang format
* pass shared mem size from cuLaunchKernel, set unroll loop size through host code
* remove unused code.clang format
* change reduce to thread with shfl for each warp first
* add seed
* unroll size
1 parent 8fdefa5 commit 5f40d95
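The kernels this commit speeds up are generated at runtime by CudaKernelBuilder, so the diff below only contains the host-side emitter changes. As a rough illustration of the device-side technique the message names ("change reduce to thread with shfl for each warp first", shared-memory size passed at launch), here is a minimal, self-contained CUDA sketch; the kernel name, the hard-coded float sum, and the atomicAdd finish are illustrative assumptions rather than the generated code.

// Sketch only: grid-stride accumulation per thread, warp-level reduction with
// __shfl_down_sync, then one shared-memory pass across warps.
__device__ __forceinline__ float warp_reduce_sum(float val)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}

__global__ void reduce_to_scalar_sketch(const float* in, float* out, unsigned int n)
{
    extern __shared__ float warp_sums[]; // one slot per warp; size supplied by the host at launch

    float sum = 0.f;
    for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x)
    {
        sum += in[i]; // each thread folds several elements before the tree reduction
    }

    sum = warp_reduce_sum(sum); // register shuffles within each warp

    unsigned int lane = threadIdx.x % warpSize;
    unsigned int warp_id = threadIdx.x / warpSize;
    if (lane == 0)
    {
        warp_sums[warp_id] = sum; // one partial per warp
    }
    __syncthreads();

    if (warp_id == 0) // the first warp folds the per-warp partials
    {
        unsigned int num_warps = (blockDim.x + warpSize - 1) / warpSize;
        sum = (lane < num_warps) ? warp_sums[lane] : 0.f;
        sum = warp_reduce_sum(sum);
        if (lane == 0)
        {
            atomicAdd(out, sum); // assumes *out was zero-initialized; a single-block launch could just store
        }
    }
}

// Launch, passing the dynamic shared-memory size explicitly, as the emitter below does
// through the shared_data_bytes argument of cuLaunchKernel:
//   reduce_to_scalar_sketch<<<blocks, block_size, ((block_size + 31) / 32) * sizeof(float)>>>(d_in, d_out, n);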

6 files changed (+488 -51 lines)


src/ngraph/runtime/gpu/cuda_emitter.cpp

Lines changed: 252 additions & 13 deletions
@@ -1323,16 +1323,19 @@ size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::st
     return primitive_index;
 }
 
-size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& dtypes,
-                                               NVShape input_shape,
-                                               NVShape reduce_axis,
-                                               const char* op,
-                                               const char* kernel)
+size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<std::string>& dtypes,
+                                                     NVShape input_shape,
+                                                     NVShape reduce_axis,
+                                                     const char* op,
+                                                     const char* kernel)
 {
+    size_t rank = input_shape.size();
+    size_t reduce_rank = reduce_axis.size();
+    size_t out_rank = rank - reduce_rank;
     // assumes NC{d1,...,dn} format
-    std::string kernel_name = "reduce_" + join(dtypes, "_") + "_ri_" +
-                              std::to_string(input_shape.size()) + "_rr_" +
-                              std::to_string(reduce_axis.size());
+    std::string kernel_name = "reduce_nd_" + join(dtypes, "_");
+    kernel_name +=
+        "_ri_" + std::to_string(input_shape.size()) + "_rr_" + std::to_string(reduce_axis.size());
     std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
 
     std::stringstream ss;
@@ -1345,9 +1348,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
         return primitive_index;
     }
 
-    size_t rank = input_shape.size();
-    size_t reduce_rank = reduce_axis.size();
-    size_t out_rank = rank - reduce_rank;
     NVShape reduce_flag(rank, 0);
     for (auto a : reduce_axis)
     {
@@ -1372,7 +1372,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
         }
     }
     NVShape output_strides = row_major_strides(output_shape);
-
     uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
     // TODO: currently we set it to 64, will add tuning method later
     uint32_t block_size_x = 64;
@@ -1398,7 +1397,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
             CudaKernelBuilder::get_device_helper(
                 writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
         }
-        runtime::gpu::CudaKernelBuilder::get_reduce_op(
+        runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
            writer, kernel_name, args, dtypes, op, out_rank, reduce_rank);
         compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
     }
@@ -1422,12 +1421,252 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
                                           0));
             debug_sync();
         }});
+    primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
+    m_primitive_emitter->cache(hash, primitive_index);
+    return primitive_index;
+}
+
+size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar(const std::vector<std::string>& dtypes,
+                                                         const size_t data_bytes,
+                                                         NVShape input_shape,
+                                                         const char* op,
+                                                         const char* kernel)
+{
+    // assumes NC{d1,...,dn} format
+    std::string kernel_name = "reduce_scalar_" + join(dtypes, "_");
+    std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
+
+    std::stringstream ss;
+    ss << kernel_name << "_s_" << join(input_shape, "_");
+    auto hash = ss.str();
+    // check if the requested kernel is already an inserted primitive
+    size_t primitive_index = m_primitive_emitter->lookup(hash);
+    if (primitive_index != std::numeric_limits<size_t>::max())
+    {
+        return primitive_index;
+    }
+
+    uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
+    uint32_t n = nthreads;
+    uint32_t block_size_x = 1;
+    while (n > 1)
+    {
+        block_size_x <<= 1;
+        n >>= 1;
+    }
+    block_size_x = fmin(512, block_size_x);
+    uint32_t shared_data_bytes = block_size_x * static_cast<uint32_t>(data_bytes);
+    kernel_name += "_b_" + std::to_string(block_size_x);
+    auto args = m_primitive_emitter->add_kernel_args();
+    args.add_placeholder(dtypes[0], "in")
+        .add_placeholder(dtypes[1], "out")
+        .add("nthreads", nthreads);
+
+    // if the kernel has not been compiled, build it
+    auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
+    if (compiled_kernel == nullptr)
+    {
+        codegen::CodeWriter writer;
+        CudaKernelBuilder::add_pod_typedefs(writer);
+        writer << include_helpers();
+        if (kernel)
+        {
+            CudaKernelBuilder::get_device_helper(
+                writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
+        }
+        runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
+            writer, kernel_name, args, dtypes, op, block_size_x);
+        compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
+    }
+
+    std::unique_ptr<gpu::primitive> reduce(
+        new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+            void** args_list = args.resolve_placeholder(0, &inputs[0])
+                                   .resolve_placeholder(1, &outputs[0])
+                                   .get_argument_list();
 
+            CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
+                                          1,
+                                          1,
+                                          1,
+                                          block_size_x,
+                                          1,
+                                          1,
+                                          shared_data_bytes,
+                                          NULL,
+                                          args_list,
+                                          0));
+            debug_sync();
+        }});
     primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
     m_primitive_emitter->cache(hash, primitive_index);
     return primitive_index;
 }
 
+size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar_acc(const std::vector<std::string>& dtypes,
+                                                             NVShape input_shape,
+                                                             NVShape output_shape,
+                                                             uint32_t block_size_x,
+                                                             const char* op,
+                                                             const char* kernel)
+{
+    // assumes NC{d1,...,dn} format
+    std::string kernel_name = "reduce_acc_" + join(dtypes, "_");
+    std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
+
+    std::stringstream ss;
+    ss << kernel_name << "_s_" << join(input_shape, "_");
+    auto hash = ss.str();
+    // check if the requested kernel is already an inserted primitive
+    size_t primitive_index = m_primitive_emitter->lookup(hash);
+    if (primitive_index != std::numeric_limits<size_t>::max())
+    {
+        return primitive_index;
+    }
+
+    uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
+    auto args = m_primitive_emitter->add_kernel_args();
+    args.add_placeholder(dtypes[0], "in")
+        .add_placeholder(dtypes[1], "out")
+        .add("nthreads", nthreads);
+
+    uint32_t aligned_grid_size_x = static_cast<uint32_t>(shape_size(output_shape)) / block_size_x;
+
+    auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
+    // if the kernel has not been compiled, build it
+    if (compiled_kernel == nullptr)
+    {
+        codegen::CodeWriter writer;
+        CudaKernelBuilder::add_pod_typedefs(writer);
+        writer << include_helpers();
+        if (kernel)
+        {
+            CudaKernelBuilder::get_device_helper(
+                writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
+        }
+        runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
+            writer, kernel_name, args, dtypes, op);
+        compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
+    }
+
+    std::unique_ptr<gpu::primitive> reduce_acc(
+        new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+            void** args_list = args.resolve_placeholder(0, &inputs[0])
+                                   .resolve_placeholder(1, &outputs[0])
+                                   .get_argument_list();
+            CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
+                                          aligned_grid_size_x,
+                                          1,
+                                          1,
+                                          block_size_x,
+                                          1,
+                                          1,
+                                          0,
+                                          NULL,
+                                          args_list,
+                                          0));
+        }});
+    primitive_index = this->m_primitive_emitter->insert(std::move(reduce_acc));
+
+    m_primitive_emitter->cache(hash, primitive_index);
+    return primitive_index;
+}
+
+size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& dtypes,
+                                               const size_t data_bytes,
+                                               NVShape input_shape,
+                                               NVShape reduce_axis,
+                                               const char* op,
+                                               const char* kernel)
+{
+    size_t rank = input_shape.size();
+    size_t reduce_rank = reduce_axis.size();
+    size_t out_rank = rank - reduce_rank;
+    // assumes NC{d1,...,dn} format
+    std::string kernel_name = "reduce_" + join(dtypes, "_");
+    if (out_rank != 0)
+    {
+        kernel_name += "_ri_" + std::to_string(input_shape.size()) + "_rr_" +
+                       std::to_string(reduce_axis.size());
+    }
+    std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
+
+    std::stringstream ss;
+    ss << kernel_name << "_s_" << join(input_shape, "_") << "_axis_" << join(reduce_axis, "_");
+    auto hash = ss.str();
+    // check if the requested kernel is already an inserted primitive
+    size_t primitive_index = m_primitive_emitter->lookup(hash);
+    if (primitive_index != std::numeric_limits<size_t>::max())
+    {
+        return primitive_index;
+    }
+
+    int num_SMs;
+    CUDA_RT_SAFE_CALL(cudaDeviceGetAttribute(&num_SMs, cudaDevAttrMultiProcessorCount, 0));
+    uint32_t block_size_x_acc = 256;
+    uint32_t nthreads_acc = num_SMs * block_size_x_acc;
+    //call reduce_to_nd
+    if (out_rank != 0)
+    {
+        size_t reduce_idx = build_reduce_to_nd(dtypes, input_shape, reduce_axis, op, kernel);
+
+        std::unique_ptr<gpu::primitive> reduce(
+            new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+                gpu::invoke_primitive(m_ctx,
+                                      reduce_idx,
+                                      std::vector<void*>{inputs[0]}.data(),
+                                      std::vector<void*>{outputs[0]}.data());
+            }});
+        primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
+    }
+    else
+    {
+        uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
+        //if the data size is large, call reduce_to_scalar_acc first and then reduce_to_scalar.
+        //other wise, call reduce to scalar directly.
+        const uint32_t unroll_size = 8;
+        if (nthreads > nthreads_acc * (unroll_size + 1))
+        {
+            NVShape acc_output_shape{nthreads_acc};
+            size_t reduce_scalar_acc_idx = build_reduce_to_scalar_acc(
+                dtypes, input_shape, acc_output_shape, block_size_x_acc, op, kernel);
+            size_t reduce_scalar_idx =
+                build_reduce_to_scalar(dtypes, data_bytes, acc_output_shape, op, kernel);
+            // get an allocator for transient per kernel gpu memory
+            GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
+            size_t idx_workspace = allocator.reserve_workspace(nthreads_acc * data_bytes);
+            std::unique_ptr<gpu::primitive> reduce_scalar_acc(
+                new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+                    void* buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
+                    gpu::invoke_primitive(m_ctx,
+                                          reduce_scalar_acc_idx,
+                                          std::vector<void*>{inputs[0]}.data(),
+                                          std::vector<void*>{buffer}.data());
+                    gpu::invoke_primitive(m_ctx,
+                                          reduce_scalar_idx,
+                                          std::vector<void*>{buffer}.data(),
+                                          std::vector<void*>{outputs[0]}.data());
+                }});
+            primitive_index = this->m_primitive_emitter->insert(std::move(reduce_scalar_acc));
+        }
+        else
+        {
+            size_t reduce_scalar_idx =
+                build_reduce_to_scalar(dtypes, data_bytes, input_shape, op, kernel);
+            std::unique_ptr<gpu::primitive> reduce_scalar(
+                new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+                    gpu::invoke_primitive(m_ctx,
+                                          reduce_scalar_idx,
+                                          std::vector<void*>{inputs[0]}.data(),
+                                          std::vector<void*>{outputs[0]}.data());
+                }});
+            primitive_index = this->m_primitive_emitter->insert(std::move(reduce_scalar));
+        }
+    }
+    m_primitive_emitter->cache(hash, primitive_index);
+    return primitive_index;
+}
+
 size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Softmax* node)
 {
     auto& args = node->get_inputs();
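
The two host-side heuristics added in this file can be read on their own: build_reduce_to_scalar picks the largest power-of-two block size and caps it at 512, and build_reduce switches to the two-stage acc-then-scalar plan once the element count exceeds nthreads_acc * (unroll_size + 1). The standalone sketch below replays that arithmetic with an assumed 20-SM device and a 2^20-element input in place of the real cudaDeviceGetAttribute query and graph-derived shapes.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Largest power of two <= nthreads, capped at 512: mirrors the block-size loop in
// build_reduce_to_scalar above.
static uint32_t pick_block_size(uint32_t nthreads)
{
    uint32_t n = nthreads;
    uint32_t block_size_x = 1;
    while (n > 1)
    {
        block_size_x <<= 1;
        n >>= 1;
    }
    return std::min(512u, block_size_x);
}

int main()
{
    const int num_SMs = 20;                                   // assumed; the emitter queries the device
    const uint32_t block_size_x_acc = 256;                    // as in build_reduce
    const uint32_t nthreads_acc = num_SMs * block_size_x_acc; // 5120 accumulation bins
    const uint32_t unroll_size = 8;
    const uint32_t nthreads = 1u << 20;                       // assumed input size

    if (nthreads > nthreads_acc * (unroll_size + 1))          // 1048576 > 46080 -> two stages
    {
        // reduce_to_scalar_acc writes nthreads_acc partials to a workspace,
        // then reduce_to_scalar finishes them with a single block.
        std::printf("two-stage: %u bins, final block of %u threads\n",
                    nthreads_acc,
                    pick_block_size(nthreads_acc));
    }
    else
    {
        // Small inputs go straight to one reduce_to_scalar launch.
        std::printf("single-stage: block of %u threads\n", pick_block_size(nthreads));
    }
    return 0;
}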

src/ngraph/runtime/gpu/cuda_emitter.hpp

Lines changed: 26 additions & 3 deletions
@@ -117,11 +117,13 @@ namespace ngraph
 
             template <typename T>
             size_t build_reduce(const std::vector<std::string>& dtypes,
-                                NVShape tensor_shape,
+                                const size_t data_bytes,
+                                NVShape input_shape,
                                 NVShape reduce_axis)
             {
                 return build_reduce(dtypes,
-                                    tensor_shape,
+                                    data_bytes,
+                                    input_shape,
                                     reduce_axis,
                                     CudaOpMap<T>::op,
                                     CudaOpMap<T>::math_kernel);
@@ -194,10 +196,31 @@
                                const char* reduce_op,
                                bool save_elementwise);
             size_t build_reduce(const std::vector<std::string>& dtypes,
-                                NVShape tensor_shape,
+                                const size_t data_bytes,
+                                NVShape input_shape,
                                 NVShape reduce_axis,
                                 const char* op,
                                 const char* kernel);
+            size_t build_reduce_to_nd(const std::vector<std::string>& dtypes,
+                                      NVShape input_shape,
+                                      NVShape reduce_axis,
+                                      const char* op,
+                                      const char* kernel);
+            size_t build_reduce_to_scalar(const std::vector<std::string>& dtypes,
+                                          const size_t data_bytes,
+                                          NVShape input_shape,
+                                          const char* op,
+                                          const char* kernel);
+
+            //This is the preprocess for reduce to scalar if the data size is large than a number.
+            //The number can be tuned based on hardware.
+            //This cuda kernel will accumulate reduction to a certain number of bins depends on hardware.
+            size_t build_reduce_to_scalar_acc(const std::vector<std::string>& dtypes,
+                                              NVShape input_shape,
+                                              NVShape output_shape,
+                                              uint32_t block_size_x,
+                                              const char* op,
+                                              const char* kernel);
             GPUPrimitiveEmitter* m_primitive_emitter;
             GPURuntimeContext* m_ctx;
         };
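
The comment on build_reduce_to_scalar_acc above describes an accumulation pass that folds the input into a fixed number of bins before the final single-block reduction. A minimal CUDA sketch of that access pattern, with an assumed float sum standing in for the generated per-op device helper:

// Sketch only: one partial per thread, written to out[tid]; the grid is sized so that
// gridDim.x * blockDim.x equals the number of bins (nthreads_acc in the emitter).
__global__ void reduce_to_scalar_acc_sketch(const float* in, float* out, unsigned int nthreads)
{
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int step = blockDim.x * gridDim.x;

    float acc = 0.f; // identity of the assumed sum reduction
    for (unsigned int i = tid; i < nthreads; i += step)
    {
        acc += in[i];
    }
    out[tid] = acc; // reduce_to_scalar then collapses these partials to the scalar result
}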
