@@ -1323,16 +1323,19 @@ size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::st
     return primitive_index;
 }
 
-size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& dtypes,
-                                               NVShape input_shape,
-                                               NVShape reduce_axis,
-                                               const char* op,
-                                               const char* kernel)
+size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<std::string>& dtypes,
+                                                     NVShape input_shape,
+                                                     NVShape reduce_axis,
+                                                     const char* op,
+                                                     const char* kernel)
 {
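+    // reduce over the axes in reduce_axis, leaving an output of rank out_rank = rank - reduce_rank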
+    size_t rank = input_shape.size();
+    size_t reduce_rank = reduce_axis.size();
+    size_t out_rank = rank - reduce_rank;
     // assumes NC{d1,...,dn} format
-    std::string kernel_name = "reduce_" + join(dtypes, "_") + "_ri_" +
-                              std::to_string(input_shape.size()) + "_rr_" +
-                              std::to_string(reduce_axis.size());
+    std::string kernel_name = "reduce_nd_" + join(dtypes, "_");
+    kernel_name +=
+        "_ri_" + std::to_string(input_shape.size()) + "_rr_" + std::to_string(reduce_axis.size());
     std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
 
     std::stringstream ss;
@@ -1345,9 +1348,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
         return primitive_index;
     }
 
-    size_t rank = input_shape.size();
-    size_t reduce_rank = reduce_axis.size();
-    size_t out_rank = rank - reduce_rank;
     NVShape reduce_flag(rank, 0);
     for (auto a : reduce_axis)
     {
@@ -1372,7 +1372,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
         }
     }
     NVShape output_strides = row_major_strides(output_shape);
-
     uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
     // TODO: currently we set it to 64, will add tuning method later
     uint32_t block_size_x = 64;
@@ -1398,7 +1397,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
             CudaKernelBuilder::get_device_helper(
                 writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
         }
-        runtime::gpu::CudaKernelBuilder::get_reduce_op(
+        runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
             writer, kernel_name, args, dtypes, op, out_rank, reduce_rank);
         compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
     }
@@ -1422,12 +1421,252 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
                                           0));
             debug_sync();
         }});
+    primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
+    m_primitive_emitter->cache(hash, primitive_index);
+    return primitive_index;
+}
+
+size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar(const std::vector<std::string>& dtypes,
+                                                         const size_t data_bytes,
+                                                         NVShape input_shape,
+                                                         const char* op,
+                                                         const char* kernel)
+{
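+    // single kernel launch: one thread block reduces the whole input through shared memory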
+    // assumes NC{d1,...,dn} format
+    std::string kernel_name = "reduce_scalar_" + join(dtypes, "_");
+    std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
+
+    std::stringstream ss;
+    ss << kernel_name << "_s_" << join(input_shape, "_");
+    auto hash = ss.str();
+    // check if the requested kernel is already an inserted primitive
+    size_t primitive_index = m_primitive_emitter->lookup(hash);
+    if (primitive_index != std::numeric_limits<size_t>::max())
+    {
+        return primitive_index;
+    }
+
+    uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
+    uint32_t n = nthreads;
+    uint32_t block_size_x = 1;
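+    // grow block_size_x to the largest power of two not exceeding nthreads; capped at 512 below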
+    while (n > 1)
+    {
+        block_size_x <<= 1;
+        n >>= 1;
+    }
+    block_size_x = fmin(512, block_size_x);
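+    // one shared-memory slot per thread for the in-block reduction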
+    uint32_t shared_data_bytes = block_size_x * static_cast<uint32_t>(data_bytes);
+    kernel_name += "_b_" + std::to_string(block_size_x);
+    auto args = m_primitive_emitter->add_kernel_args();
+    args.add_placeholder(dtypes[0], "in")
+        .add_placeholder(dtypes[1], "out")
+        .add("nthreads", nthreads);
+
+    // if the kernel has not been compiled, build it
+    auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
+    if (compiled_kernel == nullptr)
+    {
+        codegen::CodeWriter writer;
+        CudaKernelBuilder::add_pod_typedefs(writer);
+        writer << include_helpers();
+        if (kernel)
+        {
+            CudaKernelBuilder::get_device_helper(
+                writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
+        }
+        runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
+            writer, kernel_name, args, dtypes, op, block_size_x);
+        compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
+    }
+
+    std::unique_ptr<gpu::primitive> reduce(
+        new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+            void** args_list = args.resolve_placeholder(0, &inputs[0])
+                                   .resolve_placeholder(1, &outputs[0])
+                                   .get_argument_list();
 
+            CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
+                                          1,
+                                          1,
+                                          1,
+                                          block_size_x,
+                                          1,
+                                          1,
+                                          shared_data_bytes,
+                                          NULL,
+                                          args_list,
+                                          0));
+            debug_sync();
+        }});
     primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
     m_primitive_emitter->cache(hash, primitive_index);
     return primitive_index;
 }
 
+size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar_acc(const std::vector<std::string>& dtypes,
+                                                             NVShape input_shape,
+                                                             NVShape output_shape,
+                                                             uint32_t block_size_x,
+                                                             const char* op,
+                                                             const char* kernel)
+{
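+    // first pass of the two-pass scalar reduction: shrink the input to
+    // shape_size(output_shape) partial values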
+    // assumes NC{d1,...,dn} format
+    std::string kernel_name = "reduce_acc_" + join(dtypes, "_");
+    std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
+
+    std::stringstream ss;
+    ss << kernel_name << "_s_" << join(input_shape, "_");
+    auto hash = ss.str();
+    // check if the requested kernel is already an inserted primitive
+    size_t primitive_index = m_primitive_emitter->lookup(hash);
+    if (primitive_index != std::numeric_limits<size_t>::max())
+    {
+        return primitive_index;
+    }
+
+    uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
+    auto args = m_primitive_emitter->add_kernel_args();
+    args.add_placeholder(dtypes[0], "in")
+        .add_placeholder(dtypes[1], "out")
+        .add("nthreads", nthreads);
+
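+    // one partial result per thread: the grid provides exactly shape_size(output_shape) threads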
+    uint32_t aligned_grid_size_x = static_cast<uint32_t>(shape_size(output_shape)) / block_size_x;
+
+    auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
+    // if the kernel has not been compiled, build it
+    if (compiled_kernel == nullptr)
+    {
+        codegen::CodeWriter writer;
+        CudaKernelBuilder::add_pod_typedefs(writer);
+        writer << include_helpers();
+        if (kernel)
+        {
+            CudaKernelBuilder::get_device_helper(
+                writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
+        }
+        runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
+            writer, kernel_name, args, dtypes, op);
+        compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
+    }
+
+    std::unique_ptr<gpu::primitive> reduce_acc(
+        new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+            void** args_list = args.resolve_placeholder(0, &inputs[0])
+                                   .resolve_placeholder(1, &outputs[0])
+                                   .get_argument_list();
+            CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
+                                          aligned_grid_size_x,
+                                          1,
+                                          1,
+                                          block_size_x,
+                                          1,
+                                          1,
+                                          0,
+                                          NULL,
+                                          args_list,
+                                          0));
+        }});
+    primitive_index = this->m_primitive_emitter->insert(std::move(reduce_acc));
+
+    m_primitive_emitter->cache(hash, primitive_index);
+    return primitive_index;
+}
+
+size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& dtypes,
+                                               const size_t data_bytes,
+                                               NVShape input_shape,
+                                               NVShape reduce_axis,
+                                               const char* op,
+                                               const char* kernel)
+{
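+    // dispatch: a non-scalar output goes to reduce_to_nd; a scalar output takes one or two
+    // reduction passes depending on the input size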
+    size_t rank = input_shape.size();
+    size_t reduce_rank = reduce_axis.size();
+    size_t out_rank = rank - reduce_rank;
+    // assumes NC{d1,...,dn} format
+    std::string kernel_name = "reduce_" + join(dtypes, "_");
+    if (out_rank != 0)
+    {
+        kernel_name += "_ri_" + std::to_string(input_shape.size()) + "_rr_" +
+                       std::to_string(reduce_axis.size());
+    }
+    std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
+
+    std::stringstream ss;
+    ss << kernel_name << "_s_" << join(input_shape, "_") << "_axis_" << join(reduce_axis, "_");
+    auto hash = ss.str();
+    // check if the requested kernel is already an inserted primitive
+    size_t primitive_index = m_primitive_emitter->lookup(hash);
+    if (primitive_index != std::numeric_limits<size_t>::max())
+    {
+        return primitive_index;
+    }
+
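+    // size the accumulation pass to the device: one 256-thread block per SM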
+    int num_SMs;
+    CUDA_RT_SAFE_CALL(cudaDeviceGetAttribute(&num_SMs, cudaDevAttrMultiProcessorCount, 0));
+    uint32_t block_size_x_acc = 256;
+    uint32_t nthreads_acc = num_SMs * block_size_x_acc;
+    // call reduce_to_nd
+    if (out_rank != 0)
+    {
+        size_t reduce_idx = build_reduce_to_nd(dtypes, input_shape, reduce_axis, op, kernel);
+
+        std::unique_ptr<gpu::primitive> reduce(
+            new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+                gpu::invoke_primitive(m_ctx,
+                                      reduce_idx,
+                                      std::vector<void*>{inputs[0]}.data(),
+                                      std::vector<void*>{outputs[0]}.data());
+            }});
+        primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
+    }
+    else
+    {
+        uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
+        // if the data size is large, call reduce_to_scalar_acc first and then reduce_to_scalar;
+        // otherwise, call reduce_to_scalar directly.
+        const uint32_t unroll_size = 8;
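+        // two passes only pay off once each accumulator thread covers more than unroll_size + 1
+        // elements; e.g., on a hypothetical 80-SM device, nthreads_acc = 80 * 256 = 20480 and
+        // the threshold is 20480 * 9 = 184320 elements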
+        if (nthreads > nthreads_acc * (unroll_size + 1))
+        {
+            NVShape acc_output_shape{nthreads_acc};
+            size_t reduce_scalar_acc_idx = build_reduce_to_scalar_acc(
+                dtypes, input_shape, acc_output_shape, block_size_x_acc, op, kernel);
+            size_t reduce_scalar_idx =
+                build_reduce_to_scalar(dtypes, data_bytes, acc_output_shape, op, kernel);
+            // get an allocator for transient per kernel gpu memory
+            GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
+            size_t idx_workspace = allocator.reserve_workspace(nthreads_acc * data_bytes);
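+            // the workspace holds the nthreads_acc partial results passed between the two launches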
+            std::unique_ptr<gpu::primitive> reduce_scalar_acc(
+                new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+                    void* buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
+                    gpu::invoke_primitive(m_ctx,
+                                          reduce_scalar_acc_idx,
+                                          std::vector<void*>{inputs[0]}.data(),
+                                          std::vector<void*>{buffer}.data());
+                    gpu::invoke_primitive(m_ctx,
+                                          reduce_scalar_idx,
+                                          std::vector<void*>{buffer}.data(),
+                                          std::vector<void*>{outputs[0]}.data());
+                }});
+            primitive_index = this->m_primitive_emitter->insert(std::move(reduce_scalar_acc));
+        }
+        else
+        {
+            size_t reduce_scalar_idx =
+                build_reduce_to_scalar(dtypes, data_bytes, input_shape, op, kernel);
+            std::unique_ptr<gpu::primitive> reduce_scalar(
+                new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+                    gpu::invoke_primitive(m_ctx,
+                                          reduce_scalar_idx,
+                                          std::vector<void*>{inputs[0]}.data(),
+                                          std::vector<void*>{outputs[0]}.data());
+                }});
+            primitive_index = this->m_primitive_emitter->insert(std::move(reduce_scalar));
+        }
+    }
+    m_primitive_emitter->cache(hash, primitive_index);
+    return primitive_index;
+}
+
 size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Softmax* node)
 {
     auto& args = node->get_inputs();