|
4 | 4 |
|
5 | 5 | /* Function qualifier macros: ARCH_HOSTDEV and ARCH_DEV expand to __host__ __device__ and __device__ when USE_GPU is defined together with __CUDACC__ or the HIP platform macro, and to nothing otherwise. NOTE(review): __HIP_PLATFORM_HCC___ is spelled with three trailing underscores - confirm the build defines exactly this name. */ |
6 | 6 | #if (defined(USE_GPU) && (defined(__CUDACC__) || defined(__HIP_PLATFORM_HCC___))) |
7 | | - #define ARCH_HOSTDEV __host__ __device__ |
| 7 | +#define ARCH_HOSTDEV __host__ __device__ |
8 | 8 | #define ARCH_DEV __device__ |
9 | 9 | #else |
10 | 10 | #define ARCH_HOSTDEV |
11 | | - #define ARCH_DEV |
| 11 | +#define ARCH_DEV |
12 | 12 | #endif |
13 | 13 |
|
14 | 14 | /* Shared type and enum definitions, declared before the architecture header is included */ |
15 | | -namespace arch{ |
16 | | -/* Type definition used in the headers */ |
| 15 | +namespace arch { |
| 16 | + /* Unsigned 32-bit integer alias used in the headers */ |
17 | 17 | typedef uint32_t uint; |
18 | | -/* Enums for different reduction types */ |
| 18 | + /* Reduction operation tags; the enum is unscoped, so its values are reachable as arch::max, arch::min, arch::sum, arch::prod and arch::null */ |
19 | 19 | enum reduce_op { max, min, sum, prod, null }; |
20 | | -} |
| 20 | +} // namespace arch |
21 | 21 |
|
22 | 22 | /* Select the compiled architecture */ |
23 | 23 | #if defined(USE_GPU) && defined(__CUDACC__) |
24 | | - #include "arch_device_cuda.h" |
| 24 | +#include "arch_device_cuda.h" |
25 | 25 | #elif defined(USE_GPU) && defined(__HIP_PLATFORM_HCC___) |
26 | | - #include "arch_device_hip.h" |
| 26 | +#include "arch_device_hip.h" |
27 | 27 | #else |
28 | | - #include "arch_device_host.h" |
| 28 | +#include "arch_device_host.h" |
29 | 29 | #endif |
30 | 30 |
|
31 | 31 | /* Inner loop body macro: ARCH_GET_MACRO yields its sixth argument, which lets ARCH_INNER_BODY dispatch on argument count and forward its arguments to ARCH_INNER_BODY2, ARCH_INNER_BODY3 or ARCH_INNER_BODY4 when called with 3, 4 or 5 arguments respectively. The ARCH_INNER_BODYn macros are not defined here; presumably they come from the architecture header included above - confirm. */ |
32 | 32 | #define ARCH_GET_MACRO(_1,_2,_3,_4,_5,NAME,...) NAME |
33 | 33 | #define ARCH_INNER_BODY(...) ARCH_GET_MACRO(__VA_ARGS__, ARCH_INNER_BODY4, ARCH_INNER_BODY3, ARCH_INNER_BODY2)(__VA_ARGS__) |
34 | 34 |
|
35 | 35 | /* Namespace for the common loop interface functions */ |
36 | | -namespace arch{ |
| 36 | +namespace arch { |
37 | 37 |
|
38 | | -/* Parallel reduce interface function - specialization for 1 reduction variable */ |
39 | | - template <reduce_op Op, uint NDim, typename Lambda, typename T> |
40 | | - inline static void parallel_reduce(const uint (&limits)[NDim], Lambda loop_body, T &sum) { |
| 38 | + /* Parallel reduce interface function - overload for a single reduction variable: forwards limits and loop_body to parallel_reduce_driver with a reduction count of 1 and the address of sum */ |
| 39 | + template < |
| 40 | + reduce_op Op, |
| 41 | + uint NDim, |
| 42 | + typename Lambda, |
| 43 | + typename T> |
| 44 | + inline static void parallel_reduce( |
| 45 | + const uint (&limits)[NDim], |
| 46 | + Lambda loop_body, |
| 47 | + T& sum |
| 48 | + ) { |
41 | 49 | constexpr uint NReductions = 1; |
42 | 50 | arch::parallel_reduce_driver<Op, NReductions, NDim>(limits, loop_body, &sum, NReductions); |
43 | 51 | } |
44 | 52 |
|
45 | | -/* Parallel reduce interface function - specialization for a reduction variable array */ |
46 | | - template <reduce_op Op, uint NDim, uint NReductions, typename Lambda, typename T> |
47 | | - inline static void parallel_reduce(const uint (&limits)[NDim], Lambda loop_body, T (&sum)[NReductions]) { |
| 53 | + /* Parallel reduce interface function - overload for a fixed-size array of reduction variables: NReductions is the array extent and is passed to parallel_reduce_driver both as a template argument and as the runtime count, together with a pointer to the first element of sum */ |
| 54 | + template < |
| 55 | + reduce_op Op, |
| 56 | + uint NDim, |
| 57 | + uint NReductions, |
| 58 | + typename Lambda, |
| 59 | + typename T> |
| 60 | + inline static void parallel_reduce( |
| 61 | + const uint (&limits)[NDim], |
| 62 | + Lambda loop_body, |
| 63 | + T (&sum)[NReductions] |
| 64 | + ) { |
48 | 65 | arch::parallel_reduce_driver<Op, NReductions, NDim>(limits, loop_body, &sum[0], NReductions); |
49 | 66 | } |
50 | 67 |
|
51 | | -/* Parallel reduce interface function - specialization for a reduction variable vector */ |
52 | | - template <reduce_op Op, uint NDim, typename Lambda, typename T> |
53 | | - inline static void parallel_reduce(const uint (&limits)[NDim], Lambda loop_body, std::vector<T> &sum) { |
| 68 | + /* Parallel reduce interface function - overload for a std::vector of reduction variables: passes 0 as the compile-time reduction count (presumably marking a runtime-sized reduction for the driver - confirm), sum.size() as the runtime count and sum.data() as the reduction target */ |
| 69 | + template < |
| 70 | + reduce_op Op, |
| 71 | + uint NDim, |
| 72 | + typename Lambda, |
| 73 | + typename T> |
| 74 | + inline static void parallel_reduce( |
| 75 | + const uint (&limits)[NDim], |
| 76 | + Lambda loop_body, |
| 77 | + std::vector<T>& sum |
| 78 | + ) { |
54 | 79 | arch::parallel_reduce_driver<Op, 0, NDim>(limits, loop_body, sum.data(), sum.size()); |
55 | 80 | } |
56 | 81 |
|
57 | | -} |
| 82 | +} // namespace arch |
58 | 83 | #endif // !ARCH_DEVICE_API_H |
0 commit comments