55// source code
66
77#include " CUDAStream.h"
8+ #include < nvml.h>
89
910#if !defined(UNROLL_FACTOR)
1011#define UNROLL_FACTOR 4
1112#endif
1213
13- [[noreturn]] inline void error (char const * file, int line, char const * expr, cudaError_t e) {
14- std::fprintf (stderr, " Error at %s:%d: %s (%d)\n %s\n " , file, line, cudaGetErrorString (e), e, expr);
14+ [[noreturn]] inline void cuda_error (char const * file, int line, char const * expr, cudaError_t e) {
15+ std::fprintf (stderr, " CUDA Error at %s:%d: %s (%d)\n %s\n " , file, line, cudaGetErrorString (e), e, expr);
16+ exit (e);
17+ }
18+
19+ [[noreturn]] inline void nvml_error (char const * file, int line, char const * expr, nvmlReturn_t e) {
20+ std::fprintf (stderr, " NVML Error at %s:%d: %s (%d)\n %s\n " , file, line, nvmlErrorString (e), e, expr);
1521 exit (e);
1622}
1723
1824// The do while is there to make sure you remember to put a semi-colon after calling CU
19- #define CU (EXPR ) do { auto __e = (EXPR); if (__e != cudaSuccess) error (__FILE__, __LINE__, #EXPR, __e); } while (false )
25+ #define CU (EXPR ) do { auto __e = (EXPR); if (__e != cudaSuccess) cuda_error (__FILE__, __LINE__, #EXPR, __e); } while (false )
26+ #define NVML (EXPR ) do { auto __e = (EXPR); if (__e != NVML_SUCCESS) nvml_error (__FILE__, __LINE__, #EXPR, __e); } while (false )
2027
2128// It is best practice to include __device__ and constexpr even though in BabelStream it only needs to be __host__ const
2229__host__ __device__ constexpr size_t ceil_div (size_t a, size_t b) { return (a + b - 1 ) / b; }
@@ -83,20 +90,39 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
8390 CU (cudaStreamCreateWithFlags (&stream, cudaStreamNonBlocking));
8491
8592 // Print out device information
86- std::cout << " Using CUDA device " << getDeviceName (device_index) << std::endl;
87- std::cout << " Driver: " << getDeviceDriver (device_index) << std::endl;
88- #if defined(MANAGED)
89- std::cout << " Memory: MANAGED" << std::endl;
90- #elif defined(PAGEFAULT)
91- std::cout << " Memory: PAGEFAULT" << std::endl;
92- #else
93- std::cout << " Memory: DEFAULT" << std::endl;
94- #endif
95-
96- // Query device for sensible dot kernel block count
97- cudaDeviceProp props;
98- CU (cudaGetDeviceProperties (&props, device_index));
99- dot_num_blocks = props.multiProcessorCount * 4 ;
93+ std::cout << " CUDA Driver: " << getDeviceDriver (device_index) << std::endl;
94+ NVML (nvmlInit ());
95+ cudaDeviceProp dprop;
96+ CU (cudaGetDeviceProperties (&dprop, device_index));
97+ unsigned int memclock;
98+ char mybus[16 ];
99+ sprintf (&mybus[0 ], " %04x:%02x:%02x.0" , dprop.pciDomainID , dprop.pciBusID , dprop.pciDeviceID );
100+ nvmlDevice_t nvmldev;
101+ NVML (nvmlDeviceGetHandleByPciBusId (mybus, &nvmldev));
102+ NVML (nvmlDeviceGetClockInfo (nvmldev, NVML_CLOCK_MEM, &memclock));
103+ std::cout << " CUDA Device " << device_index << " : \" "
104+ << getDeviceName (device_index)
105+ << " \" " << dprop.multiProcessorCount << " SMs(" << dprop.major << " ," << dprop.minor << " ) "
106+ << " Memory: " << memclock << " MHz x " << dprop.memoryBusWidth << " -bit = "
107+ << 2.0 *memclock*(dprop.memoryBusWidth /8 )/1000.0 << " GB/s PEAK, ECC is "
108+ << (dprop.ECCEnabled ? " ON" : " OFF" )
109+ << std::endl;
110+
111+ // Print Memory allocation API used for buffers
112+ std::cout << " Memory Allocation: " ;
113+ #if defined(MANAGED)
114+ std::cout << " MANAGED" ;
115+ #elif defined(PAGEFAULT)
116+ std::cout << " PAGEFAULT" ;
117+ #else
118+ std::cout << " DEFAULT" ;
119+ #endif
120+ std::cout << std::endl;
121+
122+ std::cout << " Parallel for kernel config: thread blocks of size " << TBSIZE << std::endl;
123+
124+ // Set sensible dot kernel block count
125+ dot_num_blocks = dprop.multiProcessorCount * 4 ;
100126
101127 // Size of partial sums for dot kernels
102128 size_t sums_bytes = sizeof (T) * dot_num_blocks;
@@ -105,8 +131,8 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
105131 std::cout << " Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;
106132
107133 // Check buffers fit on the device
108- if (props .totalGlobalMem < total_bytes)
109- throw std::runtime_error (" Device does not have enough memory for all 3 buffers" );
134+ if (dprop .totalGlobalMem < total_bytes)
135+ throw std::runtime_error (" Device does not have enough memory for all buffers" );
110136
111137 // Allocate buffers:
112138 d_a = alloc_device<T>(array_size);
0 commit comments