Commit 51231ac

Print device peak BW using NVML
1 parent 46b6d41 commit 51231ac

2 files changed: 48 additions, 19 deletions

src/cuda/CUDAStream.cu

Lines changed: 45 additions & 19 deletions
@@ -5,18 +5,25 @@
 // source code
 
 #include "CUDAStream.h"
+#include <nvml.h>
 
 #if !defined(UNROLL_FACTOR)
 #define UNROLL_FACTOR 4
 #endif
 
-[[noreturn]] inline void error(char const* file, int line, char const* expr, cudaError_t e) {
-  std::fprintf(stderr, "Error at %s:%d: %s (%d)\n %s\n", file, line, cudaGetErrorString(e), e, expr);
+[[noreturn]] inline void cuda_error(char const* file, int line, char const* expr, cudaError_t e) {
+  std::fprintf(stderr, "CUDA Error at %s:%d: %s (%d)\n %s\n", file, line, cudaGetErrorString(e), e, expr);
+  exit(e);
+}
+
+[[noreturn]] inline void nvml_error(char const* file, int line, char const* expr, nvmlReturn_t e) {
+  std::fprintf(stderr, "NVML Error at %s:%d: %s (%d)\n %s\n", file, line, nvmlErrorString(e), e, expr);
   exit(e);
 }
 
 // The do while is there to make sure you remember to put a semi-colon after calling CU
-#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) error(__FILE__, __LINE__, #EXPR, __e); } while(false)
+#define CU(EXPR) do { auto __e = (EXPR); if (__e != cudaSuccess) cuda_error(__FILE__, __LINE__, #EXPR, __e); } while(false)
+#define NVML(EXPR) do { auto __e = (EXPR); if (__e != NVML_SUCCESS) nvml_error(__FILE__, __LINE__, #EXPR, __e); } while(false)
 
 // It is best practice to include __device__ and constexpr even though in BabelStream it only needs to be __host__ const
 __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }
@@ -83,20 +90,39 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
   CU(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
   // Print out device information
-  std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl;
-  std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
-#if defined(MANAGED)
-  std::cout << "Memory: MANAGED" << std::endl;
-#elif defined(PAGEFAULT)
-  std::cout << "Memory: PAGEFAULT" << std::endl;
-#else
-  std::cout << "Memory: DEFAULT" << std::endl;
-#endif
-
-  // Query device for sensible dot kernel block count
-  cudaDeviceProp props;
-  CU(cudaGetDeviceProperties(&props, device_index));
-  dot_num_blocks = props.multiProcessorCount * 4;
+  std::cout << "CUDA Driver: " << getDeviceDriver(device_index) << std::endl;
+  NVML(nvmlInit());
+  cudaDeviceProp dprop;
+  CU(cudaGetDeviceProperties(&dprop, device_index));
+  unsigned int memclock;
+  char mybus[16];
+  sprintf(&mybus[0], "%04x:%02x:%02x.0", dprop.pciDomainID, dprop.pciBusID, dprop.pciDeviceID);
+  nvmlDevice_t nvmldev;
+  NVML(nvmlDeviceGetHandleByPciBusId(mybus, &nvmldev));
+  NVML(nvmlDeviceGetClockInfo(nvmldev, NVML_CLOCK_MEM, &memclock));
+  std::cout << "CUDA Device " << device_index << ": \""
+            << getDeviceName(device_index)
+            << "\" " << dprop.multiProcessorCount << " SMs(" << dprop.major << "," << dprop.minor << ") "
+            << "Memory: " << memclock << " MHz x " << dprop.memoryBusWidth << "-bit = "
+            << 2.0*memclock*(dprop.memoryBusWidth/8)/1000.0 << " GB/s PEAK, ECC is "
+            << (dprop.ECCEnabled ? "ON" : "OFF")
+            << std::endl;
+
+  // Print Memory allocation API used for buffers
+  std::cout << "Memory Allocation: ";
+#if defined(MANAGED)
+  std::cout << "MANAGED";
+#elif defined(PAGEFAULT)
+  std::cout << "PAGEFAULT";
+#else
+  std::cout << "DEFAULT";
+#endif
+  std::cout << std::endl;
+
+  std::cout << "Parallel for kernel config: thread blocks of size " << TBSIZE << std::endl;
+
+  // Set sensible dot kernel block count
+  dot_num_blocks = dprop.multiProcessorCount * 4;
 
   // Size of partial sums for dot kernels
   size_t sums_bytes = sizeof(T) * dot_num_blocks;
@@ -105,8 +131,8 @@ CUDAStream<T>::CUDAStream(const intptr_t array_size, const int device_index)
   std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;
 
   // Check buffers fit on the device
-  if (props.totalGlobalMem < total_bytes)
-    throw std::runtime_error("Device does not have enough memory for all 3 buffers");
+  if (dprop.totalGlobalMem < total_bytes)
+    throw std::runtime_error("Device does not have enough memory for all buffers");
 
   // Allocate buffers:
   d_a = alloc_device<T>(array_size);
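
For reference, the peak figure printed by the new banner is the usual DDR estimate: 2 transfers per clock x memory clock (MHz) x bus width in bytes, divided by 1000 to give GB/s; a hypothetical 1215 MHz clock on a 384-bit bus would print 2 x 1215 x 48 / 1000 ≈ 116.6 GB/s. The standalone sketch below is not part of this commit; it shows the same NVML query and calculation under the assumptions that device 0 is used, the memory interface is DDR-style, and the program is linked against nvidia-ml (e.g. nvcc bw_sketch.cu -lnvidia-ml).

// bw_sketch.cu -- illustration only, not part of the commit.
// Queries the current memory clock via NVML and derives the same
// peak-bandwidth estimate the new banner prints (DDR: 2 transfers/clock).
#include <cstdio>
#include <cuda_runtime.h>
#include <nvml.h>

int main() {
  cudaDeviceProp p;
  if (cudaGetDeviceProperties(&p, 0) != cudaSuccess) return 1;

  // NVML looks devices up by PCI bus id, so build it from the CUDA properties.
  char bus[16];
  std::snprintf(bus, sizeof(bus), "%04x:%02x:%02x.0", p.pciDomainID, p.pciBusID, p.pciDeviceID);

  nvmlDevice_t dev;
  unsigned int memclock_mhz = 0;
  if (nvmlInit() != NVML_SUCCESS ||
      nvmlDeviceGetHandleByPciBusId(bus, &dev) != NVML_SUCCESS ||
      nvmlDeviceGetClockInfo(dev, NVML_CLOCK_MEM, &memclock_mhz) != NVML_SUCCESS)
    return 1;

  // 2 transfers/clock (DDR) x clock in MHz x bus width in bytes, reported as GB/s.
  double peak_gbs = 2.0 * memclock_mhz * (p.memoryBusWidth / 8) / 1000.0;
  std::printf("Memory: %u MHz x %d-bit = %.1f GB/s peak\n",
              memclock_mhz, p.memoryBusWidth, peak_gbs);

  nvmlShutdown();
  return 0;
}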

src/cuda/model.cmake

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,9 @@ macro(setup)
       "--extended-lambda" "-DUNROLL_FACTOR=${UNROLL_FACTOR}" ${CUDA_EXTRA_FLAGS})
   string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
 
+  # Link against the NVIDIA Management Library for device information
+  register_link_library("nvidia-ml")
+
   # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
   # appended later
   wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
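
A note on the new link dependency (background, not part of the change): on typical Linux CUDA Toolkit installs, nvml.h ships in the toolkit's include directory and a stub libnvidia-ml.so lives under lib64/stubs, so linking "nvidia-ml" as above should resolve at build time even without a driver present; at run time the real library is supplied by the NVIDIA driver.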
