diff --git a/ROCm-core/managed-memory-pagefaults.cu b/ROCm-core/managed-memory-pagefaults.cu new file mode 100644 index 0000000..31a9739 --- /dev/null +++ b/ROCm-core/managed-memory-pagefaults.cu @@ -0,0 +1,162 @@ +/* + * AMD-GPU benchmarks + * + * Copyright (c) 2024 CSCS, ETH Zurich + * + * Please, refer to the LICENSE file in the root directory. + * SPDX-License-Identifier: MIT License + */ + +/*! @file + * @brief Measuring page faults in managed-memory allocations + * + * @author Felix Thaler + */ + +#include +#include +#include +#include +#include + +#include "../common/cuda_runtime.hpp" +#include "../common/timing.cuh" + +__global__ void accessKernel(double *data, std::size_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + data[i] = i; +} + +std::tuple accessGpu(double *data, const std::size_t n) { + using clock = std::chrono::high_resolution_clock; + const auto start = clock::now(); + + constexpr unsigned threadsPerBlock = 1024; + unsigned blocks = (n + threadsPerBlock - 1) / threadsPerBlock; + accessKernel<<>>(data, n); + checkGpuErrors(cudaGetLastError()); + checkGpuErrors(cudaDeviceSynchronize()); + + const auto end = clock::now(); + const double time = std::chrono::duration(end - start).count(); + const double bandwidth = n * sizeof(double) / 1e9 / time; + return {time, bandwidth}; +} + +[[gnu::noinline]] std::tuple accessCpu(double *data, + std::size_t n) { + using clock = std::chrono::high_resolution_clock; + const auto start = clock::now(); + +#pragma omp parallel for simd nontemporal(data) + for (std::size_t i = 0; i < n; ++i) + data[i] = i; + + const auto end = clock::now(); + const double time = std::chrono::duration(end - start).count(); + const double bandwidth = n * sizeof(double) / 1e9 / time; + return {time, bandwidth}; +} + +int main(int argc, const char *argv[]) { + constexpr int runs = 3; + + std::size_t n = 1024ul * 1024 * 1024 * 1024; + std::size_t nAccessed = 1024ul * 1024 * 1024; + +#ifdef __HIP_PLATFORM_AMD__ + const char *xnack = std::getenv("HSA_XNACK"); + if (!xnack || std::strcmp(xnack, "1")) { + std::fprintf(stderr, + "WARNING: HSA_XNACK=1 not set!\n" + "WARNING: This uses a reduced-size physical host allocation " + "instead of pure-virtual 1TiB allocation.\n" + "WARNING: Set the environment variable HSA_XNACK=1 to run the " + "benchmark as expected.\n"); + n = nAccessed; + } +#endif + +#ifndef _OPENMP + std::fprintf( + stderr, + "WARNING: compile this benchmark with OpenMP for proper CPU numbers\n"); +#endif + + std::printf("=== CPU ===\n"); + for (int run = 0; run < runs; ++run) { + printf(" Run %d:\n", run + 1); + + double *data; + checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double))); + + auto [firstAccessTime, firstAccessBandwidth] = accessCpu(data, nAccessed); + std::printf(" 1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime, + firstAccessBandwidth); + + auto [secondAccessTime, secondAccessBandwidth] = accessCpu(data, nAccessed); + std::printf(" 2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime, + secondAccessBandwidth); + + checkGpuErrors(cudaFree(data)); + } + + std::printf("=== GPU ===\n"); + for (int run = 0; run < runs; ++run) { + printf(" Run %d:\n", run + 1); + + double *data; + checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double))); + + auto [firstAccessTime, firstAccessBandwidth] = accessGpu(data, nAccessed); + std::printf(" 1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime, + firstAccessBandwidth); + + auto [secondAccessTime, secondAccessBandwidth] = accessGpu(data, nAccessed); + std::printf(" 2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime, + secondAccessBandwidth); + + checkGpuErrors(cudaFree(data)); + } + + std::printf("=== CPU, then GPU ===\n"); + for (int run = 0; run < runs; ++run) { + printf(" Run %d:\n", run + 1); + + double *data; + checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double))); + + auto [firstAccessTime, firstAccessBandwidth] = accessCpu(data, nAccessed); + std::printf(" 1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime, + firstAccessBandwidth); + + auto [secondAccessTime, secondAccessBandwidth] = accessGpu(data, nAccessed); + std::printf(" 2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime, + secondAccessBandwidth); + + checkGpuErrors(cudaFree(data)); + } + + std::printf("=== GPU, then CPU ===\n"); + for (int run = 0; run < runs; ++run) { + printf(" Run %d:\n", run + 1); + + double *data; + checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double))); + + auto [firstAccessTime, firstAccessBandwidth] = accessGpu(data, nAccessed); + std::printf(" 1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime, + firstAccessBandwidth); + + auto [secondAccessTime, secondAccessBandwidth] = accessCpu(data, nAccessed); + std::printf(" 2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime, + secondAccessBandwidth); + + checkGpuErrors(cudaFree(data)); + } + + return 0; +} diff --git a/rocPRISM/cub.hpp b/common/cub.hpp similarity index 100% rename from rocPRISM/cub.hpp rename to common/cub.hpp diff --git a/rocPRISM/cuda_runtime.hpp b/common/cuda_runtime.hpp similarity index 100% rename from rocPRISM/cuda_runtime.hpp rename to common/cuda_runtime.hpp diff --git a/rocPRISM/timing.cuh b/common/timing.cuh similarity index 98% rename from rocPRISM/timing.cuh rename to common/timing.cuh index deaa8f8..9db7127 100644 --- a/rocPRISM/timing.cuh +++ b/common/timing.cuh @@ -16,7 +16,7 @@ #pragma once #include -#include "cuda_runtime.hpp" +#include "./cuda_runtime.hpp" inline void checkErr(cudaError_t err, const char* filename, int lineno, const char* funcName) { diff --git a/rocPRISM/radix-sort.cu b/rocPRISM/radix-sort.cu index 8efedc5..359c09f 100644 --- a/rocPRISM/radix-sort.cu +++ b/rocPRISM/radix-sort.cu @@ -21,7 +21,7 @@ #include #include -#include "timing.cuh" +#include "../common/timing.cuh" int main(int argc, char** argv) {