eth-cscs · fthaler · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025
diff --git a/ROCm-core/managed-memory-pagefaults.cu b/ROCm-core/managed-memory-pagefaults.cu
@@ -0,0 +1,162 @@
+/*
+ * AMD-GPU benchmarks
+ *
+ * Copyright (c) 2024 CSCS, ETH Zurich
+ *
+ * Please, refer to the LICENSE file in the root directory.
+ * SPDX-License-Identifier: MIT License
+ */
+
+/*! @file
+ * @brief Measuring page faults in managed-memory allocations
+ *
+ * @author Felix Thaler <[email protected]>
+ */
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <tuple>
+
+#include "../common/cuda_runtime.hpp"
+#include "../common/timing.cuh"
+
+__global__ void accessKernel(double *data, std::size_t n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  data[i] = i;
+}
+
+std::tuple<double, double> accessGpu(double *data, const std::size_t n) {
+  using clock = std::chrono::high_resolution_clock;
+  const auto start = clock::now();
+
+  constexpr unsigned threadsPerBlock = 1024;
+  unsigned blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
+  accessKernel<<<blocks, threadsPerBlock>>>(data, n);
+  checkGpuErrors(cudaGetLastError());
+  checkGpuErrors(cudaDeviceSynchronize());
+
+  const auto end = clock::now();
+  const double time = std::chrono::duration<double>(end - start).count();
+  const double bandwidth = n * sizeof(double) / 1e9 / time;
+  return {time, bandwidth};
+}
+
+[[gnu::noinline]] std::tuple<double, double> accessCpu(double *data,
+                                                       std::size_t n) {
+  using clock = std::chrono::high_resolution_clock;
+  const auto start = clock::now();
+
+#pragma omp parallel for simd nontemporal(data)
+  for (std::size_t i = 0; i < n; ++i)
+    data[i] = i;
+
+  const auto end = clock::now();
+  const double time = std::chrono::duration<double>(end - start).count();
+  const double bandwidth = n * sizeof(double) / 1e9 / time;
+  return {time, bandwidth};
+}
+
+int main(int argc, const char *argv[]) {
+  constexpr int runs = 3;
+
+  std::size_t n = 1024ul * 1024 * 1024 * 1024;
+  std::size_t nAccessed = 1024ul * 1024 * 1024;
+
+#ifdef __HIP_PLATFORM_AMD__
+  const char *xnack = std::getenv("HSA_XNACK");
+  if (!xnack || std::strcmp(xnack, "1")) {
+    std::fprintf(stderr,
+                 "WARNING: HSA_XNACK=1 not set!\n"
+                 "WARNING: This uses a reduced-size physical host allocation "
+                 "instead of pure-virtual 1TiB allocation.\n"
+                 "WARNING: Set the environment variable HSA_XNACK=1 to run the "
+                 "benchmark as expected.\n");
+    n = nAccessed;
+  }
+#endif
+
+#ifndef _OPENMP
+  std::fprintf(
+      stderr,
+      "WARNING: compile this benchmark with OpenMP for proper CPU numbers\n");
+#endif
+
+  std::printf("=== CPU ===\n");
+  for (int run = 0; run < runs; ++run) {
+    printf(" Run %d:\n", run + 1);
+
+    double *data;
+    checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double)));
+
+    auto [firstAccessTime, firstAccessBandwidth] = accessCpu(data, nAccessed);
+    std::printf("  1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime,
+                firstAccessBandwidth);
+
+    auto [secondAccessTime, secondAccessBandwidth] = accessCpu(data, nAccessed);
+    std::printf("  2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime,
+                secondAccessBandwidth);
+
+    checkGpuErrors(cudaFree(data));
+  }
+
+  std::printf("=== GPU ===\n");
+  for (int run = 0; run < runs; ++run) {
+    printf(" Run %d:\n", run + 1);
+
+    double *data;
+    checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double)));
+
+    auto [firstAccessTime, firstAccessBandwidth] = accessGpu(data, nAccessed);
+    std::printf("  1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime,
+                firstAccessBandwidth);
+
+    auto [secondAccessTime, secondAccessBandwidth] = accessGpu(data, nAccessed);
+    std::printf("  2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime,
+                secondAccessBandwidth);
+
+    checkGpuErrors(cudaFree(data));
+  }
+
+  std::printf("=== CPU, then GPU ===\n");
+  for (int run = 0; run < runs; ++run) {
+    printf(" Run %d:\n", run + 1);
+
+    double *data;
+    checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double)));
+
+    auto [firstAccessTime, firstAccessBandwidth] = accessCpu(data, nAccessed);
+    std::printf("  1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime,
+                firstAccessBandwidth);
+
+    auto [secondAccessTime, secondAccessBandwidth] = accessGpu(data, nAccessed);
+    std::printf("  2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime,
+                secondAccessBandwidth);
+
+    checkGpuErrors(cudaFree(data));
+  }
+
+  std::printf("=== GPU, then CPU ===\n");
+  for (int run = 0; run < runs; ++run) {
+    printf(" Run %d:\n", run + 1);
+
+    double *data;
+    checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double)));
+
+    auto [firstAccessTime, firstAccessBandwidth] = accessGpu(data, nAccessed);
+    std::printf("  1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime,
+                firstAccessBandwidth);
+
+    auto [secondAccessTime, secondAccessBandwidth] = accessCpu(data, nAccessed);
+    std::printf("  2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime,
+                secondAccessBandwidth);
+
+    checkGpuErrors(cudaFree(data));
+  }
+
+  return 0;
+}
diff --git a/rocPRISM/cub.hpp → common/cub.hpp b/rocPRISM/cub.hpp → common/cub.hpp
diff --git a/rocPRISM/cuda_runtime.hpp → common/cuda_runtime.hpp b/rocPRISM/cuda_runtime.hpp → common/cuda_runtime.hpp
diff --git a/rocPRISM/timing.cuh → common/timing.cuh b/rocPRISM/timing.cuh → common/timing.cuh
@@ -16,7 +16,7 @@
 #pragma once
 
 #include <chrono>
-#include "cuda_runtime.hpp"
+#include "./cuda_runtime.hpp"
 
 inline void checkErr(cudaError_t err, const char* filename, int lineno, const char* funcName)
 {

diff --git a/rocPRISM/radix-sort.cu b/rocPRISM/radix-sort.cu
@@ -21,7 +21,7 @@
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 
-#include "timing.cuh"
+#include "../common/timing.cuh"
 
 int main(int argc, char** argv)
 {