Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions ROCm-core/managed-memory-pagefaults.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/*
* AMD-GPU benchmarks
*
* Copyright (c) 2024 CSCS, ETH Zurich
*
* Please, refer to the LICENSE file in the root directory.
* SPDX-License-Identifier: MIT License
*/

/*! @file
* @brief Measuring page faults in managed-memory allocations
*
* @author Felix Thaler <[email protected]>
*/

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <tuple>

#include "../common/cuda_runtime.hpp"
#include "../common/timing.cuh"

__global__ void accessKernel(double *data, std::size_t n) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n)
return;

data[i] = i;
}

std::tuple<double, double> accessGpu(double *data, const std::size_t n) {
using clock = std::chrono::high_resolution_clock;
const auto start = clock::now();

constexpr unsigned threadsPerBlock = 1024;
unsigned blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
accessKernel<<<blocks, threadsPerBlock>>>(data, n);
checkGpuErrors(cudaGetLastError());
checkGpuErrors(cudaDeviceSynchronize());

const auto end = clock::now();
const double time = std::chrono::duration<double>(end - start).count();
const double bandwidth = n * sizeof(double) / 1e9 / time;
return {time, bandwidth};
}

[[gnu::noinline]] std::tuple<double, double> accessCpu(double *data,
std::size_t n) {
using clock = std::chrono::high_resolution_clock;
const auto start = clock::now();

#pragma omp parallel for simd nontemporal(data)
for (std::size_t i = 0; i < n; ++i)
data[i] = i;

const auto end = clock::now();
const double time = std::chrono::duration<double>(end - start).count();
const double bandwidth = n * sizeof(double) / 1e9 / time;
return {time, bandwidth};
}

int main(int argc, const char *argv[]) {
constexpr int runs = 3;

std::size_t n = 1024ul * 1024 * 1024 * 1024;
std::size_t nAccessed = 1024ul * 1024 * 1024;

#ifdef __HIP_PLATFORM_AMD__
const char *xnack = std::getenv("HSA_XNACK");
if (!xnack || std::strcmp(xnack, "1")) {
std::fprintf(stderr,
"WARNING: HSA_XNACK=1 not set!\n"
"WARNING: This uses a reduced-size physical host allocation "
"instead of pure-virtual 1TiB allocation.\n"
"WARNING: Set the environment variable HSA_XNACK=1 to run the "
"benchmark as expected.\n");
n = nAccessed;
}
#endif

#ifndef _OPENMP
std::fprintf(
stderr,
"WARNING: compile this benchmark with OpenMP for proper CPU numbers\n");
#endif

std::printf("=== CPU ===\n");
for (int run = 0; run < runs; ++run) {
printf(" Run %d:\n", run + 1);

double *data;
checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double)));

auto [firstAccessTime, firstAccessBandwidth] = accessCpu(data, nAccessed);
std::printf(" 1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime,
firstAccessBandwidth);

auto [secondAccessTime, secondAccessBandwidth] = accessCpu(data, nAccessed);
std::printf(" 2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime,
secondAccessBandwidth);

checkGpuErrors(cudaFree(data));
}

std::printf("=== GPU ===\n");
for (int run = 0; run < runs; ++run) {
printf(" Run %d:\n", run + 1);

double *data;
checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double)));

auto [firstAccessTime, firstAccessBandwidth] = accessGpu(data, nAccessed);
std::printf(" 1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime,
firstAccessBandwidth);

auto [secondAccessTime, secondAccessBandwidth] = accessGpu(data, nAccessed);
std::printf(" 2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime,
secondAccessBandwidth);

checkGpuErrors(cudaFree(data));
}

std::printf("=== CPU, then GPU ===\n");
for (int run = 0; run < runs; ++run) {
printf(" Run %d:\n", run + 1);

double *data;
checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double)));

auto [firstAccessTime, firstAccessBandwidth] = accessCpu(data, nAccessed);
std::printf(" 1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime,
firstAccessBandwidth);

auto [secondAccessTime, secondAccessBandwidth] = accessGpu(data, nAccessed);
std::printf(" 2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime,
secondAccessBandwidth);

checkGpuErrors(cudaFree(data));
}

std::printf("=== GPU, then CPU ===\n");
for (int run = 0; run < runs; ++run) {
printf(" Run %d:\n", run + 1);

double *data;
checkGpuErrors(cudaMallocManaged(&data, n * sizeof(double)));

auto [firstAccessTime, firstAccessBandwidth] = accessGpu(data, nAccessed);
std::printf(" 1st access took %7.5fs (BW: %6.1fGB/s)\n", firstAccessTime,
firstAccessBandwidth);

auto [secondAccessTime, secondAccessBandwidth] = accessCpu(data, nAccessed);
std::printf(" 2nd access took %7.5fs (BW: %6.1fGB/s)\n", secondAccessTime,
secondAccessBandwidth);

checkGpuErrors(cudaFree(data));
}

return 0;
}
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion rocPRISM/timing.cuh → common/timing.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#pragma once

#include <chrono>
#include "cuda_runtime.hpp"
#include "./cuda_runtime.hpp"

inline void checkErr(cudaError_t err, const char* filename, int lineno, const char* funcName)
{
Expand Down
2 changes: 1 addition & 1 deletion rocPRISM/radix-sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include <thrust/sequence.h>
#include <thrust/sort.h>

#include "timing.cuh"
#include "../common/timing.cuh"

int main(int argc, char** argv)
{
Expand Down