From 2dfbaa3e443b614a5f8878aef2e9bbb74468c0de Mon Sep 17 00:00:00 2001 From: GengLiang Date: Tue, 28 Oct 2025 13:14:39 +0800 Subject: [PATCH 1/2] support musa device --- CMakeLists.txt | 8 +- src/musa/MUSAStream.h | 47 ++++++ src/musa/MUSAStream.mu | 339 +++++++++++++++++++++++++++++++++++++++++ src/musa/model.cmake | 49 ++++++ 4 files changed, 442 insertions(+), 1 deletion(-) create mode 100644 src/musa/MUSAStream.h create mode 100644 src/musa/MUSAStream.mu create mode 100644 src/musa/model.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 27736b6b..343419dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,6 +153,7 @@ register_model(std-indices STD_INDICES STDIndicesStream.cpp) register_model(std-ranges STD_RANGES STDRangesStream.cpp) register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) +register_model(musa MUSA MUSAStream.mu) register_model(kokkos KOKKOS KokkosStream.cpp) register_model(sycl SYCL SYCLStream.cpp) register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp) @@ -227,7 +228,12 @@ message(STATUS "Executable : ${EXE_NAME}") # below we have all the usual CMake target setup steps include_directories(src) -add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp) +if (MODEL STREQUAL "musa") + include_directories(/usr/local/musa/include) + add_executable(${EXE_NAME} ${MUSAOBJ} src/main.cpp) +else () + add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp) +endif () target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES}) target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS}) target_include_directories(${EXE_NAME} PUBLIC ${IMPL_DIRECTORIES}) diff --git a/src/musa/MUSAStream.h b/src/musa/MUSAStream.h new file mode 100644 index 00000000..b27e7eae --- /dev/null +++ b/src/musa/MUSAStream.h @@ -0,0 +1,47 @@ +#pragma once + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "MUSA" + +#define TBSIZE 1024 + +template +class MUSAStream : public Stream +{ + protected: + // Size of arrays + int array_size; + + // Host array for partial sums for dot kernel + T *sums; + + // Device side pointers to arrays + T *d_a; + T *d_b; + T *d_c; + T *d_sum; + + // Number of blocks for dot kernel + int dot_num_blocks; + + public: + + MUSAStream(const int, const int); + ~MUSAStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; diff --git a/src/musa/MUSAStream.mu b/src/musa/MUSAStream.mu new file mode 100644 index 00000000..6b87165d --- /dev/null +++ b/src/musa/MUSAStream.mu @@ -0,0 +1,339 @@ + +#include "MUSAStream.h" + +void check_error(void) +{ + musaError_t err = musaGetLastError(); + if (err != musaSuccess) + { + std::cerr << "Error: " << musaGetErrorString(err) << std::endl; + exit(err); + } +} + +template +MUSAStream::MUSAStream(const int ARRAY_SIZE, const int device_index) +{ + + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + + // Set device + int count; + musaGetDeviceCount(&count); + check_error(); + if (device_index >= count) + throw std::runtime_error("Invalid device index"); + musaSetDevice(device_index); + check_error(); + + // Print out device information + std::cout << "Using MUSA device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; +#if defined(MANAGED) + std::cout << "Memory: MANAGED" << std::endl; +#elif defined(PAGEFAULT) + std::cout << "Memory: PAGEFAULT" << std::endl; +#else + std::cout << "Memory: DEFAULT" << std::endl; +#endif + array_size = ARRAY_SIZE; + + + // Query device for sensible dot kernel block count + musaDeviceProp props; + musaGetDeviceProperties(&props, device_index); + check_error(); + dot_num_blocks = props.multiProcessorCount * 4; + + // Allocate the host array for partial sums for dot kernels + sums = (T*)malloc(sizeof(T) * dot_num_blocks); + + size_t array_bytes = sizeof(T); + array_bytes *= ARRAY_SIZE; + size_t total_bytes = array_bytes * 4; + std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl; + + // Check buffers fit on the device + if (props.totalGlobalMem < total_bytes) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // Create device buffers +#if defined(MANAGED) + // Use managed memory on MUSA devices may not work as expected + musaMallocManaged(&d_a, array_bytes); + check_error(); + musaMallocManaged(&d_b, array_bytes); + check_error(); + musaMallocManaged(&d_c, array_bytes); + check_error(); + musaMallocManaged(&d_sum, dot_num_blocks*sizeof(T)); + check_error(); +#elif defined(PAGEFAULT) + d_a = (T*)malloc(array_bytes); + d_b = (T*)malloc(array_bytes); + d_c = (T*)malloc(array_bytes); + d_sum = (T*)malloc(sizeof(T)*dot_num_blocks); +#else + musaMalloc(&d_a, array_bytes); + check_error(); + musaMalloc(&d_b, array_bytes); + check_error(); + musaMalloc(&d_c, array_bytes); + check_error(); + musaMalloc(&d_sum, dot_num_blocks*sizeof(T)); + check_error(); +#endif +} + + +template +MUSAStream::~MUSAStream() +{ + free(sums); + +#if defined(PAGEFAULT) + free(d_a); + free(d_b); + free(d_c); + free(d_sum); +#else + musaFree(d_a); + check_error(); + musaFree(d_b); + check_error(); + musaFree(d_c); + check_error(); + musaFree(d_sum); + check_error(); +#endif +} + + +template +__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] = initA; + b[i] = initB; + c[i] = initC; +} + +template +void MUSAStream::init_arrays(T initA, T initB, T initC) +{ + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); + check_error(); + musaDeviceSynchronize(); + check_error(); +} + +template +void MUSAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + // Copy device memory to host +#if defined(PAGEFAULT) || defined(MANAGED) + musaDeviceSynchronize(); + for (int i = 0; i < array_size; i++) + { + a[i] = d_a[i]; + b[i] = d_b[i]; + c[i] = d_c[i]; + } +#else + musaMemcpy(a.data(), d_a, a.size()*sizeof(T), musaMemcpyDeviceToHost); + check_error(); + musaMemcpy(b.data(), d_b, b.size()*sizeof(T), musaMemcpyDeviceToHost); + check_error(); + musaMemcpy(c.data(), d_c, c.size()*sizeof(T), musaMemcpyDeviceToHost); + check_error(); +#endif +} + + +template +__global__ void copy_kernel(const T * a, T * c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + c[i] = a[i]; +} + +template +void MUSAStream::copy() +{ + copy_kernel<<>>(d_a, d_c); + check_error(); + musaDeviceSynchronize(); + check_error(); +} + +template +__global__ void mul_kernel(T * b, const T * c) +{ + const T scalar = startScalar; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + b[i] = scalar * c[i]; +} + +template +void MUSAStream::mul() +{ + mul_kernel<<>>(d_b, d_c); + check_error(); + musaDeviceSynchronize(); + check_error(); +} + +template +__global__ void add_kernel(const T * a, const T * b, T * c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + c[i] = a[i] + b[i]; +} + +template +void MUSAStream::add() +{ + add_kernel<<>>(d_a, d_b, d_c); + check_error(); + musaDeviceSynchronize(); + check_error(); +} + +template +__global__ void triad_kernel(T * a, const T * b, const T * c) +{ + const T scalar = startScalar; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] = b[i] + scalar * c[i]; +} + +template +void MUSAStream::triad() +{ + triad_kernel<<>>(d_a, d_b, d_c); + check_error(); + musaDeviceSynchronize(); + check_error(); +} + +template +__global__ void nstream_kernel(T * a, const T * b, const T * c) +{ + const T scalar = startScalar; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] += b[i] + scalar * c[i]; +} + +template +void MUSAStream::nstream() +{ + nstream_kernel<<>>(d_a, d_b, d_c); + check_error(); + musaDeviceSynchronize(); + check_error(); +} + +template +__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) +{ + __shared__ T tb_sum[TBSIZE]; + + int i = blockDim.x * blockIdx.x + threadIdx.x; + const size_t local_i = threadIdx.x; + + tb_sum[local_i] = {}; + for (; i < array_size; i += blockDim.x*gridDim.x) + tb_sum[local_i] += a[i] * b[i]; + + for (int offset = blockDim.x / 2; offset > 0; offset /= 2) + { + __syncthreads(); + if (local_i < offset) + { + tb_sum[local_i] += tb_sum[local_i+offset]; + } + } + + if (local_i == 0) + sum[blockIdx.x] = tb_sum[local_i]; +} + +template +T MUSAStream::dot() +{ + dot_kernel<<>>(d_a, d_b, d_sum, array_size); + check_error(); + +#if defined(MANAGED) || defined(PAGEFAULT) + musaDeviceSynchronize(); + check_error(); +#else + musaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), musaMemcpyDeviceToHost); + check_error(); +#endif + + T sum = 0.0; + for (int i = 0; i < dot_num_blocks; i++) + { +#if defined(MANAGED) || defined(PAGEFAULT) + sum += d_sum[i]; +#else + sum += sums[i]; +#endif + } + + return sum; +} + +void listDevices(void) +{ + // Get number of devices + int count; + musaGetDeviceCount(&count); + check_error(); + + // Print device names + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < count; i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + + +std::string getDeviceName(const int device) +{ + musaDeviceProp props; + musaGetDeviceProperties(&props, device); + check_error(); + return std::string(props.name); +} + + +std::string getDeviceDriver(const int device) +{ + musaSetDevice(device); + check_error(); + int driver; + musaDriverGetVersion(&driver); + check_error(); + return std::to_string(driver); +} + +template class MUSAStream; +template class MUSAStream; diff --git a/src/musa/model.cmake b/src/musa/model.cmake new file mode 100644 index 00000000..c09f92e4 --- /dev/null +++ b/src/musa/model.cmake @@ -0,0 +1,49 @@ + +# MUSA backend configuration +# Use: +# cmake -Bbuild -H. -DMODEL=musa -DMUSA_COMPILER=/usr/local/musa/bin/mcc \ +# -DMUSA_ARCH=mp_31 -DCXX_EXTRA_FLAGS="-L/usr/local/musa/lib" +# cmake --build build +# Run: +# export MUSA_USERQ=1 +# ./build/musa-stream +register_flag_optional(MEM "Device memory mode: + DEFAULT - allocate host and device memory pointers. + MANAGED - use MUSA Managed Memory. + PAGEFAULT - shared memory, only host pointers allocated." + "DEFAULT") + +register_flag_required(MUSA_COMPILER + "Path to the MUSA mcc compiler") + +register_flag_required(MUSA_ARCH + "Mthreads architecture, will be passed in via `--offload-arch=` (e.g `mp_31`) for mcc") + +register_flag_optional(MUSA_EXTRA_FLAGS + "Additional MUSA flags passed to mcc, this is appended after `MUSA_ARCH`" + "") + + +macro(setup) + message(STATUS "Configuring MUSA backend: ${IMPL_SOURCES}") + # load MUSA CMake module + list(APPEND CMAKE_MODULE_PATH /usr/local/musa/cmake) + find_package(MUSA REQUIRED) + + set(MUSA_VERBOSE_BUILD ON) + set(MUSA_MCC_FLAGS + "--offload-arch=${MUSA_ARCH} " + ${MUSA_EXTRA_FLAGS} + ) + musa_include_directories(${CMAKE_SOURCE_DIR}/src) + musa_compile(MUSAOBJ ${IMPL_SOURCES}) + + # create the interface library for musa objects + add_library(musa_objs INTERFACE) + target_sources(musa_objs INTERFACE ${MUSAOBJ}) + target_link_libraries(musa_objs INTERFACE musart) + + register_link_library(musa_objs) + +endmacro() + From eccb8e75f5685bca64bbcce0e300b05d523f42b7 Mon Sep 17 00:00:00 2001 From: GengLiang Date: Tue, 28 Oct 2025 14:54:38 +0800 Subject: [PATCH 2/2] Restore CMake files --- CMakeLists.txt | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 343419dd..86eca5f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,7 +153,6 @@ register_model(std-indices STD_INDICES STDIndicesStream.cpp) register_model(std-ranges STD_RANGES STDRangesStream.cpp) register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) -register_model(musa MUSA MUSAStream.mu) register_model(kokkos KOKKOS KokkosStream.cpp) register_model(sycl SYCL SYCLStream.cpp) register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp) @@ -228,12 +227,7 @@ message(STATUS "Executable : ${EXE_NAME}") # below we have all the usual CMake target setup steps include_directories(src) -if (MODEL STREQUAL "musa") - include_directories(/usr/local/musa/include) - add_executable(${EXE_NAME} ${MUSAOBJ} src/main.cpp) -else () - add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp) -endif () +add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp) target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES}) target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS}) target_include_directories(${EXE_NAME} PUBLIC ${IMPL_DIRECTORIES}) @@ -254,4 +248,4 @@ if (COMMAND setup_target) setup_target(${EXE_NAME}) endif () -install(TARGETS ${EXE_NAME} DESTINATION bin) +install(TARGETS ${EXE_NAME} DESTINATION bin) \ No newline at end of file